From 6372dc688466efec867dcfe2186d668fda4efe0b Mon Sep 17 00:00:00 2001 From: FunkyFr3sh Date: Tue, 20 Sep 2022 02:27:01 +0200 Subject: [PATCH] add AVX memset --- inc/blt.h | 1 + src/blt.c | 44 +++++++++++++++++++++++++++++--------------- src/ddsurface.c | 2 +- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/inc/blt.h b/inc/blt.h index 1cd2a3f..76f3bf5 100644 --- a/inc/blt.h +++ b/inc/blt.h @@ -74,6 +74,7 @@ void blt_colorkey_mirror_stretch( void blt_clear( unsigned char* dst, + char color, size_t size); void blt_colorfill( diff --git a/src/blt.c b/src/blt.c index 59549c9..028329a 100644 --- a/src/blt.c +++ b/src/blt.c @@ -461,16 +461,39 @@ void blt_colorkey_mirror_stretch( } void blt_clear( - unsigned char* dst, + unsigned char* dst, + char color, size_t size) { - if (size >= 1024 * 200) +#ifdef _MSC_VER + if (size < 1024 * 100 && g_blt_use_avx && !((DWORD)dst % 32)) { - __stosb(dst, 0, size); + while (size >= 128) + { + __m256i c0 = _mm256_set1_epi8(color); + + _mm256_store_si256((((__m256i*)dst) + 0), c0); + _mm256_store_si256((((__m256i*)dst) + 1), c0); + _mm256_store_si256((((__m256i*)dst) + 2), c0); + _mm256_store_si256((((__m256i*)dst) + 3), c0); + + dst += 128; + size -= 128; + } + + _mm256_zeroupper(); + + /* memset below handles the remainder */ + } +#endif + + if (size >= 1024 * 100) + { + __stosb(dst, color, size); } else { - memset(dst, 0, size); + memset(dst, color, size); } } @@ -500,22 +523,13 @@ void blt_colorfill( { if (size == dst_p) { - size_t s = dst_p * dst_h; - - if (s >= 1024 * 200) - { - __stosb(dst, color, s); - } - else - { - memset(dst, color, s); - } + blt_clear(dst, color, dst_p * dst_h); } else { for (int i = 0; i < dst_h; i++) { - memset(dst, color, size); + blt_clear(dst, color, size); dst += dst_p; } } diff --git a/src/ddsurface.c b/src/ddsurface.c index 913e5d0..7fdbf26 100644 --- a/src/ddsurface.c +++ b/src/ddsurface.c @@ -551,7 +551,7 @@ HRESULT dds_Flip(IDirectDrawSurfaceImpl* This, IDirectDrawSurfaceImpl* lpDDSurfa if (g_ddraw->flipclear) { - blt_clear(buf, backbuffer->size); + blt_clear(buf, 0, backbuffer->size); } LeaveCriticalSection(&g_ddraw->cs);