DxWnd.reloaded/filter/bilinear16_565.cpp

#include <windows.h>
#include <xmmintrin.h>
#include <math.h>

// Column lookup tables used by Resize_HQ_2ch565 when enlarging on both axes.
// g_px1a_w records the element count the two tables were last allocated with;
// g_px1c[x] holds the integer source column (x1a >> 8) and g_px1a[x] the
// 8-bit blend fraction (x1a & 0xFF) for output column x.
static int  g_px1a_w  = 0;
static int *g_px1a    = NULL;
static int *g_px1c    = NULL;

// Column lookup table used by every other scaling case: entries 2*x and 2*x+1
// hold the first and last contributing source position (8.8 fixed point) for
// output column x. g_px1ab_w records the size the table was last built for.
static int  g_px1ab_w = 0;
static int *g_px1ab   = NULL;

void WINAPI Resize_HQ_2ch565( unsigned char* src, RECT *srcrect, int srcpitch,
                    unsigned char* dest, RECT *destrect, int destpitch)
{
    // Both buffers must be in RGB 565 format.

	int w1, w2, h1, h2;
	w1 = srcrect->right - srcrect->left;
	h1 = srcrect->bottom - srcrect->top;
	w2 = destrect->right - destrect->left;
	h2 = destrect->bottom - destrect->top;

	if(!srcpitch) srcpitch=w1<<1;
	if(!destpitch) destpitch=w1<<1;

	// GHO addiction: new variables
	// p1, p2: pitch offsets of source and dest surfaces in DWORD offset, that is pitch / sizeof(DWORD)
	// beware: current version can operate on displaced source rect, but assumes the dest rect is always the full surface!!
	USHORT p1 = srcpitch >> 1; 
	USHORT p2 = destpitch >> 1; 
	USHORT *dsrc  = (USHORT *)src + (srcrect->top * p1) + srcrect->left;
    USHORT *ddest = (USHORT *)dest;

    // arbitrary resize.

    bool bUpsampleX = (w1 < w2);
    bool bUpsampleY = (h1 < h2);

    // If too many input pixels map to one output pixel, our 32-bit accumulation values
    // could overflow - so, if we have huge mappings like that, cut down the weights:
    //    256 max color value
    //   *256 weight_x
    //   *256 weight_y
    //   *256 (16*16) maximum # of input pixels (x,y) - unless we cut the weights down...
    int weight_shift = 0;

	//gsky916: weight_shift calculation in bUpsampleX && bUpsampleY cases are not necessary.
	//Move to else block to reduce floating point calculations.

    float fh = 256*h1/(float)h2;
    float fw = 256*w1/(float)w2;

    if (bUpsampleX && bUpsampleY)
    {
        // faster to just do 2x2 bilinear interp here

        // cache x1a, x1b for all the columns:
        // ...and your OS better have garbage collection on process exit :)
		//gsky916: also cache x1c for better performance
        if (g_px1a_w < w2)
        {
            if (g_px1a) delete [] g_px1a;
			if (g_px1c) delete [] g_px1c;
            g_px1a = new int[w2*2 * 1];
 			g_px1c = new int[w2*2 * 1];
			g_px1a_w = w2*2;
        }
        for (int x2=0; x2<w2; x2++)
        {
            // find the x-range of input pixels that will contribute:
            int x1a = (int)(x2*fw);
            x1a = min(x1a, 256*(w1-1) - 1);
			g_px1c[x2] = x1a >> 8;
            g_px1a[x2] = x1a & 0xFF;
		
		}

        // FOR EVERY OUTPUT PIXEL
		// gsky916: Use OpenMP to speed up nested for loops (Enable OpenMP support in compiler).
		#pragma omp parallel for schedule(dynamic)
        for (int y2=0; y2<h2; y2++)
        {   
            // find the y-range of input pixels that will contribute:
            int y1a = (int)(y2*fh);
            y1a = min(y1a, 256*(h1-1) - 1);
            int y1c = y1a >> 8;
			int y1cp = y1c * p1;
			y1a = y1a & 0xFF;  		

            USHORT *ddest = &((USHORT *)dest)[y2*p2 + 0];

            for (int x2=0; x2<w2; x2++)
            {
                // find the x-range of input pixels that will contribute:
                int x1a = g_px1a[x2];//(int)(x2*fw); 
                int x1c = g_px1c[x2];

                USHORT *dsrc2 = &dsrc[y1c*p1 + x1c]; // GHO

                // PERFORM BILINEAR INTERPOLATION on 2x2 pixels
                UINT r=0, g=0, b=0, a=0;
                UINT weight_x = 256 - x1a;
                UINT weight_y = 256 - y1a;

                // gsky916: expand the innermost nested loops for speed improvement,
				// and reduce calculation operations...

                UINT c = (UINT)dsrc2[0]; // GHO
                UINT r_src = (c    ) & 0x1F;
                UINT g_src = (c>> 5) & 0x3F;
                UINT b_src = (c>>11) & 0x1F;
                UINT w = (weight_x * weight_y);
                r += r_src * w;
                g += g_src * w;
                b += b_src * w;
                UINT weight_x1 = x1a;

                c = (UINT)dsrc2[1]; // GHO
                r_src = (c    ) & 0x1F;
                g_src = (c>> 5) & 0x3F;
                b_src = (c>>11) & 0x1F;
                w = (weight_x1 * weight_y);
                r += r_src * w;
                g += g_src * w;
                b += b_src * w;
                UINT weight_y1 = y1a;

                c = (UINT)dsrc2[p1]; // GHO
                r_src = (c    ) & 0x1F;
                g_src = (c>> 5) & 0x3F;
                b_src = (c>>11) & 0x1F;
                w = (weight_x * weight_y1);
                r += r_src * w;
                g += g_src * w;
                b += b_src * w;

                c = (UINT)dsrc2[p1+1]; // GHO
                r_src = (c    ) & 0x1F;
                g_src = (c>> 5) & 0x3F;
                b_src = (c>>11) & 0x1F;
                w = (weight_x1 * weight_y1);
                r += r_src * w;
                g += g_src * w;
                b += b_src * w;

				UINT cc = ((r>>16) & 0x1F) | ((g>>(16-5)) & 0x7E0) | ((b>>(16-11)) & 0xF800);
				*ddest++ = (USHORT)cc;
            }
        }
    }
    else // either downscale on vertical or horizontal direction ...
    {
		//gsky916: weight_shift calculation moved here.
        float source_texels_per_out_pixel = (   (w1/(float)w2 + 1) 
                                              * (h1/(float)h2 + 1)
                                            );
        float weight_per_pixel = source_texels_per_out_pixel * 256 * 256;  //weight_x * weight_y
        float accum_per_pixel = weight_per_pixel*256; //color value is 0-255
        float weight_div = accum_per_pixel / 4294967000.0f;
        if (weight_div > 1)
            weight_shift = (int)ceilf( logf((float)weight_div)/logf(2.0f) );
        weight_shift = min(15, weight_shift);  // this could go to 15 and still be ok.

        // cache x1a, x1b for all the columns:
        // ...and your OS better have garbage collection on process exit :)
        if (g_px1ab_w < w2)
        {
            if (g_px1ab) delete [] g_px1ab;
            g_px1ab = new int[w2*2 * 2];
            g_px1ab_w = w2*2;
        }
        for (int x2=0; x2<w2; x2++)
        {
            // find the x-range of input pixels that will contribute:
            int x1a = (int)((x2  )*fw); 
            int x1b = (int)((x2+1)*fw); 
            if (bUpsampleX) // map to same pixel -> we want to interpolate between two pixels!
                x1b = x1a + 256;
            x1b = min(x1b, 256*w1 - 1);
            g_px1ab[x2*2+0] = x1a;
            g_px1ab[x2*2+1] = x1b;
        }

        // FOR EVERY OUTPUT PIXEL
        for (int y2=0; y2<h2; y2++)
        {   
            // find the y-range of input pixels that will contribute:
            int y1a = (int)((y2  )*fh); 
            int y1b = (int)((y2+1)*fh); 
            if (bUpsampleY) // map to same pixel -> we want to interpolate between two pixels!
                y1b = y1a + 256;
            y1b = min(y1b, 256*h1 - 1);
            int y1c = y1a >> 8;
            int y1d = y1b >> 8;

			ddest = &((USHORT *)dest)[y2*p2 + 0];

            for (int x2=0; x2<w2; x2++)
            {
                // find the x-range of input pixels that will contribute:
                int x1a = g_px1ab[x2*2+0];    // (computed earlier)
                int x1b = g_px1ab[x2*2+1];    // (computed earlier)
                int x1c = x1a >> 8;
                int x1d = x1b >> 8;

                // ADD UP ALL INPUT PIXELS CONTRIBUTING TO THIS OUTPUT PIXEL:
                UINT r=0, g=0, b=0, a=0;
                for (int y=y1c; y<=y1d; y++)
                {
                    UINT weight_y = 256;
                    if (y1c != y1d) 
                    {
                        if (y==y1c)
                            weight_y = 256 - (y1a & 0xFF);
                        else if (y==y1d)
                            weight_y = (y1b & 0xFF);
                    }

                    USHORT *dsrc2 = &dsrc[y*p1 + x1c]; // GHO
                    for (int x=x1c; x<=x1d; x++)
                    {
                        UINT weight_x = 256;
                        if (x1c != x1d) 
                        {
                            if (x==x1c)
                                weight_x = 256 - (x1a & 0xFF);
                            else if (x==x1d)
                                weight_x = (x1b & 0xFF);
                        }

                        UINT c = dsrc[y*p1 + x];
                        UINT r_src = (c    ) & 0x1F;
                        UINT g_src = (c>> 5) & 0x3F;
                        UINT b_src = (c>>11) & 0x1F;
                        UINT w = (weight_x * weight_y) >> weight_shift;
                        r += r_src * w;
                        g += g_src * w;
                        b += b_src * w;
                        a += w;
                    }
                }

                // write results
				UINT c = ((r/a) & 0x1F) | (((g/a) << 5) & 0x7E0) | (((b/a) << 11) & 0xF800);
				*ddest++ = c;
            }
        }
    }
}
v2_03_79_src Former-commit-id: 17f4961d27c00d9167001f0251be794d2f88cb1c 2016-04-08 12:46:45 -04:00			`#include <windows.h>`
			`#include <xmmintrin.h>`
			`#include <math.h>`

			`static int* g_px1a = NULL;`
v2_03_80_src Former-commit-id: faa04e235cf8faae10f9e220ca51eef95c7115f0 2016-08-15 12:46:49 -04:00			`static int* g_px1c = NULL;`
v2_03_79_src Former-commit-id: 17f4961d27c00d9167001f0251be794d2f88cb1c 2016-04-08 12:46:45 -04:00			`static int g_px1a_w = 0;`
			`static int* g_px1ab = NULL;`
			`static int g_px1ab_w = 0;`

			`void WINAPI Resize_HQ_2ch565( unsigned char* src, RECT *srcrect, int srcpitch,`
			`unsigned char* dest, RECT *destrect, int destpitch)`
			`{`
			`// Both buffers must be in RGB 565 format.`

			`int w1, w2, h1, h2;`
			`w1 = srcrect->right - srcrect->left;`
			`h1 = srcrect->bottom - srcrect->top;`
			`w2 = destrect->right - destrect->left;`
			`h2 = destrect->bottom - destrect->top;`

			`if(!srcpitch) srcpitch=w1<<1;`
			`if(!destpitch) destpitch=w1<<1;`

			`// GHO addiction: new variables`
			`// p1, p2: pitch offsets of source and dest surfaces in DWORD offset, that is pitch / sizeof(DWORD)`
			`// beware: current version can operate on displaced source rect, but assumes the dest rect is always the full surface!!`
			`USHORT p1 = srcpitch >> 1;`
			`USHORT p2 = destpitch >> 1;`
			`USHORT *dsrc = (USHORT *)src + (srcrect->top * p1) + srcrect->left;`
			`USHORT *ddest = (USHORT *)dest;`

			`// arbitrary resize.`

			`bool bUpsampleX = (w1 < w2);`
			`bool bUpsampleY = (h1 < h2);`

			`// If too many input pixels map to one output pixel, our 32-bit accumulation values`
			`// could overflow - so, if we have huge mappings like that, cut down the weights:`
			`// 256 max color value`
			`// *256 weight_x`
			`// *256 weight_y`
			`// *256 (16*16) maximum # of input pixels (x,y) - unless we cut the weights down...`
			`int weight_shift = 0;`

			`//gsky916: weight_shift calculation in bUpsampleX && bUpsampleY cases are not necessary.`
			`//Move to else block to reduce floating point calculations.`

			`float fh = 256*h1/(float)h2;`
			`float fw = 256*w1/(float)w2;`

			`if (bUpsampleX && bUpsampleY)`
			`{`
			`// faster to just do 2x2 bilinear interp here`

			`// cache x1a, x1b for all the columns:`
			`// ...and your OS better have garbage collection on process exit :)`
v2_03_80_src Former-commit-id: faa04e235cf8faae10f9e220ca51eef95c7115f0 2016-08-15 12:46:49 -04:00			`//gsky916: also cache x1c for better performance`
v2_03_79_src Former-commit-id: 17f4961d27c00d9167001f0251be794d2f88cb1c 2016-04-08 12:46:45 -04:00			`if (g_px1a_w < w2)`
			`{`
			`if (g_px1a) delete [] g_px1a;`
v2_03_80_src Former-commit-id: faa04e235cf8faae10f9e220ca51eef95c7115f0 2016-08-15 12:46:49 -04:00			`if (g_px1c) delete [] g_px1c;`
v2_03_79_src Former-commit-id: 17f4961d27c00d9167001f0251be794d2f88cb1c 2016-04-08 12:46:45 -04:00			`g_px1a = new int[w2*2 * 1];`
v2_03_80_src Former-commit-id: faa04e235cf8faae10f9e220ca51eef95c7115f0 2016-08-15 12:46:49 -04:00			`g_px1c = new int[w2*2 * 1];`
			`g_px1a_w = w2*2;`
v2_03_79_src Former-commit-id: 17f4961d27c00d9167001f0251be794d2f88cb1c 2016-04-08 12:46:45 -04:00			`}`
			`for (int x2=0; x2<w2; x2++)`
			`{`
			`// find the x-range of input pixels that will contribute:`
			`int x1a = (int)(x2*fw);`
			`x1a = min(x1a, 256*(w1-1) - 1);`
v2_03_80_src Former-commit-id: faa04e235cf8faae10f9e220ca51eef95c7115f0 2016-08-15 12:46:49 -04:00			`g_px1c[x2] = x1a >> 8;`
			`g_px1a[x2] = x1a & 0xFF;`

			`}`
v2_03_79_src Former-commit-id: 17f4961d27c00d9167001f0251be794d2f88cb1c 2016-04-08 12:46:45 -04:00
			`// FOR EVERY OUTPUT PIXEL`
			`// gsky916: Use OpenMP to speed up nested for loops (Enable OpenMP support in compiler).`
			`#pragma omp parallel for schedule(dynamic)`
			`for (int y2=0; y2<h2; y2++)`
			`{`
			`// find the y-range of input pixels that will contribute:`
			`int y1a = (int)(y2*fh);`
			`y1a = min(y1a, 256*(h1-1) - 1);`
			`int y1c = y1a >> 8;`
v2_03_80_src Former-commit-id: faa04e235cf8faae10f9e220ca51eef95c7115f0 2016-08-15 12:46:49 -04:00			`int y1cp = y1c * p1;`
			`y1a = y1a & 0xFF;`
v2_03_79_src Former-commit-id: 17f4961d27c00d9167001f0251be794d2f88cb1c 2016-04-08 12:46:45 -04:00
			`USHORT *ddest = &((USHORT *)dest)[y2*p2 + 0];`

			`for (int x2=0; x2<w2; x2++)`
			`{`
			`// find the x-range of input pixels that will contribute:`
			`int x1a = g_px1a[x2];//(int)(x2*fw);`
v2_03_80_src Former-commit-id: faa04e235cf8faae10f9e220ca51eef95c7115f0 2016-08-15 12:46:49 -04:00			`int x1c = g_px1c[x2];`
v2_03_79_src Former-commit-id: 17f4961d27c00d9167001f0251be794d2f88cb1c 2016-04-08 12:46:45 -04:00
			`USHORT *dsrc2 = &dsrc[y1c*p1 + x1c]; // GHO`

			`// PERFORM BILINEAR INTERPOLATION on 2x2 pixels`
			`UINT r=0, g=0, b=0, a=0;`
v2_03_80_src Former-commit-id: faa04e235cf8faae10f9e220ca51eef95c7115f0 2016-08-15 12:46:49 -04:00			`UINT weight_x = 256 - x1a;`
			`UINT weight_y = 256 - y1a;`

			`// gsky916: expand the innermost nested loops for speed improvement,`
			`// and reduce calculation operations...`

			`UINT c = (UINT)dsrc2[0]; // GHO`
			`UINT r_src = (c ) & 0x1F;`
			`UINT g_src = (c>> 5) & 0x3F;`
			`UINT b_src = (c>>11) & 0x1F;`
			`UINT w = (weight_x * weight_y);`
			`r += r_src * w;`
			`g += g_src * w;`
			`b += b_src * w;`
			`UINT weight_x1 = x1a;`

			`c = (UINT)dsrc2[1]; // GHO`
			`r_src = (c ) & 0x1F;`
			`g_src = (c>> 5) & 0x3F;`
			`b_src = (c>>11) & 0x1F;`
			`w = (weight_x1 * weight_y);`
			`r += r_src * w;`
			`g += g_src * w;`
			`b += b_src * w;`
			`UINT weight_y1 = y1a;`

			`c = (UINT)dsrc2[p1]; // GHO`
			`r_src = (c ) & 0x1F;`
			`g_src = (c>> 5) & 0x3F;`
			`b_src = (c>>11) & 0x1F;`
			`w = (weight_x * weight_y1);`
			`r += r_src * w;`
			`g += g_src * w;`
			`b += b_src * w;`

			`c = (UINT)dsrc2[p1+1]; // GHO`
			`r_src = (c ) & 0x1F;`
			`g_src = (c>> 5) & 0x3F;`
			`b_src = (c>>11) & 0x1F;`
			`w = (weight_x1 * weight_y1);`
			`r += r_src * w;`
			`g += g_src * w;`
			`b += b_src * w;`

			`UINT cc = ((r>>16) & 0x1F) \| ((g>>(16-5)) & 0x7E0) \| ((b>>(16-11)) & 0xF800);`
			`*ddest++ = (USHORT)cc;`
v2_03_79_src Former-commit-id: 17f4961d27c00d9167001f0251be794d2f88cb1c 2016-04-08 12:46:45 -04:00			`}`
			`}`
			`}`
			`else // either downscale on vertical or horizontal direction ...`
			`{`
			`//gsky916: weight_shift calculation moved here.`
			`float source_texels_per_out_pixel = ( (w1/(float)w2 + 1)`
			`* (h1/(float)h2 + 1)`
			`);`
			`float weight_per_pixel = source_texels_per_out_pixel * 256 * 256; //weight_x * weight_y`
			`float accum_per_pixel = weight_per_pixel*256; //color value is 0-255`
			`float weight_div = accum_per_pixel / 4294967000.0f;`
			`if (weight_div > 1)`
			`weight_shift = (int)ceilf( logf((float)weight_div)/logf(2.0f) );`
			`weight_shift = min(15, weight_shift); // this could go to 15 and still be ok.`

			`// cache x1a, x1b for all the columns:`
			`// ...and your OS better have garbage collection on process exit :)`
			`if (g_px1ab_w < w2)`
			`{`
			`if (g_px1ab) delete [] g_px1ab;`
			`g_px1ab = new int[w2*2 * 2];`
			`g_px1ab_w = w2*2;`
			`}`
			`for (int x2=0; x2<w2; x2++)`
			`{`
			`// find the x-range of input pixels that will contribute:`
			`int x1a = (int)((x2 )*fw);`
			`int x1b = (int)((x2+1)*fw);`
			`if (bUpsampleX) // map to same pixel -> we want to interpolate between two pixels!`
			`x1b = x1a + 256;`
			`x1b = min(x1b, 256*w1 - 1);`
			`g_px1ab[x2*2+0] = x1a;`
			`g_px1ab[x2*2+1] = x1b;`
			`}`

			`// FOR EVERY OUTPUT PIXEL`
			`for (int y2=0; y2<h2; y2++)`
			`{`
			`// find the y-range of input pixels that will contribute:`
			`int y1a = (int)((y2 )*fh);`
			`int y1b = (int)((y2+1)*fh);`
			`if (bUpsampleY) // map to same pixel -> we want to interpolate between two pixels!`
			`y1b = y1a + 256;`
			`y1b = min(y1b, 256*h1 - 1);`
			`int y1c = y1a >> 8;`
			`int y1d = y1b >> 8;`

			`ddest = &((USHORT *)dest)[y2*p2 + 0];`

			`for (int x2=0; x2<w2; x2++)`
			`{`
			`// find the x-range of input pixels that will contribute:`
			`int x1a = g_px1ab[x2*2+0]; // (computed earlier)`
			`int x1b = g_px1ab[x2*2+1]; // (computed earlier)`
			`int x1c = x1a >> 8;`
			`int x1d = x1b >> 8;`

			`// ADD UP ALL INPUT PIXELS CONTRIBUTING TO THIS OUTPUT PIXEL:`
			`UINT r=0, g=0, b=0, a=0;`
			`for (int y=y1c; y<=y1d; y++)`
			`{`
			`UINT weight_y = 256;`
			`if (y1c != y1d)`
			`{`
			`if (y==y1c)`
			`weight_y = 256 - (y1a & 0xFF);`
			`else if (y==y1d)`
			`weight_y = (y1b & 0xFF);`
			`}`

			`USHORT *dsrc2 = &dsrc[y*p1 + x1c]; // GHO`
			`for (int x=x1c; x<=x1d; x++)`
			`{`
			`UINT weight_x = 256;`
			`if (x1c != x1d)`
			`{`
			`if (x==x1c)`
			`weight_x = 256 - (x1a & 0xFF);`
			`else if (x==x1d)`
			`weight_x = (x1b & 0xFF);`
			`}`

			`UINT c = dsrc[y*p1 + x];`
			`UINT r_src = (c ) & 0x1F;`
			`UINT g_src = (c>> 5) & 0x3F;`
			`UINT b_src = (c>>11) & 0x1F;`
			`UINT w = (weight_x * weight_y) >> weight_shift;`
			`r += r_src * w;`
			`g += g_src * w;`
			`b += b_src * w;`
			`a += w;`
			`}`
			`}`

			`// write results`
			`UINT c = ((r/a) & 0x1F) \| (((g/a) << 5) & 0x7E0) \| (((b/a) << 11) & 0xF800);`
			`*ddest++ = c;`
			`}`
			`}`
			`}`
			`}`