#include <windows.h>
#include <xmmintrin.h>
#include <math.h>

/*
 * Column lookup tables shared across calls to Resize_HQ_2ch565().
 * They are (re)allocated lazily by that function whenever the requested
 * destination width exceeds the recorded capacity, and are never freed.
 */

/* Bilinear (upscale) path: per destination column, the integer source
 * column (g_px1c) and the 8-bit fractional position (g_px1a).
 * g_px1a_w records the number of ints allocated for each of the two tables. */
static int *g_px1c = NULL;
static int *g_px1a = NULL;
static int g_px1a_w = 0;

/* Box-filter (downscale) path: per destination column, an interleaved
 * (start, end) pair of source x positions scaled by 256.
 * g_px1ab_w is the capacity marker the function compares against the
 * destination width before reusing the table. */
static int *g_px1ab = NULL;
static int g_px1ab_w = 0;

/*
 * Resize_HQ_2ch565 - resample a rectangle of a 16-bit RGB 565 surface into
 * another RGB 565 surface.
 *
 *   src, srcpitch   : source surface base pointer and its row stride in bytes.
 *                     A pitch of 0 means "tightly packed": source width * 2.
 *   srcrect         : region of the source surface to read.
 *   dest, destpitch : destination surface base pointer and row stride in bytes.
 *                     A pitch of 0 means "tightly packed": dest width * 2.
 *   destrect        : only its width and height are used; output is always
 *                     written starting at the first pixel of `dest`
 *                     (the destination rect origin is ignored).
 *
 * Two code paths:
 *   - both axes enlarged : 2x2 bilinear interpolation with 8-bit fractions;
 *   - otherwise          : weighted box filter over every source pixel that
 *                          an output pixel covers.
 *
 * Nothing is done when either rectangle has a non-positive width or height.
 *
 * Channel names r/g/b below are just labels for bits 0-4 / 5-10 / 11-15;
 * the three fields are processed identically and written back to the same
 * bit positions.
 *
 * NOTE: the column lookup tables are file-scope statics that are rewritten on
 * every call, so concurrent calls from several threads would race on them.
 */
void WINAPI Resize_HQ_2ch565( unsigned char* src, RECT *srcrect, int srcpitch,
                    unsigned char* dest, RECT *destrect, int destpitch)
{
    // Both buffers must be in RGB 565 format.

    int w1 = srcrect->right - srcrect->left;
    int h1 = srcrect->bottom - srcrect->top;
    int w2 = destrect->right - destrect->left;
    int h2 = destrect->bottom - destrect->top;

    // Empty or inverted rectangles: nothing to read or nothing to write.
    // Bailing out here also keeps the scale factors below free of divisions
    // by zero and the clamps free of negative limits.
    if (w1 <= 0 || h1 <= 0 || w2 <= 0 || h2 <= 0)
        return;

    // A zero pitch means the surface rows are tightly packed (2 bytes/pixel).
    // The destination default must be derived from the destination width.
    if (!srcpitch)  srcpitch  = w1 << 1;
    if (!destpitch) destpitch = w2 << 1;

    // GHO addition: new variables
    // p1, p2: pitches of the source and dest surfaces in 16-bit pixel units,
    // that is pitch / sizeof(USHORT). Kept as int so that large strides are
    // not truncated.
    // beware: current version can operate on displaced source rect, but assumes the dest rect is always the full surface!!
    int p1 = srcpitch / 2;
    int p2 = destpitch / 2;
    USHORT *dsrc = (USHORT *)src + (srcrect->top * p1) + srcrect->left;

    // arbitrary resize.

    bool bUpsampleX = (w1 < w2);
    bool bUpsampleY = (h1 < h2);

    // Source step per destination pixel, scaled by 256 (8 fractional bits).
    float fh = 256*h1/(float)h2;
    float fw = 256*w1/(float)w2;

    if (bUpsampleX && bUpsampleY)
    {
        // faster to just do 2x2 bilinear interp here

        // cache the integer column (g_px1c) and the fraction (g_px1a) for
        // all the columns; the tables are grown on demand and never freed.
        if (g_px1a_w < w2)
        {
            delete [] g_px1a;  g_px1a = NULL;
            delete [] g_px1c;  g_px1c = NULL;
            g_px1a_w = 0;
            g_px1a = new int[w2*2];
            g_px1c = new int[w2*2];
            g_px1a_w = w2*2;
        }

        // Highest usable fixed-point position: the 2x2 kernel also reads the
        // pixel to the right / below, so stop one pixel short of the edge.
        // For a 1-pixel-wide (or 1-pixel-high) source there is no neighbour:
        // clamp the position to 0 and make the neighbour offset 0 as well,
        // so the kernel never reads outside the source rect.
        int x_limit = 256*(w1-1) - 1;
        if (x_limit < 0) x_limit = 0;
        int y_limit = 256*(h1-1) - 1;
        if (y_limit < 0) y_limit = 0;
        const int right = (w1 > 1) ? 1  : 0;   // offset of the right neighbour
        const int below = (h1 > 1) ? p1 : 0;   // offset of the lower neighbour

        for (int x2=0; x2<w2; x2++)
        {
            int x1a = (int)(x2*fw);
            if (x1a > x_limit) x1a = x_limit;
            g_px1c[x2] = x1a >> 8;      // integer source column
            g_px1a[x2] = x1a & 0xFF;    // fraction = weight of right neighbour
        }

        // FOR EVERY OUTPUT PIXEL
        // gsky916: Use OpenMP to speed up nested for loops (Enable OpenMP support in compiler).
        // The lookup tables are only read inside the parallel region.
        #pragma omp parallel for schedule(dynamic)
        for (int y2=0; y2<h2; y2++)
        {
            int y1a = (int)(y2*fh);
            if (y1a > y_limit) y1a = y_limit;
            const int y1c = y1a >> 8;

            const UINT weight_y1 = (UINT)(y1a & 0xFF);  // lower row
            const UINT weight_y  = 256 - weight_y1;     // upper row

            const USHORT *srow = dsrc + y1c*p1;
            USHORT *drow = (USHORT *)dest + y2*p2;

            for (int x2=0; x2<w2; x2++)
            {
                const UINT weight_x1 = (UINT)g_px1a[x2];    // right column
                const UINT weight_x  = 256 - weight_x1;     // left column
                const USHORT *tap = srow + g_px1c[x2];

                // PERFORM BILINEAR INTERPOLATION on 2x2 pixels.
                // The four weights always sum to 256*256, so the final
                // normalisation is a plain shift by 16.
                UINT r = 0, g = 0, b = 0;
                UINT c, w;

                c = tap[0];                 // top-left
                w = weight_x * weight_y;
                r += ((c    ) & 0x1F) * w;
                g += ((c>> 5) & 0x3F) * w;
                b += ((c>>11) & 0x1F) * w;

                c = tap[right];             // top-right
                w = weight_x1 * weight_y;
                r += ((c    ) & 0x1F) * w;
                g += ((c>> 5) & 0x3F) * w;
                b += ((c>>11) & 0x1F) * w;

                c = tap[below];             // bottom-left
                w = weight_x * weight_y1;
                r += ((c    ) & 0x1F) * w;
                g += ((c>> 5) & 0x3F) * w;
                b += ((c>>11) & 0x1F) * w;

                c = tap[below + right];     // bottom-right
                w = weight_x1 * weight_y1;
                r += ((c    ) & 0x1F) * w;
                g += ((c>> 5) & 0x3F) * w;
                b += ((c>>11) & 0x1F) * w;

                // Divide by 65536 and move each field back to its bit position.
                UINT cc = ((r>>16) & 0x1F) | ((g>>(16-5)) & 0x7E0) | ((b>>(16-11)) & 0xF800);
                drow[x2] = (USHORT)cc;
            }
        }
    }
    else // either downscale on vertical or horizontal direction ...
    {
        // If too many input pixels map to one output pixel, our 32-bit accumulation values
        // could overflow - so, if we have huge mappings like that, cut down the weights:
        //    256 max color value (conservative upper bound for the 5/6-bit fields)
        //   *256 weight_x
        //   *256 weight_y
        //   * number of input pixels (x,y) per output pixel
        // gsky916: this is only needed on this path, so it is computed here.
        int weight_shift = 0;
        float source_texels_per_out_pixel = (   (w1/(float)w2 + 1)
                                              * (h1/(float)h2 + 1)
                                            );
        float weight_per_pixel = source_texels_per_out_pixel * 256 * 256;  //weight_x * weight_y
        float accum_per_pixel = weight_per_pixel*256;
        float weight_div = accum_per_pixel / 4294967000.0f;
        if (weight_div > 1)
            weight_shift = (int)ceilf( logf((float)weight_div)/logf(2.0f) );
        weight_shift = min(15, weight_shift);  // this could go to 15 and still be ok.

        // cache the (start, end) source positions for all the columns;
        // the table is grown on demand and never freed.
        if (g_px1ab_w < w2)
        {
            delete [] g_px1ab;  g_px1ab = NULL;
            g_px1ab_w = 0;
            g_px1ab = new int[w2*2 * 2];
            g_px1ab_w = w2*2;
        }
        for (int x2=0; x2<w2; x2++)
        {
            // find the x-range of input pixels that will contribute:
            int x1a = (int)((x2  )*fw);
            int x1b = (int)((x2+1)*fw);
            if (bUpsampleX) // map to same pixel -> we want to interpolate between two pixels!
                x1b = x1a + 256;
            x1b = min(x1b, 256*w1 - 1);
            g_px1ab[x2*2+0] = x1a;
            g_px1ab[x2*2+1] = x1b;
        }

        // FOR EVERY OUTPUT PIXEL
        for (int y2=0; y2<h2; y2++)
        {
            // find the y-range of input pixels that will contribute:
            int y1a = (int)((y2  )*fh);
            int y1b = (int)((y2+1)*fh);
            if (bUpsampleY) // map to same pixel -> we want to interpolate between two pixels!
                y1b = y1a + 256;
            y1b = min(y1b, 256*h1 - 1);
            int y1c = y1a >> 8;
            int y1d = y1b >> 8;

            USHORT *drow = (USHORT *)dest + y2*p2;

            for (int x2=0; x2<w2; x2++)
            {
                // find the x-range of input pixels that will contribute:
                int x1a = g_px1ab[x2*2+0];    // (computed earlier)
                int x1b = g_px1ab[x2*2+1];    // (computed earlier)
                int x1c = x1a >> 8;
                int x1d = x1b >> 8;

                // ADD UP ALL INPUT PIXELS CONTRIBUTING TO THIS OUTPUT PIXEL:
                // r/g/b accumulate weighted field values, a the total weight.
                UINT r=0, g=0, b=0, a=0;
                for (int y=y1c; y<=y1d; y++)
                {
                    // First and last rows are only partially covered;
                    // rows in between count in full.
                    UINT weight_y = 256;
                    if (y1c != y1d)
                    {
                        if (y==y1c)
                            weight_y = 256 - (y1a & 0xFF);
                        else if (y==y1d)
                            weight_y = (y1b & 0xFF);
                    }

                    const USHORT *srow = dsrc + y*p1; // GHO
                    for (int x=x1c; x<=x1d; x++)
                    {
                        // Same partial-coverage rule for the first/last column.
                        UINT weight_x = 256;
                        if (x1c != x1d)
                        {
                            if (x==x1c)
                                weight_x = 256 - (x1a & 0xFF);
                            else if (x==x1d)
                                weight_x = (x1b & 0xFF);
                        }

                        UINT c = srow[x];
                        UINT r_src = (c    ) & 0x1F;
                        UINT g_src = (c>> 5) & 0x3F;
                        UINT b_src = (c>>11) & 0x1F;
                        UINT w = (weight_x * weight_y) >> weight_shift;
                        r += r_src * w;
                        g += g_src * w;
                        b += b_src * w;
                        a += w;
                    }
                }

                // write results: normalise by the total weight and repack.
                UINT c = ((r/a) & 0x1F) | (((g/a) << 5) & 0x7E0) | (((b/a) << 11) & 0xF800);
                drow[x2] = (USHORT)c;
            }
        }
    }
}