1
0
mirror of https://github.com/DxWnd/DxWnd.reloaded synced 2024-12-30 09:25:35 +01:00
DxWnd.reloaded/filter/bilinear16_565.cpp

248 lines
8.8 KiB
C++
Raw Permalink Normal View History

#include <windows.h>
#include <xmmintrin.h>
#include <math.h>
// Per-column lookup tables cached between calls to Resize_HQ_2ch565.
// They are grown on demand and never released by the code in this file.

// Bilinear (enlarge on both axes) path, one entry per output column:
//   g_px1c[x] - integer source column (fixed-point position >> 8)
//   g_px1a[x] - 8-bit fraction of that position (fixed-point position & 0xFF)
static int *g_px1c = NULL;
static int *g_px1a = NULL;

// General (shrinking) path, two entries per output column:
//   g_px1ab[2*x+0], g_px1ab[2*x+1] - first/last contributing source position,
//   both in 8-bit fixed point.
static int *g_px1ab = NULL;

// Capacities, in output columns, of the tables above (0 = nothing allocated).
static int g_px1a_w = 0;
static int g_px1ab_w = 0;
/*
 * Resize_HQ_2ch565
 *
 * Scales a rectangle of 16-bit 5-6-5 pixels to the size of destrect.
 *
 *   src, srcpitch   - source surface base address and row pitch in bytes.
 *                     A pitch of 0 means "tightly packed": source width * 2.
 *   srcrect         - region of the source that is read; its left/top offset
 *                     is honoured.
 *   dest, destpitch - destination surface base address and row pitch in bytes.
 *                     A pitch of 0 means destination width * 2.
 *   destrect        - only its width and height are used: output is always
 *                     written starting at the first pixel of dest, i.e. the
 *                     dest rect is assumed to be the full surface.
 *
 * Two code paths:
 *   - enlarging on both axes: plain 2x2 bilinear interpolation;
 *   - anything else: weighted average of every source pixel that overlaps the
 *     output pixel.
 * Positions are kept in fixed point with 8 fractional bits.
 *
 * The per-column tables live in file-scope globals (g_px1a, g_px1c, g_px1ab)
 * that are reallocated here without any locking, so concurrent calls to this
 * function share that state.
 */
void WINAPI Resize_HQ_2ch565( unsigned char* src, RECT *srcrect, int srcpitch,
                              unsigned char* dest, RECT *destrect, int destpitch)
{
    // Both buffers must be in RGB 565 format.
    const int w1 = srcrect->right - srcrect->left;
    const int h1 = srcrect->bottom - srcrect->top;
    const int w2 = destrect->right - destrect->left;
    const int h2 = destrect->bottom - destrect->top;

    // Nothing sensible can be done with an empty or inverted rectangle
    // (and h2/w2 == 0 would otherwise be used as divisors below).
    if (w1 <= 0 || h1 <= 0 || w2 <= 0 || h2 <= 0)
        return;

    if (!srcpitch) srcpitch = w1 << 1;
    // The default dest pitch must come from the DEST width, not the source width.
    if (!destpitch) destpitch = w2 << 1;

    // GHO addition:
    // p1, p2: pitch of the source and dest surfaces expressed in 16-bit pixels,
    // that is pitch / sizeof(USHORT). Kept as int so large pitches are not truncated.
    // beware: current version can operate on displaced source rect, but assumes
    // the dest rect is always the full surface!!
    const int p1 = srcpitch >> 1;
    const int p2 = destpitch >> 1;
    USHORT *dsrc = (USHORT *)src + (srcrect->top * p1) + srcrect->left;

    // arbitrary resize.
    const bool bUpsampleX = (w1 < w2);
    const bool bUpsampleY = (h1 < h2);

    // Source step per output pixel, in 8-bit fixed point.
    const float fh = 256*h1/(float)h2;
    const float fw = 256*w1/(float)w2;

    if (bUpsampleX && bUpsampleY)
    {
        // faster to just do 2x2 bilinear interp here

        // cache the integer column (g_px1c) and fraction (g_px1a) for all the columns:
        // ...and your OS better have garbage collection on process exit :)
        if (g_px1a_w < w2)
        {
            delete [] g_px1a;
            delete [] g_px1c;
            // Reset first so a throwing new[] cannot leave stale pointers/capacity behind.
            g_px1a = NULL;
            g_px1c = NULL;
            g_px1a_w = 0;
            g_px1a = new int[w2*2];
            g_px1c = new int[w2*2];
            g_px1a_w = w2*2;
        }
        for (int x2=0; x2<w2; x2++)
        {
            // Left source column for this output column, clamped so that the
            // right-hand neighbour (column + 1) is still inside the source rect.
            int x1a = (int)(x2*fw);
            x1a = min(x1a, 256*(w1-1) - 1);
            if (x1a < 0) x1a = 0;   // only reachable when w1 == 1
            g_px1c[x2] = x1a >> 8;
            g_px1a[x2] = x1a & 0xFF;
        }

        // Offsets to the right / lower neighbour. A 1-pixel-wide or 1-pixel-tall
        // source has no such neighbour, so the same pixel is re-read instead.
        const int xstep = (w1 > 1) ? 1 : 0;
        const int ystep = (h1 > 1) ? p1 : 0;

        // FOR EVERY OUTPUT PIXEL
        // gsky916: Use OpenMP to speed up nested for loops (Enable OpenMP support in compiler).
        #pragma omp parallel for schedule(dynamic)
        for (int y2=0; y2<h2; y2++)
        {
            // Upper source row for this output row, clamped like the columns above.
            int y1a = (int)(y2*fh);
            y1a = min(y1a, 256*(h1-1) - 1);
            if (y1a < 0) y1a = 0;   // only reachable when h1 == 1
            const int y1c = y1a >> 8;
            y1a = y1a & 0xFF;

            // Weights of the upper and lower source row (they sum to 256).
            const UINT weight_y  = 256 - y1a;
            const UINT weight_y1 = y1a;

            // Row-local output cursor (declared inside the loop so each
            // OpenMP iteration has its own).
            USHORT *drow = &((USHORT *)dest)[y2*p2 + 0];

            for (int x2=0; x2<w2; x2++)
            {
                const int x1a = g_px1a[x2];
                const int x1c = g_px1c[x2];
                USHORT *dsrc2 = &dsrc[y1c*p1 + x1c]; // GHO

                // Weights of the left and right source column (they sum to 256).
                const UINT weight_x  = 256 - x1a;
                const UINT weight_x1 = x1a;

                // PERFORM BILINEAR INTERPOLATION on 2x2 pixels.
                // r/g/b are the low 5, middle 6 and high 5 bits of each pixel;
                // the four weights sum to 65536, hence the >>16 when packing.
                UINT r=0, g=0, b=0;

                // upper-left
                UINT c = (UINT)dsrc2[0]; // GHO
                UINT w = (weight_x * weight_y);
                r += ((c    ) & 0x1F) * w;
                g += ((c>> 5) & 0x3F) * w;
                b += ((c>>11) & 0x1F) * w;

                // upper-right
                c = (UINT)dsrc2[xstep]; // GHO
                w = (weight_x1 * weight_y);
                r += ((c    ) & 0x1F) * w;
                g += ((c>> 5) & 0x3F) * w;
                b += ((c>>11) & 0x1F) * w;

                // lower-left
                c = (UINT)dsrc2[ystep]; // GHO
                w = (weight_x * weight_y1);
                r += ((c    ) & 0x1F) * w;
                g += ((c>> 5) & 0x3F) * w;
                b += ((c>>11) & 0x1F) * w;

                // lower-right
                c = (UINT)dsrc2[ystep + xstep]; // GHO
                w = (weight_x1 * weight_y1);
                r += ((c    ) & 0x1F) * w;
                g += ((c>> 5) & 0x3F) * w;
                b += ((c>>11) & 0x1F) * w;

                UINT cc = ((r>>16) & 0x1F) | ((g>>(16-5)) & 0x7E0) | ((b>>(16-11)) & 0xF800);
                *drow++ = (USHORT)cc;
            }
        }
    }
    else // either downscale on vertical or horizontal direction ...
    {
        // If too many input pixels map to one output pixel, our 32-bit accumulation values
        // could overflow - so, if we have huge mappings like that, cut down the weights:
        //    256 max color value
        //   *256 weight_x
        //   *256 weight_y
        //   *256 (16*16) maximum # of input pixels (x,y) - unless we cut the weights down...
        //gsky916: only needed on this path, so it is computed here.
        int weight_shift = 0;
        float source_texels_per_out_pixel = (   (w1/(float)w2 + 1)
                                              * (h1/(float)h2 + 1)
                                            );
        float weight_per_pixel = source_texels_per_out_pixel * 256 * 256;  //weight_x * weight_y
        float accum_per_pixel = weight_per_pixel*256; //color value is 0-255
        float weight_div = accum_per_pixel / 4294967000.0f;
        if (weight_div > 1)
            weight_shift = (int)ceilf( logf((float)weight_div)/logf(2.0f) );
        weight_shift = min(15, weight_shift);  // this could go to 15 and still be ok.

        // cache x1a, x1b for all the columns:
        // ...and your OS better have garbage collection on process exit :)
        if (g_px1ab_w < w2)
        {
            delete [] g_px1ab;
            // Reset first so a throwing new[] cannot leave a stale pointer/capacity behind.
            g_px1ab = NULL;
            g_px1ab_w = 0;
            g_px1ab = new int[w2*2 * 2];
            g_px1ab_w = w2*2;
        }
        for (int x2=0; x2<w2; x2++)
        {
            // find the x-range of input pixels that will contribute:
            int x1a = (int)((x2  )*fw);
            int x1b = (int)((x2+1)*fw);
            if (bUpsampleX) // map to same pixel -> we want to interpolate between two pixels!
                x1b = x1a + 256;
            x1b = min(x1b, 256*w1 - 1);
            g_px1ab[x2*2+0] = x1a;
            g_px1ab[x2*2+1] = x1b;
        }

        // FOR EVERY OUTPUT PIXEL
        for (int y2=0; y2<h2; y2++)
        {
            // find the y-range of input pixels that will contribute:
            int y1a = (int)((y2  )*fh);
            int y1b = (int)((y2+1)*fh);
            if (bUpsampleY) // map to same pixel -> we want to interpolate between two pixels!
                y1b = y1a + 256;
            y1b = min(y1b, 256*h1 - 1);
            const int y1c = y1a >> 8;
            const int y1d = y1b >> 8;

            USHORT *drow = &((USHORT *)dest)[y2*p2 + 0];

            for (int x2=0; x2<w2; x2++)
            {
                // find the x-range of input pixels that will contribute:
                const int x1a = g_px1ab[x2*2+0];    // (computed earlier)
                const int x1b = g_px1ab[x2*2+1];    // (computed earlier)
                const int x1c = x1a >> 8;
                const int x1d = x1b >> 8;

                // ADD UP ALL INPUT PIXELS CONTRIBUTING TO THIS OUTPUT PIXEL:
                // a accumulates the total weight and is the divisor when packing.
                UINT r=0, g=0, b=0, a=0;
                for (int y=y1c; y<=y1d; y++)
                {
                    // First and last rows only count for the part that is covered.
                    UINT weight_y = 256;
                    if (y1c != y1d)
                    {
                        if (y==y1c)
                            weight_y = 256 - (y1a & 0xFF);
                        else if (y==y1d)
                            weight_y = (y1b & 0xFF);
                    }

                    for (int x=x1c; x<=x1d; x++)
                    {
                        // Same partial-coverage rule for the first and last columns.
                        UINT weight_x = 256;
                        if (x1c != x1d)
                        {
                            if (x==x1c)
                                weight_x = 256 - (x1a & 0xFF);
                            else if (x==x1d)
                                weight_x = (x1b & 0xFF);
                        }

                        UINT c = dsrc[y*p1 + x];
                        UINT r_src = (c    ) & 0x1F;
                        UINT g_src = (c>> 5) & 0x3F;
                        UINT b_src = (c>>11) & 0x1F;
                        UINT w = (weight_x * weight_y) >> weight_shift;
                        r += r_src * w;
                        g += g_src * w;
                        b += b_src * w;
                        a += w;
                    }
                }

                // write results
                UINT cc = ((r/a) & 0x1F) | (((g/a) << 5) & 0x7E0) | (((b/a) << 11) & 0xF800);
                *drow++ = (USHORT)cc;
            }
        }
    }
}