From f259bf27e9e465737d346ad188a62245bb71bd0b Mon Sep 17 00:00:00 2001 From: Sean Barrett Date: Fri, 19 Dec 2014 04:39:04 -0800 Subject: [PATCH] VC6: support using inline asm for cpuid YCbCr: switch SSE code to constants that match old C; create C version that is same as SSE; tiny optimization(?) of SSE --- stb_image.h | 99 +++++++++++++++++++++++++++++++++++----------- tests/image_test.c | 39 ++++++++++++++++++ 2 files changed, 114 insertions(+), 24 deletions(-) diff --git a/stb_image.h b/stb_image.h index 1831028..ef01a26 100644 --- a/stb_image.h +++ b/stb_image.h @@ -81,7 +81,8 @@ // Limitations: // - no jpeg progressive support -// - non-HDR formats support 8-bit samples only (jpeg, png) +// - no 16-bit-per-channel PNG +// - no 12-bit-per-channel jpeg // - no delayed line count (jpeg) -- IJG doesn't support either // - no 1-bit BMP // - GIF always returns *comp=4 @@ -196,8 +197,12 @@ // // SIMD support // -// The JPEG decoder will automatically use SIMD kernels where supported, -// replacing the STBI_SIMD-do-it-yourself interface from previous versions. +// The JPEG decoder will automatically use SIMD kernels on x86 platforms +// where supported. +// +// (The old do-it-yourself SIMD API is no longer supported in the current +// code.) +// // The code will automatically detect if the required SIMD instructions are // available, and fall back to the generic C version where they're not. // @@ -402,16 +407,35 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 
1 : -1]; #include <emmintrin.h> #ifdef _MSC_VER + +#if _MSC_VER >= 1400 // not VC6 #include <intrin.h> // __cpuid +static int stbi__cpuid3(void) +{ + int info[4]; + __cpuid(info,1); + return info[3]; +} +#else +static int stbi__cpuid3(void) +{ + int res; + __asm { + mov eax,1 + cpuid + mov res,edx + } + return res; +} +#endif + #define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name static int stbi__sse2_available() { - int info[4]; - __cpuid(info, 1); - return ((info[3] >> 26) & 1) != 0; + int info3 = stbi__cpuid3(); + return ((info3 >> 26) & 1) != 0; } - #else // assume GCC-style if not VC++ #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) @@ -2117,6 +2141,35 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc } } +#define float2fixed2(x) (((int) ((x) * 4096.0f + 0.5f)) << 8) + +static void stbi__YCbCr_to_RGB_backport(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step) +{ + int i; + for (i=0; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1<<19); // rounding + int r,g,b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr*float2fixed2(1.40200f); + g = y_fixed; + g += (cr*-float2fixed2(0.71414f)) & 0xffff0000; + g += (cb*-float2fixed2(0.34414f)) & 0xffff0000; + b = y_fixed + cb*float2fixed2(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; + out[3] = 255; + out += step; + } +} + #ifdef STBI_SSE2 static void stbi__YCbCr_to_RGB_sse2(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step) { @@ -2130,37 +2183,35 @@ static void stbi__YCbCr_to_RGB_sse2(stbi_uc *out, stbi_uc const *y, stbi_uc cons if (step == 4) { // this is a fairly straightforward implementation and not 
super-optimized. __m128i signflip = _mm_set1_epi8(-0x80); - __m128i cr_const0 = _mm_set1_epi16((short) ( 1.40200f*4096.0f)); - __m128i cr_const1 = _mm_set1_epi16((short) (-0.71414f*4096.0f)); - __m128i cb_const0 = _mm_set1_epi16((short) (-0.34414f*4096.0f)); - __m128i cb_const1 = _mm_set1_epi16((short) ( 1.77200f*4096.0f)); - __m128i y_bias = _mm_set1_epi16(8); - __m128i xw = _mm_set1_epi16(255); + __m128i cr_const0 = _mm_set1_epi16( (short) ( 1.40200f*4096.0f+0.5f)); + __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f)); + __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f)); + __m128i cb_const1 = _mm_set1_epi16( (short) ( 1.77200f*4096.0f+0.5f)); + __m128i y_bias = _mm_set1_epi8((char) 128); + __m128i xw = _mm_set1_epi16(255); // alpha channel for (; i+7 < count; i += 8) { // load - __m128i zero = _mm_setzero_si128(); __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i)); __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i)); __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i)); - __m128i cr_bias = _mm_xor_si128(cr_bytes, signflip); // -128 - __m128i cb_bias = _mm_xor_si128(cb_bytes, signflip); // -128 + __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 + __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 // unpack to short (and left-shift cr, cb by 8) - __m128i yw = _mm_unpacklo_epi8(y_bytes, zero); - __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_bias); - __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_bias); + __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes); + __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased); + __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased); // color transform - __m128i yws = _mm_slli_epi16(yw, 4); + __m128i yws = _mm_srli_epi16(yw, 4); __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw); __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw); - __m128i ywb = _mm_add_epi16(yws, y_bias); __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1); __m128i 
cr1 = _mm_mulhi_epi16(crw, cr_const1); - __m128i rws = _mm_add_epi16(cr0, ywb); - __m128i gwt = _mm_add_epi16(cb0, ywb); - __m128i bws = _mm_add_epi16(ywb, cb1); + __m128i rws = _mm_add_epi16(cr0, yws); + __m128i gwt = _mm_add_epi16(cb0, yws); + __m128i bws = _mm_add_epi16(yws, cb1); __m128i gws = _mm_add_epi16(gwt, cr1); // descale diff --git a/tests/image_test.c b/tests/image_test.c index 5971704..b3c8e1d 100644 --- a/tests/image_test.c +++ b/tests/image_test.c @@ -9,9 +9,48 @@ #define PNGSUITE_PRIMARY +#if 1 +void test_ycbcr(void) +{ + STBI_SIMD_ALIGN(unsigned char, y[256]); + STBI_SIMD_ALIGN(unsigned char, cb[256]); + STBI_SIMD_ALIGN(unsigned char, cr[256]); + STBI_SIMD_ALIGN(unsigned char, out1[256][4]); + STBI_SIMD_ALIGN(unsigned char, out2[256][4]); + + int i,j,k; + int count = 0, bigcount=0; + + for (i=0; i < 256; ++i) { + for (j=0; j < 256; ++j) { + for (k=0; k < 256; ++k) { + y[k] = k; + cb[k] = j; + cr[k] = i; + } + stbi__YCbCr_to_RGB_sse2(out1[0], y, cb, cr, 256, 4); + stbi__YCbCr_to_RGB_backport(out2[0], y, cb, cr, 256, 4); + for (k=0; k < 256; ++k) { + if (out1[k][0] != out2[k][0] || out1[k][1] != out2[k][1] || out1[k][2] != out2[k][2]) { + int dist1 = abs(out1[k][0] - out2[k][0]); + int dist2 = abs(out1[k][1] - out2[k][1]); + int dist3 = abs(out1[k][2] - out2[k][2]); + ++count; + if (dist2) + ++bigcount; + } + } + } + printf("So far: %d (%d big)\n", count, bigcount); + } + printf("Final: %d (%d big)\n", count, bigcount); +} +#endif + int main(int argc, char **argv) { int w,h; + test_ycbcr(); if (argc > 1) { int i; for (i=1; i < argc; ++i) {