From bd6b78f2680dd95320a6658851586164d7b0ff7b Mon Sep 17 00:00:00 2001 From: Sean Barrett Date: Sat, 20 Dec 2014 05:13:25 -0800 Subject: [PATCH] finalize scalar YCbCr conversion back-ported from SSE version; add missing STBIDEFs to a few functions; update documentation with full 1.49 info even though most isn't implemented yet --- stb_image.h | 152 ++++++++++++++++++++++++++++++--------------- tests/image_test.c | 10 +-- 2 files changed, 107 insertions(+), 55 deletions(-) diff --git a/stb_image.h b/stb_image.h index ef01a26..c701e3f 100644 --- a/stb_image.h +++ b/stb_image.h @@ -7,6 +7,8 @@ before you include this file in *one* C or C++ file to create the implementation. #define STBI_ASSERT(x) to avoid using assert.h. + #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using malloc,realloc,free + QUICK NOTES: Primarily of interest to game developers and other people who can @@ -27,7 +29,56 @@ - decode from arbitrary I/O callbacks - SIMD acceleration on x86/x64 - Latest revisions: + Full documentation under "DOCUMENTATION" below. + + + Revision 1.49 release notes: + + - The old STBI_SIMD system which allowed installing a user-defined + IDCT etc. has been removed. If you need this, don't upgrade. My + assumption is that almost nobody was doing this, and those who + were will find the next bullet item more satisfactory anyway. + + - x86 platforms now make use of SSE2 SIMD instructions if available. + This release is 2x faster on our test JPEGs, mostly due to SIMD. + This work was done by Fabian "ryg" Giesen. + + - Compilation of SIMD code can be suppressed with + #define STBI_NO_SIMD + It should not be necessary to disable it unless you have issues + compiling (e.g. using an x86 compiler which doesn't support SSE + intrinsics or that doesn't support the method used to detect + SSE2 support at run-time), and even those can be reported as + bugs so I can refine the built-in compile-time checking to be + smarter. + + - RGB values computed for JPEG images are slightly different from + previous versions of stb_image. (This is due to using less + integer precision in SIMD.) The C code has been adjusted so + that the same RGB values will be computed regardless of whether + SIMD support is available, so your app should always produce + consistent results. But these results are slightly different from + previous versions. (Specifically, about 3% of available YCbCr values + will compute different RGB results from pre-1.49 versions by +-1; + most of the deviating values are one smaller in the G channel.) + + - If you must produce consistent results with previous versions of + stb_image, #define STBI_JPEG_OLD and you will get the same results + you used to; however, you will not get the SIMD speedups for + the YCbCr-to-RGB conversion step (although you should still see + significant JPEG speedup from the other changes). + + Please note that STBI_JPEG_OLD is a temporary feature; it will be + removed in future versions of the library. It is only intended for + back-compatibility use. + + + Latest revision history: + 1.49 (2014-12-25) optimize JPG, incl. x86 SIMD + PGM/PPM support + allocation macros + stbi_load_into() -- load into pre-defined memory + STBI_MALLOC,STBI_REALLOC,STBI_FREE 1.48 (2014-12-14) fix incorrectly-named assert() 1.47 (2014-12-14) 1/2/4-bit PNG support (both grayscale and paletted) optimize PNG @@ -41,9 +92,6 @@ See end of file for full revision history. - TODO: - stbi_info support for BMP,PSD,HDR,PIC - ============================ Contributors ========================= @@ -79,6 +127,8 @@ #ifndef STBI_INCLUDE_STB_IMAGE_H #define STBI_INCLUDE_STB_IMAGE_H +// DOCUMENTATION +// // Limitations: // - no jpeg progressive support // - no 16-bit-per-channel PNG @@ -248,14 +298,6 @@ extern "C" { // load image by filename, open file, or memory buffer // -STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); - -#ifndef STBI_NO_STDIO -STBIDEF stbi_uc *stbi_load (char const *filename, int *x, int *y, int *comp, int req_comp); -STBIDEF stbi_uc *stbi_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); -// for stbi_load_from_file, file pointer is left pointing immediately after image -#endif - typedef struct { int (*read) (void *user,char *data,int size); // fill 'data' with 'size' bytes. return number of bytes actually read @@ -263,18 +305,26 @@ typedef struct int (*eof) (void *user); // returns nonzero if we are at end of file/data } stbi_io_callbacks; -STBIDEF stbi_uc *stbi_load_from_callbacks (stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp); +STBIDEF stbi_uc *stbi_load (char const *filename, int *x, int *y, int *comp, int req_comp); +STBIDEF stbi_uc *stbi_load_from_memory (stbi_uc const *buffer, int len , int *x, int *y, int *comp, int req_comp); +STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk , void *user, int *x, int *y, int *comp, int req_comp); + +#ifndef STBI_NO_STDIO +STBIDEF stbi_uc *stbi_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); +// for stbi_load_from_file, file pointer is left pointing immediately after image +#endif #ifndef STBI_NO_HDR - STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); - - #ifndef STBI_NO_STDIO - STBIDEF float *stbi_loadf (char const *filename, int *x, int *y, int *comp, int req_comp); - STBIDEF float *stbi_loadf_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); - #endif - + STBIDEF float *stbi_loadf (char const *filename, int *x, int *y, int *comp, int req_comp); + STBIDEF float *stbi_loadf_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); STBIDEF float *stbi_loadf_from_callbacks (stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp); + #ifndef STBI_NO_STDIO + STBIDEF float *stbi_loadf_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); + #endif +#endif + +#ifndef STBI_NO_HDR STBIDEF void stbi_hdr_to_ldr_gamma(float gamma); STBIDEF void stbi_hdr_to_ldr_scale(float scale); @@ -282,7 +332,7 @@ STBIDEF stbi_uc *stbi_load_from_callbacks (stbi_io_callbacks const *clbk, void STBIDEF void stbi_ldr_to_hdr_scale(float scale); #endif // STBI_NO_HDR -// stbi_is_hdr is always defined +// stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user); STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len); #ifndef STBI_NO_STDIO @@ -648,7 +698,7 @@ static FILE *stbi__fopen(char const *filename, char const *mode) } -STBIDEF unsigned char *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp) +STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp) { FILE *f = stbi__fopen(filename, "rb"); unsigned char *result; @@ -658,7 +708,7 @@ STBIDEF unsigned char *stbi_load(char const *filename, int *x, int *y, int *comp return result; } -STBIDEF unsigned char *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) +STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) { unsigned char *result; stbi__context s; @@ -672,14 +722,14 @@ STBIDEF unsigned char *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, i } #endif //!STBI_NO_STDIO -STBIDEF unsigned char *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) { stbi__context s; stbi__start_mem(&s,buffer,len); return stbi_load_main(&s,x,y,comp,req_comp); } -STBIDEF unsigned char *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp) +STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp) { stbi__context s; stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); @@ -740,7 +790,7 @@ STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_ // defined, for API simplicity; if STBI_NO_HDR is defined, it always // reports false! -int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len) +STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len) { #ifndef STBI_NO_HDR stbi__context s; @@ -792,11 +842,11 @@ STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f; static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f; -void stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; } -void stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; } +STBIDEF void stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; } +STBIDEF void stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; } -void stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; } -void stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; } +STBIDEF void stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; } +STBIDEF void stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; } #endif @@ -2112,10 +2162,10 @@ static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_ return out; } +#ifdef STBI_JPEG_OLD +// this is the same YCbCr-to-RGB calculation that stb_image has used +// historically before the algorithm changes in 1.49 #define float2fixed(x) ((int) ((x) * 65536 + 0.5)) - -// 0.38 seconds on 3*anemones.jpg (0.25 with processor = Pro) -// VC6 without processor=Pro is generating multiple LEAs per multiply! static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step) { int i; @@ -2140,10 +2190,11 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc out += step; } } - -#define float2fixed2(x) (((int) ((x) * 4096.0f + 0.5f)) << 8) - -static void stbi__YCbCr_to_RGB_backport(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step) +#else +// this is a reduced-precision calculation of YCbCr-to-RGB introduced +// to make sure the code produces the same results in both SIMD and scalar +#define float2fixed(x) (((int) ((x) * 4096.0f + 0.5f)) << 8) +static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step) { int i; for (i=0; i < count; ++i) { @@ -2151,11 +2202,9 @@ static void stbi__YCbCr_to_RGB_backport(stbi_uc *out, const stbi_uc *y, const st int r,g,b; int cr = pcr[i] - 128; int cb = pcb[i] - 128; - r = y_fixed + cr*float2fixed2(1.40200f); - g = y_fixed; - g += (cr*-float2fixed2(0.71414f)) & 0xffff0000; - g += (cb*-float2fixed2(0.34414f)) & 0xffff0000; - b = y_fixed + cb*float2fixed2(1.77200f); + r = y_fixed + cr* float2fixed(1.40200f); + g = y_fixed + (cr*-float2fixed(0.71414f)) + ((cb*-float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb* float2fixed(1.77200f); r >>= 20; g >>= 20; b >>= 20; @@ -2169,6 +2218,7 @@ static void stbi__YCbCr_to_RGB_backport(stbi_uc *out, const stbi_uc *y, const st out += step; } } +#endif #ifdef STBI_SSE2 static void stbi__YCbCr_to_RGB_sse2(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step) @@ -2182,7 +2232,7 @@ static void stbi__YCbCr_to_RGB_sse2(stbi_uc *out, stbi_uc const *y, stbi_uc cons // note: unlike the IDCT, this isn't bit-identical to the integer version. if (step == 4) { // this is a fairly straightforward implementation and not super-optimized. - __m128i signflip = _mm_set1_epi8(-0x80); + __m128i signflip = _mm_set1_epi8(-0x80); __m128i cr_const0 = _mm_set1_epi16( (short) ( 1.40200f*4096.0f+0.5f)); __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f)); __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f)); @@ -2237,16 +2287,16 @@ static void stbi__YCbCr_to_RGB_sse2(stbi_uc *out, stbi_uc const *y, stbi_uc cons } for (; i < count; ++i) { - int y_fixed = (y[i] << 16) + 32768; // rounding + int y_fixed = (y[i] << 20) + (1<<19); // rounding int r,g,b; int cr = pcr[i] - 128; int cb = pcb[i] - 128; - r = y_fixed + cr*float2fixed(1.40200f); - g = y_fixed - cr*float2fixed(0.71414f) - cb*float2fixed(0.34414f); - b = y_fixed + cb*float2fixed(1.77200f); - r >>= 16; - g >>= 16; - b >>= 16; + r = y_fixed + cr* float2fixed(1.40200f); + g = y_fixed + cr*-float2fixed(0.71414f) + ((cb*-float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb* float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } @@ -2269,7 +2319,9 @@ static void stbi__setup_jpeg(stbi__jpeg *j) #ifdef STBI_SSE2 if (stbi__sse2_available()) { j->idct_block_kernel = stbi__idct_sse2; + #ifndef STBI_JPEG_OLD j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_sse2; + #endif j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_sse2; } #endif diff --git a/tests/image_test.c b/tests/image_test.c index b3c8e1d..8f35150 100644 --- a/tests/image_test.c +++ b/tests/image_test.c @@ -9,7 +9,7 @@ #define PNGSUITE_PRIMARY -#if 1 +#if 0 void test_ycbcr(void) { STBI_SIMD_ALIGN(unsigned char, y[256]); @@ -28,15 +28,15 @@ void test_ycbcr(void) cb[k] = j; cr[k] = i; } - stbi__YCbCr_to_RGB_sse2(out1[0], y, cb, cr, 256, 4); - stbi__YCbCr_to_RGB_backport(out2[0], y, cb, cr, 256, 4); + stbi__YCbCr_to_RGB_row(out1[0], y, cb, cr, 256, 4); + stbi__YCbCr_to_RGB_sse2(out2[0], y, cb, cr, 256, 4); for (k=0; k < 256; ++k) { if (out1[k][0] != out2[k][0] || out1[k][1] != out2[k][1] || out1[k][2] != out2[k][2]) { int dist1 = abs(out1[k][0] - out2[k][0]); int dist2 = abs(out1[k][1] - out2[k][1]); int dist3 = abs(out1[k][2] - out2[k][2]); ++count; - if (dist2) + if (out1[k][1] > out2[k][1]) ++bigcount; } } @@ -50,7 +50,7 @@ void test_ycbcr(void) int main(int argc, char **argv) { int w,h; - test_ycbcr(); + //test_ycbcr(); if (argc > 1) { int i; for (i=1; i < argc; ++i) {