From 49c7f1b397507d1af08073713673a7cda8eb954b Mon Sep 17 00:00:00 2001 From: Fabian Giesen Date: Sat, 22 Jul 2017 18:43:36 -0700 Subject: [PATCH] stb_image: Optimise vertical flip. This incorporates #462, but also factors everything into one function that is shared between 8-bit integer, 16-bit integer, and float pixels (vertical flip operates on rows of bytes and doesn't really care), and finally always uses a 2k on-stack buffer without dynamic memory allocation, doing multiple memcpys per row if necessary. Not only does this remove an out-of-memory failure mode, it is also preferable for large images, since it's more L1-cache-firendly this way. Fixes #462. --- stb_image.h | 72 +++++++++++++++++++++-------------------------------- 1 file changed, 29 insertions(+), 43 deletions(-) diff --git a/stb_image.h b/stb_image.h index 2e0b4c5..19a7f97 100644 --- a/stb_image.h +++ b/stb_image.h @@ -83,6 +83,7 @@ RECENT REVISION HISTORY: Optimizations & bugfixes Fabian "ryg" Giesen Arseny Kapoulkine + John-Mark Allen Bug & warning fixes Marc LeBlanc David Woo Guillaume George Martins Mozeiko @@ -1031,6 +1032,30 @@ static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int chan return enlarged; } +static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel) +{ + int row; + size_t bytes_per_row = (size_t)w * bytes_per_pixel; + stbi_uc temp[2048]; + stbi_uc *bytes = (stbi_uc *)image; + + for (row = 0; row < (h>>1); row++) { + stbi_uc *row0 = bytes + row*bytes_per_row; + stbi_uc *row1 = bytes + (h - row - 1)*bytes_per_row; + // swap row0 with row1 + size_t bytes_left = bytes_per_row; + while (bytes_left) { + size_t bytes_copy = (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp); + memcpy(temp, row0, bytes_copy); + memcpy(row0, row1, bytes_copy); + memcpy(row1, temp, bytes_copy); + row0 += bytes_copy; + row1 += bytes_copy; + bytes_left -= bytes_copy; + } + } +} + static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp) { stbi__result_info ri; @@ -1048,21 +1073,8 @@ static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, // @TODO: move stbi__convert_format to here if (stbi__vertically_flip_on_load) { - int w = *x, h = *y; int channels = req_comp ? req_comp : *comp; - int row,col,z; - stbi_uc *image = (stbi_uc *) result; - - // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once - for (row = 0; row < (h>>1); row++) { - for (col = 0; col < w; col++) { - for (z = 0; z < channels; z++) { - stbi_uc temp = image[(row * w + col) * channels + z]; - image[(row * w + col) * channels + z] = image[((h - row - 1) * w + col) * channels + z]; - image[((h - row - 1) * w + col) * channels + z] = temp; - } - } - } + stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc)); } return (unsigned char *) result; @@ -1086,21 +1098,8 @@ static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision if (stbi__vertically_flip_on_load) { - int w = *x, h = *y; int channels = req_comp ? req_comp : *comp; - int row,col,z; - stbi__uint16 *image = (stbi__uint16 *) result; - - // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once - for (row = 0; row < (h>>1); row++) { - for (col = 0; col < w; col++) { - for (z = 0; z < channels; z++) { - stbi__uint16 temp = image[(row * w + col) * channels + z]; - image[(row * w + col) * channels + z] = image[((h - row - 1) * w + col) * channels + z]; - image[((h - row - 1) * w + col) * channels + z] = temp; - } - } - } + stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16)); } return (stbi__uint16 *) result; @@ -1110,21 +1109,8 @@ static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp) { if (stbi__vertically_flip_on_load && result != NULL) { - int w = *x, h = *y; - int depth = req_comp ? req_comp : *comp; - int row,col,z; - float temp; - - // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once - for (row = 0; row < (h>>1); row++) { - for (col = 0; col < w; col++) { - for (z = 0; z < depth; z++) { - temp = result[(row * w + col) * depth + z]; - result[(row * w + col) * depth + z] = result[((h - row - 1) * w + col) * depth + z]; - result[((h - row - 1) * w + col) * depth + z] = temp; - } - } - } + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(float)); } } #endif