Cache the filter-width-derived computations that use floor/ceil, because they were showing up surprisingly heavily in the VC6 profile despite only being called roughly once per scanline.

Unroll the inner per-channel loop in stbir__resample_horizontal_upsample (special cases for 1-4 channels, generic fallback otherwise).
This commit is contained in:
Sean Barrett 2014-09-13 07:29:04 -07:00
parent 972456cb63
commit 9c2ae9bdb8

View File

@ -488,6 +488,16 @@ typedef struct
float* horizontal_buffer;
// cache these because ceil/floor are inexplicably showing up in profile
int horizontal_coefficient_width;
int vertical_coefficient_width;
int horizontal_filter_pixel_width;
int vertical_filter_pixel_width;
int horizontal_filter_pixel_margin;
int vertical_filter_pixel_margin;
int horizontal_num_contributors;
int vertical_num_contributors;
int ring_buffer_length_bytes; // The length of an individual entry in the ring buffer. The total number of ring buffers is stbir__get_filter_pixel_width(filter)
int ring_buffer_first_scanline;
int ring_buffer_last_scanline;
@ -749,7 +759,7 @@ stbir__inline static int stbir__use_height_upsampling(stbir__info* stbir_info)
// This is the maximum number of input samples that can affect an output sample
// with the given filter
stbir__inline static int stbir__get_filter_pixel_width(stbir_filter filter, float scale)
static int stbir__get_filter_pixel_width(stbir_filter filter, float scale)
{
STBIR_ASSERT(filter != 0);
STBIR_ASSERT(filter < STBIR__ARRAY_SIZE(stbir__filter_info_table));
@ -760,34 +770,14 @@ stbir__inline static int stbir__get_filter_pixel_width(stbir_filter filter, floa
return (int)ceil(stbir__filter_info_table[filter].support(scale) * 2 / scale);
}
// Convenience wrapper: returns stbir__get_filter_pixel_width() evaluated with
// stbir_info's horizontal filter and horizontal scale. The value is recomputed
// on every call; nothing is cached here.
// NOTE(review): call sites elsewhere in this diff switch to the
// stbir_info->horizontal_filter_pixel_width field, so this helper appears to be
// deleted by the commit -- confirm against the original diff.
stbir__inline static int stbir__get_filter_pixel_width_horizontal(stbir__info* stbir_info)
{
return stbir__get_filter_pixel_width(stbir_info->horizontal_filter, stbir_info->horizontal_scale);
}
// Convenience wrapper: returns stbir__get_filter_pixel_width() evaluated with
// stbir_info's vertical filter and vertical scale. The value is recomputed on
// every call; nothing is cached here.
// NOTE(review): call sites elsewhere in this diff switch to the
// stbir_info->vertical_filter_pixel_width field, so this helper appears to be
// deleted by the commit -- confirm against the original diff.
stbir__inline static int stbir__get_filter_pixel_width_vertical(stbir__info* stbir_info)
{
return stbir__get_filter_pixel_width(stbir_info->vertical_filter, stbir_info->vertical_scale);
}
// This is how much to expand buffers to account for filters seeking outside
// the image boundaries.
// Returns half of stbir__get_filter_pixel_width(filter, scale); the division is
// integer division, so the result is truncated.
// NOTE(review): two signature lines follow (with and without stbir__inline).
// They look like the old and new forms of a single line from a diff whose +/-
// markers were stripped -- confirm against the original diff.
stbir__inline static int stbir__get_filter_pixel_margin(stbir_filter filter, float scale)
static int stbir__get_filter_pixel_margin(stbir_filter filter, float scale)
{
return stbir__get_filter_pixel_width(filter, scale) / 2;
}
// Horizontal filter margin: half (integer division) of the filter pixel width
// computed from stbir_info's horizontal filter and horizontal scale.
// Recomputed on every call.
// NOTE(review): call sites elsewhere in this diff switch to the
// stbir_info->horizontal_filter_pixel_margin field, so this helper appears to
// be deleted by the commit -- confirm against the original diff.
stbir__inline static int stbir__get_filter_pixel_margin_horizontal(stbir__info* stbir_info)
{
return stbir__get_filter_pixel_width(stbir_info->horizontal_filter, stbir_info->horizontal_scale) / 2;
}
// Vertical filter margin: half (integer division) of the filter pixel width
// computed from stbir_info's vertical filter and vertical scale.
// Recomputed on every call.
// NOTE(review): call sites elsewhere in this diff switch to the
// stbir_info->vertical_filter_pixel_margin field, so this helper appears to be
// deleted by the commit -- confirm against the original diff.
stbir__inline static int stbir__get_filter_pixel_margin_vertical(stbir__info* stbir_info)
{
return stbir__get_filter_pixel_width(stbir_info->vertical_filter, stbir_info->vertical_scale) / 2;
}
stbir__inline static int stbir__get_coefficient_width(stbir_filter filter, float scale)
static int stbir__get_coefficient_width(stbir_filter filter, float scale)
{
if (stbir__use_upsampling(scale))
return (int)ceil(stbir__filter_info_table[filter].support(1 / scale) * 2);
@ -795,7 +785,7 @@ stbir__inline static int stbir__get_coefficient_width(stbir_filter filter, float
return (int)ceil(stbir__filter_info_table[filter].support(scale) * 2);
}
stbir__inline static int stbir__get_contributors(float scale, stbir_filter filter, int input_size, int output_size)
static int stbir__get_contributors(float scale, stbir_filter filter, int input_size, int output_size)
{
if (stbir__use_upsampling(scale))
return output_size;
@ -803,25 +793,15 @@ stbir__inline static int stbir__get_contributors(float scale, stbir_filter filte
return (input_size + stbir__get_filter_pixel_margin(filter, scale) * 2);
}
// The body forwards info's horizontal scale, horizontal filter, input width and
// output width to stbir__get_contributors() and returns its result.
// NOTE(review): two different signature lines precede the body. This looks like
// diff residue with the +/- markers stripped: the first line would be the
// pre-commit stbir__get_horizontal_contributors signature, and the second the
// post-commit stbir__get_total_horizontal_coefficients signature, whose body
// appears further down -- confirm against the original diff.
stbir__inline static int stbir__get_horizontal_contributors(stbir__info* info)
static int stbir__get_total_horizontal_coefficients(stbir__info* info)
{
return stbir__get_contributors(info->horizontal_scale, info->horizontal_filter, info->input_w, info->output_w);
}
// Convenience wrapper: forwards info's vertical scale, vertical filter, input
// height and output height to stbir__get_contributors() and returns its result.
// NOTE(review): other lines in this diff read info->vertical_num_contributors
// instead, so this helper appears to be deleted by the commit -- confirm
// against the original diff.
stbir__inline static int stbir__get_vertical_contributors(stbir__info* info)
{
return stbir__get_contributors(info->vertical_scale, info->vertical_filter, info->input_h, info->output_h);
}
// Total number of horizontal coefficients: the number of horizontal
// contributors multiplied by the coefficient width for the horizontal filter
// and scale (stbir__get_coefficient_width).
// NOTE(review): two `return` lines appear back to back. They look like the old
// form (calling stbir__get_horizontal_contributors) and the new form (reading
// info->horizontal_num_contributors) of one line from a diff whose +/- markers
// were stripped; only one of them belongs to any real version of the function
// -- confirm against the original diff.
stbir__inline static int stbir__get_total_horizontal_coefficients(stbir__info* info)
{
return stbir__get_horizontal_contributors(info)
return info->horizontal_num_contributors
* stbir__get_coefficient_width (info->horizontal_filter, info->horizontal_scale);
}
// Total number of vertical coefficients: the number of vertical contributors
// multiplied by the coefficient width for the vertical filter and scale
// (stbir__get_coefficient_width).
// NOTE(review): the duplicated signature lines (with and without
// stbir__inline) and the two back-to-back `return` lines look like old/new
// pairs from a diff whose +/- markers were stripped; the new form reads
// info->vertical_num_contributors instead of calling
// stbir__get_vertical_contributors -- confirm against the original diff.
stbir__inline static int stbir__get_total_vertical_coefficients(stbir__info* info)
static int stbir__get_total_vertical_coefficients(stbir__info* info)
{
return stbir__get_vertical_contributors(info)
return info->vertical_num_contributors
* stbir__get_coefficient_width (info->vertical_filter, info->vertical_scale);
}
@ -832,13 +812,13 @@ stbir__inline static stbir__contributors* stbir__get_contributor(stbir__contribu
// Returns the n-th entry of stbir_info->horizontal_contributors via
// stbir__get_contributor(). In debug builds, asserts that n is in
// [0, number of horizontal contributors).
// NOTE(review): the two assert lines check the same bound; they look like the
// old form (recomputing the count through stbir__get_horizontal_contributors)
// and the new form (reading the horizontal_num_contributors field) of one line
// from a diff whose +/- markers were stripped -- confirm.
stbir__inline static stbir__contributors* stbir__get_horizontal_contributor(stbir__info* stbir_info, int n)
{
STBIR__DEBUG_ASSERT(n >= 0 && n < stbir__get_horizontal_contributors(stbir_info));
STBIR__DEBUG_ASSERT(n >= 0 && n < stbir_info->horizontal_num_contributors);
return stbir__get_contributor(stbir_info->horizontal_contributors, n);
}
// Returns the n-th entry of stbir_info->vertical_contributors via
// stbir__get_contributor(). In debug builds, asserts that n is in
// [0, number of vertical contributors).
// NOTE(review): the two assert lines check the same bound; they look like the
// old form (recomputing the count through stbir__get_vertical_contributors)
// and the new form (reading the vertical_num_contributors field) of one line
// from a diff whose +/- markers were stripped -- confirm.
stbir__inline static stbir__contributors* stbir__get_vertical_contributor(stbir__info* stbir_info, int n)
{
STBIR__DEBUG_ASSERT(n >= 0 && n < stbir__get_vertical_contributors(stbir_info));
STBIR__DEBUG_ASSERT(n >= 0 && n < stbir_info->vertical_num_contributors);
return stbir__get_contributor(stbir_info->vertical_contributors, n);
}
@ -850,20 +830,6 @@ stbir__inline static float* stbir__get_coefficient(float* coefficients, stbir_fi
return &coefficients[width*n + c];
}
// Returns a pointer into stbir_info->horizontal_coefficients for indices
// (n, c), obtained from stbir__get_coefficient() with the horizontal filter
// and scale.
// In debug builds, asserts that c is in [0, horizontal coefficient width) and
// that n is in [0, total horizontal coefficients). Both bounds are recomputed
// through helper calls each time the asserts are evaluated.
// NOTE(review): no caller of this helper is visible in this diff, and it
// appears to be deleted by the commit -- confirm against the original diff.
stbir__inline static float* stbir__get_horizontal_coefficient(stbir__info* stbir_info, int n, int c)
{
STBIR__DEBUG_ASSERT(c >= 0 && c < stbir__get_coefficient_width(stbir_info->horizontal_filter, stbir_info->horizontal_scale));
STBIR__DEBUG_ASSERT(n >= 0 && n < stbir__get_total_horizontal_coefficients(stbir_info));
return stbir__get_coefficient(stbir_info->horizontal_coefficients, stbir_info->horizontal_filter, stbir_info->horizontal_scale, n, c);
}
// Returns a pointer into stbir_info->vertical_coefficients for indices (n, c),
// obtained from stbir__get_coefficient() with the vertical filter and scale.
// In debug builds, asserts that c is in [0, vertical coefficient width) and
// that n is in [0, total vertical coefficients). Both bounds are recomputed
// through helper calls each time the asserts are evaluated.
// NOTE(review): no caller of this helper is visible in this diff, and it
// appears to be deleted by the commit -- confirm against the original diff.
stbir__inline static float* stbir__get_vertical_coefficient(stbir__info* stbir_info, int n, int c)
{
STBIR__DEBUG_ASSERT(c >= 0 && c < stbir__get_coefficient_width(stbir_info->vertical_filter, stbir_info->vertical_scale));
STBIR__DEBUG_ASSERT(n >= 0 && n < stbir__get_total_vertical_coefficients(stbir_info));
return stbir__get_coefficient(stbir_info->vertical_coefficients, stbir_info->vertical_filter, stbir_info->vertical_scale, n, c);
}
static int stbir__edge_wrap_slow(stbir_edge edge, int n, int max)
{
switch (edge)
@ -1081,7 +1047,7 @@ static void stbir__normalize_downsample_coefficients(stbir__info* stbir_info, st
// Do this after normalizing because normalization depends on the n0/n1 values.
for (j = 0; j < num_contributors; j++)
{
int range, max;
int range, max, width;
skip = 0;
while (*stbir__get_coefficient(coefficients, filter, scale_ratio, j, skip) == 0)
@ -1098,9 +1064,10 @@ static void stbir__normalize_downsample_coefficients(stbir__info* stbir_info, st
range = contributors[j].n1 - contributors[j].n0 + 1;
max = stbir__min(num_coefficients, range);
width = stbir__get_coefficient_width(filter, scale_ratio);
for (i = 0; i < max; i++)
{
if (i + skip >= stbir__get_coefficient_width(filter, scale_ratio))
if (i + skip >= width)
break;
*stbir__get_coefficient(coefficients, filter, scale_ratio, j, i) = *stbir__get_coefficient(coefficients, filter, scale_ratio, j, i + skip);
@ -1160,7 +1127,7 @@ static float* stbir__get_decode_buffer(stbir__info* stbir_info)
{
// The 0 index of the decode buffer starts after the margin. This makes
// it okay to use negative indexes on the decode buffer.
return &stbir_info->decode_buffer[stbir__get_filter_pixel_margin_horizontal(stbir_info) * stbir_info->channels];
return &stbir_info->decode_buffer[stbir_info->horizontal_filter_pixel_margin * stbir_info->channels];
}
#define STBIR__DECODE(type, colorspace) ((type) * (STBIR_MAX_COLORSPACES) + (colorspace))
@ -1179,10 +1146,10 @@ static void stbir__decode_scanline(stbir__info* stbir_info, int n)
stbir_edge edge_vertical = stbir_info->edge_vertical;
int in_buffer_row_offset = stbir__edge_wrap(edge_vertical, n, stbir_info->input_h) * input_stride_bytes;
const void* input_data = (char *) stbir_info->input_data + in_buffer_row_offset;
int max_x = input_w + stbir__get_filter_pixel_margin_horizontal(stbir_info);
int max_x = input_w + stbir_info->horizontal_filter_pixel_margin;
int decode = STBIR__DECODE(type, colorspace);
int x = -stbir__get_filter_pixel_margin_horizontal(stbir_info);
int x = -stbir_info->horizontal_filter_pixel_margin;
// special handling for STBIR_EDGE_ZERO because it needs to return an item that doesn't appear in the input,
// and we want to avoid paying overhead on every pixel if not STBIR_EDGE_ZERO
@ -1296,7 +1263,7 @@ static void stbir__decode_scanline(stbir__info* stbir_info, int n)
if (!(stbir_info->flags & STBIR_FLAG_ALPHA_PREMULTIPLIED))
{
for (x = -stbir__get_filter_pixel_margin_horizontal(stbir_info); x < max_x; x++)
for (x = -stbir_info->horizontal_filter_pixel_margin; x < max_x; x++)
{
int decode_pixel_index = x * channels;
@ -1320,7 +1287,7 @@ static void stbir__decode_scanline(stbir__info* stbir_info, int n)
if (edge_horizontal == STBIR_EDGE_ZERO)
{
for (x = -stbir__get_filter_pixel_margin_horizontal(stbir_info); x < 0; x++)
for (x = -stbir_info->horizontal_filter_pixel_margin; x < 0; x++)
{
for (c = 0; c < channels; c++)
decode_buffer[x*channels + c] = 0;
@ -1350,7 +1317,7 @@ static float* stbir__add_empty_ring_buffer_entry(stbir__info* stbir_info, int n)
}
else
{
ring_buffer_index = (stbir_info->ring_buffer_begin_index + (stbir_info->ring_buffer_last_scanline - stbir_info->ring_buffer_first_scanline) + 1) % stbir__get_filter_pixel_width_vertical(stbir_info);
ring_buffer_index = (stbir_info->ring_buffer_begin_index + (stbir_info->ring_buffer_last_scanline - stbir_info->ring_buffer_first_scanline) + 1) % stbir_info->vertical_filter_pixel_width;
STBIR__DEBUG_ASSERT(ring_buffer_index != stbir_info->ring_buffer_begin_index);
}
@ -1367,12 +1334,12 @@ static void stbir__resample_horizontal_upsample(stbir__info* stbir_info, int n,
{
int x, k;
int output_w = stbir_info->output_w;
int kernel_pixel_width = stbir__get_filter_pixel_width_horizontal(stbir_info);
int kernel_pixel_width = stbir_info->horizontal_filter_pixel_width;
int channels = stbir_info->channels;
float* decode_buffer = stbir__get_decode_buffer(stbir_info);
stbir__contributors* horizontal_contributors = stbir_info->horizontal_contributors;
float* horizontal_coefficients = stbir_info->horizontal_coefficients;
int coefficient_width = stbir__get_coefficient_width(stbir_info->horizontal_filter, stbir_info->horizontal_scale);
int coefficient_width = stbir_info->horizontal_coefficient_width;
for (x = 0; x < output_w; x++)
{
@ -1384,21 +1351,65 @@ static void stbir__resample_horizontal_upsample(stbir__info* stbir_info, int n,
int coefficient_counter = 0;
STBIR__DEBUG_ASSERT(n1 >= n0);
STBIR__DEBUG_ASSERT(n0 >= -stbir__get_filter_pixel_margin_horizontal(stbir_info));
STBIR__DEBUG_ASSERT(n1 >= -stbir__get_filter_pixel_margin_horizontal(stbir_info));
STBIR__DEBUG_ASSERT(n0 < stbir_info->input_w + stbir__get_filter_pixel_margin_horizontal(stbir_info));
STBIR__DEBUG_ASSERT(n1 < stbir_info->input_w + stbir__get_filter_pixel_margin_horizontal(stbir_info));
STBIR__DEBUG_ASSERT(n0 >= -stbir_info->horizontal_filter_pixel_margin);
STBIR__DEBUG_ASSERT(n1 >= -stbir_info->horizontal_filter_pixel_margin);
STBIR__DEBUG_ASSERT(n0 < stbir_info->input_w + stbir_info->horizontal_filter_pixel_margin);
STBIR__DEBUG_ASSERT(n1 < stbir_info->input_w + stbir_info->horizontal_filter_pixel_margin);
for (k = n0; k <= n1; k++)
{
int in_pixel_index = k * channels;
float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
int c;
STBIR__DEBUG_ASSERT(coefficient != 0);
for (c = 0; c < channels; c++)
output_buffer[out_pixel_index + c] += decode_buffer[in_pixel_index + c] * coefficient;
switch (channels) {
case 1:
for (k = n0; k <= n1; k++)
{
int in_pixel_index = k * 1;
float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
STBIR__DEBUG_ASSERT(coefficient != 0);
output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
}
break;
case 2:
for (k = n0; k <= n1; k++)
{
int in_pixel_index = k * 2;
float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
STBIR__DEBUG_ASSERT(coefficient != 0);
output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
}
break;
case 3:
for (k = n0; k <= n1; k++)
{
int in_pixel_index = k * 3;
float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
STBIR__DEBUG_ASSERT(coefficient != 0);
output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
}
break;
case 4:
for (k = n0; k <= n1; k++)
{
int in_pixel_index = k * 4;
float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
STBIR__DEBUG_ASSERT(coefficient != 0);
output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
output_buffer[out_pixel_index + 3] += decode_buffer[in_pixel_index + 3] * coefficient;
}
break;
default:
for (k = n0; k <= n1; k++)
{
int in_pixel_index = k * channels;
float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
int c;
STBIR__DEBUG_ASSERT(coefficient != 0);
for (c = 0; c < channels; c++)
output_buffer[out_pixel_index + c] += decode_buffer[in_pixel_index + c] * coefficient;
}
break;
}
}
}
@ -1408,13 +1419,13 @@ static void stbir__resample_horizontal_downsample(stbir__info* stbir_info, int n
int x, k;
int input_w = stbir_info->input_w;
int output_w = stbir_info->output_w;
int kernel_pixel_width = stbir__get_filter_pixel_width_horizontal(stbir_info);
int kernel_pixel_width = stbir_info->horizontal_filter_pixel_width;
int channels = stbir_info->channels;
float* decode_buffer = stbir__get_decode_buffer(stbir_info);
stbir__contributors* horizontal_contributors = stbir_info->horizontal_contributors;
float* horizontal_coefficients = stbir_info->horizontal_coefficients;
int coefficient_width = stbir__get_coefficient_width(stbir_info->horizontal_filter, stbir_info->horizontal_scale);
int filter_pixel_margin = stbir__get_filter_pixel_margin_horizontal(stbir_info);
int coefficient_width = stbir_info->horizontal_coefficient_width;
int filter_pixel_margin = stbir_info->horizontal_filter_pixel_margin;
int max_x = input_w + filter_pixel_margin * 2;
STBIR__DEBUG_ASSERT(!stbir__use_width_upsampling(stbir_info));
@ -1687,11 +1698,11 @@ static void stbir__resample_vertical_upsample(stbir__info* stbir_info, int n, in
int alpha_channel = stbir_info->alpha_channel;
int type = stbir_info->type;
int colorspace = stbir_info->colorspace;
int kernel_pixel_width = stbir__get_filter_pixel_width_vertical(stbir_info);
int kernel_pixel_width = stbir_info->vertical_filter_pixel_width;
void* output_data = stbir_info->output_data;
float* encode_buffer = stbir_info->encode_buffer;
int decode = STBIR__DECODE(type, colorspace);
int coefficient_width = stbir__get_coefficient_width(stbir_info->vertical_filter, stbir_info->vertical_scale);
int coefficient_width = stbir_info->vertical_coefficient_width;
int contributor = n;
float* ring_buffer = stbir_info->ring_buffer;
@ -1739,11 +1750,11 @@ static void stbir__resample_vertical_downsample(stbir__info* stbir_info, int n,
stbir__contributors* vertical_contributors = stbir_info->vertical_contributors;
float* vertical_coefficients = stbir_info->vertical_coefficients;
int channels = stbir_info->channels;
int kernel_pixel_width = stbir__get_filter_pixel_width_vertical(stbir_info);
int kernel_pixel_width = stbir_info->vertical_filter_pixel_width;
void* output_data = stbir_info->output_data;
float* horizontal_buffer = stbir_info->horizontal_buffer;
int coefficient_width = stbir__get_coefficient_width(stbir_info->vertical_filter, stbir_info->vertical_scale);
int contributor = n + stbir__get_filter_pixel_margin_vertical(stbir_info);
int coefficient_width = stbir_info->vertical_coefficient_width;
int contributor = n + stbir_info->vertical_filter_pixel_margin;
float* ring_buffer = stbir_info->ring_buffer;
int ring_buffer_begin_index = stbir_info->ring_buffer_begin_index;
@ -1829,7 +1840,7 @@ static void stbir__buffer_loop_upsample(stbir__info* stbir_info)
stbir__calculate_sample_range_upsample(y, out_scanlines_radius, scale_ratio, stbir_info->vertical_shift, &in_first_scanline, &in_last_scanline, &in_center_of_out);
STBIR__DEBUG_ASSERT(in_last_scanline - in_first_scanline <= stbir__get_filter_pixel_width_vertical(stbir_info));
STBIR__DEBUG_ASSERT(in_last_scanline - in_first_scanline <= stbir_info->vertical_filter_pixel_width);
if (stbir_info->ring_buffer_begin_index >= 0)
{
@ -1848,7 +1859,7 @@ static void stbir__buffer_loop_upsample(stbir__info* stbir_info)
else
{
stbir_info->ring_buffer_first_scanline++;
stbir_info->ring_buffer_begin_index = (stbir_info->ring_buffer_begin_index + 1) % stbir__get_filter_pixel_width_vertical(stbir_info);
stbir_info->ring_buffer_begin_index = (stbir_info->ring_buffer_begin_index + 1) % stbir_info->vertical_filter_pixel_width;
}
}
}
@ -1906,7 +1917,7 @@ static void stbir__empty_ring_buffer(stbir__info* stbir_info, int first_necessar
else
{
stbir_info->ring_buffer_first_scanline++;
stbir_info->ring_buffer_begin_index = (stbir_info->ring_buffer_begin_index + 1) % stbir__get_filter_pixel_width_vertical(stbir_info);
stbir_info->ring_buffer_begin_index = (stbir_info->ring_buffer_begin_index + 1) % stbir_info->vertical_filter_pixel_width;
}
}
}
@ -1918,7 +1929,7 @@ static void stbir__buffer_loop_downsample(stbir__info* stbir_info)
float scale_ratio = stbir_info->vertical_scale;
int output_h = stbir_info->output_h;
float in_pixels_radius = stbir__filter_info_table[stbir_info->vertical_filter].support(scale_ratio) / scale_ratio;
int pixel_margin = stbir__get_filter_pixel_margin_vertical(stbir_info);
int pixel_margin = stbir_info->vertical_filter_pixel_margin;
int max_y = stbir_info->input_h + pixel_margin;
STBIR__DEBUG_ASSERT(!stbir__use_height_upsampling(stbir_info));
@ -1930,7 +1941,7 @@ static void stbir__buffer_loop_downsample(stbir__info* stbir_info)
stbir__calculate_sample_range_downsample(y, in_pixels_radius, scale_ratio, stbir_info->vertical_shift, &out_first_scanline, &out_last_scanline, &out_center_of_in);
STBIR__DEBUG_ASSERT(out_last_scanline - out_first_scanline <= stbir__get_filter_pixel_width_vertical(stbir_info));
STBIR__DEBUG_ASSERT(out_last_scanline - out_first_scanline <= stbir_info->vertical_filter_pixel_width);
if (out_last_scanline < 0 || out_first_scanline >= output_h)
continue;
@ -2001,9 +2012,12 @@ static stbir_uint32 stbir__calculate_memory(stbir__info *info)
int pixel_margin = stbir__get_filter_pixel_margin(info->horizontal_filter, info->horizontal_scale);
int filter_height = stbir__get_filter_pixel_width(info->vertical_filter, info->vertical_scale);
info->horizontal_contributors_size = stbir__get_horizontal_contributors(info) * sizeof(stbir__contributors);
info->horizontal_num_contributors = stbir__get_contributors(info->horizontal_scale, info->horizontal_filter, info->input_w, info->output_w);
info->vertical_num_contributors = stbir__get_contributors(info->vertical_scale , info->vertical_filter , info->input_h, info->output_h);
info->horizontal_contributors_size = info->horizontal_num_contributors * sizeof(stbir__contributors);
info->horizontal_coefficients_size = stbir__get_total_horizontal_coefficients(info) * sizeof(float);
info->vertical_contributors_size = stbir__get_vertical_contributors(info) * sizeof(stbir__contributors);
info->vertical_contributors_size = info->vertical_num_contributors * sizeof(stbir__contributors);
info->vertical_coefficients_size = stbir__get_total_vertical_coefficients(info) * sizeof(float);
info->decode_buffer_size = (info->input_w + pixel_margin * 2) * info->channels * sizeof(float);
info->horizontal_buffer_size = info->output_w * info->channels * sizeof(float);
@ -2104,8 +2118,15 @@ static int stbir__resize_allocated(stbir__info *info,
info->edge_vertical = edge_vertical;
info->colorspace = colorspace;
info->horizontal_coefficient_width = stbir__get_coefficient_width (info->horizontal_filter, info->horizontal_scale);
info->vertical_coefficient_width = stbir__get_coefficient_width (info->vertical_filter , info->vertical_scale );
info->horizontal_filter_pixel_width = stbir__get_filter_pixel_width (info->horizontal_filter, info->horizontal_scale);
info->vertical_filter_pixel_width = stbir__get_filter_pixel_width (info->vertical_filter , info->vertical_scale );
info->horizontal_filter_pixel_margin = stbir__get_filter_pixel_margin(info->horizontal_filter, info->horizontal_scale);
info->vertical_filter_pixel_margin = stbir__get_filter_pixel_margin(info->vertical_filter , info->vertical_scale );
info->ring_buffer_length_bytes = info->output_w * info->channels * sizeof(float);
info->decode_buffer_pixels = info->input_w + stbir__get_filter_pixel_margin_horizontal(info) * 2;
info->decode_buffer_pixels = info->input_w + info->horizontal_filter_pixel_margin * 2;
#define STBIR__NEXT_MEMPTR(current, newtype) (newtype*)(((unsigned char*)current) + current##_size)