stb_image: NEON integer IDCT (not yet tested!)
This commit is contained in:
parent
0f3bf1564b
commit
a32d73dc3b
209
stb_image.h
209
stb_image.h
@ -1631,6 +1631,214 @@ static void stbi__idct_sse2(stbi_uc *out, int out_stride, short data[64])
|
||||
|
||||
#endif // STBI_SSE2
|
||||
|
||||
#ifdef STBI_NEON
|
||||
|
||||
// NEON integer IDCT. should produce bit-identical
|
||||
// results to the generic C version.
|
||||
static void stbi__idct_neon(stbi_uc *out, int out_stride, short data[64])
|
||||
{
|
||||
int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
|
||||
|
||||
int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
|
||||
int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
|
||||
int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f));
|
||||
int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f));
|
||||
int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
|
||||
int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
|
||||
int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
|
||||
int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
|
||||
int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f));
|
||||
int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f));
|
||||
int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f));
|
||||
int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f));
|
||||
|
||||
#define dct_long_mul(out, inq, coeff) \
|
||||
int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
|
||||
int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
|
||||
|
||||
#define dct_long_mac(out, acc, inq, coeff) \
|
||||
int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
|
||||
int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
|
||||
|
||||
#define dct_widen(out, inq) \
|
||||
int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
|
||||
int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
|
||||
|
||||
// wide add
|
||||
#define dct_wadd(out, a, b) \
|
||||
int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
|
||||
int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
|
||||
|
||||
// wide sub
|
||||
#define dct_wsub(out, a, b) \
|
||||
int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
|
||||
int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
|
||||
|
||||
// butterfly a/b, then shift using "shiftop" by "s" and pack
|
||||
#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
|
||||
{ \
|
||||
dct_wadd(sum, a, b); \
|
||||
dct_wsub(dif, a, b); \
|
||||
out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
|
||||
out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
|
||||
}
|
||||
|
||||
#define dct_pass(shiftop, shift) \
|
||||
{ \
|
||||
/* even part */ \
|
||||
int16x8_t sum26 = vaddq_s16(row2, row6); \
|
||||
dct_long_mul(p1e, sum26, rot0_0); \
|
||||
dct_long_mac(t2e, p1e, row6, rot0_1); \
|
||||
dct_long_mac(t3e, p1e, row2, rot0_2); \
|
||||
int16x8_t sum04 = vaddq_s16(row0, row4); \
|
||||
int16x8_t dif04 = vsubq_s16(row0, row4); \
|
||||
dct_widen(t0e, sum04); \
|
||||
dct_widen(t1e, dif04); \
|
||||
dct_wadd(x0, t0e, t3e); \
|
||||
dct_wsub(x3, t0e, t3e); \
|
||||
dct_wadd(x1, t1e, t2e); \
|
||||
dct_wsub(x2, t1e, t2e); \
|
||||
/* odd part */ \
|
||||
int16x8_t sum15 = vaddq_s16(row1, row5); \
|
||||
int16x8_t sum17 = vaddq_s16(row1, row7); \
|
||||
int16x8_t sum35 = vaddq_s16(row3, row5); \
|
||||
int16x8_t sum37 = vaddq_s16(row3, row7); \
|
||||
int16x8_t sumodd = vaddq_s16(sum17, sum35); \
|
||||
dct_long_mul(p5o, sumodd, rot1_0); \
|
||||
dct_long_mac(p1o, p5o, sum17, rot1_1); \
|
||||
dct_long_mac(p2o, p5o, sum35, rot1_2); \
|
||||
dct_long_mul(p3o, sum37, rot2_0); \
|
||||
dct_long_mul(p4o, sum15, rot2_1); \
|
||||
dct_wadd(sump13o, p1o, p3o); \
|
||||
dct_wadd(sump24o, p2o, p4o); \
|
||||
dct_wadd(sump23o, p2o, p3o); \
|
||||
dct_wadd(sump14o, p1o, p4o); \
|
||||
dct_long_mac(x4, sump13o, row7, rot3_0); \
|
||||
dct_long_mac(x5, sump24o, row5, rot3_1); \
|
||||
dct_long_mac(x6, sump23o, row3, rot3_2); \
|
||||
dct_long_mac(x7, sump14o, row1, rot3_3); \
|
||||
dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
|
||||
dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
|
||||
dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
|
||||
dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
|
||||
}
|
||||
|
||||
// load
|
||||
row0 = vld1q_s16(data + 0*8);
|
||||
row1 = vld1q_s16(data + 1*8);
|
||||
row2 = vld1q_s16(data + 2*8);
|
||||
row3 = vld1q_s16(data + 3*8);
|
||||
row4 = vld1q_s16(data + 4*8);
|
||||
row5 = vld1q_s16(data + 5*8);
|
||||
row6 = vld1q_s16(data + 6*8);
|
||||
row7 = vld1q_s16(data + 7*8);
|
||||
|
||||
// add DC bias
|
||||
row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
|
||||
|
||||
// column pass
|
||||
dct_pass(vrshrn_n_s32, 10);
|
||||
|
||||
// 16bit 8x8 transpose
|
||||
{
|
||||
// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
|
||||
// whether compilers actually get this is another story, sadly.
|
||||
#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
|
||||
#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
|
||||
#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
|
||||
|
||||
// pass 1
|
||||
dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
|
||||
dct_trn16(row2, row3);
|
||||
dct_trn16(row4, row5);
|
||||
dct_trn16(row6, row7);
|
||||
|
||||
// pass 2
|
||||
dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
|
||||
dct_trn32(row1, row3);
|
||||
dct_trn32(row4, row6);
|
||||
dct_trn32(row5, row7);
|
||||
|
||||
// pass 3
|
||||
dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
|
||||
dct_trn64(row1, row5);
|
||||
dct_trn64(row2, row6);
|
||||
dct_trn64(row3, row7);
|
||||
|
||||
#undef dct_trn16
|
||||
#undef dct_trn32
|
||||
#undef dct_trn64
|
||||
}
|
||||
|
||||
// row pass
|
||||
// vrshrn_n_s32 only supports shifts up to 16, we need
|
||||
// 17. so do a non-rounding shift of 16 first then follow
|
||||
// up with a rounding shift by 1.
|
||||
dct_pass(vshrn_n_s32, 16);
|
||||
|
||||
{
|
||||
// pack and round
|
||||
uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
|
||||
uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
|
||||
uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
|
||||
uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
|
||||
uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
|
||||
uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
|
||||
uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
|
||||
uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
|
||||
|
||||
// again, these can translate into one instruction, but often don't.
|
||||
#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
|
||||
#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
|
||||
#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
|
||||
|
||||
// sadly can't use interleaved stores here since we only write
|
||||
// 8 bytes to each scan line!
|
||||
|
||||
// 8x8 8-bit transpose pass 1
|
||||
dct_trn8_8(p0, p1);
|
||||
dct_trn8_8(p2, p3);
|
||||
dct_trn8_8(p4, p5);
|
||||
dct_trn8_8(p6, p7);
|
||||
|
||||
// pass 2
|
||||
dct_trn8_16(p0, p2);
|
||||
dct_trn8_16(p1, p3);
|
||||
dct_trn8_16(p4, p6);
|
||||
dct_trn8_16(p5, p7);
|
||||
|
||||
// pass 3
|
||||
dct_trn8_32(p0, p4);
|
||||
dct_trn8_32(p1, p5);
|
||||
dct_trn8_32(p2, p6);
|
||||
dct_trn8_32(p3, p7);
|
||||
|
||||
// store
|
||||
vst1_u8(out, p0); out += out_stride;
|
||||
vst1_u8(out, p1); out += out_stride;
|
||||
vst1_u8(out, p2); out += out_stride;
|
||||
vst1_u8(out, p3); out += out_stride;
|
||||
vst1_u8(out, p4); out += out_stride;
|
||||
vst1_u8(out, p5); out += out_stride;
|
||||
vst1_u8(out, p6); out += out_stride;
|
||||
vst1_u8(out, p7);
|
||||
|
||||
#undef dct_trn8_8
|
||||
#undef dct_trn8_16
|
||||
#undef dct_trn8_32
|
||||
}
|
||||
|
||||
#undef dct_long_mul
|
||||
#undef dct_long_mac
|
||||
#undef dct_widen
|
||||
#undef dct_wadd
|
||||
#undef dct_wsub
|
||||
#undef dct_bfly32o
|
||||
#undef dct_pass
|
||||
}
|
||||
|
||||
#endif // STBI_NEON
|
||||
|
||||
#define STBI__MARKER_none 0xff
|
||||
// if there's a pending marker from the entropy stream, return that
|
||||
// otherwise, fetch from the stream and get a marker. if there's no
|
||||
@ -2321,6 +2529,7 @@ static void stbi__setup_jpeg(stbi__jpeg *j)
|
||||
#endif
|
||||
|
||||
#ifdef STBI_NEON
|
||||
j->idct_block_kernel = stbi__idct_neon;
|
||||
j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
|
||||
#endif
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user