diff options
Diffstat (limited to 'source/3rd-party/SDL2/src/video/yuv2rgb/yuv_rgb_sse_func.h')
-rw-r--r-- | source/3rd-party/SDL2/src/video/yuv2rgb/yuv_rgb_sse_func.h | 498 |
1 files changed, 498 insertions, 0 deletions
diff --git a/source/3rd-party/SDL2/src/video/yuv2rgb/yuv_rgb_sse_func.h b/source/3rd-party/SDL2/src/video/yuv2rgb/yuv_rgb_sse_func.h new file mode 100644 index 0000000..f81140e --- /dev/null +++ b/source/3rd-party/SDL2/src/video/yuv2rgb/yuv_rgb_sse_func.h @@ -0,0 +1,498 @@ +// Copyright 2016 Adrien Descamps +// Distributed under BSD 3-Clause License + +/* You need to define the following macros before including this file: + SSE_FUNCTION_NAME + STD_FUNCTION_NAME + YUV_FORMAT + RGB_FORMAT +*/ +/* You may define the following macro, which affects generated code: + SSE_ALIGNED +*/ + +#ifdef SSE_ALIGNED +/* Unaligned instructions seem faster, even on aligned data? */ +/* +#define LOAD_SI128 _mm_load_si128 +#define SAVE_SI128 _mm_stream_si128 +*/ +#define LOAD_SI128 _mm_loadu_si128 +#define SAVE_SI128 _mm_storeu_si128 +#else +#define LOAD_SI128 _mm_loadu_si128 +#define SAVE_SI128 _mm_storeu_si128 +#endif + +#define UV2RGB_16(U,V,R1,G1,B1,R2,G2,B2) \ + r_tmp = _mm_mullo_epi16(V, _mm_set1_epi16(param->v_r_factor)); \ + g_tmp = _mm_add_epi16( \ + _mm_mullo_epi16(U, _mm_set1_epi16(param->u_g_factor)), \ + _mm_mullo_epi16(V, _mm_set1_epi16(param->v_g_factor))); \ + b_tmp = _mm_mullo_epi16(U, _mm_set1_epi16(param->u_b_factor)); \ + R1 = _mm_unpacklo_epi16(r_tmp, r_tmp); \ + G1 = _mm_unpacklo_epi16(g_tmp, g_tmp); \ + B1 = _mm_unpacklo_epi16(b_tmp, b_tmp); \ + R2 = _mm_unpackhi_epi16(r_tmp, r_tmp); \ + G2 = _mm_unpackhi_epi16(g_tmp, g_tmp); \ + B2 = _mm_unpackhi_epi16(b_tmp, b_tmp); \ + +#define ADD_Y2RGB_16(Y1,Y2,R1,G1,B1,R2,G2,B2) \ + Y1 = _mm_mullo_epi16(_mm_sub_epi16(Y1, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \ + Y2 = _mm_mullo_epi16(_mm_sub_epi16(Y2, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \ + \ + R1 = _mm_srai_epi16(_mm_add_epi16(R1, Y1), PRECISION); \ + G1 = _mm_srai_epi16(_mm_add_epi16(G1, Y1), PRECISION); \ + B1 = _mm_srai_epi16(_mm_add_epi16(B1, Y1), PRECISION); \ + R2 = _mm_srai_epi16(_mm_add_epi16(R2, Y2), PRECISION); \ + G2 = _mm_srai_epi16(_mm_add_epi16(G2, Y2), PRECISION); \ + B2 = _mm_srai_epi16(_mm_add_epi16(B2, Y2), PRECISION); \ + +#define PACK_RGB565_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4) \ +{ \ + __m128i red_mask, tmp1, tmp2, tmp3, tmp4; \ +\ + red_mask = _mm_set1_epi16((short)0xF800); \ + RGB1 = _mm_and_si128(_mm_unpacklo_epi8(_mm_setzero_si128(), R1), red_mask); \ + RGB2 = _mm_and_si128(_mm_unpackhi_epi8(_mm_setzero_si128(), R1), red_mask); \ + RGB3 = _mm_and_si128(_mm_unpacklo_epi8(_mm_setzero_si128(), R2), red_mask); \ + RGB4 = _mm_and_si128(_mm_unpackhi_epi8(_mm_setzero_si128(), R2), red_mask); \ + tmp1 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G1, _mm_setzero_si128()), 2), 5); \ + tmp2 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G1, _mm_setzero_si128()), 2), 5); \ + tmp3 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G2, _mm_setzero_si128()), 2), 5); \ + tmp4 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G2, _mm_setzero_si128()), 2), 5); \ + RGB1 = _mm_or_si128(RGB1, tmp1); \ + RGB2 = _mm_or_si128(RGB2, tmp2); \ + RGB3 = _mm_or_si128(RGB3, tmp3); \ + RGB4 = _mm_or_si128(RGB4, tmp4); \ + tmp1 = _mm_srli_epi16(_mm_unpacklo_epi8(B1, _mm_setzero_si128()), 3); \ + tmp2 = _mm_srli_epi16(_mm_unpackhi_epi8(B1, _mm_setzero_si128()), 3); \ + tmp3 = _mm_srli_epi16(_mm_unpacklo_epi8(B2, _mm_setzero_si128()), 3); \ + tmp4 = _mm_srli_epi16(_mm_unpackhi_epi8(B2, _mm_setzero_si128()), 3); \ + RGB1 = _mm_or_si128(RGB1, tmp1); \ + RGB2 = _mm_or_si128(RGB2, tmp2); \ + RGB3 = _mm_or_si128(RGB3, tmp3); \ + RGB4 = _mm_or_si128(RGB4, tmp4); \ +} + +#define PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ +RGB1 = _mm_packus_epi16(_mm_and_si128(R1,_mm_set1_epi16(0xFF)), _mm_and_si128(R2,_mm_set1_epi16(0xFF))); \ +RGB2 = _mm_packus_epi16(_mm_and_si128(G1,_mm_set1_epi16(0xFF)), _mm_and_si128(G2,_mm_set1_epi16(0xFF))); \ +RGB3 = _mm_packus_epi16(_mm_and_si128(B1,_mm_set1_epi16(0xFF)), _mm_and_si128(B2,_mm_set1_epi16(0xFF))); \ +RGB4 = _mm_packus_epi16(_mm_srli_epi16(R1,8), _mm_srli_epi16(R2,8)); \ +RGB5 = _mm_packus_epi16(_mm_srli_epi16(G1,8), _mm_srli_epi16(G2,8)); \ +RGB6 = _mm_packus_epi16(_mm_srli_epi16(B1,8), _mm_srli_epi16(B2,8)); \ + +#define PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ +R1 = _mm_packus_epi16(_mm_and_si128(RGB1,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB2,_mm_set1_epi16(0xFF))); \ +R2 = _mm_packus_epi16(_mm_and_si128(RGB3,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB4,_mm_set1_epi16(0xFF))); \ +G1 = _mm_packus_epi16(_mm_and_si128(RGB5,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB6,_mm_set1_epi16(0xFF))); \ +G2 = _mm_packus_epi16(_mm_srli_epi16(RGB1,8), _mm_srli_epi16(RGB2,8)); \ +B1 = _mm_packus_epi16(_mm_srli_epi16(RGB3,8), _mm_srli_epi16(RGB4,8)); \ +B2 = _mm_packus_epi16(_mm_srli_epi16(RGB5,8), _mm_srli_epi16(RGB6,8)); \ + +#define PACK_RGB24_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ +PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ +PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ +PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ +PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ +PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ + +#define PACK_RGBA_32(R1, R2, G1, G2, B1, B2, A1, A2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, RGB7, RGB8) \ +{ \ + __m128i lo_ab, hi_ab, lo_gr, hi_gr; \ +\ + lo_ab = _mm_unpacklo_epi8( A1, B1 ); \ + hi_ab = _mm_unpackhi_epi8( A1, B1 ); \ + lo_gr = _mm_unpacklo_epi8( G1, R1 ); \ + hi_gr = _mm_unpackhi_epi8( G1, R1 ); \ + RGB1 = _mm_unpacklo_epi16( lo_ab, lo_gr ); \ + RGB2 = _mm_unpackhi_epi16( lo_ab, lo_gr ); \ + RGB3 = _mm_unpacklo_epi16( hi_ab, hi_gr ); \ + RGB4 = _mm_unpackhi_epi16( hi_ab, hi_gr ); \ +\ + lo_ab = _mm_unpacklo_epi8( A2, B2 ); \ + hi_ab = _mm_unpackhi_epi8( A2, B2 ); \ + lo_gr = _mm_unpacklo_epi8( G2, R2 ); \ + hi_gr = _mm_unpackhi_epi8( G2, R2 ); \ + RGB5 = _mm_unpacklo_epi16( lo_ab, lo_gr ); \ + RGB6 = _mm_unpackhi_epi16( lo_ab, lo_gr ); \ + RGB7 = _mm_unpacklo_epi16( hi_ab, hi_gr ); \ + RGB8 = _mm_unpackhi_epi16( hi_ab, hi_gr ); \ +} + +#if RGB_FORMAT == RGB_FORMAT_RGB565 + +#define PACK_PIXEL \ + __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \ + \ + PACK_RGB565_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4) \ + \ + PACK_RGB565_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_5, rgb_6, rgb_7, rgb_8) \ + +#elif RGB_FORMAT == RGB_FORMAT_RGB24 + +#define PACK_PIXEL \ + __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6; \ + __m128i rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12; \ + \ + PACK_RGB24_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6) \ + \ + PACK_RGB24_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12) \ + +#elif RGB_FORMAT == RGB_FORMAT_RGBA + +#define PACK_PIXEL \ + __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \ + __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \ + __m128i a = _mm_set1_epi8((char)0xFF); \ + \ + PACK_RGBA_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \ + \ + PACK_RGBA_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \ + +#elif RGB_FORMAT == RGB_FORMAT_BGRA + +#define PACK_PIXEL \ + __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \ + __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \ + __m128i a = _mm_set1_epi8((char)0xFF); \ + \ + PACK_RGBA_32(b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \ + \ + PACK_RGBA_32(b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \ + +#elif RGB_FORMAT == RGB_FORMAT_ARGB + +#define PACK_PIXEL \ + __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \ + __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \ + __m128i a = _mm_set1_epi8((char)0xFF); \ + \ + PACK_RGBA_32(a, a, r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \ + \ + PACK_RGBA_32(a, a, r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \ + +#elif RGB_FORMAT == RGB_FORMAT_ABGR + +#define PACK_PIXEL \ + __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \ + __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \ + __m128i a = _mm_set1_epi8((char)0xFF); \ + \ + PACK_RGBA_32(a, a, b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \ + \ + PACK_RGBA_32(a, a, b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \ + +#else +#error PACK_PIXEL unimplemented +#endif + +#if RGB_FORMAT == RGB_FORMAT_RGB565 + +#define SAVE_LINE1 \ + SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \ + SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \ + SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \ + SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \ + +#define SAVE_LINE2 \ + SAVE_SI128((__m128i*)(rgb_ptr2), rgb_5); \ + SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_6); \ + SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_7); \ + SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_8); \ + +#elif RGB_FORMAT == RGB_FORMAT_RGB24 + +#define SAVE_LINE1 \ + SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \ + SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \ + SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \ + SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \ + SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \ + SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \ + +#define SAVE_LINE2 \ + SAVE_SI128((__m128i*)(rgb_ptr2), rgb_7); \ + SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_8); \ + SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_9); \ + SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_10); \ + SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_11); \ + SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_12); \ + +#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \ + RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR + +#define SAVE_LINE1 \ + SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \ + SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \ + SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \ + SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \ + SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \ + SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \ + SAVE_SI128((__m128i*)(rgb_ptr1+96), rgb_7); \ + SAVE_SI128((__m128i*)(rgb_ptr1+112), rgb_8); \ + +#define SAVE_LINE2 \ + SAVE_SI128((__m128i*)(rgb_ptr2), rgb_9); \ + SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_10); \ + SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_11); \ + SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_12); \ + SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_13); \ + SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_14); \ + SAVE_SI128((__m128i*)(rgb_ptr2+96), rgb_15); \ + SAVE_SI128((__m128i*)(rgb_ptr2+112), rgb_16); \ + +#else +#error SAVE_LINE unimplemented +#endif + +#if YUV_FORMAT == YUV_FORMAT_420 + +#define READ_Y(y_ptr) \ + y = LOAD_SI128((const __m128i*)(y_ptr)); \ + +#define READ_UV \ + u = LOAD_SI128((const __m128i*)(u_ptr)); \ + v = LOAD_SI128((const __m128i*)(v_ptr)); \ + +#elif YUV_FORMAT == YUV_FORMAT_422 + +#define READ_Y(y_ptr) \ +{ \ + __m128i y1, y2; \ + y1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(y_ptr)), 8), 8); \ + y2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(y_ptr+16)), 8), 8); \ + y = _mm_packus_epi16(y1, y2); \ +} + +#define READ_UV \ +{ \ + __m128i u1, u2, u3, u4, v1, v2, v3, v4; \ + u1 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr)), 24), 24); \ + u2 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+16)), 24), 24); \ + u3 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+32)), 24), 24); \ + u4 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+48)), 24), 24); \ + u = _mm_packus_epi16(_mm_packs_epi32(u1, u2), _mm_packs_epi32(u3, u4)); \ + v1 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr)), 24), 24); \ + v2 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+16)), 24), 24); \ + v3 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+32)), 24), 24); \ + v4 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+48)), 24), 24); \ + v = _mm_packus_epi16(_mm_packs_epi32(v1, v2), _mm_packs_epi32(v3, v4)); \ +} + +#elif YUV_FORMAT == YUV_FORMAT_NV12 + +#define READ_Y(y_ptr) \ + y = LOAD_SI128((const __m128i*)(y_ptr)); \ + +#define READ_UV \ +{ \ + __m128i u1, u2, v1, v2; \ + u1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(u_ptr)), 8), 8); \ + u2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(u_ptr+16)), 8), 8); \ + u = _mm_packus_epi16(u1, u2); \ + v1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(v_ptr)), 8), 8); \ + v2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(v_ptr+16)), 8), 8); \ + v = _mm_packus_epi16(v1, v2); \ +} + +#else +#error READ_UV unimplemented +#endif + +#define YUV2RGB_32 \ + __m128i r_tmp, g_tmp, b_tmp; \ + __m128i r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2; \ + __m128i r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2; \ + __m128i y_16_1, y_16_2; \ + __m128i y, u, v, u_16, v_16; \ + __m128i r_8_11, g_8_11, b_8_11, r_8_21, g_8_21, b_8_21; \ + __m128i r_8_12, g_8_12, b_8_12, r_8_22, g_8_22, b_8_22; \ + \ + READ_UV \ + \ + /* process first 16 pixels of first line */\ + u_16 = _mm_unpacklo_epi8(u, _mm_setzero_si128()); \ + v_16 = _mm_unpacklo_epi8(v, _mm_setzero_si128()); \ + u_16 = _mm_add_epi16(u_16, _mm_set1_epi16(-128)); \ + v_16 = _mm_add_epi16(v_16, _mm_set1_epi16(-128)); \ + \ + UV2RGB_16(u_16, v_16, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ + r_uv_16_1=r_16_1; g_uv_16_1=g_16_1; b_uv_16_1=b_16_1; \ + r_uv_16_2=r_16_2; g_uv_16_2=g_16_2; b_uv_16_2=b_16_2; \ + \ + READ_Y(y_ptr1) \ + y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \ + y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \ + \ + ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ + \ + r_8_11 = _mm_packus_epi16(r_16_1, r_16_2); \ + g_8_11 = _mm_packus_epi16(g_16_1, g_16_2); \ + b_8_11 = _mm_packus_epi16(b_16_1, b_16_2); \ + \ + /* process first 16 pixels of second line */\ + r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \ + r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \ + \ + READ_Y(y_ptr2) \ + y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \ + y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \ + \ + ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ + \ + r_8_21 = _mm_packus_epi16(r_16_1, r_16_2); \ + g_8_21 = _mm_packus_epi16(g_16_1, g_16_2); \ + b_8_21 = _mm_packus_epi16(b_16_1, b_16_2); \ + \ + /* process last 16 pixels of first line */\ + u_16 = _mm_unpackhi_epi8(u, _mm_setzero_si128()); \ + v_16 = _mm_unpackhi_epi8(v, _mm_setzero_si128()); \ + u_16 = _mm_add_epi16(u_16, _mm_set1_epi16(-128)); \ + v_16 = _mm_add_epi16(v_16, _mm_set1_epi16(-128)); \ + \ + UV2RGB_16(u_16, v_16, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ + r_uv_16_1=r_16_1; g_uv_16_1=g_16_1; b_uv_16_1=b_16_1; \ + r_uv_16_2=r_16_2; g_uv_16_2=g_16_2; b_uv_16_2=b_16_2; \ + \ + READ_Y(y_ptr1+16*y_pixel_stride) \ + y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \ + y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \ + \ + ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ + \ + r_8_12 = _mm_packus_epi16(r_16_1, r_16_2); \ + g_8_12 = _mm_packus_epi16(g_16_1, g_16_2); \ + b_8_12 = _mm_packus_epi16(b_16_1, b_16_2); \ + \ + /* process last 16 pixels of second line */\ + r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \ + r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \ + \ + READ_Y(y_ptr2+16*y_pixel_stride) \ + y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \ + y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \ + \ + ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ + \ + r_8_22 = _mm_packus_epi16(r_16_1, r_16_2); \ + g_8_22 = _mm_packus_epi16(g_16_1, g_16_2); \ + b_8_22 = _mm_packus_epi16(b_16_1, b_16_2); \ + \ + + +void SSE_FUNCTION_NAME(uint32_t width, uint32_t height, + const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, + uint8_t *RGB, uint32_t RGB_stride, + YCbCrType yuv_type) +{ + const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); +#if YUV_FORMAT == YUV_FORMAT_420 + const int y_pixel_stride = 1; + const int uv_pixel_stride = 1; + const int uv_x_sample_interval = 2; + const int uv_y_sample_interval = 2; +#elif YUV_FORMAT == YUV_FORMAT_422 + const int y_pixel_stride = 2; + const int uv_pixel_stride = 4; + const int uv_x_sample_interval = 2; + const int uv_y_sample_interval = 1; +#elif YUV_FORMAT == YUV_FORMAT_NV12 + const int y_pixel_stride = 1; + const int uv_pixel_stride = 2; + const int uv_x_sample_interval = 2; + const int uv_y_sample_interval = 2; +#endif +#if RGB_FORMAT == RGB_FORMAT_RGB565 + const int rgb_pixel_stride = 2; +#elif RGB_FORMAT == RGB_FORMAT_RGB24 + const int rgb_pixel_stride = 3; +#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \ + RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR + const int rgb_pixel_stride = 4; +#else +#error Unknown RGB pixel size +#endif + + if (width >= 32) { + uint32_t xpos, ypos; + for(ypos=0; ypos<(height-(uv_y_sample_interval-1)); ypos+=uv_y_sample_interval) + { + const uint8_t *y_ptr1=Y+ypos*Y_stride, + *y_ptr2=Y+(ypos+1)*Y_stride, + *u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride, + *v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride; + + uint8_t *rgb_ptr1=RGB+ypos*RGB_stride, + *rgb_ptr2=RGB+(ypos+1)*RGB_stride; + + for(xpos=0; xpos<(width-31); xpos+=32) + { + YUV2RGB_32 + { + PACK_PIXEL + SAVE_LINE1 + if (uv_y_sample_interval > 1) + { + SAVE_LINE2 + } + } + + y_ptr1+=32*y_pixel_stride; + y_ptr2+=32*y_pixel_stride; + u_ptr+=32*uv_pixel_stride/uv_x_sample_interval; + v_ptr+=32*uv_pixel_stride/uv_x_sample_interval; + rgb_ptr1+=32*rgb_pixel_stride; + rgb_ptr2+=32*rgb_pixel_stride; + } + } + + /* Catch the last line, if needed */ + if (uv_y_sample_interval == 2 && ypos == (height-1)) + { + const uint8_t *y_ptr=Y+ypos*Y_stride, + *u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride, + *v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride; + + uint8_t *rgb_ptr=RGB+ypos*RGB_stride; + + STD_FUNCTION_NAME(width, 1, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type); + } + } + + /* Catch the right column, if needed */ + { + int converted = (width & ~31); + if (converted != width) + { + const uint8_t *y_ptr=Y+converted*y_pixel_stride, + *u_ptr=U+converted*uv_pixel_stride/uv_x_sample_interval, + *v_ptr=V+converted*uv_pixel_stride/uv_x_sample_interval; + + uint8_t *rgb_ptr=RGB+converted*rgb_pixel_stride; + + STD_FUNCTION_NAME(width-converted, height, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type); + } + } +} + +#undef SSE_FUNCTION_NAME +#undef STD_FUNCTION_NAME +#undef YUV_FORMAT +#undef RGB_FORMAT +#undef SSE_ALIGNED +#undef LOAD_SI128 +#undef SAVE_SI128 +#undef UV2RGB_16 +#undef ADD_Y2RGB_16 +#undef PACK_RGB24_32_STEP1 +#undef PACK_RGB24_32_STEP2 +#undef PACK_RGB24_32 +#undef PACK_RGBA_32 +#undef PACK_PIXEL +#undef SAVE_LINE1 +#undef SAVE_LINE2 +#undef READ_Y +#undef READ_UV +#undef YUV2RGB_32 |