Atlas - yuv_rgb_sse_func.h

Home / ext / SDL2 / src / video / yuv2rgb Lines: 1 | Size: 19011 bytes [Download] [Show on GitHub] [Search similar files] [Raw] [Raw (proxy)]
[FILE BEGIN]
1// Copyright 2016 Adrien Descamps 2// Distributed under BSD 3-Clause License 3 4/* You need to define the following macros before including this file: 5 SSE_FUNCTION_NAME 6 STD_FUNCTION_NAME 7 YUV_FORMAT 8 RGB_FORMAT 9*/ 10/* You may define the following macro, which affects generated code: 11 SSE_ALIGNED 12*/ 13 14#ifdef SSE_ALIGNED 15/* Unaligned instructions seem faster, even on aligned data? */ 16/* 17#define LOAD_SI128 _mm_load_si128 18#define SAVE_SI128 _mm_stream_si128 19*/ 20#define LOAD_SI128 _mm_loadu_si128 21#define SAVE_SI128 _mm_storeu_si128 22#else 23#define LOAD_SI128 _mm_loadu_si128 24#define SAVE_SI128 _mm_storeu_si128 25#endif 26 27#define UV2RGB_16(U,V,R1,G1,B1,R2,G2,B2) \ 28 r_tmp = _mm_mullo_epi16(V, _mm_set1_epi16(param->v_r_factor)); \ 29 g_tmp = _mm_add_epi16( \ 30 _mm_mullo_epi16(U, _mm_set1_epi16(param->u_g_factor)), \ 31 _mm_mullo_epi16(V, _mm_set1_epi16(param->v_g_factor))); \ 32 b_tmp = _mm_mullo_epi16(U, _mm_set1_epi16(param->u_b_factor)); \ 33 R1 = _mm_unpacklo_epi16(r_tmp, r_tmp); \ 34 G1 = _mm_unpacklo_epi16(g_tmp, g_tmp); \ 35 B1 = _mm_unpacklo_epi16(b_tmp, b_tmp); \ 36 R2 = _mm_unpackhi_epi16(r_tmp, r_tmp); \ 37 G2 = _mm_unpackhi_epi16(g_tmp, g_tmp); \ 38 B2 = _mm_unpackhi_epi16(b_tmp, b_tmp); \ 39 40#define ADD_Y2RGB_16(Y1,Y2,R1,G1,B1,R2,G2,B2) \ 41 Y1 = _mm_mullo_epi16(_mm_sub_epi16(Y1, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \ 42 Y2 = _mm_mullo_epi16(_mm_sub_epi16(Y2, _mm_set1_epi16(param->y_shift)), _mm_set1_epi16(param->y_factor)); \ 43 \ 44 R1 = _mm_srai_epi16(_mm_add_epi16(R1, Y1), PRECISION); \ 45 G1 = _mm_srai_epi16(_mm_add_epi16(G1, Y1), PRECISION); \ 46 B1 = _mm_srai_epi16(_mm_add_epi16(B1, Y1), PRECISION); \ 47 R2 = _mm_srai_epi16(_mm_add_epi16(R2, Y2), PRECISION); \ 48 G2 = _mm_srai_epi16(_mm_add_epi16(G2, Y2), PRECISION); \ 49 B2 = _mm_srai_epi16(_mm_add_epi16(B2, Y2), PRECISION); \ 50 51#define PACK_RGB565_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4) \ 52{ \ 53 __m128i red_mask, 
tmp1, tmp2, tmp3, tmp4; \ 54\ 55 red_mask = _mm_set1_epi16((short)0xF800); \ 56 RGB1 = _mm_and_si128(_mm_unpacklo_epi8(_mm_setzero_si128(), R1), red_mask); \ 57 RGB2 = _mm_and_si128(_mm_unpackhi_epi8(_mm_setzero_si128(), R1), red_mask); \ 58 RGB3 = _mm_and_si128(_mm_unpacklo_epi8(_mm_setzero_si128(), R2), red_mask); \ 59 RGB4 = _mm_and_si128(_mm_unpackhi_epi8(_mm_setzero_si128(), R2), red_mask); \ 60 tmp1 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G1, _mm_setzero_si128()), 2), 5); \ 61 tmp2 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G1, _mm_setzero_si128()), 2), 5); \ 62 tmp3 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(G2, _mm_setzero_si128()), 2), 5); \ 63 tmp4 = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(G2, _mm_setzero_si128()), 2), 5); \ 64 RGB1 = _mm_or_si128(RGB1, tmp1); \ 65 RGB2 = _mm_or_si128(RGB2, tmp2); \ 66 RGB3 = _mm_or_si128(RGB3, tmp3); \ 67 RGB4 = _mm_or_si128(RGB4, tmp4); \ 68 tmp1 = _mm_srli_epi16(_mm_unpacklo_epi8(B1, _mm_setzero_si128()), 3); \ 69 tmp2 = _mm_srli_epi16(_mm_unpackhi_epi8(B1, _mm_setzero_si128()), 3); \ 70 tmp3 = _mm_srli_epi16(_mm_unpacklo_epi8(B2, _mm_setzero_si128()), 3); \ 71 tmp4 = _mm_srli_epi16(_mm_unpackhi_epi8(B2, _mm_setzero_si128()), 3); \ 72 RGB1 = _mm_or_si128(RGB1, tmp1); \ 73 RGB2 = _mm_or_si128(RGB2, tmp2); \ 74 RGB3 = _mm_or_si128(RGB3, tmp3); \ 75 RGB4 = _mm_or_si128(RGB4, tmp4); \ 76} 77 78#define PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ 79RGB1 = _mm_packus_epi16(_mm_and_si128(R1,_mm_set1_epi16(0xFF)), _mm_and_si128(R2,_mm_set1_epi16(0xFF))); \ 80RGB2 = _mm_packus_epi16(_mm_and_si128(G1,_mm_set1_epi16(0xFF)), _mm_and_si128(G2,_mm_set1_epi16(0xFF))); \ 81RGB3 = _mm_packus_epi16(_mm_and_si128(B1,_mm_set1_epi16(0xFF)), _mm_and_si128(B2,_mm_set1_epi16(0xFF))); \ 82RGB4 = _mm_packus_epi16(_mm_srli_epi16(R1,8), _mm_srli_epi16(R2,8)); \ 83RGB5 = _mm_packus_epi16(_mm_srli_epi16(G1,8), _mm_srli_epi16(G2,8)); \ 84RGB6 = 
_mm_packus_epi16(_mm_srli_epi16(B1,8), _mm_srli_epi16(B2,8)); \ 85 86#define PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ 87R1 = _mm_packus_epi16(_mm_and_si128(RGB1,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB2,_mm_set1_epi16(0xFF))); \ 88R2 = _mm_packus_epi16(_mm_and_si128(RGB3,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB4,_mm_set1_epi16(0xFF))); \ 89G1 = _mm_packus_epi16(_mm_and_si128(RGB5,_mm_set1_epi16(0xFF)), _mm_and_si128(RGB6,_mm_set1_epi16(0xFF))); \ 90G2 = _mm_packus_epi16(_mm_srli_epi16(RGB1,8), _mm_srli_epi16(RGB2,8)); \ 91B1 = _mm_packus_epi16(_mm_srli_epi16(RGB3,8), _mm_srli_epi16(RGB4,8)); \ 92B2 = _mm_packus_epi16(_mm_srli_epi16(RGB5,8), _mm_srli_epi16(RGB6,8)); \ 93 94#define PACK_RGB24_32(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ 95PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ 96PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ 97PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ 98PACK_RGB24_32_STEP2(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ 99PACK_RGB24_32_STEP1(R1, R2, G1, G2, B1, B2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6) \ 100 101#define PACK_RGBA_32(R1, R2, G1, G2, B1, B2, A1, A2, RGB1, RGB2, RGB3, RGB4, RGB5, RGB6, RGB7, RGB8) \ 102{ \ 103 __m128i lo_ab, hi_ab, lo_gr, hi_gr; \ 104\ 105 lo_ab = _mm_unpacklo_epi8( A1, B1 ); \ 106 hi_ab = _mm_unpackhi_epi8( A1, B1 ); \ 107 lo_gr = _mm_unpacklo_epi8( G1, R1 ); \ 108 hi_gr = _mm_unpackhi_epi8( G1, R1 ); \ 109 RGB1 = _mm_unpacklo_epi16( lo_ab, lo_gr ); \ 110 RGB2 = _mm_unpackhi_epi16( lo_ab, lo_gr ); \ 111 RGB3 = _mm_unpacklo_epi16( hi_ab, hi_gr ); \ 112 RGB4 = _mm_unpackhi_epi16( hi_ab, hi_gr ); \ 113\ 114 lo_ab = _mm_unpacklo_epi8( A2, B2 ); \ 115 hi_ab = _mm_unpackhi_epi8( A2, B2 ); \ 116 lo_gr = _mm_unpacklo_epi8( G2, R2 ); \ 117 hi_gr = _mm_unpackhi_epi8( G2, R2 ); \ 118 RGB5 = _mm_unpacklo_epi16( lo_ab, lo_gr ); \ 119 RGB6 = 
_mm_unpackhi_epi16( lo_ab, lo_gr ); \ 120 RGB7 = _mm_unpacklo_epi16( hi_ab, hi_gr ); \ 121 RGB8 = _mm_unpackhi_epi16( hi_ab, hi_gr ); \ 122} 123 124#if RGB_FORMAT == RGB_FORMAT_RGB565 125 126#define PACK_PIXEL \ 127 __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \ 128 \ 129 PACK_RGB565_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4) \ 130 \ 131 PACK_RGB565_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_5, rgb_6, rgb_7, rgb_8) \ 132 133#elif RGB_FORMAT == RGB_FORMAT_RGB24 134 135#define PACK_PIXEL \ 136 __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6; \ 137 __m128i rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12; \ 138 \ 139 PACK_RGB24_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6) \ 140 \ 141 PACK_RGB24_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_7, rgb_8, rgb_9, rgb_10, rgb_11, rgb_12) \ 142 143#elif RGB_FORMAT == RGB_FORMAT_RGBA 144 145#define PACK_PIXEL \ 146 __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \ 147 __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \ 148 __m128i a = _mm_set1_epi8((char)0xFF); \ 149 \ 150 PACK_RGBA_32(r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \ 151 \ 152 PACK_RGBA_32(r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \ 153 154#elif RGB_FORMAT == RGB_FORMAT_BGRA 155 156#define PACK_PIXEL \ 157 __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \ 158 __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \ 159 __m128i a = _mm_set1_epi8((char)0xFF); \ 160 \ 161 PACK_RGBA_32(b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, a, a, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \ 162 \ 163 PACK_RGBA_32(b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, a, a, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \ 164 
165#elif RGB_FORMAT == RGB_FORMAT_ARGB 166 167#define PACK_PIXEL \ 168 __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \ 169 __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \ 170 __m128i a = _mm_set1_epi8((char)0xFF); \ 171 \ 172 PACK_RGBA_32(a, a, r_8_11, r_8_12, g_8_11, g_8_12, b_8_11, b_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \ 173 \ 174 PACK_RGBA_32(a, a, r_8_21, r_8_22, g_8_21, g_8_22, b_8_21, b_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \ 175 176#elif RGB_FORMAT == RGB_FORMAT_ABGR 177 178#define PACK_PIXEL \ 179 __m128i rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8; \ 180 __m128i rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16; \ 181 __m128i a = _mm_set1_epi8((char)0xFF); \ 182 \ 183 PACK_RGBA_32(a, a, b_8_11, b_8_12, g_8_11, g_8_12, r_8_11, r_8_12, rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_6, rgb_7, rgb_8) \ 184 \ 185 PACK_RGBA_32(a, a, b_8_21, b_8_22, g_8_21, g_8_22, r_8_21, r_8_22, rgb_9, rgb_10, rgb_11, rgb_12, rgb_13, rgb_14, rgb_15, rgb_16) \ 186 187#else 188#error PACK_PIXEL unimplemented 189#endif 190 191#if RGB_FORMAT == RGB_FORMAT_RGB565 192 193#define SAVE_LINE1 \ 194 SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \ 195 SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \ 196 SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \ 197 SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \ 198 199#define SAVE_LINE2 \ 200 SAVE_SI128((__m128i*)(rgb_ptr2), rgb_5); \ 201 SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_6); \ 202 SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_7); \ 203 SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_8); \ 204 205#elif RGB_FORMAT == RGB_FORMAT_RGB24 206 207#define SAVE_LINE1 \ 208 SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \ 209 SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \ 210 SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \ 211 SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \ 212 SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \ 213 SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \ 214 
215#define SAVE_LINE2 \ 216 SAVE_SI128((__m128i*)(rgb_ptr2), rgb_7); \ 217 SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_8); \ 218 SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_9); \ 219 SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_10); \ 220 SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_11); \ 221 SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_12); \ 222 223#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \ 224 RGB_FORMAT == RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR 225 226#define SAVE_LINE1 \ 227 SAVE_SI128((__m128i*)(rgb_ptr1), rgb_1); \ 228 SAVE_SI128((__m128i*)(rgb_ptr1+16), rgb_2); \ 229 SAVE_SI128((__m128i*)(rgb_ptr1+32), rgb_3); \ 230 SAVE_SI128((__m128i*)(rgb_ptr1+48), rgb_4); \ 231 SAVE_SI128((__m128i*)(rgb_ptr1+64), rgb_5); \ 232 SAVE_SI128((__m128i*)(rgb_ptr1+80), rgb_6); \ 233 SAVE_SI128((__m128i*)(rgb_ptr1+96), rgb_7); \ 234 SAVE_SI128((__m128i*)(rgb_ptr1+112), rgb_8); \ 235 236#define SAVE_LINE2 \ 237 SAVE_SI128((__m128i*)(rgb_ptr2), rgb_9); \ 238 SAVE_SI128((__m128i*)(rgb_ptr2+16), rgb_10); \ 239 SAVE_SI128((__m128i*)(rgb_ptr2+32), rgb_11); \ 240 SAVE_SI128((__m128i*)(rgb_ptr2+48), rgb_12); \ 241 SAVE_SI128((__m128i*)(rgb_ptr2+64), rgb_13); \ 242 SAVE_SI128((__m128i*)(rgb_ptr2+80), rgb_14); \ 243 SAVE_SI128((__m128i*)(rgb_ptr2+96), rgb_15); \ 244 SAVE_SI128((__m128i*)(rgb_ptr2+112), rgb_16); \ 245 246#else 247#error SAVE_LINE unimplemented 248#endif 249 250#if YUV_FORMAT == YUV_FORMAT_420 251 252#define READ_Y(y_ptr) \ 253 y = LOAD_SI128((const __m128i*)(y_ptr)); \ 254 255#define READ_UV \ 256 u = LOAD_SI128((const __m128i*)(u_ptr)); \ 257 v = LOAD_SI128((const __m128i*)(v_ptr)); \ 258 259#elif YUV_FORMAT == YUV_FORMAT_422 260 261#define READ_Y(y_ptr) \ 262{ \ 263 __m128i y1, y2; \ 264 y1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(y_ptr)), 8), 8); \ 265 y2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(y_ptr+16)), 8), 8); \ 266 y = _mm_packus_epi16(y1, y2); \ 267} 268 269#define READ_UV \ 270{ \ 271 __m128i u1, u2, u3, 
u4, v1, v2, v3, v4; \ 272 u1 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr)), 24), 24); \ 273 u2 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+16)), 24), 24); \ 274 u3 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+32)), 24), 24); \ 275 u4 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(u_ptr+48)), 24), 24); \ 276 u = _mm_packus_epi16(_mm_packs_epi32(u1, u2), _mm_packs_epi32(u3, u4)); \ 277 v1 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr)), 24), 24); \ 278 v2 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+16)), 24), 24); \ 279 v3 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+32)), 24), 24); \ 280 v4 = _mm_srli_epi32(_mm_slli_epi32(LOAD_SI128((const __m128i*)(v_ptr+48)), 24), 24); \ 281 v = _mm_packus_epi16(_mm_packs_epi32(v1, v2), _mm_packs_epi32(v3, v4)); \ 282} 283 284#elif YUV_FORMAT == YUV_FORMAT_NV12 285 286#define READ_Y(y_ptr) \ 287 y = LOAD_SI128((const __m128i*)(y_ptr)); \ 288 289#define READ_UV \ 290{ \ 291 __m128i u1, u2, v1, v2; \ 292 u1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(u_ptr)), 8), 8); \ 293 u2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(u_ptr+16)), 8), 8); \ 294 u = _mm_packus_epi16(u1, u2); \ 295 v1 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(v_ptr)), 8), 8); \ 296 v2 = _mm_srli_epi16(_mm_slli_epi16(LOAD_SI128((const __m128i*)(v_ptr+16)), 8), 8); \ 297 v = _mm_packus_epi16(v1, v2); \ 298} 299 300#else 301#error READ_UV unimplemented 302#endif 303 304#define YUV2RGB_32 \ 305 __m128i r_tmp, g_tmp, b_tmp; \ 306 __m128i r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2; \ 307 __m128i r_uv_16_1, g_uv_16_1, b_uv_16_1, r_uv_16_2, g_uv_16_2, b_uv_16_2; \ 308 __m128i y_16_1, y_16_2; \ 309 __m128i y, u, v, u_16, v_16; \ 310 __m128i r_8_11, g_8_11, b_8_11, r_8_21, g_8_21, b_8_21; \ 311 __m128i r_8_12, g_8_12, b_8_12, r_8_22, g_8_22, b_8_22; \ 312 \ 313 READ_UV \ 314 \ 315 /* 
process first 16 pixels of first line */\ 316 u_16 = _mm_unpacklo_epi8(u, _mm_setzero_si128()); \ 317 v_16 = _mm_unpacklo_epi8(v, _mm_setzero_si128()); \ 318 u_16 = _mm_add_epi16(u_16, _mm_set1_epi16(-128)); \ 319 v_16 = _mm_add_epi16(v_16, _mm_set1_epi16(-128)); \ 320 \ 321 UV2RGB_16(u_16, v_16, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ 322 r_uv_16_1=r_16_1; g_uv_16_1=g_16_1; b_uv_16_1=b_16_1; \ 323 r_uv_16_2=r_16_2; g_uv_16_2=g_16_2; b_uv_16_2=b_16_2; \ 324 \ 325 READ_Y(y_ptr1) \ 326 y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \ 327 y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \ 328 \ 329 ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ 330 \ 331 r_8_11 = _mm_packus_epi16(r_16_1, r_16_2); \ 332 g_8_11 = _mm_packus_epi16(g_16_1, g_16_2); \ 333 b_8_11 = _mm_packus_epi16(b_16_1, b_16_2); \ 334 \ 335 /* process first 16 pixels of second line */\ 336 r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \ 337 r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \ 338 \ 339 READ_Y(y_ptr2) \ 340 y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \ 341 y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \ 342 \ 343 ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ 344 \ 345 r_8_21 = _mm_packus_epi16(r_16_1, r_16_2); \ 346 g_8_21 = _mm_packus_epi16(g_16_1, g_16_2); \ 347 b_8_21 = _mm_packus_epi16(b_16_1, b_16_2); \ 348 \ 349 /* process last 16 pixels of first line */\ 350 u_16 = _mm_unpackhi_epi8(u, _mm_setzero_si128()); \ 351 v_16 = _mm_unpackhi_epi8(v, _mm_setzero_si128()); \ 352 u_16 = _mm_add_epi16(u_16, _mm_set1_epi16(-128)); \ 353 v_16 = _mm_add_epi16(v_16, _mm_set1_epi16(-128)); \ 354 \ 355 UV2RGB_16(u_16, v_16, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ 356 r_uv_16_1=r_16_1; g_uv_16_1=g_16_1; b_uv_16_1=b_16_1; \ 357 r_uv_16_2=r_16_2; g_uv_16_2=g_16_2; b_uv_16_2=b_16_2; \ 358 \ 359 READ_Y(y_ptr1+16*y_pixel_stride) \ 360 y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \ 
361 y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \ 362 \ 363 ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ 364 \ 365 r_8_12 = _mm_packus_epi16(r_16_1, r_16_2); \ 366 g_8_12 = _mm_packus_epi16(g_16_1, g_16_2); \ 367 b_8_12 = _mm_packus_epi16(b_16_1, b_16_2); \ 368 \ 369 /* process last 16 pixels of second line */\ 370 r_16_1=r_uv_16_1; g_16_1=g_uv_16_1; b_16_1=b_uv_16_1; \ 371 r_16_2=r_uv_16_2; g_16_2=g_uv_16_2; b_16_2=b_uv_16_2; \ 372 \ 373 READ_Y(y_ptr2+16*y_pixel_stride) \ 374 y_16_1 = _mm_unpacklo_epi8(y, _mm_setzero_si128()); \ 375 y_16_2 = _mm_unpackhi_epi8(y, _mm_setzero_si128()); \ 376 \ 377 ADD_Y2RGB_16(y_16_1, y_16_2, r_16_1, g_16_1, b_16_1, r_16_2, g_16_2, b_16_2) \ 378 \ 379 r_8_22 = _mm_packus_epi16(r_16_1, r_16_2); \ 380 g_8_22 = _mm_packus_epi16(g_16_1, g_16_2); \ 381 b_8_22 = _mm_packus_epi16(b_16_1, b_16_2); \ 382 \ 383 384 385void SSE_FUNCTION_NAME(uint32_t width, uint32_t height, 386 const uint8_t *Y, const uint8_t *U, const uint8_t *V, uint32_t Y_stride, uint32_t UV_stride, 387 uint8_t *RGB, uint32_t RGB_stride, 388 YCbCrType yuv_type) 389{ 390 const YUV2RGBParam *const param = &(YUV2RGB[yuv_type]); 391#if YUV_FORMAT == YUV_FORMAT_420 392 const int y_pixel_stride = 1; 393 const int uv_pixel_stride = 1; 394 const int uv_x_sample_interval = 2; 395 const int uv_y_sample_interval = 2; 396#elif YUV_FORMAT == YUV_FORMAT_422 397 const int y_pixel_stride = 2; 398 const int uv_pixel_stride = 4; 399 const int uv_x_sample_interval = 2; 400 const int uv_y_sample_interval = 1; 401#elif YUV_FORMAT == YUV_FORMAT_NV12 402 const int y_pixel_stride = 1; 403 const int uv_pixel_stride = 2; 404 const int uv_x_sample_interval = 2; 405 const int uv_y_sample_interval = 2; 406#endif 407#if RGB_FORMAT == RGB_FORMAT_RGB565 408 const int rgb_pixel_stride = 2; 409#elif RGB_FORMAT == RGB_FORMAT_RGB24 410 const int rgb_pixel_stride = 3; 411#elif RGB_FORMAT == RGB_FORMAT_RGBA || RGB_FORMAT == RGB_FORMAT_BGRA || \ 412 RGB_FORMAT == 
RGB_FORMAT_ARGB || RGB_FORMAT == RGB_FORMAT_ABGR 413 const int rgb_pixel_stride = 4; 414#else 415#error Unknown RGB pixel size 416#endif 417 418 if (width >= 32) { 419 uint32_t xpos, ypos; 420 for(ypos=0; ypos<(height-(uv_y_sample_interval-1)); ypos+=uv_y_sample_interval) 421 { 422 const uint8_t *y_ptr1=Y+ypos*Y_stride, 423 *y_ptr2=Y+(ypos+1)*Y_stride, 424 *u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride, 425 *v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride; 426 427 uint8_t *rgb_ptr1=RGB+ypos*RGB_stride, 428 *rgb_ptr2=RGB+(ypos+1)*RGB_stride; 429 430 for(xpos=0; xpos<(width-31); xpos+=32) 431 { 432 YUV2RGB_32 433 { 434 PACK_PIXEL 435 SAVE_LINE1 436 if (uv_y_sample_interval > 1) 437 { 438 SAVE_LINE2 439 } 440 } 441 442 y_ptr1+=32*y_pixel_stride; 443 y_ptr2+=32*y_pixel_stride; 444 u_ptr+=32*uv_pixel_stride/uv_x_sample_interval; 445 v_ptr+=32*uv_pixel_stride/uv_x_sample_interval; 446 rgb_ptr1+=32*rgb_pixel_stride; 447 rgb_ptr2+=32*rgb_pixel_stride; 448 } 449 } 450 451 /* Catch the last line, if needed */ 452 if (uv_y_sample_interval == 2 && ypos == (height-1)) 453 { 454 const uint8_t *y_ptr=Y+ypos*Y_stride, 455 *u_ptr=U+(ypos/uv_y_sample_interval)*UV_stride, 456 *v_ptr=V+(ypos/uv_y_sample_interval)*UV_stride; 457 458 uint8_t *rgb_ptr=RGB+ypos*RGB_stride; 459 460 STD_FUNCTION_NAME(width, 1, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type); 461 } 462 } 463 464 /* Catch the right column, if needed */ 465 { 466 int converted = (width & ~31); 467 if (converted != width) 468 { 469 const uint8_t *y_ptr=Y+converted*y_pixel_stride, 470 *u_ptr=U+converted*uv_pixel_stride/uv_x_sample_interval, 471 *v_ptr=V+converted*uv_pixel_stride/uv_x_sample_interval; 472 473 uint8_t *rgb_ptr=RGB+converted*rgb_pixel_stride; 474 475 STD_FUNCTION_NAME(width-converted, height, y_ptr, u_ptr, v_ptr, Y_stride, UV_stride, rgb_ptr, RGB_stride, yuv_type); 476 } 477 } 478} 479 480#undef SSE_FUNCTION_NAME 481#undef STD_FUNCTION_NAME 482#undef YUV_FORMAT 483#undef RGB_FORMAT 484#undef 
SSE_ALIGNED 485#undef LOAD_SI128 486#undef SAVE_SI128 487#undef UV2RGB_16 488#undef ADD_Y2RGB_16 489#undef PACK_RGB24_32_STEP1 490#undef PACK_RGB24_32_STEP2 491#undef PACK_RGB24_32 492#undef PACK_RGBA_32 493#undef PACK_PIXEL 494#undef SAVE_LINE1 495#undef SAVE_LINE2 496#undef READ_Y 497#undef READ_UV 498#undef YUV2RGB_32 499
[FILE END]
(C) 2025 0x4248. (C) 2025 4248 Media and 4248 Systems, all part of 0x4248. See the LICENCE files for more information. Not all files are by 0x4248; always check the licensing.