Atlas - SDL_blit_A.c

Home / ext / SDL / src / video Lines: 1 | Size: 55422 bytes [Download] [Show on GitHub] [Search similar files] [Raw] [Raw (proxy)]
[FILE BEGIN]
1/* 2 Simple DirectMedia Layer 3 Copyright (C) 1997-2025 Sam Lantinga <[email protected]> 4 5 This software is provided 'as-is', without any express or implied 6 warranty. In no event will the authors be held liable for any damages 7 arising from the use of this software. 8 9 Permission is granted to anyone to use this software for any purpose, 10 including commercial applications, and to alter it and redistribute it 11 freely, subject to the following restrictions: 12 13 1. The origin of this software must not be misrepresented; you must not 14 claim that you wrote the original software. If you use this software 15 in a product, an acknowledgment in the product documentation would be 16 appreciated but is not required. 17 2. Altered source versions must be plainly marked as such, and must not be 18 misrepresented as being the original software. 19 3. This notice may not be removed or altered from any source distribution. 20*/ 21#include "SDL_internal.h" 22 23#ifdef SDL_HAVE_BLIT_A 24 25#include "SDL_pixels_c.h" 26#include "SDL_surface_c.h" 27 28// Functions to perform alpha blended blitting 29 30// N->1 blending with per-surface alpha 31static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info) 32{ 33 int width = info->dst_w; 34 int height = info->dst_h; 35 Uint8 *src = info->src; 36 int srcskip = info->src_skip; 37 Uint8 *dst = info->dst; 38 int dstskip = info->dst_skip; 39 Uint8 *palmap = info->table; 40 const SDL_PixelFormatDetails *srcfmt = info->src_fmt; 41 const SDL_Color *dstpal = info->dst_pal->colors; 42 int srcbpp = srcfmt->bytes_per_pixel; 43 Uint32 Pixel; 44 unsigned sR, sG, sB; 45 unsigned dR, dG, dB; 46 const unsigned A = info->a; 47 48 while (height--) { 49 /* *INDENT-OFF* */ // clang-format off 50 DUFFS_LOOP( 51 { 52 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB); 53 dR = dstpal[*dst].r; 54 dG = dstpal[*dst].g; 55 dB = dstpal[*dst].b; 56 ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB); 57 dR &= 0xff; 58 dG &= 0xff; 59 dB &= 0xff; 60 // Pack RGB into 
8bit pixel 61 if ( palmap == NULL ) { 62 *dst = (Uint8)(((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))); 63 } else { 64 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))]; 65 } 66 dst++; 67 src += srcbpp; 68 }, 69 width); 70 /* *INDENT-ON* */ // clang-format on 71 src += srcskip; 72 dst += dstskip; 73 } 74} 75 76// N->1 blending with pixel alpha 77static void BlitNto1PixelAlpha(SDL_BlitInfo *info) 78{ 79 int width = info->dst_w; 80 int height = info->dst_h; 81 Uint8 *src = info->src; 82 int srcskip = info->src_skip; 83 Uint8 *dst = info->dst; 84 int dstskip = info->dst_skip; 85 Uint8 *palmap = info->table; 86 const SDL_PixelFormatDetails *srcfmt = info->src_fmt; 87 const SDL_Color *dstpal = info->dst_pal->colors; 88 int srcbpp = srcfmt->bytes_per_pixel; 89 Uint32 Pixel; 90 unsigned sR, sG, sB, sA; 91 unsigned dR, dG, dB; 92 93 while (height--) { 94 /* *INDENT-OFF* */ // clang-format off 95 DUFFS_LOOP( 96 { 97 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA); 98 dR = dstpal[*dst].r; 99 dG = dstpal[*dst].g; 100 dB = dstpal[*dst].b; 101 ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB); 102 dR &= 0xff; 103 dG &= 0xff; 104 dB &= 0xff; 105 // Pack RGB into 8bit pixel 106 if ( palmap == NULL ) { 107 *dst = (Uint8)(((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))); 108 } else { 109 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))]; 110 } 111 dst++; 112 src += srcbpp; 113 }, 114 width); 115 /* *INDENT-ON* */ // clang-format on 116 src += srcskip; 117 dst += dstskip; 118 } 119} 120 121// colorkeyed N->1 blending with per-surface alpha 122static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info) 123{ 124 int width = info->dst_w; 125 int height = info->dst_h; 126 Uint8 *src = info->src; 127 int srcskip = info->src_skip; 128 Uint8 *dst = info->dst; 129 int dstskip = info->dst_skip; 130 Uint8 *palmap = info->table; 131 const SDL_PixelFormatDetails *srcfmt = info->src_fmt; 132 const SDL_Color *dstpal = info->dst_pal->colors; 133 int srcbpp = 
srcfmt->bytes_per_pixel; 134 Uint32 ckey = info->colorkey; 135 Uint32 Pixel; 136 unsigned sR, sG, sB; 137 unsigned dR, dG, dB; 138 const unsigned A = info->a; 139 140 while (height--) { 141 /* *INDENT-OFF* */ // clang-format off 142 DUFFS_LOOP( 143 { 144 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB); 145 if ( Pixel != ckey ) { 146 dR = dstpal[*dst].r; 147 dG = dstpal[*dst].g; 148 dB = dstpal[*dst].b; 149 ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB); 150 dR &= 0xff; 151 dG &= 0xff; 152 dB &= 0xff; 153 // Pack RGB into 8bit pixel 154 if ( palmap == NULL ) { 155 *dst = (Uint8)(((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))); 156 } else { 157 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))]; 158 } 159 } 160 dst++; 161 src += srcbpp; 162 }, 163 width); 164 /* *INDENT-ON* */ // clang-format on 165 src += srcskip; 166 dst += dstskip; 167 } 168} 169 170#ifdef SDL_SSE2_INTRINSICS 171 172static void SDL_TARGETING("sse2") Blit888to888SurfaceAlphaSSE2(SDL_BlitInfo *info) 173{ 174 int width = info->dst_w; 175 int height = info->dst_h; 176 Uint8 *src = info->src; 177 int srcskip = info->src_skip; 178 Uint8 *dst = info->dst; 179 int dstskip = info->dst_skip; 180 Uint8 alpha = info->a; 181 182 const __m128i alpha_fill_mask = _mm_set1_epi32((int)0xff000000); 183 const __m128i srcA = _mm_set1_epi16(alpha); 184 185 while (height--) { 186 int i = 0; 187 188 for (; i + 4 <= width; i += 4) { 189 // Load 4 src pixels 190 __m128i src128 = _mm_loadu_si128((__m128i *)src); 191 192 // Load 4 dst pixels 193 __m128i dst128 = _mm_loadu_si128((__m128i *)dst); 194 195 __m128i src_lo = _mm_unpacklo_epi8(src128, _mm_setzero_si128()); 196 __m128i src_hi = _mm_unpackhi_epi8(src128, _mm_setzero_si128()); 197 198 __m128i dst_lo = _mm_unpacklo_epi8(dst128, _mm_setzero_si128()); 199 __m128i dst_hi = _mm_unpackhi_epi8(dst128, _mm_setzero_si128()); 200 201 // dst = ((src - dst) * srcA) + ((dst << 8) - dst) 202 dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srcA), 
203 _mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo)); 204 dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srcA), 205 _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi)); 206 207 // dst += 0x1U (use 0x80 to round instead of floor) 208 dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1)); 209 dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1)); 210 211 // dst = (dst + (dst >> 8)) >> 8 212 dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8); 213 dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8); 214 215 dst128 = _mm_packus_epi16(dst_lo, dst_hi); 216 217 // Set the alpha channels of dst to 255 218 dst128 = _mm_or_si128(dst128, alpha_fill_mask); 219 220 _mm_storeu_si128((__m128i *)dst, dst128); 221 222 src += 16; 223 dst += 16; 224 } 225 226 for (; i < width; ++i) { 227 Uint32 src32 = *(Uint32 *)src; 228 Uint32 dst32 = *(Uint32 *)dst; 229 230 FACTOR_BLEND_8888(src32, dst32, alpha); 231 232 *dst = dst32 | 0xff000000; 233 234 src += 4; 235 dst += 4; 236 } 237 238 src += srcskip; 239 dst += dstskip; 240 } 241} 242 243#endif 244 245#ifdef SDL_LSX_INTRINSICS 246 247static void SDL_TARGETING("lsx") Blit8888to8888PixelAlphaSwizzleLSX(SDL_BlitInfo *info) 248{ 249 int width = info->dst_w; 250 int height = info->dst_h; 251 Uint8 *src = info->src; 252 int srcskip = info->src_skip; 253 Uint8 *dst = info->dst; 254 int dstskip = info->dst_skip; 255 const SDL_PixelFormatDetails *srcfmt = info->src_fmt; 256 const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; 257 bool fill_alpha = !dstfmt->Amask; 258 Uint32 dstAmask, dstAshift; 259 const Uint8 offsets[] = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; 260 261 SDL_Get8888AlphaMaskAndShift(dstfmt, &dstAmask, &dstAshift); 262 263 const __m128i const_0xff00 = __lsx_vreplgr2vr_h(0xff00); 264 const __m128i const_128 = __lsx_vreplgr2vr_b((Uint8)128); 265 const __m128i const_32641 = __lsx_vreplgr2vr_h(32641); 266 const __m128i const_257 = __lsx_vreplgr2vr_h(257); 267 
268 // The byte offsets for the start of each pixel 269 const __m128i mask_offsets = __lsx_vld(offsets, 0); 270 271 const __m128i convert_mask = __lsx_vadd_w( 272 __lsx_vreplgr2vr_w( 273 ((srcfmt->Rshift >> 3) << dstfmt->Rshift) | 274 ((srcfmt->Gshift >> 3) << dstfmt->Gshift) | 275 ((srcfmt->Bshift >> 3) << dstfmt->Bshift)), 276 mask_offsets); 277 278 const __m128i alpha_splat_mask = __lsx_vadd_b(__lsx_vreplgr2vr_b(srcfmt->Ashift >> 3), mask_offsets); 279 const __m128i alpha_fill_mask = __lsx_vreplgr2vr_w((int)dstAmask); 280 281 while (height--) { 282 int i = 0; 283 284 for (; i + 4 <= width; i += 4) { 285 __m128i src128 = __lsx_vld(src, 0); 286 __m128i dst128 = __lsx_vld(dst, 0); 287 288 __m128i srcA = __lsx_vshuf_b(src128, src128, alpha_splat_mask); 289 src128 = __lsx_vshuf_b(src128, src128, convert_mask); 290 291 src128 = __lsx_vor_v(src128, alpha_fill_mask); 292 293 __m128i srca_lo = __lsx_vilvl_b(srcA, srcA); 294 __m128i srca_hi = __lsx_vilvh_b(srcA, srcA); 295 296 srca_lo = __lsx_vxor_v(srca_lo, const_0xff00); 297 srca_hi = __lsx_vxor_v(srca_hi, const_0xff00); 298 299 src128 = __lsx_vsub_b(src128, const_128); 300 dst128 = __lsx_vsub_b(dst128, const_128); 301 302 __m128i tmp = __lsx_vilvl_b(dst128, src128); 303 __m128i dst_lo = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(srca_lo, tmp), __lsx_vmulwod_h_bu_b(srca_lo, tmp)); 304 tmp = __lsx_vilvh_b(dst128, src128); 305 __m128i dst_hi = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(srca_hi, tmp), __lsx_vmulwod_h_bu_b(srca_hi, tmp)); 306 307 dst_lo = __lsx_vadd_h(dst_lo, const_32641); 308 dst_hi = __lsx_vadd_h(dst_hi, const_32641); 309 310 dst_lo = __lsx_vmuh_hu(dst_lo, const_257); 311 dst_hi = __lsx_vmuh_hu(dst_hi, const_257); 312 313 dst128 = __lsx_vssrarni_bu_h(dst_hi, dst_lo, 0); 314 if (fill_alpha) { 315 dst128 = __lsx_vor_v(dst128, alpha_fill_mask); 316 } 317 __lsx_vst(dst128, dst, 0); 318 319 src += 16; 320 dst += 16; 321 } 322 323 for (; i < width; ++i) { 324 Uint32 src32 = *(Uint32 *)src; 325 Uint32 dst32 = *(Uint32 *)dst; 
326 ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt); 327 if (fill_alpha) { 328 dst32 |= dstAmask; 329 } 330 *(Uint32 *)dst = dst32; 331 src += 4; 332 dst += 4; 333 } 334 335 src += srcskip; 336 dst += dstskip; 337 } 338} 339 340#endif 341 342// fast RGB888->(A)RGB888 blending with surface alpha=128 special case 343static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info) 344{ 345 int width = info->dst_w; 346 int height = info->dst_h; 347 Uint32 *srcp = (Uint32 *)info->src; 348 int srcskip = info->src_skip >> 2; 349 Uint32 *dstp = (Uint32 *)info->dst; 350 int dstskip = info->dst_skip >> 2; 351 352 while (height--) { 353 /* *INDENT-OFF* */ // clang-format off 354 DUFFS_LOOP({ 355 Uint32 s = *srcp++; 356 Uint32 d = *dstp; 357 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) 358 + (s & d & 0x00010101)) | 0xff000000; 359 }, width); 360 /* *INDENT-ON* */ // clang-format on 361 srcp += srcskip; 362 dstp += dstskip; 363 } 364} 365 366// fast RGB888->(A)RGB888 blending with surface alpha 367static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info) 368{ 369 unsigned alpha = info->a; 370 if (alpha == 128) { 371 BlitRGBtoRGBSurfaceAlpha128(info); 372 } else { 373 int width = info->dst_w; 374 int height = info->dst_h; 375 Uint32 *srcp = (Uint32 *)info->src; 376 int srcskip = info->src_skip >> 2; 377 Uint32 *dstp = (Uint32 *)info->dst; 378 int dstskip = info->dst_skip >> 2; 379 Uint32 s; 380 Uint32 d; 381 382 while (height--) { 383 /* *INDENT-OFF* */ // clang-format off 384 DUFFS_LOOP({ 385 s = *srcp; 386 d = *dstp; 387 388 FACTOR_BLEND_8888(s, d, alpha); 389 390 *dstp = d | 0xff000000; 391 ++srcp; 392 ++dstp; 393 }, width); 394 /* *INDENT-ON* */ // clang-format on 395 srcp += srcskip; 396 dstp += dstskip; 397 } 398 } 399} 400 401// 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel 402 403// blend a single 16 bit pixel at 50% 404#define BLEND16_50(d, s, mask) \ 405 ((((s & mask) + (d & mask)) >> 1) + (s & d & (~mask & 0xffff))) 406 407// 
blend two 16 bit pixels at 50% 408#define BLEND2x16_50(d, s, mask) \ 409 (((s & (mask | mask << 16)) >> 1) + ((d & (mask | mask << 16)) >> 1) + (s & d & (~(mask | mask << 16)))) 410 411static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask) 412{ 413 int width = info->dst_w; 414 int height = info->dst_h; 415 Uint16 *srcp = (Uint16 *)info->src; 416 int srcskip = info->src_skip >> 1; 417 Uint16 *dstp = (Uint16 *)info->dst; 418 int dstskip = info->dst_skip >> 1; 419 420 while (height--) { 421 if (((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) { 422 /* 423 * Source and destination not aligned, pipeline it. 424 * This is mostly a win for big blits but no loss for 425 * small ones 426 */ 427 Uint32 prev_sw; 428 int w = width; 429 430 // handle odd destination 431 if ((uintptr_t)dstp & 2) { 432 Uint16 d = *dstp, s = *srcp; 433 *dstp = BLEND16_50(d, s, mask); 434 dstp++; 435 srcp++; 436 w--; 437 } 438 srcp++; // srcp is now 32-bit aligned 439 440 // bootstrap pipeline with first halfword 441 prev_sw = ((Uint32 *)srcp)[-1]; 442 443 while (w > 1) { 444 Uint32 sw, dw, s; 445 sw = *(Uint32 *)srcp; 446 dw = *(Uint32 *)dstp; 447#if SDL_BYTEORDER == SDL_BIG_ENDIAN 448 s = (prev_sw << 16) + (sw >> 16); 449#else 450 s = (prev_sw >> 16) + (sw << 16); 451#endif 452 prev_sw = sw; 453 *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask); 454 dstp += 2; 455 srcp += 2; 456 w -= 2; 457 } 458 459 // final pixel if any 460 if (w) { 461 Uint16 d = *dstp, s; 462#if SDL_BYTEORDER == SDL_BIG_ENDIAN 463 s = (Uint16)prev_sw; 464#else 465 s = (Uint16)(prev_sw >> 16); 466#endif 467 *dstp = BLEND16_50(d, s, mask); 468 srcp++; 469 dstp++; 470 } 471 srcp += srcskip - 1; 472 dstp += dstskip; 473 } else { 474 // source and destination are aligned 475 int w = width; 476 477 // first odd pixel? 
478 if ((uintptr_t)srcp & 2) { 479 Uint16 d = *dstp, s = *srcp; 480 *dstp = BLEND16_50(d, s, mask); 481 srcp++; 482 dstp++; 483 w--; 484 } 485 // srcp and dstp are now 32-bit aligned 486 487 while (w > 1) { 488 Uint32 sw = *(Uint32 *)srcp; 489 Uint32 dw = *(Uint32 *)dstp; 490 *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask); 491 srcp += 2; 492 dstp += 2; 493 w -= 2; 494 } 495 496 // last odd pixel? 497 if (w) { 498 Uint16 d = *dstp, s = *srcp; 499 *dstp = BLEND16_50(d, s, mask); 500 srcp++; 501 dstp++; 502 } 503 srcp += srcskip; 504 dstp += dstskip; 505 } 506 } 507} 508 509#ifdef SDL_MMX_INTRINSICS 510 511// fast RGB565->RGB565 blending with surface alpha 512static void SDL_TARGETING("mmx") Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info) 513{ 514 unsigned alpha = info->a; 515 if (alpha == 128) { 516 Blit16to16SurfaceAlpha128(info, 0xf7de); 517 } else { 518 int width = info->dst_w; 519 int height = info->dst_h; 520 Uint16 *srcp = (Uint16 *)info->src; 521 int srcskip = info->src_skip >> 1; 522 Uint16 *dstp = (Uint16 *)info->dst; 523 int dstskip = info->dst_skip >> 1; 524 Uint32 s, d; 525 526#ifdef USE_DUFFS_LOOP 527 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha; 528 529 alpha &= ~(1 + 2 + 4); // cut alpha to get the exact same behaviour 530 mm_alpha = _mm_set_pi32(0, alpha); // 0000000A -> mm_alpha 531 alpha >>= 3; // downscale alpha to 5 bits 532 533 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); // 00000A0A -> mm_alpha 534 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); // 0A0A0A0A -> mm_alpha 535 /* position alpha to allow for mullo and mulhi on diff channels 536 to reduce the number of operations */ 537 mm_alpha = _mm_slli_si64(mm_alpha, 3); 538 539 // Setup the 565 color channel masks 540 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); // MASKGREEN -> gmask 541 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); // MASKBLUE -> bmask 542#endif 543 544 while (height--) { 545 /* *INDENT-OFF* */ // clang-format off 546 DUFFS_LOOP_124( 547 { 548 s = *srcp++; 
549 d = *dstp; 550 /* 551 * shift out the middle component (green) to 552 * the high 16 bits, and process all three RGB 553 * components at the same time. 554 */ 555 s = (s | s << 16) & 0x07e0f81f; 556 d = (d | d << 16) & 0x07e0f81f; 557 d += (s - d) * alpha >> 5; 558 d &= 0x07e0f81f; 559 *dstp++ = (Uint16)(d | d >> 16); 560 },{ 561 s = *srcp++; 562 d = *dstp; 563 /* 564 * shift out the middle component (green) to 565 * the high 16 bits, and process all three RGB 566 * components at the same time. 567 */ 568 s = (s | s << 16) & 0x07e0f81f; 569 d = (d | d << 16) & 0x07e0f81f; 570 d += (s - d) * alpha >> 5; 571 d &= 0x07e0f81f; 572 *dstp++ = (Uint16)(d | d >> 16); 573 s = *srcp++; 574 d = *dstp; 575 /* 576 * shift out the middle component (green) to 577 * the high 16 bits, and process all three RGB 578 * components at the same time. 579 */ 580 s = (s | s << 16) & 0x07e0f81f; 581 d = (d | d << 16) & 0x07e0f81f; 582 d += (s - d) * alpha >> 5; 583 d &= 0x07e0f81f; 584 *dstp++ = (Uint16)(d | d >> 16); 585 },{ 586 src1 = *(__m64 *)srcp; // 4 src pixels -> src1 587 dst1 = *(__m64 *)dstp; // 4 dst pixels -> dst1 588 589 // red 590 src2 = src1; 591 src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2 [000r 000r 000r 000r] 592 593 dst2 = dst1; 594 dst2 = _mm_srli_pi16(dst2, 11); // dst2 >> 11 -> dst2 [000r 000r 000r 000r] 595 596 // blend 597 src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2 598 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 599 src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2 600 dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2 601 dst2 = _mm_slli_pi16(dst2, 11); // dst2 << 11 -> dst2 602 603 mm_res = dst2; // RED -> mm_res 604 605 // green -- process the bits in place 606 src2 = src1; 607 src2 = _mm_and_si64(src2, gmask); // src & MASKGREEN -> src2 608 609 dst2 = dst1; 610 dst2 = _mm_and_si64(dst2, gmask); // dst & MASKGREEN -> dst2 611 612 // blend 613 src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2 614 src2 = 
_mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 615 src2 = _mm_slli_pi16(src2, 5); // src2 << 5 -> src2 616 dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2 617 618 mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN -> mm_res 619 620 // blue 621 src2 = src1; 622 src2 = _mm_and_si64(src2, bmask); // src & MASKBLUE -> src2[000b 000b 000b 000b] 623 624 dst2 = dst1; 625 dst2 = _mm_and_si64(dst2, bmask); // dst & MASKBLUE -> dst2[000b 000b 000b 000b] 626 627 // blend 628 src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2 629 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 630 src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2 631 dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2 632 dst2 = _mm_and_si64(dst2, bmask); // dst2 & MASKBLUE -> dst2 633 634 mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN | BLUE -> mm_res 635 636 *(__m64 *)dstp = mm_res; // mm_res -> 4 dst pixels 637 638 srcp += 4; 639 dstp += 4; 640 }, width); 641 /* *INDENT-ON* */ // clang-format on 642 srcp += srcskip; 643 dstp += dstskip; 644 } 645 _mm_empty(); 646 } 647} 648 649// fast RGB555->RGB555 blending with surface alpha 650static void SDL_TARGETING("mmx") Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info) 651{ 652 unsigned alpha = info->a; 653 if (alpha == 128) { 654 Blit16to16SurfaceAlpha128(info, 0xfbde); 655 } else { 656 int width = info->dst_w; 657 int height = info->dst_h; 658 Uint16 *srcp = (Uint16 *)info->src; 659 int srcskip = info->src_skip >> 1; 660 Uint16 *dstp = (Uint16 *)info->dst; 661 int dstskip = info->dst_skip >> 1; 662 Uint32 s, d; 663 664#ifdef USE_DUFFS_LOOP 665 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha; 666 667 alpha &= ~(1 + 2 + 4); // cut alpha to get the exact same behaviour 668 mm_alpha = _mm_set_pi32(0, alpha); // 0000000A -> mm_alpha 669 alpha >>= 3; // downscale alpha to 5 bits 670 671 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); // 00000A0A -> mm_alpha 672 mm_alpha = 
_mm_unpacklo_pi32(mm_alpha, mm_alpha); // 0A0A0A0A -> mm_alpha 673 /* position alpha to allow for mullo and mulhi on diff channels 674 to reduce the number of operations */ 675 mm_alpha = _mm_slli_si64(mm_alpha, 3); 676 677 // Setup the 555 color channel masks 678 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); // MASKRED -> rmask 679 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); // MASKGREEN -> gmask 680 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); // MASKBLUE -> bmask 681#endif 682 while (height--) { 683 /* *INDENT-OFF* */ // clang-format off 684 DUFFS_LOOP_124( 685 { 686 s = *srcp++; 687 d = *dstp; 688 /* 689 * shift out the middle component (green) to 690 * the high 16 bits, and process all three RGB 691 * components at the same time. 692 */ 693 s = (s | s << 16) & 0x03e07c1f; 694 d = (d | d << 16) & 0x03e07c1f; 695 d += (s - d) * alpha >> 5; 696 d &= 0x03e07c1f; 697 *dstp++ = (Uint16)(d | d >> 16); 698 },{ 699 s = *srcp++; 700 d = *dstp; 701 /* 702 * shift out the middle component (green) to 703 * the high 16 bits, and process all three RGB 704 * components at the same time. 705 */ 706 s = (s | s << 16) & 0x03e07c1f; 707 d = (d | d << 16) & 0x03e07c1f; 708 d += (s - d) * alpha >> 5; 709 d &= 0x03e07c1f; 710 *dstp++ = (Uint16)(d | d >> 16); 711 s = *srcp++; 712 d = *dstp; 713 /* 714 * shift out the middle component (green) to 715 * the high 16 bits, and process all three RGB 716 * components at the same time. 
717 */ 718 s = (s | s << 16) & 0x03e07c1f; 719 d = (d | d << 16) & 0x03e07c1f; 720 d += (s - d) * alpha >> 5; 721 d &= 0x03e07c1f; 722 *dstp++ = (Uint16)(d | d >> 16); 723 },{ 724 src1 = *(__m64 *)srcp; // 4 src pixels -> src1 725 dst1 = *(__m64 *)dstp; // 4 dst pixels -> dst1 726 727 // red -- process the bits in place 728 src2 = src1; 729 src2 = _mm_and_si64(src2, rmask); // src & MASKRED -> src2 730 731 dst2 = dst1; 732 dst2 = _mm_and_si64(dst2, rmask); // dst & MASKRED -> dst2 733 734 // blend 735 src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2 736 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 737 src2 = _mm_slli_pi16(src2, 5); // src2 << 5 -> src2 738 dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2 739 dst2 = _mm_and_si64(dst2, rmask); // dst2 & MASKRED -> dst2 740 741 mm_res = dst2; // RED -> mm_res 742 743 // green -- process the bits in place 744 src2 = src1; 745 src2 = _mm_and_si64(src2, gmask); // src & MASKGREEN -> src2 746 747 dst2 = dst1; 748 dst2 = _mm_and_si64(dst2, gmask); // dst & MASKGREEN -> dst2 749 750 // blend 751 src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2 752 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 753 src2 = _mm_slli_pi16(src2, 5); // src2 << 5 -> src2 754 dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2 755 756 mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN -> mm_res 757 758 // blue 759 src2 = src1; // src -> src2 760 src2 = _mm_and_si64(src2, bmask); // src & MASKBLUE -> src2[000b 000b 000b 000b] 761 762 dst2 = dst1; // dst -> dst2 763 dst2 = _mm_and_si64(dst2, bmask); // dst & MASKBLUE -> dst2[000b 000b 000b 000b] 764 765 // blend 766 src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2 767 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ 768 src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2 769 dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2 770 dst2 = _mm_and_si64(dst2, bmask); // dst2 & MASKBLUE -> dst2 771 772 mm_res = 
_mm_or_si64(mm_res, dst2); // RED | GREEN | BLUE -> mm_res 773 774 *(__m64 *)dstp = mm_res; // mm_res -> 4 dst pixels 775 776 srcp += 4; 777 dstp += 4; 778 }, width); 779 /* *INDENT-ON* */ // clang-format on 780 srcp += srcskip; 781 dstp += dstskip; 782 } 783 _mm_empty(); 784 } 785} 786 787#endif // SDL_MMX_INTRINSICS 788 789// fast RGB565->RGB565 blending with surface alpha 790static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info) 791{ 792 unsigned alpha = info->a; 793 if (alpha == 128) { 794 Blit16to16SurfaceAlpha128(info, 0xf7de); 795 } else { 796 int width = info->dst_w; 797 int height = info->dst_h; 798 Uint16 *srcp = (Uint16 *)info->src; 799 int srcskip = info->src_skip >> 1; 800 Uint16 *dstp = (Uint16 *)info->dst; 801 int dstskip = info->dst_skip >> 1; 802 alpha >>= 3; // downscale alpha to 5 bits 803 804 while (height--) { 805 /* *INDENT-OFF* */ // clang-format off 806 DUFFS_LOOP({ 807 Uint32 s = *srcp++; 808 Uint32 d = *dstp; 809 /* 810 * shift out the middle component (green) to 811 * the high 16 bits, and process all three RGB 812 * components at the same time. 
813 */ 814 s = (s | s << 16) & 0x07e0f81f; 815 d = (d | d << 16) & 0x07e0f81f; 816 d += (s - d) * alpha >> 5; 817 d &= 0x07e0f81f; 818 *dstp++ = (Uint16)(d | d >> 16); 819 }, width); 820 /* *INDENT-ON* */ // clang-format on 821 srcp += srcskip; 822 dstp += dstskip; 823 } 824 } 825} 826 827// fast RGB555->RGB555 blending with surface alpha 828static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info) 829{ 830 unsigned alpha = info->a; // downscale alpha to 5 bits 831 if (alpha == 128) { 832 Blit16to16SurfaceAlpha128(info, 0xfbde); 833 } else { 834 int width = info->dst_w; 835 int height = info->dst_h; 836 Uint16 *srcp = (Uint16 *)info->src; 837 int srcskip = info->src_skip >> 1; 838 Uint16 *dstp = (Uint16 *)info->dst; 839 int dstskip = info->dst_skip >> 1; 840 alpha >>= 3; // downscale alpha to 5 bits 841 842 while (height--) { 843 /* *INDENT-OFF* */ // clang-format off 844 DUFFS_LOOP({ 845 Uint32 s = *srcp++; 846 Uint32 d = *dstp; 847 /* 848 * shift out the middle component (green) to 849 * the high 16 bits, and process all three RGB 850 * components at the same time. 
851 */ 852 s = (s | s << 16) & 0x03e07c1f; 853 d = (d | d << 16) & 0x03e07c1f; 854 d += (s - d) * alpha >> 5; 855 d &= 0x03e07c1f; 856 *dstp++ = (Uint16)(d | d >> 16); 857 }, width); 858 /* *INDENT-ON* */ // clang-format on 859 srcp += srcskip; 860 dstp += dstskip; 861 } 862 } 863} 864 865// fast ARGB8888->RGB565 blending with pixel alpha 866static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info) 867{ 868 int width = info->dst_w; 869 int height = info->dst_h; 870 Uint32 *srcp = (Uint32 *)info->src; 871 int srcskip = info->src_skip >> 2; 872 Uint16 *dstp = (Uint16 *)info->dst; 873 int dstskip = info->dst_skip >> 1; 874 875 while (height--) { 876 /* *INDENT-OFF* */ // clang-format off 877 DUFFS_LOOP({ 878 Uint32 s = *srcp; 879 unsigned alpha = s >> 27; // downscale alpha to 5 bits 880 /* Here we special-case opaque alpha since the 881 compositioning used (>>8 instead of /255) doesn't handle 882 it correctly. */ 883 if (alpha) { 884 if (alpha == (SDL_ALPHA_OPAQUE >> 3)) { 885 *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f)); 886 } else { 887 Uint32 d = *dstp; 888 /* 889 * convert source and destination to G0RAB65565 890 * and blend all components at the same time 891 */ 892 s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800) + (s >> 3 & 0x1f); 893 d = (d | d << 16) & 0x07e0f81f; 894 d += (s - d) * alpha >> 5; 895 d &= 0x07e0f81f; 896 *dstp = (Uint16)(d | d >> 16); 897 } 898 } 899 srcp++; 900 dstp++; 901 }, width); 902 /* *INDENT-ON* */ // clang-format on 903 srcp += srcskip; 904 dstp += dstskip; 905 } 906} 907 908// fast ARGB8888->RGB555 blending with pixel alpha 909static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info) 910{ 911 int width = info->dst_w; 912 int height = info->dst_h; 913 Uint32 *srcp = (Uint32 *)info->src; 914 int srcskip = info->src_skip >> 2; 915 Uint16 *dstp = (Uint16 *)info->dst; 916 int dstskip = info->dst_skip >> 1; 917 918 while (height--) { 919 /* *INDENT-OFF* */ // clang-format off 920 DUFFS_LOOP({ 921 unsigned alpha; 922 
Uint32 s = *srcp; 923 alpha = s >> 27; // downscale alpha to 5 bits 924 /* Here we special-case opaque alpha since the 925 compositioning used (>>8 instead of /255) doesn't handle 926 it correctly. */ 927 if (alpha) { 928 if (alpha == (SDL_ALPHA_OPAQUE >> 3)) { 929 *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f)); 930 } else { 931 Uint32 d = *dstp; 932 /* 933 * convert source and destination to G0RAB55555 934 * and blend all components at the same time 935 */ 936 s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00) + (s >> 3 & 0x1f); 937 d = (d | d << 16) & 0x03e07c1f; 938 d += (s - d) * alpha >> 5; 939 d &= 0x03e07c1f; 940 *dstp = (Uint16)(d | d >> 16); 941 } 942 } 943 srcp++; 944 dstp++; 945 }, width); 946 /* *INDENT-ON* */ // clang-format on 947 srcp += srcskip; 948 dstp += dstskip; 949 } 950} 951 952// General (slow) N->N blending with per-surface alpha 953static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info) 954{ 955 int width = info->dst_w; 956 int height = info->dst_h; 957 Uint8 *src = info->src; 958 int srcskip = info->src_skip; 959 Uint8 *dst = info->dst; 960 int dstskip = info->dst_skip; 961 const SDL_PixelFormatDetails *srcfmt = info->src_fmt; 962 const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; 963 int srcbpp = srcfmt->bytes_per_pixel; 964 int dstbpp = dstfmt->bytes_per_pixel; 965 Uint32 Pixel; 966 unsigned sR, sG, sB; 967 unsigned dR, dG, dB, dA; 968 const unsigned sA = info->a; 969 970 if (sA) { 971 while (height--) { 972 /* *INDENT-OFF* */ // clang-format off 973 DUFFS_LOOP( 974 { 975 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB); 976 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA); 977 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA); 978 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); 979 src += srcbpp; 980 dst += dstbpp; 981 }, 982 width); 983 /* *INDENT-ON* */ // clang-format on 984 src += srcskip; 985 dst += dstskip; 986 } 987 } 988} 989 990// General (slow) colorkeyed N->N blending with per-surface alpha 
991static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info) 992{ 993 int width = info->dst_w; 994 int height = info->dst_h; 995 Uint8 *src = info->src; 996 int srcskip = info->src_skip; 997 Uint8 *dst = info->dst; 998 int dstskip = info->dst_skip; 999 const SDL_PixelFormatDetails *srcfmt = info->src_fmt; 1000 const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; 1001 Uint32 ckey = info->colorkey; 1002 int srcbpp = srcfmt->bytes_per_pixel; 1003 int dstbpp = dstfmt->bytes_per_pixel; 1004 Uint32 Pixel; 1005 unsigned sR, sG, sB; 1006 unsigned dR, dG, dB, dA; 1007 const unsigned sA = info->a; 1008 1009 while (height--) { 1010 /* *INDENT-OFF* */ // clang-format off 1011 DUFFS_LOOP( 1012 { 1013 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel); 1014 if (sA && Pixel != ckey) { 1015 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); 1016 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA); 1017 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA); 1018 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); 1019 } 1020 src += srcbpp; 1021 dst += dstbpp; 1022 }, 1023 width); 1024 /* *INDENT-ON* */ // clang-format on 1025 src += srcskip; 1026 dst += dstskip; 1027 } 1028} 1029 1030// Fast 32-bit RGBA->RGBA blending with pixel alpha 1031static void Blit8888to8888PixelAlpha(SDL_BlitInfo *info) 1032{ 1033 int width = info->dst_w; 1034 int height = info->dst_h; 1035 Uint8 *src = info->src; 1036 int srcskip = info->src_skip; 1037 Uint8 *dst = info->dst; 1038 int dstskip = info->dst_skip; 1039 const SDL_PixelFormatDetails *srcfmt = info->src_fmt; 1040 1041 while (height--) { 1042 int i = 0; 1043 1044 for (; i < width; ++i) { 1045 Uint32 src32 = *(Uint32 *)src; 1046 Uint32 dst32 = *(Uint32 *)dst; 1047 ALPHA_BLEND_8888(src32, dst32, srcfmt); 1048 *(Uint32 *)dst = dst32; 1049 src += 4; 1050 dst += 4; 1051 } 1052 1053 src += srcskip; 1054 dst += dstskip; 1055 } 1056} 1057 1058// Fast 32-bit RGBA->RGB(A) blending with pixel alpha and src swizzling 1059static void Blit8888to8888PixelAlphaSwizzle(SDL_BlitInfo 
*info) 1060{ 1061 int width = info->dst_w; 1062 int height = info->dst_h; 1063 Uint8 *src = info->src; 1064 int srcskip = info->src_skip; 1065 Uint8 *dst = info->dst; 1066 int dstskip = info->dst_skip; 1067 const SDL_PixelFormatDetails *srcfmt = info->src_fmt; 1068 const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; 1069 bool fill_alpha = !dstfmt->Amask; 1070 Uint32 dstAmask, dstAshift; 1071 1072 SDL_Get8888AlphaMaskAndShift(dstfmt, &dstAmask, &dstAshift); 1073 1074 while (height--) { 1075 int i = 0; 1076 1077 for (; i < width; ++i) { 1078 Uint32 src32 = *(Uint32 *)src; 1079 Uint32 dst32 = *(Uint32 *)dst; 1080 ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt); 1081 if (fill_alpha) { 1082 dst32 |= dstAmask; 1083 } 1084 *(Uint32 *)dst = dst32; 1085 src += 4; 1086 dst += 4; 1087 } 1088 1089 src += srcskip; 1090 dst += dstskip; 1091 } 1092} 1093 1094#ifdef SDL_SSE4_1_INTRINSICS 1095 1096static void SDL_TARGETING("sse4.1") Blit8888to8888PixelAlphaSwizzleSSE41(SDL_BlitInfo *info) 1097{ 1098 int width = info->dst_w; 1099 int height = info->dst_h; 1100 Uint8 *src = info->src; 1101 int srcskip = info->src_skip; 1102 Uint8 *dst = info->dst; 1103 int dstskip = info->dst_skip; 1104 const SDL_PixelFormatDetails *srcfmt = info->src_fmt; 1105 const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; 1106 bool fill_alpha = !dstfmt->Amask; 1107 Uint32 dstAmask, dstAshift; 1108 1109 SDL_Get8888AlphaMaskAndShift(dstfmt, &dstAmask, &dstAshift); 1110 1111 // The byte offsets for the start of each pixel 1112 const __m128i mask_offsets = _mm_set_epi8( 1113 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); 1114 1115 const __m128i convert_mask = _mm_add_epi32( 1116 _mm_set1_epi32( 1117 ((srcfmt->Rshift >> 3) << dstfmt->Rshift) | 1118 ((srcfmt->Gshift >> 3) << dstfmt->Gshift) | 1119 ((srcfmt->Bshift >> 3) << dstfmt->Bshift)), 1120 mask_offsets); 1121 1122 const __m128i alpha_splat_mask = _mm_add_epi8(_mm_set1_epi8(srcfmt->Ashift >> 3), mask_offsets); 1123 const __m128i alpha_fill_mask 
= _mm_set1_epi32((int)dstAmask); 1124 1125 while (height--) { 1126 int i = 0; 1127 1128 for (; i + 4 <= width; i += 4) { 1129 // Load 4 src pixels 1130 __m128i src128 = _mm_loadu_si128((__m128i *)src); 1131 1132 // Load 4 dst pixels 1133 __m128i dst128 = _mm_loadu_si128((__m128i *)dst); 1134 1135 // Extract the alpha from each pixel and splat it into all the channels 1136 __m128i srcA = _mm_shuffle_epi8(src128, alpha_splat_mask); 1137 1138 // Convert to dst format 1139 src128 = _mm_shuffle_epi8(src128, convert_mask); 1140 1141 // Set the alpha channels of src to 255 1142 src128 = _mm_or_si128(src128, alpha_fill_mask); 1143 1144 // Duplicate each 8-bit alpha value into both bytes of 16-bit lanes 1145 __m128i srca_lo = _mm_unpacklo_epi8(srcA, srcA); 1146 __m128i srca_hi = _mm_unpackhi_epi8(srcA, srcA); 1147 1148 // Calculate 255-srcA in every second 8-bit lane (255-srcA = srcA^0xff) 1149 srca_lo = _mm_xor_si128(srca_lo, _mm_set1_epi16(0xff00)); 1150 srca_hi = _mm_xor_si128(srca_hi, _mm_set1_epi16(0xff00)); 1151 1152 // maddubs expects second argument to be signed, so subtract 128 1153 src128 = _mm_sub_epi8(src128, _mm_set1_epi8((Uint8)128)); 1154 dst128 = _mm_sub_epi8(dst128, _mm_set1_epi8((Uint8)128)); 1155 1156 // dst = srcA*(src-128) + (255-srcA)*(dst-128) = srcA*src + (255-srcA)*dst - 128*255 1157 __m128i dst_lo = _mm_maddubs_epi16(srca_lo, _mm_unpacklo_epi8(src128, dst128)); 1158 __m128i dst_hi = _mm_maddubs_epi16(srca_hi, _mm_unpackhi_epi8(src128, dst128)); 1159 1160 // dst += 0x1U (use 0x80 to round instead of floor) + 128*255 (to fix maddubs result) 1161 dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1 + 128 * 255)); 1162 dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1 + 128 * 255)); 1163 1164 // dst = (dst + (dst >> 8)) >> 8 = (dst * 257) >> 16 1165 dst_lo = _mm_mulhi_epu16(dst_lo, _mm_set1_epi16(257)); 1166 dst_hi = _mm_mulhi_epu16(dst_hi, _mm_set1_epi16(257)); 1167 1168 // Blend the pixels together and save the result 1169 dst128 = _mm_packus_epi16(dst_lo, 
dst_hi); 1170 if (fill_alpha) { 1171 dst128 = _mm_or_si128(dst128, alpha_fill_mask); 1172 } 1173 _mm_storeu_si128((__m128i *)dst, dst128); 1174 1175 src += 16; 1176 dst += 16; 1177 } 1178 1179 for (; i < width; ++i) { 1180 Uint32 src32 = *(Uint32 *)src; 1181 Uint32 dst32 = *(Uint32 *)dst; 1182 ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt); 1183 if (fill_alpha) { 1184 dst32 |= dstAmask; 1185 } 1186 *(Uint32 *)dst = dst32; 1187 src += 4; 1188 dst += 4; 1189 } 1190 1191 src += srcskip; 1192 dst += dstskip; 1193 } 1194} 1195 1196#endif 1197 1198#ifdef SDL_AVX2_INTRINSICS 1199 1200static void SDL_TARGETING("avx2") Blit8888to8888PixelAlphaSwizzleAVX2(SDL_BlitInfo *info) 1201{ 1202 int width = info->dst_w; 1203 int height = info->dst_h; 1204 Uint8 *src = info->src; 1205 int srcskip = info->src_skip; 1206 Uint8 *dst = info->dst; 1207 int dstskip = info->dst_skip; 1208 const SDL_PixelFormatDetails *srcfmt = info->src_fmt; 1209 const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; 1210 bool fill_alpha = !dstfmt->Amask; 1211 Uint32 dstAmask, dstAshift; 1212 1213 SDL_Get8888AlphaMaskAndShift(dstfmt, &dstAmask, &dstAshift); 1214 1215 // The byte offsets for the start of each pixel 1216 const __m256i mask_offsets = _mm256_set_epi8( 1217 28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); 1218 1219 const __m256i convert_mask = _mm256_add_epi32( 1220 _mm256_set1_epi32( 1221 ((srcfmt->Rshift >> 3) << dstfmt->Rshift) | 1222 ((srcfmt->Gshift >> 3) << dstfmt->Gshift) | 1223 ((srcfmt->Bshift >> 3) << dstfmt->Bshift)), 1224 mask_offsets); 1225 1226 const __m256i alpha_splat_mask = _mm256_add_epi8(_mm256_set1_epi8(srcfmt->Ashift >> 3), mask_offsets); 1227 const __m256i alpha_fill_mask = _mm256_set1_epi32((int)dstAmask); 1228 1229 while (height--) { 1230 int i = 0; 1231 1232 for (; i + 8 <= width; i += 8) { 1233 // Load 8 src pixels 1234 __m256i src256 = _mm256_loadu_si256((__m256i *)src); 1235 1236 // Load 8 dst 
pixels 1237 __m256i dst256 = _mm256_loadu_si256((__m256i *)dst); 1238 1239 // Extract the alpha from each pixel and splat it into all the channels 1240 __m256i srcA = _mm256_shuffle_epi8(src256, alpha_splat_mask); 1241 1242 // Convert to dst format 1243 src256 = _mm256_shuffle_epi8(src256, convert_mask); 1244 1245 // Set the alpha channels of src to 255 1246 src256 = _mm256_or_si256(src256, alpha_fill_mask); 1247 1248 // Duplicate each 8-bit alpha value into both bytes of 16-bit lanes 1249 __m256i alpha_lo = _mm256_unpacklo_epi8(srcA, srcA); 1250 __m256i alpha_hi = _mm256_unpackhi_epi8(srcA, srcA); 1251 1252 // Calculate 255-srcA in every second 8-bit lane (255-srcA = srcA^0xff) 1253 alpha_lo = _mm256_xor_si256(alpha_lo, _mm256_set1_epi16(0xff00)); 1254 alpha_hi = _mm256_xor_si256(alpha_hi, _mm256_set1_epi16(0xff00)); 1255 1256 // maddubs expects second argument to be signed, so subtract 128 1257 src256 = _mm256_sub_epi8(src256, _mm256_set1_epi8((Uint8)128)); 1258 dst256 = _mm256_sub_epi8(dst256, _mm256_set1_epi8((Uint8)128)); 1259 1260 // dst = srcA*(src-128) + (255-srcA)*(dst-128) = srcA*src + (255-srcA)*dst - 128*255 1261 __m256i dst_lo = _mm256_maddubs_epi16(alpha_lo, _mm256_unpacklo_epi8(src256, dst256)); 1262 __m256i dst_hi = _mm256_maddubs_epi16(alpha_hi, _mm256_unpackhi_epi8(src256, dst256)); 1263 1264 // dst += 0x1U (use 0x80 to round instead of floor) + 128*255 (to fix maddubs result) 1265 dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1 + 128 * 255)); 1266 dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1 + 128 * 255)); 1267 1268 // dst = (dst + (dst >> 8)) >> 8 = (dst * 257) >> 16 1269 dst_lo = _mm256_mulhi_epu16(dst_lo, _mm256_set1_epi16(257)); 1270 dst_hi = _mm256_mulhi_epu16(dst_hi, _mm256_set1_epi16(257)); 1271 1272 // Blend the pixels together and save the result 1273 dst256 = _mm256_packus_epi16(dst_lo, dst_hi); 1274 if (fill_alpha) { 1275 dst256 = _mm256_or_si256(dst256, alpha_fill_mask); 1276 } 1277 _mm256_storeu_si256((__m256i *)dst, 
dst256); 1278 1279 src += 32; 1280 dst += 32; 1281 } 1282 1283 for (; i < width; ++i) { 1284 Uint32 src32 = *(Uint32 *)src; 1285 Uint32 dst32 = *(Uint32 *)dst; 1286 ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt); 1287 if (fill_alpha) { 1288 dst32 |= dstAmask; 1289 } 1290 *(Uint32 *)dst = dst32; 1291 src += 4; 1292 dst += 4; 1293 } 1294 1295 src += srcskip; 1296 dst += dstskip; 1297 } 1298} 1299 1300#endif 1301 1302#if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) 1303 1304static void Blit8888to8888PixelAlphaSwizzleNEON(SDL_BlitInfo *info) 1305{ 1306 int width = info->dst_w; 1307 int height = info->dst_h; 1308 Uint8 *src = info->src; 1309 int srcskip = info->src_skip; 1310 Uint8 *dst = info->dst; 1311 int dstskip = info->dst_skip; 1312 const SDL_PixelFormatDetails *srcfmt = info->src_fmt; 1313 const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; 1314 bool fill_alpha = !dstfmt->Amask; 1315 Uint32 dstAmask, dstAshift; 1316 1317 SDL_Get8888AlphaMaskAndShift(dstfmt, &dstAmask, &dstAshift); 1318 1319 // The byte offsets for the start of each pixel 1320 const uint8x16_t mask_offsets = vreinterpretq_u8_u64(vcombine_u64( 1321 vcreate_u64(0x0404040400000000), vcreate_u64(0x0c0c0c0c08080808))); 1322 1323 const uint8x16_t convert_mask = vreinterpretq_u8_u32(vaddq_u32( 1324 vreinterpretq_u32_u8(mask_offsets), 1325 vdupq_n_u32( 1326 ((srcfmt->Rshift >> 3) << dstfmt->Rshift) | 1327 ((srcfmt->Gshift >> 3) << dstfmt->Gshift) | 1328 ((srcfmt->Bshift >> 3) << dstfmt->Bshift)))); 1329 1330 const uint8x16_t alpha_splat_mask = vaddq_u8(vdupq_n_u8(srcfmt->Ashift >> 3), mask_offsets); 1331 const uint8x16_t alpha_fill_mask = vreinterpretq_u8_u32(vdupq_n_u32(dstAmask)); 1332 1333 while (height--) { 1334 int i = 0; 1335 1336 for (; i + 4 <= width; i += 4) { 1337 // Load 4 src pixels 1338 uint8x16_t src128 = vld1q_u8(src); 1339 1340 // Load 4 dst pixels 1341 uint8x16_t dst128 = vld1q_u8(dst); 1342 1343 // Extract the alpha from each 
pixel and splat it into all the channels 1344 uint8x16_t srcA = vqtbl1q_u8(src128, alpha_splat_mask); 1345 1346 // Convert to dst format 1347 src128 = vqtbl1q_u8(src128, convert_mask); 1348 1349 // Set the alpha channels of src to 255 1350 src128 = vorrq_u8(src128, alpha_fill_mask); 1351 1352 // 255 - srcA = ~srcA 1353 uint8x16_t srcInvA = vmvnq_u8(srcA); 1354 1355 // Result initialized with 1, this is for truncated divide later 1356 uint16x8_t res_lo = vdupq_n_u16(1); 1357 uint16x8_t res_hi = vdupq_n_u16(1); 1358 1359 // res = alpha * src + (255 - alpha) * dst 1360 res_lo = vmlal_u8(res_lo, vget_low_u8(srcA), vget_low_u8(src128)); 1361 res_lo = vmlal_u8(res_lo, vget_low_u8(srcInvA), vget_low_u8(dst128)); 1362 res_hi = vmlal_high_u8(res_hi, srcA, src128); 1363 res_hi = vmlal_high_u8(res_hi, srcInvA, dst128); 1364 1365 // Now result has +1 already added for truncated division 1366 // dst = (res + (res >> 8)) >> 8 1367 uint8x8_t temp; 1368 temp = vaddhn_u16(res_lo, vshrq_n_u16(res_lo, 8)); 1369 dst128 = vaddhn_high_u16(temp, res_hi, vshrq_n_u16(res_hi, 8)); 1370 1371 // For rounded division remove the constant 1 and change first two vmlal_u8 to vmull_u8 1372 // Then replace two previous lines with following code: 1373 // temp = vraddhn_u16(res_lo, vrshrq_n_u16(res_lo, 8)); 1374 // dst128 = vraddhn_high_u16(temp, res_hi, vrshrq_n_u16(res_hi, 8)); 1375 1376 if (fill_alpha) { 1377 dst128 = vorrq_u8(dst128, alpha_fill_mask); 1378 } 1379 1380 // Save the result 1381 vst1q_u8(dst, dst128); 1382 1383 src += 16; 1384 dst += 16; 1385 } 1386 1387 // Process 1 pixel per iteration, max 3 iterations, same calculations as above 1388 for (; i < width; ++i) { 1389 // Top 32-bits will be not used in src32 & dst32 1390 uint8x8_t src32 = vreinterpret_u8_u32(vld1_dup_u32((Uint32 *)src)); 1391 uint8x8_t dst32 = vreinterpret_u8_u32(vld1_dup_u32((Uint32 *)dst)); 1392 1393 uint8x8_t srcA = vtbl1_u8(src32, vget_low_u8(alpha_splat_mask)); 1394 src32 = vtbl1_u8(src32, 
vget_low_u8(convert_mask)); 1395 src32 = vorr_u8(src32, vget_low_u8(alpha_fill_mask)); 1396 uint8x8_t srcInvA = vmvn_u8(srcA); 1397 1398 uint16x8_t res = vdupq_n_u16(1); 1399 res = vmlal_u8(res, srcA, src32); 1400 res = vmlal_u8(res, srcInvA, dst32); 1401 1402 dst32 = vaddhn_u16(res, vshrq_n_u16(res, 8)); 1403 1404 if (fill_alpha) { 1405 dst32 = vorr_u8(dst32, vget_low_u8(alpha_fill_mask)); 1406 } 1407 1408 // Save the result, only low 32-bits 1409 vst1_lane_u32((Uint32 *)dst, vreinterpret_u32_u8(dst32), 0); 1410 1411 src += 4; 1412 dst += 4; 1413 } 1414 1415 src += srcskip; 1416 dst += dstskip; 1417 } 1418} 1419 1420#endif 1421 1422// General (slow) N->N blending with pixel alpha 1423static void BlitNtoNPixelAlpha(SDL_BlitInfo *info) 1424{ 1425 int width = info->dst_w; 1426 int height = info->dst_h; 1427 Uint8 *src = info->src; 1428 int srcskip = info->src_skip; 1429 Uint8 *dst = info->dst; 1430 int dstskip = info->dst_skip; 1431 const SDL_PixelFormatDetails *srcfmt = info->src_fmt; 1432 const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; 1433 int srcbpp; 1434 int dstbpp; 1435 Uint32 Pixel; 1436 unsigned sR, sG, sB, sA; 1437 unsigned dR, dG, dB, dA; 1438 1439 // Set up some basic variables 1440 srcbpp = srcfmt->bytes_per_pixel; 1441 dstbpp = dstfmt->bytes_per_pixel; 1442 1443 while (height--) { 1444 DUFFS_LOOP( 1445 { 1446 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA); 1447 if (sA) { 1448 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA); 1449 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA); 1450 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); 1451 } 1452 src += srcbpp; 1453 dst += dstbpp; 1454 }, 1455 width); 1456 /* *INDENT-ON* */ // clang-format on 1457 src += srcskip; 1458 dst += dstskip; 1459 } 1460} 1461 1462SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface) 1463{ 1464 const SDL_PixelFormatDetails *sf = surface->fmt; 1465 const SDL_PixelFormatDetails *df = surface->map.info.dst_fmt; 1466 1467 switch (surface->map.info.flags & 
~SDL_COPY_RLE_MASK) { 1468 case SDL_COPY_BLEND: 1469 // Per-pixel alpha blits 1470 switch (df->bytes_per_pixel) { 1471 case 1: 1472 if (surface->map.info.dst_pal) { 1473 return BlitNto1PixelAlpha; 1474 } else { 1475 // RGB332 has no palette ! 1476 return BlitNtoNPixelAlpha; 1477 } 1478 1479 case 2: 1480 if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) { 1481 if (df->Gmask == 0x7e0) { 1482 return BlitARGBto565PixelAlpha; 1483 } else if (df->Gmask == 0x3e0 && !df->Amask) { 1484 return BlitARGBto555PixelAlpha; 1485 } 1486 } 1487 return BlitNtoNPixelAlpha; 1488 1489 case 4: 1490 if (SDL_PIXELLAYOUT(sf->format) == SDL_PACKEDLAYOUT_8888 && sf->Amask && 1491 SDL_PIXELLAYOUT(df->format) == SDL_PACKEDLAYOUT_8888) { 1492#ifdef SDL_AVX2_INTRINSICS 1493 if (SDL_HasAVX2()) { 1494 return Blit8888to8888PixelAlphaSwizzleAVX2; 1495 } 1496#endif 1497#ifdef SDL_SSE4_1_INTRINSICS 1498 if (SDL_HasSSE41()) { 1499 return Blit8888to8888PixelAlphaSwizzleSSE41; 1500 } 1501#endif 1502#ifdef SDL_LSX_INTRINSICS 1503 if (SDL_HasLSX()) { 1504 return Blit8888to8888PixelAlphaSwizzleLSX; 1505 } 1506#endif 1507#if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) 1508 // To prevent "unused function" compiler warnings/errors 1509 (void)Blit8888to8888PixelAlpha; 1510 (void)Blit8888to8888PixelAlphaSwizzle; 1511 return Blit8888to8888PixelAlphaSwizzleNEON; 1512#else 1513 if (sf->format == df->format) { 1514 return Blit8888to8888PixelAlpha; 1515 } else { 1516 return Blit8888to8888PixelAlphaSwizzle; 1517 } 1518#endif 1519 } 1520 return BlitNtoNPixelAlpha; 1521 1522 case 3: 1523 default: 1524 break; 1525 } 1526 return BlitNtoNPixelAlpha; 1527 1528 case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND: 1529 if (sf->Amask == 0) { 1530 // Per-surface alpha blits 1531 switch (df->bytes_per_pixel) { 1532 case 1: 1533 if (surface->map.info.dst_pal) { 1534 
return BlitNto1SurfaceAlpha; 1535 } else { 1536 // RGB332 has no palette ! 1537 return BlitNtoNSurfaceAlpha; 1538 } 1539 1540 case 2: 1541 if (surface->map.identity) { 1542 if (df->Gmask == 0x7e0) { 1543#ifdef SDL_MMX_INTRINSICS 1544 if (SDL_HasMMX()) { 1545 return Blit565to565SurfaceAlphaMMX; 1546 } else 1547#endif 1548 { 1549 return Blit565to565SurfaceAlpha; 1550 } 1551 } else if (df->Gmask == 0x3e0) { 1552#ifdef SDL_MMX_INTRINSICS 1553 if (SDL_HasMMX()) { 1554 return Blit555to555SurfaceAlphaMMX; 1555 } else 1556#endif 1557 { 1558 return Blit555to555SurfaceAlpha; 1559 } 1560 } 1561 } 1562 return BlitNtoNSurfaceAlpha; 1563 1564 case 4: 1565 if (sf->Rmask == df->Rmask && sf->Gmask == df->Gmask && sf->Bmask == df->Bmask && sf->bytes_per_pixel == 4) { 1566#ifdef SDL_SSE2_INTRINSICS 1567 if (sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0 && SDL_HasSSE2()) { 1568 return Blit888to888SurfaceAlphaSSE2; 1569 } 1570#endif 1571 if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) { 1572 return BlitRGBtoRGBSurfaceAlpha; 1573 } 1574 } 1575 return BlitNtoNSurfaceAlpha; 1576 1577 case 3: 1578 default: 1579 return BlitNtoNSurfaceAlpha; 1580 } 1581 } 1582 break; 1583 1584 case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND: 1585 if (sf->Amask == 0) { 1586 if (df->bytes_per_pixel == 1) { 1587 1588 if (surface->map.info.dst_pal) { 1589 return BlitNto1SurfaceAlphaKey; 1590 } else { 1591 // RGB332 has no palette ! 1592 return BlitNtoNSurfaceAlphaKey; 1593 } 1594 } else { 1595 return BlitNtoNSurfaceAlphaKey; 1596 } 1597 } 1598 break; 1599 } 1600 1601 return NULL; 1602} 1603 1604#endif // SDL_HAVE_BLIT_A 1605 1606
[FILE END]
(C) 2025 0x4248 (C) 2025 4248 Media and 4248 Systems, all part of 0x4248. See LICENCE files for more information. Not all files are by 0x4248; always check licensing.