Atlas - SDL_blit

Atlas - SDL_blit_A.c
Home / ext / SDL / src / video
Lines: 1 | Size: 55422 bytes
[Download] [Show on GitHub] [Search similar files] [Raw] [Raw (proxy)] 
[FILE BEGIN]
1/*
2  Simple DirectMedia Layer
3  Copyright (C) 1997-2026 Sam Lantinga <[email protected]>
4
5  This software is provided 'as-is', without any express or implied
6  warranty.  In no event will the authors be held liable for any damages
7  arising from the use of this software.
8
9  Permission is granted to anyone to use this software for any purpose,
10  including commercial applications, and to alter it and redistribute it
11  freely, subject to the following restrictions:
12
13  1. The origin of this software must not be misrepresented; you must not
14     claim that you wrote the original software. If you use this software
15     in a product, an acknowledgment in the product documentation would be
16     appreciated but is not required.
17  2. Altered source versions must be plainly marked as such, and must not be
18     misrepresented as being the original software.
19  3. This notice may not be removed or altered from any source distribution.
20*/
21#include "SDL_internal.h"
22
23#ifdef SDL_HAVE_BLIT_A
24
25#include "SDL_pixels_c.h"
26#include "SDL_surface_c.h"
27
28// Functions to perform alpha blended blitting
29
30// N->1 blending with per-surface alpha
31static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
32{
33    int width = info->dst_w;
34    int height = info->dst_h;
35    Uint8 *src = info->src;
36    int srcskip = info->src_skip;
37    Uint8 *dst = info->dst;
38    int dstskip = info->dst_skip;
39    Uint8 *palmap = info->table;
40    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
41    const SDL_Color *dstpal = info->dst_pal->colors;
42    int srcbpp = srcfmt->bytes_per_pixel;
43    Uint32 Pixel;
44    unsigned sR, sG, sB;
45    unsigned dR, dG, dB;
46    const unsigned A = info->a;
47
48    while (height--) {
49        /* *INDENT-OFF* */ // clang-format off
50        DUFFS_LOOP(
51        {
52        DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
53        dR = dstpal[*dst].r;
54        dG = dstpal[*dst].g;
55        dB = dstpal[*dst].b;
56        ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
57        dR &= 0xff;
58        dG &= 0xff;
59        dB &= 0xff;
60        // Pack RGB into 8bit pixel
61        if ( palmap == NULL ) {
62            *dst = (Uint8)(((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0)));
63        } else {
64            *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
65        }
66        dst++;
67        src += srcbpp;
68        },
69        width);
70        /* *INDENT-ON* */ // clang-format on
71        src += srcskip;
72        dst += dstskip;
73    }
74}
75
76// N->1 blending with pixel alpha
77static void BlitNto1PixelAlpha(SDL_BlitInfo *info)
78{
79    int width = info->dst_w;
80    int height = info->dst_h;
81    Uint8 *src = info->src;
82    int srcskip = info->src_skip;
83    Uint8 *dst = info->dst;
84    int dstskip = info->dst_skip;
85    Uint8 *palmap = info->table;
86    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
87    const SDL_Color *dstpal = info->dst_pal->colors;
88    int srcbpp = srcfmt->bytes_per_pixel;
89    Uint32 Pixel;
90    unsigned sR, sG, sB, sA;
91    unsigned dR, dG, dB;
92
93    while (height--) {
94        /* *INDENT-OFF* */ // clang-format off
95        DUFFS_LOOP(
96        {
97        DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
98        dR = dstpal[*dst].r;
99        dG = dstpal[*dst].g;
100        dB = dstpal[*dst].b;
101        ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
102        dR &= 0xff;
103        dG &= 0xff;
104        dB &= 0xff;
105        // Pack RGB into 8bit pixel
106        if ( palmap == NULL ) {
107            *dst = (Uint8)(((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0)));
108        } else {
109            *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
110        }
111        dst++;
112        src += srcbpp;
113        },
114        width);
115        /* *INDENT-ON* */ // clang-format on
116        src += srcskip;
117        dst += dstskip;
118    }
119}
120
121// colorkeyed N->1 blending with per-surface alpha
122static void BlitNto1SurfaceAlphaKey(SDL_BlitInfo *info)
123{
124    int width = info->dst_w;
125    int height = info->dst_h;
126    Uint8 *src = info->src;
127    int srcskip = info->src_skip;
128    Uint8 *dst = info->dst;
129    int dstskip = info->dst_skip;
130    Uint8 *palmap = info->table;
131    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
132    const SDL_Color *dstpal = info->dst_pal->colors;
133    int srcbpp = srcfmt->bytes_per_pixel;
134    Uint32 ckey = info->colorkey;
135    Uint32 Pixel;
136    unsigned sR, sG, sB;
137    unsigned dR, dG, dB;
138    const unsigned A = info->a;
139
140    while (height--) {
141        /* *INDENT-OFF* */ // clang-format off
142        DUFFS_LOOP(
143        {
144        DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
145        if ( Pixel != ckey ) {
146            dR = dstpal[*dst].r;
147            dG = dstpal[*dst].g;
148            dB = dstpal[*dst].b;
149            ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
150            dR &= 0xff;
151            dG &= 0xff;
152            dB &= 0xff;
153            // Pack RGB into 8bit pixel
154            if ( palmap == NULL ) {
155                *dst = (Uint8)(((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0)));
156            } else {
157                *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
158            }
159        }
160        dst++;
161        src += srcbpp;
162        },
163        width);
164        /* *INDENT-ON* */ // clang-format on
165        src += srcskip;
166        dst += dstskip;
167    }
168}
169
170#ifdef SDL_SSE2_INTRINSICS
171
172static void SDL_TARGETING("sse2") Blit888to888SurfaceAlphaSSE2(SDL_BlitInfo *info)
173{
174    int width = info->dst_w;
175    int height = info->dst_h;
176    Uint8 *src = info->src;
177    int srcskip = info->src_skip;
178    Uint8 *dst = info->dst;
179    int dstskip = info->dst_skip;
180    Uint8 alpha = info->a;
181
182    const __m128i alpha_fill_mask = _mm_set1_epi32((int)0xff000000);
183    const __m128i srcA = _mm_set1_epi16(alpha);
184
185    while (height--) {
186        int i = 0;
187
188        for (; i + 4 <= width; i += 4) {
189            // Load 4 src pixels
190            __m128i src128 = _mm_loadu_si128((__m128i *)src);
191
192            // Load 4 dst pixels
193            __m128i dst128 = _mm_loadu_si128((__m128i *)dst);
194
195            __m128i src_lo = _mm_unpacklo_epi8(src128, _mm_setzero_si128());
196            __m128i src_hi = _mm_unpackhi_epi8(src128, _mm_setzero_si128());
197
198            __m128i dst_lo = _mm_unpacklo_epi8(dst128, _mm_setzero_si128());
199            __m128i dst_hi = _mm_unpackhi_epi8(dst128, _mm_setzero_si128());
200
201            // dst = ((src - dst) * srcA) + ((dst << 8) - dst)
202            dst_lo = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_lo, dst_lo), srcA),
203                                      _mm_sub_epi16(_mm_slli_epi16(dst_lo, 8), dst_lo));
204            dst_hi = _mm_add_epi16(_mm_mullo_epi16(_mm_sub_epi16(src_hi, dst_hi), srcA),
205                                      _mm_sub_epi16(_mm_slli_epi16(dst_hi, 8), dst_hi));
206
207            // dst += 0x1U (use 0x80 to round instead of floor)
208            dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1));
209            dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1));
210
211            // dst = (dst + (dst >> 8)) >> 8
212            dst_lo = _mm_srli_epi16(_mm_add_epi16(dst_lo, _mm_srli_epi16(dst_lo, 8)), 8);
213            dst_hi = _mm_srli_epi16(_mm_add_epi16(dst_hi, _mm_srli_epi16(dst_hi, 8)), 8);
214
215            dst128 = _mm_packus_epi16(dst_lo, dst_hi);
216
217            // Set the alpha channels of dst to 255
218            dst128 = _mm_or_si128(dst128, alpha_fill_mask);
219
220            _mm_storeu_si128((__m128i *)dst, dst128);
221
222            src += 16;
223            dst += 16;
224        }
225
226        for (; i < width; ++i) {
227            Uint32 src32 = *(Uint32 *)src;
228            Uint32 dst32 = *(Uint32 *)dst;
229
230            FACTOR_BLEND_8888(src32, dst32, alpha);
231
232            *dst = dst32 | 0xff000000;
233
234            src += 4;
235            dst += 4;
236        }
237
238        src += srcskip;
239        dst += dstskip;
240    }
241}
242
243#endif
244
245#ifdef SDL_LSX_INTRINSICS
246
247static void SDL_TARGETING("lsx") Blit8888to8888PixelAlphaSwizzleLSX(SDL_BlitInfo *info)
248{
249    int width = info->dst_w;
250    int height = info->dst_h;
251    Uint8 *src = info->src;
252    int srcskip = info->src_skip;
253    Uint8 *dst = info->dst;
254    int dstskip = info->dst_skip;
255    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
256    const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
257    bool fill_alpha = !dstfmt->Amask;
258    Uint32 dstAmask, dstAshift;
259    const Uint8 offsets[] = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
260
261    SDL_Get8888AlphaMaskAndShift(dstfmt, &dstAmask, &dstAshift);
262
263    const __m128i const_0xff00 = __lsx_vreplgr2vr_h(0xff00);
264    const __m128i const_128 = __lsx_vreplgr2vr_b((Uint8)128);
265    const __m128i const_32641 = __lsx_vreplgr2vr_h(32641);
266    const __m128i const_257 = __lsx_vreplgr2vr_h(257);
267
268    // The byte offsets for the start of each pixel
269    const __m128i mask_offsets = __lsx_vld(offsets, 0);
270
271    const __m128i convert_mask = __lsx_vadd_w(
272        __lsx_vreplgr2vr_w(
273            ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
274            ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
275            ((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
276        mask_offsets);
277
278    const __m128i alpha_splat_mask = __lsx_vadd_b(__lsx_vreplgr2vr_b(srcfmt->Ashift >> 3), mask_offsets);
279    const __m128i alpha_fill_mask = __lsx_vreplgr2vr_w((int)dstAmask);
280
281    while (height--) {
282        int i = 0;
283
284        for (; i + 4 <= width; i += 4) {
285            __m128i src128 = __lsx_vld(src, 0);
286            __m128i dst128 = __lsx_vld(dst, 0);
287
288            __m128i srcA = __lsx_vshuf_b(src128, src128, alpha_splat_mask);
289            src128 = __lsx_vshuf_b(src128, src128, convert_mask);
290
291            src128 = __lsx_vor_v(src128, alpha_fill_mask);
292
293            __m128i srca_lo = __lsx_vilvl_b(srcA, srcA);
294            __m128i srca_hi = __lsx_vilvh_b(srcA, srcA);
295
296            srca_lo = __lsx_vxor_v(srca_lo, const_0xff00);
297            srca_hi = __lsx_vxor_v(srca_hi, const_0xff00);
298
299            src128 = __lsx_vsub_b(src128, const_128);
300            dst128 = __lsx_vsub_b(dst128, const_128);
301
302            __m128i tmp = __lsx_vilvl_b(dst128, src128);
303            __m128i dst_lo = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(srca_lo, tmp), __lsx_vmulwod_h_bu_b(srca_lo, tmp));
304            tmp = __lsx_vilvh_b(dst128, src128);
305            __m128i dst_hi = __lsx_vsadd_h(__lsx_vmulwev_h_bu_b(srca_hi, tmp), __lsx_vmulwod_h_bu_b(srca_hi, tmp));
306
307            dst_lo = __lsx_vadd_h(dst_lo, const_32641);
308            dst_hi = __lsx_vadd_h(dst_hi, const_32641);
309
310            dst_lo = __lsx_vmuh_hu(dst_lo, const_257);
311            dst_hi = __lsx_vmuh_hu(dst_hi, const_257);
312
313            dst128 = __lsx_vssrarni_bu_h(dst_hi, dst_lo, 0);
314            if (fill_alpha) {
315                dst128 = __lsx_vor_v(dst128, alpha_fill_mask);
316            }
317            __lsx_vst(dst128, dst, 0);
318
319            src += 16;
320            dst += 16;
321        }
322
323        for (; i < width; ++i) {
324            Uint32 src32 = *(Uint32 *)src;
325            Uint32 dst32 = *(Uint32 *)dst;
326            ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
327            if (fill_alpha) {
328                dst32 |= dstAmask;
329            }
330            *(Uint32 *)dst = dst32;
331            src += 4;
332            dst += 4;
333        }
334
335        src += srcskip;
336        dst += dstskip;
337    }
338}
339
340#endif
341
342// fast RGB888->(A)RGB888 blending with surface alpha=128 special case
343static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
344{
345    int width = info->dst_w;
346    int height = info->dst_h;
347    Uint32 *srcp = (Uint32 *)info->src;
348    int srcskip = info->src_skip >> 2;
349    Uint32 *dstp = (Uint32 *)info->dst;
350    int dstskip = info->dst_skip >> 2;
351
352    while (height--) {
353        /* *INDENT-OFF* */ // clang-format off
354        DUFFS_LOOP({
355            Uint32 s = *srcp++;
356            Uint32 d = *dstp;
357            *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
358                   + (s & d & 0x00010101)) | 0xff000000;
359        }, width);
360        /* *INDENT-ON* */ // clang-format on
361        srcp += srcskip;
362        dstp += dstskip;
363    }
364}
365
366// fast RGB888->(A)RGB888 blending with surface alpha
367static void BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo *info)
368{
369    unsigned alpha = info->a;
370    if (alpha == 128) {
371        BlitRGBtoRGBSurfaceAlpha128(info);
372    } else {
373        int width = info->dst_w;
374        int height = info->dst_h;
375        Uint32 *srcp = (Uint32 *)info->src;
376        int srcskip = info->src_skip >> 2;
377        Uint32 *dstp = (Uint32 *)info->dst;
378        int dstskip = info->dst_skip >> 2;
379        Uint32 s;
380        Uint32 d;
381
382        while (height--) {
383            /* *INDENT-OFF* */ // clang-format off
384            DUFFS_LOOP({
385                s = *srcp;
386                d = *dstp;
387
388                FACTOR_BLEND_8888(s, d, alpha);
389
390                *dstp = d | 0xff000000;
391                ++srcp;
392                ++dstp;
393            }, width);
394            /* *INDENT-ON* */ // clang-format on
395            srcp += srcskip;
396            dstp += dstskip;
397        }
398    }
399}
400
401// 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel
402
/*
 * blend a single 16 bit pixel at 50%
 *
 * `mask` selects every channel bit except the lowest bit of each channel,
 * so the add-and-halve cannot carry between channels; the dropped low bits
 * are added back when both inputs have them set.  Arguments are fully
 * parenthesized but evaluated more than once, so pass side-effect-free
 * expressions.
 */
#define BLEND16_50(d, s, mask) \
    (((((s) & (mask)) + ((d) & (mask))) >> 1) + ((s) & (d) & (~(mask) & 0xffff)))

/*
 * replicate a 16 bit channel mask into both halves of a 32 bit word
 *
 * The cast keeps the shift in unsigned arithmetic: shifting a promoted
 * 16 bit mask with its top bit set left by 16 overflows a signed int.
 */
#define BLEND2x16_MASK(mask) \
    ((Uint32)(mask) | ((Uint32)(mask) << 16))

// blend two 16 bit pixels (packed in one 32 bit word) at 50%
#define BLEND2x16_50(d, s, mask) \
    ((((s) & BLEND2x16_MASK(mask)) >> 1) + (((d) & BLEND2x16_MASK(mask)) >> 1) + ((s) & (d) & ~BLEND2x16_MASK(mask)))
410
411static void Blit16to16SurfaceAlpha128(SDL_BlitInfo *info, Uint16 mask)
412{
413    int width = info->dst_w;
414    int height = info->dst_h;
415    Uint16 *srcp = (Uint16 *)info->src;
416    int srcskip = info->src_skip >> 1;
417    Uint16 *dstp = (Uint16 *)info->dst;
418    int dstskip = info->dst_skip >> 1;
419
420    while (height--) {
421        if (((uintptr_t)srcp ^ (uintptr_t)dstp) & 2) {
422            /*
423             * Source and destination not aligned, pipeline it.
424             * This is mostly a win for big blits but no loss for
425             * small ones
426             */
427            Uint32 prev_sw;
428            int w = width;
429
430            // handle odd destination
431            if ((uintptr_t)dstp & 2) {
432                Uint16 d = *dstp, s = *srcp;
433                *dstp = BLEND16_50(d, s, mask);
434                dstp++;
435                srcp++;
436                w--;
437            }
438            srcp++; // srcp is now 32-bit aligned
439
440            // bootstrap pipeline with first halfword
441            prev_sw = ((Uint32 *)srcp)[-1];
442
443            while (w > 1) {
444                Uint32 sw, dw, s;
445                sw = *(Uint32 *)srcp;
446                dw = *(Uint32 *)dstp;
447#if SDL_BYTEORDER == SDL_BIG_ENDIAN
448                s = (prev_sw << 16) + (sw >> 16);
449#else
450                s = (prev_sw >> 16) + (sw << 16);
451#endif
452                prev_sw = sw;
453                *(Uint32 *)dstp = BLEND2x16_50(dw, s, mask);
454                dstp += 2;
455                srcp += 2;
456                w -= 2;
457            }
458
459            // final pixel if any
460            if (w) {
461                Uint16 d = *dstp, s;
462#if SDL_BYTEORDER == SDL_BIG_ENDIAN
463                s = (Uint16)prev_sw;
464#else
465                s = (Uint16)(prev_sw >> 16);
466#endif
467                *dstp = BLEND16_50(d, s, mask);
468                srcp++;
469                dstp++;
470            }
471            srcp += srcskip - 1;
472            dstp += dstskip;
473        } else {
474            // source and destination are aligned
475            int w = width;
476
477            // first odd pixel?
478            if ((uintptr_t)srcp & 2) {
479                Uint16 d = *dstp, s = *srcp;
480                *dstp = BLEND16_50(d, s, mask);
481                srcp++;
482                dstp++;
483                w--;
484            }
485            // srcp and dstp are now 32-bit aligned
486
487            while (w > 1) {
488                Uint32 sw = *(Uint32 *)srcp;
489                Uint32 dw = *(Uint32 *)dstp;
490                *(Uint32 *)dstp = BLEND2x16_50(dw, sw, mask);
491                srcp += 2;
492                dstp += 2;
493                w -= 2;
494            }
495
496            // last odd pixel?
497            if (w) {
498                Uint16 d = *dstp, s = *srcp;
499                *dstp = BLEND16_50(d, s, mask);
500                srcp++;
501                dstp++;
502            }
503            srcp += srcskip;
504            dstp += dstskip;
505        }
506    }
507}
508
509#ifdef SDL_MMX_INTRINSICS
510
511// fast RGB565->RGB565 blending with surface alpha
512static void SDL_TARGETING("mmx") Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
513{
514    unsigned alpha = info->a;
515    if (alpha == 128) {
516        Blit16to16SurfaceAlpha128(info, 0xf7de);
517    } else {
518        int width = info->dst_w;
519        int height = info->dst_h;
520        Uint16 *srcp = (Uint16 *)info->src;
521        int srcskip = info->src_skip >> 1;
522        Uint16 *dstp = (Uint16 *)info->dst;
523        int dstskip = info->dst_skip >> 1;
524        Uint32 s, d;
525
526#ifdef USE_DUFFS_LOOP
527        __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
528
529        alpha &= ~(1 + 2 + 4);             // cut alpha to get the exact same behaviour
530        mm_alpha = _mm_set_pi32(0, alpha); // 0000000A -> mm_alpha
531        alpha >>= 3;                       // downscale alpha to 5 bits
532
533        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); // 00000A0A -> mm_alpha
534        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); // 0A0A0A0A -> mm_alpha
535        /* position alpha to allow for mullo and mulhi on diff channels
536           to reduce the number of operations */
537        mm_alpha = _mm_slli_si64(mm_alpha, 3);
538
539        // Setup the 565 color channel masks
540        gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); // MASKGREEN -> gmask
541        bmask = _mm_set_pi32(0x001F001F, 0x001F001F); // MASKBLUE -> bmask
542#endif
543
544        while (height--) {
545            /* *INDENT-OFF* */ // clang-format off
546            DUFFS_LOOP_124(
547            {
548                s = *srcp++;
549                d = *dstp;
550                /*
551                 * shift out the middle component (green) to
552                 * the high 16 bits, and process all three RGB
553                 * components at the same time.
554                 */
555                s = (s | s << 16) & 0x07e0f81f;
556                d = (d | d << 16) & 0x07e0f81f;
557                d += (s - d) * alpha >> 5;
558                d &= 0x07e0f81f;
559                *dstp++ = (Uint16)(d | d >> 16);
560            },{
561                s = *srcp++;
562                d = *dstp;
563                /*
564                 * shift out the middle component (green) to
565                 * the high 16 bits, and process all three RGB
566                 * components at the same time.
567                 */
568                s = (s | s << 16) & 0x07e0f81f;
569                d = (d | d << 16) & 0x07e0f81f;
570                d += (s - d) * alpha >> 5;
571                d &= 0x07e0f81f;
572                *dstp++ = (Uint16)(d | d >> 16);
573                s = *srcp++;
574                d = *dstp;
575                /*
576                 * shift out the middle component (green) to
577                 * the high 16 bits, and process all three RGB
578                 * components at the same time.
579                 */
580                s = (s | s << 16) & 0x07e0f81f;
581                d = (d | d << 16) & 0x07e0f81f;
582                d += (s - d) * alpha >> 5;
583                d &= 0x07e0f81f;
584                *dstp++ = (Uint16)(d | d >> 16);
585            },{
586                src1 = *(__m64 *)srcp; // 4 src pixels -> src1
587                dst1 = *(__m64 *)dstp; // 4 dst pixels -> dst1
588
589                // red
590                src2 = src1;
591                src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2 [000r 000r 000r 000r]
592
593                dst2 = dst1;
594                dst2 = _mm_srli_pi16(dst2, 11); // dst2 >> 11 -> dst2 [000r 000r 000r 000r]
595
596                // blend
597                src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
598                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
599                src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2
600                dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
601                dst2 = _mm_slli_pi16(dst2, 11); // dst2 << 11 -> dst2
602
603                mm_res = dst2; // RED -> mm_res
604
605                // green -- process the bits in place
606                src2 = src1;
607                src2 = _mm_and_si64(src2, gmask); // src & MASKGREEN -> src2
608
609                dst2 = dst1;
610                dst2 = _mm_and_si64(dst2, gmask); // dst & MASKGREEN -> dst2
611
612                // blend
613                src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
614                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
615                src2 = _mm_slli_pi16(src2, 5); // src2 << 5 -> src2
616                dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
617
618                mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN -> mm_res
619
620                // blue
621                src2 = src1;
622                src2 = _mm_and_si64(src2, bmask); // src & MASKBLUE -> src2[000b 000b 000b 000b]
623
624                dst2 = dst1;
625                dst2 = _mm_and_si64(dst2, bmask); // dst & MASKBLUE -> dst2[000b 000b 000b 000b]
626
627                // blend
628                src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
629                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
630                src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2
631                dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
632                dst2 = _mm_and_si64(dst2, bmask); // dst2 & MASKBLUE -> dst2
633
634                mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN | BLUE -> mm_res
635
636                *(__m64 *)dstp = mm_res; // mm_res -> 4 dst pixels
637
638                srcp += 4;
639                dstp += 4;
640            }, width);
641            /* *INDENT-ON* */ // clang-format on
642            srcp += srcskip;
643            dstp += dstskip;
644        }
645        _mm_empty();
646    }
647}
648
649// fast RGB555->RGB555 blending with surface alpha
650static void SDL_TARGETING("mmx") Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
651{
652    unsigned alpha = info->a;
653    if (alpha == 128) {
654        Blit16to16SurfaceAlpha128(info, 0xfbde);
655    } else {
656        int width = info->dst_w;
657        int height = info->dst_h;
658        Uint16 *srcp = (Uint16 *)info->src;
659        int srcskip = info->src_skip >> 1;
660        Uint16 *dstp = (Uint16 *)info->dst;
661        int dstskip = info->dst_skip >> 1;
662        Uint32 s, d;
663
664#ifdef USE_DUFFS_LOOP
665        __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
666
667        alpha &= ~(1 + 2 + 4);             // cut alpha to get the exact same behaviour
668        mm_alpha = _mm_set_pi32(0, alpha); // 0000000A -> mm_alpha
669        alpha >>= 3;                       // downscale alpha to 5 bits
670
671        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); // 00000A0A -> mm_alpha
672        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); // 0A0A0A0A -> mm_alpha
673        /* position alpha to allow for mullo and mulhi on diff channels
674           to reduce the number of operations */
675        mm_alpha = _mm_slli_si64(mm_alpha, 3);
676
677        // Setup the 555 color channel masks
678        rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); // MASKRED -> rmask
679        gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); // MASKGREEN -> gmask
680        bmask = _mm_set_pi32(0x001F001F, 0x001F001F); // MASKBLUE -> bmask
681#endif
682        while (height--) {
683            /* *INDENT-OFF* */ // clang-format off
684            DUFFS_LOOP_124(
685            {
686                s = *srcp++;
687                d = *dstp;
688                /*
689                 * shift out the middle component (green) to
690                 * the high 16 bits, and process all three RGB
691                 * components at the same time.
692                 */
693                s = (s | s << 16) & 0x03e07c1f;
694                d = (d | d << 16) & 0x03e07c1f;
695                d += (s - d) * alpha >> 5;
696                d &= 0x03e07c1f;
697                *dstp++ = (Uint16)(d | d >> 16);
698            },{
699                s = *srcp++;
700                d = *dstp;
701                /*
702                 * shift out the middle component (green) to
703                 * the high 16 bits, and process all three RGB
704                 * components at the same time.
705                 */
706                s = (s | s << 16) & 0x03e07c1f;
707                d = (d | d << 16) & 0x03e07c1f;
708                d += (s - d) * alpha >> 5;
709                d &= 0x03e07c1f;
710                *dstp++ = (Uint16)(d | d >> 16);
711                    s = *srcp++;
712                d = *dstp;
713                /*
714                 * shift out the middle component (green) to
715                 * the high 16 bits, and process all three RGB
716                 * components at the same time.
717                 */
718                s = (s | s << 16) & 0x03e07c1f;
719                d = (d | d << 16) & 0x03e07c1f;
720                d += (s - d) * alpha >> 5;
721                d &= 0x03e07c1f;
722                *dstp++ = (Uint16)(d | d >> 16);
723            },{
724                src1 = *(__m64 *)srcp; // 4 src pixels -> src1
725                dst1 = *(__m64 *)dstp; // 4 dst pixels -> dst1
726
727                // red -- process the bits in place
728                src2 = src1;
729                src2 = _mm_and_si64(src2, rmask); // src & MASKRED -> src2
730
731                dst2 = dst1;
732                dst2 = _mm_and_si64(dst2, rmask); // dst & MASKRED -> dst2
733
734                // blend
735                src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
736                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
737                src2 = _mm_slli_pi16(src2, 5); // src2 << 5 -> src2
738                dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
739                dst2 = _mm_and_si64(dst2, rmask); // dst2 & MASKRED -> dst2
740
741                mm_res = dst2; // RED -> mm_res
742
743                // green -- process the bits in place
744                src2 = src1;
745                src2 = _mm_and_si64(src2, gmask); // src & MASKGREEN -> src2
746
747                dst2 = dst1;
748                dst2 = _mm_and_si64(dst2, gmask); // dst & MASKGREEN -> dst2
749
750                // blend
751                src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
752                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
753                src2 = _mm_slli_pi16(src2, 5); // src2 << 5 -> src2
754                dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
755
756                mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN -> mm_res
757
758                // blue
759                src2 = src1; // src -> src2
760                src2 = _mm_and_si64(src2, bmask); // src & MASKBLUE -> src2[000b 000b 000b 000b]
761
762                dst2 = dst1; // dst -> dst2
763                dst2 = _mm_and_si64(dst2, bmask); // dst & MASKBLUE -> dst2[000b 000b 000b 000b]
764
765                // blend
766                src2 = _mm_sub_pi16(src2, dst2);// src - dst -> src2
767                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
768                src2 = _mm_srli_pi16(src2, 11); // src2 >> 11 -> src2
769                dst2 = _mm_add_pi16(src2, dst2); // src2 + dst2 -> dst2
770                dst2 = _mm_and_si64(dst2, bmask); // dst2 & MASKBLUE -> dst2
771
772                mm_res = _mm_or_si64(mm_res, dst2); // RED | GREEN | BLUE -> mm_res
773
774                *(__m64 *)dstp = mm_res; // mm_res -> 4 dst pixels
775
776                srcp += 4;
777                dstp += 4;
778            }, width);
779            /* *INDENT-ON* */ // clang-format on
780            srcp += srcskip;
781            dstp += dstskip;
782        }
783        _mm_empty();
784    }
785}
786
787#endif // SDL_MMX_INTRINSICS
788
789// fast RGB565->RGB565 blending with surface alpha
790static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
791{
792    unsigned alpha = info->a;
793    if (alpha == 128) {
794        Blit16to16SurfaceAlpha128(info, 0xf7de);
795    } else {
796        int width = info->dst_w;
797        int height = info->dst_h;
798        Uint16 *srcp = (Uint16 *)info->src;
799        int srcskip = info->src_skip >> 1;
800        Uint16 *dstp = (Uint16 *)info->dst;
801        int dstskip = info->dst_skip >> 1;
802        alpha >>= 3; // downscale alpha to 5 bits
803
804        while (height--) {
805            /* *INDENT-OFF* */ // clang-format off
806            DUFFS_LOOP({
807                Uint32 s = *srcp++;
808                Uint32 d = *dstp;
809                /*
810                 * shift out the middle component (green) to
811                 * the high 16 bits, and process all three RGB
812                 * components at the same time.
813                 */
814                s = (s | s << 16) & 0x07e0f81f;
815                d = (d | d << 16) & 0x07e0f81f;
816                d += (s - d) * alpha >> 5;
817                d &= 0x07e0f81f;
818                *dstp++ = (Uint16)(d | d >> 16);
819            }, width);
820            /* *INDENT-ON* */ // clang-format on
821            srcp += srcskip;
822            dstp += dstskip;
823        }
824    }
825}
826
827// fast RGB555->RGB555 blending with surface alpha
828static void Blit555to555SurfaceAlpha(SDL_BlitInfo *info)
829{
830    unsigned alpha = info->a; // downscale alpha to 5 bits
831    if (alpha == 128) {
832        Blit16to16SurfaceAlpha128(info, 0xfbde);
833    } else {
834        int width = info->dst_w;
835        int height = info->dst_h;
836        Uint16 *srcp = (Uint16 *)info->src;
837        int srcskip = info->src_skip >> 1;
838        Uint16 *dstp = (Uint16 *)info->dst;
839        int dstskip = info->dst_skip >> 1;
840        alpha >>= 3; // downscale alpha to 5 bits
841
842        while (height--) {
843            /* *INDENT-OFF* */ // clang-format off
844            DUFFS_LOOP({
845                Uint32 s = *srcp++;
846                Uint32 d = *dstp;
847                /*
848                 * shift out the middle component (green) to
849                 * the high 16 bits, and process all three RGB
850                 * components at the same time.
851                 */
852                s = (s | s << 16) & 0x03e07c1f;
853                d = (d | d << 16) & 0x03e07c1f;
854                d += (s - d) * alpha >> 5;
855                d &= 0x03e07c1f;
856                *dstp++ = (Uint16)(d | d >> 16);
857            }, width);
858            /* *INDENT-ON* */ // clang-format on
859            srcp += srcskip;
860            dstp += dstskip;
861        }
862    }
863}
864
865// fast ARGB8888->RGB565 blending with pixel alpha
866static void BlitARGBto565PixelAlpha(SDL_BlitInfo *info)
867{
868    int width = info->dst_w;
869    int height = info->dst_h;
870    Uint32 *srcp = (Uint32 *)info->src;
871    int srcskip = info->src_skip >> 2;
872    Uint16 *dstp = (Uint16 *)info->dst;
873    int dstskip = info->dst_skip >> 1;
874
875    while (height--) {
876        /* *INDENT-OFF* */ // clang-format off
877        DUFFS_LOOP({
878        Uint32 s = *srcp;
879        unsigned alpha = s >> 27; // downscale alpha to 5 bits
880        /* Here we special-case opaque alpha since the
881           compositioning used (>>8 instead of /255) doesn't handle
882           it correctly. */
883        if (alpha) {
884          if (alpha == (SDL_ALPHA_OPAQUE >> 3)) {
885            *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3  & 0x1f));
886          } else {
887            Uint32 d = *dstp;
888            /*
889             * convert source and destination to G0RAB65565
890             * and blend all components at the same time
891             */
892            s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800) + (s >> 3 & 0x1f);
893            d = (d | d << 16) & 0x07e0f81f;
894            d += (s - d) * alpha >> 5;
895            d &= 0x07e0f81f;
896            *dstp = (Uint16)(d | d >> 16);
897          }
898        }
899        srcp++;
900        dstp++;
901        }, width);
902        /* *INDENT-ON* */ // clang-format on
903        srcp += srcskip;
904        dstp += dstskip;
905    }
906}
907
908// fast ARGB8888->RGB555 blending with pixel alpha
909static void BlitARGBto555PixelAlpha(SDL_BlitInfo *info)
910{
911    int width = info->dst_w;
912    int height = info->dst_h;
913    Uint32 *srcp = (Uint32 *)info->src;
914    int srcskip = info->src_skip >> 2;
915    Uint16 *dstp = (Uint16 *)info->dst;
916    int dstskip = info->dst_skip >> 1;
917
918    while (height--) {
919        /* *INDENT-OFF* */ // clang-format off
920        DUFFS_LOOP({
921        unsigned alpha;
922        Uint32 s = *srcp;
923        alpha = s >> 27; // downscale alpha to 5 bits
924        /* Here we special-case opaque alpha since the
925           compositioning used (>>8 instead of /255) doesn't handle
926           it correctly. */
927        if (alpha) {
928          if (alpha == (SDL_ALPHA_OPAQUE >> 3)) {
929            *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3  & 0x1f));
930          } else {
931            Uint32 d = *dstp;
932            /*
933             * convert source and destination to G0RAB55555
934             * and blend all components at the same time
935             */
936            s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00) + (s >> 3 & 0x1f);
937            d = (d | d << 16) & 0x03e07c1f;
938            d += (s - d) * alpha >> 5;
939            d &= 0x03e07c1f;
940            *dstp = (Uint16)(d | d >> 16);
941          }
942        }
943        srcp++;
944        dstp++;
945        }, width);
946        /* *INDENT-ON* */ // clang-format on
947        srcp += srcskip;
948        dstp += dstskip;
949    }
950}
951
952// General (slow) N->N blending with per-surface alpha
953static void BlitNtoNSurfaceAlpha(SDL_BlitInfo *info)
954{
955    int width = info->dst_w;
956    int height = info->dst_h;
957    Uint8 *src = info->src;
958    int srcskip = info->src_skip;
959    Uint8 *dst = info->dst;
960    int dstskip = info->dst_skip;
961    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
962    const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
963    int srcbpp = srcfmt->bytes_per_pixel;
964    int dstbpp = dstfmt->bytes_per_pixel;
965    Uint32 Pixel;
966    unsigned sR, sG, sB;
967    unsigned dR, dG, dB, dA;
968    const unsigned sA = info->a;
969
970    if (sA) {
971        while (height--) {
972            /* *INDENT-OFF* */ // clang-format off
973        DUFFS_LOOP(
974        {
975        DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
976        DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
977        ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
978        ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
979        src += srcbpp;
980        dst += dstbpp;
981        },
982        width);
983        /* *INDENT-ON* */ // clang-format on
984            src += srcskip;
985            dst += dstskip;
986        }
987    }
988}
989
990// General (slow) colorkeyed N->N blending with per-surface alpha
991static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
992{
993    int width = info->dst_w;
994    int height = info->dst_h;
995    Uint8 *src = info->src;
996    int srcskip = info->src_skip;
997    Uint8 *dst = info->dst;
998    int dstskip = info->dst_skip;
999    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
1000    const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
1001    Uint32 ckey = info->colorkey;
1002    int srcbpp = srcfmt->bytes_per_pixel;
1003    int dstbpp = dstfmt->bytes_per_pixel;
1004    Uint32 Pixel;
1005    unsigned sR, sG, sB;
1006    unsigned dR, dG, dB, dA;
1007    const unsigned sA = info->a;
1008
1009    while (height--) {
1010        /* *INDENT-OFF* */ // clang-format off
1011        DUFFS_LOOP(
1012        {
1013        RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
1014        if (sA && Pixel != ckey) {
1015            RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
1016            DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1017            ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1018            ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1019        }
1020        src += srcbpp;
1021        dst += dstbpp;
1022        },
1023        width);
1024        /* *INDENT-ON* */ // clang-format on
1025        src += srcskip;
1026        dst += dstskip;
1027    }
1028}
1029
1030// Fast 32-bit RGBA->RGBA blending with pixel alpha
1031static void Blit8888to8888PixelAlpha(SDL_BlitInfo *info)
1032{
1033    int width = info->dst_w;
1034    int height = info->dst_h;
1035    Uint8 *src = info->src;
1036    int srcskip = info->src_skip;
1037    Uint8 *dst = info->dst;
1038    int dstskip = info->dst_skip;
1039    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
1040
1041    while (height--) {
1042        int i = 0;
1043
1044        for (; i < width; ++i) {
1045            Uint32 src32 = *(Uint32 *)src;
1046            Uint32 dst32 = *(Uint32 *)dst;
1047            ALPHA_BLEND_8888(src32, dst32, srcfmt);
1048            *(Uint32 *)dst = dst32;
1049            src += 4;
1050            dst += 4;
1051        }
1052
1053        src += srcskip;
1054        dst += dstskip;
1055    }
1056}
1057
1058// Fast 32-bit RGBA->RGB(A) blending with pixel alpha and src swizzling
1059static void Blit8888to8888PixelAlphaSwizzle(SDL_BlitInfo *info)
1060{
1061    int width = info->dst_w;
1062    int height = info->dst_h;
1063    Uint8 *src = info->src;
1064    int srcskip = info->src_skip;
1065    Uint8 *dst = info->dst;
1066    int dstskip = info->dst_skip;
1067    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
1068    const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
1069    bool fill_alpha = !dstfmt->Amask;
1070    Uint32 dstAmask, dstAshift;
1071
1072    SDL_Get8888AlphaMaskAndShift(dstfmt, &dstAmask, &dstAshift);
1073
1074    while (height--) {
1075        int i = 0;
1076
1077        for (; i < width; ++i) {
1078            Uint32 src32 = *(Uint32 *)src;
1079            Uint32 dst32 = *(Uint32 *)dst;
1080            ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
1081            if (fill_alpha) {
1082                dst32 |= dstAmask;
1083            }
1084            *(Uint32 *)dst = dst32;
1085            src += 4;
1086            dst += 4;
1087        }
1088
1089        src += srcskip;
1090        dst += dstskip;
1091    }
1092}
1093
1094#ifdef SDL_SSE4_1_INTRINSICS
1095
/*
 * SSE4.1 variant of Blit8888to8888PixelAlphaSwizzle.
 *
 * Blends four 32-bit pixels per iteration using each source pixel's own
 * alpha, reordering the source channels into the destination layout with a
 * byte shuffle. The pixels left over at the end of each row (fewer than four)
 * go through the scalar ALPHA_BLEND_SWIZZLE_8888 path.
 */
static void SDL_TARGETING("sse4.1") Blit8888to8888PixelAlphaSwizzleSSE41(SDL_BlitInfo *info)
{
    int width = info->dst_w;
    int height = info->dst_h;
    Uint8 *src = info->src;
    int srcskip = info->src_skip;
    Uint8 *dst = info->dst;
    int dstskip = info->dst_skip;
    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
    const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
    // Destination format has no alpha mask: force the dstAmask bits on in the result
    bool fill_alpha = !dstfmt->Amask;
    Uint32 dstAmask, dstAshift;

    // NOTE(review): presumably reports a usable alpha byte even for formats
    // without an alpha mask - confirm against SDL_Get8888AlphaMaskAndShift
    SDL_Get8888AlphaMaskAndShift(dstfmt, &dstAmask, &dstAshift);

    // The byte offsets for the start of each pixel
    const __m128i mask_offsets = _mm_set_epi8(
        12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);

    // Shuffle control for the channel reorder: at the destination position of
    // each color channel, store the index of the source byte that holds that
    // channel (shift >> 3), then add the per-pixel byte offset
    const __m128i convert_mask = _mm_add_epi32(
        _mm_set1_epi32(
            ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
            ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
            ((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
        mask_offsets);

    // Shuffle control that copies each pixel's source alpha byte into all four of its bytes
    const __m128i alpha_splat_mask = _mm_add_epi8(_mm_set1_epi8(srcfmt->Ashift >> 3), mask_offsets);
    // dstAmask replicated into every 32-bit lane
    const __m128i alpha_fill_mask = _mm_set1_epi32((int)dstAmask);

    while (height--) {
        int i = 0;

        // Vector path: four pixels (16 bytes) per iteration
        for (; i + 4 <= width; i += 4) {
            // Load 4 src pixels
            __m128i src128 = _mm_loadu_si128((__m128i *)src);

            // Load 4 dst pixels
            __m128i dst128 = _mm_loadu_si128((__m128i *)dst);

            // Extract the alpha from each pixel and splat it into all the channels
            __m128i srcA = _mm_shuffle_epi8(src128, alpha_splat_mask);

            // Convert to dst format
            src128 = _mm_shuffle_epi8(src128, convert_mask);

            // Set the alpha channels of src to 255
            src128 = _mm_or_si128(src128, alpha_fill_mask);

            // Duplicate each 8-bit alpha value into both bytes of 16-bit lanes
            __m128i srca_lo = _mm_unpacklo_epi8(srcA, srcA);
            __m128i srca_hi = _mm_unpackhi_epi8(srcA, srcA);

            // Calculate 255-srcA in every second 8-bit lane (255-srcA = srcA^0xff)
            srca_lo = _mm_xor_si128(srca_lo, _mm_set1_epi16(0xff00));
            srca_hi = _mm_xor_si128(srca_hi, _mm_set1_epi16(0xff00));

            // maddubs expects second argument to be signed, so subtract 128
            src128 = _mm_sub_epi8(src128, _mm_set1_epi8((Uint8)128));
            dst128 = _mm_sub_epi8(dst128, _mm_set1_epi8((Uint8)128));

            // The unpack interleaves src (low byte) with dst (high byte) so that each
            // 16-bit lane pairs them with srcA and 255-srcA respectively
            // dst = srcA*(src-128) + (255-srcA)*(dst-128) = srcA*src + (255-srcA)*dst - 128*255
            __m128i dst_lo = _mm_maddubs_epi16(srca_lo, _mm_unpacklo_epi8(src128, dst128));
            __m128i dst_hi = _mm_maddubs_epi16(srca_hi, _mm_unpackhi_epi8(src128, dst128));

            // dst += 0x1U (use 0x80 to round instead of floor) + 128*255 (to fix maddubs result)
            dst_lo = _mm_add_epi16(dst_lo, _mm_set1_epi16(1 + 128 * 255));
            dst_hi = _mm_add_epi16(dst_hi, _mm_set1_epi16(1 + 128 * 255));

            // dst = (dst + (dst >> 8)) >> 8 = (dst * 257) >> 16
            dst_lo = _mm_mulhi_epu16(dst_lo, _mm_set1_epi16(257));
            dst_hi = _mm_mulhi_epu16(dst_hi, _mm_set1_epi16(257));

            // Blend the pixels together and save the result
            dst128 = _mm_packus_epi16(dst_lo, dst_hi);
            if (fill_alpha) {
                dst128 = _mm_or_si128(dst128, alpha_fill_mask);
            }
            _mm_storeu_si128((__m128i *)dst, dst128);

            src += 16;
            dst += 16;
        }

        // Scalar tail: the remaining (fewer than four) pixels of the row
        for (; i < width; ++i) {
            Uint32 src32 = *(Uint32 *)src;
            Uint32 dst32 = *(Uint32 *)dst;
            ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
            if (fill_alpha) {
                dst32 |= dstAmask;
            }
            *(Uint32 *)dst = dst32;
            src += 4;
            dst += 4;
        }

        // Step over the per-row skip of each surface
        src += srcskip;
        dst += dstskip;
    }
}
1195
1196#endif
1197
1198#ifdef SDL_AVX2_INTRINSICS
1199
/*
 * AVX2 variant of Blit8888to8888PixelAlphaSwizzle.
 *
 * Blends eight 32-bit pixels per iteration using each source pixel's own
 * alpha, reordering the source channels into the destination layout with a
 * byte shuffle. The pixels left over at the end of each row (fewer than
 * eight) go through the scalar ALPHA_BLEND_SWIZZLE_8888 path.
 */
static void SDL_TARGETING("avx2") Blit8888to8888PixelAlphaSwizzleAVX2(SDL_BlitInfo *info)
{
    int width = info->dst_w;
    int height = info->dst_h;
    Uint8 *src = info->src;
    int srcskip = info->src_skip;
    Uint8 *dst = info->dst;
    int dstskip = info->dst_skip;
    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
    const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
    // Destination format has no alpha mask: force the dstAmask bits on in the result
    bool fill_alpha = !dstfmt->Amask;
    Uint32 dstAmask, dstAshift;

    // NOTE(review): presumably reports a usable alpha byte even for formats
    // without an alpha mask - confirm against SDL_Get8888AlphaMaskAndShift
    SDL_Get8888AlphaMaskAndShift(dstfmt, &dstAmask, &dstAshift);

    // The byte offsets for the start of each pixel
    const __m256i mask_offsets = _mm256_set_epi8(
        28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);

    // Shuffle control for the channel reorder: at the destination position of
    // each color channel, store the index of the source byte that holds that
    // channel (shift >> 3), then add the per-pixel byte offset
    const __m256i convert_mask = _mm256_add_epi32(
        _mm256_set1_epi32(
            ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
            ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
            ((srcfmt->Bshift >> 3) << dstfmt->Bshift)),
        mask_offsets);

    // Shuffle control that copies each pixel's source alpha byte into all four of its bytes
    const __m256i alpha_splat_mask = _mm256_add_epi8(_mm256_set1_epi8(srcfmt->Ashift >> 3), mask_offsets);
    // dstAmask replicated into every 32-bit lane
    const __m256i alpha_fill_mask = _mm256_set1_epi32((int)dstAmask);

    while (height--) {
        int i = 0;

        // Vector path: eight pixels (32 bytes) per iteration
        for (; i + 8 <= width; i += 8) {
            // Load 8 src pixels
            __m256i src256 = _mm256_loadu_si256((__m256i *)src);

            // Load 8 dst pixels
            __m256i dst256 = _mm256_loadu_si256((__m256i *)dst);

            // Extract the alpha from each pixel and splat it into all the channels
            __m256i srcA = _mm256_shuffle_epi8(src256, alpha_splat_mask);

            // Convert to dst format
            src256 = _mm256_shuffle_epi8(src256, convert_mask);

            // Set the alpha channels of src to 255
            src256 = _mm256_or_si256(src256, alpha_fill_mask);

            // Duplicate each 8-bit alpha value into both bytes of 16-bit lanes
            __m256i alpha_lo = _mm256_unpacklo_epi8(srcA, srcA);
            __m256i alpha_hi = _mm256_unpackhi_epi8(srcA, srcA);

            // Calculate 255-srcA in every second 8-bit lane (255-srcA = srcA^0xff)
            alpha_lo = _mm256_xor_si256(alpha_lo, _mm256_set1_epi16(0xff00));
            alpha_hi = _mm256_xor_si256(alpha_hi, _mm256_set1_epi16(0xff00));

            // maddubs expects second argument to be signed, so subtract 128
            src256 = _mm256_sub_epi8(src256, _mm256_set1_epi8((Uint8)128));
            dst256 = _mm256_sub_epi8(dst256, _mm256_set1_epi8((Uint8)128));

            // The unpack interleaves src (low byte) with dst (high byte) so that each
            // 16-bit lane pairs them with srcA and 255-srcA respectively
            // dst = srcA*(src-128) + (255-srcA)*(dst-128) = srcA*src + (255-srcA)*dst - 128*255
            __m256i dst_lo = _mm256_maddubs_epi16(alpha_lo, _mm256_unpacklo_epi8(src256, dst256));
            __m256i dst_hi = _mm256_maddubs_epi16(alpha_hi, _mm256_unpackhi_epi8(src256, dst256));

            // dst += 0x1U (use 0x80 to round instead of floor) + 128*255 (to fix maddubs result)
            dst_lo = _mm256_add_epi16(dst_lo, _mm256_set1_epi16(1 + 128 * 255));
            dst_hi = _mm256_add_epi16(dst_hi, _mm256_set1_epi16(1 + 128 * 255));

            // dst = (dst + (dst >> 8)) >> 8 = (dst * 257) >> 16
            dst_lo = _mm256_mulhi_epu16(dst_lo, _mm256_set1_epi16(257));
            dst_hi = _mm256_mulhi_epu16(dst_hi, _mm256_set1_epi16(257));

            // Blend the pixels together and save the result
            // (the unpacks above and this pack all work within each 128-bit half,
            // so the original pixel order is restored)
            dst256 = _mm256_packus_epi16(dst_lo, dst_hi);
            if (fill_alpha) {
                dst256 = _mm256_or_si256(dst256, alpha_fill_mask);
            }
            _mm256_storeu_si256((__m256i *)dst, dst256);

            src += 32;
            dst += 32;
        }

        // Scalar tail: the remaining (fewer than eight) pixels of the row
        for (; i < width; ++i) {
            Uint32 src32 = *(Uint32 *)src;
            Uint32 dst32 = *(Uint32 *)dst;
            ALPHA_BLEND_SWIZZLE_8888(src32, dst32, srcfmt, dstfmt);
            if (fill_alpha) {
                dst32 |= dstAmask;
            }
            *(Uint32 *)dst = dst32;
            src += 4;
            dst += 4;
        }

        // Step over the per-row skip of each surface
        src += srcskip;
        dst += dstskip;
    }
}
1299
1300#endif
1301
1302#if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64))
1303
/*
 * NEON (AArch64) variant of Blit8888to8888PixelAlphaSwizzle.
 *
 * Blends four 32-bit pixels per iteration using each source pixel's own
 * alpha, reordering the source channels into the destination layout with a
 * table lookup. The pixels left over at the end of each row (fewer than four)
 * are processed one at a time with the same arithmetic on 64-bit vectors.
 */
static void Blit8888to8888PixelAlphaSwizzleNEON(SDL_BlitInfo *info)
{
    int width = info->dst_w;
    int height = info->dst_h;
    Uint8 *src = info->src;
    int srcskip = info->src_skip;
    Uint8 *dst = info->dst;
    int dstskip = info->dst_skip;
    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
    const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
    // Destination format has no alpha mask: force the dstAmask bits on in the result
    bool fill_alpha = !dstfmt->Amask;
    Uint32 dstAmask, dstAshift;

    // NOTE(review): presumably reports a usable alpha byte even for formats
    // without an alpha mask - confirm against SDL_Get8888AlphaMaskAndShift
    SDL_Get8888AlphaMaskAndShift(dstfmt, &dstAmask, &dstAshift);

    // The byte offsets for the start of each pixel
    const uint8x16_t mask_offsets = vreinterpretq_u8_u64(vcombine_u64(
        vcreate_u64(0x0404040400000000), vcreate_u64(0x0c0c0c0c08080808)));

    // Table-lookup indices for the channel reorder: at the destination position
    // of each color channel, store the index of the source byte that holds that
    // channel (shift >> 3), then add the per-pixel byte offset
    const uint8x16_t convert_mask = vreinterpretq_u8_u32(vaddq_u32(
        vreinterpretq_u32_u8(mask_offsets),
        vdupq_n_u32(
            ((srcfmt->Rshift >> 3) << dstfmt->Rshift) |
            ((srcfmt->Gshift >> 3) << dstfmt->Gshift) |
            ((srcfmt->Bshift >> 3) << dstfmt->Bshift))));

    // Table-lookup indices that copy each pixel's source alpha byte into all four of its bytes
    const uint8x16_t alpha_splat_mask = vaddq_u8(vdupq_n_u8(srcfmt->Ashift >> 3), mask_offsets);
    // dstAmask replicated into every 32-bit lane
    const uint8x16_t alpha_fill_mask = vreinterpretq_u8_u32(vdupq_n_u32(dstAmask));

    while (height--) {
        int i = 0;

        // Vector path: four pixels (16 bytes) per iteration
        for (; i + 4 <= width; i += 4) {
            // Load 4 src pixels
            uint8x16_t src128 = vld1q_u8(src);

            // Load 4 dst pixels
            uint8x16_t dst128 = vld1q_u8(dst);

            // Extract the alpha from each pixel and splat it into all the channels
            uint8x16_t srcA = vqtbl1q_u8(src128, alpha_splat_mask);

            // Convert to dst format
            src128 = vqtbl1q_u8(src128, convert_mask);

            // Set the alpha channels of src to 255
            src128 = vorrq_u8(src128, alpha_fill_mask);

            // 255 - srcA = ~srcA
            uint8x16_t srcInvA = vmvnq_u8(srcA);

            // Result initialized with 1, this is for truncated divide later
            uint16x8_t res_lo = vdupq_n_u16(1);
            uint16x8_t res_hi = vdupq_n_u16(1);

            // res = alpha * src + (255 - alpha) * dst
            res_lo = vmlal_u8(res_lo, vget_low_u8(srcA),    vget_low_u8(src128));
            res_lo = vmlal_u8(res_lo, vget_low_u8(srcInvA), vget_low_u8(dst128));
            res_hi = vmlal_high_u8(res_hi, srcA,    src128);
            res_hi = vmlal_high_u8(res_hi, srcInvA, dst128);

            // Now result has +1 already added for truncated division
            // dst = (res + (res >> 8)) >> 8
            uint8x8_t temp;
            temp   = vaddhn_u16(res_lo, vshrq_n_u16(res_lo, 8));
            dst128 = vaddhn_high_u16(temp, res_hi, vshrq_n_u16(res_hi, 8));

            // For rounded division remove the constant 1 and change first two vmlal_u8 to vmull_u8
            // Then replace two previous lines with following code:
            // temp   = vraddhn_u16(res_lo, vrshrq_n_u16(res_lo, 8));
            // dst128 = vraddhn_high_u16(temp, res_hi, vrshrq_n_u16(res_hi, 8));

            if (fill_alpha) {
                dst128 = vorrq_u8(dst128, alpha_fill_mask);
            }

            // Save the result
            vst1q_u8(dst, dst128);

            src += 16;
            dst += 16;
        }

        // Process 1 pixel per iteration, max 3 iterations, same calculations as above
        for (; i < width; ++i) {
            // Top 32-bits will be not used in src32 & dst32
            uint8x8_t src32 = vreinterpret_u8_u32(vld1_dup_u32((Uint32 *)src));
            uint8x8_t dst32 = vreinterpret_u8_u32(vld1_dup_u32((Uint32 *)dst));

            // Only the low halves of the 128-bit masks are needed for one pixel
            uint8x8_t srcA = vtbl1_u8(src32, vget_low_u8(alpha_splat_mask));
            src32 = vtbl1_u8(src32, vget_low_u8(convert_mask));
            src32 = vorr_u8(src32, vget_low_u8(alpha_fill_mask));
            uint8x8_t srcInvA = vmvn_u8(srcA);

            // res = 1 + alpha * src + (255 - alpha) * dst
            uint16x8_t res = vdupq_n_u16(1);
            res = vmlal_u8(res, srcA,    src32);
            res = vmlal_u8(res, srcInvA, dst32);

            // dst = (res + (res >> 8)) >> 8
            dst32 = vaddhn_u16(res, vshrq_n_u16(res, 8));

            if (fill_alpha) {
                dst32 = vorr_u8(dst32, vget_low_u8(alpha_fill_mask));
            }

            // Save the result, only low 32-bits
            vst1_lane_u32((Uint32 *)dst, vreinterpret_u32_u8(dst32), 0);

            src += 4;
            dst += 4;
        }

        // Step over the per-row skip of each surface
        src += srcskip;
        dst += dstskip;
    }
}
1419
1420#endif
1421
1422// General (slow) N->N blending with pixel alpha
1423static void BlitNtoNPixelAlpha(SDL_BlitInfo *info)
1424{
1425    int width = info->dst_w;
1426    int height = info->dst_h;
1427    Uint8 *src = info->src;
1428    int srcskip = info->src_skip;
1429    Uint8 *dst = info->dst;
1430    int dstskip = info->dst_skip;
1431    const SDL_PixelFormatDetails *srcfmt = info->src_fmt;
1432    const SDL_PixelFormatDetails *dstfmt = info->dst_fmt;
1433    int srcbpp;
1434    int dstbpp;
1435    Uint32 Pixel;
1436    unsigned sR, sG, sB, sA;
1437    unsigned dR, dG, dB, dA;
1438
1439    // Set up some basic variables
1440    srcbpp = srcfmt->bytes_per_pixel;
1441    dstbpp = dstfmt->bytes_per_pixel;
1442
1443    while (height--) {
1444        DUFFS_LOOP(
1445        {
1446        DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
1447        if (sA) {
1448            DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1449            ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1450            ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1451        }
1452        src += srcbpp;
1453        dst += dstbpp;
1454        },
1455        width);
1456        /* *INDENT-ON* */ // clang-format on
1457        src += srcskip;
1458        dst += dstskip;
1459    }
1460}
1461
/*
 * Pick the alpha-blending blit routine for a surface.
 *
 * The choice depends on the copy flags in surface->map.info.flags (with the
 * RLE bits masked off), the source format (surface->fmt) and the destination
 * format (surface->map.info.dst_fmt). Specialised routines are preferred when
 * the formats match their assumptions; otherwise a generic N->N routine is
 * returned.
 *
 * Returns the selected function, or NULL when no routine in this file handles
 * the flag/format combination.
 */
SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface)
{
    const SDL_PixelFormatDetails *sf = surface->fmt;
    const SDL_PixelFormatDetails *df = surface->map.info.dst_fmt;

    switch (surface->map.info.flags & ~SDL_COPY_RLE_MASK) {
    case SDL_COPY_BLEND:
        // Per-pixel alpha blits
        switch (df->bytes_per_pixel) {
        case 1:
            if (surface->map.info.dst_pal) {
                return BlitNto1PixelAlpha;
            } else {
                // RGB332 has no palette !
                return BlitNtoNPixelAlpha;
            }

        case 2:
            // 32-bit source with alpha in the top byte and green in bits 8-15, whose
            // low byte lands in the destination's low 5-bit field
            if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
                if (df->Gmask == 0x7e0) {
                    return BlitARGBto565PixelAlpha;
                } else if (df->Gmask == 0x3e0 && !df->Amask) {
                    return BlitARGBto555PixelAlpha;
                }
            }
            return BlitNtoNPixelAlpha;

        case 4:
            // Both formats are packed 8888 and the source carries alpha
            if (SDL_PIXELLAYOUT(sf->format) == SDL_PACKEDLAYOUT_8888 && sf->Amask &&
                SDL_PIXELLAYOUT(df->format) == SDL_PACKEDLAYOUT_8888) {
                // SIMD variants are tried first, each guarded by a runtime CPU check
#ifdef SDL_AVX2_INTRINSICS
                if (SDL_HasAVX2()) {
                    return Blit8888to8888PixelAlphaSwizzleAVX2;
                }
#endif
#ifdef SDL_SSE4_1_INTRINSICS
                if (SDL_HasSSE41()) {
                    return Blit8888to8888PixelAlphaSwizzleSSE41;
                }
#endif
#ifdef SDL_LSX_INTRINSICS
                if (SDL_HasLSX()) {
                    return Blit8888to8888PixelAlphaSwizzleLSX;
                }
#endif
#if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64))
                // The NEON routine is returned unconditionally in this configuration
                // To prevent "unused function" compiler warnings/errors
                (void)Blit8888to8888PixelAlpha;
                (void)Blit8888to8888PixelAlphaSwizzle;
                return Blit8888to8888PixelAlphaSwizzleNEON;
#else
                // Scalar fallback: identical formats need no channel reordering
                if (sf->format == df->format) {
                    return Blit8888to8888PixelAlpha;
                } else {
                    return Blit8888to8888PixelAlphaSwizzle;
                }
#endif
            }
            return BlitNtoNPixelAlpha;

        case 3:
        default:
            break;
        }
        return BlitNtoNPixelAlpha;

    case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
        // Only handled here when the source has no alpha channel of its own;
        // otherwise control falls out of the switch and NULL is returned
        if (sf->Amask == 0) {
            // Per-surface alpha blits
            switch (df->bytes_per_pixel) {
            case 1:
                if (surface->map.info.dst_pal) {
                    return BlitNto1SurfaceAlpha;
                } else {
                    // RGB332 has no palette !
                    return BlitNtoNSurfaceAlpha;
                }

            case 2:
                // The fast 16-bit routines require surface->map.identity to be set
                if (surface->map.identity) {
                    if (df->Gmask == 0x7e0) {
#ifdef SDL_MMX_INTRINSICS
                        if (SDL_HasMMX()) {
                            return Blit565to565SurfaceAlphaMMX;
                        } else
#endif
                        {
                            return Blit565to565SurfaceAlpha;
                        }
                    } else if (df->Gmask == 0x3e0) {
#ifdef SDL_MMX_INTRINSICS
                        if (SDL_HasMMX()) {
                            return Blit555to555SurfaceAlphaMMX;
                        } else
#endif
                        {
                            return Blit555to555SurfaceAlpha;
                        }
                    }
                }
                return BlitNtoNSurfaceAlpha;

            case 4:
                // Fast paths need a 32-bit source with the same RGB masks as the destination
                if (sf->Rmask == df->Rmask && sf->Gmask == df->Gmask && sf->Bmask == df->Bmask && sf->bytes_per_pixel == 4) {
#ifdef SDL_SSE2_INTRINSICS
                    // The SSE2 routine additionally needs every channel byte-aligned
                    if (sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0 && SDL_HasSSE2()) {
                        return Blit888to888SurfaceAlphaSSE2;
                    }
#endif
                    // The scalar routine needs the color channels in the low 24 bits
                    if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
                        return BlitRGBtoRGBSurfaceAlpha;
                    }
                }
                return BlitNtoNSurfaceAlpha;

            case 3:
            default:
                return BlitNtoNSurfaceAlpha;
            }
        }
        break;

    case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
        // Colorkey plus per-surface alpha, again only for sources without alpha
        if (sf->Amask == 0) {
            if (df->bytes_per_pixel == 1) {

                if (surface->map.info.dst_pal) {
                    return BlitNto1SurfaceAlphaKey;
                } else {
                    // RGB332 has no palette !
                    return BlitNtoNSurfaceAlphaKey;
                }
            } else {
                return BlitNtoNSurfaceAlphaKey;
            }
        }
        break;
    }

    // No alpha blitter in this file matches the requested combination
    return NULL;
}
1603
1604#endif // SDL_HAVE_BLIT_A
1605
1606
[FILE END]
(C) 2025 0x4248
(C) 2025 4248 Media and 4248 Systems, All part of 0x4248
See the LICENCE files for more information. Not all files are by 0x4248; always check the licensing.