Atlas - SDL_stretch.c
/*
  Simple DirectMedia Layer
  Copyright (C) 1997-2025 Sam Lantinga <[email protected]>

  This software is provided 'as-is', without any express or implied
  warranty. In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.
*/
#include "SDL_internal.h"

#include "SDL_surface_c.h"

// Low-level scalers; "Unchecked" means srcrect/dstrect are assumed validated and the surfaces locked.
static bool SDL_StretchSurfaceUncheckedNearest(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect);
static bool SDL_StretchSurfaceUncheckedLinear(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect);

/* Scale the srcrect area of src into the dstrect area of dst using scaleMode.
 * - Differing pixel formats are handled by converting src first (slow path).
 * - FOURCC formats are scaled via a temporary XRGB8888 surface (slow path).
 * - SDL_SCALEMODE_PIXELART is mapped to NEAREST here.
 * - LINEAR is restricted to 4-bytes-per-pixel formats (and not ARGB2101010).
 * Returns true on success, false with an SDL error set otherwise. */
bool SDL_StretchSurface(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect, SDL_ScaleMode scaleMode)
{
    bool result;
    int src_locked;
    int dst_locked;
    SDL_Rect full_src;
    SDL_Rect full_dst;

    CHECK_PARAM(!src) {
        return SDL_InvalidParamError("src");
    }
    CHECK_PARAM(!dst) {
        return SDL_InvalidParamError("dst");
    }

    if (src->format != dst->format) {
        // Slow!  Convert the source to the destination's format, then recurse.
        SDL_Surface *src_tmp = SDL_ConvertSurfaceAndColorspace(src, dst->format, dst->palette, dst->colorspace, dst->props);
        if (!src_tmp) {
            return false;
        }
        result = SDL_StretchSurface(src_tmp, srcrect, dst, dstrect, scaleMode);
        SDL_DestroySurface(src_tmp);
        return result;
    }

    if (SDL_ISPIXELFORMAT_FOURCC(src->format)) {
        // Slow!  Scale through a temporary XRGB8888 surface, then convert back.
        if (!dstrect) {
            full_dst.x = 0;
            full_dst.y = 0;
            full_dst.w = dst->w;
            full_dst.h = dst->h;
            dstrect = &full_dst;
        }

        SDL_Surface *src_tmp = SDL_ConvertSurface(src, SDL_PIXELFORMAT_XRGB8888);
        SDL_Surface *dst_tmp = SDL_CreateSurface(dstrect->w, dstrect->h, SDL_PIXELFORMAT_XRGB8888);
        if (src_tmp && dst_tmp) {
            result = SDL_StretchSurface(src_tmp, srcrect, dst_tmp, NULL, scaleMode);
            if (result) {
                result = SDL_ConvertPixelsAndColorspace(dstrect->w, dstrect->h,
                                                        dst_tmp->format, SDL_COLORSPACE_SRGB, 0,
                                                        dst_tmp->pixels, dst_tmp->pitch,
                                                        dst->format, dst->colorspace, SDL_GetSurfaceProperties(dst),
                                                        (Uint8 *)dst->pixels + dstrect->y * dst->pitch + dstrect->x * SDL_BYTESPERPIXEL(dst->format), dst->pitch);
            }
        } else {
            result = false;
        }
        SDL_DestroySurface(src_tmp);
        SDL_DestroySurface(dst_tmp);
        return result;
    }

    switch (scaleMode) {
    case SDL_SCALEMODE_NEAREST:
        break;
    case SDL_SCALEMODE_LINEAR:
        break;
    case SDL_SCALEMODE_PIXELART:
        // Pixel-art scaling is implemented as nearest-neighbor at this level
        scaleMode = SDL_SCALEMODE_NEAREST;
        break;
    default:
        return SDL_InvalidParamError("scaleMode");
    }

    if (scaleMode == SDL_SCALEMODE_LINEAR) {
        // The linear scalers below operate on 4 independent 8-bit channels
        if (SDL_BYTESPERPIXEL(src->format) != 4 || src->format == SDL_PIXELFORMAT_ARGB2101010) {
            return SDL_SetError("Wrong format");
        }
    }

    // Verify the blit rectangles
    if (srcrect) {
        if ((srcrect->x < 0) || (srcrect->y < 0) ||
            ((srcrect->x + srcrect->w) > src->w) ||
            ((srcrect->y + srcrect->h) > src->h)) {
            return SDL_SetError("Invalid source blit rectangle");
        }
    } else {
        full_src.x = 0;
        full_src.y = 0;
        full_src.w = src->w;
        full_src.h = src->h;
        srcrect = &full_src;
    }
    if (dstrect) {
        if ((dstrect->x < 0) || (dstrect->y < 0) ||
            ((dstrect->x + dstrect->w) > dst->w) ||
            ((dstrect->y + dstrect->h) > dst->h)) {
            return SDL_SetError("Invalid destination blit rectangle");
        }
    } else {
        full_dst.x = 0;
        full_dst.y = 0;
        full_dst.w = dst->w;
        full_dst.h = dst->h;
        dstrect = &full_dst;
    }

    // Nothing to draw into
    if (dstrect->w <= 0 || dstrect->h <= 0) {
        return true;
    }

    // The scalers use 16.16 fixed point, so dimensions must fit in 16 bits
    if (srcrect->w > SDL_MAX_UINT16 || srcrect->h > SDL_MAX_UINT16 ||
        dstrect->w > SDL_MAX_UINT16 || dstrect->h > SDL_MAX_UINT16) {
        return SDL_SetError("Size too large for scaling");
    }

    // Lock the destination if it's in hardware
    dst_locked = 0;
    if (SDL_MUSTLOCK(dst)) {
        if (!SDL_LockSurface(dst)) {
            return SDL_SetError("Unable to lock destination surface");
        }
        dst_locked = 1;
    }
    // Lock the source if it's in hardware
    src_locked = 0;
    if (SDL_MUSTLOCK(src)) {
        if (!SDL_LockSurface(src)) {
            if (dst_locked) {
                SDL_UnlockSurface(dst);
            }
            return SDL_SetError("Unable to lock source surface");
        }
        src_locked = 1;
    }

    if (scaleMode == SDL_SCALEMODE_NEAREST) {
        result = SDL_StretchSurfaceUncheckedNearest(src, srcrect, dst, dstrect);
    } else {
        result = SDL_StretchSurfaceUncheckedLinear(src, srcrect, dst, dstrect);
    }

    // We need to unlock the surfaces if they're locked
    if (dst_locked) {
        SDL_UnlockSurface(dst);
    }
    if (src_locked) {
        SDL_UnlockSurface(src);
    }

    return result;
}

/* bilinear interpolation precision must be < 8
   Because with SSE: add-multiply: _mm_madd_epi16 works with signed int
   so pixels 0xb1......
   are negatives and false the result
   same in NEON probably */
#define PRECISION 7

// 16.16 fixed-point helpers used by the bilinear scalers
#define FIXED_POINT(i) ((Uint32)(i) << 16)
#define SRC_INDEX(fp) ((Uint32)(fp) >> 16)
#define INTEGER(fp) ((Uint32)(fp) >> PRECISION)
#define FRAC(fp) ((Uint32)((fp) >> (16 - PRECISION)) & ((1 << PRECISION) - 1))
#define FRAC_ZERO 0
#define FRAC_ONE (1 << PRECISION)
#define FP_ONE FIXED_POINT(1)

/* Shared prologue for the bilinear scalers: computes the vertical and
   horizontal fixed-point start/step values and the left/right edge padding
   counts, and derives the per-row constants. Declares the loop state used
   by BILINEAR___HEIGHT below. */
#define BILINEAR___START                                                                      \
    int i;                                                                                    \
    Sint64 fp_sum_h;                                                                          \
    int fp_step_h, left_pad_h, right_pad_h;                                                   \
    Sint64 fp_sum_w;                                                                          \
    int fp_step_w, left_pad_w, right_pad_w;                                                   \
    Sint64 fp_sum_w_init;                                                                     \
    int left_pad_w_init, right_pad_w_init, dst_gap, middle_init;                              \
    get_scaler_datas(src_h, dst_h, &fp_sum_h, &fp_step_h, &left_pad_h, &right_pad_h);         \
    get_scaler_datas(src_w, dst_w, &fp_sum_w, &fp_step_w, &left_pad_w, &right_pad_w);         \
    fp_sum_w_init = fp_sum_w + left_pad_w * fp_step_w;                                        \
    left_pad_w_init = left_pad_w;                                                             \
    right_pad_w_init = right_pad_w;                                                           \
    dst_gap = dst_pitch - 4 * dst_w;                                                          \
    middle_init = dst_w - left_pad_w - right_pad_w;

/* Per-row setup: picks the two source rows (src_h0/src_h1) and vertical
   fractions for destination row i, clamping to the first/last source row in
   the padded regions, then resets the horizontal accumulator and counters. */
#define BILINEAR___HEIGHT                                                     \
    int index_h, frac_h0, frac_h1, middle;                                    \
    const Uint32 *src_h0, *src_h1;                                            \
    int no_padding;                                                           \
    Uint64 incr_h0, incr_h1;                                                  \
                                                                              \
    no_padding = !(i < left_pad_h || i > dst_h - 1 - right_pad_h);            \
    index_h = SRC_INDEX(fp_sum_h);                                            \
    frac_h0 = FRAC(fp_sum_h);                                                 \
                                                                              \
    index_h = no_padding ? index_h : (i < left_pad_h ? 0 : src_h - 1);        \
    frac_h0 = no_padding ? frac_h0 : 0;                                       \
    incr_h1 = no_padding ? src_pitch : 0;                                     \
    incr_h0 = (Uint64)index_h * src_pitch;                                    \
                                                                              \
    src_h0 = (const Uint32 *)((const Uint8 *)src + incr_h0);                  \
    src_h1 = (const Uint32 *)((const Uint8 *)src_h0 + incr_h1);               \
                                                                              \
    fp_sum_h += fp_step_h;                                                    \
                                                                              \
    frac_h1 = FRAC_ONE - frac_h0;                                             \
    fp_sum_w = fp_sum_w_init;                                                 \
    right_pad_w = right_pad_w_init;                                           \
    left_pad_w = left_pad_w_init;                                             \
    middle = middle_init;

#ifdef __clang__
// Remove inlining of this function
// Compiler crash with clang 9.0.8 / android-ndk-r21d
// Compiler crash with clang 11.0.3 / Xcode
// OK with clang 11.0.5 / android-ndk-22
// OK with clang 12.0.0 / Xcode
__attribute__((noinline))
#endif
/* Compute the 16.16 fixed-point sampling start (*fp_start) and step
   (*fp_step) for mapping dst_nb destination pixels onto src_nb source
   pixels, plus the number of destination pixels at the left/right edges
   (*left_pad / *right_pad) that would sample outside [0, src_nb-2] and so
   must be clamped by the caller. */
static void get_scaler_datas(int src_nb, int dst_nb, Sint64 *fp_start, int *fp_step, int *left_pad, int *right_pad)
{

    int step = FIXED_POINT(src_nb) / (dst_nb); // source step in fixed point
    int x0 = FP_ONE / 2;                       // dst first pixel center at 0.5 in fixed point
    Sint64 fp_sum;
    int i;
#if 0
    // scale to source coordinates
    x0 *= src_nb;
    x0 /= dst_nb; // x0 == step / 2
#else
    // Use this code for perfect match with pixman
    Sint64 tmp[2];
    tmp[0] = (Sint64)step * (x0 >> 16);
    tmp[1] = (Sint64)step * (x0 & 0xFFFF);
    x0 = (int)(tmp[0] + ((tmp[1] + 0x8000) >> 16)); // x0 == (step + 1) / 2
#endif
    // -= 0.5, get back the pixel origin, in source coordinates
    x0 -= FP_ONE / 2;

    *fp_start = x0;
    *fp_step = step;
    *left_pad = 0;
    *right_pad = 0;

    // Walk the whole row once to count how many pixels fall off each edge
    fp_sum = x0;
    for (i = 0; i < dst_nb; i++) {
        if (fp_sum < 0) {
            *left_pad += 1;
        } else {
            int index = SRC_INDEX(fp_sum);
            if (index > src_nb - 2) {
                *right_pad += 1;
            }
        }
        fp_sum += step;
    }
    // SDL_Log("%d -> %d x0=%d step=%d left_pad=%d right_pad=%d", src_nb, dst_nb, *fp_start, *fp_step, *left_pad, *right_pad);
}

// One 32-bit pixel viewed as 4 independent 8-bit channels (channel order agnostic)
typedef struct color_t
{
    Uint8 a;
    Uint8 b;
    Uint8 c;
    Uint8 d;
} color_t;

#if 0
// Debug helper: dump 8 bytes (disabled)
static void printf_64(const char *str, void *var)
{
    uint8_t *val = (uint8_t *)var;
    printf(" * %s: %02x %02x %02x %02x _ %02x %02x %02x %02x\n",
           str, val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
}
#endif

/* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */

/* Linearly interpolate the 4 channels of two pixels.
   frac0/frac1 are complementary PRECISION-bit weights (frac1 == FRAC_ONE - frac0). */
static SDL_INLINE void INTERPOL(const Uint32 *src_x0, const Uint32 *src_x1, int frac0, int frac1, Uint32 *dst)
{
    const color_t *c0 = (const color_t *)src_x0;
    const color_t *c1 = (const color_t *)src_x1;
    color_t *cx = (color_t *)dst;
#if 0
    cx->a = c0->a + INTEGER(frac0 * (c1->a - c0->a));
    cx->b = c0->b + INTEGER(frac0 * (c1->b - c0->b));
    cx->c = c0->c + INTEGER(frac0 * (c1->c - c0->c));
    cx->d = c0->d + INTEGER(frac0 * (c1->d - c0->d));
#else
    cx->a = (Uint8)INTEGER(frac1 * c0->a + frac0 * c1->a);
    cx->b = (Uint8)INTEGER(frac1 * c0->b + frac0 * c1->b);
    cx->c = (Uint8)INTEGER(frac1 * c0->c + frac0 * c1->c);
    cx->d = (Uint8)INTEGER(frac1 * c0->d + frac0 * c1->d);
#endif
}

/* Bilinear sample from the 2x2 block {s0[0], s0[1]} / {s1[0], s1[1]}:
   interpolate vertically first, then horizontally with frac_w0. */
static SDL_INLINE void INTERPOL_BILINEAR(const Uint32 *s0, const Uint32 *s1, int frac_w0, int frac_h0, int frac_h1, Uint32 *dst)
{
    Uint32 tmp[2];
    unsigned int frac_w1 = FRAC_ONE - frac_w0;

    // Vertical first, store to 'tmp'
    INTERPOL(s0, s1, frac_h0, frac_h1, tmp);
    INTERPOL(s0 + 1, s1 + 1, frac_h0, frac_h1, tmp + 1);

    // Horizontal, store to 'dst'
    INTERPOL(tmp, tmp + 1, frac_w0, frac_w1, dst);
}

/* Scalar (no SIMD) bilinear scaler for 4-bytes-per-pixel surfaces.
   Edge columns/rows counted by left_pad/right_pad are clamped to the
   first/last valid source sample. */
static bool scale_mat(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {

        BILINEAR___HEIGHT

        while (left_pad_w--) {
            INTERPOL_BILINEAR(src_h0, src_h1, FRAC_ZERO, frac_h0, frac_h1, dst);
            dst += 1;
        }

        while (middle--) {
            const Uint32 *s_00_01;
            const Uint32 *s_10_11;
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;

            /*
               x00 ... x0_ ..... x01
               .        .         .
               .        x         .
               .        .         .
               .        .         .
               x10 ... x1_ ..... x11
            */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);

            INTERPOL_BILINEAR(s_00_01, s_10_11, frac_w, frac_h0, frac_h1, dst);

            dst += 1;
        }

        while (right_pad_w--) {
            // Clamp to the last valid 2-pixel pair in the row
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR(s_00_01, s_10_11, FRAC_ONE, frac_h0, frac_h1, dst);
            dst += 1;
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

// NEON intrinsic result casts; MSVC's NEON headers don't accept these casts
#ifdef SDL_NEON_INTRINSICS
#define CAST_uint8x8_t (uint8x8_t)
#define CAST_uint32x2_t (uint32x2_t)
#endif

#if defined(_MSC_VER)
#ifdef SDL_NEON_INTRINSICS
#undef CAST_uint8x8_t
#undef CAST_uint32x2_t
#define CAST_uint8x8_t
#define CAST_uint32x2_t
#endif
#endif

#ifdef SDL_SSE2_INTRINSICS

#if 0
// Debug helper: dump an __m128i as 8 16-bit lanes (disabled)
static void SDL_TARGETING("sse2") printf_128(const char *str, __m128i var)
{
    uint16_t *val = (uint16_t *)&var;
    printf(" * %s: %04x %04x %04x %04x _ %04x %04x %04x %04x\n",
           str, val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
}
#endif

// Cached runtime check for SSE2 support
static SDL_INLINE int hasSSE2(void)
{
    static int val = -1;
    if (val != -1) {
        return val;
    }
    val = SDL_HasSSE2();
    return val;
}

/* SSE2 bilinear sample of one destination pixel from the 2x2 block at s0/s1.
   v_frac_h0/v_frac_h1 are the vertical weights broadcast to 8 16-bit lanes. */
static SDL_INLINE void SDL_TARGETING("sse2") INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1, int frac_w, __m128i v_frac_h0, __m128i v_frac_h1, Uint32 *dst, __m128i zero)
{
    __m128i x_00_01, x_10_11; /* Pixels in 4*uint8 in row */
    __m128i v_frac_w0, k0, l0, d0, e0;

    int f, f2;
    f = frac_w;
    f2 = FRAC_ONE - frac_w;
    v_frac_w0 = _mm_set_epi16((short)f, (short)f2, (short)f,
                              (short)f2, (short)f, (short)f2, (short)f, (short)f2);

    x_00_01 = _mm_loadl_epi64((const __m128i *)s0); // Load x00 and x01
    x_10_11 = _mm_loadl_epi64((const __m128i *)s1);

    /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */

    // Interpolation vertical
    k0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_00_01, zero), v_frac_h1);
    l0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_10_11, zero), v_frac_h0);
    k0 = _mm_add_epi16(k0, l0);

    // For perfect match, clear the fractional part eventually.
    /*
    k0 = _mm_srli_epi16(k0, PRECISION);
    k0 = _mm_slli_epi16(k0, PRECISION);
    */

    // Interpolation horizontal
    l0 = _mm_unpacklo_epi64(/* unused */ l0, k0);
    k0 = _mm_madd_epi16(_mm_unpackhi_epi16(l0, k0), v_frac_w0);

    // Store 1 pixel
    d0 = _mm_srli_epi32(k0, PRECISION * 2);
    e0 = _mm_packs_epi32(d0, d0);
    e0 = _mm_packus_epi16(e0, e0);
    *dst = _mm_cvtsi128_si32(e0);
}

/* SSE2 bilinear scaler: same layout as scale_mat() but processes the middle
   of each row two destination pixels per iteration. */
static bool SDL_TARGETING("sse2") scale_mat_SSE(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {
        int nb_block2;
        __m128i v_frac_h0;
        __m128i v_frac_h1;
        __m128i zero;

        BILINEAR___HEIGHT

        nb_block2 = middle / 2;

        v_frac_h0 = _mm_set_epi16((short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0);
        v_frac_h1 = _mm_set_epi16((short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1);
        zero = _mm_setzero_si128();

        while (left_pad_w--) {
            INTERPOL_BILINEAR_SSE(src_h0, src_h1, FRAC_ZERO, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }

        while (nb_block2--) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;

            const Uint32 *s_00_01, *s_02_03, *s_10_11, *s_12_13;

            __m128i x_00_01, x_10_11, x_02_03, x_12_13; /* Pixels in 4*uint8 in row */
            __m128i v_frac_w0, k0, l0, d0, e0;
            __m128i v_frac_w1, k1, l1, d1, e1;

            int f, f2;
            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;

            /*
               x00............ x01   x02...........x03
               .      .         .     .     .       .
               j0     f0        j1    j2    f1      j3
               .      .         .     .     .       .
               .      .         .     .     .       .
               .      .         .     .     .       .
               x10............ x11   x12...........x13
            */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);

            f = frac_w_0;
            f2 = FRAC_ONE - frac_w_0;
            v_frac_w0 = _mm_set_epi16((short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2);

            f = frac_w_1;
            f2 = FRAC_ONE - frac_w_1;
            v_frac_w1 = _mm_set_epi16((short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2);

            x_00_01 = _mm_loadl_epi64((const __m128i *)s_00_01); // Load x00 and x01
            x_02_03 = _mm_loadl_epi64((const __m128i *)s_02_03);
            x_10_11 = _mm_loadl_epi64((const __m128i *)s_10_11);
            x_12_13 = _mm_loadl_epi64((const __m128i *)s_12_13);

            // Interpolation vertical
            k0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_00_01, zero), v_frac_h1);
            l0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_10_11, zero), v_frac_h0);
            k0 = _mm_add_epi16(k0, l0);
            k1 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_02_03, zero), v_frac_h1);
            l1 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_12_13, zero), v_frac_h0);
            k1 = _mm_add_epi16(k1, l1);

            // Interpolation horizontal
            l0 = _mm_unpacklo_epi64(/* unused */ l0, k0);
            k0 = _mm_madd_epi16(_mm_unpackhi_epi16(l0, k0), v_frac_w0);
            l1 = _mm_unpacklo_epi64(/* unused */ l1, k1);
            k1 = _mm_madd_epi16(_mm_unpackhi_epi16(l1, k1), v_frac_w1);

            // Store 1 pixel
            d0 = _mm_srli_epi32(k0, PRECISION * 2);
            e0 = _mm_packs_epi32(d0, d0);
            e0 = _mm_packus_epi16(e0, e0);
            *dst++ = _mm_cvtsi128_si32(e0);

            // Store 1 pixel
            d1 = _mm_srli_epi32(k1, PRECISION * 2);
            e1 = _mm_packs_epi32(d1, d1);
            e1 = _mm_packus_epi16(e1, e1);
            *dst++ = _mm_cvtsi128_si32(e1);
        }

        // Last point
        if (middle & 0x1) {
            const Uint32 *s_00_01;
            const Uint32 *s_10_11;
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_SSE(s_00_01, s_10_11, frac_w, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }

        while (right_pad_w--) {
            // Clamp to the last valid 2-pixel pair in the row
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_SSE(s_00_01, s_10_11, FRAC_ONE, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}
#endif

#ifdef SDL_NEON_INTRINSICS

// Cached runtime check for NEON support
static SDL_INLINE int hasNEON(void)
{
    static int val = -1;
    if (val != -1) {
        return val;
    }
    val = SDL_HasNEON();
    return val;
}

/* NEON bilinear sample of one destination pixel from the 2x2 block at s0/s1.
   v_frac_h0/v_frac_h1 are the vertical weights broadcast to 8 byte lanes. */
static SDL_INLINE void INTERPOL_BILINEAR_NEON(const Uint32 *s0, const Uint32 *s1, int frac_w, uint8x8_t v_frac_h0, uint8x8_t v_frac_h1, Uint32 *dst)
{
    uint8x8_t x_00_01, x_10_11; /* Pixels in 4*uint8 in row */
    uint16x8_t k0;
    uint32x4_t l0;
    uint16x8_t d0;
    uint8x8_t e0;

    x_00_01 = CAST_uint8x8_t vld1_u32(s0); // Load 2 pixels
    x_10_11 = CAST_uint8x8_t vld1_u32(s1);

    /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
    k0 = vmull_u8(x_00_01, v_frac_h1); /* k0 :=
   x0 * (1 - frac) */
    k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */

    // k0 now contains 2 interpolated pixels { j0, j1 }
    l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
    l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w);
    l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w);

    // Shift and narrow
    d0 = vcombine_u16(
        /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
        /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION));

    // Narrow again
    e0 = vmovn_u16(d0);

    // Store 1 pixel
    *dst = vget_lane_u32(CAST_uint32x2_t e0, 0);
}

/* NEON bilinear scaler: same layout as scale_mat() but processes the middle
   of each row four destination pixels per iteration, with 2- and 1-pixel
   tail handling. */
static bool scale_mat_NEON(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {
        int nb_block4;
        uint8x8_t v_frac_h0, v_frac_h1;

        BILINEAR___HEIGHT

        nb_block4 = middle / 4;

        v_frac_h0 = vmov_n_u8(frac_h0);
        v_frac_h1 = vmov_n_u8(frac_h1);

        while (left_pad_w--) {
            INTERPOL_BILINEAR_NEON(src_h0, src_h1, FRAC_ZERO, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        while (nb_block4--) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;
            int index_w_2, frac_w_2;
            int index_w_3, frac_w_3;

            const Uint32 *s_00_01, *s_02_03, *s_04_05, *s_06_07;
            const Uint32 *s_10_11, *s_12_13, *s_14_15, *s_16_17;

            uint8x8_t x_00_01, x_10_11, x_02_03, x_12_13; /* Pixels in 4*uint8 in row */
            uint8x8_t x_04_05, x_14_15, x_06_07, x_16_17;

            uint16x8_t k0, k1, k2, k3;
            uint32x4_t l0, l1, l2, l3;
            uint16x8_t d0, d1;
            uint8x8_t e0, e1;
            uint32x4_t f0;

            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_2 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_2 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_3 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_3 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;

            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_04_05 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_2);
            s_06_07 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_3);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);
            s_14_15 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_2);
            s_16_17 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_3);

            // Interpolation vertical
            x_00_01 = CAST_uint8x8_t vld1_u32(s_00_01); // Load 2 pixels
            x_02_03 = CAST_uint8x8_t vld1_u32(s_02_03);
            x_04_05 = CAST_uint8x8_t vld1_u32(s_04_05);
            x_06_07 = CAST_uint8x8_t vld1_u32(s_06_07);
            x_10_11 = CAST_uint8x8_t vld1_u32(s_10_11);
            x_12_13 = CAST_uint8x8_t vld1_u32(s_12_13);
            x_14_15 = CAST_uint8x8_t vld1_u32(s_14_15);
            x_16_17 = CAST_uint8x8_t vld1_u32(s_16_17);

            /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
            k0 = vmull_u8(x_00_01, v_frac_h1); /* k0 := x0 * (1 - frac) */
            k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */

            k1 = vmull_u8(x_02_03, v_frac_h1);
            k1 = vmlal_u8(k1, x_12_13, v_frac_h0);

            k2 = vmull_u8(x_04_05, v_frac_h1);
            k2 = vmlal_u8(k2, x_14_15, v_frac_h0);

            k3 = vmull_u8(x_06_07, v_frac_h1);
            k3 = vmlal_u8(k3, x_16_17, v_frac_h0);

            // k0 now contains 2 interpolated pixels { j0, j1 }
            // k1 now contains 2 interpolated pixels { j2, j3 }
            // k2 now contains 2 interpolated pixels { j4, j5 }
            // k3 now contains 2 interpolated pixels { j6, j7 }

            l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
            l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w_0);
            l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w_0);

            l1 = vshll_n_u16(vget_low_u16(k1), PRECISION);
            l1 = vmlsl_n_u16(l1, vget_low_u16(k1), frac_w_1);
            l1 = vmlal_n_u16(l1, vget_high_u16(k1), frac_w_1);

            l2 = vshll_n_u16(vget_low_u16(k2), PRECISION);
            l2 = vmlsl_n_u16(l2, vget_low_u16(k2), frac_w_2);
            l2 = vmlal_n_u16(l2, vget_high_u16(k2), frac_w_2);

            l3 = vshll_n_u16(vget_low_u16(k3), PRECISION);
            l3 = vmlsl_n_u16(l3, vget_low_u16(k3), frac_w_3);
            l3 = vmlal_n_u16(l3, vget_high_u16(k3), frac_w_3);

            // shift and narrow
            d0 = vcombine_u16(
                /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
                /* uint16x4_t */ vshrn_n_u32(l1, 2 * PRECISION));
            // narrow again
            e0 = vmovn_u16(d0);

            // Shift and narrow
            d1 = vcombine_u16(
                /* uint16x4_t */ vshrn_n_u32(l2, 2 * PRECISION),
                /* uint16x4_t */ vshrn_n_u32(l3, 2 * PRECISION));
            // Narrow again
            e1 = vmovn_u16(d1);

            f0 = vcombine_u32(CAST_uint32x2_t e0, CAST_uint32x2_t e1);
            // Store 4 pixels
            vst1q_u32(dst, f0);

            dst += 4;
        }

        if (middle & 0x2) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;
            const Uint32 *s_00_01, *s_02_03;
            const Uint32 *s_10_11, *s_12_13;
            uint8x8_t x_00_01, x_10_11, x_02_03, x_12_13; /* Pixels in 4*uint8 in row */
            uint16x8_t k0, k1;
            uint32x4_t l0, l1;
            uint16x8_t d0;
            uint8x8_t e0;

            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;

            /*
               x00............ x01   x02...........x03
               .      .         .     .      .      .
               j0   dest0       j1    j2   dest1    j3
               .      .         .     .      .      .
               .      .         .     .      .      .
               .      .         .     .      .      .
               x10............ x11   x12...........x13
            */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);

            // Interpolation vertical
            x_00_01 = CAST_uint8x8_t vld1_u32(s_00_01); // Load 2 pixels
            x_02_03 = CAST_uint8x8_t vld1_u32(s_02_03);
            x_10_11 = CAST_uint8x8_t vld1_u32(s_10_11);
            x_12_13 = CAST_uint8x8_t vld1_u32(s_12_13);

            /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
            k0 = vmull_u8(x_00_01, v_frac_h1); /* k0 := x0 * (1 - frac) */
            k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */

            k1 = vmull_u8(x_02_03, v_frac_h1);
            k1 = vmlal_u8(k1, x_12_13, v_frac_h0);

            // k0 now contains 2 interpolated pixels { j0, j1 }
            // k1 now contains 2 interpolated pixels { j2, j3 }

            l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
            l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w_0);
            l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w_0);

            l1 = vshll_n_u16(vget_low_u16(k1), PRECISION);
            l1 = vmlsl_n_u16(l1, vget_low_u16(k1), frac_w_1);
            l1 = vmlal_n_u16(l1, vget_high_u16(k1), frac_w_1);

            // Shift and narrow

            d0 = vcombine_u16(
                /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
                /* uint16x4_t */ vshrn_n_u32(l1, 2 * PRECISION));

            // Narrow again
            e0 = vmovn_u16(d0);

            // Store 2 pixels
            vst1_u32(dst, CAST_uint32x2_t e0);
            dst += 2;
        }

        // Last point
        if (middle & 0x1) {
            // NOTE(review): unlike the SSE path, fp_sum_w is not advanced here;
            // this is the last horizontal sample of the row and fp_sum_w is
            // reset by BILINEAR___HEIGHT at the next row, so it is harmless.
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_NEON(s_00_01, s_10_11, frac_w, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        while (right_pad_w--) {
            // Clamp to the last valid 2-pixel pair in the row
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_NEON(s_00_01, s_10_11, FRAC_ONE, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}
#endif

/* Bilinear stretch entry point (rects already validated, surfaces locked).
   Tries NEON, then SSE2, then falls back to the scalar implementation.
   Only called for 4-bytes-per-pixel formats (enforced in SDL_StretchSurface). */
bool SDL_StretchSurfaceUncheckedLinear(SDL_Surface *s, const SDL_Rect *srcrect, SDL_Surface *d, const SDL_Rect *dstrect)
{
    bool result = false;
    int src_w = srcrect->w;
    int src_h = srcrect->h;
    int dst_w = dstrect->w;
    int dst_h = dstrect->h;
    int src_pitch = s->pitch;
    int dst_pitch = d->pitch;
    Uint32 *src = (Uint32 *)((Uint8 *)s->pixels + srcrect->x * 4 + srcrect->y * src_pitch);
    Uint32 *dst = (Uint32 *)((Uint8 *)d->pixels + dstrect->x * 4 + dstrect->y * dst_pitch);

#ifdef SDL_NEON_INTRINSICS
    if (!result && hasNEON()) {
        result = scale_mat_NEON(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }
#endif

#ifdef SDL_SSE2_INTRINSICS
    if (!result && hasSSE2()) {
        result = scale_mat_SSE(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }
#endif

    if (!result) {
        result = scale_mat(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }

    return result;
}

/* Shared prologue for the nearest-neighbor scalers: 16.16 fixed-point
   increments, starting at the half-step so each destination pixel samples
   the center of its source footprint. Expects 'bpp' in scope. */
#define SDL_SCALE_NEAREST__START          \
    int i;                                \
    Uint64 posy, incy;                    \
    Uint64 posx, incx;                    \
    Uint64 srcy, srcx;                    \
    int dst_gap, n;                       \
    const Uint32 *src_h0;                 \
    incy = ((Uint64)src_h << 16) / dst_h; \
    incx = ((Uint64)src_w << 16) / dst_w; \
    dst_gap = dst_pitch - bpp * dst_w;    \
    posy = incy / 2;

/* Per-row setup: selects the source row for this destination row and resets
   the horizontal position and pixel counter. */
#define SDL_SCALE_NEAREST__HEIGHT                                         \
    srcy = (posy >> 16);                                                  \
    src_h0 = (const Uint32 *)((const Uint8 *)src_ptr + srcy * src_pitch); \
    posy += incy;                                                         \
    posx = incx / 2;                                                      \
    n = dst_w;

// Nearest-neighbor scaler for 1-byte-per-pixel surfaces
static bool scale_mat_nearest_1(const Uint32 *src_ptr, int src_w, int src_h, int
src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    Uint32 bpp = 1;
    SDL_SCALE_NEAREST__START
    for (i = 0; i < dst_h; i++) {
        SDL_SCALE_NEAREST__HEIGHT
        while (n--) {
            const Uint8 *src;
            srcx = bpp * (posx >> 16);
            posx += incx;
            src = (const Uint8 *)src_h0 + srcx;
            *(Uint8 *)dst = *src;
            dst = (Uint32 *)((Uint8 *)dst + bpp);
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

// Nearest-neighbor scaler for 2-bytes-per-pixel surfaces
static bool scale_mat_nearest_2(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    Uint32 bpp = 2;
    SDL_SCALE_NEAREST__START
    for (i = 0; i < dst_h; i++) {
        SDL_SCALE_NEAREST__HEIGHT
        while (n--) {
            const Uint16 *src;
            srcx = bpp * (posx >> 16);
            posx += incx;
            src = (const Uint16 *)((const Uint8 *)src_h0 + srcx);
            *(Uint16 *)dst = *src;
            dst = (Uint32 *)((Uint8 *)dst + bpp);
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

// Nearest-neighbor scaler for 3-bytes-per-pixel surfaces (byte-wise copy)
static bool scale_mat_nearest_3(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    Uint32 bpp = 3;
    SDL_SCALE_NEAREST__START
    for (i = 0; i < dst_h; i++) {
        SDL_SCALE_NEAREST__HEIGHT
        while (n--) {
            const Uint8 *src;
            srcx = bpp * (posx >> 16);
            posx += incx;
            src = (const Uint8 *)src_h0 + srcx;
            ((Uint8 *)dst)[0] = src[0];
            ((Uint8 *)dst)[1] = src[1];
            ((Uint8 *)dst)[2] = src[2];
            dst = (Uint32 *)((Uint8 *)dst + bpp);
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

// Nearest-neighbor scaler for 4-bytes-per-pixel surfaces
static bool scale_mat_nearest_4(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    Uint32 bpp = 4;
    SDL_SCALE_NEAREST__START
    for (i = 0; i < dst_h; i++) {
        SDL_SCALE_NEAREST__HEIGHT
        while (n--) {
            const Uint32 *src;
            srcx = bpp * (posx >> 16);
            posx += incx;
            src = (const Uint32 *)((const Uint8 *)src_h0 + srcx);
            *dst = *src;
            dst = (Uint32 *)((Uint8 *)dst + bpp);
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

/* Nearest-neighbor stretch entry point (rects already validated, surfaces
   locked). Dispatches on the destination's bytes-per-pixel. */
bool SDL_StretchSurfaceUncheckedNearest(SDL_Surface *s, const SDL_Rect *srcrect, SDL_Surface *d, const SDL_Rect *dstrect)
{
    int src_w = srcrect->w;
    int src_h = srcrect->h;
    int dst_w = dstrect->w;
    int dst_h = dstrect->h;
    int src_pitch = s->pitch;
    int dst_pitch = d->pitch;
    int bpp = SDL_BYTESPERPIXEL(d->format);

    Uint32 *src = (Uint32 *)((Uint8 *)s->pixels + srcrect->x * bpp + srcrect->y * src_pitch);
    Uint32 *dst = (Uint32 *)((Uint8 *)d->pixels + dstrect->x * bpp + dstrect->y * dst_pitch);

    if (bpp == 4) {
        return scale_mat_nearest_4(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    } else if (bpp == 3) {
        return scale_mat_nearest_3(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    } else if (bpp == 2) {
        return scale_mat_nearest_2(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    } else {
        return scale_mat_nearest_1(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }
}