Atlas - SDL_stretch.c
/*
  Simple DirectMedia Layer
  Copyright (C) 1997-2025 Sam Lantinga <[email protected]>

  This software is provided 'as-is', without any express or implied
  warranty. In no event will the authors be held liable for any damages
  arising from the use of this software.

  Permission is granted to anyone to use this software for any purpose,
  including commercial applications, and to alter it and redistribute it
  freely, subject to the following restrictions:

  1. The origin of this software must not be misrepresented; you must not
     claim that you wrote the original software. If you use this software
     in a product, an acknowledgment in the product documentation would be
     appreciated but is not required.
  2. Altered source versions must be plainly marked as such, and must not be
     misrepresented as being the original software.
  3. This notice may not be removed or altered from any source distribution.
*/
#include "SDL_internal.h"

#include "SDL_surface_c.h"

// Low-level scalers; "Unchecked" means srcrect/dstrect are assumed validated and the surfaces locked.
static bool SDL_StretchSurfaceUncheckedNearest(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect);
static bool SDL_StretchSurfaceUncheckedLinear(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect);

/* Scale the srcrect area of src into the dstrect area of dst using scaleMode.
 * - Differing pixel formats are handled by converting src first (slow path).
 * - FOURCC formats are scaled via a temporary XRGB8888 surface (slow path).
 * - SDL_SCALEMODE_PIXELART is mapped to NEAREST here.
 * - LINEAR is restricted to 4-bytes-per-pixel formats (and not ARGB2101010).
 * Returns true on success, false with an SDL error set otherwise. */
bool SDL_StretchSurface(SDL_Surface *src, const SDL_Rect *srcrect, SDL_Surface *dst, const SDL_Rect *dstrect, SDL_ScaleMode scaleMode)
{
    bool result;
    int src_locked;
    int dst_locked;
    SDL_Rect full_src;
    SDL_Rect full_dst;

    CHECK_PARAM(!src) {
        return SDL_InvalidParamError("src");
    }
    CHECK_PARAM(!dst) {
        return SDL_InvalidParamError("dst");
    }

    if (src->format != dst->format) {
        // Slow!  Convert the source to the destination's format, then recurse.
        SDL_Surface *src_tmp = SDL_ConvertSurfaceAndColorspace(src, dst->format, dst->palette, dst->colorspace, dst->props);
        if (!src_tmp) {
            return false;
        }
        result = SDL_StretchSurface(src_tmp, srcrect, dst, dstrect, scaleMode);
        SDL_DestroySurface(src_tmp);
        return result;
    }

    if (SDL_ISPIXELFORMAT_FOURCC(src->format)) {
        // Slow!  Scale through a temporary XRGB8888 surface, then convert back.
        if (!dstrect) {
            full_dst.x = 0;
            full_dst.y = 0;
            full_dst.w = dst->w;
            full_dst.h = dst->h;
            dstrect = &full_dst;
        }

        SDL_Surface *src_tmp = SDL_ConvertSurface(src, SDL_PIXELFORMAT_XRGB8888);
        SDL_Surface *dst_tmp = SDL_CreateSurface(dstrect->w, dstrect->h, SDL_PIXELFORMAT_XRGB8888);
        if (src_tmp && dst_tmp) {
            result = SDL_StretchSurface(src_tmp, srcrect, dst_tmp, NULL, scaleMode);
            if (result) {
                result = SDL_ConvertPixelsAndColorspace(dstrect->w, dstrect->h,
                                                        dst_tmp->format, SDL_COLORSPACE_SRGB, 0,
                                                        dst_tmp->pixels, dst_tmp->pitch,
                                                        dst->format, dst->colorspace, SDL_GetSurfaceProperties(dst),
                                                        (Uint8 *)dst->pixels + dstrect->y * dst->pitch + dstrect->x * SDL_BYTESPERPIXEL(dst->format), dst->pitch);
            }
        } else {
            result = false;
        }
        SDL_DestroySurface(src_tmp);
        SDL_DestroySurface(dst_tmp);
        return result;
    }

    switch (scaleMode) {
    case SDL_SCALEMODE_NEAREST:
        break;
    case SDL_SCALEMODE_LINEAR:
        break;
    case SDL_SCALEMODE_PIXELART:
        // Pixel-art scaling is implemented as nearest-neighbor at this level
        scaleMode = SDL_SCALEMODE_NEAREST;
        break;
    default:
        return SDL_InvalidParamError("scaleMode");
    }

    if (scaleMode == SDL_SCALEMODE_LINEAR) {
        // The linear scalers below operate on 4 independent 8-bit channels
        if (SDL_BYTESPERPIXEL(src->format) != 4 || src->format == SDL_PIXELFORMAT_ARGB2101010) {
            return SDL_SetError("Wrong format");
        }
    }

    // Verify the blit rectangles
    if (srcrect) {
        if ((srcrect->x < 0) || (srcrect->y < 0) ||
            ((srcrect->x + srcrect->w) > src->w) ||
            ((srcrect->y + srcrect->h) > src->h)) {
            return SDL_SetError("Invalid source blit rectangle");
        }
    } else {
        full_src.x = 0;
        full_src.y = 0;
        full_src.w = src->w;
        full_src.h = src->h;
        srcrect = &full_src;
    }
    if (dstrect) {
        if ((dstrect->x < 0) || (dstrect->y < 0) ||
            ((dstrect->x + dstrect->w) > dst->w) ||
            ((dstrect->y + dstrect->h) > dst->h)) {
            return SDL_SetError("Invalid destination blit rectangle");
        }
    } else {
        full_dst.x = 0;
        full_dst.y = 0;
        full_dst.w = dst->w;
        full_dst.h = dst->h;
        dstrect = &full_dst;
    }

    // Nothing to draw into
    if (dstrect->w <= 0 || dstrect->h <= 0) {
        return true;
    }

    // The scalers use 16.16 fixed point, so dimensions must fit in 16 bits
    if (srcrect->w > SDL_MAX_UINT16 || srcrect->h > SDL_MAX_UINT16 ||
        dstrect->w > SDL_MAX_UINT16 || dstrect->h > SDL_MAX_UINT16) {
        return SDL_SetError("Size too large for scaling");
    }

    // Lock the destination if it's in hardware
    dst_locked = 0;
    if (SDL_MUSTLOCK(dst)) {
        if (!SDL_LockSurface(dst)) {
            return SDL_SetError("Unable to lock destination surface");
        }
        dst_locked = 1;
    }
    // Lock the source if it's in hardware
    src_locked = 0;
    if (SDL_MUSTLOCK(src)) {
        if (!SDL_LockSurface(src)) {
            if (dst_locked) {
                SDL_UnlockSurface(dst);
            }
            return SDL_SetError("Unable to lock source surface");
        }
        src_locked = 1;
    }

    if (scaleMode == SDL_SCALEMODE_NEAREST) {
        result = SDL_StretchSurfaceUncheckedNearest(src, srcrect, dst, dstrect);
    } else {
        result = SDL_StretchSurfaceUncheckedLinear(src, srcrect, dst, dstrect);
    }

    // We need to unlock the surfaces if they're locked
    if (dst_locked) {
        SDL_UnlockSurface(dst);
    }
    if (src_locked) {
        SDL_UnlockSurface(src);
    }

    return result;
}

/* bilinear interpolation precision must be < 8
   Because with SSE: add-multiply: _mm_madd_epi16 works with signed int
   so pixels 0xb1......
   are negatives and false the result
   same in NEON probably */
#define PRECISION 7

// 16.16 fixed-point helpers used by the bilinear scalers
#define FIXED_POINT(i) ((Uint32)(i) << 16)
#define SRC_INDEX(fp) ((Uint32)(fp) >> 16)
#define INTEGER(fp) ((Uint32)(fp) >> PRECISION)
#define FRAC(fp) ((Uint32)((fp) >> (16 - PRECISION)) & ((1 << PRECISION) - 1))
#define FRAC_ZERO 0
#define FRAC_ONE (1 << PRECISION)
#define FP_ONE FIXED_POINT(1)

/* Shared prologue for the bilinear scalers: computes the vertical and
   horizontal fixed-point start/step values and the left/right edge padding
   counts, and derives the per-row constants. Declares the loop state used
   by BILINEAR___HEIGHT below. */
#define BILINEAR___START                                                                      \
    int i;                                                                                    \
    Sint64 fp_sum_h;                                                                          \
    int fp_step_h, left_pad_h, right_pad_h;                                                   \
    Sint64 fp_sum_w;                                                                          \
    int fp_step_w, left_pad_w, right_pad_w;                                                   \
    Sint64 fp_sum_w_init;                                                                     \
    int left_pad_w_init, right_pad_w_init, dst_gap, middle_init;                              \
    get_scaler_datas(src_h, dst_h, &fp_sum_h, &fp_step_h, &left_pad_h, &right_pad_h);         \
    get_scaler_datas(src_w, dst_w, &fp_sum_w, &fp_step_w, &left_pad_w, &right_pad_w);         \
    fp_sum_w_init = fp_sum_w + left_pad_w * fp_step_w;                                        \
    left_pad_w_init = left_pad_w;                                                             \
    right_pad_w_init = right_pad_w;                                                           \
    dst_gap = dst_pitch - 4 * dst_w;                                                          \
    middle_init = dst_w - left_pad_w - right_pad_w;

/* Per-row setup: picks the two source rows (src_h0/src_h1) and vertical
   fractions for destination row i, clamping to the first/last source row in
   the padded regions, then resets the horizontal accumulator and counters. */
#define BILINEAR___HEIGHT                                                     \
    int index_h, frac_h0, frac_h1, middle;                                    \
    const Uint32 *src_h0, *src_h1;                                            \
    int no_padding;                                                           \
    Uint64 incr_h0, incr_h1;                                                  \
                                                                              \
    no_padding = !(i < left_pad_h || i > dst_h - 1 - right_pad_h);            \
    index_h = SRC_INDEX(fp_sum_h);                                            \
    frac_h0 = FRAC(fp_sum_h);                                                 \
                                                                              \
    index_h = no_padding ? index_h : (i < left_pad_h ? 0 : src_h - 1);        \
    frac_h0 = no_padding ? frac_h0 : 0;                                       \
    incr_h1 = no_padding ? src_pitch : 0;                                     \
    incr_h0 = (Uint64)index_h * src_pitch;                                    \
                                                                              \
    src_h0 = (const Uint32 *)((const Uint8 *)src + incr_h0);                  \
    src_h1 = (const Uint32 *)((const Uint8 *)src_h0 + incr_h1);               \
                                                                              \
    fp_sum_h += fp_step_h;                                                    \
                                                                              \
    frac_h1 = FRAC_ONE - frac_h0;                                             \
    fp_sum_w = fp_sum_w_init;                                                 \
    right_pad_w = right_pad_w_init;                                           \
    left_pad_w = left_pad_w_init;                                             \
    middle = middle_init;

#ifdef __clang__
// Remove inlining of this function
// Compiler crash with clang 9.0.8 / android-ndk-r21d
// Compiler crash with clang 11.0.3 / Xcode
// OK with clang 11.0.5 / android-ndk-22
// OK with clang 12.0.0 / Xcode
__attribute__((noinline))
#endif
/* Compute the 16.16 fixed-point sampling start (*fp_start) and step
   (*fp_step) for mapping dst_nb destination pixels onto src_nb source
   pixels, plus the number of destination pixels at the left/right edges
   (*left_pad / *right_pad) that would sample outside [0, src_nb-2] and so
   must be clamped by the caller. */
static void get_scaler_datas(int src_nb, int dst_nb, Sint64 *fp_start, int *fp_step, int *left_pad, int *right_pad)
{

    int step = FIXED_POINT(src_nb) / (dst_nb); // source step in fixed point
    int x0 = FP_ONE / 2;                       // dst first pixel center at 0.5 in fixed point
    Sint64 fp_sum;
    int i;
#if 0
    // scale to source coordinates
    x0 *= src_nb;
    x0 /= dst_nb; // x0 == step / 2
#else
    // Use this code for perfect match with pixman
    Sint64 tmp[2];
    tmp[0] = (Sint64)step * (x0 >> 16);
    tmp[1] = (Sint64)step * (x0 & 0xFFFF);
    x0 = (int)(tmp[0] + ((tmp[1] + 0x8000) >> 16)); // x0 == (step + 1) / 2
#endif
    // -= 0.5, get back the pixel origin, in source coordinates
    x0 -= FP_ONE / 2;

    *fp_start = x0;
    *fp_step = step;
    *left_pad = 0;
    *right_pad = 0;

    // Walk the whole row once to count how many pixels fall off each edge
    fp_sum = x0;
    for (i = 0; i < dst_nb; i++) {
        if (fp_sum < 0) {
            *left_pad += 1;
        } else {
            int index = SRC_INDEX(fp_sum);
            if (index > src_nb - 2) {
                *right_pad += 1;
            }
        }
        fp_sum += step;
    }
    // SDL_Log("%d -> %d x0=%d step=%d left_pad=%d right_pad=%d", src_nb, dst_nb, *fp_start, *fp_step, *left_pad, *right_pad);
}

// One 32-bit pixel viewed as 4 independent 8-bit channels (channel order agnostic)
typedef struct color_t
{
    Uint8 a;
    Uint8 b;
    Uint8 c;
    Uint8 d;
} color_t;

#if 0
// Debug helper: dump 8 bytes (disabled)
static void printf_64(const char *str, void *var)
{
    uint8_t *val = (uint8_t *)var;
    printf(" * %s: %02x %02x %02x %02x _ %02x %02x %02x %02x\n",
           str, val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
}
#endif

/* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */

/* Linearly interpolate the 4 channels of two pixels.
   frac0/frac1 are complementary PRECISION-bit weights (frac1 == FRAC_ONE - frac0). */
static SDL_INLINE void INTERPOL(const Uint32 *src_x0, const Uint32 *src_x1, int frac0, int frac1, Uint32 *dst)
{
    const color_t *c0 = (const color_t *)src_x0;
    const color_t *c1 = (const color_t *)src_x1;
    color_t *cx = (color_t *)dst;
#if 0
    cx->a = c0->a + INTEGER(frac0 * (c1->a - c0->a));
    cx->b = c0->b + INTEGER(frac0 * (c1->b - c0->b));
    cx->c = c0->c + INTEGER(frac0 * (c1->c - c0->c));
    cx->d = c0->d + INTEGER(frac0 * (c1->d - c0->d));
#else
    cx->a = (Uint8)INTEGER(frac1 * c0->a + frac0 * c1->a);
    cx->b = (Uint8)INTEGER(frac1 * c0->b + frac0 * c1->b);
    cx->c = (Uint8)INTEGER(frac1 * c0->c + frac0 * c1->c);
    cx->d = (Uint8)INTEGER(frac1 * c0->d + frac0 * c1->d);
#endif
}

/* Bilinear sample from the 2x2 block {s0[0], s0[1]} / {s1[0], s1[1]}:
   interpolate vertically first, then horizontally with frac_w0. */
static SDL_INLINE void INTERPOL_BILINEAR(const Uint32 *s0, const Uint32 *s1, int frac_w0, int frac_h0, int frac_h1, Uint32 *dst)
{
    Uint32 tmp[2];
    unsigned int frac_w1 = FRAC_ONE - frac_w0;

    // Vertical first, store to 'tmp'
    INTERPOL(s0, s1, frac_h0, frac_h1, tmp);
    INTERPOL(s0 + 1, s1 + 1, frac_h0, frac_h1, tmp + 1);

    // Horizontal, store to 'dst'
    INTERPOL(tmp, tmp + 1, frac_w0, frac_w1, dst);
}

/* Scalar (no SIMD) bilinear scaler for 4-bytes-per-pixel surfaces.
   Edge columns/rows counted by left_pad/right_pad are clamped to the
   first/last valid source sample. */
static bool scale_mat(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {

        BILINEAR___HEIGHT

        while (left_pad_w--) {
            INTERPOL_BILINEAR(src_h0, src_h1, FRAC_ZERO, frac_h0, frac_h1, dst);
            dst += 1;
        }

        while (middle--) {
            const Uint32 *s_00_01;
            const Uint32 *s_10_11;
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;

            /*
               x00 ... x0_ ..... x01
               .        .         .
               .        x         .
               .        .         .
               .        .         .
               x10 ... x1_ ..... x11
            */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);

            INTERPOL_BILINEAR(s_00_01, s_10_11, frac_w, frac_h0, frac_h1, dst);

            dst += 1;
        }

        while (right_pad_w--) {
            // Clamp to the last valid 2-pixel pair in the row
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR(s_00_01, s_10_11, FRAC_ONE, frac_h0, frac_h1, dst);
            dst += 1;
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

// NEON intrinsic result casts; MSVC's NEON headers don't accept these casts
#ifdef SDL_NEON_INTRINSICS
#define CAST_uint8x8_t (uint8x8_t)
#define CAST_uint32x2_t (uint32x2_t)
#endif

#if defined(_MSC_VER)
#ifdef SDL_NEON_INTRINSICS
#undef CAST_uint8x8_t
#undef CAST_uint32x2_t
#define CAST_uint8x8_t
#define CAST_uint32x2_t
#endif
#endif

#ifdef SDL_SSE2_INTRINSICS

#if 0
// Debug helper: dump an __m128i as 8 16-bit lanes (disabled)
static void SDL_TARGETING("sse2") printf_128(const char *str, __m128i var)
{
    uint16_t *val = (uint16_t *)&var;
    printf(" * %s: %04x %04x %04x %04x _ %04x %04x %04x %04x\n",
           str, val[0], val[1], val[2], val[3], val[4], val[5], val[6], val[7]);
}
#endif

// Cached runtime check for SSE2 support
static SDL_INLINE int hasSSE2(void)
{
    static int val = -1;
    if (val != -1) {
        return val;
    }
    val = SDL_HasSSE2();
    return val;
}

/* SSE2 bilinear sample of one destination pixel from the 2x2 block at s0/s1.
   v_frac_h0/v_frac_h1 are the vertical weights broadcast to 8 16-bit lanes. */
static SDL_INLINE void SDL_TARGETING("sse2") INTERPOL_BILINEAR_SSE(const Uint32 *s0, const Uint32 *s1, int frac_w, __m128i v_frac_h0, __m128i v_frac_h1, Uint32 *dst, __m128i zero)
{
    __m128i x_00_01, x_10_11; /* Pixels in 4*uint8 in row */
    __m128i v_frac_w0, k0, l0, d0, e0;

    int f, f2;
    f = frac_w;
    f2 = FRAC_ONE - frac_w;
    v_frac_w0 = _mm_set_epi16((short)f, (short)f2, (short)f,
                              (short)f2, (short)f, (short)f2, (short)f, (short)f2);

    x_00_01 = _mm_loadl_epi64((const __m128i *)s0); // Load x00 and x01
    x_10_11 = _mm_loadl_epi64((const __m128i *)s1);

    /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */

    // Interpolation vertical
    k0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_00_01, zero), v_frac_h1);
    l0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_10_11, zero), v_frac_h0);
    k0 = _mm_add_epi16(k0, l0);

    // For perfect match, clear the fractional part eventually.
    /*
    k0 = _mm_srli_epi16(k0, PRECISION);
    k0 = _mm_slli_epi16(k0, PRECISION);
    */

    // Interpolation horizontal
    l0 = _mm_unpacklo_epi64(/* unused */ l0, k0);
    k0 = _mm_madd_epi16(_mm_unpackhi_epi16(l0, k0), v_frac_w0);

    // Store 1 pixel
    d0 = _mm_srli_epi32(k0, PRECISION * 2);
    e0 = _mm_packs_epi32(d0, d0);
    e0 = _mm_packus_epi16(e0, e0);
    *dst = _mm_cvtsi128_si32(e0);
}

/* SSE2 bilinear scaler: same layout as scale_mat() but processes the middle
   of each row two destination pixels per iteration. */
static bool SDL_TARGETING("sse2") scale_mat_SSE(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {
        int nb_block2;
        __m128i v_frac_h0;
        __m128i v_frac_h1;
        __m128i zero;

        BILINEAR___HEIGHT

        nb_block2 = middle / 2;

        v_frac_h0 = _mm_set_epi16((short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0, (short)frac_h0);
        v_frac_h1 = _mm_set_epi16((short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1, (short)frac_h1);
        zero = _mm_setzero_si128();

        while (left_pad_w--) {
            INTERPOL_BILINEAR_SSE(src_h0, src_h1, FRAC_ZERO, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }

        while (nb_block2--) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;

            const Uint32 *s_00_01, *s_02_03, *s_10_11, *s_12_13;

            __m128i x_00_01, x_10_11, x_02_03, x_12_13; /* Pixels in 4*uint8 in row */
            __m128i v_frac_w0, k0, l0, d0, e0;
            __m128i v_frac_w1, k1, l1, d1, e1;

            int f, f2;
            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;

            /*
               x00............ x01   x02...........x03
               .      .         .     .     .       .
               j0     f0        j1    j2    f1      j3
               .      .         .     .     .       .
               .      .         .     .     .       .
               .      .         .     .     .       .
               x10............ x11   x12...........x13
            */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);

            f = frac_w_0;
            f2 = FRAC_ONE - frac_w_0;
            v_frac_w0 = _mm_set_epi16((short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2);

            f = frac_w_1;
            f2 = FRAC_ONE - frac_w_1;
            v_frac_w1 = _mm_set_epi16((short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2, (short)f, (short)f2);

            x_00_01 = _mm_loadl_epi64((const __m128i *)s_00_01); // Load x00 and x01
            x_02_03 = _mm_loadl_epi64((const __m128i *)s_02_03);
            x_10_11 = _mm_loadl_epi64((const __m128i *)s_10_11);
            x_12_13 = _mm_loadl_epi64((const __m128i *)s_12_13);

            // Interpolation vertical
            k0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_00_01, zero), v_frac_h1);
            l0 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_10_11, zero), v_frac_h0);
            k0 = _mm_add_epi16(k0, l0);
            k1 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_02_03, zero), v_frac_h1);
            l1 = _mm_mullo_epi16(_mm_unpacklo_epi8(x_12_13, zero), v_frac_h0);
            k1 = _mm_add_epi16(k1, l1);

            // Interpolation horizontal
            l0 = _mm_unpacklo_epi64(/* unused */ l0, k0);
            k0 = _mm_madd_epi16(_mm_unpackhi_epi16(l0, k0), v_frac_w0);
            l1 = _mm_unpacklo_epi64(/* unused */ l1, k1);
            k1 = _mm_madd_epi16(_mm_unpackhi_epi16(l1, k1), v_frac_w1);

            // Store 1 pixel
            d0 = _mm_srli_epi32(k0, PRECISION * 2);
            e0 = _mm_packs_epi32(d0, d0);
            e0 = _mm_packus_epi16(e0, e0);
            *dst++ = _mm_cvtsi128_si32(e0);

            // Store 1 pixel
            d1 = _mm_srli_epi32(k1, PRECISION * 2);
            e1 = _mm_packs_epi32(d1, d1);
            e1 = _mm_packus_epi16(e1, e1);
            *dst++ = _mm_cvtsi128_si32(e1);
        }

        // Last point
        if (middle & 0x1) {
            const Uint32 *s_00_01;
            const Uint32 *s_10_11;
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_SSE(s_00_01, s_10_11, frac_w, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }

        while (right_pad_w--) {
            // Clamp to the last valid 2-pixel pair in the row
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_SSE(s_00_01, s_10_11, FRAC_ONE, v_frac_h0, v_frac_h1, dst, zero);
            dst += 1;
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}
#endif

#ifdef SDL_NEON_INTRINSICS

// Cached runtime check for NEON support
static SDL_INLINE int hasNEON(void)
{
    static int val = -1;
    if (val != -1) {
        return val;
    }
    val = SDL_HasNEON();
    return val;
}

/* NEON bilinear sample of one destination pixel from the 2x2 block at s0/s1.
   v_frac_h0/v_frac_h1 are the vertical weights broadcast to 8 byte lanes. */
static SDL_INLINE void INTERPOL_BILINEAR_NEON(const Uint32 *s0, const Uint32 *s1, int frac_w, uint8x8_t v_frac_h0, uint8x8_t v_frac_h1, Uint32 *dst)
{
    uint8x8_t x_00_01, x_10_11; /* Pixels in 4*uint8 in row */
    uint16x8_t k0;
    uint32x4_t l0;
    uint16x8_t d0;
    uint8x8_t e0;

    x_00_01 = CAST_uint8x8_t vld1_u32(s0); // Load 2 pixels
    x_10_11 = CAST_uint8x8_t vld1_u32(s1);

    /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
    k0 = vmull_u8(x_00_01, v_frac_h1); /* k0 :=
   x0 * (1 - frac) */
    k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */

    // k0 now contains 2 interpolated pixels { j0, j1 }
    l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
    l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w);
    l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w);

    // Shift and narrow
    d0 = vcombine_u16(
        /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
        /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION));

    // Narrow again
    e0 = vmovn_u16(d0);

    // Store 1 pixel
    *dst = vget_lane_u32(CAST_uint32x2_t e0, 0);
}

/* NEON bilinear scaler: same layout as scale_mat() but processes the middle
   of each row four destination pixels per iteration, with 2- and 1-pixel
   tail handling. */
static bool scale_mat_NEON(const Uint32 *src, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    BILINEAR___START

    for (i = 0; i < dst_h; i++) {
        int nb_block4;
        uint8x8_t v_frac_h0, v_frac_h1;

        BILINEAR___HEIGHT

        nb_block4 = middle / 4;

        v_frac_h0 = vmov_n_u8(frac_h0);
        v_frac_h1 = vmov_n_u8(frac_h1);

        while (left_pad_w--) {
            INTERPOL_BILINEAR_NEON(src_h0, src_h1, FRAC_ZERO, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        while (nb_block4--) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;
            int index_w_2, frac_w_2;
            int index_w_3, frac_w_3;

            const Uint32 *s_00_01, *s_02_03, *s_04_05, *s_06_07;
            const Uint32 *s_10_11, *s_12_13, *s_14_15, *s_16_17;

            uint8x8_t x_00_01, x_10_11, x_02_03, x_12_13; /* Pixels in 4*uint8 in row */
            uint8x8_t x_04_05, x_14_15, x_06_07, x_16_17;

            uint16x8_t k0, k1, k2, k3;
            uint32x4_t l0, l1, l2, l3;
            uint16x8_t d0, d1;
            uint8x8_t e0, e1;
            uint32x4_t f0;

            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_2 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_2 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_3 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_3 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;

            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_04_05 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_2);
            s_06_07 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_3);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);
            s_14_15 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_2);
            s_16_17 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_3);

            // Interpolation vertical
            x_00_01 = CAST_uint8x8_t vld1_u32(s_00_01); // Load 2 pixels
            x_02_03 = CAST_uint8x8_t vld1_u32(s_02_03);
            x_04_05 = CAST_uint8x8_t vld1_u32(s_04_05);
            x_06_07 = CAST_uint8x8_t vld1_u32(s_06_07);
            x_10_11 = CAST_uint8x8_t vld1_u32(s_10_11);
            x_12_13 = CAST_uint8x8_t vld1_u32(s_12_13);
            x_14_15 = CAST_uint8x8_t vld1_u32(s_14_15);
            x_16_17 = CAST_uint8x8_t vld1_u32(s_16_17);

            /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
            k0 = vmull_u8(x_00_01, v_frac_h1); /* k0 := x0 * (1 - frac) */
            k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */

            k1 = vmull_u8(x_02_03, v_frac_h1);
            k1 = vmlal_u8(k1, x_12_13, v_frac_h0);

            k2 = vmull_u8(x_04_05, v_frac_h1);
            k2 = vmlal_u8(k2, x_14_15, v_frac_h0);

            k3 = vmull_u8(x_06_07, v_frac_h1);
            k3 = vmlal_u8(k3, x_16_17, v_frac_h0);

            // k0 now contains 2 interpolated pixels { j0, j1 }
            // k1 now contains 2 interpolated pixels { j2, j3 }
            // k2 now contains 2 interpolated pixels { j4, j5 }
            // k3 now contains 2 interpolated pixels { j6, j7 }

            l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
            l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w_0);
            l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w_0);

            l1 = vshll_n_u16(vget_low_u16(k1), PRECISION);
            l1 = vmlsl_n_u16(l1, vget_low_u16(k1), frac_w_1);
            l1 = vmlal_n_u16(l1, vget_high_u16(k1), frac_w_1);

            l2 = vshll_n_u16(vget_low_u16(k2), PRECISION);
            l2 = vmlsl_n_u16(l2, vget_low_u16(k2), frac_w_2);
            l2 = vmlal_n_u16(l2, vget_high_u16(k2), frac_w_2);

            l3 = vshll_n_u16(vget_low_u16(k3), PRECISION);
            l3 = vmlsl_n_u16(l3, vget_low_u16(k3), frac_w_3);
            l3 = vmlal_n_u16(l3, vget_high_u16(k3), frac_w_3);

            // shift and narrow
            d0 = vcombine_u16(
                /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
                /* uint16x4_t */ vshrn_n_u32(l1, 2 * PRECISION));
            // narrow again
            e0 = vmovn_u16(d0);

            // Shift and narrow
            d1 = vcombine_u16(
                /* uint16x4_t */ vshrn_n_u32(l2, 2 * PRECISION),
                /* uint16x4_t */ vshrn_n_u32(l3, 2 * PRECISION));
            // Narrow again
            e1 = vmovn_u16(d1);

            f0 = vcombine_u32(CAST_uint32x2_t e0, CAST_uint32x2_t e1);
            // Store 4 pixels
            vst1q_u32(dst, f0);

            dst += 4;
        }

        if (middle & 0x2) {
            int index_w_0, frac_w_0;
            int index_w_1, frac_w_1;
            const Uint32 *s_00_01, *s_02_03;
            const Uint32 *s_10_11, *s_12_13;
            uint8x8_t x_00_01, x_10_11, x_02_03, x_12_13; /* Pixels in 4*uint8 in row */
            uint16x8_t k0, k1;
            uint32x4_t l0, l1;
            uint16x8_t d0;
            uint8x8_t e0;

            index_w_0 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_0 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;
            index_w_1 = 4 * SRC_INDEX(fp_sum_w);
            frac_w_1 = FRAC(fp_sum_w);
            fp_sum_w += fp_step_w;

            /*
               x00............ x01   x02...........x03
               .      .         .     .      .      .
               j0   dest0       j1    j2   dest1    j3
               .      .         .     .      .      .
               .      .         .     .      .      .
               .      .         .     .      .      .
               x10............ x11   x12...........x13
            */
            s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_0);
            s_02_03 = (const Uint32 *)((const Uint8 *)src_h0 + index_w_1);
            s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_0);
            s_12_13 = (const Uint32 *)((const Uint8 *)src_h1 + index_w_1);

            // Interpolation vertical
            x_00_01 = CAST_uint8x8_t vld1_u32(s_00_01); // Load 2 pixels
            x_02_03 = CAST_uint8x8_t vld1_u32(s_02_03);
            x_10_11 = CAST_uint8x8_t vld1_u32(s_10_11);
            x_12_13 = CAST_uint8x8_t vld1_u32(s_12_13);

            /* Interpolated == x0 + frac * (x1 - x0) == x0 * (1 - frac) + x1 * frac */
            k0 = vmull_u8(x_00_01, v_frac_h1); /* k0 := x0 * (1 - frac) */
            k0 = vmlal_u8(k0, x_10_11, v_frac_h0); /* k0 += x1 * frac */

            k1 = vmull_u8(x_02_03, v_frac_h1);
            k1 = vmlal_u8(k1, x_12_13, v_frac_h0);

            // k0 now contains 2 interpolated pixels { j0, j1 }
            // k1 now contains 2 interpolated pixels { j2, j3 }

            l0 = vshll_n_u16(vget_low_u16(k0), PRECISION);
            l0 = vmlsl_n_u16(l0, vget_low_u16(k0), frac_w_0);
            l0 = vmlal_n_u16(l0, vget_high_u16(k0), frac_w_0);

            l1 = vshll_n_u16(vget_low_u16(k1), PRECISION);
            l1 = vmlsl_n_u16(l1, vget_low_u16(k1), frac_w_1);
            l1 = vmlal_n_u16(l1, vget_high_u16(k1), frac_w_1);

            // Shift and narrow

            d0 = vcombine_u16(
                /* uint16x4_t */ vshrn_n_u32(l0, 2 * PRECISION),
                /* uint16x4_t */ vshrn_n_u32(l1, 2 * PRECISION));

            // Narrow again
            e0 = vmovn_u16(d0);

            // Store 2 pixels
            vst1_u32(dst, CAST_uint32x2_t e0);
            dst += 2;
        }

        // Last point
        if (middle & 0x1) {
            // NOTE(review): unlike the SSE path, fp_sum_w is not advanced here;
            // this is the last horizontal sample of the row and fp_sum_w is
            // reset by BILINEAR___HEIGHT at the next row, so it is harmless.
            int index_w = 4 * SRC_INDEX(fp_sum_w);
            int frac_w = FRAC(fp_sum_w);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_NEON(s_00_01, s_10_11, frac_w, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        while (right_pad_w--) {
            // Clamp to the last valid 2-pixel pair in the row
            int index_w = 4 * (src_w - 2);
            const Uint32 *s_00_01 = (const Uint32 *)((const Uint8 *)src_h0 + index_w);
            const Uint32 *s_10_11 = (const Uint32 *)((const Uint8 *)src_h1 + index_w);
            INTERPOL_BILINEAR_NEON(s_00_01, s_10_11, FRAC_ONE, v_frac_h0, v_frac_h1, dst);
            dst += 1;
        }

        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}
#endif

/* Bilinear stretch entry point (rects already validated, surfaces locked).
   Tries NEON, then SSE2, then falls back to the scalar implementation.
   Only called for 4-bytes-per-pixel formats (enforced in SDL_StretchSurface). */
bool SDL_StretchSurfaceUncheckedLinear(SDL_Surface *s, const SDL_Rect *srcrect, SDL_Surface *d, const SDL_Rect *dstrect)
{
    bool result = false;
    int src_w = srcrect->w;
    int src_h = srcrect->h;
    int dst_w = dstrect->w;
    int dst_h = dstrect->h;
    int src_pitch = s->pitch;
    int dst_pitch = d->pitch;
    Uint32 *src = (Uint32 *)((Uint8 *)s->pixels + srcrect->x * 4 + srcrect->y * src_pitch);
    Uint32 *dst = (Uint32 *)((Uint8 *)d->pixels + dstrect->x * 4 + dstrect->y * dst_pitch);

#ifdef SDL_NEON_INTRINSICS
    if (!result && hasNEON()) {
        result = scale_mat_NEON(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }
#endif

#ifdef SDL_SSE2_INTRINSICS
    if (!result && hasSSE2()) {
        result = scale_mat_SSE(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }
#endif

    if (!result) {
        result = scale_mat(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }

    return result;
}

/* Shared prologue for the nearest-neighbor scalers: 16.16 fixed-point
   increments, starting at the half-step so each destination pixel samples
   the center of its source footprint. Expects 'bpp' in scope. */
#define SDL_SCALE_NEAREST__START          \
    int i;                                \
    Uint64 posy, incy;                    \
    Uint64 posx, incx;                    \
    Uint64 srcy, srcx;                    \
    int dst_gap, n;                       \
    const Uint32 *src_h0;                 \
    incy = ((Uint64)src_h << 16) / dst_h; \
    incx = ((Uint64)src_w << 16) / dst_w; \
    dst_gap = dst_pitch - bpp * dst_w;    \
    posy = incy / 2;

/* Per-row setup: selects the source row for this destination row and resets
   the horizontal position and pixel counter. */
#define SDL_SCALE_NEAREST__HEIGHT                                         \
    srcy = (posy >> 16);                                                  \
    src_h0 = (const Uint32 *)((const Uint8 *)src_ptr + srcy * src_pitch); \
    posy += incy;                                                         \
    posx = incx / 2;                                                      \
    n = dst_w;

// Nearest-neighbor scaler for 1-byte-per-pixel surfaces
static bool scale_mat_nearest_1(const Uint32 *src_ptr, int src_w, int src_h, int
src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    Uint32 bpp = 1;
    SDL_SCALE_NEAREST__START
    for (i = 0; i < dst_h; i++) {
        SDL_SCALE_NEAREST__HEIGHT
        while (n--) {
            const Uint8 *src;
            srcx = bpp * (posx >> 16);
            posx += incx;
            src = (const Uint8 *)src_h0 + srcx;
            *(Uint8 *)dst = *src;
            dst = (Uint32 *)((Uint8 *)dst + bpp);
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

// Nearest-neighbor scaler for 2-bytes-per-pixel surfaces
static bool scale_mat_nearest_2(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    Uint32 bpp = 2;
    SDL_SCALE_NEAREST__START
    for (i = 0; i < dst_h; i++) {
        SDL_SCALE_NEAREST__HEIGHT
        while (n--) {
            const Uint16 *src;
            srcx = bpp * (posx >> 16);
            posx += incx;
            src = (const Uint16 *)((const Uint8 *)src_h0 + srcx);
            *(Uint16 *)dst = *src;
            dst = (Uint32 *)((Uint8 *)dst + bpp);
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

// Nearest-neighbor scaler for 3-bytes-per-pixel surfaces (byte-wise copy)
static bool scale_mat_nearest_3(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    Uint32 bpp = 3;
    SDL_SCALE_NEAREST__START
    for (i = 0; i < dst_h; i++) {
        SDL_SCALE_NEAREST__HEIGHT
        while (n--) {
            const Uint8 *src;
            srcx = bpp * (posx >> 16);
            posx += incx;
            src = (const Uint8 *)src_h0 + srcx;
            ((Uint8 *)dst)[0] = src[0];
            ((Uint8 *)dst)[1] = src[1];
            ((Uint8 *)dst)[2] = src[2];
            dst = (Uint32 *)((Uint8 *)dst + bpp);
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

// Nearest-neighbor scaler for 4-bytes-per-pixel surfaces
static bool scale_mat_nearest_4(const Uint32 *src_ptr, int src_w, int src_h, int src_pitch, Uint32 *dst, int dst_w, int dst_h, int dst_pitch)
{
    Uint32 bpp = 4;
    SDL_SCALE_NEAREST__START
    for (i = 0; i < dst_h; i++) {
        SDL_SCALE_NEAREST__HEIGHT
        while (n--) {
            const Uint32 *src;
            srcx = bpp * (posx >> 16);
            posx += incx;
            src = (const Uint32 *)((const Uint8 *)src_h0 + srcx);
            *dst = *src;
            dst = (Uint32 *)((Uint8 *)dst + bpp);
        }
        dst = (Uint32 *)((Uint8 *)dst + dst_gap);
    }
    return true;
}

/* Nearest-neighbor stretch entry point (rects already validated, surfaces
   locked). Dispatches on the destination's bytes-per-pixel. */
bool SDL_StretchSurfaceUncheckedNearest(SDL_Surface *s, const SDL_Rect *srcrect, SDL_Surface *d, const SDL_Rect *dstrect)
{
    int src_w = srcrect->w;
    int src_h = srcrect->h;
    int dst_w = dstrect->w;
    int dst_h = dstrect->h;
    int src_pitch = s->pitch;
    int dst_pitch = d->pitch;
    int bpp = SDL_BYTESPERPIXEL(d->format);

    Uint32 *src = (Uint32 *)((Uint8 *)s->pixels + srcrect->x * bpp + srcrect->y * src_pitch);
    Uint32 *dst = (Uint32 *)((Uint8 *)d->pixels + dstrect->x * bpp + dstrect->y * dst_pitch);

    if (bpp == 4) {
        return scale_mat_nearest_4(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    } else if (bpp == 3) {
        return scale_mat_nearest_3(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    } else if (bpp == 2) {
        return scale_mat_nearest_2(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    } else {
        return scale_mat_nearest_1(src, src_w, src_h, src_pitch, dst, dst_w, dst_h, dst_pitch);
    }
}