Atlas - SDL_audiotypecvt.c

Home / ext / SDL2 / src / audio Lines: 1 | Size: 54798 bytes [Download] [Show on GitHub] [Search similar files] [Raw] [Raw (proxy)]
[FILE BEGIN]
1/* 2 Simple DirectMedia Layer 3 Copyright (C) 1997-2018 Sam Lantinga <[email protected]> 4 5 This software is provided 'as-is', without any express or implied 6 warranty. In no event will the authors be held liable for any damages 7 arising from the use of this software. 8 9 Permission is granted to anyone to use this software for any purpose, 10 including commercial applications, and to alter it and redistribute it 11 freely, subject to the following restrictions: 12 13 1. The origin of this software must not be misrepresented; you must not 14 claim that you wrote the original software. If you use this software 15 in a product, an acknowledgment in the product documentation would be 16 appreciated but is not required. 17 2. Altered source versions must be plainly marked as such, and must not be 18 misrepresented as being the original software. 19 3. This notice may not be removed or altered from any source distribution. 20*/ 21 22#include "../SDL_internal.h" 23#include "SDL_audio.h" 24#include "SDL_audio_c.h" 25#include "SDL_cpuinfo.h" 26#include "SDL_assert.h" 27 28#ifdef __ARM_NEON__ 29#define HAVE_NEON_INTRINSICS 1 30#endif 31 32#ifdef __SSE2__ 33#define HAVE_SSE2_INTRINSICS 1 34#endif 35 36#if defined(__x86_64__) && HAVE_SSE2_INTRINSICS 37#define NEED_SCALAR_CONVERTER_FALLBACKS 0 /* x86_64 guarantees SSE2. */ 38#elif __MACOSX__ && HAVE_SSE2_INTRINSICS 39#define NEED_SCALAR_CONVERTER_FALLBACKS 0 /* Mac OS X/Intel guarantees SSE2. */ 40#elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && HAVE_NEON_INTRINSICS 41#define NEED_SCALAR_CONVERTER_FALLBACKS 0 /* ARMv8+ promise NEON. */ 42#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && HAVE_NEON_INTRINSICS 43#define NEED_SCALAR_CONVERTER_FALLBACKS 0 /* All Apple ARMv7 chips promise NEON support. */ 44#endif 45 46/* Set to zero if platform is guaranteed to use a SIMD codepath here. 
*/ 47#ifndef NEED_SCALAR_CONVERTER_FALLBACKS 48#define NEED_SCALAR_CONVERTER_FALLBACKS 1 49#endif 50 51/* Function pointers set to a CPU-specific implementation. */ 52SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL; 53SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL; 54SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL; 55SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL; 56SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL; 57SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL; 58SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL; 59SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL; 60SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL; 61SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL; 62 63 64#define DIVBY128 0.0078125f 65#define DIVBY32768 0.000030517578125f 66#define DIVBY8388607 0.00000011920930376163766f 67 68 69#if NEED_SCALAR_CONVERTER_FALLBACKS 70static void SDLCALL 71SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 72{ 73 const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1; 74 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1; 75 int i; 76 77 LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32"); 78 79 for (i = cvt->len_cvt; i; --i, --src, --dst) { 80 *dst = ((float) *src) * DIVBY128; 81 } 82 83 cvt->len_cvt *= 4; 84 if (cvt->filters[++cvt->filter_index]) { 85 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 86 } 87} 88 89static void SDLCALL 90SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 91{ 92 const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1; 93 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1; 94 int i; 95 96 LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32"); 97 98 for (i = cvt->len_cvt; i; --i, --src, --dst) { 99 *dst = (((float) *src) * DIVBY128) - 1.0f; 100 } 101 102 cvt->len_cvt *= 4; 103 if (cvt->filters[++cvt->filter_index]) { 104 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 105 } 106} 107 108static void SDLCALL 109SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 110{ 111 
const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1; 112 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1; 113 int i; 114 115 LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32"); 116 117 for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) { 118 *dst = ((float) *src) * DIVBY32768; 119 } 120 121 cvt->len_cvt *= 2; 122 if (cvt->filters[++cvt->filter_index]) { 123 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 124 } 125} 126 127static void SDLCALL 128SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 129{ 130 const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1; 131 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1; 132 int i; 133 134 LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32"); 135 136 for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) { 137 *dst = (((float) *src) * DIVBY32768) - 1.0f; 138 } 139 140 cvt->len_cvt *= 2; 141 if (cvt->filters[++cvt->filter_index]) { 142 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 143 } 144} 145 146static void SDLCALL 147SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 148{ 149 const Sint32 *src = (const Sint32 *) cvt->buf; 150 float *dst = (float *) cvt->buf; 151 int i; 152 153 LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32"); 154 155 for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) { 156 *dst = ((float) (*src>>8)) * DIVBY8388607; 157 } 158 159 if (cvt->filters[++cvt->filter_index]) { 160 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 161 } 162} 163 164static void SDLCALL 165SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 166{ 167 const float *src = (const float *) cvt->buf; 168 Sint8 *dst = (Sint8 *) cvt->buf; 169 int i; 170 171 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8"); 172 173 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) { 174 const float sample = *src; 175 if (sample >= 1.0f) { 176 *dst = 127; 177 } else if (sample <= -1.0f) { 178 *dst = -128; 179 } else { 
180 *dst = (Sint8)(sample * 127.0f); 181 } 182 } 183 184 cvt->len_cvt /= 4; 185 if (cvt->filters[++cvt->filter_index]) { 186 cvt->filters[cvt->filter_index](cvt, AUDIO_S8); 187 } 188} 189 190static void SDLCALL 191SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 192{ 193 const float *src = (const float *) cvt->buf; 194 Uint8 *dst = (Uint8 *) cvt->buf; 195 int i; 196 197 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8"); 198 199 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) { 200 const float sample = *src; 201 if (sample >= 1.0f) { 202 *dst = 255; 203 } else if (sample <= -1.0f) { 204 *dst = 0; 205 } else { 206 *dst = (Uint8)((sample + 1.0f) * 127.0f); 207 } 208 } 209 210 cvt->len_cvt /= 4; 211 if (cvt->filters[++cvt->filter_index]) { 212 cvt->filters[cvt->filter_index](cvt, AUDIO_U8); 213 } 214} 215 216static void SDLCALL 217SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 218{ 219 const float *src = (const float *) cvt->buf; 220 Sint16 *dst = (Sint16 *) cvt->buf; 221 int i; 222 223 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16"); 224 225 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) { 226 const float sample = *src; 227 if (sample >= 1.0f) { 228 *dst = 32767; 229 } else if (sample <= -1.0f) { 230 *dst = -32768; 231 } else { 232 *dst = (Sint16)(sample * 32767.0f); 233 } 234 } 235 236 cvt->len_cvt /= 2; 237 if (cvt->filters[++cvt->filter_index]) { 238 cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS); 239 } 240} 241 242static void SDLCALL 243SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 244{ 245 const float *src = (const float *) cvt->buf; 246 Uint16 *dst = (Uint16 *) cvt->buf; 247 int i; 248 249 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16"); 250 251 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) { 252 const float sample = *src; 253 if (sample >= 1.0f) { 254 *dst = 65535; 255 } else if (sample <= -1.0f) { 256 *dst = 0; 257 } else { 258 *dst = (Uint16)((sample 
+ 1.0f) * 32767.0f); 259 } 260 } 261 262 cvt->len_cvt /= 2; 263 if (cvt->filters[++cvt->filter_index]) { 264 cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS); 265 } 266} 267 268static void SDLCALL 269SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format) 270{ 271 const float *src = (const float *) cvt->buf; 272 Sint32 *dst = (Sint32 *) cvt->buf; 273 int i; 274 275 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32"); 276 277 for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) { 278 const float sample = *src; 279 if (sample >= 1.0f) { 280 *dst = 2147483647; 281 } else if (sample <= -1.0f) { 282 *dst = (Sint32) -2147483648LL; 283 } else { 284 *dst = ((Sint32)(sample * 8388607.0f)) << 8; 285 } 286 } 287 288 if (cvt->filters[++cvt->filter_index]) { 289 cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS); 290 } 291} 292#endif 293 294 295#if HAVE_SSE2_INTRINSICS 296static void SDLCALL 297SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 298{ 299 const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1; 300 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1; 301 int i; 302 303 LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)"); 304 305 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ 306 for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) { 307 *dst = ((float) *src) * DIVBY128; 308 } 309 310 src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */ 311 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 312 313 /* Make sure src is aligned too. */ 314 if ((((size_t) src) & 15) == 0) { 315 /* Aligned! Do SSE blocks as long as we have 16 bytes available. 
*/ 316 const __m128i *mmsrc = (const __m128i *) src; 317 const __m128i zero = _mm_setzero_si128(); 318 const __m128 divby128 = _mm_set1_ps(DIVBY128); 319 while (i >= 16) { /* 16 * 8-bit */ 320 const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 sint8 into an XMM register. */ 321 /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */ 322 const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8); 323 /* right-shift-sign-extend gets us sint16 with the other set of values. */ 324 const __m128i shorts2 = _mm_srai_epi16(bytes, 8); 325 /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */ 326 const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128); 327 const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128); 328 const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128); 329 const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128); 330 /* Interleave back into correct order, store. */ 331 _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2)); 332 _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2)); 333 _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4)); 334 _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4)); 335 i -= 16; mmsrc--; dst -= 16; 336 } 337 338 src = (const Sint8 *) mmsrc; 339 } 340 341 src += 15; dst += 15; /* adjust for any scalar finishing. */ 342 343 /* Finish off any leftovers with scalar operations. 
*/ 344 while (i) { 345 *dst = ((float) *src) * DIVBY128; 346 i--; src--; dst--; 347 } 348 349 cvt->len_cvt *= 4; 350 if (cvt->filters[++cvt->filter_index]) { 351 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 352 } 353} 354 355static void SDLCALL 356SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 357{ 358 const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1; 359 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1; 360 int i; 361 362 LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)"); 363 364 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ 365 for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) { 366 *dst = (((float) *src) * DIVBY128) - 1.0f; 367 } 368 369 src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */ 370 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 371 372 /* Make sure src is aligned too. */ 373 if ((((size_t) src) & 15) == 0) { 374 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ 375 const __m128i *mmsrc = (const __m128i *) src; 376 const __m128i zero = _mm_setzero_si128(); 377 const __m128 divby128 = _mm_set1_ps(DIVBY128); 378 const __m128 minus1 = _mm_set1_ps(-1.0f); 379 while (i >= 16) { /* 16 * 8-bit */ 380 const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 uint8 into an XMM register. */ 381 /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */ 382 const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8); 383 /* right-shift-zero-extend gets us uint16 with the other set of values. */ 384 const __m128i shorts2 = _mm_srli_epi16(bytes, 8); 385 /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */ 386 /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. 
*/ 387 const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1); 388 const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1); 389 const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1); 390 const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1); 391 /* Interleave back into correct order, store. */ 392 _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2)); 393 _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2)); 394 _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4)); 395 _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4)); 396 i -= 16; mmsrc--; dst -= 16; 397 } 398 399 src = (const Uint8 *) mmsrc; 400 } 401 402 src += 15; dst += 15; /* adjust for any scalar finishing. */ 403 404 /* Finish off any leftovers with scalar operations. */ 405 while (i) { 406 *dst = (((float) *src) * DIVBY128) - 1.0f; 407 i--; src--; dst--; 408 } 409 410 cvt->len_cvt *= 4; 411 if (cvt->filters[++cvt->filter_index]) { 412 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 413 } 414} 415 416static void SDLCALL 417SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 418{ 419 const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1; 420 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1; 421 int i; 422 423 LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)"); 424 425 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ 426 for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) { 427 *dst = ((float) *src) * DIVBY32768; 428 } 429 430 src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */ 431 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 432 433 /* Make sure src is aligned too. 
*/ 434 if ((((size_t) src) & 15) == 0) { 435 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ 436 const __m128 divby32768 = _mm_set1_ps(DIVBY32768); 437 while (i >= 8) { /* 8 * 16-bit */ 438 const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */ 439 /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */ 440 const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16); 441 /* right-shift-sign-extend gets us sint32 with the other set of values. */ 442 const __m128i b = _mm_srai_epi32(ints, 16); 443 /* Interleave these back into the right order, convert to float, multiply, store. */ 444 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768)); 445 _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768)); 446 i -= 8; src -= 8; dst -= 8; 447 } 448 } 449 450 src += 7; dst += 7; /* adjust for any scalar finishing. */ 451 452 /* Finish off any leftovers with scalar operations. */ 453 while (i) { 454 *dst = ((float) *src) * DIVBY32768; 455 i--; src--; dst--; 456 } 457 458 cvt->len_cvt *= 2; 459 if (cvt->filters[++cvt->filter_index]) { 460 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 461 } 462} 463 464static void SDLCALL 465SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 466{ 467 const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1; 468 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1; 469 int i; 470 471 LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)"); 472 473 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ 474 for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) { 475 *dst = (((float) *src) * DIVBY32768) - 1.0f; 476 } 477 478 src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. 
*/ 479 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 480 481 /* Make sure src is aligned too. */ 482 if ((((size_t) src) & 15) == 0) { 483 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ 484 const __m128 divby32768 = _mm_set1_ps(DIVBY32768); 485 const __m128 minus1 = _mm_set1_ps(1.0f); 486 while (i >= 8) { /* 8 * 16-bit */ 487 const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */ 488 /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */ 489 const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16); 490 /* right-shift-sign-extend gets us sint32 with the other set of values. */ 491 const __m128i b = _mm_srli_epi32(ints, 16); 492 /* Interleave these back into the right order, convert to float, multiply, store. */ 493 _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768), minus1)); 494 _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768), minus1)); 495 i -= 8; src -= 8; dst -= 8; 496 } 497 } 498 499 src += 7; dst += 7; /* adjust for any scalar finishing. */ 500 501 /* Finish off any leftovers with scalar operations. 
*/ 502 while (i) { 503 *dst = (((float) *src) * DIVBY32768) - 1.0f; 504 i--; src--; dst--; 505 } 506 507 cvt->len_cvt *= 2; 508 if (cvt->filters[++cvt->filter_index]) { 509 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 510 } 511} 512 513static void SDLCALL 514SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 515{ 516 const Sint32 *src = (const Sint32 *) cvt->buf; 517 float *dst = (float *) cvt->buf; 518 int i; 519 520 LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)"); 521 522 /* Get dst aligned to 16 bytes */ 523 for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 524 *dst = ((float) (*src>>8)) * DIVBY8388607; 525 } 526 527 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 528 SDL_assert(!i || ((((size_t) src) & 15) == 0)); 529 530 { 531 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ 532 const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607); 533 const __m128i *mmsrc = (const __m128i *) src; 534 while (i >= 4) { /* 4 * sint32 */ 535 /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */ 536 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607)); 537 i -= 4; mmsrc++; dst += 4; 538 } 539 src = (const Sint32 *) mmsrc; 540 } 541 542 /* Finish off any leftovers with scalar operations. 
*/ 543 while (i) { 544 *dst = ((float) (*src>>8)) * DIVBY8388607; 545 i--; src++; dst++; 546 } 547 548 if (cvt->filters[++cvt->filter_index]) { 549 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 550 } 551} 552 553static void SDLCALL 554SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 555{ 556 const float *src = (const float *) cvt->buf; 557 Sint8 *dst = (Sint8 *) cvt->buf; 558 int i; 559 560 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)"); 561 562 /* Get dst aligned to 16 bytes */ 563 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 564 const float sample = *src; 565 if (sample >= 1.0f) { 566 *dst = 127; 567 } else if (sample <= -1.0f) { 568 *dst = -128; 569 } else { 570 *dst = (Sint8)(sample * 127.0f); 571 } 572 } 573 574 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 575 576 /* Make sure src is aligned too. */ 577 if ((((size_t) src) & 15) == 0) { 578 /* Aligned! Do SSE blocks as long as we have 16 bytes available. 
*/ 579 const __m128 one = _mm_set1_ps(1.0f); 580 const __m128 negone = _mm_set1_ps(-1.0f); 581 const __m128 mulby127 = _mm_set1_ps(127.0f); 582 __m128i *mmdst = (__m128i *) dst; 583 while (i >= 16) { /* 16 * float32 */ 584 const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 585 const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 586 const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 587 const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 588 _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4))); /* pack down, store out. */ 589 i -= 16; src += 16; mmdst++; 590 } 591 dst = (Sint8 *) mmdst; 592 } 593 594 /* Finish off any leftovers with scalar operations. 
*/ 595 while (i) { 596 const float sample = *src; 597 if (sample >= 1.0f) { 598 *dst = 127; 599 } else if (sample <= -1.0f) { 600 *dst = -128; 601 } else { 602 *dst = (Sint8)(sample * 127.0f); 603 } 604 i--; src++; dst++; 605 } 606 607 cvt->len_cvt /= 4; 608 if (cvt->filters[++cvt->filter_index]) { 609 cvt->filters[cvt->filter_index](cvt, AUDIO_S8); 610 } 611} 612 613static void SDLCALL 614SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 615{ 616 const float *src = (const float *) cvt->buf; 617 Uint8 *dst = (Uint8 *) cvt->buf; 618 int i; 619 620 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)"); 621 622 /* Get dst aligned to 16 bytes */ 623 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 624 const float sample = *src; 625 if (sample >= 1.0f) { 626 *dst = 255; 627 } else if (sample <= -1.0f) { 628 *dst = 0; 629 } else { 630 *dst = (Uint8)((sample + 1.0f) * 127.0f); 631 } 632 } 633 634 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 635 636 /* Make sure src is aligned too. */ 637 if ((((size_t) src) & 15) == 0) { 638 /* Aligned! Do SSE blocks as long as we have 16 bytes available. 
*/ 639 const __m128 one = _mm_set1_ps(1.0f); 640 const __m128 negone = _mm_set1_ps(-1.0f); 641 const __m128 mulby127 = _mm_set1_ps(127.0f); 642 __m128i *mmdst = (__m128i *) dst; 643 while (i >= 16) { /* 16 * float32 */ 644 const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 645 const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 646 const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 647 const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 648 _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4))); /* pack down, store out. */ 649 i -= 16; src += 16; mmdst++; 650 } 651 dst = (Uint8 *) mmdst; 652 } 653 654 /* Finish off any leftovers with scalar operations. 
*/ 655 while (i) { 656 const float sample = *src; 657 if (sample >= 1.0f) { 658 *dst = 255; 659 } else if (sample <= -1.0f) { 660 *dst = 0; 661 } else { 662 *dst = (Uint8)((sample + 1.0f) * 127.0f); 663 } 664 i--; src++; dst++; 665 } 666 667 cvt->len_cvt /= 4; 668 if (cvt->filters[++cvt->filter_index]) { 669 cvt->filters[cvt->filter_index](cvt, AUDIO_U8); 670 } 671} 672 673static void SDLCALL 674SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 675{ 676 const float *src = (const float *) cvt->buf; 677 Sint16 *dst = (Sint16 *) cvt->buf; 678 int i; 679 680 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)"); 681 682 /* Get dst aligned to 16 bytes */ 683 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 684 const float sample = *src; 685 if (sample >= 1.0f) { 686 *dst = 32767; 687 } else if (sample <= -1.0f) { 688 *dst = -32768; 689 } else { 690 *dst = (Sint16)(sample * 32767.0f); 691 } 692 } 693 694 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 695 696 /* Make sure src is aligned too. */ 697 if ((((size_t) src) & 15) == 0) { 698 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ 699 const __m128 one = _mm_set1_ps(1.0f); 700 const __m128 negone = _mm_set1_ps(-1.0f); 701 const __m128 mulby32767 = _mm_set1_ps(32767.0f); 702 __m128i *mmdst = (__m128i *) dst; 703 while (i >= 8) { /* 8 * float32 */ 704 const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */ 705 const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */ 706 _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2)); /* pack to sint16, store out. */ 707 i -= 8; src += 8; mmdst++; 708 } 709 dst = (Sint16 *) mmdst; 710 } 711 712 /* Finish off any leftovers with scalar operations. 
*/ 713 while (i) { 714 const float sample = *src; 715 if (sample >= 1.0f) { 716 *dst = 32767; 717 } else if (sample <= -1.0f) { 718 *dst = -32768; 719 } else { 720 *dst = (Sint16)(sample * 32767.0f); 721 } 722 i--; src++; dst++; 723 } 724 725 cvt->len_cvt /= 2; 726 if (cvt->filters[++cvt->filter_index]) { 727 cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS); 728 } 729} 730 731static void SDLCALL 732SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 733{ 734 const float *src = (const float *) cvt->buf; 735 Uint16 *dst = (Uint16 *) cvt->buf; 736 int i; 737 738 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)"); 739 740 /* Get dst aligned to 16 bytes */ 741 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 742 const float sample = *src; 743 if (sample >= 1.0f) { 744 *dst = 65535; 745 } else if (sample <= -1.0f) { 746 *dst = 0; 747 } else { 748 *dst = (Uint16)((sample + 1.0f) * 32767.0f); 749 } 750 } 751 752 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 753 754 /* Make sure src is aligned too. */ 755 if ((((size_t) src) & 15) == 0) { 756 /* Aligned! Do SSE blocks as long as we have 16 bytes available. */ 757 /* This calculates differently than the scalar path because SSE2 can't 758 pack int32 data down to unsigned int16. _mm_packs_epi32 does signed 759 saturation, so that would corrupt our data. _mm_packus_epi32 exists, 760 but not before SSE 4.1. So we convert from float to sint16, packing 761 that down with legit signed saturation, and then xor the top bit 762 against 1. This results in the correct unsigned 16-bit value, even 763 though it looks like dark magic. 
*/ 764 const __m128 mulby32767 = _mm_set1_ps(32767.0f); 765 const __m128i topbit = _mm_set1_epi16(-32768); 766 const __m128 one = _mm_set1_ps(1.0f); 767 const __m128 negone = _mm_set1_ps(-1.0f); 768 __m128i *mmdst = (__m128i *) dst; 769 while (i >= 8) { /* 8 * float32 */ 770 const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */ 771 const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */ 772 _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit)); /* pack to sint16, xor top bit, store out. */ 773 i -= 8; src += 8; mmdst++; 774 } 775 dst = (Uint16 *) mmdst; 776 } 777 778 /* Finish off any leftovers with scalar operations. */ 779 while (i) { 780 const float sample = *src; 781 if (sample >= 1.0f) { 782 *dst = 65535; 783 } else if (sample <= -1.0f) { 784 *dst = 0; 785 } else { 786 *dst = (Uint16)((sample + 1.0f) * 32767.0f); 787 } 788 i--; src++; dst++; 789 } 790 791 cvt->len_cvt /= 2; 792 if (cvt->filters[++cvt->filter_index]) { 793 cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS); 794 } 795} 796 797static void SDLCALL 798SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format) 799{ 800 const float *src = (const float *) cvt->buf; 801 Sint32 *dst = (Sint32 *) cvt->buf; 802 int i; 803 804 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)"); 805 806 /* Get dst aligned to 16 bytes */ 807 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 808 const float sample = *src; 809 if (sample >= 1.0f) { 810 *dst = 2147483647; 811 } else if (sample <= -1.0f) { 812 *dst = (Sint32) -2147483648LL; 813 } else { 814 *dst = ((Sint32)(sample * 8388607.0f)) << 8; 815 } 816 } 817 818 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 819 SDL_assert(!i || ((((size_t) src) & 15) == 0)); 820 821 { 822 
/* Aligned! Do SSE blocks as long as we have 16 bytes available. */ 823 const __m128 one = _mm_set1_ps(1.0f); 824 const __m128 negone = _mm_set1_ps(-1.0f); 825 const __m128 mulby8388607 = _mm_set1_ps(8388607.0f); 826 __m128i *mmdst = (__m128i *) dst; 827 while (i >= 4) { /* 4 * float32 */ 828 _mm_store_si128(mmdst, _mm_slli_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby8388607)), 8)); /* load 4 floats, clamp, convert to sint32 */ 829 i -= 4; src += 4; mmdst++; 830 } 831 dst = (Sint32 *) mmdst; 832 } 833 834 /* Finish off any leftovers with scalar operations. */ 835 while (i) { 836 const float sample = *src; 837 if (sample >= 1.0f) { 838 *dst = 2147483647; 839 } else if (sample <= -1.0f) { 840 *dst = (Sint32) -2147483648LL; 841 } else { 842 *dst = ((Sint32)(sample * 8388607.0f)) << 8; 843 } 844 i--; src++; dst++; 845 } 846 847 if (cvt->filters[++cvt->filter_index]) { 848 cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS); 849 } 850} 851#endif 852 853 854#if HAVE_NEON_INTRINSICS 855static void SDLCALL 856SDL_Convert_S8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 857{ 858 const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1; 859 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1; 860 int i; 861 862 LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using NEON)"); 863 864 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ 865 for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) { 866 *dst = ((float) *src) * DIVBY128; 867 } 868 869 src -= 15; dst -= 15; /* adjust to read NEON blocks from the start. */ 870 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 871 872 /* Make sure src is aligned too. */ 873 if ((((size_t) src) & 15) == 0) { 874 /* Aligned! Do NEON blocks as long as we have 16 bytes available. 
*/ 875 const int8_t *mmsrc = (const int8_t *) src; 876 const float32x4_t divby128 = vdupq_n_f32(DIVBY128); 877 while (i >= 16) { /* 16 * 8-bit */ 878 const int8x16_t bytes = vld1q_s8(mmsrc); /* get 16 sint8 into a NEON register. */ 879 const int16x8_t int16hi = vmovl_s8(vget_high_s8(bytes)); /* convert top 8 bytes to 8 int16 */ 880 const int16x8_t int16lo = vmovl_s8(vget_low_s8(bytes)); /* convert bottom 8 bytes to 8 int16 */ 881 /* split int16 to two int32, then convert to float, then multiply to normalize, store. */ 882 vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16hi))), divby128)); 883 vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16hi))), divby128)); 884 vst1q_f32(dst+8, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16lo))), divby128)); 885 vst1q_f32(dst+12, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16lo))), divby128)); 886 i -= 16; mmsrc -= 16; dst -= 16; 887 } 888 889 src = (const Sint8 *) mmsrc; 890 } 891 892 src += 15; dst += 15; /* adjust for any scalar finishing. */ 893 894 /* Finish off any leftovers with scalar operations. */ 895 while (i) { 896 *dst = ((float) *src) * DIVBY128; 897 i--; src--; dst--; 898 } 899 900 cvt->len_cvt *= 4; 901 if (cvt->filters[++cvt->filter_index]) { 902 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 903 } 904} 905 906static void SDLCALL 907SDL_Convert_U8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 908{ 909 const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1; 910 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1; 911 int i; 912 913 LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using NEON)"); 914 915 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ 916 for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) { 917 *dst = (((float) *src) * DIVBY128) - 1.0f; 918 } 919 920 src -= 15; dst -= 15; /* adjust to read NEON blocks from the start. 
*/ 921 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 922 923 /* Make sure src is aligned too. */ 924 if ((((size_t) src) & 15) == 0) { 925 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */ 926 const uint8_t *mmsrc = (const uint8_t *) src; 927 const float32x4_t divby128 = vdupq_n_f32(DIVBY128); 928 const float32x4_t one = vdupq_n_f32(1.0f); 929 while (i >= 16) { /* 16 * 8-bit */ 930 const uint8x16_t bytes = vld1q_u8(mmsrc); /* get 16 uint8 into a NEON register. */ 931 const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes)); /* convert top 8 bytes to 8 uint16 */ 932 const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes)); /* convert bottom 8 bytes to 8 uint16 */ 933 /* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */ 934 vst1q_f32(dst, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128, one)); 935 vst1q_f32(dst+4, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128, one)); 936 vst1q_f32(dst+8, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128, one)); 937 vst1q_f32(dst+12, vmlsq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128, one)); 938 i -= 16; mmsrc -= 16; dst -= 16; 939 } 940 941 src = (const Uint8 *) mmsrc; 942 } 943 944 src += 15; dst += 15; /* adjust for any scalar finishing. */ 945 946 /* Finish off any leftovers with scalar operations. 
*/ 947 while (i) { 948 *dst = (((float) *src) * DIVBY128) - 1.0f; 949 i--; src--; dst--; 950 } 951 952 cvt->len_cvt *= 4; 953 if (cvt->filters[++cvt->filter_index]) { 954 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 955 } 956} 957 958static void SDLCALL 959SDL_Convert_S16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 960{ 961 const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1; 962 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1; 963 int i; 964 965 LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using NEON)"); 966 967 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ 968 for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) { 969 *dst = ((float) *src) * DIVBY32768; 970 } 971 972 src -= 7; dst -= 7; /* adjust to read NEON blocks from the start. */ 973 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 974 975 /* Make sure src is aligned too. */ 976 if ((((size_t) src) & 15) == 0) { 977 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */ 978 const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768); 979 while (i >= 8) { /* 8 * 16-bit */ 980 const int16x8_t ints = vld1q_s16((int16_t const *) src); /* get 8 sint16 into a NEON register. */ 981 /* split int16 to two int32, then convert to float, then multiply to normalize, store. */ 982 vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768)); 983 vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768)); 984 i -= 8; src -= 8; dst -= 8; 985 } 986 } 987 988 src += 7; dst += 7; /* adjust for any scalar finishing. */ 989 990 /* Finish off any leftovers with scalar operations. 
*/ 991 while (i) { 992 *dst = ((float) *src) * DIVBY32768; 993 i--; src--; dst--; 994 } 995 996 cvt->len_cvt *= 2; 997 if (cvt->filters[++cvt->filter_index]) { 998 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 999 } 1000} 1001 1002static void SDLCALL 1003SDL_Convert_U16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 1004{ 1005 const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1; 1006 float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1; 1007 int i; 1008 1009 LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using NEON)"); 1010 1011 /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ 1012 for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) { 1013 *dst = (((float) *src) * DIVBY32768) - 1.0f; 1014 } 1015 1016 src -= 7; dst -= 7; /* adjust to read NEON blocks from the start. */ 1017 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 1018 1019 /* Make sure src is aligned too. */ 1020 if ((((size_t) src) & 15) == 0) { 1021 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */ 1022 const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768); 1023 const float32x4_t one = vdupq_n_f32(1.0f); 1024 while (i >= 8) { /* 8 * 16-bit */ 1025 const uint16x8_t uints = vld1q_u16((uint16_t const *) src); /* get 8 uint16 into a NEON register. */ 1026 /* split uint16 to two int32, then convert to float, then multiply to normalize, subtract for sign, store. */ 1027 vst1q_f32(dst, vmlsq_f32(one, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uints))), divby32768)); 1028 vst1q_f32(dst+4, vmlsq_f32(one, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uints))), divby32768)); 1029 i -= 8; src -= 8; dst -= 8; 1030 } 1031 } 1032 1033 src += 7; dst += 7; /* adjust for any scalar finishing. */ 1034 1035 /* Finish off any leftovers with scalar operations. 
*/ 1036 while (i) { 1037 *dst = (((float) *src) * DIVBY32768) - 1.0f; 1038 i--; src--; dst--; 1039 } 1040 1041 cvt->len_cvt *= 2; 1042 if (cvt->filters[++cvt->filter_index]) { 1043 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 1044 } 1045} 1046 1047static void SDLCALL 1048SDL_Convert_S32_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 1049{ 1050 const Sint32 *src = (const Sint32 *) cvt->buf; 1051 float *dst = (float *) cvt->buf; 1052 int i; 1053 1054 LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using NEON)"); 1055 1056 /* Get dst aligned to 16 bytes */ 1057 for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 1058 *dst = ((float) (*src>>8)) * DIVBY8388607; 1059 } 1060 1061 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 1062 SDL_assert(!i || ((((size_t) src) & 15) == 0)); 1063 1064 { 1065 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */ 1066 const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607); 1067 const int32_t *mmsrc = (const int32_t *) src; 1068 while (i >= 4) { /* 4 * sint32 */ 1069 /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */ 1070 vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607)); 1071 i -= 4; mmsrc += 4; dst += 4; 1072 } 1073 src = (const Sint32 *) mmsrc; 1074 } 1075 1076 /* Finish off any leftovers with scalar operations. 
*/ 1077 while (i) { 1078 *dst = ((float) (*src>>8)) * DIVBY8388607; 1079 i--; src++; dst++; 1080 } 1081 1082 if (cvt->filters[++cvt->filter_index]) { 1083 cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS); 1084 } 1085} 1086 1087static void SDLCALL 1088SDL_Convert_F32_to_S8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 1089{ 1090 const float *src = (const float *) cvt->buf; 1091 Sint8 *dst = (Sint8 *) cvt->buf; 1092 int i; 1093 1094 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using NEON)"); 1095 1096 /* Get dst aligned to 16 bytes */ 1097 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 1098 const float sample = *src; 1099 if (sample >= 1.0f) { 1100 *dst = 127; 1101 } else if (sample <= -1.0f) { 1102 *dst = -128; 1103 } else { 1104 *dst = (Sint8)(sample * 127.0f); 1105 } 1106 } 1107 1108 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 1109 1110 /* Make sure src is aligned too. */ 1111 if ((((size_t) src) & 15) == 0) { 1112 /* Aligned! Do NEON blocks as long as we have 16 bytes available. 
*/ 1113 const float32x4_t one = vdupq_n_f32(1.0f); 1114 const float32x4_t negone = vdupq_n_f32(-1.0f); 1115 const float32x4_t mulby127 = vdupq_n_f32(127.0f); 1116 int8_t *mmdst = (int8_t *) dst; 1117 while (i >= 16) { /* 16 * float32 */ 1118 const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 1119 const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 1120 const int32x4_t ints3 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+8)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 1121 const int32x4_t ints4 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+12)), one), mulby127)); /* load 4 floats, clamp, convert to sint32 */ 1122 const int8x8_t i8lo = vmovn_s16(vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2))); /* narrow to sint16, combine, narrow to sint8 */ 1123 const int8x8_t i8hi = vmovn_s16(vcombine_s16(vmovn_s32(ints3), vmovn_s32(ints4))); /* narrow to sint16, combine, narrow to sint8 */ 1124 vst1q_s8(mmdst, vcombine_s8(i8lo, i8hi)); /* combine to int8x16_t, store out */ 1125 i -= 16; src += 16; mmdst += 16; 1126 } 1127 dst = (Sint8 *) mmdst; 1128 } 1129 1130 /* Finish off any leftovers with scalar operations. 
*/ 1131 while (i) { 1132 const float sample = *src; 1133 if (sample >= 1.0f) { 1134 *dst = 127; 1135 } else if (sample <= -1.0f) { 1136 *dst = -128; 1137 } else { 1138 *dst = (Sint8)(sample * 127.0f); 1139 } 1140 i--; src++; dst++; 1141 } 1142 1143 cvt->len_cvt /= 4; 1144 if (cvt->filters[++cvt->filter_index]) { 1145 cvt->filters[cvt->filter_index](cvt, AUDIO_S8); 1146 } 1147} 1148 1149static void SDLCALL 1150SDL_Convert_F32_to_U8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 1151{ 1152 const float *src = (const float *) cvt->buf; 1153 Uint8 *dst = (Uint8 *) cvt->buf; 1154 int i; 1155 1156 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using NEON)"); 1157 1158 /* Get dst aligned to 16 bytes */ 1159 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 1160 const float sample = *src; 1161 if (sample >= 1.0f) { 1162 *dst = 255; 1163 } else if (sample <= -1.0f) { 1164 *dst = 0; 1165 } else { 1166 *dst = (Uint8)((sample + 1.0f) * 127.0f); 1167 } 1168 } 1169 1170 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 1171 1172 /* Make sure src is aligned too. */ 1173 if ((((size_t) src) & 15) == 0) { 1174 /* Aligned! Do NEON blocks as long as we have 16 bytes available. 
*/ 1175 const float32x4_t one = vdupq_n_f32(1.0f); 1176 const float32x4_t negone = vdupq_n_f32(-1.0f); 1177 const float32x4_t mulby127 = vdupq_n_f32(127.0f); 1178 uint8_t *mmdst = (uint8_t *) dst; 1179 while (i >= 16) { /* 16 * float32 */ 1180 const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby127)); /* load 4 floats, clamp, convert to uint32 */ 1181 const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), one), mulby127)); /* load 4 floats, clamp, convert to uint32 */ 1182 const uint32x4_t uints3 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+8)), one), one), mulby127)); /* load 4 floats, clamp, convert to uint32 */ 1183 const uint32x4_t uints4 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+12)), one), one), mulby127)); /* load 4 floats, clamp, convert to uint32 */ 1184 const uint8x8_t ui8lo = vmovn_u16(vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2))); /* narrow to uint16, combine, narrow to uint8 */ 1185 const uint8x8_t ui8hi = vmovn_u16(vcombine_u16(vmovn_u32(uints3), vmovn_u32(uints4))); /* narrow to uint16, combine, narrow to uint8 */ 1186 vst1q_u8(mmdst, vcombine_u8(ui8lo, ui8hi)); /* combine to uint8x16_t, store out */ 1187 i -= 16; src += 16; mmdst += 16; 1188 } 1189 1190 dst = (Uint8 *) mmdst; 1191 } 1192 1193 /* Finish off any leftovers with scalar operations. 
*/ 1194 while (i) { 1195 const float sample = *src; 1196 if (sample >= 1.0f) { 1197 *dst = 255; 1198 } else if (sample <= -1.0f) { 1199 *dst = 0; 1200 } else { 1201 *dst = (Uint8)((sample + 1.0f) * 127.0f); 1202 } 1203 i--; src++; dst++; 1204 } 1205 1206 cvt->len_cvt /= 4; 1207 if (cvt->filters[++cvt->filter_index]) { 1208 cvt->filters[cvt->filter_index](cvt, AUDIO_U8); 1209 } 1210} 1211 1212static void SDLCALL 1213SDL_Convert_F32_to_S16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 1214{ 1215 const float *src = (const float *) cvt->buf; 1216 Sint16 *dst = (Sint16 *) cvt->buf; 1217 int i; 1218 1219 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using NEON)"); 1220 1221 /* Get dst aligned to 16 bytes */ 1222 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 1223 const float sample = *src; 1224 if (sample >= 1.0f) { 1225 *dst = 32767; 1226 } else if (sample <= -1.0f) { 1227 *dst = -32768; 1228 } else { 1229 *dst = (Sint16)(sample * 32767.0f); 1230 } 1231 } 1232 1233 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 1234 1235 /* Make sure src is aligned too. */ 1236 if ((((size_t) src) & 15) == 0) { 1237 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */ 1238 const float32x4_t one = vdupq_n_f32(1.0f); 1239 const float32x4_t negone = vdupq_n_f32(-1.0f); 1240 const float32x4_t mulby32767 = vdupq_n_f32(32767.0f); 1241 int16_t *mmdst = (int16_t *) dst; 1242 while (i >= 8) { /* 8 * float32 */ 1243 const int32x4_t ints1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */ 1244 const int32x4_t ints2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), mulby32767)); /* load 4 floats, clamp, convert to sint32 */ 1245 vst1q_s16(mmdst, vcombine_s16(vmovn_s32(ints1), vmovn_s32(ints2))); /* narrow to sint16, combine, store out. 
*/ 1246 i -= 8; src += 8; mmdst += 8; 1247 } 1248 dst = (Sint16 *) mmdst; 1249 } 1250 1251 /* Finish off any leftovers with scalar operations. */ 1252 while (i) { 1253 const float sample = *src; 1254 if (sample >= 1.0f) { 1255 *dst = 32767; 1256 } else if (sample <= -1.0f) { 1257 *dst = -32768; 1258 } else { 1259 *dst = (Sint16)(sample * 32767.0f); 1260 } 1261 i--; src++; dst++; 1262 } 1263 1264 cvt->len_cvt /= 2; 1265 if (cvt->filters[++cvt->filter_index]) { 1266 cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS); 1267 } 1268} 1269 1270static void SDLCALL 1271SDL_Convert_F32_to_U16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 1272{ 1273 const float *src = (const float *) cvt->buf; 1274 Uint16 *dst = (Uint16 *) cvt->buf; 1275 int i; 1276 1277 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using NEON)"); 1278 1279 /* Get dst aligned to 16 bytes */ 1280 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 1281 const float sample = *src; 1282 if (sample >= 1.0f) { 1283 *dst = 65535; 1284 } else if (sample <= -1.0f) { 1285 *dst = 0; 1286 } else { 1287 *dst = (Uint16)((sample + 1.0f) * 32767.0f); 1288 } 1289 } 1290 1291 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 1292 1293 /* Make sure src is aligned too. */ 1294 if ((((size_t) src) & 15) == 0) { 1295 /* Aligned! Do NEON blocks as long as we have 16 bytes available. 
*/ 1296 const float32x4_t one = vdupq_n_f32(1.0f); 1297 const float32x4_t negone = vdupq_n_f32(-1.0f); 1298 const float32x4_t mulby32767 = vdupq_n_f32(32767.0f); 1299 uint16_t *mmdst = (uint16_t *) dst; 1300 while (i >= 8) { /* 8 * float32 */ 1301 const uint32x4_t uints1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), one), mulby32767)); /* load 4 floats, clamp, convert to uint32 */ 1302 const uint32x4_t uints2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src+4)), one), one), mulby32767)); /* load 4 floats, clamp, convert to uint32 */ 1303 vst1q_u16(mmdst, vcombine_u16(vmovn_u32(uints1), vmovn_u32(uints2))); /* narrow to uint16, combine, store out. */ 1304 i -= 8; src += 8; mmdst += 8; 1305 } 1306 dst = (Uint16 *) mmdst; 1307 } 1308 1309 /* Finish off any leftovers with scalar operations. */ 1310 while (i) { 1311 const float sample = *src; 1312 if (sample >= 1.0f) { 1313 *dst = 65535; 1314 } else if (sample <= -1.0f) { 1315 *dst = 0; 1316 } else { 1317 *dst = (Uint16)((sample + 1.0f) * 32767.0f); 1318 } 1319 i--; src++; dst++; 1320 } 1321 1322 cvt->len_cvt /= 2; 1323 if (cvt->filters[++cvt->filter_index]) { 1324 cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS); 1325 } 1326} 1327 1328static void SDLCALL 1329SDL_Convert_F32_to_S32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format) 1330{ 1331 const float *src = (const float *) cvt->buf; 1332 Sint32 *dst = (Sint32 *) cvt->buf; 1333 int i; 1334 1335 LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using NEON)"); 1336 1337 /* Get dst aligned to 16 bytes */ 1338 for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) { 1339 const float sample = *src; 1340 if (sample >= 1.0f) { 1341 *dst = 2147483647; 1342 } else if (sample <= -1.0f) { 1343 *dst = -2147483648; 1344 } else { 1345 *dst = ((Sint32)(sample * 8388607.0f)) << 8; 1346 } 1347 } 1348 1349 SDL_assert(!i || ((((size_t) dst) & 15) == 0)); 1350 SDL_assert(!i || ((((size_t) src) 
& 15) == 0)); 1351 1352 { 1353 /* Aligned! Do NEON blocks as long as we have 16 bytes available. */ 1354 const float32x4_t one = vdupq_n_f32(1.0f); 1355 const float32x4_t negone = vdupq_n_f32(-1.0f); 1356 const float32x4_t mulby8388607 = vdupq_n_f32(8388607.0f); 1357 int32_t *mmdst = (int32_t *) dst; 1358 while (i >= 4) { /* 4 * float32 */ 1359 vst1q_s32(mmdst, vshlq_n_s32(vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby8388607)), 8)); 1360 i -= 4; src += 4; mmdst += 4; 1361 } 1362 dst = (Sint32 *) mmdst; 1363 } 1364 1365 /* Finish off any leftovers with scalar operations. */ 1366 while (i) { 1367 const float sample = *src; 1368 if (sample >= 1.0f) { 1369 *dst = 2147483647; 1370 } else if (sample <= -1.0f) { 1371 *dst = -2147483648; 1372 } else { 1373 *dst = ((Sint32)(sample * 8388607.0f)) << 8; 1374 } 1375 i--; src++; dst++; 1376 } 1377 1378 if (cvt->filters[++cvt->filter_index]) { 1379 cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS); 1380 } 1381} 1382#endif 1383 1384 1385 1386void SDL_ChooseAudioConverters(void) 1387{ 1388 static SDL_bool converters_chosen = SDL_FALSE; 1389 1390 if (converters_chosen) { 1391 return; 1392 } 1393 1394#define SET_CONVERTER_FUNCS(fntype) \ 1395 SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \ 1396 SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \ 1397 SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \ 1398 SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \ 1399 SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \ 1400 SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \ 1401 SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \ 1402 SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \ 1403 SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \ 1404 SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \ 1405 converters_chosen = SDL_TRUE 1406 1407#if HAVE_SSE2_INTRINSICS 1408 if (SDL_HasSSE2()) { 1409 SET_CONVERTER_FUNCS(SSE2); 1410 
return; 1411 } 1412#endif 1413 1414#if HAVE_NEON_INTRINSICS 1415 if (SDL_HasNEON()) { 1416 SET_CONVERTER_FUNCS(NEON); 1417 return; 1418 } 1419#endif 1420 1421#if NEED_SCALAR_CONVERTER_FALLBACKS 1422 SET_CONVERTER_FUNCS(Scalar); 1423#endif 1424 1425#undef SET_CONVERTER_FUNCS 1426 1427 SDL_assert(converters_chosen == SDL_TRUE); 1428} 1429 1430/* vi: set ts=4 sw=4 expandtab: */ 1431
[FILE END]
(C) 2025 0x4248 (C) 2025 4248 Media and 4248 Systems, all part of 0x4248. See LICENCE files for more information. Not all files are by 0x4248; always check licensing.