Atlas - SDL_audiotypecvt.c

Atlas - SDL_audiotypecvt.c
Home / ext / SDL2 / src / audio
Lines: 1 | Size: 54798 bytes
[Download] [Show on GitHub] [Search similar files] [Raw] [Raw (proxy)] 
[FILE BEGIN]
1/*
2  Simple DirectMedia Layer
3  Copyright (C) 1997-2018 Sam Lantinga <[email protected]>
4
5  This software is provided 'as-is', without any express or implied
6  warranty.  In no event will the authors be held liable for any damages
7  arising from the use of this software.
8
9  Permission is granted to anyone to use this software for any purpose,
10  including commercial applications, and to alter it and redistribute it
11  freely, subject to the following restrictions:
12
13  1. The origin of this software must not be misrepresented; you must not
14     claim that you wrote the original software. If you use this software
15     in a product, an acknowledgment in the product documentation would be
16     appreciated but is not required.
17  2. Altered source versions must be plainly marked as such, and must not be
18     misrepresented as being the original software.
19  3. This notice may not be removed or altered from any source distribution.
20*/
21
22#include "../SDL_internal.h"
23#include "SDL_audio.h"
24#include "SDL_audio_c.h"
25#include "SDL_cpuinfo.h"
26#include "SDL_assert.h"
27
28#ifdef __ARM_NEON__
29#define HAVE_NEON_INTRINSICS 1
30#endif
31
32#ifdef __SSE2__
33#define HAVE_SSE2_INTRINSICS 1
34#endif
35
36#if defined(__x86_64__) && HAVE_SSE2_INTRINSICS
37#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* x86_64 guarantees SSE2. */
38#elif __MACOSX__ && HAVE_SSE2_INTRINSICS
39#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* Mac OS X/Intel guarantees SSE2. */
40#elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8) && HAVE_NEON_INTRINSICS
41#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* ARMv8+ promise NEON. */
42#elif defined(__APPLE__) && defined(__ARM_ARCH) && (__ARM_ARCH >= 7) && HAVE_NEON_INTRINSICS
43#define NEED_SCALAR_CONVERTER_FALLBACKS 0  /* All Apple ARMv7 chips promise NEON support. */
44#endif
45
46/* Set to zero if platform is guaranteed to use a SIMD codepath here. */
47#ifndef NEED_SCALAR_CONVERTER_FALLBACKS
48#define NEED_SCALAR_CONVERTER_FALLBACKS 1
49#endif
50
51/* Function pointers set to a CPU-specific implementation. */
52SDL_AudioFilter SDL_Convert_S8_to_F32 = NULL;
53SDL_AudioFilter SDL_Convert_U8_to_F32 = NULL;
54SDL_AudioFilter SDL_Convert_S16_to_F32 = NULL;
55SDL_AudioFilter SDL_Convert_U16_to_F32 = NULL;
56SDL_AudioFilter SDL_Convert_S32_to_F32 = NULL;
57SDL_AudioFilter SDL_Convert_F32_to_S8 = NULL;
58SDL_AudioFilter SDL_Convert_F32_to_U8 = NULL;
59SDL_AudioFilter SDL_Convert_F32_to_S16 = NULL;
60SDL_AudioFilter SDL_Convert_F32_to_U16 = NULL;
61SDL_AudioFilter SDL_Convert_F32_to_S32 = NULL;
62
63
64#define DIVBY128 0.0078125f
65#define DIVBY32768 0.000030517578125f
66#define DIVBY8388607 0.00000011920930376163766f
67
68
69#if NEED_SCALAR_CONVERTER_FALLBACKS
70static void SDLCALL
71SDL_Convert_S8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
72{
73    const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
74    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
75    int i;
76
77    LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32");
78
79    for (i = cvt->len_cvt; i; --i, --src, --dst) {
80        *dst = ((float) *src) * DIVBY128;
81    }
82
83    cvt->len_cvt *= 4;
84    if (cvt->filters[++cvt->filter_index]) {
85        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
86    }
87}
88
89static void SDLCALL
90SDL_Convert_U8_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
91{
92    const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
93    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
94    int i;
95
96    LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32");
97
98    for (i = cvt->len_cvt; i; --i, --src, --dst) {
99        *dst = (((float) *src) * DIVBY128) - 1.0f;
100    }
101
102    cvt->len_cvt *= 4;
103    if (cvt->filters[++cvt->filter_index]) {
104        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
105    }
106}
107
108static void SDLCALL
109SDL_Convert_S16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
110{
111    const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
112    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
113    int i;
114
115    LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32");
116
117    for (i = cvt->len_cvt / sizeof (Sint16); i; --i, --src, --dst) {
118        *dst = ((float) *src) * DIVBY32768;
119    }
120
121    cvt->len_cvt *= 2;
122    if (cvt->filters[++cvt->filter_index]) {
123        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
124    }
125}
126
127static void SDLCALL
128SDL_Convert_U16_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
129{
130    const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
131    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
132    int i;
133
134    LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32");
135
136    for (i = cvt->len_cvt / sizeof (Uint16); i; --i, --src, --dst) {
137        *dst = (((float) *src) * DIVBY32768) - 1.0f;
138    }
139
140    cvt->len_cvt *= 2;
141    if (cvt->filters[++cvt->filter_index]) {
142        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
143    }
144}
145
146static void SDLCALL
147SDL_Convert_S32_to_F32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
148{
149    const Sint32 *src = (const Sint32 *) cvt->buf;
150    float *dst = (float *) cvt->buf;
151    int i;
152
153    LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32");
154
155    for (i = cvt->len_cvt / sizeof (Sint32); i; --i, ++src, ++dst) {
156        *dst = ((float) (*src>>8)) * DIVBY8388607;
157    }
158
159    if (cvt->filters[++cvt->filter_index]) {
160        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
161    }
162}
163
164static void SDLCALL
165SDL_Convert_F32_to_S8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
166{
167    const float *src = (const float *) cvt->buf;
168    Sint8 *dst = (Sint8 *) cvt->buf;
169    int i;
170
171    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8");
172
173    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
174        const float sample = *src;
175        if (sample >= 1.0f) {
176            *dst = 127;
177        } else if (sample <= -1.0f) {
178            *dst = -128;
179        } else {
180            *dst = (Sint8)(sample * 127.0f);
181        }
182    }
183
184    cvt->len_cvt /= 4;
185    if (cvt->filters[++cvt->filter_index]) {
186        cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
187    }
188}
189
190static void SDLCALL
191SDL_Convert_F32_to_U8_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
192{
193    const float *src = (const float *) cvt->buf;
194    Uint8 *dst = (Uint8 *) cvt->buf;
195    int i;
196
197    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8");
198
199    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
200        const float sample = *src;
201        if (sample >= 1.0f) {
202            *dst = 255;
203        } else if (sample <= -1.0f) {
204            *dst = 0;
205        } else {
206            *dst = (Uint8)((sample + 1.0f) * 127.0f);
207        }
208    }
209
210    cvt->len_cvt /= 4;
211    if (cvt->filters[++cvt->filter_index]) {
212        cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
213    }
214}
215
216static void SDLCALL
217SDL_Convert_F32_to_S16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
218{
219    const float *src = (const float *) cvt->buf;
220    Sint16 *dst = (Sint16 *) cvt->buf;
221    int i;
222
223    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16");
224
225    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
226        const float sample = *src;
227        if (sample >= 1.0f) {
228            *dst = 32767;
229        } else if (sample <= -1.0f) {
230            *dst = -32768;
231        } else {
232            *dst = (Sint16)(sample * 32767.0f);
233        }
234    }
235
236    cvt->len_cvt /= 2;
237    if (cvt->filters[++cvt->filter_index]) {
238        cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
239    }
240}
241
242static void SDLCALL
243SDL_Convert_F32_to_U16_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
244{
245    const float *src = (const float *) cvt->buf;
246    Uint16 *dst = (Uint16 *) cvt->buf;
247    int i;
248
249    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16");
250
251    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
252        const float sample = *src;
253        if (sample >= 1.0f) {
254            *dst = 65535;
255        } else if (sample <= -1.0f) {
256            *dst = 0;
257        } else {
258            *dst = (Uint16)((sample + 1.0f) * 32767.0f);
259        }
260    }
261
262    cvt->len_cvt /= 2;
263    if (cvt->filters[++cvt->filter_index]) {
264        cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
265    }
266}
267
268static void SDLCALL
269SDL_Convert_F32_to_S32_Scalar(SDL_AudioCVT *cvt, SDL_AudioFormat format)
270{
271    const float *src = (const float *) cvt->buf;
272    Sint32 *dst = (Sint32 *) cvt->buf;
273    int i;
274
275    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32");
276
277    for (i = cvt->len_cvt / sizeof (float); i; --i, ++src, ++dst) {
278        const float sample = *src;
279        if (sample >= 1.0f) {
280            *dst = 2147483647;
281        } else if (sample <= -1.0f) {
282            *dst = (Sint32) -2147483648LL;
283        } else {
284            *dst = ((Sint32)(sample * 8388607.0f)) << 8;
285        }
286    }
287
288    if (cvt->filters[++cvt->filter_index]) {
289        cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
290    }
291}
292#endif
293
294
295#if HAVE_SSE2_INTRINSICS
296static void SDLCALL
297SDL_Convert_S8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
298{
299    const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
300    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
301    int i;
302
303    LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using SSE2)");
304
305    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
306    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
307        *dst = ((float) *src) * DIVBY128;
308    }
309
310    src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
311    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
312
313    /* Make sure src is aligned too. */
314    if ((((size_t) src) & 15) == 0) {
315        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
316        const __m128i *mmsrc = (const __m128i *) src;
317        const __m128i zero = _mm_setzero_si128();
318        const __m128 divby128 = _mm_set1_ps(DIVBY128);
319        while (i >= 16) {   /* 16 * 8-bit */
320            const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 sint8 into an XMM register. */
321            /* treat as int16, shift left to clear every other sint16, then back right with sign-extend. Now sint16. */
322            const __m128i shorts1 = _mm_srai_epi16(_mm_slli_epi16(bytes, 8), 8);
323            /* right-shift-sign-extend gets us sint16 with the other set of values. */
324            const __m128i shorts2 = _mm_srai_epi16(bytes, 8);
325            /* unpack against zero to make these int32, shift to make them sign-extend, convert to float, multiply. Whew! */
326            const __m128 floats1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts1, zero), 16), 16)), divby128);
327            const __m128 floats2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpacklo_epi16(shorts2, zero), 16), 16)), divby128);
328            const __m128 floats3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts1, zero), 16), 16)), divby128);
329            const __m128 floats4 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_slli_epi32(_mm_unpackhi_epi16(shorts2, zero), 16), 16)), divby128);
330            /* Interleave back into correct order, store. */
331            _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
332            _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
333            _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
334            _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
335            i -= 16; mmsrc--; dst -= 16;
336        }
337
338        src = (const Sint8 *) mmsrc;
339    }
340
341    src += 15; dst += 15;  /* adjust for any scalar finishing. */
342
343    /* Finish off any leftovers with scalar operations. */
344    while (i) {
345        *dst = ((float) *src) * DIVBY128;
346        i--; src--; dst--;
347    }
348
349    cvt->len_cvt *= 4;
350    if (cvt->filters[++cvt->filter_index]) {
351        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
352    }
353}
354
355static void SDLCALL
356SDL_Convert_U8_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
357{
358    const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
359    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
360    int i;
361
362    LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using SSE2)");
363
364    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
365    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
366        *dst = (((float) *src) * DIVBY128) - 1.0f;
367    }
368
369    src -= 15; dst -= 15;  /* adjust to read SSE blocks from the start. */
370    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
371
372    /* Make sure src is aligned too. */
373    if ((((size_t) src) & 15) == 0) {
374        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
375        const __m128i *mmsrc = (const __m128i *) src;
376        const __m128i zero = _mm_setzero_si128();
377        const __m128 divby128 = _mm_set1_ps(DIVBY128);
378        const __m128 minus1 = _mm_set1_ps(-1.0f);
379        while (i >= 16) {   /* 16 * 8-bit */
380            const __m128i bytes = _mm_load_si128(mmsrc);  /* get 16 uint8 into an XMM register. */
381            /* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
382            const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
383            /* right-shift-zero-extend gets us uint16 with the other set of values. */
384            const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
385            /* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
386            /* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
387            const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
388            const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
389            const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
390            const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
391            /* Interleave back into correct order, store. */
392            _mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
393            _mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
394            _mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
395            _mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
396            i -= 16; mmsrc--; dst -= 16;
397        }
398
399        src = (const Uint8 *) mmsrc;
400    }
401
402    src += 15; dst += 15;  /* adjust for any scalar finishing. */
403
404    /* Finish off any leftovers with scalar operations. */
405    while (i) {
406        *dst = (((float) *src) * DIVBY128) - 1.0f;
407        i--; src--; dst--;
408    }
409
410    cvt->len_cvt *= 4;
411    if (cvt->filters[++cvt->filter_index]) {
412        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
413    }
414}
415
416static void SDLCALL
417SDL_Convert_S16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
418{
419    const Sint16 *src = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
420    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
421    int i;
422
423    LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using SSE2)");
424
425    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
426    for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
427        *dst = ((float) *src) * DIVBY32768;
428    }
429
430    src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
431    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
432
433    /* Make sure src is aligned too. */
434    if ((((size_t) src) & 15) == 0) {
435        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
436        const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
437        while (i >= 8) {   /* 8 * 16-bit */
438            const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
439            /* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
440            const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
441            /* right-shift-sign-extend gets us sint32 with the other set of values. */
442            const __m128i b = _mm_srai_epi32(ints, 16);
443            /* Interleave these back into the right order, convert to float, multiply, store. */
444            _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
445            _mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
446            i -= 8; src -= 8; dst -= 8;
447        }
448    }
449
450    src += 7; dst += 7;  /* adjust for any scalar finishing. */
451
452    /* Finish off any leftovers with scalar operations. */
453    while (i) {
454        *dst = ((float) *src) * DIVBY32768;
455        i--; src--; dst--;
456    }
457
458    cvt->len_cvt *= 2;
459    if (cvt->filters[++cvt->filter_index]) {
460        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
461    }
462}
463
464static void SDLCALL
465SDL_Convert_U16_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
466{
467    const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
468    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
469    int i;
470
471    LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using SSE2)");
472
473    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
474    for (i = cvt->len_cvt / sizeof (Sint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
475        *dst = (((float) *src) * DIVBY32768) - 1.0f;
476    }
477
478    src -= 7; dst -= 7;  /* adjust to read SSE blocks from the start. */
479    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
480
481    /* Make sure src is aligned too. */
482    if ((((size_t) src) & 15) == 0) {
483        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
484        const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
485        const __m128 minus1 = _mm_set1_ps(1.0f);
486        while (i >= 8) {   /* 8 * 16-bit */
487            const __m128i ints = _mm_load_si128((__m128i const *) src);  /* get 8 sint16 into an XMM register. */
488            /* treat as int32, shift left to clear every other sint16, then back right with zero-extend. Now sint32. */
489            const __m128i a = _mm_srli_epi32(_mm_slli_epi32(ints, 16), 16);
490            /* right-shift-sign-extend gets us sint32 with the other set of values. */
491            const __m128i b = _mm_srli_epi32(ints, 16);
492            /* Interleave these back into the right order, convert to float, multiply, store. */
493            _mm_store_ps(dst, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768), minus1));
494            _mm_store_ps(dst+4, _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768), minus1));
495            i -= 8; src -= 8; dst -= 8;
496        }
497    }
498
499    src += 7; dst += 7;  /* adjust for any scalar finishing. */
500
501    /* Finish off any leftovers with scalar operations. */
502    while (i) {
503        *dst = (((float) *src) * DIVBY32768) - 1.0f;
504        i--; src--; dst--;
505    }
506
507    cvt->len_cvt *= 2;
508    if (cvt->filters[++cvt->filter_index]) {
509        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
510    }
511}
512
513static void SDLCALL
514SDL_Convert_S32_to_F32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
515{
516    const Sint32 *src = (const Sint32 *) cvt->buf;
517    float *dst = (float *) cvt->buf;
518    int i;
519
520    LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using SSE2)");
521
522    /* Get dst aligned to 16 bytes */
523    for (i = cvt->len_cvt / sizeof (Sint32); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
524        *dst = ((float) (*src>>8)) * DIVBY8388607;
525    }
526
527    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
528    SDL_assert(!i || ((((size_t) src) & 15) == 0));
529
530    {
531        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
532        const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
533        const __m128i *mmsrc = (const __m128i *) src;
534        while (i >= 4) {   /* 4 * sint32 */
535            /* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
536            _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
537            i -= 4; mmsrc++; dst += 4;
538        }
539        src = (const Sint32 *) mmsrc;
540    }
541
542    /* Finish off any leftovers with scalar operations. */
543    while (i) {
544        *dst = ((float) (*src>>8)) * DIVBY8388607;
545        i--; src++; dst++;
546    }
547
548    if (cvt->filters[++cvt->filter_index]) {
549        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
550    }
551}
552
553static void SDLCALL
554SDL_Convert_F32_to_S8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
555{
556    const float *src = (const float *) cvt->buf;
557    Sint8 *dst = (Sint8 *) cvt->buf;
558    int i;
559
560    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using SSE2)");
561
562    /* Get dst aligned to 16 bytes */
563    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
564        const float sample = *src;
565        if (sample >= 1.0f) {
566            *dst = 127;
567        } else if (sample <= -1.0f) {
568            *dst = -128;
569        } else {
570            *dst = (Sint8)(sample * 127.0f);
571        }
572    }
573
574    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
575
576    /* Make sure src is aligned too. */
577    if ((((size_t) src) & 15) == 0) {
578        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
579        const __m128 one = _mm_set1_ps(1.0f);
580        const __m128 negone = _mm_set1_ps(-1.0f);
581        const __m128 mulby127 = _mm_set1_ps(127.0f);
582        __m128i *mmdst = (__m128i *) dst;
583        while (i >= 16) {   /* 16 * float32 */
584            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
585            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
586            const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
587            const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
588            _mm_store_si128(mmdst, _mm_packs_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
589            i -= 16; src += 16; mmdst++;
590        }
591        dst = (Sint8 *) mmdst;
592    }
593
594    /* Finish off any leftovers with scalar operations. */
595    while (i) {
596        const float sample = *src;
597        if (sample >= 1.0f) {
598            *dst = 127;
599        } else if (sample <= -1.0f) {
600            *dst = -128;
601        } else {
602            *dst = (Sint8)(sample * 127.0f);
603        }
604        i--; src++; dst++;
605    }
606
607    cvt->len_cvt /= 4;
608    if (cvt->filters[++cvt->filter_index]) {
609        cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
610    }
611}
612
613static void SDLCALL
614SDL_Convert_F32_to_U8_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
615{
616    const float *src = (const float *) cvt->buf;
617    Uint8 *dst = (Uint8 *) cvt->buf;
618    int i;
619
620    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using SSE2)");
621
622    /* Get dst aligned to 16 bytes */
623    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
624        const float sample = *src;
625        if (sample >= 1.0f) {
626            *dst = 255;
627        } else if (sample <= -1.0f) {
628            *dst = 0;
629        } else {
630            *dst = (Uint8)((sample + 1.0f) * 127.0f);
631        }
632    }
633
634    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
635
636    /* Make sure src is aligned too. */
637    if ((((size_t) src) & 15) == 0) {
638        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
639        const __m128 one = _mm_set1_ps(1.0f);
640        const __m128 negone = _mm_set1_ps(-1.0f);
641        const __m128 mulby127 = _mm_set1_ps(127.0f);
642        __m128i *mmdst = (__m128i *) dst;
643        while (i >= 16) {   /* 16 * float32 */
644            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
645            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
646            const __m128i ints3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+8)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
647            const __m128i ints4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+12)), one), one), mulby127));  /* load 4 floats, clamp, convert to sint32 */
648            _mm_store_si128(mmdst, _mm_packus_epi16(_mm_packs_epi32(ints1, ints2), _mm_packs_epi32(ints3, ints4)));  /* pack down, store out. */
649            i -= 16; src += 16; mmdst++;
650        }
651        dst = (Uint8 *) mmdst;
652    }
653
654    /* Finish off any leftovers with scalar operations. */
655    while (i) {
656        const float sample = *src;
657        if (sample >= 1.0f) {
658            *dst = 255;
659        } else if (sample <= -1.0f) {
660            *dst = 0;
661        } else {
662            *dst = (Uint8)((sample + 1.0f) * 127.0f);
663        }
664        i--; src++; dst++;
665    }
666
667    cvt->len_cvt /= 4;
668    if (cvt->filters[++cvt->filter_index]) {
669        cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
670    }
671}
672
673static void SDLCALL
674SDL_Convert_F32_to_S16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
675{
676    const float *src = (const float *) cvt->buf;
677    Sint16 *dst = (Sint16 *) cvt->buf;
678    int i;
679
680    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using SSE2)");
681
682    /* Get dst aligned to 16 bytes */
683    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
684        const float sample = *src;
685        if (sample >= 1.0f) {
686            *dst = 32767;
687        } else if (sample <= -1.0f) {
688            *dst = -32768;
689        } else {
690            *dst = (Sint16)(sample * 32767.0f);
691        }
692    }
693
694    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
695
696    /* Make sure src is aligned too. */
697    if ((((size_t) src) & 15) == 0) {
698        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
699        const __m128 one = _mm_set1_ps(1.0f);
700        const __m128 negone = _mm_set1_ps(-1.0f);
701        const __m128 mulby32767 = _mm_set1_ps(32767.0f);
702        __m128i *mmdst = (__m128i *) dst;
703        while (i >= 8) {   /* 8 * float32 */
704            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
705            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
706            _mm_store_si128(mmdst, _mm_packs_epi32(ints1, ints2));  /* pack to sint16, store out. */
707            i -= 8; src += 8; mmdst++;
708        }
709        dst = (Sint16 *) mmdst;
710    }
711
712    /* Finish off any leftovers with scalar operations. */
713    while (i) {
714        const float sample = *src;
715        if (sample >= 1.0f) {
716            *dst = 32767;
717        } else if (sample <= -1.0f) {
718            *dst = -32768;
719        } else {
720            *dst = (Sint16)(sample * 32767.0f);
721        }
722        i--; src++; dst++;
723    }
724
725    cvt->len_cvt /= 2;
726    if (cvt->filters[++cvt->filter_index]) {
727        cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
728    }
729}
730
731static void SDLCALL
732SDL_Convert_F32_to_U16_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
733{
734    const float *src = (const float *) cvt->buf;
735    Uint16 *dst = (Uint16 *) cvt->buf;
736    int i;
737
738    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using SSE2)");
739
740    /* Get dst aligned to 16 bytes */
741    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
742        const float sample = *src;
743        if (sample >= 1.0f) {
744            *dst = 65535;
745        } else if (sample <= -1.0f) {
746            *dst = 0;
747        } else {
748            *dst = (Uint16)((sample + 1.0f) * 32767.0f);
749        }
750    }
751
752    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
753
754    /* Make sure src is aligned too. */
755    if ((((size_t) src) & 15) == 0) {
756        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
757        /* This calculates differently than the scalar path because SSE2 can't
758           pack int32 data down to unsigned int16. _mm_packs_epi32 does signed
759           saturation, so that would corrupt our data. _mm_packus_epi32 exists,
760           but not before SSE 4.1. So we convert from float to sint16, packing
761           that down with legit signed saturation, and then xor the top bit
762           against 1. This results in the correct unsigned 16-bit value, even
763           though it looks like dark magic. */
764        const __m128 mulby32767 = _mm_set1_ps(32767.0f);
765        const __m128i topbit = _mm_set1_epi16(-32768);
766        const __m128 one = _mm_set1_ps(1.0f);
767        const __m128 negone = _mm_set1_ps(-1.0f);
768        __m128i *mmdst = (__m128i *) dst;
769        while (i >= 8) {   /* 8 * float32 */
770            const __m128i ints1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
771            const __m128i ints2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src+4)), one), mulby32767));  /* load 4 floats, clamp, convert to sint32 */
772            _mm_store_si128(mmdst, _mm_xor_si128(_mm_packs_epi32(ints1, ints2), topbit));  /* pack to sint16, xor top bit, store out. */
773            i -= 8; src += 8; mmdst++;
774        }
775        dst = (Uint16 *) mmdst;
776    }
777
778    /* Finish off any leftovers with scalar operations. */
779    while (i) {
780        const float sample = *src;
781        if (sample >= 1.0f) {
782            *dst = 65535;
783        } else if (sample <= -1.0f) {
784            *dst = 0;
785        } else {
786            *dst = (Uint16)((sample + 1.0f) * 32767.0f);
787        }
788        i--; src++; dst++;
789    }
790
791    cvt->len_cvt /= 2;
792    if (cvt->filters[++cvt->filter_index]) {
793        cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
794    }
795}
796
797static void SDLCALL
798SDL_Convert_F32_to_S32_SSE2(SDL_AudioCVT *cvt, SDL_AudioFormat format)
799{
800    const float *src = (const float *) cvt->buf;
801    Sint32 *dst = (Sint32 *) cvt->buf;
802    int i;
803
804    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using SSE2)");
805
806    /* Get dst aligned to 16 bytes */
807    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
808        const float sample = *src;
809        if (sample >= 1.0f) {
810            *dst = 2147483647;
811        } else if (sample <= -1.0f) {
812            *dst = (Sint32) -2147483648LL;
813        } else {
814            *dst = ((Sint32)(sample * 8388607.0f)) << 8;
815        }
816    }
817
818    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
819    SDL_assert(!i || ((((size_t) src) & 15) == 0));
820
821    {
822        /* Aligned! Do SSE blocks as long as we have 16 bytes available. */
823        const __m128 one = _mm_set1_ps(1.0f);
824        const __m128 negone = _mm_set1_ps(-1.0f);
825        const __m128 mulby8388607 = _mm_set1_ps(8388607.0f);
826        __m128i *mmdst = (__m128i *) dst;
827        while (i >= 4) {   /* 4 * float32 */
828            _mm_store_si128(mmdst, _mm_slli_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_min_ps(_mm_max_ps(negone, _mm_load_ps(src)), one), mulby8388607)), 8));  /* load 4 floats, clamp, convert to sint32 */
829            i -= 4; src += 4; mmdst++;
830        }
831        dst = (Sint32 *) mmdst;
832    }
833
834    /* Finish off any leftovers with scalar operations. */
835    while (i) {
836        const float sample = *src;
837        if (sample >= 1.0f) {
838            *dst = 2147483647;
839        } else if (sample <= -1.0f) {
840            *dst = (Sint32) -2147483648LL;
841        } else {
842            *dst = ((Sint32)(sample * 8388607.0f)) << 8;
843        }
844        i--; src++; dst++;
845    }
846
847    if (cvt->filters[++cvt->filter_index]) {
848        cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
849    }
850}
851#endif
852
853
854#if HAVE_NEON_INTRINSICS
/*
 * Convert signed 8-bit samples in cvt->buf to float32, in place, using NEON.
 *
 * The output is four times the size of the input, so the conversion walks
 * both src and dst from the end of the data toward the start; that way an
 * output store never lands on input that has not been read yet. On return
 * cvt->len_cvt has been multiplied by 4 and the next entry in cvt->filters
 * (if any) has been called with AUDIO_F32SYS. The `format` argument is not
 * referenced directly here.
 */
static void SDLCALL
SDL_Convert_S8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const Sint8 *src = ((const Sint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    int i;

    LOG_DEBUG_CONVERT("AUDIO_S8", "AUDIO_F32 (using NEON)");

    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
        *dst = ((float) *src) * DIVBY128;
    }

    src -= 15; dst -= 15;  /* adjust to read NEON blocks from the start. */
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));

    /* Make sure src is aligned too. */
    if ((((size_t) src) & 15) == 0) {
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
        const int8_t *mmsrc = (const int8_t *) src;
        const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
        while (i >= 16) {   /* 16 * 8-bit */
            const int8x16_t bytes = vld1q_s8(mmsrc);  /* get 16 sint8 into a NEON register. */
            const int16x8_t int16hi = vmovl_s8(vget_high_s8(bytes));  /* convert top 8 bytes to 8 int16 */
            const int16x8_t int16lo = vmovl_s8(vget_low_s8(bytes));   /* convert bottom 8 bytes to 8 int16 */
            /* split int16 to two int32, then convert to float, then multiply to normalize, store.
               The "low" halves hold the samples from the lowest addresses, so they must be
               stored first to keep the output in the same order as the input. */
            vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16lo))), divby128));      /* samples 0..3 */
            vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16lo))), divby128));   /* samples 4..7 */
            vst1q_f32(dst+8, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(int16hi))), divby128));    /* samples 8..11 */
            vst1q_f32(dst+12, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(int16hi))), divby128));  /* samples 12..15 */
            i -= 16; mmsrc -= 16; dst -= 16;
        }

        src = (const Sint8 *) mmsrc;
    }

    src += 15; dst += 15;  /* adjust for any scalar finishing. */

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        *dst = ((float) *src) * DIVBY128;
        i--; src--; dst--;
    }

    cvt->len_cvt *= 4;
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
905
/*
 * Convert unsigned 8-bit samples in cvt->buf to float32, in place, using NEON.
 *
 * Each sample is mapped as (value * DIVBY128) - 1.0f. The output is four
 * times the size of the input, so the conversion walks both src and dst from
 * the end of the data toward the start. On return cvt->len_cvt has been
 * multiplied by 4 and the next entry in cvt->filters (if any) has been called
 * with AUDIO_F32SYS. The `format` argument is not referenced directly here.
 */
static void SDLCALL
SDL_Convert_U8_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const Uint8 *src = ((const Uint8 *) (cvt->buf + cvt->len_cvt)) - 1;
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 4)) - 1;
    int i;

    LOG_DEBUG_CONVERT("AUDIO_U8", "AUDIO_F32 (using NEON)");

    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    for (i = cvt->len_cvt; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
        *dst = (((float) *src) * DIVBY128) - 1.0f;
    }

    src -= 15; dst -= 15;  /* adjust to read NEON blocks from the start. */
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));

    /* Make sure src is aligned too. */
    if ((((size_t) src) & 15) == 0) {
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
        const uint8_t *mmsrc = (const uint8_t *) src;
        const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
        const float32x4_t negone = vdupq_n_f32(-1.0f);
        while (i >= 16) {   /* 16 * 8-bit */
            const uint8x16_t bytes = vld1q_u8(mmsrc);  /* get 16 uint8 into a NEON register. */
            const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes));  /* convert top 8 bytes to 8 uint16 */
            const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes));   /* convert bottom 8 bytes to 8 uint16 */
            /* split uint16 to two uint32, then convert to float, then multiply-accumulate onto -1.0
               (vmlaq_f32(a, b, c) is a + b*c) to normalize and adjust for sign, store.
               The "low" halves hold the samples from the lowest addresses, so they go out first. */
            vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128));      /* samples 0..3 */
            vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128));   /* samples 4..7 */
            vst1q_f32(dst+8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128));    /* samples 8..11 */
            vst1q_f32(dst+12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128));  /* samples 12..15 */
            i -= 16; mmsrc -= 16; dst -= 16;
        }

        src = (const Uint8 *) mmsrc;
    }

    src += 15; dst += 15;  /* adjust for any scalar finishing. */

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        *dst = (((float) *src) * DIVBY128) - 1.0f;
        i--; src--; dst--;
    }

    cvt->len_cvt *= 4;
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
957
/*
 * NEON converter: signed 16-bit samples in cvt->buf become float32, in place.
 *
 * The result occupies twice as many bytes as the input, so both cursors start
 * at the last sample and move toward the front of the buffer. cvt->len_cvt is
 * doubled afterwards and the next filter in cvt->filters, when present, is
 * invoked with AUDIO_F32SYS.
 */
static void SDLCALL
SDL_Convert_S16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const Sint16 *in = ((const Sint16 *) (cvt->buf + cvt->len_cvt)) - 1;
    float *out = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
    int remaining;

    LOG_DEBUG_CONVERT("AUDIO_S16", "AUDIO_F32 (using NEON)");

    /* Scalar lead-in: keep going until an 8-float block ending at `out` would begin on a 16-byte boundary. */
    remaining = cvt->len_cvt / sizeof (Sint16);
    while (remaining && (((size_t) (out - 7)) & 15)) {
        *out = ((float) *in) * DIVBY32768;
        --remaining;
        --in;
        --out;
    }

    /* Step back so both cursors address the first element of the trailing 8-sample block. */
    in -= 7;
    out -= 7;
    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* The vector loop only runs if the input cursor is 16-byte aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t scale = vdupq_n_f32(DIVBY32768);
        for (; remaining >= 8; remaining -= 8, in -= 8, out -= 8) {
            /* Pull all eight samples into a register before any store touches the buffer. */
            const int16x8_t samples = vld1q_s16((int16_t const *) in);
            /* Widen each half to int32, convert to float, normalize. */
            const float32x4_t front = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(samples))), scale);
            const float32x4_t back = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(samples))), scale);
            vst1q_f32(out, front);
            vst1q_f32(out + 4, back);
        }
    }

    /* Point back at the last unconverted sample for the scalar tail. */
    in += 7;
    out += 7;

    for (; remaining; --remaining, --in, --out) {
        *out = ((float) *in) * DIVBY32768;
    }

    cvt->len_cvt *= 2;
    cvt->filter_index++;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
1001
/*
 * Convert unsigned 16-bit samples in cvt->buf to float32, in place, using NEON.
 *
 * Each sample is mapped as (value * DIVBY32768) - 1.0f. The output is twice
 * the size of the input, so the conversion walks both src and dst from the
 * end of the data toward the start. On return cvt->len_cvt has been doubled
 * and the next entry in cvt->filters (if any) has been called with
 * AUDIO_F32SYS. The `format` argument is not referenced directly here.
 */
static void SDLCALL
SDL_Convert_U16_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const Uint16 *src = ((const Uint16 *) (cvt->buf + cvt->len_cvt)) - 1;
    float *dst = ((float *) (cvt->buf + cvt->len_cvt * 2)) - 1;
    int i;

    LOG_DEBUG_CONVERT("AUDIO_U16", "AUDIO_F32 (using NEON)");

    /* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
    for (i = cvt->len_cvt / sizeof (Uint16); i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
        *dst = (((float) *src) * DIVBY32768) - 1.0f;
    }

    src -= 7; dst -= 7;  /* adjust to read NEON blocks from the start. */
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));

    /* Make sure src is aligned too. */
    if ((((size_t) src) & 15) == 0) {
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
        const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
        const float32x4_t negone = vdupq_n_f32(-1.0f);
        while (i >= 8) {   /* 8 * 16-bit */
            const uint16x8_t uints = vld1q_u16((uint16_t const *) src);  /* get 8 uint16 into a NEON register. */
            /* split uint16 to two uint32, then convert to float, then multiply-accumulate onto -1.0
               (vmlaq_f32(a, b, c) is a + b*c) so the result is value/32768 - 1, matching the
               scalar paths; store. */
            vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uints))), divby32768));
            vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uints))), divby32768));
            i -= 8; src -= 8; dst -= 8;
        }
    }

    src += 7; dst += 7;  /* adjust for any scalar finishing. */

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        *dst = (((float) *src) * DIVBY32768) - 1.0f;
        i--; src--; dst--;
    }

    cvt->len_cvt *= 2;
    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
1046
/*
 * NEON converter: signed 32-bit samples in cvt->buf become float32, in place.
 *
 * Both formats are four bytes per sample, so the input and output cursors
 * cover the same addresses front to back and cvt->len_cvt is not modified.
 * Each sample is shifted right by eight bits and scaled by DIVBY8388607.
 * The next filter in cvt->filters, when present, is then invoked with
 * AUDIO_F32SYS.
 */
static void SDLCALL
SDL_Convert_S32_to_F32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const Sint32 *in = (const Sint32 *) cvt->buf;
    float *out = (float *) cvt->buf;
    int remaining;

    LOG_DEBUG_CONVERT("AUDIO_S32", "AUDIO_F32 (using NEON)");

    /* Scalar lead-in until the output cursor reaches a 16-byte boundary. */
    remaining = cvt->len_cvt / sizeof (Sint32);
    while (remaining && (((size_t) out) & 15)) {
        *out = ((float) (*in >> 8)) * DIVBY8388607;
        --remaining;
        ++in;
        ++out;
    }

    /* The cursors started at the same address and moved in lockstep, so both must be aligned here. */
    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));
    SDL_assert(!remaining || ((((size_t) in) & 15) == 0));

    {
        /* Vector body: four samples per iteration. */
        const float32x4_t scale = vdupq_n_f32(DIVBY8388607);
        const int32_t *vin = (const int32_t *) in;
        for (; remaining >= 4; remaining -= 4, vin += 4, out += 4) {
            /* Drop the lowest eight bits so the integer fits in a float32; small precision loss, but much faster. */
            const int32x4_t shifted = vshrq_n_s32(vld1q_s32(vin), 8);
            vst1q_f32(out, vmulq_f32(vcvtq_f32_s32(shifted), scale));
        }
        in = (const Sint32 *) vin;
    }

    /* Scalar tail for whatever did not fill a whole vector. */
    for (; remaining; --remaining, ++in, ++out) {
        *out = ((float) (*in >> 8)) * DIVBY8388607;
    }

    cvt->filter_index++;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_F32SYS);
    }
}
1086
/*
 * NEON converter: float32 samples in cvt->buf become signed 8-bit, in place.
 *
 * The output is a quarter of the input size, so both cursors run front to
 * back with the output trailing the input. cvt->len_cvt is divided by 4
 * afterwards and the next filter in cvt->filters, when present, is invoked
 * with AUDIO_S8.
 *
 * NOTE(review): the scalar paths map samples <= -1.0 to -128, while the
 * vector path clamps to -1.0 and multiplies by 127 -- confirm whether that
 * difference is intended.
 */
static void SDLCALL
SDL_Convert_F32_to_S8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *in = (const float *) cvt->buf;
    Sint8 *out = (Sint8 *) cvt->buf;
    int remaining;

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S8 (using NEON)");

    /* Scalar lead-in until the output cursor reaches a 16-byte boundary. */
    remaining = cvt->len_cvt / sizeof (float);
    while (remaining && (((size_t) out) & 15)) {
        const float value = *in;
        *out = (value >= 1.0f) ? 127 : ((value <= -1.0f) ? -128 : (Sint8)(value * 127.0f));
        --remaining;
        ++in;
        ++out;
    }

    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* The vector loop only runs if the input cursor is 16-byte aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t upper = vdupq_n_f32(1.0f);
        const float32x4_t lower = vdupq_n_f32(-1.0f);
        const float32x4_t scale = vdupq_n_f32(127.0f);
        int8_t *vout = (int8_t *) out;
        for (; remaining >= 16; remaining -= 16, in += 16, vout += 16) {
            /* Load 4 floats at a time, clamp to [-1, 1], scale, convert to sint32. All four loads finish before the store below. */
            const int32x4_t q0 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(lower, vld1q_f32(in)), upper), scale));
            const int32x4_t q1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(lower, vld1q_f32(in + 4)), upper), scale));
            const int32x4_t q2 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(lower, vld1q_f32(in + 8)), upper), scale));
            const int32x4_t q3 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(lower, vld1q_f32(in + 12)), upper), scale));
            /* Narrow 32 -> 16 -> 8 bits, pairing the quads up in input order. */
            const int8x8_t first8 = vmovn_s16(vcombine_s16(vmovn_s32(q0), vmovn_s32(q1)));
            const int8x8_t last8 = vmovn_s16(vcombine_s16(vmovn_s32(q2), vmovn_s32(q3)));
            vst1q_s8(vout, vcombine_s8(first8, last8));
        }
        out = (Sint8 *) vout;
    }

    /* Scalar tail for whatever did not fill a whole vector. */
    for (; remaining; --remaining, ++in, ++out) {
        const float value = *in;
        *out = (value >= 1.0f) ? 127 : ((value <= -1.0f) ? -128 : (Sint8)(value * 127.0f));
    }

    cvt->len_cvt /= 4;
    cvt->filter_index++;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_S8);
    }
}
1148
/*
 * NEON converter: float32 samples in cvt->buf become unsigned 8-bit, in place.
 *
 * The output is a quarter of the input size, so both cursors run front to
 * back with the output trailing the input. cvt->len_cvt is divided by 4
 * afterwards and the next filter in cvt->filters, when present, is invoked
 * with AUDIO_U8.
 */
static void SDLCALL
SDL_Convert_F32_to_U8_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *in = (const float *) cvt->buf;
    Uint8 *out = (Uint8 *) cvt->buf;
    int remaining;

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U8 (using NEON)");

    /* Scalar lead-in until the output cursor reaches a 16-byte boundary. */
    remaining = cvt->len_cvt / sizeof (float);
    while (remaining && (((size_t) out) & 15)) {
        const float value = *in;
        *out = (value >= 1.0f) ? 255 : ((value <= -1.0f) ? 0 : (Uint8)((value + 1.0f) * 127.0f));
        --remaining;
        ++in;
        ++out;
    }

    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* The vector loop only runs if the input cursor is 16-byte aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t upper = vdupq_n_f32(1.0f);
        const float32x4_t lower = vdupq_n_f32(-1.0f);
        const float32x4_t scale = vdupq_n_f32(127.0f);
        uint8_t *vout = (uint8_t *) out;
        for (; remaining >= 16; remaining -= 16, in += 16, vout += 16) {
            /* Load 4 floats at a time, clamp to [-1, 1], bias by +1, scale, convert to uint32. All four loads finish before the store below. */
            const uint32x4_t q0 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(lower, vld1q_f32(in)), upper), upper), scale));
            const uint32x4_t q1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(lower, vld1q_f32(in + 4)), upper), upper), scale));
            const uint32x4_t q2 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(lower, vld1q_f32(in + 8)), upper), upper), scale));
            const uint32x4_t q3 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(lower, vld1q_f32(in + 12)), upper), upper), scale));
            /* Narrow 32 -> 16 -> 8 bits, pairing the quads up in input order. */
            const uint8x8_t first8 = vmovn_u16(vcombine_u16(vmovn_u32(q0), vmovn_u32(q1)));
            const uint8x8_t last8 = vmovn_u16(vcombine_u16(vmovn_u32(q2), vmovn_u32(q3)));
            vst1q_u8(vout, vcombine_u8(first8, last8));
        }

        out = (Uint8 *) vout;
    }

    /* Scalar tail for whatever did not fill a whole vector. */
    for (; remaining; --remaining, ++in, ++out) {
        const float value = *in;
        *out = (value >= 1.0f) ? 255 : ((value <= -1.0f) ? 0 : (Uint8)((value + 1.0f) * 127.0f));
    }

    cvt->len_cvt /= 4;
    cvt->filter_index++;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_U8);
    }
}
1211
/*
 * NEON converter: float32 samples in cvt->buf become signed 16-bit, in place.
 *
 * The output is half the input size, so both cursors run front to back with
 * the output trailing the input. cvt->len_cvt is halved afterwards and the
 * next filter in cvt->filters, when present, is invoked with AUDIO_S16SYS.
 */
static void SDLCALL
SDL_Convert_F32_to_S16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *in = (const float *) cvt->buf;
    Sint16 *out = (Sint16 *) cvt->buf;
    int remaining;

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S16 (using NEON)");

    /* Scalar lead-in until the output cursor reaches a 16-byte boundary. */
    remaining = cvt->len_cvt / sizeof (float);
    while (remaining && (((size_t) out) & 15)) {
        const float value = *in;
        *out = (value >= 1.0f) ? 32767 : ((value <= -1.0f) ? -32768 : (Sint16)(value * 32767.0f));
        --remaining;
        ++in;
        ++out;
    }

    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* The vector loop only runs if the input cursor is 16-byte aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t upper = vdupq_n_f32(1.0f);
        const float32x4_t lower = vdupq_n_f32(-1.0f);
        const float32x4_t scale = vdupq_n_f32(32767.0f);
        int16_t *vout = (int16_t *) out;
        for (; remaining >= 8; remaining -= 8, in += 8, vout += 8) {
            /* Load 4 floats at a time, clamp to [-1, 1], scale, convert to sint32. Both loads finish before the store below. */
            const int32x4_t q0 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(lower, vld1q_f32(in)), upper), scale));
            const int32x4_t q1 = vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(lower, vld1q_f32(in + 4)), upper), scale));
            /* Narrow each quad to sint16, join them, and write eight samples out. */
            vst1q_s16(vout, vcombine_s16(vmovn_s32(q0), vmovn_s32(q1)));
        }
        out = (Sint16 *) vout;
    }

    /* Scalar tail for whatever did not fill a whole vector. */
    for (; remaining; --remaining, ++in, ++out) {
        const float value = *in;
        *out = (value >= 1.0f) ? 32767 : ((value <= -1.0f) ? -32768 : (Sint16)(value * 32767.0f));
    }

    cvt->len_cvt /= 2;
    cvt->filter_index++;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_S16SYS);
    }
}
1269
/*
 * NEON converter: float32 samples in cvt->buf become unsigned 16-bit, in place.
 *
 * The output is half the input size, so both cursors run front to back with
 * the output trailing the input. cvt->len_cvt is halved afterwards and the
 * next filter in cvt->filters, when present, is invoked with AUDIO_U16SYS.
 */
static void SDLCALL
SDL_Convert_F32_to_U16_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *in = (const float *) cvt->buf;
    Uint16 *out = (Uint16 *) cvt->buf;
    int remaining;

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_U16 (using NEON)");

    /* Scalar lead-in until the output cursor reaches a 16-byte boundary. */
    remaining = cvt->len_cvt / sizeof (float);
    while (remaining && (((size_t) out) & 15)) {
        const float value = *in;
        *out = (value >= 1.0f) ? 65535 : ((value <= -1.0f) ? 0 : (Uint16)((value + 1.0f) * 32767.0f));
        --remaining;
        ++in;
        ++out;
    }

    SDL_assert(!remaining || ((((size_t) out) & 15) == 0));

    /* The vector loop only runs if the input cursor is 16-byte aligned as well. */
    if ((((size_t) in) & 15) == 0) {
        const float32x4_t upper = vdupq_n_f32(1.0f);
        const float32x4_t lower = vdupq_n_f32(-1.0f);
        const float32x4_t scale = vdupq_n_f32(32767.0f);
        uint16_t *vout = (uint16_t *) out;
        for (; remaining >= 8; remaining -= 8, in += 8, vout += 8) {
            /* Load 4 floats at a time, clamp to [-1, 1], bias by +1, scale, convert to uint32. Both loads finish before the store below. */
            const uint32x4_t q0 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(lower, vld1q_f32(in)), upper), upper), scale));
            const uint32x4_t q1 = vcvtq_u32_f32(vmulq_f32(vaddq_f32(vminq_f32(vmaxq_f32(lower, vld1q_f32(in + 4)), upper), upper), scale));
            /* Narrow each quad to uint16, join them, and write eight samples out. */
            vst1q_u16(vout, vcombine_u16(vmovn_u32(q0), vmovn_u32(q1)));
        }
        out = (Uint16 *) vout;
    }

    /* Scalar tail for whatever did not fill a whole vector. */
    for (; remaining; --remaining, ++in, ++out) {
        const float value = *in;
        *out = (value >= 1.0f) ? 65535 : ((value <= -1.0f) ? 0 : (Uint16)((value + 1.0f) * 32767.0f));
    }

    cvt->len_cvt /= 2;
    cvt->filter_index++;
    if (cvt->filters[cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_U16SYS);
    }
}
1327
/*
 * Convert float32 samples in cvt->buf to signed 32-bit integers, in place,
 * using NEON for the bulk of the data.
 *
 * Input and output samples are both four bytes wide, so src and dst walk the
 * same addresses and cvt->len_cvt is left unchanged. Samples at or beyond
 * +/-1.0 are clamped; everything else is scaled by 8388607 and moved up by
 * eight bits. When done, the next entry in cvt->filters (if any) is called
 * with AUDIO_S32SYS. The `format` argument is not referenced directly here.
 */
static void SDLCALL
SDL_Convert_F32_to_S32_NEON(SDL_AudioCVT *cvt, SDL_AudioFormat format)
{
    const float *src = (const float *) cvt->buf;
    Sint32 *dst = (Sint32 *) cvt->buf;
    int i;

    LOG_DEBUG_CONVERT("AUDIO_F32", "AUDIO_S32 (using NEON)");

    /* Get dst aligned to 16 bytes */
    for (i = cvt->len_cvt / sizeof (float); i && (((size_t) dst) & 15); --i, ++src, ++dst) {
        const float sample = *src;
        if (sample >= 1.0f) {
            *dst = 2147483647;
        } else if (sample <= -1.0f) {
            /* A bare -2147483648 negates a constant that does not fit in a
               32-bit int, so its type depends on the language standard in
               use; go through long long explicitly. */
            *dst = (Sint32) -2147483648LL;
        } else {
            /* Multiply instead of "<< 8": left-shifting a negative signed
               value is undefined behavior in C. The product always fits,
               since |sample * 8388607| < 2^23. */
            *dst = ((Sint32)(sample * 8388607.0f)) * 256;
        }
    }

    /* src and dst started at the same address and advanced in lockstep, so both are aligned now. */
    SDL_assert(!i || ((((size_t) dst) & 15) == 0));
    SDL_assert(!i || ((((size_t) src) & 15) == 0));

    {
        /* Aligned! Do NEON blocks as long as we have 16 bytes available. */
        const float32x4_t one = vdupq_n_f32(1.0f);
        const float32x4_t negone = vdupq_n_f32(-1.0f);
        const float32x4_t mulby8388607 = vdupq_n_f32(8388607.0f);
        int32_t *mmdst = (int32_t *) dst;
        while (i >= 4) {   /* 4 * float32 */
            /* load 4 floats, clamp, scale, convert to sint32, shift up, store. */
            vst1q_s32(mmdst, vshlq_n_s32(vcvtq_s32_f32(vmulq_f32(vminq_f32(vmaxq_f32(negone, vld1q_f32(src)), one), mulby8388607)), 8));
            i -= 4; src += 4; mmdst += 4;
        }
        dst = (Sint32 *) mmdst;
    }

    /* Finish off any leftovers with scalar operations. */
    while (i) {
        const float sample = *src;
        if (sample >= 1.0f) {
            *dst = 2147483647;
        } else if (sample <= -1.0f) {
            *dst = (Sint32) -2147483648LL;
        } else {
            /* See above: multiply, don't shift a possibly-negative value. */
            *dst = ((Sint32)(sample * 8388607.0f)) * 256;
        }
        i--; src++; dst++;
    }

    if (cvt->filters[++cvt->filter_index]) {
        cvt->filters[cvt->filter_index](cvt, AUDIO_S32SYS);
    }
}
1382#endif
1383
1384
1385
1386void SDL_ChooseAudioConverters(void)
1387{
1388    static SDL_bool converters_chosen = SDL_FALSE;
1389
1390    if (converters_chosen) {
1391        return;
1392    }
1393
1394#define SET_CONVERTER_FUNCS(fntype) \
1395        SDL_Convert_S8_to_F32 = SDL_Convert_S8_to_F32_##fntype; \
1396        SDL_Convert_U8_to_F32 = SDL_Convert_U8_to_F32_##fntype; \
1397        SDL_Convert_S16_to_F32 = SDL_Convert_S16_to_F32_##fntype; \
1398        SDL_Convert_U16_to_F32 = SDL_Convert_U16_to_F32_##fntype; \
1399        SDL_Convert_S32_to_F32 = SDL_Convert_S32_to_F32_##fntype; \
1400        SDL_Convert_F32_to_S8 = SDL_Convert_F32_to_S8_##fntype; \
1401        SDL_Convert_F32_to_U8 = SDL_Convert_F32_to_U8_##fntype; \
1402        SDL_Convert_F32_to_S16 = SDL_Convert_F32_to_S16_##fntype; \
1403        SDL_Convert_F32_to_U16 = SDL_Convert_F32_to_U16_##fntype; \
1404        SDL_Convert_F32_to_S32 = SDL_Convert_F32_to_S32_##fntype; \
1405        converters_chosen = SDL_TRUE
1406
1407#if HAVE_SSE2_INTRINSICS
1408    if (SDL_HasSSE2()) {
1409        SET_CONVERTER_FUNCS(SSE2);
1410        return;
1411    }
1412#endif
1413
1414#if HAVE_NEON_INTRINSICS
1415    if (SDL_HasNEON()) {
1416        SET_CONVERTER_FUNCS(NEON);
1417        return;
1418    }
1419#endif
1420
1421#if NEED_SCALAR_CONVERTER_FALLBACKS
1422    SET_CONVERTER_FUNCS(Scalar);
1423#endif
1424
1425#undef SET_CONVERTER_FUNCS
1426
1427    SDL_assert(converters_chosen == SDL_TRUE);
1428}
1429
1430/* vi: set ts=4 sw=4 expandtab: */
1431
[FILE END]
(C) 2025 0x4248
(C) 2025 4248 Media and 4248 Systems, All part of 0x4248
See LICENCE files for more information. Not all files are by 0x4248; always check the licensing.