/* FAudio - XAudio Reimplementation for FNA
*
* Copyright (c) 2011-2020 Ethan Lee, Luigi Auriemma, and the MonoGame Team
*
* This software is provided 'as-is', without any express or implied warranty.
* In no event will the authors be held liable for any damages arising from
* the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software in a
* product, an acknowledgment in the product documentation would be
* appreciated but is not required.
*
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
*
* 3. This notice may not be removed or altered from any source distribution.
*
* Ethan "flibitijibibo" Lee <flibitijibibo@flibitijibibo.com>
*
*/
#include "FAudio_internal.h"
/* SECTION 0: SSE/NEON Detection */
/* The SSE/NEON detection comes from MojoAL:
* https://hg.icculus.org/icculus/mojoAL/file/default/mojoal.c
*/
#if defined(__x86_64__) || defined(_M_X64)
/* Some platforms fail to define this... */
#ifndef __SSE2__
#define __SSE2__ 1
#endif
/* x86_64 guarantees SSE2. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#elif defined(__aarch64__) || defined(_M_ARM64)
/* Some platforms fail to define this... */
#ifndef __ARM_NEON__
#define __ARM_NEON__ 1
#endif
/* AArch64 guarantees NEON. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#elif __MACOSX__
/* Some build systems may need to specify this. Also, macOS ARM? Sigh */
#ifndef __SSE2__
#error macOS does not have SSE2? Bad compiler? They actually moved to ARM?!
#endif
/* Mac OS X/Intel guarantees SSE2. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#else
/* Need plain C implementations to support all other hardware */
#define NEED_SCALAR_CONVERTER_FALLBACKS 1
#endif
/* Our NEON paths require AArch64; don't check __ARM_NEON__ here */
#if defined(__aarch64__) || defined(_M_ARM64)
#include <arm_neon.h>
#define HAVE_NEON_INTRINSICS 1
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#define HAVE_SSE2_INTRINSICS 1
#endif
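/* Note that the HAVE_*_INTRINSICS macros only gate compilation of the
* SIMD paths; the runtime choice between them is made by
* FAudio_INTERNAL_InitSIMDFunctions at the bottom of this file.
*/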
/* SECTION 1: Type Converters */
/* The SSE/NEON converters are based on SDL_audiotypecvt:
* https://hg.libsdl.org/SDL/file/default/src/audio/SDL_audiotypecvt.c
*/
#define DIVBY128 0.0078125f
#define DIVBY32768 0.000030517578125f
#define DIVBY8388607 0.00000011920930376163766f
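/* For reference, these are 1/128, 1/32768, and 1/8388607: the scale
* factors that map the uint8, int16, and (24-bit) int32 sample ranges
* onto [-1.0f, 1.0f]. For example, a uint8 sample of 255 converts to
* (255 * DIVBY128) - 1.0f = 0.9921875f, and an int16 sample of -32768
* converts to -32768 * DIVBY32768 = -1.0f.
*/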
#if NEED_SCALAR_CONVERTER_FALLBACKS
void FAudio_INTERNAL_Convert_U8_To_F32_Scalar(
const uint8_t *restrict src,
float *restrict dst,
uint32_t len
) {
uint32_t i;
for (i = 0; i < len; i += 1)
{
*dst++ = (*src++ * DIVBY128) - 1.0f;
}
}
void FAudio_INTERNAL_Convert_S16_To_F32_Scalar(
const int16_t *restrict src,
float *restrict dst,
uint32_t len
) {
uint32_t i;
for (i = 0; i < len; i += 1)
{
*dst++ = *src++ * DIVBY32768;
}
}
void FAudio_INTERNAL_Convert_S32_To_F32_Scalar(
const int32_t *restrict src,
float *restrict dst,
uint32_t len
) {
uint32_t i;
for (i = 0; i < len; i += 1)
{
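/* Shift out the low 8 bits first; the remaining 24-bit value fits
* exactly in a float32 mantissa, matching the SIMD paths below.
*/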
*dst++ = (*src++ >> 8) * DIVBY8388607;
}
}
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */
#if HAVE_SSE2_INTRINSICS
void FAudio_INTERNAL_Convert_U8_To_F32_SSE2(
const uint8_t *restrict src,
float *restrict dst,
uint32_t len
) {
int i;
src += len - 1;
dst += len - 1;
/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
for (i = len; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
*dst = (((float) *src) * DIVBY128) - 1.0f;
}
src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */
FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
/* Make sure src is aligned too. */
if ((((size_t) src) & 15) == 0) {
/* Aligned! Do SSE blocks as long as we have 16 bytes available. */
const __m128i *mmsrc = (const __m128i *) src;
const __m128i zero = _mm_setzero_si128();
const __m128 divby128 = _mm_set1_ps(DIVBY128);
const __m128 minus1 = _mm_set1_ps(-1.0f);
while (i >= 16) { /* 16 * 8-bit */
const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 uint8 into an XMM register. */
/* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
/* right-shift-zero-extend gets us uint16 with the other set of values. */
const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
/* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
/* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
/* Interleave back into correct order, store. */
_mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
_mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
_mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
_mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
i -= 16; mmsrc--; dst -= 16;
}
src = (const uint8_t *) mmsrc;
}
src += 15; dst += 15; /* adjust for any scalar finishing. */
/* Finish off any leftovers with scalar operations. */
while (i) {
*dst = (((float) *src) * DIVBY128) - 1.0f;
i--; src--; dst--;
}
}
void FAudio_INTERNAL_Convert_S16_To_F32_SSE2(
const int16_t *restrict src,
float *restrict dst,
uint32_t len
) {
int i;
src += len - 1;
dst += len - 1;
/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
for (i = len; i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
*dst = ((float) *src) * DIVBY32768;
}
src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */
FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
/* Make sure src is aligned too. */
if ((((size_t) src) & 15) == 0) {
/* Aligned! Do SSE blocks as long as we have 16 bytes available. */
const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
while (i >= 8) { /* 8 * 16-bit */
const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */
/* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
/* right-shift-sign-extend gets us sint32 with the other set of values. */
const __m128i b = _mm_srai_epi32(ints, 16);
/* Interleave these back into the right order, convert to float, multiply, store. */
_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
_mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
i -= 8; src -= 8; dst -= 8;
}
}
src += 7; dst += 7; /* adjust for any scalar finishing. */
/* Finish off any leftovers with scalar operations. */
while (i) {
*dst = ((float) *src) * DIVBY32768;
i--; src--; dst--;
}
}
void FAudio_INTERNAL_Convert_S32_To_F32_SSE2(
const int32_t *restrict src,
float *restrict dst,
uint32_t len
) {
int i;
/* Get dst aligned to 16 bytes */
for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst) {
*dst = ((float) (*src>>8)) * DIVBY8388607;
}
FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
/* Make sure src is aligned too. */
if ((((size_t) src) & 15) == 0) {
/* Aligned! Do SSE blocks as long as we have 16 bytes available. */
const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
const __m128i *mmsrc = (const __m128i *) src;
while (i >= 4) { /* 4 * sint32 */
/* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
i -= 4; mmsrc++; dst += 4;
}
src = (const int32_t *) mmsrc;
}
/* Finish off any leftovers with scalar operations. */
while (i) {
*dst = ((float) (*src>>8)) * DIVBY8388607;
i--; src++; dst++;
}
}
#endif /* HAVE_SSE2_INTRINSICS */
#if HAVE_NEON_INTRINSICS
void FAudio_INTERNAL_Convert_U8_To_F32_NEON(
const uint8_t *restrict src,
float *restrict dst,
uint32_t len
) {
int i;
src += len - 1;
dst += len - 1;
/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
for (i = len; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
*dst = (((float) *src) * DIVBY128) - 1.0f;
}
src -= 15; dst -= 15; /* adjust to read NEON blocks from the start. */
FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
/* Make sure src is aligned too. */
if ((((size_t) src) & 15) == 0) {
/* Aligned! Do NEON blocks as long as we have 16 bytes available. */
const uint8_t *mmsrc = (const uint8_t *) src;
const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
const float32x4_t negone = vdupq_n_f32(-1.0f);
while (i >= 16) { /* 16 * 8-bit */
const uint8x16_t bytes = vld1q_u8(mmsrc); /* get 16 uint8 into a NEON register. */
const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes)); /* convert top 8 bytes to 8 uint16 */
const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes)); /* convert bottom 8 bytes to 8 uint16 */
/* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128));
vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128));
vst1q_f32(dst+8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128));
vst1q_f32(dst+12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128));
i -= 16; mmsrc -= 16; dst -= 16;
}
src = (const uint8_t *) mmsrc;
}
src += 15; dst += 15; /* adjust for any scalar finishing. */
/* Finish off any leftovers with scalar operations. */
while (i) {
*dst = (((float) *src) * DIVBY128) - 1.0f;
i--; src--; dst--;
}
}
void FAudio_INTERNAL_Convert_S16_To_F32_NEON(
const int16_t *restrict src,
float *restrict dst,
uint32_t len
) {
int i;
src += len - 1;
dst += len - 1;
/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
for (i = len; i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
*dst = ((float) *src) * DIVBY32768;
}
src -= 7; dst -= 7; /* adjust to read NEON blocks from the start. */
FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
/* Make sure src is aligned too. */
if ((((size_t) src) & 15) == 0) {
/* Aligned! Do NEON blocks as long as we have 16 bytes available. */
const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
while (i >= 8) { /* 8 * 16-bit */
const int16x8_t ints = vld1q_s16((int16_t const *) src); /* get 8 sint16 into a NEON register. */
/* split int16 to two int32, then convert to float, then multiply to normalize, store. */
vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768));
vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768));
i -= 8; src -= 8; dst -= 8;
}
}
src += 7; dst += 7; /* adjust for any scalar finishing. */
/* Finish off any leftovers with scalar operations. */
while (i) {
*dst = ((float) *src) * DIVBY32768;
i--; src--; dst--;
}
}
void FAudio_INTERNAL_Convert_S32_To_F32_NEON(
const int32_t *restrict src,
float *restrict dst,
uint32_t len
) {
int i;
/* Get dst aligned to 16 bytes */
for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst) {
*dst = ((float) (*src>>8)) * DIVBY8388607;
}
FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
/* Make sure src is aligned too. */
if ((((size_t) src) & 15) == 0) {
/* Aligned! Do NEON blocks as long as we have 16 bytes available. */
const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607);
const int32_t *mmsrc = (const int32_t *) src;
while (i >= 4) { /* 4 * sint32 */
/* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607));
i -= 4; mmsrc += 4; dst += 4;
}
src = (const int32_t *) mmsrc;
}
/* Finish off any leftovers with scalar operations. */
while (i) {
*dst = ((float) (*src>>8)) * DIVBY8388607;
i--; src++; dst++;
}
}
#endif /* HAVE_NEON_INTRINSICS */
/* SECTION 2: Linear Resamplers */
void FAudio_INTERNAL_ResampleGeneric(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t channels
) {
uint32_t i, j;
uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
for (i = 0; i < toResample; i += 1)
{
for (j = 0; j < channels; j += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[j] +
(dCache[j + channels] - dCache[j]) *
FIXED_TO_DOUBLE(cur)
);
}
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur >> FIXED_PRECISION) * channels;
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur &= FIXED_FRACTION_MASK;
}
}
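/* A worked example of the fixed-point stepping above, assuming the
* usual 32-bit fraction layout (FIXED_PRECISION == 32): resampling
* 22050 Hz to 44100 Hz gives resampleStep = DOUBLE_TO_FIXED(0.5), so
* cur alternates between 0.0 and 0.5 as the lerp weight, and
* (cur >> FIXED_PRECISION) is nonzero only on every other output
* frame, advancing dCache by one input frame per two output frames.
*/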
#if NEED_SCALAR_CONVERTER_FALLBACKS
void FAudio_INTERNAL_ResampleMono_Scalar(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t UNUSED
) {
uint32_t i;
uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
for (i = 0; i < toResample; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[1] - dCache[0]) *
FIXED_TO_DOUBLE(cur)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur >> FIXED_PRECISION);
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur &= FIXED_FRACTION_MASK;
}
}
void FAudio_INTERNAL_ResampleStereo_Scalar(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t UNUSED
) {
uint32_t i;
uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
for (i = 0; i < toResample; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[2] - dCache[0]) *
FIXED_TO_DOUBLE(cur)
);
*resampleCache++ = (float) (
dCache[1] +
(dCache[3] - dCache[1]) *
FIXED_TO_DOUBLE(cur)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur >> FIXED_PRECISION) * 2;
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur &= FIXED_FRACTION_MASK;
}
}
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */
/* The SSE2 versions of the resamplers come from @8thMage! */
#if HAVE_SSE2_INTRINSICS
void FAudio_INTERNAL_ResampleMono_SSE2(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t UNUSED
) {
uint32_t i, header, tail;
uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3;
float *dCache_1, *dCache_2, *dCache_3;
uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
__m128 one_over_fixed_one, half, current_next_0_1, current_next_2_3,
current, next, sub, cur_fixed, mul, res;
__m128i cur_frac, adder_frac, adder_frac_loop;
/* This is the header; the destination needs to be aligned to 16 bytes */
header = (16 - ((size_t) resampleCache) % 16) / 4;
if (header == 4)
{
header = 0;
}
for (i = 0; i < header; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[1] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION);
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
toResample -= header;
/* Initialize the various cur values.
* cur_frac holds the fractional part of cur for 4 samples. Since the
* fractional part is a 32-bit unsigned value, steps can simply be
* added; the modulo that keeps only the fractional part is implicit.
* 0.5 is subtracted so the signed int-to-float conversion can be used
* (there is no unsigned convert); the 0.5 is added back later.
*/
cur_frac = _mm_set1_epi32(
(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
);
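/* As a worked example, assuming FIXED_PRECISION == 32: a fraction of
* 0.75 is stored as 0xC0000000; subtracting DOUBLE_TO_FIXED(0.5)
* (0x80000000) wraps it to 0x40000000, which converts as a signed int
* to 2^30, and 2^30 * (1.0f / FIXED_ONE) + 0.5f = 0.25f + 0.5f = 0.75f
* again in the loop below.
*/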
adder_frac = _mm_setr_epi32(
0,
(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK),
(uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK)
);
cur_frac = _mm_add_epi32(cur_frac, adder_frac);
/* The various cur_scalar values are for the different samples
* (1, 2, 3 relative to the original cur_scalar = 0)
*/
cur_scalar_1 = cur_scalar + resampleStep;
cur_scalar_2 = cur_scalar + resampleStep * 2;
cur_scalar_3 = cur_scalar + resampleStep * 3;
dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION);
dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION);
dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION);
cur_scalar &= FIXED_FRACTION_MASK;
cur_scalar_1 &= FIXED_FRACTION_MASK;
cur_scalar_2 &= FIXED_FRACTION_MASK;
cur_scalar_3 &= FIXED_FRACTION_MASK;
/* FIXME: These should be _mm_undefined_ps! */
current_next_0_1 = _mm_setzero_ps();
current_next_2_3 = _mm_setzero_ps();
/* Constants */
one_over_fixed_one = _mm_set1_ps(1.0f / FIXED_ONE);
half = _mm_set1_ps(0.5f);
adder_frac_loop = _mm_set1_epi32(
(uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK)
);
tail = toResample % 4;
for (i = 0; i < toResample - tail; i += 4, resampleCache += 4)
{
/* current_next_0_1 and current_next_2_3 each hold 2 pairs of a sample
* and the sample after it; we separate them next.
*/
current_next_0_1 = _mm_loadl_pi(current_next_0_1, (__m64*) dCache);
current_next_0_1 = _mm_loadh_pi(current_next_0_1, (__m64*) dCache_1);
current_next_2_3 = _mm_loadl_pi(current_next_2_3, (__m64*) dCache_2);
current_next_2_3 = _mm_loadh_pi(current_next_2_3, (__m64*) dCache_3);
/* Unpack them to get separate current and next in 2 vectors. */
current = _mm_shuffle_ps(current_next_0_1, current_next_2_3, 0x88); /* 0b1000 */
next = _mm_shuffle_ps(current_next_0_1, current_next_2_3, 0xdd); /* 0b1101 */
sub = _mm_sub_ps(next, current);
/* Convert the fractional part to float, then multiply to get the
* fractions out, then add back the 0.5 we subtracted before.
*/
cur_fixed = _mm_add_ps(
_mm_mul_ps(
_mm_cvtepi32_ps(cur_frac),
one_over_fixed_one
),
half
);
mul = _mm_mul_ps(sub, cur_fixed);
res = _mm_add_ps(current, mul);
/* Store back */
_mm_store_ps(resampleCache, res);
/* Update dCaches for next iteration */
cur_scalar += resampleStep * 4;
cur_scalar_1 += resampleStep * 4;
cur_scalar_2 += resampleStep * 4;
cur_scalar_3 += resampleStep * 4;
dCache = dCache + (cur_scalar >> FIXED_PRECISION);
dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION);
dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION);
dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION);
cur_scalar &= FIXED_FRACTION_MASK;
cur_scalar_1 &= FIXED_FRACTION_MASK;
cur_scalar_2 &= FIXED_FRACTION_MASK;
cur_scalar_3 &= FIXED_FRACTION_MASK;
cur_frac = _mm_add_epi32(cur_frac, adder_frac_loop);
}
*resampleOffset += resampleStep * (toResample - tail);
/* This is the tail. */
for (i = 0; i < tail; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[1] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION);
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
}
void FAudio_INTERNAL_ResampleStereo_SSE2(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t UNUSED
) {
uint32_t i, header, tail;
uint64_t cur_scalar, cur_scalar_1;
float *dCache_1;
__m128 one_over_fixed_one, half, current_next_1, current_next_2,
current, next, sub, cur_fixed, mul, res;
__m128i cur_frac, adder_frac, adder_frac_loop;
/* This is the header; the destination needs to be aligned to 16 bytes */
header = (16 - ((size_t) resampleCache) % 16) / 8;
if (header == 2)
{
header = 0;
}
cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
for (i = 0; i < header; i += 2)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[2] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
*resampleCache++ = (float) (
dCache[1] +
(dCache[3] - dCache[1]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION) * 2;
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
toResample -= header;
/* Initialize the various cur values.
* cur_frac holds the fractional part of cur.
* To avoid duplication, see the mono version for a thorough
* explanation.
*/
cur_frac = _mm_set1_epi32(
(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
);
adder_frac = _mm_setr_epi32(
0,
0,
(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
(uint32_t) (resampleStep & FIXED_FRACTION_MASK)
);
cur_frac = _mm_add_epi32(cur_frac, adder_frac);
/* dCache_1 is the dCache pointer at the next resample position. */
cur_scalar_1 = cur_scalar + resampleStep;
dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2;
cur_scalar_1 &= FIXED_FRACTION_MASK;
one_over_fixed_one = _mm_set1_ps(1.0f / FIXED_ONE);
half = _mm_set1_ps(0.5f);
adder_frac_loop = _mm_set1_epi32(
(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK)
);
tail = toResample % 2;
for (i = 0; i < toResample - tail; i += 2, resampleCache += 4)
{
/* current_next_1 and current_next_2 each hold 4 source sample
* points, producing 4 destination resample points at the end.
* current_next_1 holds
* (current_ch_1, current_ch_2, next_ch_1, next_ch_2)
* for the first resample position, while current_next_2 holds the
* same for the second resample position.
*/
current_next_1 = _mm_loadu_ps(dCache); /* A1B1A2B2 */
current_next_2 = _mm_loadu_ps(dCache_1); /* A3B3A4B4 */
/* Unpack them to get the current and the next in separate vectors. */
current = _mm_castpd_ps(
_mm_unpacklo_pd(
_mm_castps_pd(current_next_1),
_mm_castps_pd(current_next_2)
)
);
next = _mm_castpd_ps(
_mm_unpackhi_pd(
_mm_castps_pd(current_next_1),
_mm_castps_pd(current_next_2)
)
);
sub = _mm_sub_ps(next, current);
/* Add the 0.5 back.
* See the mono version for a more elaborate explanation.
*/
cur_fixed = _mm_add_ps(
_mm_mul_ps(
_mm_cvtepi32_ps(cur_frac),
one_over_fixed_one
),
half
);
mul = _mm_mul_ps(sub, cur_fixed);
res = _mm_add_ps(current, mul);
/* Store the results */
_mm_store_ps(resampleCache, res);
/* Update dCaches for next iteration */
cur_scalar += resampleStep * 2;
cur_scalar_1 += resampleStep * 2;
dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2;
dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2;
cur_scalar &= FIXED_FRACTION_MASK;
cur_scalar_1 &= FIXED_FRACTION_MASK;
cur_frac = _mm_add_epi32(cur_frac, adder_frac_loop);
}
*resampleOffset += resampleStep * (toResample - tail);
/* This is the tail. */
for (i = 0; i < tail; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[2] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
*resampleCache++ = (float) (
dCache[1] +
(dCache[3] - dCache[1]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION) * 2;
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
}
#endif /* HAVE_SSE2_INTRINSICS */
#if HAVE_NEON_INTRINSICS
void FAudio_INTERNAL_ResampleMono_NEON(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t UNUSED
) {
uint32_t i, header, tail;
uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3;
float *dCache_1, *dCache_2, *dCache_3;
uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
float32x4_t one_over_fixed_one, half, current_next_0_1, current_next_2_3,
current, next, sub, cur_fixed, mul, res;
int32x4_t cur_frac, adder_frac, adder_frac_loop;
/* This is the header; the destination needs to be aligned to 16 bytes */
header = (16 - ((size_t) resampleCache) % 16) / 4;
if (header == 4)
{
header = 0;
}
for (i = 0; i < header; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[1] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION);
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
toResample -= header;
/* Initialize the various cur values.
* cur_frac holds the fractional part of cur for 4 samples. Since the
* fractional part is a 32-bit unsigned value, steps can simply be
* added; the modulo that keeps only the fractional part is implicit.
* 0.5 is subtracted so the signed int-to-float conversion can be used
* (there is no unsigned convert); the 0.5 is added back later.
*/
cur_frac = vdupq_n_s32(
(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
);
int32_t __attribute__((aligned(16))) data[4] =
{
0,
(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK),
(uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK)
};
adder_frac = vld1q_s32(data);
cur_frac = vaddq_s32(cur_frac, adder_frac);
/* The various cur_scalar values are for the different samples
* (1, 2, 3 relative to the original cur_scalar = 0)
*/
cur_scalar_1 = cur_scalar + resampleStep;
cur_scalar_2 = cur_scalar + resampleStep * 2;
cur_scalar_3 = cur_scalar + resampleStep * 3;
dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION);
dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION);
dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION);
cur_scalar &= FIXED_FRACTION_MASK;
cur_scalar_1 &= FIXED_FRACTION_MASK;
cur_scalar_2 &= FIXED_FRACTION_MASK;
cur_scalar_3 &= FIXED_FRACTION_MASK;
/* Constants */
one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE);
half = vdupq_n_f32(0.5f);
adder_frac_loop = vdupq_n_s32(
(uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK)
);
tail = toResample % 4;
for (i = 0; i < toResample - tail; i += 4, resampleCache += 4)
{
/* current_next_0_1 and current_next_2_3 each hold 2 pairs of a sample
* and the sample after it; we separate them next.
*/
current_next_0_1 = vcombine_f32(
vld1_f32(dCache),
vld1_f32(dCache_1)
);
current_next_2_3 = vcombine_f32(
vld1_f32(dCache_2),
vld1_f32(dCache_3)
);
/* Unpack them to get separate current and next in 2 vectors. */
current = vuzp1q_f32(current_next_0_1, current_next_2_3);
next = vuzp2q_f32(current_next_0_1, current_next_2_3);
sub = vsubq_f32(next, current);
/* Convert the fractional part to float, then multiply to get the
* fractions out, then add back the 0.5 we subtracted before.
*/
cur_fixed = vaddq_f32(
vmulq_f32(
vcvtq_f32_s32(cur_frac),
one_over_fixed_one
),
half
);
mul = vmulq_f32(sub, cur_fixed);
res = vaddq_f32(current, mul);
/* Store back */
vst1q_f32(resampleCache, res);
/* Update dCaches for next iteration */
cur_scalar += resampleStep * 4;
cur_scalar_1 += resampleStep * 4;
cur_scalar_2 += resampleStep * 4;
cur_scalar_3 += resampleStep * 4;
dCache = dCache + (cur_scalar >> FIXED_PRECISION);
dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION);
dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION);
dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION);
cur_scalar &= FIXED_FRACTION_MASK;
cur_scalar_1 &= FIXED_FRACTION_MASK;
cur_scalar_2 &= FIXED_FRACTION_MASK;
cur_scalar_3 &= FIXED_FRACTION_MASK;
cur_frac = vaddq_s32(cur_frac, adder_frac_loop);
}
*resampleOffset += resampleStep * (toResample - tail);
/* This is the tail. */
for (i = 0; i < tail; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[1] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION);
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
}
void FAudio_INTERNAL_ResampleStereo_NEON(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t channels
) {
uint32_t i, header, tail;
uint64_t cur_scalar, cur_scalar_1;
float *dCache_1;
float32x4_t one_over_fixed_one, half, current, next, sub, cur_fixed, mul, res;
int32x4_t cur_frac, adder_frac, adder_frac_loop;
/* This is the header; the destination needs to be aligned to 16 bytes */
header = (16 - ((size_t) resampleCache) % 16) / 8;
if (header == 2)
{
header = 0;
}
cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
for (i = 0; i < header; i += 2)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[2] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
*resampleCache++ = (float) (
dCache[1] +
(dCache[3] - dCache[1]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION) * 2;
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
toResample -= header;
/* Initialize the various cur values.
* cur_frac holds the fractional part of cur.
* To avoid duplication, see the mono version for a thorough
* explanation.
*/
cur_frac = vdupq_n_s32(
(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
);
int32_t __attribute__((aligned(16))) data[4] =
{
0,
0,
(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
(uint32_t) (resampleStep & FIXED_FRACTION_MASK)
};
adder_frac = vld1q_s32(data);
cur_frac = vaddq_s32(cur_frac, adder_frac);
/* dCache_1 is the dCache pointer at the next resample position. */
cur_scalar_1 = cur_scalar + resampleStep;
dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2;
cur_scalar_1 &= FIXED_FRACTION_MASK;
one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE);
half = vdupq_n_f32(0.5f);
adder_frac_loop = vdupq_n_s32(
(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK)
);
tail = toResample % 2;
for (i = 0; i < toResample - tail; i += 2, resampleCache += 4)
{
/* current and next each hold 4 source sample points, producing 4
* destination resample points at the end. current holds the sample
* pair (both channels) for the first and second resample positions,
* while next holds the pair that follows each.
*/
current = vcombine_f32(
vld1_f32(dCache), /* A1B1 */
vld1_f32(dCache_1) /* A3B3 */
);
next = vcombine_f32(
vld1_f32(dCache + 2), /* A2B2 */
vld1_f32(dCache_1 + 2) /* A4B4 */
);
sub = vsubq_f32(next, current);
/* Add the 0.5 back.
* See the mono version for a more elaborate explanation.
*/
cur_fixed = vaddq_f32(
vmulq_f32(
vcvtq_f32_s32(cur_frac),
one_over_fixed_one
),
half
);
mul = vmulq_f32(sub, cur_fixed);
res = vaddq_f32(current, mul);
/* Store the results */
vst1q_f32(resampleCache, res);
/* Update dCaches for next iteration */
cur_scalar += resampleStep * 2;
cur_scalar_1 += resampleStep * 2;
dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2;
dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2;
cur_scalar &= FIXED_FRACTION_MASK;
cur_scalar_1 &= FIXED_FRACTION_MASK;
cur_frac = vaddq_s32(cur_frac, adder_frac_loop);
}
*resampleOffset += resampleStep * (toResample - tail);
/* This is the tail. */
for (i = 0; i < tail; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[2] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
*resampleCache++ = (float) (
dCache[1] +
(dCache[3] - dCache[1]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION) * 2;
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
}
#endif /* HAVE_NEON_INTRINSICS */
/* SECTION 3: Amplifiers */
#if NEED_SCALAR_CONVERTER_FALLBACKS
void FAudio_INTERNAL_Amplify_Scalar(
float* output,
uint32_t totalSamples,
float volume
) {
uint32_t i;
for (i = 0; i < totalSamples; i += 1)
{
output[i] *= volume;
}
}
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */
/* The SSE2 version of the amplifier comes from @8thMage! */
#if HAVE_SSE2_INTRINSICS
void FAudio_INTERNAL_Amplify_SSE2(
float* output,
uint32_t totalSamples,
float volume
) {
uint32_t i;
uint32_t header = (16 - (((size_t) output) % 16)) / 4;
uint32_t tail = (totalSamples - header) % 4;
__m128 volumeVec, outVec;
if (header == 4)
{
header = 0;
}
if (tail == 4)
{
tail = 0;
}
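/* For example, if output is only 8-byte aligned, header is 2: two
* samples go through the scalar loop below, after which output + i is
* 16-byte aligned for the _mm_load_ps/_mm_store_ps loop.
*/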
for (i = 0; i < header; i += 1)
{
output[i] *= volume;
}
volumeVec = _mm_set1_ps(volume);
for (i = header; i < totalSamples - tail; i += 4)
{
outVec = _mm_load_ps(output + i);
outVec = _mm_mul_ps(outVec, volumeVec);
_mm_store_ps(output + i, outVec);
}
for (i = totalSamples - tail; i < totalSamples; i += 1)
{
output[i] *= volume;
}
}
#endif /* HAVE_SSE2_INTRINSICS */
#if HAVE_NEON_INTRINSICS
void FAudio_INTERNAL_Amplify_NEON(
float* output,
uint32_t totalSamples,
float volume
) {
uint32_t i;
uint32_t header = (16 - (((size_t) output) % 16)) / 4;
uint32_t tail = (totalSamples - header) % 4;
float32x4_t volumeVec, outVec;
if (header == 4)
{
header = 0;
}
if (tail == 4)
{
tail = 0;
}
for (i = 0; i < header; i += 1)
{
output[i] *= volume;
}
volumeVec = vdupq_n_f32(volume);
for (i = header; i < totalSamples - tail; i += 4)
{
outVec = vld1q_f32(output + i);
outVec = vmulq_f32(outVec, volumeVec);
vst1q_f32(output + i, outVec);
}
for (i = totalSamples - tail; i < totalSamples; i += 1)
{
output[i] *= volume;
}
}
#endif /* HAVE_NEON_INTRINSICS */
/* SECTION 4: Mixer Functions */
void FAudio_INTERNAL_Mix_Generic_Scalar(
uint32_t toMix,
uint32_t srcChans,
uint32_t dstChans,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i, co, ci;
for (i = 0; i < toMix; i += 1, src += srcChans, dst += dstChans)
for (co = 0; co < dstChans; co += 1)
{
for (ci = 0; ci < srcChans; ci += 1)
{
dst[co] += (
src[ci] *
channelVolume[ci] *
baseVolume *
coefficients[co * srcChans + ci]
);
}
}
}
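/* The coefficients array above is a dstChans x srcChans matrix in
* row-major order (indexed as coefficients[co * srcChans + ci]). As an
* illustrative example, a plain stereo-to-mono downmix could use
*
*     float coefficients[1 * 2] = { 0.5f, 0.5f };
*
* so each output frame accumulates 0.5f * src[0] + 0.5f * src[1],
* further scaled by baseVolume and the per-channel volumes.
*/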
void FAudio_INTERNAL_Mix_1in_1out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolume = baseVolume * channelVolume[0] * coefficients[0];
for (i = 0; i < toMix; i += 1, src += 1, dst += 1)
{
/* Base source data, combined with the coefficients */
dst[0] += src[0] * totalVolume;
}
}
void FAudio_INTERNAL_Mix_1in_2out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolume = baseVolume * channelVolume[0];
for (i = 0; i < toMix; i += 1, src += 1, dst += 2)
{
/* Base source data... */
const float sample = src[0] * totalVolume;
/* ... combined with the coefficients. */
dst[0] += sample * coefficients[0];
dst[1] += sample * coefficients[1];
}
}
void FAudio_INTERNAL_Mix_1in_6out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolume = baseVolume * channelVolume[0];
for (i = 0; i < toMix; i += 1, src += 1, dst += 6)
{
/* Base source data... */
const float sample = src[0] * totalVolume;
/* ... combined with the coefficients. */
dst[0] += sample * coefficients[0];
dst[1] += sample * coefficients[1];
dst[2] += sample * coefficients[2];
dst[3] += sample * coefficients[3];
dst[4] += sample * coefficients[4];
dst[5] += sample * coefficients[5];
}
}
void FAudio_INTERNAL_Mix_1in_8out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolume = baseVolume * channelVolume[0];
for (i = 0; i < toMix; i += 1, src += 1, dst += 8)
{
/* Base source data... */
const float sample = src[0] * totalVolume;
/* ... combined with the coefficients. */
dst[0] += sample * coefficients[0];
dst[1] += sample * coefficients[1];
dst[2] += sample * coefficients[2];
dst[3] += sample * coefficients[3];
dst[4] += sample * coefficients[4];
dst[5] += sample * coefficients[5];
dst[6] += sample * coefficients[6];
dst[7] += sample * coefficients[7];
}
}
void FAudio_INTERNAL_Mix_2in_1out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolumeL = baseVolume * channelVolume[0] * coefficients[0];
float totalVolumeR = baseVolume * channelVolume[1] * coefficients[1];
for (i = 0; i < toMix; i += 1, src += 2, dst += 1)
{
/* Base source data, combined with the coefficients */
dst[0] += (
(src[0] * totalVolumeL) +
(src[1] * totalVolumeR)
);
}
}
void FAudio_INTERNAL_Mix_2in_2out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolumeL = baseVolume * channelVolume[0];
float totalVolumeR = baseVolume * channelVolume[1];
for (i = 0; i < toMix; i += 1, src += 2, dst += 2)
{
/* Base source data... */
const float left = src[0] * totalVolumeL;
const float right = src[1] * totalVolumeR;
/* ... combined with the coefficients. */
dst[0] += (
(left * coefficients[0]) +
(right * coefficients[1])
);
dst[1] += (
(left * coefficients[2]) +
(right * coefficients[3])
);
}
}
void FAudio_INTERNAL_Mix_2in_6out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolumeL = baseVolume * channelVolume[0];
float totalVolumeR = baseVolume * channelVolume[1];
for (i = 0; i < toMix; i += 1, src += 2, dst += 6)
{
/* Base source data... */
const float left = src[0] * totalVolumeL;
const float right = src[1] * totalVolumeR;
/* ... combined with the coefficients. */
dst[0] += (
(left * coefficients[0]) +
(right * coefficients[1])
);
dst[1] += (
(left * coefficients[2]) +
(right * coefficients[3])
);
dst[2] += (
(left * coefficients[4]) +
(right * coefficients[5])
);
dst[3] += (
(left * coefficients[6]) +
(right * coefficients[7])
);
dst[4] += (
(left * coefficients[8]) +
(right * coefficients[9])
);
dst[5] += (
(left * coefficients[10]) +
(right * coefficients[11])
);
}
}
void FAudio_INTERNAL_Mix_2in_8out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolumeL = baseVolume * channelVolume[0];
float totalVolumeR = baseVolume * channelVolume[1];
for (i = 0; i < toMix; i += 1, src += 2, dst += 8)
{
/* Base source data... */
const float left = src[0] * totalVolumeL;
const float right = src[1] * totalVolumeR;
/* ... combined with the coefficients. */
dst[0] += (
(left * coefficients[0]) +
(right * coefficients[1])
);
dst[1] += (
(left * coefficients[2]) +
(right * coefficients[3])
);
dst[2] += (
(left * coefficients[4]) +
(right * coefficients[5])
);
dst[3] += (
(left * coefficients[6]) +
(right * coefficients[7])
);
dst[4] += (
(left * coefficients[8]) +
(right * coefficients[9])
);
dst[5] += (
(left * coefficients[10]) +
(right * coefficients[11])
);
dst[6] += (
(left * coefficients[12]) +
(right * coefficients[13])
);
dst[7] += (
(left * coefficients[14]) +
(right * coefficients[15])
);
}
}
/* SECTION 5: InitSIMDFunctions. Assigns based on SSE2/NEON support. */
void (*FAudio_INTERNAL_Convert_U8_To_F32)(
const uint8_t *restrict src,
float *restrict dst,
uint32_t len
);
void (*FAudio_INTERNAL_Convert_S16_To_F32)(
const int16_t *restrict src,
float *restrict dst,
uint32_t len
);
void (*FAudio_INTERNAL_Convert_S32_To_F32)(
const int32_t *restrict src,
float *restrict dst,
uint32_t len
);
FAudioResampleCallback FAudio_INTERNAL_ResampleMono;
FAudioResampleCallback FAudio_INTERNAL_ResampleStereo;
void (*FAudio_INTERNAL_Amplify)(
float *output,
uint32_t totalSamples,
float volume
);
void FAudio_INTERNAL_InitSIMDFunctions(uint8_t hasSSE2, uint8_t hasNEON)
{
#if HAVE_SSE2_INTRINSICS
if (hasSSE2)
{
FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_SSE2;
FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_SSE2;
FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_SSE2;
FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_SSE2;
FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_SSE2;
FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_SSE2;
return;
}
#endif
#if HAVE_NEON_INTRINSICS
if (hasNEON)
{
FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_NEON;
FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_NEON;
FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_NEON;
FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_NEON;
FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_NEON;
FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_NEON;
return;
}
#endif
#if NEED_SCALAR_CONVERTER_FALLBACKS
FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_Scalar;
FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_Scalar;
FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_Scalar;
FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_Scalar;
FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_Scalar;
FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_Scalar;
#else
FAudio_assert(0 && "Need converter functions!");
#endif
}
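/* As a sketch of the expected call site: the platform layer performs
* the CPU detection and passes the results in, e.g. with SDL this
* would look like
*
*     FAudio_INTERNAL_InitSIMDFunctions(SDL_HasSSE2(), SDL_HasNEON());
*/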
/* vim: set noexpandtab shiftwidth=8 tabstop=8: */