Commit Description:
Add timers for Simulation and various engines. Starting to add additional timers for different stages of the update process in order to get more insight into what is slowing it down. The update takes 9ms, which is much longer than it used to. Engine-specific timers are coming later.
File last commit: Early working version (including all dependencies, lol).
FNA/lib/FAudio/src/FAudio_internal_simd.c
1626 lines | 46.6 KiB | text/x-c
/* FAudio - XAudio Reimplementation for FNA
*
* Copyright (c) 2011-2020 Ethan Lee, Luigi Auriemma, and the MonoGame Team
*
* This software is provided 'as-is', without any express or implied warranty.
* In no event will the authors be held liable for any damages arising from
* the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software in a
* product, an acknowledgment in the product documentation would be
* appreciated but is not required.
*
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
*
* 3. This notice may not be removed or altered from any source distribution.
*
* Ethan "flibitijibibo" Lee <flibitijibibo@flibitijibibo.com>
*
*/
#include "FAudio_internal.h"
/* SECTION 0: SSE/NEON Detection */
/* The SSE/NEON detection comes from MojoAL:
* https://hg.icculus.org/icculus/mojoAL/file/default/mojoal.c
*/
#if defined(__x86_64__) || defined(_M_X64)
/* Some platforms fail to define this... */
#ifndef __SSE2__
#define __SSE2__ 1
#endif
/* x86_64 guarantees SSE2. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#elif defined(__aarch64__) || defined(_M_ARM64)
/* Some platforms fail to define this... */
#ifndef __ARM_NEON__
#define __ARM_NEON__ 1
#endif
/* AArch64 guarantees NEON. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#elif __MACOSX__
/* Some build systems may need to specify this. Also, macOS ARM? Sigh */
#ifndef __SSE2__
#error macOS does not have SSE2? Bad compiler? They actually moved to ARM?!
#endif
/* Mac OS X/Intel guarantees SSE2. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#else
/* Need plain C implementations to support all other hardware */
#define NEED_SCALAR_CONVERTER_FALLBACKS 1
#endif
/* Our NEON paths require AArch64, don't check __ARM_NEON__ here */
#if defined(__aarch64__) || defined(_M_ARM64)
#include <arm_neon.h>
#define HAVE_NEON_INTRINSICS 1
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#define HAVE_SSE2_INTRINSICS 1
#endif
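/* Note that the checks above only establish compile-time availability of
 * the intrinsics; the actual choice between the SSE2, NEON, and scalar
 * paths happens at runtime, in FAudio_INTERNAL_InitSIMDFunctions below
 * (SECTION 5).
 */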
/* SECTION 1: Type Converters */
/* The SSE/NEON converters are based on SDL_audiotypecvt:
* https://hg.libsdl.org/SDL/file/default/src/audio/SDL_audiotypecvt.c
*/
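/* Note that the U8 and S16 SIMD converters below walk their buffers from
 * the end toward the start: the float output is wider than the integer
 * input, so (as in SDL) this presumably lets dst share a buffer with src
 * without clobbering samples that have not been converted yet. The
 * same-width S32 converters walk forward as usual.
 */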
#define DIVBY128 0.0078125f
#define DIVBY32768 0.000030517578125f
#define DIVBY8388607 0.00000011920930376163766f
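/* These constants scale integer PCM to the [-1.0f, 1.0f] float range:
 * DIVBY128 is 1/128 (uint8 samples are scaled, then shifted down by 1.0f
 * so that 128 maps to 0.0f), DIVBY32768 is 1/32768 (int16 full scale),
 * and DIVBY8388607 is 1/8388607 (int24 full scale; the S32 input is
 * shifted right by 8 bits before scaling).
 */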
#if NEED_SCALAR_CONVERTER_FALLBACKS
void FAudio_INTERNAL_Convert_U8_To_F32_Scalar(
const uint8_t *restrict src,
float *restrict dst,
uint32_t len
) {
uint32_t i;
for (i = 0; i < len; i += 1)
{
*dst++ = (*src++ * DIVBY128) - 1.0f;
}
}
void FAudio_INTERNAL_Convert_S16_To_F32_Scalar(
const int16_t *restrict src,
float *restrict dst,
uint32_t len
) {
uint32_t i;
for (i = 0; i < len; i += 1)
{
*dst++ = *src++ * DIVBY32768;
}
}
void FAudio_INTERNAL_Convert_S32_To_F32_Scalar(
const int32_t *restrict src,
float *restrict dst,
uint32_t len
) {
uint32_t i;
for (i = 0; i < len; i += 1)
{
*dst++ = (*src++ >> 8) * DIVBY8388607;
}
}
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */
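/* A minimal usage sketch (not part of FAudio, kept disabled): pushing a
 * short int16 buffer through the scalar converter. The function name and
 * values are illustrative only.
 */
#if 0
static void FAudioExample_ConvertS16(void)
{
int16_t pcm[4] = { -32768, -16384, 16384, 32767 };
float out[4];
FAudio_INTERNAL_Convert_S16_To_F32_Scalar(pcm, out, 4);
/* out is now { -1.0f, -0.5f, 0.5f, 0.999969f }, i.e. each sample
 * scaled by DIVBY32768.
 */
}
#endif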
#if HAVE_SSE2_INTRINSICS
void FAudio_INTERNAL_Convert_U8_To_F32_SSE2(
const uint8_t *restrict src,
float *restrict dst,
uint32_t len
) {
int i;
src += len - 1;
dst += len - 1;
/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
for (i = len; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
*dst = (((float) *src) * DIVBY128) - 1.0f;
}
src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */
FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
/* Make sure src is aligned too. */
if ((((size_t) src) & 15) == 0) {
/* Aligned! Do SSE blocks as long as we have 16 bytes available. */
const __m128i *mmsrc = (const __m128i *) src;
const __m128i zero = _mm_setzero_si128();
const __m128 divby128 = _mm_set1_ps(DIVBY128);
const __m128 minus1 = _mm_set1_ps(-1.0f);
while (i >= 16) { /* 16 * 8-bit */
const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 uint8 into an XMM register. */
/* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
/* right-shift-zero-extend gets us uint16 with the other set of values. */
const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
/* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
/* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
/* Interleave back into correct order, store. */
_mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
_mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
_mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
_mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
i -= 16; mmsrc--; dst -= 16;
}
src = (const uint8_t *) mmsrc;
}
src += 15; dst += 15; /* adjust for any scalar finishing. */
/* Finish off any leftovers with scalar operations. */
while (i) {
*dst = (((float) *src) * DIVBY128) - 1.0f;
i--; src--; dst--;
}
}
void FAudio_INTERNAL_Convert_S16_To_F32_SSE2(
const int16_t *restrict src,
float *restrict dst,
uint32_t len
) {
int i;
src += len - 1;
dst += len - 1;
/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
for (i = len; i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
*dst = ((float) *src) * DIVBY32768;
}
src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */
FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
/* Make sure src is aligned too. */
if ((((size_t) src) & 15) == 0) {
/* Aligned! Do SSE blocks as long as we have 16 bytes available. */
const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
while (i >= 8) { /* 8 * 16-bit */
const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */
/* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
/* right-shift-sign-extend gets us sint32 with the other set of values. */
const __m128i b = _mm_srai_epi32(ints, 16);
/* Interleave these back into the right order, convert to float, multiply, store. */
_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
_mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
i -= 8; src -= 8; dst -= 8;
}
}
src += 7; dst += 7; /* adjust for any scalar finishing. */
/* Finish off any leftovers with scalar operations. */
while (i) {
*dst = ((float) *src) * DIVBY32768;
i--; src--; dst--;
}
}
void FAudio_INTERNAL_Convert_S32_To_F32_SSE2(
const int32_t *restrict src,
float *restrict dst,
uint32_t len
) {
int i;
/* Get dst aligned to 16 bytes */
for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst) {
*dst = ((float) (*src>>8)) * DIVBY8388607;
}
FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
/* Make sure src is aligned too. */
if ((((size_t) src) & 15) == 0) {
/* Aligned! Do SSE blocks as long as we have 16 bytes available. */
const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
const __m128i *mmsrc = (const __m128i *) src;
while (i >= 4) { /* 4 * sint32 */
/* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
i -= 4; mmsrc++; dst += 4;
}
src = (const int32_t *) mmsrc;
}
/* Finish off any leftovers with scalar operations. */
while (i) {
*dst = ((float) (*src>>8)) * DIVBY8388607;
i--; src++; dst++;
}
}
#endif /* HAVE_SSE2_INTRINSICS */
#if HAVE_NEON_INTRINSICS
void FAudio_INTERNAL_Convert_U8_To_F32_NEON(
const uint8_t *restrict src,
float *restrict dst,
uint32_t len
) {
int i;
src += len - 1;
dst += len - 1;
/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
for (i = len; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
*dst = (((float) *src) * DIVBY128) - 1.0f;
}
src -= 15; dst -= 15; /* adjust to read NEON blocks from the start. */
FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
/* Make sure src is aligned too. */
if ((((size_t) src) & 15) == 0) {
/* Aligned! Do NEON blocks as long as we have 16 bytes available. */
const uint8_t *mmsrc = (const uint8_t *) src;
const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
const float32x4_t negone = vdupq_n_f32(-1.0f);
while (i >= 16) { /* 16 * 8-bit */
const uint8x16_t bytes = vld1q_u8(mmsrc); /* get 16 uint8 into a NEON register. */
const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes)); /* convert top 8 bytes to 8 uint16 */
const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes)); /* convert bottom 8 bytes to 8 uint16 */
/* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128));
vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128));
vst1q_f32(dst+8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128));
vst1q_f32(dst+12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128));
i -= 16; mmsrc -= 16; dst -= 16;
}
src = (const uint8_t *) mmsrc;
}
src += 15; dst += 15; /* adjust for any scalar finishing. */
/* Finish off any leftovers with scalar operations. */
while (i) {
*dst = (((float) *src) * DIVBY128) - 1.0f;
i--; src--; dst--;
}
}
void FAudio_INTERNAL_Convert_S16_To_F32_NEON(
const int16_t *restrict src,
float *restrict dst,
uint32_t len
) {
int i;
src += len - 1;
dst += len - 1;
/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
for (i = len; i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
*dst = ((float) *src) * DIVBY32768;
}
src -= 7; dst -= 7; /* adjust to read NEON blocks from the start. */
FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
/* Make sure src is aligned too. */
if ((((size_t) src) & 15) == 0) {
/* Aligned! Do NEON blocks as long as we have 16 bytes available. */
const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
while (i >= 8) { /* 8 * 16-bit */
const int16x8_t ints = vld1q_s16((int16_t const *) src); /* get 8 sint16 into a NEON register. */
/* split int16 to two int32, then convert to float, then multiply to normalize, store. */
vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768));
vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768));
i -= 8; src -= 8; dst -= 8;
}
}
src += 7; dst += 7; /* adjust for any scalar finishing. */
/* Finish off any leftovers with scalar operations. */
while (i) {
*dst = ((float) *src) * DIVBY32768;
i--; src--; dst--;
}
}
void FAudio_INTERNAL_Convert_S32_To_F32_NEON(
const int32_t *restrict src,
float *restrict dst,
uint32_t len
) {
int i;
/* Get dst aligned to 16 bytes */
for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst) {
*dst = ((float) (*src>>8)) * DIVBY8388607;
}
FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
/* Make sure src is aligned too. */
if ((((size_t) src) & 15) == 0) {
/* Aligned! Do NEON blocks as long as we have 16 bytes available. */
const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607);
const int32_t *mmsrc = (const int32_t *) src;
while (i >= 4) { /* 4 * sint32 */
/* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607));
i -= 4; mmsrc += 4; dst += 4;
}
src = (const int32_t *) mmsrc;
}
/* Finish off any leftovers with scalar operations. */
while (i) {
*dst = ((float) (*src>>8)) * DIVBY8388607;
i--; src++; dst++;
}
}
#endif /* HAVE_NEON_INTRINSICS */
/* SECTION 2: Linear Resamplers */
void FAudio_INTERNAL_ResampleGeneric(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t channels
) {
uint32_t i, j;
uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
for (i = 0; i < toResample; i += 1)
{
for (j = 0; j < channels; j += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[j] +
(dCache[j + channels] - dCache[j]) *
FIXED_TO_DOUBLE(cur)
);
}
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur >> FIXED_PRECISION) * channels;
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur &= FIXED_FRACTION_MASK;
}
}
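/* A worked example of the fixed-point stepping (assuming the 32.32
 * fixed-point helpers from FAudio_internal.h, with resampleStep derived
 * from srcRate / dstRate): upsampling 22050 Hz to 44100 Hz gives
 * resampleStep = FIXED_ONE / 2. Each output frame advances cur by 0.5,
 * so (cur >> FIXED_PRECISION) is 1 on every other frame and dCache
 * advances by one source frame every two output frames, with the lerp
 * weight alternating between 0.0 and 0.5.
 */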
#if NEED_SCALAR_CONVERTER_FALLBACKS
void FAudio_INTERNAL_ResampleMono_Scalar(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t UNUSED
) {
uint32_t i;
uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
for (i = 0; i < toResample; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[1] - dCache[0]) *
FIXED_TO_DOUBLE(cur)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur >> FIXED_PRECISION);
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur &= FIXED_FRACTION_MASK;
}
}
void FAudio_INTERNAL_ResampleStereo_Scalar(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t UNUSED
) {
uint32_t i;
uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
for (i = 0; i < toResample; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[2] - dCache[0]) *
FIXED_TO_DOUBLE(cur)
);
*resampleCache++ = (float) (
dCache[1] +
(dCache[3] - dCache[1]) *
FIXED_TO_DOUBLE(cur)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur >> FIXED_PRECISION) * 2;
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur &= FIXED_FRACTION_MASK;
}
}
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */
/* The SSE2 versions of the resamplers come from @8thMage! */
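/* All of the SIMD resamplers below share the same shape: a scalar header
 * loop that runs until resampleCache reaches 16-byte alignment, a
 * vectorized body that produces 4 output floats per iteration (4 mono
 * frames, or 2 stereo frames), and a scalar tail for whatever remains.
 */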
#if HAVE_SSE2_INTRINSICS
void FAudio_INTERNAL_ResampleMono_SSE2(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t UNUSED
) {
uint32_t i, header, tail;
uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3;
float *dCache_1, *dCache_2, *dCache_3;
uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
__m128 one_over_fixed_one, half, current_next_0_1, current_next_2_3,
current, next, sub, cur_fixed, mul, res;
__m128i cur_frac, adder_frac, adder_frac_loop;
/* This is the header; the destination (resampleCache) needs to be aligned to 16 bytes */
header = (16 - ((size_t) resampleCache) % 16) / 4;
if (header == 4)
{
header = 0;
}
for (i = 0; i < header; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[1] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION);
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
toResample -= header;
/* Initialising the various cur values.
 * cur_frac is the fractional part of cur for 4 samples. As the
 * fractional part is a 32-bit unsigned value, it can simply be added,
 * and the modulo operation that keeps only the fractional part is implicit.
 * The 0.5 is subtracted to allow converting signed values to float
 * (there is no unsigned convert); the 0.5 is added back later.
 */
cur_frac = _mm_set1_epi32(
(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
);
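/* Numeric check of the 0.5 trick (assuming a 32-bit fractional part): a
 * fraction of 0x40000000 (0.25) wraps to the signed value -0x40000000
 * after the subtraction; converting that to float and scaling by
 * 1/FIXED_ONE gives -0.25f, and adding the 0.5f back in the loop below
 * recovers 0.25f exactly.
 */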
adder_frac = _mm_setr_epi32(
0,
(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK),
(uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK)
);
cur_frac = _mm_add_epi32(cur_frac, adder_frac);
/* The various cur_scalar values are for the different samples
 * (1, 2, 3 relative to the original cur_scalar = 0)
 */
cur_scalar_1 = cur_scalar + resampleStep;
cur_scalar_2 = cur_scalar + resampleStep * 2;
cur_scalar_3 = cur_scalar + resampleStep * 3;
dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION);
dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION);
dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION);
cur_scalar &= FIXED_FRACTION_MASK;
cur_scalar_1 &= FIXED_FRACTION_MASK;
cur_scalar_2 &= FIXED_FRACTION_MASK;
cur_scalar_3 &= FIXED_FRACTION_MASK;
/* FIXME: These should be _mm_undefined_ps! */
current_next_0_1 = _mm_setzero_ps();
current_next_2_3 = _mm_setzero_ps();
/* Constants */
one_over_fixed_one = _mm_set1_ps(1.0f / FIXED_ONE);
half = _mm_set1_ps(0.5f);
adder_frac_loop = _mm_set1_epi32(
(uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK)
);
tail = toResample % 4;
for (i = 0; i < toResample - tail; i += 4, resampleCache += 4)
{
/* current_next_0_1 and current_next_2_3 each hold 2 pairs of a sample
 * and the sample after it; these need to be separated afterwards.
 */
current_next_0_1 = _mm_loadl_pi(current_next_0_1, (__m64*) dCache);
current_next_0_1 = _mm_loadh_pi(current_next_0_1, (__m64*) dCache_1);
current_next_2_3 = _mm_loadl_pi(current_next_2_3, (__m64*) dCache_2);
current_next_2_3 = _mm_loadh_pi(current_next_2_3, (__m64*) dCache_3);
/* Unpack them to have separate current and next in 2 vectors. */
current = _mm_shuffle_ps(current_next_0_1, current_next_2_3, 0x88); /* 0b1000 */
next = _mm_shuffle_ps(current_next_0_1, current_next_2_3, 0xdd); /* 0b1101 */
sub = _mm_sub_ps(next, current);
/* Convert the fractional part to float, then multiply to get the fractions
 * out, then add back the 0.5 we subtracted before.
 */
cur_fixed = _mm_add_ps(
_mm_mul_ps(
_mm_cvtepi32_ps(cur_frac),
one_over_fixed_one
),
half
);
mul = _mm_mul_ps(sub, cur_fixed);
res = _mm_add_ps(current, mul);
/* Store back */
_mm_store_ps(resampleCache, res);
/* Update dCaches for next iteration */
cur_scalar += resampleStep * 4;
cur_scalar_1 += resampleStep * 4;
cur_scalar_2 += resampleStep * 4;
cur_scalar_3 += resampleStep * 4;
dCache = dCache + (cur_scalar >> FIXED_PRECISION);
dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION);
dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION);
dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION);
cur_scalar &= FIXED_FRACTION_MASK;
cur_scalar_1 &= FIXED_FRACTION_MASK;
cur_scalar_2 &= FIXED_FRACTION_MASK;
cur_scalar_3 &= FIXED_FRACTION_MASK;
cur_frac = _mm_add_epi32(cur_frac, adder_frac_loop);
}
*resampleOffset += resampleStep * (toResample - tail);
/* This is the tail. */
for (i = 0; i < tail; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[1] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION);
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
}
void FAudio_INTERNAL_ResampleStereo_SSE2(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t UNUSED
) {
uint32_t i, header, tail;
uint64_t cur_scalar, cur_scalar_1;
float *dCache_1;
__m128 one_over_fixed_one, half, current_next_1, current_next_2,
current, next, sub, cur_fixed, mul, res;
__m128i cur_frac, adder_frac, adder_frac_loop;
/* This is the header; the destination (resampleCache) needs to be aligned to 16 bytes */
header = (16 - ((size_t) resampleCache) % 16) / 8;
if (header == 2)
{
header = 0;
}
cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
for (i = 0; i < header; i += 2)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[2] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
*resampleCache++ = (float) (
dCache[1] +
(dCache[3] - dCache[1]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION) * 2;
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
toResample -= header;
/* Initialising the various cur values.
 * cur_frac holds the fractional part of cur.
 * To avoid duplication, please see the mono part for a thorough
 * explanation.
 */
cur_frac = _mm_set1_epi32(
(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
);
adder_frac = _mm_setr_epi32(
0,
0,
(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
(uint32_t) (resampleStep & FIXED_FRACTION_MASK)
);
cur_frac = _mm_add_epi32(cur_frac, adder_frac);
/* dCache_1 is the dCache pointer for the next resample position. */
cur_scalar_1 = cur_scalar + resampleStep;
dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2;
cur_scalar_1 &= FIXED_FRACTION_MASK;
one_over_fixed_one = _mm_set1_ps(1.0f / FIXED_ONE);
half = _mm_set1_ps(0.5f);
adder_frac_loop = _mm_set1_epi32(
(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK)
);
tail = toResample % 2;
for (i = 0; i < toResample - tail; i += 2, resampleCache += 4)
{
/* current_next_1 and current_next_2 each hold 4 src sample points,
 * from which 4 dest resample points are produced at the end.
 * current_next_1 holds:
 * (current_ch_1, current_ch_2, next_ch_1, next_ch_2)
 * for the first resample position, while current_next_2 holds
 * the same for the 2nd resample position.
 */
current_next_1 = _mm_loadu_ps(dCache); /* A1B1A2B2 */
current_next_2 = _mm_loadu_ps(dCache_1); /* A3B3A4B4 */
/* Unpack them to get the current and the next in separate vectors. */
current = _mm_castpd_ps(
_mm_unpacklo_pd(
_mm_castps_pd(current_next_1),
_mm_castps_pd(current_next_2)
)
);
next = _mm_castpd_ps(
_mm_unpackhi_pd(
_mm_castps_pd(current_next_1),
_mm_castps_pd(current_next_2)
)
);
sub = _mm_sub_ps(next, current);
/* Add the 0.5 back.
 * See the mono version for a more elaborate explanation.
 */
cur_fixed = _mm_add_ps(
_mm_mul_ps(
_mm_cvtepi32_ps(cur_frac),
one_over_fixed_one
),
half
);
mul = _mm_mul_ps(sub, cur_fixed);
res = _mm_add_ps(current, mul);
/* Store the results */
_mm_store_ps(resampleCache, res);
/* Update dCaches for next iteration */
cur_scalar += resampleStep * 2;
cur_scalar_1 += resampleStep * 2;
dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2;
dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2;
cur_scalar &= FIXED_FRACTION_MASK;
cur_scalar_1 &= FIXED_FRACTION_MASK;
cur_frac = _mm_add_epi32(cur_frac, adder_frac_loop);
}
*resampleOffset += resampleStep * (toResample - tail);
/* This is the tail. */
for (i = 0; i < tail; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[2] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
*resampleCache++ = (float) (
dCache[1] +
(dCache[3] - dCache[1]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION) * 2;
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
}
#endif /* HAVE_SSE2_INTRINSICS */
#if HAVE_NEON_INTRINSICS
void FAudio_INTERNAL_ResampleMono_NEON(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t UNUSED
) {
uint32_t i, header, tail;
uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3;
float *dCache_1, *dCache_2, *dCache_3;
uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
float32x4_t one_over_fixed_one, half, current_next_0_1, current_next_2_3,
current, next, sub, cur_fixed, mul, res;
int32x4_t cur_frac, adder_frac, adder_frac_loop;
/* This is the header; the destination (resampleCache) needs to be aligned to 16 bytes */
header = (16 - ((size_t) resampleCache) % 16) / 4;
if (header == 4)
{
header = 0;
}
for (i = 0; i < header; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[1] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION);
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
toResample -= header;
/* Initialising the various cur values.
 * cur_frac is the fractional part of cur for 4 samples. As the
 * fractional part is a 32-bit unsigned value, it can simply be added,
 * and the modulo operation that keeps only the fractional part is implicit.
 * The 0.5 is subtracted to allow converting signed values to float
 * (there is no unsigned convert); the 0.5 is added back later.
 */
cur_frac = vdupq_n_s32(
(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
);
int32_t __attribute__((aligned(16))) data[4] =
{
0,
(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK),
(uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK)
};
adder_frac = vld1q_s32(data);
cur_frac = vaddq_s32(cur_frac, adder_frac);
/* The various cur_scalar values are for the different samples
 * (1, 2, 3 relative to the original cur_scalar = 0)
 */
cur_scalar_1 = cur_scalar + resampleStep;
cur_scalar_2 = cur_scalar + resampleStep * 2;
cur_scalar_3 = cur_scalar + resampleStep * 3;
dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION);
dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION);
dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION);
cur_scalar &= FIXED_FRACTION_MASK;
cur_scalar_1 &= FIXED_FRACTION_MASK;
cur_scalar_2 &= FIXED_FRACTION_MASK;
cur_scalar_3 &= FIXED_FRACTION_MASK;
/* Constants */
one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE);
half = vdupq_n_f32(0.5f);
adder_frac_loop = vdupq_n_s32(
(uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK)
);
tail = toResample % 4;
for (i = 0; i < toResample - tail; i += 4, resampleCache += 4)
{
/* current_next_0_1 and current_next_2_3 each hold 2 pairs of a sample
 * and the sample after it; these need to be separated afterwards.
 */
current_next_0_1 = vcombine_f32(
vld1_f32(dCache),
vld1_f32(dCache_1)
);
current_next_2_3 = vcombine_f32(
vld1_f32(dCache_2),
vld1_f32(dCache_3)
);
/* Unpack them to have separate current and next in 2 vectors. */
current = vuzp1q_f32(current_next_0_1, current_next_2_3);
next = vuzp2q_f32(current_next_0_1, current_next_2_3);
sub = vsubq_f32(next, current);
/* Convert the fractional part to float, then multiply to get the fractions
 * out, then add back the 0.5 we subtracted before.
 */
cur_fixed = vaddq_f32(
vmulq_f32(
vcvtq_f32_s32(cur_frac),
one_over_fixed_one
),
half
);
mul = vmulq_f32(sub, cur_fixed);
res = vaddq_f32(current, mul);
/* Store back */
vst1q_f32(resampleCache, res);
/* Update dCaches for next iteration */
cur_scalar += resampleStep * 4;
cur_scalar_1 += resampleStep * 4;
cur_scalar_2 += resampleStep * 4;
cur_scalar_3 += resampleStep * 4;
dCache = dCache + (cur_scalar >> FIXED_PRECISION);
dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION);
dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION);
dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION);
cur_scalar &= FIXED_FRACTION_MASK;
cur_scalar_1 &= FIXED_FRACTION_MASK;
cur_scalar_2 &= FIXED_FRACTION_MASK;
cur_scalar_3 &= FIXED_FRACTION_MASK;
cur_frac = vaddq_s32(cur_frac, adder_frac_loop);
}
*resampleOffset += resampleStep * (toResample - tail);
/* This is the tail. */
for (i = 0; i < tail; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[1] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION);
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
}
void FAudio_INTERNAL_ResampleStereo_NEON(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t UNUSED
) {
uint32_t i, header, tail;
uint64_t cur_scalar, cur_scalar_1;
float *dCache_1;
float32x4_t one_over_fixed_one, half, current, next, sub, cur_fixed, mul, res;
int32x4_t cur_frac, adder_frac, adder_frac_loop;
/* This is the header; the destination (resampleCache) needs to be aligned to 16 bytes */
header = (16 - ((size_t) resampleCache) % 16) / 8;
if (header == 2)
{
header = 0;
}
cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
for (i = 0; i < header; i += 2)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[2] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
*resampleCache++ = (float) (
dCache[1] +
(dCache[3] - dCache[1]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION) * 2;
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
toResample -= header;
/* Initialising the various cur values.
 * cur_frac holds the fractional part of cur.
 * To avoid duplication, please see the mono part for a thorough
 * explanation.
 */
cur_frac = vdupq_n_s32(
(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
);
int32_t __attribute__((aligned(16))) data[4] =
{
0,
0,
(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
(uint32_t) (resampleStep & FIXED_FRACTION_MASK)
};
adder_frac = vld1q_s32(data);
cur_frac = vaddq_s32(cur_frac, adder_frac);
/* dCache_1 is the dCache pointer for the next resample position. */
cur_scalar_1 = cur_scalar + resampleStep;
dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2;
cur_scalar_1 &= FIXED_FRACTION_MASK;
one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE);
half = vdupq_n_f32(0.5f);
adder_frac_loop = vdupq_n_s32(
(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK)
);
tail = toResample % 2;
for (i = 0; i < toResample - tail; i += 2, resampleCache += 4)
{
/* current_next_1 and current_next_2 each hold 4 src sample points,
 * from which 4 dest resample points are produced at the end.
 * current_next_1 holds:
 * (current_ch_1, current_ch_2, next_ch_1, next_ch_2)
 * for the first resample position, while current_next_2 holds
 * the same for the 2nd resample position.
 */
current = vcombine_f32(
vld1_f32(dCache), /* A1B1 */
vld1_f32(dCache_1) /* A3B3 */
);
next = vcombine_f32(
vld1_f32(dCache + 2), /* A2B2 */
vld1_f32(dCache_1 + 2) /* A4B4 */
);
sub = vsubq_f32(next, current);
/* Add the 0.5 back.
 * See the mono version for a more elaborate explanation.
 */
cur_fixed = vaddq_f32(
vmulq_f32(
vcvtq_f32_s32(cur_frac),
one_over_fixed_one
),
half
);
mul = vmulq_f32(sub, cur_fixed);
res = vaddq_f32(current, mul);
/* Store the results */
vst1q_f32(resampleCache, res);
/* Update dCaches for next iteration */
cur_scalar += resampleStep * 2;
cur_scalar_1 += resampleStep * 2;
dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2;
dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2;
cur_scalar &= FIXED_FRACTION_MASK;
cur_scalar_1 &= FIXED_FRACTION_MASK;
cur_frac = vaddq_s32(cur_frac, adder_frac_loop);
}
*resampleOffset += resampleStep * (toResample - tail);
/* This is the tail. */
for (i = 0; i < tail; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[2] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
*resampleCache++ = (float) (
dCache[1] +
(dCache[3] - dCache[1]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION) * 2;
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
}
#endif /* HAVE_NEON_INTRINSICS */
/* SECTION 3: Amplifiers */
#if NEED_SCALAR_CONVERTER_FALLBACKS
void FAudio_INTERNAL_Amplify_Scalar(
float* output,
uint32_t totalSamples,
float volume
) {
uint32_t i;
for (i = 0; i < totalSamples; i += 1)
{
output[i] *= volume;
}
}
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */
/* The SSE2 version of the amplifier comes from @8thMage! */
#if HAVE_SSE2_INTRINSICS
void FAudio_INTERNAL_Amplify_SSE2(
float* output,
uint32_t totalSamples,
float volume
) {
uint32_t i;
uint32_t header = (16 - (((size_t) output) % 16)) / 4;
uint32_t tail = (totalSamples - header) % 4;
__m128 volumeVec, outVec;
if (header == 4)
{
header = 0;
}
if (tail == 4)
{
tail = 0;
}
for (i = 0; i < header; i += 1)
{
output[i] *= volume;
}
volumeVec = _mm_set1_ps(volume);
for (i = header; i < totalSamples - tail; i += 4)
{
outVec = _mm_load_ps(output + i);
outVec = _mm_mul_ps(outVec, volumeVec);
_mm_store_ps(output + i, outVec);
}
for (i = totalSamples - tail; i < totalSamples; i += 1)
{
output[i] *= volume;
}
}
#endif /* HAVE_SSE2_INTRINSICS */
#if HAVE_NEON_INTRINSICS
void FAudio_INTERNAL_Amplify_NEON(
float* output,
uint32_t totalSamples,
float volume
) {
uint32_t i;
uint32_t header = (16 - (((size_t) output) % 16)) / 4;
uint32_t tail = (totalSamples - header) % 4;
float32x4_t volumeVec, outVec;
if (header == 4)
{
header = 0;
}
if (tail == 4)
{
tail = 0;
}
for (i = 0; i < header; i += 1)
{
output[i] *= volume;
}
volumeVec = vdupq_n_f32(volume);
for (i = header; i < totalSamples - tail; i += 4)
{
outVec = vld1q_f32(output + i);
outVec = vmulq_f32(outVec, volumeVec);
vst1q_f32(output + i, outVec);
}
for (i = totalSamples - tail; i < totalSamples; i += 1)
{
output[i] *= volume;
}
}
#endif /* HAVE_NEON_INTRINSICS */
/* SECTION 4: Mixer Functions */
void FAudio_INTERNAL_Mix_Generic_Scalar(
uint32_t toMix,
uint32_t srcChans,
uint32_t dstChans,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i, co, ci;
for (i = 0; i < toMix; i += 1, src += srcChans, dst += dstChans)
for (co = 0; co < dstChans; co += 1)
{
for (ci = 0; ci < srcChans; ci += 1)
{
dst[co] += (
src[ci] *
channelVolume[ci] *
baseVolume *
coefficients[co * srcChans + ci]
);
}
}
}
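/* The coefficient matrix is laid out row-major with one row per output
 * channel: coefficients[co * srcChans + ci] is the gain applied to input
 * channel ci when mixing into output channel co. For example, with
 * srcChans = 2 and dstChans = 2, a pass-through mix is { 1, 0, 0, 1 }:
 * dst left takes only src left, dst right takes only src right.
 */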
void FAudio_INTERNAL_Mix_1in_1out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolume = baseVolume * channelVolume[0] * coefficients[0];
for (i = 0; i < toMix; i += 1, src += 1, dst += 1)
{
/* Base source data, combined with the coefficients */
dst[0] += src[0] * totalVolume;
}
}
void FAudio_INTERNAL_Mix_1in_2out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolume = baseVolume * channelVolume[0];
for (i = 0; i < toMix; i += 1, src += 1, dst += 2)
{
/* Base source data... */
const float sample = src[0] * totalVolume;
/* ... combined with the coefficients. */
dst[0] += sample * coefficients[0];
dst[1] += sample * coefficients[1];
}
}
void FAudio_INTERNAL_Mix_1in_6out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolume = baseVolume * channelVolume[0];
for (i = 0; i < toMix; i += 1, src += 1, dst += 6)
{
/* Base source data... */
const float sample = src[0] * totalVolume;
/* ... combined with the coefficients. */
dst[0] += sample * coefficients[0];
dst[1] += sample * coefficients[1];
dst[2] += sample * coefficients[2];
dst[3] += sample * coefficients[3];
dst[4] += sample * coefficients[4];
dst[5] += sample * coefficients[5];
}
}
void FAudio_INTERNAL_Mix_1in_8out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolume = baseVolume * channelVolume[0];
for (i = 0; i < toMix; i += 1, src += 1, dst += 8)
{
/* Base source data... */
const float sample = src[0] * totalVolume;
/* ... combined with the coefficients. */
dst[0] += sample * coefficients[0];
dst[1] += sample * coefficients[1];
dst[2] += sample * coefficients[2];
dst[3] += sample * coefficients[3];
dst[4] += sample * coefficients[4];
dst[5] += sample * coefficients[5];
dst[6] += sample * coefficients[6];
dst[7] += sample * coefficients[7];
}
}
void FAudio_INTERNAL_Mix_2in_1out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolumeL = baseVolume * channelVolume[0] * coefficients[0];
float totalVolumeR = baseVolume * channelVolume[1] * coefficients[1];
for (i = 0; i < toMix; i += 1, src += 2, dst += 1)
{
/* Base source data, combined with the coefficients */
dst[0] += (
(src[0] * totalVolumeL) +
(src[1] * totalVolumeR)
);
}
}
void FAudio_INTERNAL_Mix_2in_2out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolumeL = baseVolume * channelVolume[0];
float totalVolumeR = baseVolume * channelVolume[1];
for (i = 0; i < toMix; i += 1, src += 2, dst += 2)
{
/* Base source data... */
const float left = src[0] * totalVolumeL;
const float right = src[1] * totalVolumeR;
/* ... combined with the coefficients. */
dst[0] += (
(left * coefficients[0]) +
(right * coefficients[1])
);
dst[1] += (
(left * coefficients[2]) +
(right * coefficients[3])
);
}
}
void FAudio_INTERNAL_Mix_2in_6out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolumeL = baseVolume * channelVolume[0];
float totalVolumeR = baseVolume * channelVolume[1];
for (i = 0; i < toMix; i += 1, src += 2, dst += 6)
{
/* Base source data... */
const float left = src[0] * totalVolumeL;
const float right = src[1] * totalVolumeR;
/* ... combined with the coefficients. */
dst[0] += (
(left * coefficients[0]) +
(right * coefficients[1])
);
dst[1] += (
(left * coefficients[2]) +
(right * coefficients[3])
);
dst[2] += (
(left * coefficients[4]) +
(right * coefficients[5])
);
dst[3] += (
(left * coefficients[6]) +
(right * coefficients[7])
);
dst[4] += (
(left * coefficients[8]) +
(right * coefficients[9])
);
dst[5] += (
(left * coefficients[10]) +
(right * coefficients[11])
);
}
}
void FAudio_INTERNAL_Mix_2in_8out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolumeL = baseVolume * channelVolume[0];
float totalVolumeR = baseVolume * channelVolume[1];
for (i = 0; i < toMix; i += 1, src += 2, dst += 8)
{
/* Base source data... */
const float left = src[0] * totalVolumeL;
const float right = src[1] * totalVolumeR;
/* ... combined with the coefficients. */
dst[0] += (
(left * coefficients[0]) +
(right * coefficients[1])
);
dst[1] += (
(left * coefficients[2]) +
(right * coefficients[3])
);
dst[2] += (
(left * coefficients[4]) +
(right * coefficients[5])
);
dst[3] += (
(left * coefficients[6]) +
(right * coefficients[7])
);
dst[4] += (
(left * coefficients[8]) +
(right * coefficients[9])
);
dst[5] += (
(left * coefficients[10]) +
(right * coefficients[11])
);
dst[6] += (
(left * coefficients[12]) +
(right * coefficients[13])
);
dst[7] += (
(left * coefficients[14]) +
(right * coefficients[15])
);
}
}
/* SECTION 5: InitSIMDFunctions. Assigns based on SSE2/NEON support. */
void (*FAudio_INTERNAL_Convert_U8_To_F32)(
const uint8_t *restrict src,
float *restrict dst,
uint32_t len
);
void (*FAudio_INTERNAL_Convert_S16_To_F32)(
const int16_t *restrict src,
float *restrict dst,
uint32_t len
);
void (*FAudio_INTERNAL_Convert_S32_To_F32)(
const int32_t *restrict src,
float *restrict dst,
uint32_t len
);
FAudioResampleCallback FAudio_INTERNAL_ResampleMono;
FAudioResampleCallback FAudio_INTERNAL_ResampleStereo;
void (*FAudio_INTERNAL_Amplify)(
float *output,
uint32_t totalSamples,
float volume
);
void FAudio_INTERNAL_InitSIMDFunctions(uint8_t hasSSE2, uint8_t hasNEON)
{
#if HAVE_SSE2_INTRINSICS
if (hasSSE2)
{
FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_SSE2;
FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_SSE2;
FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_SSE2;
FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_SSE2;
FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_SSE2;
FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_SSE2;
return;
}
#endif
#if HAVE_NEON_INTRINSICS
if (hasNEON)
{
FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_NEON;
FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_NEON;
FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_NEON;
FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_NEON;
FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_NEON;
FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_NEON;
return;
}
#endif
#if NEED_SCALAR_CONVERTER_FALLBACKS
FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_Scalar;
FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_Scalar;
FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_Scalar;
FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_Scalar;
FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_Scalar;
FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_Scalar;
#else
FAudio_assert(0 && "Need converter functions!");
#endif
}
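/* Usage sketch (the call site lives in the platform layer, not in this
 * file): something along the lines of
 *
 *     FAudio_INTERNAL_InitSIMDFunctions(SDL_HasSSE2(), SDL_HasNEON());
 *
 * run once at startup, after which all conversion, resampling, and
 * amplification goes through the function pointers above. SDL_HasSSE2 and
 * SDL_HasNEON stand in here for whatever runtime CPU detection the
 * platform provides.
 */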
/* vim: set noexpandtab shiftwidth=8 tabstop=8: */