Show More
Commit Description:
Tweak some settings...
Commit Description:
Tweak some settings
Make upkeep more expensive, planting/clearing cheaper, and large corporate
contracts take up more land.
References:
File last commit:
Show/Diff file:
Action:
FNA/lib/FAudio/src/FAudio_internal_simd.c
1626 lines | 46.6 KiB | text/x-c | CLexer
1626 lines | 46.6 KiB | text/x-c | CLexer
r0 | /* FAudio - XAudio Reimplementation for FNA | |||
* | ||||
* Copyright (c) 2011-2020 Ethan Lee, Luigi Auriemma, and the MonoGame Team | ||||
* | ||||
* This software is provided 'as-is', without any express or implied warranty. | ||||
* In no event will the authors be held liable for any damages arising from | ||||
* the use of this software. | ||||
* | ||||
* Permission is granted to anyone to use this software for any purpose, | ||||
* including commercial applications, and to alter it and redistribute it | ||||
* freely, subject to the following restrictions: | ||||
* | ||||
* 1. The origin of this software must not be misrepresented; you must not | ||||
* claim that you wrote the original software. If you use this software in a | ||||
* product, an acknowledgment in the product documentation would be | ||||
* appreciated but is not required. | ||||
* | ||||
* 2. Altered source versions must be plainly marked as such, and must not be | ||||
* misrepresented as being the original software. | ||||
* | ||||
* 3. This notice may not be removed or altered from any source distribution. | ||||
* | ||||
* Ethan "flibitijibibo" Lee <flibitijibibo@flibitijibibo.com> | ||||
* | ||||
*/ | ||||
#include "FAudio_internal.h" | ||||
/* SECTION 0: SSE/NEON Detection */ | ||||
/* The SSE/NEON detection comes from MojoAL: | ||||
* https://hg.icculus.org/icculus/mojoAL/file/default/mojoal.c | ||||
*/ | ||||
#if defined(__x86_64__) || defined(_M_X64) | ||||
/* Some platforms fail to define this... */ | ||||
#ifndef __SSE2__ | ||||
#define __SSE2__ 1 | ||||
#endif | ||||
/* x86_64 guarantees SSE2. */ | ||||
#define NEED_SCALAR_CONVERTER_FALLBACKS 0 | ||||
#elif defined(__aarch64__) || defined(_M_ARM64) | ||||
/* Some platforms fail to define this... */ | ||||
#ifndef __ARM_NEON__ | ||||
#define __ARM_NEON__ 1 | ||||
#endif | ||||
/* AArch64 guarantees NEON. */ | ||||
#define NEED_SCALAR_CONVERTER_FALLBACKS 0 | ||||
#elif __MACOSX__ | ||||
/* Some build systems may need to specify this. Also, macOS ARM? Sigh */ | ||||
#ifndef __SSE2__ | ||||
#error macOS does not have SSE2? Bad compiler? They actually moved to ARM?! | ||||
#endif | ||||
/* Mac OS X/Intel guarantees SSE2. */ | ||||
#define NEED_SCALAR_CONVERTER_FALLBACKS 0 | ||||
#else | ||||
/* Need plain C implementations to support all other hardware */ | ||||
#define NEED_SCALAR_CONVERTER_FALLBACKS 1 | ||||
#endif | ||||
/* Our NEON paths require AArch64, don't check __ARM_NEON__ here */ | ||||
#if defined(__aarch64__) || defined(_M_ARM64) | ||||
#include <arm_neon.h> | ||||
#define HAVE_NEON_INTRINSICS 1 | ||||
#endif | ||||
#ifdef __SSE2__ | ||||
#include <emmintrin.h> | ||||
#define HAVE_SSE2_INTRINSICS 1 | ||||
#endif | ||||
/* SECTION 1: Type Converters */ | ||||
/* The SSE/NEON converters are based on SDL_audiotypecvt: | ||||
* https://hg.libsdl.org/SDL/file/default/src/audio/SDL_audiotypecvt.c | ||||
*/ | ||||
#define DIVBY128 0.0078125f | ||||
#define DIVBY32768 0.000030517578125f | ||||
#define DIVBY8388607 0.00000011920930376163766f | ||||
#if NEED_SCALAR_CONVERTER_FALLBACKS | ||||
void FAudio_INTERNAL_Convert_U8_To_F32_Scalar( | ||||
const uint8_t *restrict src, | ||||
float *restrict dst, | ||||
uint32_t len | ||||
) { | ||||
uint32_t i; | ||||
for (i = 0; i < len; i += 1) | ||||
{ | ||||
*dst++ = (*src++ * DIVBY128) - 1.0f; | ||||
} | ||||
} | ||||
void FAudio_INTERNAL_Convert_S16_To_F32_Scalar( | ||||
const int16_t *restrict src, | ||||
float *restrict dst, | ||||
uint32_t len | ||||
) { | ||||
uint32_t i; | ||||
for (i = 0; i < len; i += 1) | ||||
{ | ||||
*dst++ = *src++ * DIVBY32768; | ||||
} | ||||
} | ||||
void FAudio_INTERNAL_Convert_S32_To_F32_Scalar( | ||||
const int32_t *restrict src, | ||||
float *restrict dst, | ||||
uint32_t len | ||||
) { | ||||
uint32_t i; | ||||
for (i = 0; i < len; i += 1) | ||||
{ | ||||
*dst++ = (*src++ >> 8) * DIVBY8388607; | ||||
} | ||||
} | ||||
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */ | ||||
#if HAVE_SSE2_INTRINSICS | ||||
void FAudio_INTERNAL_Convert_U8_To_F32_SSE2( | ||||
const uint8_t *restrict src, | ||||
float *restrict dst, | ||||
uint32_t len | ||||
) { | ||||
int i; | ||||
src += len - 1; | ||||
dst += len - 1; | ||||
/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ | ||||
for (i = len; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) { | ||||
*dst = (((float) *src) * DIVBY128) - 1.0f; | ||||
} | ||||
src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */ | ||||
FAudio_assert(!i || ((((size_t) dst) & 15) == 0)); | ||||
/* Make sure src is aligned too. */ | ||||
if ((((size_t) src) & 15) == 0) { | ||||
/* Aligned! Do SSE blocks as long as we have 16 bytes available. */ | ||||
const __m128i *mmsrc = (const __m128i *) src; | ||||
const __m128i zero = _mm_setzero_si128(); | ||||
const __m128 divby128 = _mm_set1_ps(DIVBY128); | ||||
const __m128 minus1 = _mm_set1_ps(-1.0f); | ||||
while (i >= 16) { /* 16 * 8-bit */ | ||||
const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 uint8 into an XMM register. */ | ||||
/* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */ | ||||
const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8); | ||||
/* right-shift-zero-extend gets us uint16 with the other set of values. */ | ||||
const __m128i shorts2 = _mm_srli_epi16(bytes, 8); | ||||
/* unpack against zero to make these int32, convert to float, multiply, add. Whew! */ | ||||
/* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */ | ||||
const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1); | ||||
const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1); | ||||
const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1); | ||||
const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1); | ||||
/* Interleave back into correct order, store. */ | ||||
_mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2)); | ||||
_mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2)); | ||||
_mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4)); | ||||
_mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4)); | ||||
i -= 16; mmsrc--; dst -= 16; | ||||
} | ||||
src = (const uint8_t *) mmsrc; | ||||
} | ||||
src += 15; dst += 15; /* adjust for any scalar finishing. */ | ||||
/* Finish off any leftovers with scalar operations. */ | ||||
while (i) { | ||||
*dst = (((float) *src) * DIVBY128) - 1.0f; | ||||
i--; src--; dst--; | ||||
} | ||||
} | ||||
void FAudio_INTERNAL_Convert_S16_To_F32_SSE2( | ||||
const int16_t *restrict src, | ||||
float *restrict dst, | ||||
uint32_t len | ||||
) { | ||||
int i; | ||||
src += len - 1; | ||||
dst += len - 1; | ||||
/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ | ||||
for (i = len; i && (((size_t) (dst-7)) & 15); --i, --src, --dst) { | ||||
*dst = ((float) *src) * DIVBY32768; | ||||
} | ||||
src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */ | ||||
FAudio_assert(!i || ((((size_t) dst) & 15) == 0)); | ||||
/* Make sure src is aligned too. */ | ||||
if ((((size_t) src) & 15) == 0) { | ||||
/* Aligned! Do SSE blocks as long as we have 16 bytes available. */ | ||||
const __m128 divby32768 = _mm_set1_ps(DIVBY32768); | ||||
while (i >= 8) { /* 8 * 16-bit */ | ||||
const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */ | ||||
/* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */ | ||||
const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16); | ||||
/* right-shift-sign-extend gets us sint32 with the other set of values. */ | ||||
const __m128i b = _mm_srai_epi32(ints, 16); | ||||
/* Interleave these back into the right order, convert to float, multiply, store. */ | ||||
_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768)); | ||||
_mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768)); | ||||
i -= 8; src -= 8; dst -= 8; | ||||
} | ||||
} | ||||
src += 7; dst += 7; /* adjust for any scalar finishing. */ | ||||
/* Finish off any leftovers with scalar operations. */ | ||||
while (i) { | ||||
*dst = ((float) *src) * DIVBY32768; | ||||
i--; src--; dst--; | ||||
} | ||||
} | ||||
void FAudio_INTERNAL_Convert_S32_To_F32_SSE2( | ||||
const int32_t *restrict src, | ||||
float *restrict dst, | ||||
uint32_t len | ||||
) { | ||||
int i; | ||||
/* Get dst aligned to 16 bytes */ | ||||
for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst) { | ||||
*dst = ((float) (*src>>8)) * DIVBY8388607; | ||||
} | ||||
FAudio_assert(!i || ((((size_t) dst) & 15) == 0)); | ||||
/* Make sure src is aligned too. */ | ||||
if ((((size_t) src) & 15) == 0) { | ||||
/* Aligned! Do SSE blocks as long as we have 16 bytes available. */ | ||||
const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607); | ||||
const __m128i *mmsrc = (const __m128i *) src; | ||||
while (i >= 4) { /* 4 * sint32 */ | ||||
/* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */ | ||||
_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607)); | ||||
i -= 4; mmsrc++; dst += 4; | ||||
} | ||||
src = (const int32_t *) mmsrc; | ||||
} | ||||
/* Finish off any leftovers with scalar operations. */ | ||||
while (i) { | ||||
*dst = ((float) (*src>>8)) * DIVBY8388607; | ||||
i--; src++; dst++; | ||||
} | ||||
} | ||||
#endif /* HAVE_SSE2_INTRINSICS */ | ||||
#if HAVE_NEON_INTRINSICS | ||||
void FAudio_INTERNAL_Convert_U8_To_F32_NEON( | ||||
const uint8_t *restrict src, | ||||
float *restrict dst, | ||||
uint32_t len | ||||
) { | ||||
int i; | ||||
src += len - 1; | ||||
dst += len - 1; | ||||
/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ | ||||
for (i = len; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) { | ||||
*dst = (((float) *src) * DIVBY128) - 1.0f; | ||||
} | ||||
src -= 15; dst -= 15; /* adjust to read NEON blocks from the start. */ | ||||
FAudio_assert(!i || ((((size_t) dst) & 15) == 0)); | ||||
/* Make sure src is aligned too. */ | ||||
if ((((size_t) src) & 15) == 0) { | ||||
/* Aligned! Do NEON blocks as long as we have 16 bytes available. */ | ||||
const uint8_t *mmsrc = (const uint8_t *) src; | ||||
const float32x4_t divby128 = vdupq_n_f32(DIVBY128); | ||||
const float32x4_t negone = vdupq_n_f32(-1.0f); | ||||
while (i >= 16) { /* 16 * 8-bit */ | ||||
const uint8x16_t bytes = vld1q_u8(mmsrc); /* get 16 uint8 into a NEON register. */ | ||||
const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes)); /* convert top 8 bytes to 8 uint16 */ | ||||
const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes)); /* convert bottom 8 bytes to 8 uint16 */ | ||||
/* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */ | ||||
vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128)); | ||||
vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128)); | ||||
vst1q_f32(dst+8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128)); | ||||
vst1q_f32(dst+12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128)); | ||||
i -= 16; mmsrc -= 16; dst -= 16; | ||||
} | ||||
src = (const uint8_t *) mmsrc; | ||||
} | ||||
src += 15; dst += 15; /* adjust for any scalar finishing. */ | ||||
/* Finish off any leftovers with scalar operations. */ | ||||
while (i) { | ||||
*dst = (((float) *src) * DIVBY128) - 1.0f; | ||||
i--; src--; dst--; | ||||
} | ||||
} | ||||
void FAudio_INTERNAL_Convert_S16_To_F32_NEON( | ||||
const int16_t *restrict src, | ||||
float *restrict dst, | ||||
uint32_t len | ||||
) { | ||||
int i; | ||||
src += len - 1; | ||||
dst += len - 1; | ||||
/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */ | ||||
for (i = len; i && (((size_t) (dst-7)) & 15); --i, --src, --dst) { | ||||
*dst = ((float) *src) * DIVBY32768; | ||||
} | ||||
src -= 7; dst -= 7; /* adjust to read NEON blocks from the start. */ | ||||
FAudio_assert(!i || ((((size_t) dst) & 15) == 0)); | ||||
/* Make sure src is aligned too. */ | ||||
if ((((size_t) src) & 15) == 0) { | ||||
/* Aligned! Do NEON blocks as long as we have 16 bytes available. */ | ||||
const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768); | ||||
while (i >= 8) { /* 8 * 16-bit */ | ||||
const int16x8_t ints = vld1q_s16((int16_t const *) src); /* get 8 sint16 into a NEON register. */ | ||||
/* split int16 to two int32, then convert to float, then multiply to normalize, store. */ | ||||
vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768)); | ||||
vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768)); | ||||
i -= 8; src -= 8; dst -= 8; | ||||
} | ||||
} | ||||
src += 7; dst += 7; /* adjust for any scalar finishing. */ | ||||
/* Finish off any leftovers with scalar operations. */ | ||||
while (i) { | ||||
*dst = ((float) *src) * DIVBY32768; | ||||
i--; src--; dst--; | ||||
} | ||||
} | ||||
void FAudio_INTERNAL_Convert_S32_To_F32_NEON( | ||||
const int32_t *restrict src, | ||||
float *restrict dst, | ||||
uint32_t len | ||||
) { | ||||
int i; | ||||
/* Get dst aligned to 16 bytes */ | ||||
for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst) { | ||||
*dst = ((float) (*src>>8)) * DIVBY8388607; | ||||
} | ||||
FAudio_assert(!i || ((((size_t) dst) & 15) == 0)); | ||||
/* Make sure src is aligned too. */ | ||||
if ((((size_t) src) & 15) == 0) { | ||||
/* Aligned! Do NEON blocks as long as we have 16 bytes available. */ | ||||
const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607); | ||||
const int32_t *mmsrc = (const int32_t *) src; | ||||
while (i >= 4) { /* 4 * sint32 */ | ||||
/* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */ | ||||
vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607)); | ||||
i -= 4; mmsrc += 4; dst += 4; | ||||
} | ||||
src = (const int32_t *) mmsrc; | ||||
} | ||||
/* Finish off any leftovers with scalar operations. */ | ||||
while (i) { | ||||
*dst = ((float) (*src>>8)) * DIVBY8388607; | ||||
i--; src++; dst++; | ||||
} | ||||
} | ||||
#endif /* HAVE_NEON_INTRINSICS */ | ||||
/* SECTION 2: Linear Resamplers */ | ||||
void FAudio_INTERNAL_ResampleGeneric( | ||||
float *restrict dCache, | ||||
float *restrict resampleCache, | ||||
uint64_t *resampleOffset, | ||||
uint64_t resampleStep, | ||||
uint64_t toResample, | ||||
uint8_t channels | ||||
) { | ||||
uint32_t i, j; | ||||
uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK; | ||||
for (i = 0; i < toResample; i += 1) | ||||
{ | ||||
for (j = 0; j < channels; j += 1) | ||||
{ | ||||
/* lerp, then convert to float value */ | ||||
*resampleCache++ = (float) ( | ||||
dCache[j] + | ||||
(dCache[j + channels] - dCache[j]) * | ||||
FIXED_TO_DOUBLE(cur) | ||||
); | ||||
} | ||||
/* Increment fraction offset by the stepping value */ | ||||
*resampleOffset += resampleStep; | ||||
cur += resampleStep; | ||||
/* Only increment the sample offset by integer values. | ||||
* Sometimes this will be 0 until cur accumulates | ||||
* enough steps, especially for "slow" rates. | ||||
*/ | ||||
dCache += (cur >> FIXED_PRECISION) * channels; | ||||
/* Now that any integer has been added, drop it. | ||||
* The offset pointer will preserve the total. | ||||
*/ | ||||
cur &= FIXED_FRACTION_MASK; | ||||
} | ||||
} | ||||
#if NEED_SCALAR_CONVERTER_FALLBACKS | ||||
void FAudio_INTERNAL_ResampleMono_Scalar( | ||||
float *restrict dCache, | ||||
float *restrict resampleCache, | ||||
uint64_t *resampleOffset, | ||||
uint64_t resampleStep, | ||||
uint64_t toResample, | ||||
uint8_t UNUSED | ||||
) { | ||||
uint32_t i; | ||||
uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK; | ||||
for (i = 0; i < toResample; i += 1) | ||||
{ | ||||
/* lerp, then convert to float value */ | ||||
*resampleCache++ = (float) ( | ||||
dCache[0] + | ||||
(dCache[1] - dCache[0]) * | ||||
FIXED_TO_DOUBLE(cur) | ||||
); | ||||
/* Increment fraction offset by the stepping value */ | ||||
*resampleOffset += resampleStep; | ||||
cur += resampleStep; | ||||
/* Only increment the sample offset by integer values. | ||||
* Sometimes this will be 0 until cur accumulates | ||||
* enough steps, especially for "slow" rates. | ||||
*/ | ||||
dCache += (cur >> FIXED_PRECISION); | ||||
/* Now that any integer has been added, drop it. | ||||
* The offset pointer will preserve the total. | ||||
*/ | ||||
cur &= FIXED_FRACTION_MASK; | ||||
} | ||||
} | ||||
void FAudio_INTERNAL_ResampleStereo_Scalar( | ||||
float *restrict dCache, | ||||
float *restrict resampleCache, | ||||
uint64_t *resampleOffset, | ||||
uint64_t resampleStep, | ||||
uint64_t toResample, | ||||
uint8_t UNUSED | ||||
) { | ||||
uint32_t i; | ||||
uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK; | ||||
for (i = 0; i < toResample; i += 1) | ||||
{ | ||||
/* lerp, then convert to float value */ | ||||
*resampleCache++ = (float) ( | ||||
dCache[0] + | ||||
(dCache[2] - dCache[0]) * | ||||
FIXED_TO_DOUBLE(cur) | ||||
); | ||||
*resampleCache++ = (float) ( | ||||
dCache[1] + | ||||
(dCache[3] - dCache[1]) * | ||||
FIXED_TO_DOUBLE(cur) | ||||
); | ||||
/* Increment fraction offset by the stepping value */ | ||||
*resampleOffset += resampleStep; | ||||
cur += resampleStep; | ||||
/* Only increment the sample offset by integer values. | ||||
* Sometimes this will be 0 until cur accumulates | ||||
* enough steps, especially for "slow" rates. | ||||
*/ | ||||
dCache += (cur >> FIXED_PRECISION) * 2; | ||||
/* Now that any integer has been added, drop it. | ||||
* The offset pointer will preserve the total. | ||||
*/ | ||||
cur &= FIXED_FRACTION_MASK; | ||||
} | ||||
} | ||||
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */ | ||||
/* The SSE2 versions of the resamplers come from @8thMage! */ | ||||
#if HAVE_SSE2_INTRINSICS | ||||
void FAudio_INTERNAL_ResampleMono_SSE2( | ||||
float *restrict dCache, | ||||
float *restrict resampleCache, | ||||
uint64_t *resampleOffset, | ||||
uint64_t resampleStep, | ||||
uint64_t toResample, | ||||
uint8_t UNUSED | ||||
) { | ||||
uint32_t i, header, tail; | ||||
uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3; | ||||
float *dCache_1, *dCache_2, *dCache_3; | ||||
uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK; | ||||
__m128 one_over_fixed_one, half, current_next_0_1, current_next_2_3, | ||||
current, next, sub, cur_fixed, mul, res; | ||||
__m128i cur_frac, adder_frac, adder_frac_loop; | ||||
/* This is the header, the Dest needs to be aligned to 16B */ | ||||
header = (16 - ((size_t) resampleCache) % 16) / 4; | ||||
if (header == 4) | ||||
{ | ||||
header = 0; | ||||
} | ||||
for (i = 0; i < header; i += 1) | ||||
{ | ||||
/* lerp, then convert to float value */ | ||||
*resampleCache++ = (float) ( | ||||
dCache[0] + | ||||
(dCache[1] - dCache[0]) * | ||||
FIXED_TO_FLOAT(cur_scalar) | ||||
); | ||||
/* Increment fraction offset by the stepping value */ | ||||
*resampleOffset += resampleStep; | ||||
cur_scalar += resampleStep; | ||||
/* Only increment the sample offset by integer values. | ||||
* Sometimes this will be 0 until cur accumulates | ||||
* enough steps, especially for "slow" rates. | ||||
*/ | ||||
dCache += (cur_scalar >> FIXED_PRECISION); | ||||
/* Now that any integer has been added, drop it. | ||||
* The offset pointer will preserve the total. | ||||
*/ | ||||
cur_scalar &= FIXED_FRACTION_MASK; | ||||
} | ||||
toResample -= header; | ||||
/* initialising the varius cur | ||||
* cur_frac is the fractional part of cur with 4 samples. as the | ||||
* fractional part is 32 bit unsigned value, it can be just added | ||||
* and the modulu operation for keeping the fractional part will be implicit. | ||||
* the 0.5 is for converting signed values to float (no unsigned convert), | ||||
* the 0.5 is added later. | ||||
*/ | ||||
cur_frac = _mm_set1_epi32( | ||||
(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5) | ||||
); | ||||
adder_frac = _mm_setr_epi32( | ||||
0, | ||||
(uint32_t) (resampleStep & FIXED_FRACTION_MASK), | ||||
(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK), | ||||
(uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK) | ||||
); | ||||
cur_frac = _mm_add_epi32(cur_frac, adder_frac); | ||||
/* The various cur_scalar is for the different samples | ||||
* (1, 2, 3 compared to original cur_scalar = 0) | ||||
*/ | ||||
cur_scalar_1 = cur_scalar + resampleStep; | ||||
cur_scalar_2 = cur_scalar + resampleStep * 2; | ||||
cur_scalar_3 = cur_scalar + resampleStep * 3; | ||||
dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION); | ||||
dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION); | ||||
dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION); | ||||
cur_scalar &= FIXED_FRACTION_MASK; | ||||
cur_scalar_1 &= FIXED_FRACTION_MASK; | ||||
cur_scalar_2 &= FIXED_FRACTION_MASK; | ||||
cur_scalar_3 &= FIXED_FRACTION_MASK; | ||||
/* FIXME: These should be _mm_undefined_ps! */ | ||||
current_next_0_1 = _mm_setzero_ps(); | ||||
current_next_2_3 = _mm_setzero_ps(); | ||||
/* Constants */ | ||||
one_over_fixed_one = _mm_set1_ps(1.0f / FIXED_ONE); | ||||
half = _mm_set1_ps(0.5f); | ||||
adder_frac_loop = _mm_set1_epi32( | ||||
(uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK) | ||||
); | ||||
tail = toResample % 4; | ||||
for (i = 0; i < toResample - tail; i += 4, resampleCache += 4) | ||||
{ | ||||
/* current next holds 2 pairs of the sample and the sample + 1 | ||||
* after that need to seperate them. | ||||
*/ | ||||
current_next_0_1 = _mm_loadl_pi(current_next_0_1, (__m64*) dCache); | ||||
current_next_0_1 = _mm_loadh_pi(current_next_0_1, (__m64*) dCache_1); | ||||
current_next_2_3 = _mm_loadl_pi(current_next_2_3, (__m64*) dCache_2); | ||||
current_next_2_3 = _mm_loadh_pi(current_next_2_3, (__m64*) dCache_3); | ||||
/* Unpack them to have seperate current and next in 2 vectors. */ | ||||
current = _mm_shuffle_ps(current_next_0_1, current_next_2_3, 0x88); /* 0b1000 */ | ||||
next = _mm_shuffle_ps(current_next_0_1, current_next_2_3, 0xdd); /* 0b1101 */ | ||||
sub = _mm_sub_ps(next, current); | ||||
/* Convert the fractional part to float and then mul to get the fractions out. | ||||
* then add back the 0.5 we subtracted before. | ||||
*/ | ||||
cur_fixed = _mm_add_ps( | ||||
_mm_mul_ps( | ||||
_mm_cvtepi32_ps(cur_frac), | ||||
one_over_fixed_one | ||||
), | ||||
half | ||||
); | ||||
mul = _mm_mul_ps(sub, cur_fixed); | ||||
res = _mm_add_ps(current, mul); | ||||
/* Store back */ | ||||
_mm_store_ps(resampleCache, res); | ||||
/* Update dCaches for next iteration */ | ||||
cur_scalar += resampleStep * 4; | ||||
cur_scalar_1 += resampleStep * 4; | ||||
cur_scalar_2 += resampleStep * 4; | ||||
cur_scalar_3 += resampleStep * 4; | ||||
dCache = dCache + (cur_scalar >> FIXED_PRECISION); | ||||
dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION); | ||||
dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION); | ||||
dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION); | ||||
cur_scalar &= FIXED_FRACTION_MASK; | ||||
cur_scalar_1 &= FIXED_FRACTION_MASK; | ||||
cur_scalar_2 &= FIXED_FRACTION_MASK; | ||||
cur_scalar_3 &= FIXED_FRACTION_MASK; | ||||
cur_frac = _mm_add_epi32(cur_frac, adder_frac_loop); | ||||
} | ||||
*resampleOffset += resampleStep * (toResample - tail); | ||||
/* This is the tail. */ | ||||
for (i = 0; i < tail; i += 1) | ||||
{ | ||||
/* lerp, then convert to float value */ | ||||
*resampleCache++ = (float) ( | ||||
dCache[0] + | ||||
(dCache[1] - dCache[0]) * | ||||
FIXED_TO_FLOAT(cur_scalar) | ||||
); | ||||
/* Increment fraction offset by the stepping value */ | ||||
*resampleOffset += resampleStep; | ||||
cur_scalar += resampleStep; | ||||
/* Only increment the sample offset by integer values. | ||||
* Sometimes this will be 0 until cur accumulates | ||||
* enough steps, especially for "slow" rates. | ||||
*/ | ||||
dCache += (cur_scalar >> FIXED_PRECISION); | ||||
/* Now that any integer has been added, drop it. | ||||
* The offset pointer will preserve the total. | ||||
*/ | ||||
cur_scalar &= FIXED_FRACTION_MASK; | ||||
} | ||||
} | ||||
void FAudio_INTERNAL_ResampleStereo_SSE2( | ||||
float *restrict dCache, | ||||
float *restrict resampleCache, | ||||
uint64_t *resampleOffset, | ||||
uint64_t resampleStep, | ||||
uint64_t toResample, | ||||
uint8_t UNUSED | ||||
) { | ||||
uint32_t i, header, tail; | ||||
uint64_t cur_scalar, cur_scalar_1; | ||||
float *dCache_1; | ||||
__m128 one_over_fixed_one, half, current_next_1, current_next_2, | ||||
current, next, sub, cur_fixed, mul, res; | ||||
__m128i cur_frac, adder_frac, adder_frac_loop; | ||||
/* This is the header, the Dest needs to be aligned to 16B */ | ||||
header = (16 - ((size_t) resampleCache) % 16) / 8; | ||||
if (header == 2) | ||||
{ | ||||
header = 0; | ||||
} | ||||
cur_scalar = *resampleOffset & FIXED_FRACTION_MASK; | ||||
for (i = 0; i < header; i += 2) | ||||
{ | ||||
/* lerp, then convert to float value */ | ||||
*resampleCache++ = (float) ( | ||||
dCache[0] + | ||||
(dCache[2] - dCache[0]) * | ||||
FIXED_TO_FLOAT(cur_scalar) | ||||
); | ||||
*resampleCache++ = (float) ( | ||||
dCache[1] + | ||||
(dCache[3] - dCache[1]) * | ||||
FIXED_TO_FLOAT(cur_scalar) | ||||
); | ||||
/* Increment fraction offset by the stepping value */ | ||||
*resampleOffset += resampleStep; | ||||
cur_scalar += resampleStep; | ||||
/* Only increment the sample offset by integer values. | ||||
* Sometimes this will be 0 until cur accumulates | ||||
* enough steps, especially for "slow" rates. | ||||
*/ | ||||
dCache += (cur_scalar >> FIXED_PRECISION) * 2; | ||||
/* Now that any integer has been added, drop it. | ||||
* The offset pointer will preserve the total. | ||||
*/ | ||||
cur_scalar &= FIXED_FRACTION_MASK; | ||||
} | ||||
toResample -= header; | ||||
/* initialising the varius cur. | ||||
* cur_frac holds the fractional part of cur. | ||||
* to avoid duplication please see the mono part for a thorough | ||||
* explanation. | ||||
*/ | ||||
cur_frac = _mm_set1_epi32( | ||||
(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5) | ||||
); | ||||
adder_frac = _mm_setr_epi32( | ||||
0, | ||||
0, | ||||
(uint32_t) (resampleStep & FIXED_FRACTION_MASK), | ||||
(uint32_t) (resampleStep & FIXED_FRACTION_MASK) | ||||
); | ||||
cur_frac = _mm_add_epi32(cur_frac, adder_frac); | ||||
/* dCache_1 is the pointer for dcache in the next resample pos. */ | ||||
cur_scalar_1 = cur_scalar + resampleStep; | ||||
dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2; | ||||
cur_scalar_1 &= FIXED_FRACTION_MASK; | ||||
one_over_fixed_one = _mm_set1_ps(1.0f / FIXED_ONE); | ||||
half = _mm_set1_ps(0.5f); | ||||
adder_frac_loop = _mm_set1_epi32( | ||||
(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK) | ||||
); | ||||
tail = toResample % 2; | ||||
for (i = 0; i < toResample - tail; i += 2, resampleCache += 4) | ||||
{ | ||||
/* Current_next_1 and current_next_2 each holds 4 src | ||||
* sample points for getting 4 dest resample point at the end. | ||||
* current_next_1 holds: | ||||
* (current_ch_1, current_ch_2, next_ch_1, next_ch_2) | ||||
* for the first resample position, while current_next_2 holds | ||||
* the same for the 2nd resample position | ||||
*/ | ||||
current_next_1 = _mm_loadu_ps(dCache); /* A1B1A2B2 */ | ||||
current_next_2 = _mm_loadu_ps(dCache_1); /* A3B3A4B4 */ | ||||
/* Unpack them to get the current and the next in seperate vectors. */ | ||||
current = _mm_castpd_ps( | ||||
_mm_unpacklo_pd( | ||||
_mm_castps_pd(current_next_1), | ||||
_mm_castps_pd(current_next_2) | ||||
) | ||||
); | ||||
next = _mm_castpd_ps( | ||||
_mm_unpackhi_pd( | ||||
_mm_castps_pd(current_next_1), | ||||
_mm_castps_pd(current_next_2) | ||||
) | ||||
); | ||||
sub = _mm_sub_ps(next, current); | ||||
/* Adding the 0.5 back. | ||||
* See mono explanation for more elaborate explanation. | ||||
*/ | ||||
cur_fixed = _mm_add_ps( | ||||
_mm_mul_ps( | ||||
_mm_cvtepi32_ps(cur_frac), | ||||
one_over_fixed_one | ||||
), | ||||
half | ||||
); | ||||
mul = _mm_mul_ps(sub, cur_fixed); | ||||
res = _mm_add_ps(current, mul); | ||||
/* Store the results */ | ||||
_mm_store_ps(resampleCache, res); | ||||
/* Update dCaches for next iteration */ | ||||
cur_scalar += resampleStep * 2; | ||||
cur_scalar_1 += resampleStep * 2; | ||||
dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2; | ||||
dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2; | ||||
cur_scalar &= FIXED_FRACTION_MASK; | ||||
cur_scalar_1 &= FIXED_FRACTION_MASK; | ||||
cur_frac = _mm_add_epi32(cur_frac, adder_frac_loop); | ||||
} | ||||
*resampleOffset += resampleStep * (toResample - tail); | ||||
/* This is the tail. */ | ||||
for (i = 0; i < tail; i += 1) | ||||
{ | ||||
/* lerp, then convert to float value */ | ||||
*resampleCache++ = (float) ( | ||||
dCache[0] + | ||||
(dCache[2] - dCache[0]) * | ||||
FIXED_TO_FLOAT(cur_scalar) | ||||
); | ||||
*resampleCache++ = (float) ( | ||||
dCache[1] + | ||||
(dCache[3] - dCache[1]) * | ||||
FIXED_TO_FLOAT(cur_scalar) | ||||
); | ||||
/* Increment fraction offset by the stepping value */ | ||||
*resampleOffset += resampleStep; | ||||
cur_scalar += resampleStep; | ||||
/* Only increment the sample offset by integer values. | ||||
* Sometimes this will be 0 until cur accumulates | ||||
* enough steps, especially for "slow" rates. | ||||
*/ | ||||
dCache += (cur_scalar >> FIXED_PRECISION) * 2; | ||||
/* Now that any integer has been added, drop it. | ||||
* The offset pointer will preserve the total. | ||||
*/ | ||||
cur_scalar &= FIXED_FRACTION_MASK; | ||||
} | ||||
} | ||||
#endif /* HAVE_SSE2_INTRINSICS */ | ||||
#if HAVE_NEON_INTRINSICS | ||||
void FAudio_INTERNAL_ResampleMono_NEON( | ||||
float *restrict dCache, | ||||
float *restrict resampleCache, | ||||
uint64_t *resampleOffset, | ||||
uint64_t resampleStep, | ||||
uint64_t toResample, | ||||
uint8_t UNUSED | ||||
) { | ||||
uint32_t i, header, tail; | ||||
uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3; | ||||
float *dCache_1, *dCache_2, *dCache_3; | ||||
uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK; | ||||
float32x4_t one_over_fixed_one, half, current_next_0_1, current_next_2_3, | ||||
current, next, sub, cur_fixed, mul, res; | ||||
int32x4_t cur_frac, adder_frac, adder_frac_loop; | ||||
/* This is the header, the Dest needs to be aligned to 16B */ | ||||
header = (16 - ((size_t) resampleCache) % 16) / 4; | ||||
if (header == 4) | ||||
{ | ||||
header = 0; | ||||
} | ||||
for (i = 0; i < header; i += 1) | ||||
{ | ||||
/* lerp, then convert to float value */ | ||||
*resampleCache++ = (float) ( | ||||
dCache[0] + | ||||
(dCache[1] - dCache[0]) * | ||||
FIXED_TO_FLOAT(cur_scalar) | ||||
); | ||||
/* Increment fraction offset by the stepping value */ | ||||
*resampleOffset += resampleStep; | ||||
cur_scalar += resampleStep; | ||||
/* Only increment the sample offset by integer values. | ||||
* Sometimes this will be 0 until cur accumulates | ||||
* enough steps, especially for "slow" rates. | ||||
*/ | ||||
dCache += (cur_scalar >> FIXED_PRECISION); | ||||
/* Now that any integer has been added, drop it. | ||||
* The offset pointer will preserve the total. | ||||
*/ | ||||
cur_scalar &= FIXED_FRACTION_MASK; | ||||
} | ||||
toResample -= header; | ||||
/* initialising the varius cur | ||||
* cur_frac is the fractional part of cur with 4 samples. as the | ||||
* fractional part is 32 bit unsigned value, it can be just added | ||||
* and the modulu operation for keeping the fractional part will be implicit. | ||||
* the 0.5 is for converting signed values to float (no unsigned convert), | ||||
* the 0.5 is added later. | ||||
*/ | ||||
cur_frac = vdupq_n_s32( | ||||
(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5) | ||||
); | ||||
int32_t __attribute__((aligned(16))) data[4] = | ||||
{ | ||||
0, | ||||
(uint32_t) (resampleStep & FIXED_FRACTION_MASK), | ||||
(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK), | ||||
(uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK) | ||||
}; | ||||
adder_frac = vld1q_s32(data); | ||||
cur_frac = vaddq_s32(cur_frac, adder_frac); | ||||
/* The various cur_scalar is for the different samples | ||||
* (1, 2, 3 compared to original cur_scalar = 0) | ||||
*/ | ||||
cur_scalar_1 = cur_scalar + resampleStep; | ||||
cur_scalar_2 = cur_scalar + resampleStep * 2; | ||||
cur_scalar_3 = cur_scalar + resampleStep * 3; | ||||
dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION); | ||||
dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION); | ||||
dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION); | ||||
cur_scalar &= FIXED_FRACTION_MASK; | ||||
cur_scalar_1 &= FIXED_FRACTION_MASK; | ||||
cur_scalar_2 &= FIXED_FRACTION_MASK; | ||||
cur_scalar_3 &= FIXED_FRACTION_MASK; | ||||
/* Constants */ | ||||
one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE); | ||||
half = vdupq_n_f32(0.5f); | ||||
adder_frac_loop = vdupq_n_s32( | ||||
(uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK) | ||||
); | ||||
tail = toResample % 4; | ||||
for (i = 0; i < toResample - tail; i += 4, resampleCache += 4) | ||||
{ | ||||
/* current next holds 2 pairs of the sample and the sample + 1 | ||||
* after that need to separate them. | ||||
*/ | ||||
current_next_0_1 = vcombine_f32( | ||||
vld1_f32(dCache), | ||||
vld1_f32(dCache_1) | ||||
); | ||||
current_next_2_3 = vcombine_f32( | ||||
vld1_f32(dCache_2), | ||||
vld1_f32(dCache_3) | ||||
); | ||||
/* Unpack them to have seperate current and next in 2 vectors. */ | ||||
current = vuzp1q_f32(current_next_0_1, current_next_2_3); | ||||
next = vuzp2q_f32(current_next_0_1, current_next_2_3); | ||||
sub = vsubq_f32(next, current); | ||||
/* Convert the fractional part to float and then mul to get the fractions out. | ||||
* then add back the 0.5 we subtracted before. | ||||
*/ | ||||
cur_fixed = vaddq_f32( | ||||
vmulq_f32( | ||||
vcvtq_f32_s32(cur_frac), | ||||
one_over_fixed_one | ||||
), | ||||
half | ||||
); | ||||
mul = vmulq_f32(sub, cur_fixed); | ||||
res = vaddq_f32(current, mul); | ||||
/* Store back */ | ||||
vst1q_f32(resampleCache, res); | ||||
/* Update dCaches for next iteration */ | ||||
cur_scalar += resampleStep * 4; | ||||
cur_scalar_1 += resampleStep * 4; | ||||
cur_scalar_2 += resampleStep * 4; | ||||
cur_scalar_3 += resampleStep * 4; | ||||
dCache = dCache + (cur_scalar >> FIXED_PRECISION); | ||||
dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION); | ||||
dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION); | ||||
dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION); | ||||
cur_scalar &= FIXED_FRACTION_MASK; | ||||
cur_scalar_1 &= FIXED_FRACTION_MASK; | ||||
cur_scalar_2 &= FIXED_FRACTION_MASK; | ||||
cur_scalar_3 &= FIXED_FRACTION_MASK; | ||||
cur_frac = vaddq_s32(cur_frac, adder_frac_loop); | ||||
} | ||||
*resampleOffset += resampleStep * (toResample - tail); | ||||
/* This is the tail. */ | ||||
for (i = 0; i < tail; i += 1) | ||||
{ | ||||
/* lerp, then convert to float value */ | ||||
*resampleCache++ = (float) ( | ||||
dCache[0] + | ||||
(dCache[1] - dCache[0]) * | ||||
FIXED_TO_FLOAT(cur_scalar) | ||||
); | ||||
/* Increment fraction offset by the stepping value */ | ||||
*resampleOffset += resampleStep; | ||||
cur_scalar += resampleStep; | ||||
/* Only increment the sample offset by integer values. | ||||
* Sometimes this will be 0 until cur accumulates | ||||
* enough steps, especially for "slow" rates. | ||||
*/ | ||||
dCache += (cur_scalar >> FIXED_PRECISION); | ||||
/* Now that any integer has been added, drop it. | ||||
* The offset pointer will preserve the total. | ||||
*/ | ||||
cur_scalar &= FIXED_FRACTION_MASK; | ||||
} | ||||
} | ||||
void FAudio_INTERNAL_ResampleStereo_NEON( | ||||
float *restrict dCache, | ||||
float *restrict resampleCache, | ||||
uint64_t *resampleOffset, | ||||
uint64_t resampleStep, | ||||
uint64_t toResample, | ||||
uint8_t channels | ||||
) { | ||||
uint32_t i, header, tail; | ||||
uint64_t cur_scalar, cur_scalar_1; | ||||
float *dCache_1; | ||||
float32x4_t one_over_fixed_one, half, current, next, sub, cur_fixed, mul, res; | ||||
int32x4_t cur_frac, adder_frac, adder_frac_loop; | ||||
/* This is the header, the Dest needs to be aligned to 16B */ | ||||
header = (16 - ((size_t) resampleCache) % 16) / 8; | ||||
if (header == 2) | ||||
{ | ||||
header = 0; | ||||
} | ||||
cur_scalar = *resampleOffset & FIXED_FRACTION_MASK; | ||||
for (i = 0; i < header; i += 2) | ||||
{ | ||||
/* lerp, then convert to float value */ | ||||
*resampleCache++ = (float) ( | ||||
dCache[0] + | ||||
(dCache[2] - dCache[0]) * | ||||
FIXED_TO_FLOAT(cur_scalar) | ||||
); | ||||
*resampleCache++ = (float) ( | ||||
dCache[1] + | ||||
(dCache[3] - dCache[1]) * | ||||
FIXED_TO_FLOAT(cur_scalar) | ||||
); | ||||
/* Increment fraction offset by the stepping value */ | ||||
*resampleOffset += resampleStep; | ||||
cur_scalar += resampleStep; | ||||
/* Only increment the sample offset by integer values. | ||||
* Sometimes this will be 0 until cur accumulates | ||||
* enough steps, especially for "slow" rates. | ||||
*/ | ||||
dCache += (cur_scalar >> FIXED_PRECISION) * 2; | ||||
/* Now that any integer has been added, drop it. | ||||
* The offset pointer will preserve the total. | ||||
*/ | ||||
cur_scalar &= FIXED_FRACTION_MASK; | ||||
} | ||||
toResample -= header; | ||||
/* initialising the varius cur. | ||||
* cur_frac holds the fractional part of cur. | ||||
* to avoid duplication please see the mono part for a thorough | ||||
* explanation. | ||||
*/ | ||||
cur_frac = vdupq_n_s32( | ||||
(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5) | ||||
); | ||||
int32_t __attribute__((aligned(16))) data[4] = | ||||
{ | ||||
0, | ||||
0, | ||||
(uint32_t) (resampleStep & FIXED_FRACTION_MASK), | ||||
(uint32_t) (resampleStep & FIXED_FRACTION_MASK) | ||||
}; | ||||
adder_frac = vld1q_s32(data); | ||||
cur_frac = vaddq_s32(cur_frac, adder_frac); | ||||
/* dCache_1 is the pointer for dcache in the next resample pos. */ | ||||
cur_scalar_1 = cur_scalar + resampleStep; | ||||
dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2; | ||||
cur_scalar_1 &= FIXED_FRACTION_MASK; | ||||
one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE); | ||||
half = vdupq_n_f32(0.5f); | ||||
adder_frac_loop = vdupq_n_s32( | ||||
(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK) | ||||
); | ||||
tail = toResample % 2; | ||||
for (i = 0; i < toResample - tail; i += 2, resampleCache += 4) | ||||
{ | ||||
/* Current_next_1 and current_next_2 each holds 4 src | ||||
* sample points for getting 4 dest resample point at the end. | ||||
* current_next_1 holds: | ||||
* (current_ch_1, current_ch_2, next_ch_1, next_ch_2) | ||||
* for the first resample position, while current_next_2 holds | ||||
* the same for the 2nd resample position | ||||
*/ | ||||
current = vcombine_f32( | ||||
vld1_f32(dCache), /* A1B1 */ | ||||
vld1_f32(dCache_1) /* A3B3 */ | ||||
); | ||||
next = vcombine_f32( | ||||
vld1_f32(dCache + 2), /* A2B2 */ | ||||
vld1_f32(dCache_1 + 2) /* A4B4 */ | ||||
); | ||||
sub = vsubq_f32(next, current); | ||||
/* Adding the 0.5 back. | ||||
* See mono explanation for more elaborate explanation. | ||||
*/ | ||||
cur_fixed = vaddq_f32( | ||||
vmulq_f32( | ||||
vcvtq_f32_s32(cur_frac), | ||||
one_over_fixed_one | ||||
), | ||||
half | ||||
); | ||||
mul = vmulq_f32(sub, cur_fixed); | ||||
res = vaddq_f32(current, mul); | ||||
/* Store the results */ | ||||
vst1q_f32(resampleCache, res); | ||||
/* Update dCaches for next iteration */ | ||||
cur_scalar += resampleStep * 2; | ||||
cur_scalar_1 += resampleStep * 2; | ||||
dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2; | ||||
dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2; | ||||
cur_scalar &= FIXED_FRACTION_MASK; | ||||
cur_scalar_1 &= FIXED_FRACTION_MASK; | ||||
cur_frac = vaddq_s32(cur_frac, adder_frac_loop); | ||||
} | ||||
*resampleOffset += resampleStep * (toResample - tail); | ||||
/* This is the tail. */ | ||||
for (i = 0; i < tail; i += 1) | ||||
{ | ||||
/* lerp, then convert to float value */ | ||||
*resampleCache++ = (float) ( | ||||
dCache[0] + | ||||
(dCache[2] - dCache[0]) * | ||||
FIXED_TO_FLOAT(cur_scalar) | ||||
); | ||||
*resampleCache++ = (float) ( | ||||
dCache[1] + | ||||
(dCache[3] - dCache[1]) * | ||||
FIXED_TO_FLOAT(cur_scalar) | ||||
); | ||||
/* Increment fraction offset by the stepping value */ | ||||
*resampleOffset += resampleStep; | ||||
cur_scalar += resampleStep; | ||||
/* Only increment the sample offset by integer values. | ||||
* Sometimes this will be 0 until cur accumulates | ||||
* enough steps, especially for "slow" rates. | ||||
*/ | ||||
dCache += (cur_scalar >> FIXED_PRECISION) * 2; | ||||
/* Now that any integer has been added, drop it. | ||||
* The offset pointer will preserve the total. | ||||
*/ | ||||
cur_scalar &= FIXED_FRACTION_MASK; | ||||
} | ||||
} | ||||
#endif /* HAVE_NEON_INTRINSICS */ | ||||
/* SECTION 3: Amplifiers */ | ||||
#if NEED_SCALAR_CONVERTER_FALLBACKS | ||||
void FAudio_INTERNAL_Amplify_Scalar( | ||||
float* output, | ||||
uint32_t totalSamples, | ||||
float volume | ||||
) { | ||||
uint32_t i; | ||||
for (i = 0; i < totalSamples; i += 1) | ||||
{ | ||||
output[i] *= volume; | ||||
} | ||||
} | ||||
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */ | ||||
/* The SSE2 version of the amplifier comes from @8thMage! */ | ||||
#if HAVE_SSE2_INTRINSICS | ||||
void FAudio_INTERNAL_Amplify_SSE2( | ||||
float* output, | ||||
uint32_t totalSamples, | ||||
float volume | ||||
) { | ||||
uint32_t i; | ||||
uint32_t header = (16 - (((size_t) output) % 16)) / 4; | ||||
uint32_t tail = (totalSamples - header) % 4; | ||||
__m128 volumeVec, outVec; | ||||
if (header == 4) | ||||
{ | ||||
header = 0; | ||||
} | ||||
if (tail == 4) | ||||
{ | ||||
tail = 0; | ||||
} | ||||
for (i = 0; i < header; i += 1) | ||||
{ | ||||
output[i] *= volume; | ||||
} | ||||
volumeVec = _mm_set1_ps(volume); | ||||
for (i = header; i < totalSamples - tail; i += 4) | ||||
{ | ||||
outVec = _mm_load_ps(output + i); | ||||
outVec = _mm_mul_ps(outVec, volumeVec); | ||||
_mm_store_ps(output + i, outVec); | ||||
} | ||||
for (i = totalSamples - tail; i < totalSamples; i += 1) | ||||
{ | ||||
output[i] *= volume; | ||||
} | ||||
} | ||||
#endif /* HAVE_SSE2_INTRINSICS */ | ||||
#if HAVE_NEON_INTRINSICS | ||||
void FAudio_INTERNAL_Amplify_NEON( | ||||
float* output, | ||||
uint32_t totalSamples, | ||||
float volume | ||||
) { | ||||
uint32_t i; | ||||
uint32_t header = (16 - (((size_t) output) % 16)) / 4; | ||||
uint32_t tail = (totalSamples - header) % 4; | ||||
float32x4_t volumeVec, outVec; | ||||
if (header == 4) | ||||
{ | ||||
header = 0; | ||||
} | ||||
if (tail == 4) | ||||
{ | ||||
tail = 0; | ||||
} | ||||
for (i = 0; i < header; i += 1) | ||||
{ | ||||
output[i] *= volume; | ||||
} | ||||
volumeVec = vdupq_n_f32(volume); | ||||
for (i = header; i < totalSamples - tail; i += 4) | ||||
{ | ||||
outVec = vld1q_f32(output + i); | ||||
outVec = vmulq_f32(outVec, volumeVec); | ||||
vst1q_f32(output + i, outVec); | ||||
} | ||||
for (i = totalSamples - tail; i < totalSamples; i += 1) | ||||
{ | ||||
output[i] *= volume; | ||||
} | ||||
} | ||||
#endif /* HAVE_NEON_INTRINSICS */ | ||||
/* SECTION 4: Mixer Functions */ | ||||
void FAudio_INTERNAL_Mix_Generic_Scalar( | ||||
uint32_t toMix, | ||||
uint32_t srcChans, | ||||
uint32_t dstChans, | ||||
float baseVolume, | ||||
float *restrict src, | ||||
float *restrict dst, | ||||
float *restrict channelVolume, | ||||
float *restrict coefficients | ||||
) { | ||||
uint32_t i, co, ci; | ||||
for (i = 0; i < toMix; i += 1, src += srcChans, dst += dstChans) | ||||
for (co = 0; co < dstChans; co += 1) | ||||
{ | ||||
for (ci = 0; ci < srcChans; ci += 1) | ||||
{ | ||||
dst[co] += ( | ||||
src[ci] * | ||||
channelVolume[ci] * | ||||
baseVolume * | ||||
coefficients[co * srcChans + ci] | ||||
); | ||||
} | ||||
} | ||||
} | ||||
void FAudio_INTERNAL_Mix_1in_1out_Scalar( | ||||
uint32_t toMix, | ||||
uint32_t UNUSED1, | ||||
uint32_t UNUSED2, | ||||
float baseVolume, | ||||
float *restrict src, | ||||
float *restrict dst, | ||||
float *restrict channelVolume, | ||||
float *restrict coefficients | ||||
) { | ||||
uint32_t i; | ||||
float totalVolume = baseVolume * channelVolume[0] * coefficients[0]; | ||||
for (i = 0; i < toMix; i += 1, src += 1, dst += 1) | ||||
{ | ||||
/* Base source data, combined with the coefficients */ | ||||
dst[0] += src[0] * totalVolume; | ||||
} | ||||
} | ||||
void FAudio_INTERNAL_Mix_1in_2out_Scalar( | ||||
uint32_t toMix, | ||||
uint32_t UNUSED1, | ||||
uint32_t UNUSED2, | ||||
float baseVolume, | ||||
float *restrict src, | ||||
float *restrict dst, | ||||
float *restrict channelVolume, | ||||
float *restrict coefficients | ||||
) { | ||||
uint32_t i; | ||||
float totalVolume = baseVolume * channelVolume[0]; | ||||
for (i = 0; i < toMix; i += 1, src += 1, dst += 2) | ||||
{ | ||||
/* Base source data... */ | ||||
const float sample = src[0] * totalVolume; | ||||
/* ... combined with the coefficients. */ | ||||
dst[0] += sample * coefficients[0]; | ||||
dst[1] += sample * coefficients[1]; | ||||
} | ||||
} | ||||
void FAudio_INTERNAL_Mix_1in_6out_Scalar( | ||||
uint32_t toMix, | ||||
uint32_t UNUSED1, | ||||
uint32_t UNUSED2, | ||||
float baseVolume, | ||||
float *restrict src, | ||||
float *restrict dst, | ||||
float *restrict channelVolume, | ||||
float *restrict coefficients | ||||
) { | ||||
uint32_t i; | ||||
float totalVolume = baseVolume * channelVolume[0]; | ||||
for (i = 0; i < toMix; i += 1, src += 1, dst += 6) | ||||
{ | ||||
/* Base source data... */ | ||||
const float sample = src[0] * totalVolume; | ||||
/* ... combined with the coefficients. */ | ||||
dst[0] += sample * coefficients[0]; | ||||
dst[1] += sample * coefficients[1]; | ||||
dst[2] += sample * coefficients[2]; | ||||
dst[3] += sample * coefficients[3]; | ||||
dst[4] += sample * coefficients[4]; | ||||
dst[5] += sample * coefficients[5]; | ||||
} | ||||
} | ||||
void FAudio_INTERNAL_Mix_1in_8out_Scalar( | ||||
uint32_t toMix, | ||||
uint32_t UNUSED1, | ||||
uint32_t UNUSED2, | ||||
float baseVolume, | ||||
float *restrict src, | ||||
float *restrict dst, | ||||
float *restrict channelVolume, | ||||
float *restrict coefficients | ||||
) { | ||||
uint32_t i; | ||||
float totalVolume = baseVolume * channelVolume[0]; | ||||
for (i = 0; i < toMix; i += 1, src += 1, dst += 8) | ||||
{ | ||||
/* Base source data... */ | ||||
const float sample = src[0] * totalVolume; | ||||
/* ... combined with the coefficients. */ | ||||
dst[0] += sample * coefficients[0]; | ||||
dst[1] += sample * coefficients[1]; | ||||
dst[2] += sample * coefficients[2]; | ||||
dst[3] += sample * coefficients[3]; | ||||
dst[4] += sample * coefficients[4]; | ||||
dst[5] += sample * coefficients[5]; | ||||
dst[6] += sample * coefficients[6]; | ||||
dst[7] += sample * coefficients[7]; | ||||
} | ||||
} | ||||
void FAudio_INTERNAL_Mix_2in_1out_Scalar( | ||||
uint32_t toMix, | ||||
uint32_t UNUSED1, | ||||
uint32_t UNUSED2, | ||||
float baseVolume, | ||||
float *restrict src, | ||||
float *restrict dst, | ||||
float *restrict channelVolume, | ||||
float *restrict coefficients | ||||
) { | ||||
uint32_t i; | ||||
float totalVolumeL = baseVolume * channelVolume[0] * coefficients[0]; | ||||
float totalVolumeR = baseVolume * channelVolume[1] * coefficients[1]; | ||||
for (i = 0; i < toMix; i += 1, src += 2, dst += 1) | ||||
{ | ||||
/* Base source data, combined with the coefficients */ | ||||
dst[0] += ( | ||||
(src[0] * totalVolumeL) + | ||||
(src[1] * totalVolumeR) | ||||
); | ||||
} | ||||
} | ||||
void FAudio_INTERNAL_Mix_2in_2out_Scalar( | ||||
uint32_t toMix, | ||||
uint32_t UNUSED1, | ||||
uint32_t UNUSED2, | ||||
float baseVolume, | ||||
float *restrict src, | ||||
float *restrict dst, | ||||
float *restrict channelVolume, | ||||
float *restrict coefficients | ||||
) { | ||||
uint32_t i; | ||||
float totalVolumeL = baseVolume * channelVolume[0]; | ||||
float totalVolumeR = baseVolume * channelVolume[1]; | ||||
for (i = 0; i < toMix; i += 1, src += 2, dst += 2) | ||||
{ | ||||
/* Base source data... */ | ||||
const float left = src[0] * totalVolumeL; | ||||
const float right = src[1] * totalVolumeR; | ||||
/* ... combined with the coefficients. */ | ||||
dst[0] += ( | ||||
(left * coefficients[0]) + | ||||
(right * coefficients[1]) | ||||
); | ||||
dst[1] += ( | ||||
(left * coefficients[2]) + | ||||
(right * coefficients[3]) | ||||
); | ||||
} | ||||
} | ||||
void FAudio_INTERNAL_Mix_2in_6out_Scalar( | ||||
uint32_t toMix, | ||||
uint32_t UNUSED1, | ||||
uint32_t UNUSED2, | ||||
float baseVolume, | ||||
float *restrict src, | ||||
float *restrict dst, | ||||
float *restrict channelVolume, | ||||
float *restrict coefficients | ||||
) { | ||||
uint32_t i; | ||||
float totalVolumeL = baseVolume * channelVolume[0]; | ||||
float totalVolumeR = baseVolume * channelVolume[1]; | ||||
for (i = 0; i < toMix; i += 1, src += 2, dst += 6) | ||||
{ | ||||
/* Base source data... */ | ||||
const float left = src[0] * totalVolumeL; | ||||
const float right = src[1] * totalVolumeR; | ||||
/* ... combined with the coefficients. */ | ||||
dst[0] += ( | ||||
(left * coefficients[0]) + | ||||
(right * coefficients[1]) | ||||
); | ||||
dst[1] += ( | ||||
(left * coefficients[2]) + | ||||
(right * coefficients[3]) | ||||
); | ||||
dst[2] += ( | ||||
(left * coefficients[4]) + | ||||
(right * coefficients[5]) | ||||
); | ||||
dst[3] += ( | ||||
(left * coefficients[6]) + | ||||
(right * coefficients[7]) | ||||
); | ||||
dst[4] += ( | ||||
(left * coefficients[8]) + | ||||
(right * coefficients[9]) | ||||
); | ||||
dst[5] += ( | ||||
(left * coefficients[10]) + | ||||
(right * coefficients[11]) | ||||
); | ||||
} | ||||
} | ||||
void FAudio_INTERNAL_Mix_2in_8out_Scalar( | ||||
uint32_t toMix, | ||||
uint32_t UNUSED1, | ||||
uint32_t UNUSED2, | ||||
float baseVolume, | ||||
float *restrict src, | ||||
float *restrict dst, | ||||
float *restrict channelVolume, | ||||
float *restrict coefficients | ||||
) { | ||||
uint32_t i; | ||||
float totalVolumeL = baseVolume * channelVolume[0]; | ||||
float totalVolumeR = baseVolume * channelVolume[1]; | ||||
for (i = 0; i < toMix; i += 1, src += 2, dst += 8) | ||||
{ | ||||
/* Base source data... */ | ||||
const float left = src[0] * totalVolumeL; | ||||
const float right = src[1] * totalVolumeR; | ||||
/* ... combined with the coefficients. */ | ||||
dst[0] += ( | ||||
(left * coefficients[0]) + | ||||
(right * coefficients[1]) | ||||
); | ||||
dst[1] += ( | ||||
(left * coefficients[2]) + | ||||
(right * coefficients[3]) | ||||
); | ||||
dst[2] += ( | ||||
(left * coefficients[4]) + | ||||
(right * coefficients[5]) | ||||
); | ||||
dst[3] += ( | ||||
(left * coefficients[6]) + | ||||
(right * coefficients[7]) | ||||
); | ||||
dst[4] += ( | ||||
(left * coefficients[8]) + | ||||
(right * coefficients[9]) | ||||
); | ||||
dst[5] += ( | ||||
(left * coefficients[10]) + | ||||
(right * coefficients[11]) | ||||
); | ||||
dst[6] += ( | ||||
(left * coefficients[12]) + | ||||
(right * coefficients[13]) | ||||
); | ||||
dst[7] += ( | ||||
(left * coefficients[14]) + | ||||
(right * coefficients[15]) | ||||
); | ||||
} | ||||
} | ||||
/* SECTION 5: InitSIMDFunctions. Assigns based on SSE2/NEON support. */ | ||||
void (*FAudio_INTERNAL_Convert_U8_To_F32)( | ||||
const uint8_t *restrict src, | ||||
float *restrict dst, | ||||
uint32_t len | ||||
); | ||||
void (*FAudio_INTERNAL_Convert_S16_To_F32)( | ||||
const int16_t *restrict src, | ||||
float *restrict dst, | ||||
uint32_t len | ||||
); | ||||
void (*FAudio_INTERNAL_Convert_S32_To_F32)( | ||||
const int32_t *restrict src, | ||||
float *restrict dst, | ||||
uint32_t len | ||||
); | ||||
FAudioResampleCallback FAudio_INTERNAL_ResampleMono; | ||||
FAudioResampleCallback FAudio_INTERNAL_ResampleStereo; | ||||
void (*FAudio_INTERNAL_Amplify)( | ||||
float *output, | ||||
uint32_t totalSamples, | ||||
float volume | ||||
); | ||||
void FAudio_INTERNAL_InitSIMDFunctions(uint8_t hasSSE2, uint8_t hasNEON) | ||||
{ | ||||
#if HAVE_SSE2_INTRINSICS | ||||
if (hasSSE2) | ||||
{ | ||||
FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_SSE2; | ||||
FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_SSE2; | ||||
FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_SSE2; | ||||
FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_SSE2; | ||||
FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_SSE2; | ||||
FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_SSE2; | ||||
return; | ||||
} | ||||
#endif | ||||
#if HAVE_NEON_INTRINSICS | ||||
if (hasNEON) | ||||
{ | ||||
FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_NEON; | ||||
FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_NEON; | ||||
FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_NEON; | ||||
FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_NEON; | ||||
FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_NEON; | ||||
FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_NEON; | ||||
return; | ||||
} | ||||
#endif | ||||
#if NEED_SCALAR_CONVERTER_FALLBACKS | ||||
FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_Scalar; | ||||
FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_Scalar; | ||||
FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_Scalar; | ||||
FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_Scalar; | ||||
FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_Scalar; | ||||
FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_Scalar; | ||||
#else | ||||
FAudio_assert(0 && "Need converter functions!"); | ||||
#endif | ||||
} | ||||
/* vim: set noexpandtab shiftwidth=8 tabstop=8: */ | ||||