/* FAudio - XAudio Reimplementation for FNA
*
* Copyright (c) 2011-2020 Ethan Lee, Luigi Auriemma, and the MonoGame Team
*
* This software is provided 'as-is', without any express or implied warranty.
* In no event will the authors be held liable for any damages arising from
* the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software in a
* product, an acknowledgment in the product documentation would be
* appreciated but is not required.
*
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
*
* 3. This notice may not be removed or altered from any source distribution.
*
* Ethan "flibitijibibo" Lee <flibitijibibo@flibitijibibo.com>
*
*/
#include "FAudio_internal.h"
/* SECTION 0: SSE/NEON Detection */
/* The SSE/NEON detection comes from MojoAL:
* https://hg.icculus.org/icculus/mojoAL/file/default/mojoal.c
*/
#if defined(__x86_64__) || defined(_M_X64)
/* Some platforms fail to define this... */
#ifndef __SSE2__
#define __SSE2__ 1
#endif
/* x86_64 guarantees SSE2. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#elif defined(__aarch64__) || defined(_M_ARM64)
/* Some platforms fail to define this... */
#ifndef __ARM_NEON__
#define __ARM_NEON__ 1
#endif
/* AArch64 guarantees NEON. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#elif __MACOSX__
/* Some build systems may need to specify this. Also, macOS ARM? Sigh */
#ifndef __SSE2__
#error macOS does not have SSE2? Bad compiler? They actually moved to ARM?!
#endif
/* Mac OS X/Intel guarantees SSE2. */
#define NEED_SCALAR_CONVERTER_FALLBACKS 0
#else
/* Need plain C implementations to support all other hardware */
#define NEED_SCALAR_CONVERTER_FALLBACKS 1
#endif
/* Our NEON paths require AArch64; don't check __ARM_NEON__ here */
#if defined(__aarch64__) || defined(_M_ARM64)
#include <arm_neon.h>
#define HAVE_NEON_INTRINSICS 1
#endif
#ifdef __SSE2__
#include <emmintrin.h>
#define HAVE_SSE2_INTRINSICS 1
#endif
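/* Note that the HAVE_*_INTRINSICS macros only gate compilation of the
* SIMD paths; the runtime choice between them is made by
* FAudio_INTERNAL_InitSIMDFunctions at the bottom of this file.
*/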
/* SECTION 1: Type Converters */
/* The SSE/NEON converters are based on SDL_audiotypecvt:
* https://hg.libsdl.org/SDL/file/default/src/audio/SDL_audiotypecvt.c
*/
#define DIVBY128 0.0078125f
#define DIVBY32768 0.000030517578125f
#define DIVBY8388607 0.00000011920930376163766f
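/* For reference, these are 1/128, 1/32768, and 1/8388607: the scale
* factors that map the uint8, int16, and (24-bit) int32 sample ranges
* onto [-1.0f, 1.0f]. For example, a uint8 sample of 255 converts to
* (255 * DIVBY128) - 1.0f = 0.9921875f, and an int16 sample of -32768
* converts to -32768 * DIVBY32768 = -1.0f.
*/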
#if NEED_SCALAR_CONVERTER_FALLBACKS
void FAudio_INTERNAL_Convert_U8_To_F32_Scalar(
const uint8_t *restrict src,
float *restrict dst,
uint32_t len
) {
uint32_t i;
for (i = 0; i < len; i += 1)
{
*dst++ = (*src++ * DIVBY128) - 1.0f;
}
}
void FAudio_INTERNAL_Convert_S16_To_F32_Scalar(
const int16_t *restrict src,
float *restrict dst,
uint32_t len
) {
uint32_t i;
for (i = 0; i < len; i += 1)
{
*dst++ = *src++ * DIVBY32768;
}
}
void FAudio_INTERNAL_Convert_S32_To_F32_Scalar(
const int32_t *restrict src,
float *restrict dst,
uint32_t len
) {
uint32_t i;
for (i = 0; i < len; i += 1)
{
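/* Shift out the low 8 bits first; the remaining 24-bit value fits
* exactly in a float32 mantissa, matching the SIMD paths below.
*/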
*dst++ = (*src++ >> 8) * DIVBY8388607;
}
}
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */
#if HAVE_SSE2_INTRINSICS
void FAudio_INTERNAL_Convert_U8_To_F32_SSE2(
const uint8_t *restrict src,
float *restrict dst,
uint32_t len
) {
int i;
src += len - 1;
dst += len - 1;
/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
for (i = len; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
*dst = (((float) *src) * DIVBY128) - 1.0f;
}
src -= 15; dst -= 15; /* adjust to read SSE blocks from the start. */
FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
/* Make sure src is aligned too. */
if ((((size_t) src) & 15) == 0) {
/* Aligned! Do SSE blocks as long as we have 16 bytes available. */
const __m128i *mmsrc = (const __m128i *) src;
const __m128i zero = _mm_setzero_si128();
const __m128 divby128 = _mm_set1_ps(DIVBY128);
const __m128 minus1 = _mm_set1_ps(-1.0f);
while (i >= 16) { /* 16 * 8-bit */
const __m128i bytes = _mm_load_si128(mmsrc); /* get 16 uint8 into an XMM register. */
/* treat as int16, shift left to clear every other sint16, then back right with zero-extend. Now uint16. */
const __m128i shorts1 = _mm_srli_epi16(_mm_slli_epi16(bytes, 8), 8);
/* right-shift-zero-extend gets us uint16 with the other set of values. */
const __m128i shorts2 = _mm_srli_epi16(bytes, 8);
/* unpack against zero to make these int32, convert to float, multiply, add. Whew! */
/* Note that AVX2 can do floating point multiply+add in one instruction, fwiw. SSE2 cannot. */
const __m128 floats1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts1, zero)), divby128), minus1);
const __m128 floats2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(shorts2, zero)), divby128), minus1);
const __m128 floats3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts1, zero)), divby128), minus1);
const __m128 floats4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(shorts2, zero)), divby128), minus1);
/* Interleave back into correct order, store. */
_mm_store_ps(dst, _mm_unpacklo_ps(floats1, floats2));
_mm_store_ps(dst+4, _mm_unpackhi_ps(floats1, floats2));
_mm_store_ps(dst+8, _mm_unpacklo_ps(floats3, floats4));
_mm_store_ps(dst+12, _mm_unpackhi_ps(floats3, floats4));
i -= 16; mmsrc--; dst -= 16;
}
src = (const uint8_t *) mmsrc;
}
src += 15; dst += 15; /* adjust for any scalar finishing. */
/* Finish off any leftovers with scalar operations. */
while (i) {
*dst = (((float) *src) * DIVBY128) - 1.0f;
i--; src--; dst--;
}
}
void FAudio_INTERNAL_Convert_S16_To_F32_SSE2(
const int16_t *restrict src,
float *restrict dst,
uint32_t len
) {
int i;
src += len - 1;
dst += len - 1;
/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
for (i = len; i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
*dst = ((float) *src) * DIVBY32768;
}
src -= 7; dst -= 7; /* adjust to read SSE blocks from the start. */
FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
/* Make sure src is aligned too. */
if ((((size_t) src) & 15) == 0) {
/* Aligned! Do SSE blocks as long as we have 16 bytes available. */
const __m128 divby32768 = _mm_set1_ps(DIVBY32768);
while (i >= 8) { /* 8 * 16-bit */
const __m128i ints = _mm_load_si128((__m128i const *) src); /* get 8 sint16 into an XMM register. */
/* treat as int32, shift left to clear every other sint16, then back right with sign-extend. Now sint32. */
const __m128i a = _mm_srai_epi32(_mm_slli_epi32(ints, 16), 16);
/* right-shift-sign-extend gets us sint32 with the other set of values. */
const __m128i b = _mm_srai_epi32(ints, 16);
/* Interleave these back into the right order, convert to float, multiply, store. */
_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(a, b)), divby32768));
_mm_store_ps(dst+4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(a, b)), divby32768));
i -= 8; src -= 8; dst -= 8;
}
}
src += 7; dst += 7; /* adjust for any scalar finishing. */
/* Finish off any leftovers with scalar operations. */
while (i) {
*dst = ((float) *src) * DIVBY32768;
i--; src--; dst--;
}
}
void FAudio_INTERNAL_Convert_S32_To_F32_SSE2(
const int32_t *restrict src,
float *restrict dst,
uint32_t len
) {
int i;
/* Get dst aligned to 16 bytes */
for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst) {
*dst = ((float) (*src>>8)) * DIVBY8388607;
}
FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
/* Make sure src is aligned too. */
if ((((size_t) src) & 15) == 0) {
/* Aligned! Do SSE blocks as long as we have 16 bytes available. */
const __m128 divby8388607 = _mm_set1_ps(DIVBY8388607);
const __m128i *mmsrc = (const __m128i *) src;
while (i >= 4) { /* 4 * sint32 */
/* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_load_si128(mmsrc), 8)), divby8388607));
i -= 4; mmsrc++; dst += 4;
}
src = (const int32_t *) mmsrc;
}
/* Finish off any leftovers with scalar operations. */
while (i) {
*dst = ((float) (*src>>8)) * DIVBY8388607;
i--; src++; dst++;
}
}
#endif /* HAVE_SSE2_INTRINSICS */
#if HAVE_NEON_INTRINSICS
void FAudio_INTERNAL_Convert_U8_To_F32_NEON(
const uint8_t *restrict src,
float *restrict dst,
uint32_t len
) {
int i;
src += len - 1;
dst += len - 1;
/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
for (i = len; i && (((size_t) (dst-15)) & 15); --i, --src, --dst) {
*dst = (((float) *src) * DIVBY128) - 1.0f;
}
src -= 15; dst -= 15; /* adjust to read NEON blocks from the start. */
FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
/* Make sure src is aligned too. */
if ((((size_t) src) & 15) == 0) {
/* Aligned! Do NEON blocks as long as we have 16 bytes available. */
const uint8_t *mmsrc = (const uint8_t *) src;
const float32x4_t divby128 = vdupq_n_f32(DIVBY128);
const float32x4_t negone = vdupq_n_f32(-1.0f);
while (i >= 16) { /* 16 * 8-bit */
const uint8x16_t bytes = vld1q_u8(mmsrc); /* get 16 uint8 into a NEON register. */
const uint16x8_t uint16hi = vmovl_u8(vget_high_u8(bytes)); /* convert top 8 bytes to 8 uint16 */
const uint16x8_t uint16lo = vmovl_u8(vget_low_u8(bytes)); /* convert bottom 8 bytes to 8 uint16 */
/* split uint16 to two uint32, then convert to float, then multiply to normalize, subtract to adjust for sign, store. */
vst1q_f32(dst, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16hi))), divby128));
vst1q_f32(dst+4, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16hi))), divby128));
vst1q_f32(dst+8, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_low_u16(uint16lo))), divby128));
vst1q_f32(dst+12, vmlaq_f32(negone, vcvtq_f32_u32(vmovl_u16(vget_high_u16(uint16lo))), divby128));
i -= 16; mmsrc -= 16; dst -= 16;
}
src = (const uint8_t *) mmsrc;
}
src += 15; dst += 15; /* adjust for any scalar finishing. */
/* Finish off any leftovers with scalar operations. */
while (i) {
*dst = (((float) *src) * DIVBY128) - 1.0f;
i--; src--; dst--;
}
}
void FAudio_INTERNAL_Convert_S16_To_F32_NEON(
const int16_t *restrict src,
float *restrict dst,
uint32_t len
) {
int i;
src += len - 1;
dst += len - 1;
/* Get dst aligned to 16 bytes (since buffer is growing, we don't have to worry about overreading from src) */
for (i = len; i && (((size_t) (dst-7)) & 15); --i, --src, --dst) {
*dst = ((float) *src) * DIVBY32768;
}
src -= 7; dst -= 7; /* adjust to read NEON blocks from the start. */
FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
/* Make sure src is aligned too. */
if ((((size_t) src) & 15) == 0) {
/* Aligned! Do NEON blocks as long as we have 16 bytes available. */
const float32x4_t divby32768 = vdupq_n_f32(DIVBY32768);
while (i >= 8) { /* 8 * 16-bit */
const int16x8_t ints = vld1q_s16((int16_t const *) src); /* get 8 sint16 into a NEON register. */
/* split int16 to two int32, then convert to float, then multiply to normalize, store. */
vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(ints))), divby32768));
vst1q_f32(dst+4, vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(ints))), divby32768));
i -= 8; src -= 8; dst -= 8;
}
}
src += 7; dst += 7; /* adjust for any scalar finishing. */
/* Finish off any leftovers with scalar operations. */
while (i) {
*dst = ((float) *src) * DIVBY32768;
i--; src--; dst--;
}
}
void FAudio_INTERNAL_Convert_S32_To_F32_NEON(
const int32_t *restrict src,
float *restrict dst,
uint32_t len
) {
int i;
/* Get dst aligned to 16 bytes */
for (i = len; i && (((size_t) dst) & 15); --i, ++src, ++dst) {
*dst = ((float) (*src>>8)) * DIVBY8388607;
}
FAudio_assert(!i || ((((size_t) dst) & 15) == 0));
/* Make sure src is aligned too. */
if ((((size_t) src) & 15) == 0) {
/* Aligned! Do NEON blocks as long as we have 16 bytes available. */
const float32x4_t divby8388607 = vdupq_n_f32(DIVBY8388607);
const int32_t *mmsrc = (const int32_t *) src;
while (i >= 4) { /* 4 * sint32 */
/* shift out lowest bits so int fits in a float32. Small precision loss, but much faster. */
vst1q_f32(dst, vmulq_f32(vcvtq_f32_s32(vshrq_n_s32(vld1q_s32(mmsrc), 8)), divby8388607));
i -= 4; mmsrc += 4; dst += 4;
}
src = (const int32_t *) mmsrc;
}
/* Finish off any leftovers with scalar operations. */
while (i) {
*dst = ((float) (*src>>8)) * DIVBY8388607;
i--; src++; dst++;
}
}
#endif /* HAVE_NEON_INTRINSICS */
/* SECTION 2: Linear Resamplers */
void FAudio_INTERNAL_ResampleGeneric(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t channels
) {
uint32_t i, j;
uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
for (i = 0; i < toResample; i += 1)
{
for (j = 0; j < channels; j += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[j] +
(dCache[j + channels] - dCache[j]) *
FIXED_TO_DOUBLE(cur)
);
}
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur >> FIXED_PRECISION) * channels;
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur &= FIXED_FRACTION_MASK;
}
}
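/* A worked example of the fixed-point stepping above, assuming the
* usual 32-bit fraction layout (FIXED_PRECISION == 32): resampling
* 22050 Hz to 44100 Hz gives resampleStep = DOUBLE_TO_FIXED(0.5), so
* cur alternates between 0.0 and 0.5 as the lerp weight, and
* (cur >> FIXED_PRECISION) is nonzero only on every other output
* frame, advancing dCache by one input frame per two output frames.
*/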
#if NEED_SCALAR_CONVERTER_FALLBACKS
void FAudio_INTERNAL_ResampleMono_Scalar(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t UNUSED
) {
uint32_t i;
uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
for (i = 0; i < toResample; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[1] - dCache[0]) *
FIXED_TO_DOUBLE(cur)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur >> FIXED_PRECISION);
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur &= FIXED_FRACTION_MASK;
}
}
void FAudio_INTERNAL_ResampleStereo_Scalar(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t UNUSED
) {
uint32_t i;
uint64_t cur = *resampleOffset & FIXED_FRACTION_MASK;
for (i = 0; i < toResample; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[2] - dCache[0]) *
FIXED_TO_DOUBLE(cur)
);
*resampleCache++ = (float) (
dCache[1] +
(dCache[3] - dCache[1]) *
FIXED_TO_DOUBLE(cur)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur >> FIXED_PRECISION) * 2;
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur &= FIXED_FRACTION_MASK;
}
}
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */
/* The SSE2 versions of the resamplers come from @8thMage! */
#if HAVE_SSE2_INTRINSICS
void FAudio_INTERNAL_ResampleMono_SSE2(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t UNUSED
) {
uint32_t i, header, tail;
uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3;
float *dCache_1, *dCache_2, *dCache_3;
uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
__m128 one_over_fixed_one, half, current_next_0_1, current_next_2_3,
current, next, sub, cur_fixed, mul, res;
__m128i cur_frac, adder_frac, adder_frac_loop;
/* This is the header; the destination needs to be aligned to 16 bytes */
header = (16 - ((size_t) resampleCache) % 16) / 4;
if (header == 4)
{
header = 0;
}
for (i = 0; i < header; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[1] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION);
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
toResample -= header;
/* Initialize the various cur values.
* cur_frac holds the fractional part of cur for 4 samples. Since the
* fractional part is a 32-bit unsigned value, steps can simply be
* added; the modulo that keeps only the fractional part is implicit.
* 0.5 is subtracted so the signed int-to-float conversion can be used
* (there is no unsigned convert); the 0.5 is added back later.
*/
cur_frac = _mm_set1_epi32(
(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
);
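/* As a worked example, assuming FIXED_PRECISION == 32: a fraction of
* 0.75 is stored as 0xC0000000; subtracting DOUBLE_TO_FIXED(0.5)
* (0x80000000) wraps it to 0x40000000, which converts as a signed int
* to 2^30, and 2^30 * (1.0f / FIXED_ONE) + 0.5f = 0.25f + 0.5f = 0.75f
* again in the loop below.
*/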
adder_frac = _mm_setr_epi32(
0,
(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK),
(uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK)
);
cur_frac = _mm_add_epi32(cur_frac, adder_frac);
/* The various cur_scalar values are for the different samples
* (1, 2, 3 relative to the original cur_scalar = 0)
*/
cur_scalar_1 = cur_scalar + resampleStep;
cur_scalar_2 = cur_scalar + resampleStep * 2;
cur_scalar_3 = cur_scalar + resampleStep * 3;
dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION);
dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION);
dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION);
cur_scalar &= FIXED_FRACTION_MASK;
cur_scalar_1 &= FIXED_FRACTION_MASK;
cur_scalar_2 &= FIXED_FRACTION_MASK;
cur_scalar_3 &= FIXED_FRACTION_MASK;
/* FIXME: These should be _mm_undefined_ps! */
current_next_0_1 = _mm_setzero_ps();
current_next_2_3 = _mm_setzero_ps();
/* Constants */
one_over_fixed_one = _mm_set1_ps(1.0f / FIXED_ONE);
half = _mm_set1_ps(0.5f);
adder_frac_loop = _mm_set1_epi32(
(uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK)
);
tail = toResample % 4;
for (i = 0; i < toResample - tail; i += 4, resampleCache += 4)
{
/* current_next_0_1 and current_next_2_3 each hold 2 pairs of a sample
* and the sample after it; we separate them next.
*/
current_next_0_1 = _mm_loadl_pi(current_next_0_1, (__m64*) dCache);
current_next_0_1 = _mm_loadh_pi(current_next_0_1, (__m64*) dCache_1);
current_next_2_3 = _mm_loadl_pi(current_next_2_3, (__m64*) dCache_2);
current_next_2_3 = _mm_loadh_pi(current_next_2_3, (__m64*) dCache_3);
/* Unpack them to get separate current and next in 2 vectors. */
current = _mm_shuffle_ps(current_next_0_1, current_next_2_3, 0x88); /* 0b1000 */
next = _mm_shuffle_ps(current_next_0_1, current_next_2_3, 0xdd); /* 0b1101 */
sub = _mm_sub_ps(next, current);
/* Convert the fractional part to float, then multiply to get the
* fractions out, then add back the 0.5 we subtracted before.
*/
cur_fixed = _mm_add_ps(
_mm_mul_ps(
_mm_cvtepi32_ps(cur_frac),
one_over_fixed_one
),
half
);
mul = _mm_mul_ps(sub, cur_fixed);
res = _mm_add_ps(current, mul);
/* Store back */
_mm_store_ps(resampleCache, res);
/* Update dCaches for next iteration */
cur_scalar += resampleStep * 4;
cur_scalar_1 += resampleStep * 4;
cur_scalar_2 += resampleStep * 4;
cur_scalar_3 += resampleStep * 4;
dCache = dCache + (cur_scalar >> FIXED_PRECISION);
dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION);
dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION);
dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION);
cur_scalar &= FIXED_FRACTION_MASK;
cur_scalar_1 &= FIXED_FRACTION_MASK;
cur_scalar_2 &= FIXED_FRACTION_MASK;
cur_scalar_3 &= FIXED_FRACTION_MASK;
cur_frac = _mm_add_epi32(cur_frac, adder_frac_loop);
}
*resampleOffset += resampleStep * (toResample - tail);
/* This is the tail. */
for (i = 0; i < tail; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[1] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION);
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
}
void FAudio_INTERNAL_ResampleStereo_SSE2(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t UNUSED
) {
uint32_t i, header, tail;
uint64_t cur_scalar, cur_scalar_1;
float *dCache_1;
__m128 one_over_fixed_one, half, current_next_1, current_next_2,
current, next, sub, cur_fixed, mul, res;
__m128i cur_frac, adder_frac, adder_frac_loop;
/* This is the header; the destination needs to be aligned to 16 bytes */
header = (16 - ((size_t) resampleCache) % 16) / 8;
if (header == 2)
{
header = 0;
}
cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
for (i = 0; i < header; i += 2)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[2] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
*resampleCache++ = (float) (
dCache[1] +
(dCache[3] - dCache[1]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION) * 2;
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
toResample -= header;
/* Initialize the various cur values.
* cur_frac holds the fractional part of cur.
* To avoid duplication, see the mono version for a thorough
* explanation.
*/
cur_frac = _mm_set1_epi32(
(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
);
adder_frac = _mm_setr_epi32(
0,
0,
(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
(uint32_t) (resampleStep & FIXED_FRACTION_MASK)
);
cur_frac = _mm_add_epi32(cur_frac, adder_frac);
/* dCache_1 is the dCache pointer at the next resample position. */
cur_scalar_1 = cur_scalar + resampleStep;
dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2;
cur_scalar_1 &= FIXED_FRACTION_MASK;
one_over_fixed_one = _mm_set1_ps(1.0f / FIXED_ONE);
half = _mm_set1_ps(0.5f);
adder_frac_loop = _mm_set1_epi32(
(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK)
);
tail = toResample % 2;
for (i = 0; i < toResample - tail; i += 2, resampleCache += 4)
{
/* current_next_1 and current_next_2 each hold 4 source sample
* points, producing 4 destination resample points at the end.
* current_next_1 holds
* (current_ch_1, current_ch_2, next_ch_1, next_ch_2)
* for the first resample position, while current_next_2 holds the
* same for the second resample position.
*/
current_next_1 = _mm_loadu_ps(dCache); /* A1B1A2B2 */
current_next_2 = _mm_loadu_ps(dCache_1); /* A3B3A4B4 */
/* Unpack them to get the current and the next in separate vectors. */
current = _mm_castpd_ps(
_mm_unpacklo_pd(
_mm_castps_pd(current_next_1),
_mm_castps_pd(current_next_2)
)
);
next = _mm_castpd_ps(
_mm_unpackhi_pd(
_mm_castps_pd(current_next_1),
_mm_castps_pd(current_next_2)
)
);
sub = _mm_sub_ps(next, current);
/* Add the 0.5 back.
* See the mono version for a more elaborate explanation.
*/
cur_fixed = _mm_add_ps(
_mm_mul_ps(
_mm_cvtepi32_ps(cur_frac),
one_over_fixed_one
),
half
);
mul = _mm_mul_ps(sub, cur_fixed);
res = _mm_add_ps(current, mul);
/* Store the results */
_mm_store_ps(resampleCache, res);
/* Update dCaches for next iteration */
cur_scalar += resampleStep * 2;
cur_scalar_1 += resampleStep * 2;
dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2;
dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2;
cur_scalar &= FIXED_FRACTION_MASK;
cur_scalar_1 &= FIXED_FRACTION_MASK;
cur_frac = _mm_add_epi32(cur_frac, adder_frac_loop);
}
*resampleOffset += resampleStep * (toResample - tail);
/* This is the tail. */
for (i = 0; i < tail; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[2] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
*resampleCache++ = (float) (
dCache[1] +
(dCache[3] - dCache[1]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION) * 2;
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
}
#endif /* HAVE_SSE2_INTRINSICS */
#if HAVE_NEON_INTRINSICS
void FAudio_INTERNAL_ResampleMono_NEON(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t UNUSED
) {
uint32_t i, header, tail;
uint64_t cur_scalar_1, cur_scalar_2, cur_scalar_3;
float *dCache_1, *dCache_2, *dCache_3;
uint64_t cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
float32x4_t one_over_fixed_one, half, current_next_0_1, current_next_2_3,
current, next, sub, cur_fixed, mul, res;
int32x4_t cur_frac, adder_frac, adder_frac_loop;
/* This is the header; the destination needs to be aligned to 16 bytes */
header = (16 - ((size_t) resampleCache) % 16) / 4;
if (header == 4)
{
header = 0;
}
for (i = 0; i < header; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[1] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION);
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
toResample -= header;
/* Initialize the various cur values.
* cur_frac holds the fractional part of cur for 4 samples. Since the
* fractional part is a 32-bit unsigned value, steps can simply be
* added; the modulo that keeps only the fractional part is implicit.
* 0.5 is subtracted so the signed int-to-float conversion can be used
* (there is no unsigned convert); the 0.5 is added back later.
*/
cur_frac = vdupq_n_s32(
(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
);
int32_t __attribute__((aligned(16))) data[4] =
{
0,
(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK),
(uint32_t) ((resampleStep * 3) & FIXED_FRACTION_MASK)
};
adder_frac = vld1q_s32(data);
cur_frac = vaddq_s32(cur_frac, adder_frac);
/* The various cur_scalar values are for the different samples
* (1, 2, 3 relative to the original cur_scalar = 0)
*/
cur_scalar_1 = cur_scalar + resampleStep;
cur_scalar_2 = cur_scalar + resampleStep * 2;
cur_scalar_3 = cur_scalar + resampleStep * 3;
dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION);
dCache_2 = dCache + (cur_scalar_2 >> FIXED_PRECISION);
dCache_3 = dCache + (cur_scalar_3 >> FIXED_PRECISION);
cur_scalar &= FIXED_FRACTION_MASK;
cur_scalar_1 &= FIXED_FRACTION_MASK;
cur_scalar_2 &= FIXED_FRACTION_MASK;
cur_scalar_3 &= FIXED_FRACTION_MASK;
/* Constants */
one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE);
half = vdupq_n_f32(0.5f);
adder_frac_loop = vdupq_n_s32(
(uint32_t) ((resampleStep * 4) & FIXED_FRACTION_MASK)
);
tail = toResample % 4;
for (i = 0; i < toResample - tail; i += 4, resampleCache += 4)
{
/* current_next_0_1 and current_next_2_3 each hold 2 pairs of a sample
* and the sample after it; we separate them next.
*/
current_next_0_1 = vcombine_f32(
vld1_f32(dCache),
vld1_f32(dCache_1)
);
current_next_2_3 = vcombine_f32(
vld1_f32(dCache_2),
vld1_f32(dCache_3)
);
/* Unpack them to get separate current and next in 2 vectors. */
current = vuzp1q_f32(current_next_0_1, current_next_2_3);
next = vuzp2q_f32(current_next_0_1, current_next_2_3);
sub = vsubq_f32(next, current);
/* Convert the fractional part to float, then multiply to get the
* fractions out, then add back the 0.5 we subtracted before.
*/
cur_fixed = vaddq_f32(
vmulq_f32(
vcvtq_f32_s32(cur_frac),
one_over_fixed_one
),
half
);
mul = vmulq_f32(sub, cur_fixed);
res = vaddq_f32(current, mul);
/* Store back */
vst1q_f32(resampleCache, res);
/* Update dCaches for next iteration */
cur_scalar += resampleStep * 4;
cur_scalar_1 += resampleStep * 4;
cur_scalar_2 += resampleStep * 4;
cur_scalar_3 += resampleStep * 4;
dCache = dCache + (cur_scalar >> FIXED_PRECISION);
dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION);
dCache_2 = dCache_2 + (cur_scalar_2 >> FIXED_PRECISION);
dCache_3 = dCache_3 + (cur_scalar_3 >> FIXED_PRECISION);
cur_scalar &= FIXED_FRACTION_MASK;
cur_scalar_1 &= FIXED_FRACTION_MASK;
cur_scalar_2 &= FIXED_FRACTION_MASK;
cur_scalar_3 &= FIXED_FRACTION_MASK;
cur_frac = vaddq_s32(cur_frac, adder_frac_loop);
}
*resampleOffset += resampleStep * (toResample - tail);
/* This is the tail. */
for (i = 0; i < tail; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[1] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION);
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
}
void FAudio_INTERNAL_ResampleStereo_NEON(
float *restrict dCache,
float *restrict resampleCache,
uint64_t *resampleOffset,
uint64_t resampleStep,
uint64_t toResample,
uint8_t channels
) {
uint32_t i, header, tail;
uint64_t cur_scalar, cur_scalar_1;
float *dCache_1;
float32x4_t one_over_fixed_one, half, current, next, sub, cur_fixed, mul, res;
int32x4_t cur_frac, adder_frac, adder_frac_loop;
/* This is the header; the destination needs to be aligned to 16 bytes */
header = (16 - ((size_t) resampleCache) % 16) / 8;
if (header == 2)
{
header = 0;
}
cur_scalar = *resampleOffset & FIXED_FRACTION_MASK;
for (i = 0; i < header; i += 2)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[2] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
*resampleCache++ = (float) (
dCache[1] +
(dCache[3] - dCache[1]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION) * 2;
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
toResample -= header;
/* Initialize the various cur values.
* cur_frac holds the fractional part of cur.
* To avoid duplication, see the mono version for a thorough
* explanation.
*/
cur_frac = vdupq_n_s32(
(uint32_t) (cur_scalar & FIXED_FRACTION_MASK) - DOUBLE_TO_FIXED(0.5)
);
int32_t __attribute__((aligned(16))) data[4] =
{
0,
0,
(uint32_t) (resampleStep & FIXED_FRACTION_MASK),
(uint32_t) (resampleStep & FIXED_FRACTION_MASK)
};
adder_frac = vld1q_s32(data);
cur_frac = vaddq_s32(cur_frac, adder_frac);
/* dCache_1 is the dCache pointer at the next resample position. */
cur_scalar_1 = cur_scalar + resampleStep;
dCache_1 = dCache + (cur_scalar_1 >> FIXED_PRECISION) * 2;
cur_scalar_1 &= FIXED_FRACTION_MASK;
one_over_fixed_one = vdupq_n_f32(1.0f / FIXED_ONE);
half = vdupq_n_f32(0.5f);
adder_frac_loop = vdupq_n_s32(
(uint32_t) ((resampleStep * 2) & FIXED_FRACTION_MASK)
);
tail = toResample % 2;
for (i = 0; i < toResample - tail; i += 2, resampleCache += 4)
{
/* current and next each hold 4 source sample points, producing 4
* destination resample points at the end. current holds the sample
* pair (both channels) for the first and second resample positions,
* while next holds the pair that follows each.
*/
current = vcombine_f32(
vld1_f32(dCache), /* A1B1 */
vld1_f32(dCache_1) /* A3B3 */
);
next = vcombine_f32(
vld1_f32(dCache + 2), /* A2B2 */
vld1_f32(dCache_1 + 2) /* A4B4 */
);
sub = vsubq_f32(next, current);
/* Add the 0.5 back.
* See the mono version for a more elaborate explanation.
*/
cur_fixed = vaddq_f32(
vmulq_f32(
vcvtq_f32_s32(cur_frac),
one_over_fixed_one
),
half
);
mul = vmulq_f32(sub, cur_fixed);
res = vaddq_f32(current, mul);
/* Store the results */
vst1q_f32(resampleCache, res);
/* Update dCaches for next iteration */
cur_scalar += resampleStep * 2;
cur_scalar_1 += resampleStep * 2;
dCache = dCache + (cur_scalar >> FIXED_PRECISION) * 2;
dCache_1 = dCache_1 + (cur_scalar_1 >> FIXED_PRECISION) * 2;
cur_scalar &= FIXED_FRACTION_MASK;
cur_scalar_1 &= FIXED_FRACTION_MASK;
cur_frac = vaddq_s32(cur_frac, adder_frac_loop);
}
*resampleOffset += resampleStep * (toResample - tail);
/* This is the tail. */
for (i = 0; i < tail; i += 1)
{
/* lerp, then convert to float value */
*resampleCache++ = (float) (
dCache[0] +
(dCache[2] - dCache[0]) *
FIXED_TO_FLOAT(cur_scalar)
);
*resampleCache++ = (float) (
dCache[1] +
(dCache[3] - dCache[1]) *
FIXED_TO_FLOAT(cur_scalar)
);
/* Increment fraction offset by the stepping value */
*resampleOffset += resampleStep;
cur_scalar += resampleStep;
/* Only increment the sample offset by integer values.
* Sometimes this will be 0 until cur accumulates
* enough steps, especially for "slow" rates.
*/
dCache += (cur_scalar >> FIXED_PRECISION) * 2;
/* Now that any integer has been added, drop it.
* The offset pointer will preserve the total.
*/
cur_scalar &= FIXED_FRACTION_MASK;
}
}
#endif /* HAVE_NEON_INTRINSICS */
/* SECTION 3: Amplifiers */
#if NEED_SCALAR_CONVERTER_FALLBACKS
void FAudio_INTERNAL_Amplify_Scalar(
float* output,
uint32_t totalSamples,
float volume
) {
uint32_t i;
for (i = 0; i < totalSamples; i += 1)
{
output[i] *= volume;
}
}
#endif /* NEED_SCALAR_CONVERTER_FALLBACKS */
/* The SSE2 version of the amplifier comes from @8thMage! */
#if HAVE_SSE2_INTRINSICS
void FAudio_INTERNAL_Amplify_SSE2(
float* output,
uint32_t totalSamples,
float volume
) {
uint32_t i;
uint32_t header = (16 - (((size_t) output) % 16)) / 4;
uint32_t tail = (totalSamples - header) % 4;
__m128 volumeVec, outVec;
if (header == 4)
{
header = 0;
}
if (tail == 4)
{
tail = 0;
}
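/* For example, if output is only 8-byte aligned, header is 2: two
* samples go through the scalar loop below, after which output + i is
* 16-byte aligned for the _mm_load_ps/_mm_store_ps loop.
*/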
for (i = 0; i < header; i += 1)
{
output[i] *= volume;
}
volumeVec = _mm_set1_ps(volume);
for (i = header; i < totalSamples - tail; i += 4)
{
outVec = _mm_load_ps(output + i);
outVec = _mm_mul_ps(outVec, volumeVec);
_mm_store_ps(output + i, outVec);
}
for (i = totalSamples - tail; i < totalSamples; i += 1)
{
output[i] *= volume;
}
}
#endif /* HAVE_SSE2_INTRINSICS */
#if HAVE_NEON_INTRINSICS
void FAudio_INTERNAL_Amplify_NEON(
float* output,
uint32_t totalSamples,
float volume
) {
uint32_t i;
uint32_t header = (16 - (((size_t) output) % 16)) / 4;
uint32_t tail = (totalSamples - header) % 4;
float32x4_t volumeVec, outVec;
if (header == 4)
{
header = 0;
}
if (tail == 4)
{
tail = 0;
}
for (i = 0; i < header; i += 1)
{
output[i] *= volume;
}
volumeVec = vdupq_n_f32(volume);
for (i = header; i < totalSamples - tail; i += 4)
{
outVec = vld1q_f32(output + i);
outVec = vmulq_f32(outVec, volumeVec);
vst1q_f32(output + i, outVec);
}
for (i = totalSamples - tail; i < totalSamples; i += 1)
{
output[i] *= volume;
}
}
#endif /* HAVE_NEON_INTRINSICS */
/* SECTION 4: Mixer Functions */
void FAudio_INTERNAL_Mix_Generic_Scalar(
uint32_t toMix,
uint32_t srcChans,
uint32_t dstChans,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i, co, ci;
for (i = 0; i < toMix; i += 1, src += srcChans, dst += dstChans)
for (co = 0; co < dstChans; co += 1)
{
for (ci = 0; ci < srcChans; ci += 1)
{
dst[co] += (
src[ci] *
channelVolume[ci] *
baseVolume *
coefficients[co * srcChans + ci]
);
}
}
}
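/* The coefficients array above is a dstChans x srcChans matrix in
* row-major order (indexed as coefficients[co * srcChans + ci]). As an
* illustrative example, a plain stereo-to-mono downmix could use
*
*     float coefficients[1 * 2] = { 0.5f, 0.5f };
*
* so each output frame accumulates 0.5f * src[0] + 0.5f * src[1],
* further scaled by baseVolume and the per-channel volumes.
*/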
void FAudio_INTERNAL_Mix_1in_1out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolume = baseVolume * channelVolume[0] * coefficients[0];
for (i = 0; i < toMix; i += 1, src += 1, dst += 1)
{
/* Base source data, combined with the coefficients */
dst[0] += src[0] * totalVolume;
}
}
void FAudio_INTERNAL_Mix_1in_2out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolume = baseVolume * channelVolume[0];
for (i = 0; i < toMix; i += 1, src += 1, dst += 2)
{
/* Base source data... */
const float sample = src[0] * totalVolume;
/* ... combined with the coefficients. */
dst[0] += sample * coefficients[0];
dst[1] += sample * coefficients[1];
}
}
void FAudio_INTERNAL_Mix_1in_6out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolume = baseVolume * channelVolume[0];
for (i = 0; i < toMix; i += 1, src += 1, dst += 6)
{
/* Base source data... */
const float sample = src[0] * totalVolume;
/* ... combined with the coefficients. */
dst[0] += sample * coefficients[0];
dst[1] += sample * coefficients[1];
dst[2] += sample * coefficients[2];
dst[3] += sample * coefficients[3];
dst[4] += sample * coefficients[4];
dst[5] += sample * coefficients[5];
}
}
void FAudio_INTERNAL_Mix_1in_8out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolume = baseVolume * channelVolume[0];
for (i = 0; i < toMix; i += 1, src += 1, dst += 8)
{
/* Base source data... */
const float sample = src[0] * totalVolume;
/* ... combined with the coefficients. */
dst[0] += sample * coefficients[0];
dst[1] += sample * coefficients[1];
dst[2] += sample * coefficients[2];
dst[3] += sample * coefficients[3];
dst[4] += sample * coefficients[4];
dst[5] += sample * coefficients[5];
dst[6] += sample * coefficients[6];
dst[7] += sample * coefficients[7];
}
}
void FAudio_INTERNAL_Mix_2in_1out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolumeL = baseVolume * channelVolume[0] * coefficients[0];
float totalVolumeR = baseVolume * channelVolume[1] * coefficients[1];
for (i = 0; i < toMix; i += 1, src += 2, dst += 1)
{
/* Base source data, combined with the coefficients */
dst[0] += (
(src[0] * totalVolumeL) +
(src[1] * totalVolumeR)
);
}
}
void FAudio_INTERNAL_Mix_2in_2out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolumeL = baseVolume * channelVolume[0];
float totalVolumeR = baseVolume * channelVolume[1];
for (i = 0; i < toMix; i += 1, src += 2, dst += 2)
{
/* Base source data... */
const float left = src[0] * totalVolumeL;
const float right = src[1] * totalVolumeR;
/* ... combined with the coefficients. */
dst[0] += (
(left * coefficients[0]) +
(right * coefficients[1])
);
dst[1] += (
(left * coefficients[2]) +
(right * coefficients[3])
);
}
}
void FAudio_INTERNAL_Mix_2in_6out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolumeL = baseVolume * channelVolume[0];
float totalVolumeR = baseVolume * channelVolume[1];
for (i = 0; i < toMix; i += 1, src += 2, dst += 6)
{
/* Base source data... */
const float left = src[0] * totalVolumeL;
const float right = src[1] * totalVolumeR;
/* ... combined with the coefficients. */
dst[0] += (
(left * coefficients[0]) +
(right * coefficients[1])
);
dst[1] += (
(left * coefficients[2]) +
(right * coefficients[3])
);
dst[2] += (
(left * coefficients[4]) +
(right * coefficients[5])
);
dst[3] += (
(left * coefficients[6]) +
(right * coefficients[7])
);
dst[4] += (
(left * coefficients[8]) +
(right * coefficients[9])
);
dst[5] += (
(left * coefficients[10]) +
(right * coefficients[11])
);
}
}
void FAudio_INTERNAL_Mix_2in_8out_Scalar(
uint32_t toMix,
uint32_t UNUSED1,
uint32_t UNUSED2,
float baseVolume,
float *restrict src,
float *restrict dst,
float *restrict channelVolume,
float *restrict coefficients
) {
uint32_t i;
float totalVolumeL = baseVolume * channelVolume[0];
float totalVolumeR = baseVolume * channelVolume[1];
for (i = 0; i < toMix; i += 1, src += 2, dst += 8)
{
/* Base source data... */
const float left = src[0] * totalVolumeL;
const float right = src[1] * totalVolumeR;
/* ... combined with the coefficients. */
dst[0] += (
(left * coefficients[0]) +
(right * coefficients[1])
);
dst[1] += (
(left * coefficients[2]) +
(right * coefficients[3])
);
dst[2] += (
(left * coefficients[4]) +
(right * coefficients[5])
);
dst[3] += (
(left * coefficients[6]) +
(right * coefficients[7])
);
dst[4] += (
(left * coefficients[8]) +
(right * coefficients[9])
);
dst[5] += (
(left * coefficients[10]) +
(right * coefficients[11])
);
dst[6] += (
(left * coefficients[12]) +
(right * coefficients[13])
);
dst[7] += (
(left * coefficients[14]) +
(right * coefficients[15])
);
}
}
/* SECTION 5: InitSIMDFunctions. Assigns based on SSE2/NEON support. */
void (*FAudio_INTERNAL_Convert_U8_To_F32)(
const uint8_t *restrict src,
float *restrict dst,
uint32_t len
);
void (*FAudio_INTERNAL_Convert_S16_To_F32)(
const int16_t *restrict src,
float *restrict dst,
uint32_t len
);
void (*FAudio_INTERNAL_Convert_S32_To_F32)(
const int32_t *restrict src,
float *restrict dst,
uint32_t len
);
FAudioResampleCallback FAudio_INTERNAL_ResampleMono;
FAudioResampleCallback FAudio_INTERNAL_ResampleStereo;
void (*FAudio_INTERNAL_Amplify)(
float *output,
uint32_t totalSamples,
float volume
);
void FAudio_INTERNAL_InitSIMDFunctions(uint8_t hasSSE2, uint8_t hasNEON)
{
#if HAVE_SSE2_INTRINSICS
if (hasSSE2)
{
FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_SSE2;
FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_SSE2;
FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_SSE2;
FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_SSE2;
FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_SSE2;
FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_SSE2;
return;
}
#endif
#if HAVE_NEON_INTRINSICS
if (hasNEON)
{
FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_NEON;
FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_NEON;
FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_NEON;
FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_NEON;
FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_NEON;
FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_NEON;
return;
}
#endif
#if NEED_SCALAR_CONVERTER_FALLBACKS
FAudio_INTERNAL_Convert_U8_To_F32 = FAudio_INTERNAL_Convert_U8_To_F32_Scalar;
FAudio_INTERNAL_Convert_S16_To_F32 = FAudio_INTERNAL_Convert_S16_To_F32_Scalar;
FAudio_INTERNAL_Convert_S32_To_F32 = FAudio_INTERNAL_Convert_S32_To_F32_Scalar;
FAudio_INTERNAL_ResampleMono = FAudio_INTERNAL_ResampleMono_Scalar;
FAudio_INTERNAL_ResampleStereo = FAudio_INTERNAL_ResampleStereo_Scalar;
FAudio_INTERNAL_Amplify = FAudio_INTERNAL_Amplify_Scalar;
#else
FAudio_assert(0 && "Need converter functions!");
#endif
}
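/* As a sketch of the expected call site: the platform layer performs
* the CPU detection and passes the results in, e.g. with SDL this
* would look like
*
*     FAudio_INTERNAL_InitSIMDFunctions(SDL_HasSSE2(), SDL_HasNEON());
*/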
/* vim: set noexpandtab shiftwidth=8 tabstop=8: */