From c6debcc62a5ed2935975360f13eb1e31f9c9dbff Mon Sep 17 00:00:00 2001 From: luocai Date: Fri, 6 Sep 2024 11:01:01 +0800 Subject: [PATCH] add neon code. --- Record/EchoRecord.cpp | 37 +- VocieProcess/CMakeLists.txt | 12 +- VocieProcess/common_audio/fir_filter.h | 30 ++ VocieProcess/common_audio/fir_filter_neon.cc | 73 ++++ VocieProcess/common_audio/fir_filter_neon.h | 39 ++ .../resampler/sinc_resampler_neon.cc | 48 +++ .../cross_correlation_neon.c | 88 +++++ .../signal_processing/downsample_fast_neon.c | 224 +++++++++++ .../min_max_operations_neon.c | 333 +++++++++++++++++ .../ooura/fft_size_128/ooura_fft_neon.cc | 351 ++++++++++++++++++ .../fft_size_128/ooura_fft_tables_neon_sse2.h | 98 +++++ 11 files changed, 1312 insertions(+), 21 deletions(-) create mode 100644 VocieProcess/common_audio/fir_filter.h create mode 100644 VocieProcess/common_audio/fir_filter_neon.cc create mode 100644 VocieProcess/common_audio/fir_filter_neon.h create mode 100644 VocieProcess/common_audio/resampler/sinc_resampler_neon.cc create mode 100644 VocieProcess/common_audio/signal_processing/cross_correlation_neon.c create mode 100644 VocieProcess/common_audio/signal_processing/downsample_fast_neon.c create mode 100644 VocieProcess/common_audio/signal_processing/min_max_operations_neon.c create mode 100644 VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc create mode 100644 VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h diff --git a/Record/EchoRecord.cpp b/Record/EchoRecord.cpp index 630ac24..176e9c9 100644 --- a/Record/EchoRecord.cpp +++ b/Record/EchoRecord.cpp @@ -10,19 +10,18 @@ class EchoRecordPrivate { public: - EchoRecordPrivate() { - + void initialize(int sampleRate, int channels, int period) { std::unique_ptr factory = std::make_unique(); - - echoCanceller = factory->Create(16000, 1, 1); - - // nearendBuffer = std::make_unique(16000, 1, 16000, 1, 16000, 1); - // farendBuffer = std::make_unique(16000, 1, 16000, 1, 16000, 1); + echoCanceller = factory->Create(sampleRate, channels, channels); + nearendBuffer = std::make_unique(sampleRate, channels, sampleRate, channels, sampleRate, channels); + farendBuffer = std::make_unique(sampleRate, channels, sampleRate, channels, sampleRate, channels); + linearOutputBuffer = std::make_unique(sampleRate, channels, sampleRate, channels, sampleRate, channels); } std::unique_ptr echoCanceller; - // std::unique_ptr nearendBuffer; - // std::unique_ptr farendBuffer; + std::unique_ptr nearendBuffer; + std::unique_ptr farendBuffer; + std::unique_ptr linearOutputBuffer; }; EchoRecordTask::EchoRecordTask() : m_d{new EchoRecordPrivate()} { @@ -56,7 +55,7 @@ void EchoRecordTask::run() { RkAudio::Format format; format.channels = m_channels; - format.period = 20; + format.period = 10; m_speex = std::make_shared(); m_speex->start(format.sampleRate, m_channels, format.period); @@ -66,6 +65,8 @@ void EchoRecordTask::run() { m_webRtcAecm = std::make_shared(); m_webRtcAecm->start(format.sampleRate, format.channels, format.period); + m_d->initialize(format.sampleRate, m_channels, format.period); + m_output = std::make_shared(); if (!m_output->open(sizeof(uint16_t), format.sampleRate, 2, format.period, m_dsp == Vqe)) { LOG(error) << "audio output open failed."; @@ -87,18 +88,16 @@ void EchoRecordTask::run() { reinterpret_cast(m_outBuffer.data()), frame.frameSize); } else if (m_dsp == Aec3) { webrtc::StreamConfig config(format.sampleRate, format.channels); // 单声道 - webrtc::AudioBuffer nearendBuffer(format.sampleRate, 1, format.sampleRate, 1, format.sampleRate, 1); - webrtc::AudioBuffer farendBuffer(format.sampleRate, 1, format.sampleRate, 1, format.sampleRate, 1); - webrtc::AudioBuffer linearOutputBuffer(format.sampleRate, 1, format.sampleRate, 1, format.sampleRate, 1); - nearendBuffer.CopyFrom(reinterpret_cast(frame.data), config); + m_d->nearendBuffer->CopyFrom(reinterpret_cast(frame.data), config); - farendBuffer.CopyFrom(reinterpret_cast(m_farendBuffer.data()), config); + m_d->farendBuffer->CopyFrom(reinterpret_cast(m_farendBuffer.data()), config); - m_d->echoCanceller->AnalyzeRender(&farendBuffer); - m_d->echoCanceller->AnalyzeCapture(&nearendBuffer); - m_d->echoCanceller->ProcessCapture(&nearendBuffer, &linearOutputBuffer, /*level_change=*/false); + m_d->echoCanceller->AnalyzeRender(m_d->farendBuffer.get()); + m_d->echoCanceller->AnalyzeCapture(m_d->nearendBuffer.get()); + m_d->echoCanceller->ProcessCapture(m_d->nearendBuffer.get(), false); + // m_d->echoCanceller->ProcessCapture(&nearendBuffer, &linearOutputBuffer, /*level_change=*/false); - linearOutputBuffer.CopyTo(config, reinterpret_cast(m_outBuffer.data())); + m_d->nearendBuffer->CopyTo(config, reinterpret_cast(m_outBuffer.data())); } if (m_channels == 2) { diff --git a/VocieProcess/CMakeLists.txt b/VocieProcess/CMakeLists.txt index 3d9751e..7563aa5 100644 --- a/VocieProcess/CMakeLists.txt +++ b/VocieProcess/CMakeLists.txt @@ -28,25 +28,31 @@ add_library(VocieProcess common_audio/audio_util.cc common_audio/channel_buffer.h common_audio/channel_buffer.cc + common_audio/fir_filter_neon.h common_audio/fir_filter_neon.cc common_audio/ring_buffer.h common_audio/ring_buffer.c common_audio/resampler/push_sinc_resampler.h common_audio/resampler/push_sinc_resampler.cc - common_audio/resampler/sinc_resampler.h common_audio/resampler/sinc_resampler.cc + common_audio/resampler/sinc_resampler.h common_audio/resampler/sinc_resampler_neon.cc + common_audio/resampler/sinc_resampler.cc common_audio/signal_processing/complex_bit_reverse.c common_audio/signal_processing/complex_fft.c + common_audio/signal_processing/cross_correlation_neon.c common_audio/signal_processing/cross_correlation.c common_audio/signal_processing/division_operations.c common_audio/signal_processing/dot_product_with_scale.h common_audio/signal_processing/dot_product_with_scale.cc common_audio/signal_processing/downsample_fast.c + common_audio/signal_processing/downsample_fast_neon.c common_audio/signal_processing/min_max_operations.c + common_audio/signal_processing/min_max_operations_neon.c common_audio/signal_processing/randomization_functions.c common_audio/signal_processing/real_fft.c common_audio/signal_processing/spl_init.c common_audio/signal_processing/splitting_filter.c common_audio/signal_processing/vector_scaling_operations.c - common_audio/third_party/ooura/fft_size_128/ooura_fft.h common_audio/third_party/ooura/fft_size_128/ooura_fft.cc + common_audio/third_party/ooura/fft_size_128/ooura_fft.h common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc + common_audio/third_party/ooura/fft_size_128/ooura_fft.cc common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c rtc_base/checks.h rtc_base/checks.cc @@ -132,6 +138,7 @@ add_library(VocieProcess modules/audio_processing/aec3/transparent_mode.h modules/audio_processing/aec3/transparent_mode.cc modules/audio_processing/aecm/aecm_core.h modules/audio_processing/aecm/aecm_core.cc modules/audio_processing/aecm/aecm_core_c.cc + modules/audio_processing/aecm/aecm_core_neon.cc modules/audio_processing/aecm/echo_control_mobile.h modules/audio_processing/aecm/echo_control_mobile.cc modules/audio_processing/logging/apm_data_dumper.h modules/audio_processing/logging/apm_data_dumper.cc @@ -148,6 +155,7 @@ target_compile_definitions(VocieProcess PRIVATE NOMINMAX # PRIVATE RTC_DISABLE_LOGGING PUBLIC RTC_DISABLE_METRICS + PUBLIC WEBRTC_HAS_NEON PUBLIC WEBRTC_APM_DEBUG_DUMP=0 $<$:WEBRTC_WIN> $<$:WEBRTC_POSIX WEBRTC_LINUX> diff --git a/VocieProcess/common_audio/fir_filter.h b/VocieProcess/common_audio/fir_filter.h new file mode 100644 index 0000000..e0b18ca --- /dev/null +++ b/VocieProcess/common_audio/fir_filter.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef COMMON_AUDIO_FIR_FILTER_H_ +#define COMMON_AUDIO_FIR_FILTER_H_ + +#include + +namespace webrtc { + +// Finite Impulse Response filter using floating-point arithmetic. +class FIRFilter { + public: + virtual ~FIRFilter() {} + + // Filters the `in` data supplied. + // `out` must be previously allocated and it must be at least of `length`. + virtual void Filter(const float* in, size_t length, float* out) = 0; +}; + +} // namespace webrtc + +#endif // COMMON_AUDIO_FIR_FILTER_H_ diff --git a/VocieProcess/common_audio/fir_filter_neon.cc b/VocieProcess/common_audio/fir_filter_neon.cc new file mode 100644 index 0000000..346cb69 --- /dev/null +++ b/VocieProcess/common_audio/fir_filter_neon.cc @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "common_audio/fir_filter_neon.h" + +#include +#include + +#include "rtc_base/checks.h" +#include "rtc_base/memory/aligned_malloc.h" + +namespace webrtc { + +FIRFilterNEON::~FIRFilterNEON() {} + +FIRFilterNEON::FIRFilterNEON(const float* coefficients, + size_t coefficients_length, + size_t max_input_length) + : // Closest higher multiple of four. + coefficients_length_((coefficients_length + 3) & ~0x03), + state_length_(coefficients_length_ - 1), + coefficients_(static_cast( + AlignedMalloc(sizeof(float) * coefficients_length_, 16))), + state_(static_cast( + AlignedMalloc(sizeof(float) * (max_input_length + state_length_), + 16))) { + // Add zeros at the end of the coefficients. + size_t padding = coefficients_length_ - coefficients_length; + memset(coefficients_.get(), 0.f, padding * sizeof(coefficients_[0])); + // The coefficients are reversed to compensate for the order in which the + // input samples are acquired (most recent last). + for (size_t i = 0; i < coefficients_length; ++i) { + coefficients_[i + padding] = coefficients[coefficients_length - i - 1]; + } + memset(state_.get(), 0.f, + (max_input_length + state_length_) * sizeof(state_[0])); +} + +void FIRFilterNEON::Filter(const float* in, size_t length, float* out) { + RTC_DCHECK_GT(length, 0); + + memcpy(&state_[state_length_], in, length * sizeof(*in)); + + // Convolves the input signal `in` with the filter kernel `coefficients_` + // taking into account the previous state. + for (size_t i = 0; i < length; ++i) { + float* in_ptr = &state_[i]; + float* coef_ptr = coefficients_.get(); + + float32x4_t m_sum = vmovq_n_f32(0); + float32x4_t m_in; + + for (size_t j = 0; j < coefficients_length_; j += 4) { + m_in = vld1q_f32(in_ptr + j); + m_sum = vmlaq_f32(m_sum, m_in, vld1q_f32(coef_ptr + j)); + } + + float32x2_t m_half = vadd_f32(vget_high_f32(m_sum), vget_low_f32(m_sum)); + out[i] = vget_lane_f32(vpadd_f32(m_half, m_half), 0); + } + + // Update current state. + memmove(state_.get(), &state_[length], state_length_ * sizeof(state_[0])); +} + +} // namespace webrtc diff --git a/VocieProcess/common_audio/fir_filter_neon.h b/VocieProcess/common_audio/fir_filter_neon.h new file mode 100644 index 0000000..1ffefd8 --- /dev/null +++ b/VocieProcess/common_audio/fir_filter_neon.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef COMMON_AUDIO_FIR_FILTER_NEON_H_ +#define COMMON_AUDIO_FIR_FILTER_NEON_H_ + +#include + +#include "common_audio/fir_filter.h" +#include "rtc_base/memory/aligned_malloc.h" + +namespace webrtc { + +class FIRFilterNEON : public FIRFilter { + public: + FIRFilterNEON(const float* coefficients, + size_t coefficients_length, + size_t max_input_length); + ~FIRFilterNEON() override; + + void Filter(const float* in, size_t length, float* out) override; + + private: + size_t coefficients_length_; + size_t state_length_; + std::unique_ptr coefficients_; + std::unique_ptr state_; +}; + +} // namespace webrtc + +#endif // COMMON_AUDIO_FIR_FILTER_NEON_H_ diff --git a/VocieProcess/common_audio/resampler/sinc_resampler_neon.cc b/VocieProcess/common_audio/resampler/sinc_resampler_neon.cc new file mode 100644 index 0000000..9ee918b --- /dev/null +++ b/VocieProcess/common_audio/resampler/sinc_resampler_neon.cc @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// Modified from the Chromium original: +// src/media/base/sinc_resampler.cc + +#include + +#include "common_audio/resampler/sinc_resampler.h" + +namespace webrtc { + +float SincResampler::Convolve_NEON(const float* input_ptr, + const float* k1, + const float* k2, + double kernel_interpolation_factor) { + float32x4_t m_input; + float32x4_t m_sums1 = vmovq_n_f32(0); + float32x4_t m_sums2 = vmovq_n_f32(0); + + const float* upper = input_ptr + kKernelSize; + for (; input_ptr < upper;) { + m_input = vld1q_f32(input_ptr); + input_ptr += 4; + m_sums1 = vmlaq_f32(m_sums1, m_input, vld1q_f32(k1)); + k1 += 4; + m_sums2 = vmlaq_f32(m_sums2, m_input, vld1q_f32(k2)); + k2 += 4; + } + + // Linearly interpolate the two "convolutions". + m_sums1 = vmlaq_f32( + vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)), + m_sums2, vmovq_n_f32(kernel_interpolation_factor)); + + // Sum components together. + float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1)); + return vget_lane_f32(vpadd_f32(m_half, m_half), 0); +} + +} // namespace webrtc diff --git a/VocieProcess/common_audio/signal_processing/cross_correlation_neon.c b/VocieProcess/common_audio/signal_processing/cross_correlation_neon.c new file mode 100644 index 0000000..d3ecf13 --- /dev/null +++ b/VocieProcess/common_audio/signal_processing/cross_correlation_neon.c @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "common_audio/signal_processing/include/signal_processing_library.h" +#include "rtc_base/system/arch.h" + +#include + +static inline void DotProductWithScaleNeon(int32_t* cross_correlation, + const int16_t* vector1, + const int16_t* vector2, + size_t length, + int scaling) { + size_t i = 0; + size_t len1 = length >> 3; + size_t len2 = length & 7; + int64x2_t sum0 = vdupq_n_s64(0); + int64x2_t sum1 = vdupq_n_s64(0); + + for (i = len1; i > 0; i -= 1) { + int16x8_t seq1_16x8 = vld1q_s16(vector1); + int16x8_t seq2_16x8 = vld1q_s16(vector2); +#if defined(WEBRTC_ARCH_ARM64) + int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8), + vget_low_s16(seq2_16x8)); + int32x4_t tmp1 = vmull_high_s16(seq1_16x8, seq2_16x8); +#else + int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8), + vget_low_s16(seq2_16x8)); + int32x4_t tmp1 = vmull_s16(vget_high_s16(seq1_16x8), + vget_high_s16(seq2_16x8)); +#endif + sum0 = vpadalq_s32(sum0, tmp0); + sum1 = vpadalq_s32(sum1, tmp1); + vector1 += 8; + vector2 += 8; + } + + // Calculate the rest of the samples. + int64_t sum_res = 0; + for (i = len2; i > 0; i -= 1) { + sum_res += WEBRTC_SPL_MUL_16_16(*vector1, *vector2); + vector1++; + vector2++; + } + + sum0 = vaddq_s64(sum0, sum1); +#if defined(WEBRTC_ARCH_ARM64) + int64_t sum2 = vaddvq_s64(sum0); + *cross_correlation = (int32_t)((sum2 + sum_res) >> scaling); +#else + int64x1_t shift = vdup_n_s64(-scaling); + int64x1_t sum2 = vadd_s64(vget_low_s64(sum0), vget_high_s64(sum0)); + sum2 = vadd_s64(sum2, vdup_n_s64(sum_res)); + sum2 = vshl_s64(sum2, shift); + vst1_lane_s32(cross_correlation, vreinterpret_s32_s64(sum2), 0); +#endif +} + +/* NEON version of WebRtcSpl_CrossCorrelation() for ARM32/64 platforms. */ +void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation, + const int16_t* seq1, + const int16_t* seq2, + size_t dim_seq, + size_t dim_cross_correlation, + int right_shifts, + int step_seq2) { + int i = 0; + + for (i = 0; i < (int)dim_cross_correlation; i++) { + const int16_t* seq1_ptr = seq1; + const int16_t* seq2_ptr = seq2 + (step_seq2 * i); + + DotProductWithScaleNeon(cross_correlation, + seq1_ptr, + seq2_ptr, + dim_seq, + right_shifts); + cross_correlation++; + } +} diff --git a/VocieProcess/common_audio/signal_processing/downsample_fast_neon.c b/VocieProcess/common_audio/signal_processing/downsample_fast_neon.c new file mode 100644 index 0000000..f1b754b --- /dev/null +++ b/VocieProcess/common_audio/signal_processing/downsample_fast_neon.c @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "common_audio/signal_processing/include/signal_processing_library.h" + +#include "rtc_base/checks.h" + +// NEON intrinsics version of WebRtcSpl_DownsampleFast() +// for ARM 32-bit/64-bit platforms. +int WebRtcSpl_DownsampleFastNeon(const int16_t* data_in, + size_t data_in_length, + int16_t* data_out, + size_t data_out_length, + const int16_t* __restrict coefficients, + size_t coefficients_length, + int factor, + size_t delay) { + // Using signed indexes to be able to compute negative i-j that + // is used to index data_in. + int i = 0; + int j = 0; + int32_t out_s32 = 0; + int endpos = delay + factor * (data_out_length - 1) + 1; + size_t res = data_out_length & 0x7; + int endpos1 = endpos - factor * res; + + // Return error if any of the running conditions doesn't meet. + if (data_out_length == 0 || coefficients_length == 0 + || (int)data_in_length < endpos) { + return -1; + } + + RTC_DCHECK_GE(endpos, 0); + RTC_DCHECK_GE(endpos1, 0); + + // First part, unroll the loop 8 times, with 3 subcases + // (factor == 2, 4, others). + switch (factor) { + case 2: { + for (i = delay; i < endpos1; i += 16) { + // Round value, 0.5 in Q12. + int32x4_t out32x4_0 = vdupq_n_s32(2048); + int32x4_t out32x4_1 = vdupq_n_s32(2048); + +#if defined(WEBRTC_ARCH_ARM64) + // Unroll the loop 2 times. + for (j = 0; j < (int)coefficients_length - 1; j += 2) { + int32x2_t coeff32 = vld1_dup_s32((int32_t*)&coefficients[j]); + int16x4_t coeff16x4 = vreinterpret_s16_s32(coeff32); + int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j - 1]); + + // Mul and accumulate low 64-bit data. + int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]); + int16x4_t in16x4_1 = vget_low_s16(in16x8x2.val[1]); + out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 1); + out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_1, coeff16x4, 0); + + // Mul and accumulate high 64-bit data. + // TODO: vget_high_s16 need extra cost on ARM64. This could be + // replaced by vmlal_high_lane_s16. But for the interface of + // vmlal_high_lane_s16, there is a bug in gcc 4.9. + // This issue need to be tracked in the future. + int16x4_t in16x4_2 = vget_high_s16(in16x8x2.val[0]); + int16x4_t in16x4_3 = vget_high_s16(in16x8x2.val[1]); + out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_2, coeff16x4, 1); + out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 0); + } + + for (; j < (int)coefficients_length; j++) { + int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]); + int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]); + + // Mul and accumulate low 64-bit data. + int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]); + out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0); + + // Mul and accumulate high 64-bit data. + // TODO: vget_high_s16 need extra cost on ARM64. This could be + // replaced by vmlal_high_lane_s16. But for the interface of + // vmlal_high_lane_s16, there is a bug in gcc 4.9. + // This issue need to be tracked in the future. + int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]); + out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0); + } +#else + // On ARMv7, the loop unrolling 2 times results in performance + // regression. + for (j = 0; j < (int)coefficients_length; j++) { + int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]); + int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]); + + // Mul and accumulate. + int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]); + int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]); + out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0); + out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0); + } +#endif + + // Saturate and store the output. + int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12); + int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12); + vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1)); + data_out += 8; + } + break; + } + case 4: { + for (i = delay; i < endpos1; i += 32) { + // Round value, 0.5 in Q12. + int32x4_t out32x4_0 = vdupq_n_s32(2048); + int32x4_t out32x4_1 = vdupq_n_s32(2048); + + // Unroll the loop 4 times. + for (j = 0; j < (int)coefficients_length - 3; j += 4) { + int16x4_t coeff16x4 = vld1_s16(&coefficients[j]); + int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j - 3]); + + // Mul and accumulate low 64-bit data. + int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]); + int16x4_t in16x4_2 = vget_low_s16(in16x8x4.val[1]); + int16x4_t in16x4_4 = vget_low_s16(in16x8x4.val[2]); + int16x4_t in16x4_6 = vget_low_s16(in16x8x4.val[3]); + out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 3); + out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_2, coeff16x4, 2); + out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_4, coeff16x4, 1); + out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_6, coeff16x4, 0); + + // Mul and accumulate high 64-bit data. + // TODO: vget_high_s16 need extra cost on ARM64. This could be + // replaced by vmlal_high_lane_s16. But for the interface of + // vmlal_high_lane_s16, there is a bug in gcc 4.9. + // This issue need to be tracked in the future. + int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]); + int16x4_t in16x4_3 = vget_high_s16(in16x8x4.val[1]); + int16x4_t in16x4_5 = vget_high_s16(in16x8x4.val[2]); + int16x4_t in16x4_7 = vget_high_s16(in16x8x4.val[3]); + out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 3); + out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 2); + out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_5, coeff16x4, 1); + out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_7, coeff16x4, 0); + } + + for (; j < (int)coefficients_length; j++) { + int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]); + int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j]); + + // Mul and accumulate low 64-bit data. + int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]); + out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0); + + // Mul and accumulate high 64-bit data. + // TODO: vget_high_s16 need extra cost on ARM64. This could be + // replaced by vmlal_high_lane_s16. But for the interface of + // vmlal_high_lane_s16, there is a bug in gcc 4.9. + // This issue need to be tracked in the future. + int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]); + out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0); + } + + // Saturate and store the output. + int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12); + int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12); + vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1)); + data_out += 8; + } + break; + } + default: { + for (i = delay; i < endpos1; i += factor * 8) { + // Round value, 0.5 in Q12. + int32x4_t out32x4_0 = vdupq_n_s32(2048); + int32x4_t out32x4_1 = vdupq_n_s32(2048); + + for (j = 0; j < (int)coefficients_length; j++) { + int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]); + int16x4_t in16x4_0 = vld1_dup_s16(&data_in[i - j]); + in16x4_0 = vld1_lane_s16(&data_in[i + factor - j], in16x4_0, 1); + in16x4_0 = vld1_lane_s16(&data_in[i + factor * 2 - j], in16x4_0, 2); + in16x4_0 = vld1_lane_s16(&data_in[i + factor * 3 - j], in16x4_0, 3); + int16x4_t in16x4_1 = vld1_dup_s16(&data_in[i + factor * 4 - j]); + in16x4_1 = vld1_lane_s16(&data_in[i + factor * 5 - j], in16x4_1, 1); + in16x4_1 = vld1_lane_s16(&data_in[i + factor * 6 - j], in16x4_1, 2); + in16x4_1 = vld1_lane_s16(&data_in[i + factor * 7 - j], in16x4_1, 3); + + // Mul and accumulate. + out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0); + out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0); + } + + // Saturate and store the output. + int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12); + int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12); + vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1)); + data_out += 8; + } + break; + } + } + + // Second part, do the rest iterations (if any). + for (; i < endpos; i += factor) { + out_s32 = 2048; // Round value, 0.5 in Q12. + + for (j = 0; j < (int)coefficients_length; j++) { + out_s32 = WebRtc_MulAccumW16(coefficients[j], data_in[i - j], out_s32); + } + + // Saturate and store the output. + out_s32 >>= 12; + *data_out++ = WebRtcSpl_SatW32ToW16(out_s32); + } + + return 0; +} diff --git a/VocieProcess/common_audio/signal_processing/min_max_operations_neon.c b/VocieProcess/common_audio/signal_processing/min_max_operations_neon.c new file mode 100644 index 0000000..e5b4b7c --- /dev/null +++ b/VocieProcess/common_audio/signal_processing/min_max_operations_neon.c @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "rtc_base/checks.h" +#include "common_audio/signal_processing/include/signal_processing_library.h" + +// Maximum absolute value of word16 vector. C version for generic platforms. +int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, size_t length) { + int absolute = 0, maximum = 0; + + RTC_DCHECK_GT(length, 0); + + const int16_t* p_start = vector; + size_t rest = length & 7; + const int16_t* p_end = vector + length - rest; + + int16x8_t v; + uint16x8_t max_qv; + max_qv = vdupq_n_u16(0); + + while (p_start < p_end) { + v = vld1q_s16(p_start); + // Note vabs doesn't change the value of -32768. + v = vabsq_s16(v); + // Use u16 so we don't lose the value -32768. + max_qv = vmaxq_u16(max_qv, vreinterpretq_u16_s16(v)); + p_start += 8; + } + +#ifdef WEBRTC_ARCH_ARM64 + maximum = (int)vmaxvq_u16(max_qv); +#else + uint16x4_t max_dv; + max_dv = vmax_u16(vget_low_u16(max_qv), vget_high_u16(max_qv)); + max_dv = vpmax_u16(max_dv, max_dv); + max_dv = vpmax_u16(max_dv, max_dv); + + maximum = (int)vget_lane_u16(max_dv, 0); +#endif + + p_end = vector + length; + while (p_start < p_end) { + absolute = abs((int)(*p_start)); + + if (absolute > maximum) { + maximum = absolute; + } + p_start++; + } + + // Guard the case for abs(-32768). + if (maximum > WEBRTC_SPL_WORD16_MAX) { + maximum = WEBRTC_SPL_WORD16_MAX; + } + + return (int16_t)maximum; +} + +// Maximum absolute value of word32 vector. NEON intrinsics version for +// ARM 32-bit/64-bit platforms. +int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, size_t length) { + // Use uint32_t for the local variables, to accommodate the return value + // of abs(0x80000000), which is 0x80000000. + + uint32_t absolute = 0, maximum = 0; + size_t i = 0; + size_t residual = length & 0x7; + + RTC_DCHECK_GT(length, 0); + + const int32_t* p_start = vector; + uint32x4_t max32x4_0 = vdupq_n_u32(0); + uint32x4_t max32x4_1 = vdupq_n_u32(0); + + // First part, unroll the loop 8 times. + for (i = 0; i < length - residual; i += 8) { + int32x4_t in32x4_0 = vld1q_s32(p_start); + p_start += 4; + int32x4_t in32x4_1 = vld1q_s32(p_start); + p_start += 4; + in32x4_0 = vabsq_s32(in32x4_0); + in32x4_1 = vabsq_s32(in32x4_1); + // vabs doesn't change the value of 0x80000000. + // Use u32 so we don't lose the value 0x80000000. + max32x4_0 = vmaxq_u32(max32x4_0, vreinterpretq_u32_s32(in32x4_0)); + max32x4_1 = vmaxq_u32(max32x4_1, vreinterpretq_u32_s32(in32x4_1)); + } + + uint32x4_t max32x4 = vmaxq_u32(max32x4_0, max32x4_1); +#if defined(WEBRTC_ARCH_ARM64) + maximum = vmaxvq_u32(max32x4); +#else + uint32x2_t max32x2 = vmax_u32(vget_low_u32(max32x4), vget_high_u32(max32x4)); + max32x2 = vpmax_u32(max32x2, max32x2); + + maximum = vget_lane_u32(max32x2, 0); +#endif + + // Second part, do the remaining iterations (if any). + for (i = residual; i > 0; i--) { + absolute = abs((int)(*p_start)); + if (absolute > maximum) { + maximum = absolute; + } + p_start++; + } + + // Guard against the case for 0x80000000. + maximum = WEBRTC_SPL_MIN(maximum, WEBRTC_SPL_WORD32_MAX); + + return (int32_t)maximum; +} + +// Maximum value of word16 vector. NEON intrinsics version for +// ARM 32-bit/64-bit platforms. +int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, size_t length) { + int16_t maximum = WEBRTC_SPL_WORD16_MIN; + size_t i = 0; + size_t residual = length & 0x7; + + RTC_DCHECK_GT(length, 0); + + const int16_t* p_start = vector; + int16x8_t max16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN); + + // First part, unroll the loop 8 times. + for (i = 0; i < length - residual; i += 8) { + int16x8_t in16x8 = vld1q_s16(p_start); + max16x8 = vmaxq_s16(max16x8, in16x8); + p_start += 8; + } + +#if defined(WEBRTC_ARCH_ARM64) + maximum = vmaxvq_s16(max16x8); +#else + int16x4_t max16x4 = vmax_s16(vget_low_s16(max16x8), vget_high_s16(max16x8)); + max16x4 = vpmax_s16(max16x4, max16x4); + max16x4 = vpmax_s16(max16x4, max16x4); + + maximum = vget_lane_s16(max16x4, 0); +#endif + + // Second part, do the remaining iterations (if any). + for (i = residual; i > 0; i--) { + if (*p_start > maximum) + maximum = *p_start; + p_start++; + } + return maximum; +} + +// Maximum value of word32 vector. NEON intrinsics version for +// ARM 32-bit/64-bit platforms. +int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, size_t length) { + int32_t maximum = WEBRTC_SPL_WORD32_MIN; + size_t i = 0; + size_t residual = length & 0x7; + + RTC_DCHECK_GT(length, 0); + + const int32_t* p_start = vector; + int32x4_t max32x4_0 = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN); + int32x4_t max32x4_1 = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN); + + // First part, unroll the loop 8 times. + for (i = 0; i < length - residual; i += 8) { + int32x4_t in32x4_0 = vld1q_s32(p_start); + p_start += 4; + int32x4_t in32x4_1 = vld1q_s32(p_start); + p_start += 4; + max32x4_0 = vmaxq_s32(max32x4_0, in32x4_0); + max32x4_1 = vmaxq_s32(max32x4_1, in32x4_1); + } + + int32x4_t max32x4 = vmaxq_s32(max32x4_0, max32x4_1); +#if defined(WEBRTC_ARCH_ARM64) + maximum = vmaxvq_s32(max32x4); +#else + int32x2_t max32x2 = vmax_s32(vget_low_s32(max32x4), vget_high_s32(max32x4)); + max32x2 = vpmax_s32(max32x2, max32x2); + + maximum = vget_lane_s32(max32x2, 0); +#endif + + // Second part, do the remaining iterations (if any). + for (i = residual; i > 0; i--) { + if (*p_start > maximum) + maximum = *p_start; + p_start++; + } + return maximum; +} + +// Minimum value of word16 vector. NEON intrinsics version for +// ARM 32-bit/64-bit platforms. +int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, size_t length) { + int16_t minimum = WEBRTC_SPL_WORD16_MAX; + size_t i = 0; + size_t residual = length & 0x7; + + RTC_DCHECK_GT(length, 0); + + const int16_t* p_start = vector; + int16x8_t min16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX); + + // First part, unroll the loop 8 times. + for (i = 0; i < length - residual; i += 8) { + int16x8_t in16x8 = vld1q_s16(p_start); + min16x8 = vminq_s16(min16x8, in16x8); + p_start += 8; + } + +#if defined(WEBRTC_ARCH_ARM64) + minimum = vminvq_s16(min16x8); +#else + int16x4_t min16x4 = vmin_s16(vget_low_s16(min16x8), vget_high_s16(min16x8)); + min16x4 = vpmin_s16(min16x4, min16x4); + min16x4 = vpmin_s16(min16x4, min16x4); + + minimum = vget_lane_s16(min16x4, 0); +#endif + + // Second part, do the remaining iterations (if any). + for (i = residual; i > 0; i--) { + if (*p_start < minimum) + minimum = *p_start; + p_start++; + } + return minimum; +} + +// Minimum value of word32 vector. NEON intrinsics version for +// ARM 32-bit/64-bit platforms. +int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, size_t length) { + int32_t minimum = WEBRTC_SPL_WORD32_MAX; + size_t i = 0; + size_t residual = length & 0x7; + + RTC_DCHECK_GT(length, 0); + + const int32_t* p_start = vector; + int32x4_t min32x4_0 = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX); + int32x4_t min32x4_1 = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX); + + // First part, unroll the loop 8 times. + for (i = 0; i < length - residual; i += 8) { + int32x4_t in32x4_0 = vld1q_s32(p_start); + p_start += 4; + int32x4_t in32x4_1 = vld1q_s32(p_start); + p_start += 4; + min32x4_0 = vminq_s32(min32x4_0, in32x4_0); + min32x4_1 = vminq_s32(min32x4_1, in32x4_1); + } + + int32x4_t min32x4 = vminq_s32(min32x4_0, min32x4_1); +#if defined(WEBRTC_ARCH_ARM64) + minimum = vminvq_s32(min32x4); +#else + int32x2_t min32x2 = vmin_s32(vget_low_s32(min32x4), vget_high_s32(min32x4)); + min32x2 = vpmin_s32(min32x2, min32x2); + + minimum = vget_lane_s32(min32x2, 0); +#endif + + // Second part, do the remaining iterations (if any). + for (i = residual; i > 0; i--) { + if (*p_start < minimum) + minimum = *p_start; + p_start++; + } + return minimum; +} + +// Finds both the minimum and maximum elements in an array of 16-bit integers. +void WebRtcSpl_MinMaxW16Neon(const int16_t* vector, size_t length, + int16_t* min_val, int16_t* max_val) { + int16_t minimum = WEBRTC_SPL_WORD16_MAX; + int16_t maximum = WEBRTC_SPL_WORD16_MIN; + size_t i = 0; + size_t residual = length & 0x7; + + RTC_DCHECK_GT(length, 0); + + const int16_t* p_start = vector; + int16x8_t min16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX); + int16x8_t max16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN); + + // First part, unroll the loop 8 times. + for (i = 0; i < length - residual; i += 8) { + int16x8_t in16x8 = vld1q_s16(p_start); + min16x8 = vminq_s16(min16x8, in16x8); + max16x8 = vmaxq_s16(max16x8, in16x8); + p_start += 8; + } + +#if defined(WEBRTC_ARCH_ARM64) + minimum = vminvq_s16(min16x8); + maximum = vmaxvq_s16(max16x8); +#else + int16x4_t min16x4 = vmin_s16(vget_low_s16(min16x8), vget_high_s16(min16x8)); + min16x4 = vpmin_s16(min16x4, min16x4); + min16x4 = vpmin_s16(min16x4, min16x4); + + minimum = vget_lane_s16(min16x4, 0); + + int16x4_t max16x4 = vmax_s16(vget_low_s16(max16x8), vget_high_s16(max16x8)); + max16x4 = vpmax_s16(max16x4, max16x4); + max16x4 = vpmax_s16(max16x4, max16x4); + + maximum = vget_lane_s16(max16x4, 0); +#endif + + // Second part, do the remaining iterations (if any). + for (i = residual; i > 0; i--) { + if (*p_start < minimum) + minimum = *p_start; + if (*p_start > maximum) + maximum = *p_start; + p_start++; + } + *min_val = minimum; + *max_val = maximum; +} diff --git a/VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc b/VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc new file mode 100644 index 0000000..acab972 --- /dev/null +++ b/VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * The rdft AEC algorithm, neon version of speed-critical functions. + * + * Based on the sse2 version. + */ + +#include + +#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h" +#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_common.h" +#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h" + +namespace webrtc { + +#if defined(WEBRTC_HAS_NEON) +void cft1st_128_neon(float* a) { + const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign); + int j, k2; + + for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { + float32x4_t a00v = vld1q_f32(&a[j + 0]); + float32x4_t a04v = vld1q_f32(&a[j + 4]); + float32x4_t a08v = vld1q_f32(&a[j + 8]); + float32x4_t a12v = vld1q_f32(&a[j + 12]); + float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v)); + float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v)); + float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v)); + float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v)); + const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]); + const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]); + const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]); + const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]); + const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]); + const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]); + float32x4_t x0v = vaddq_f32(a01v, a23v); + const float32x4_t x1v = vsubq_f32(a01v, a23v); + const float32x4_t x2v = vaddq_f32(a45v, a67v); + const float32x4_t x3v = vsubq_f32(a45v, a67v); + const float32x4_t x3w = vrev64q_f32(x3v); + float32x4_t x0w; + a01v = vaddq_f32(x0v, x2v); + x0v = vsubq_f32(x0v, x2v); + x0w = vrev64q_f32(x0v); + a45v = vmulq_f32(wk2rv, x0v); + a45v = vmlaq_f32(a45v, wk2iv, x0w); + x0v = vmlaq_f32(x1v, x3w, vec_swap_sign); + x0w = vrev64q_f32(x0v); + a23v = vmulq_f32(wk1rv, x0v); + a23v = vmlaq_f32(a23v, wk1iv, x0w); + x0v = vmlsq_f32(x1v, x3w, vec_swap_sign); + x0w = vrev64q_f32(x0v); + a67v = vmulq_f32(wk3rv, x0v); + a67v = vmlaq_f32(a67v, wk3iv, x0w); + a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v)); + a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v)); + a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v)); + a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v)); + vst1q_f32(&a[j + 0], a00v); + vst1q_f32(&a[j + 4], a04v); + vst1q_f32(&a[j + 8], a08v); + vst1q_f32(&a[j + 12], a12v); + } +} + +void cftmdl_128_neon(float* a) { + int j; + const int l = 8; + const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign); + float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r); + + for (j = 0; j < l; j += 2) { + const float32x2_t a_00 = vld1_f32(&a[j + 0]); + const float32x2_t a_08 = vld1_f32(&a[j + 8]); + const float32x2_t a_32 = vld1_f32(&a[j + 32]); + const float32x2_t a_40 = vld1_f32(&a[j + 40]); + const float32x4_t a_00_32 = vcombine_f32(a_00, a_32); + const float32x4_t a_08_40 = vcombine_f32(a_08, a_40); + const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40); + const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40); + const float32x2_t a_16 = vld1_f32(&a[j + 16]); + const float32x2_t a_24 = vld1_f32(&a[j + 24]); + const float32x2_t a_48 = vld1_f32(&a[j + 48]); + const float32x2_t a_56 = vld1_f32(&a[j + 56]); + const float32x4_t a_16_48 = vcombine_f32(a_16, a_48); + const float32x4_t a_24_56 = vcombine_f32(a_24, a_56); + const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56); + const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56); + const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1); + const float32x4_t x1_x3_add = + vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1); + const float32x4_t x1_x3_sub = + vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1); + const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0); + const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0); + const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s); + const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1); + const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1); + const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s); + const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as); + const float32x4_t yy4 = vmulq_f32(wk1rv, yy0); + const float32x4_t xx1_rev = vrev64q_f32(xx1); + const float32x4_t yy4_rev = vrev64q_f32(yy4); + + vst1_f32(&a[j + 0], vget_low_f32(xx0)); + vst1_f32(&a[j + 32], vget_high_f32(xx0)); + vst1_f32(&a[j + 16], vget_low_f32(xx1)); + vst1_f32(&a[j + 48], vget_high_f32(xx1_rev)); + + a[j + 48] = -a[j + 48]; + + vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add)); + vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub)); + vst1_f32(&a[j + 40], vget_low_f32(yy4)); + vst1_f32(&a[j + 56], vget_high_f32(yy4_rev)); + } + + { + const int k = 64; + const int k1 = 2; + const int k2 = 2 * k1; + const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]); + const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]); + const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]); + const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]); + const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]); + wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]); + for (j = k; j < l + k; j += 2) { + const float32x2_t a_00 = vld1_f32(&a[j + 0]); + const float32x2_t a_08 = vld1_f32(&a[j + 8]); + const float32x2_t a_32 = vld1_f32(&a[j + 32]); + const float32x2_t a_40 = vld1_f32(&a[j + 40]); + const float32x4_t a_00_32 = vcombine_f32(a_00, a_32); + const float32x4_t a_08_40 = vcombine_f32(a_08, a_40); + const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40); + const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40); + const float32x2_t a_16 = vld1_f32(&a[j + 16]); + const float32x2_t a_24 = vld1_f32(&a[j + 24]); + const float32x2_t a_48 = vld1_f32(&a[j + 48]); + const float32x2_t a_56 = vld1_f32(&a[j + 56]); + const float32x4_t a_16_48 = vcombine_f32(a_16, a_48); + const float32x4_t a_24_56 = vcombine_f32(a_24, a_56); + const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56); + const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56); + const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); + const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1); + const float32x4_t x1_x3_add = + vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1); + const float32x4_t x1_x3_sub = + vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1); + float32x4_t xx4 = vmulq_f32(wk2rv, xx1); + float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add); + float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub); + xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1)); + xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add)); + xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub)); + + vst1_f32(&a[j + 0], vget_low_f32(xx)); + vst1_f32(&a[j + 32], vget_high_f32(xx)); + vst1_f32(&a[j + 16], vget_low_f32(xx4)); + vst1_f32(&a[j + 48], vget_high_f32(xx4)); + vst1_f32(&a[j + 8], vget_low_f32(xx12)); + vst1_f32(&a[j + 40], vget_high_f32(xx12)); + vst1_f32(&a[j + 24], vget_low_f32(xx22)); + vst1_f32(&a[j + 56], vget_high_f32(xx22)); + } + } +} + +__inline static float32x4_t reverse_order_f32x4(float32x4_t in) { + // A B C D -> C D A B + const float32x4_t rev = vcombine_f32(vget_high_f32(in), vget_low_f32(in)); + // C D A B -> D C B A + return vrev64q_f32(rev); +} + +void rftfsub_128_neon(float* a) { + const float* c = rdft_w + 32; + int j1, j2; + const float32x4_t mm_half = vdupq_n_f32(0.5f); + + // Vectorized code (four at once). + // Note: commented number are indexes for the first iteration of the loop. + for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { + // Load 'wk'. + const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4, + const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31, + const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31, + const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28, + const float32x4_t wki_ = c_j1; // 1, 2, 3, 4, + // Load and shuffle 'a'. + // 2, 4, 6, 8, 3, 5, 7, 9 + float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]); + // 120, 122, 124, 126, 121, 123, 125, 127, + const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]); + // 126, 124, 122, 120 + const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]); + // 127, 125, 123, 121 + const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]); + // Calculate 'x'. + const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0); + // 2-126, 4-124, 6-122, 8-120, + const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1); + // 3-127, 5-125, 7-123, 9-121, + // Calculate product into 'y'. + // yr = wkr * xr - wki * xi; + // yi = wkr * xi + wki * xr; + const float32x4_t a_ = vmulq_f32(wkr_, xr_); + const float32x4_t b_ = vmulq_f32(wki_, xi_); + const float32x4_t c_ = vmulq_f32(wkr_, xi_); + const float32x4_t d_ = vmulq_f32(wki_, xr_); + const float32x4_t yr_ = vsubq_f32(a_, b_); // 2-126, 4-124, 6-122, 8-120, + const float32x4_t yi_ = vaddq_f32(c_, d_); // 3-127, 5-125, 7-123, 9-121, + // Update 'a'. + // a[j2 + 0] -= yr; + // a[j2 + 1] -= yi; + // a[k2 + 0] += yr; + // a[k2 + 1] -= yi; + // 126, 124, 122, 120, + const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_); + // 127, 125, 123, 121, + const float32x4_t a_k2_p1n = vsubq_f32(a_k2_p1, yi_); + // Shuffle in right order and store. + const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n); + const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n); + // 124, 125, 126, 127, 120, 121, 122, 123 + const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr); + // 2, 4, 6, 8, + a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_); + // 3, 5, 7, 9, + a_j2_p.val[1] = vsubq_f32(a_j2_p.val[1], yi_); + // 2, 3, 4, 5, 6, 7, 8, 9, + vst2q_f32(&a[0 + j2], a_j2_p); + + vst1q_f32(&a[122 - j2], a_k2_n.val[1]); + vst1q_f32(&a[126 - j2], a_k2_n.val[0]); + } + + // Scalar code for the remaining items. + for (; j2 < 64; j1 += 1, j2 += 2) { + const int k2 = 128 - j2; + const int k1 = 32 - j1; + const float wkr = 0.5f - c[k1]; + const float wki = c[j1]; + const float xr = a[j2 + 0] - a[k2 + 0]; + const float xi = a[j2 + 1] + a[k2 + 1]; + const float yr = wkr * xr - wki * xi; + const float yi = wkr * xi + wki * xr; + a[j2 + 0] -= yr; + a[j2 + 1] -= yi; + a[k2 + 0] += yr; + a[k2 + 1] -= yi; + } +} + +void rftbsub_128_neon(float* a) { + const float* c = rdft_w + 32; + int j1, j2; + const float32x4_t mm_half = vdupq_n_f32(0.5f); + + a[1] = -a[1]; + // Vectorized code (four at once). + // Note: commented number are indexes for the first iteration of the loop. + for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { + // Load 'wk'. + const float32x4_t c_j1 = vld1q_f32(&c[j1]); // 1, 2, 3, 4, + const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]); // 28, 29, 30, 31, + const float32x4_t wkrt = vsubq_f32(mm_half, c_k1); // 28, 29, 30, 31, + const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28, + const float32x4_t wki_ = c_j1; // 1, 2, 3, 4, + // Load and shuffle 'a'. + // 2, 4, 6, 8, 3, 5, 7, 9 + float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]); + // 120, 122, 124, 126, 121, 123, 125, 127, + const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]); + // 126, 124, 122, 120 + const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]); + // 127, 125, 123, 121 + const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]); + // Calculate 'x'. + const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0); + // 2-126, 4-124, 6-122, 8-120, + const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1); + // 3-127, 5-125, 7-123, 9-121, + // Calculate product into 'y'. + // yr = wkr * xr - wki * xi; + // yi = wkr * xi + wki * xr; + const float32x4_t a_ = vmulq_f32(wkr_, xr_); + const float32x4_t b_ = vmulq_f32(wki_, xi_); + const float32x4_t c_ = vmulq_f32(wkr_, xi_); + const float32x4_t d_ = vmulq_f32(wki_, xr_); + const float32x4_t yr_ = vaddq_f32(a_, b_); // 2-126, 4-124, 6-122, 8-120, + const float32x4_t yi_ = vsubq_f32(c_, d_); // 3-127, 5-125, 7-123, 9-121, + // Update 'a'. + // a[j2 + 0] -= yr; + // a[j2 + 1] -= yi; + // a[k2 + 0] += yr; + // a[k2 + 1] -= yi; + // 126, 124, 122, 120, + const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_); + // 127, 125, 123, 121, + const float32x4_t a_k2_p1n = vsubq_f32(yi_, a_k2_p1); + // Shuffle in right order and store. + // 2, 3, 4, 5, 6, 7, 8, 9, + const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n); + const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n); + // 124, 125, 126, 127, 120, 121, 122, 123 + const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr); + // 2, 4, 6, 8, + a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_); + // 3, 5, 7, 9, + a_j2_p.val[1] = vsubq_f32(yi_, a_j2_p.val[1]); + // 2, 3, 4, 5, 6, 7, 8, 9, + vst2q_f32(&a[0 + j2], a_j2_p); + + vst1q_f32(&a[122 - j2], a_k2_n.val[1]); + vst1q_f32(&a[126 - j2], a_k2_n.val[0]); + } + + // Scalar code for the remaining items. + for (; j2 < 64; j1 += 1, j2 += 2) { + const int k2 = 128 - j2; + const int k1 = 32 - j1; + const float wkr = 0.5f - c[k1]; + const float wki = c[j1]; + const float xr = a[j2 + 0] - a[k2 + 0]; + const float xi = a[j2 + 1] + a[k2 + 1]; + const float yr = wkr * xr + wki * xi; + const float yi = wkr * xi - wki * xr; + a[j2 + 0] = a[j2 + 0] - yr; + a[j2 + 1] = yi - a[j2 + 1]; + a[k2 + 0] = yr + a[k2 + 0]; + a[k2 + 1] = yi - a[k2 + 1]; + } + a[65] = -a[65]; +} +#endif + +} // namespace webrtc diff --git a/VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h b/VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h new file mode 100644 index 0000000..a63d187 --- /dev/null +++ b/VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_ +#define MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_ + +#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h" +#include "rtc_base/system/arch.h" + +#ifdef _MSC_VER /* visual c++ */ +#define ALIGN16_BEG __declspec(align(16)) +#define ALIGN16_END +#else /* gcc or icc */ +#define ALIGN16_BEG +#define ALIGN16_END __attribute__((aligned(16))) +#endif + +namespace webrtc { + +// These tables used to be computed at run-time. For example, refer to: +// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/utility/apm_rdft.c?r=6564 +// to see the initialization code. +#if defined(WEBRTC_ARCH_X86_FAMILY) || defined(WEBRTC_HAS_NEON) +// Constants used by SSE2 and NEON but initialized in the C path. +const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f}; + +ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = { + 1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f, 0.923879564f, + 0.923879564f, 0.382683456f, 0.382683456f, 0.980785251f, 0.980785251f, + 0.555570245f, 0.555570245f, 0.831469595f, 0.831469595f, 0.195090324f, + 0.195090324f, 0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f, + 0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f, 0.956940353f, + 0.956940353f, 0.471396744f, 0.471396744f, 0.773010433f, 0.773010433f, + 0.098017141f, 0.098017141f, +}; +ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = { + 1.000000000f, 1.000000000f, -0.000000000f, -0.000000000f, 0.707106769f, + 0.707106769f, -0.707106769f, -0.707106769f, 0.923879564f, 0.923879564f, + -0.382683456f, -0.382683456f, 0.382683456f, 0.382683456f, -0.923879564f, + -0.923879564f, 0.980785251f, 0.980785251f, -0.195090324f, -0.195090324f, + 0.555570245f, 0.555570245f, -0.831469595f, -0.831469595f, 0.831469595f, + 0.831469595f, -0.555570245f, -0.555570245f, 0.195090324f, 0.195090324f, + -0.980785251f, -0.980785251f, +}; +ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = { + 1.000000000f, 1.000000000f, -0.707106769f, -0.707106769f, 0.382683456f, + 0.382683456f, -0.923879564f, -0.923879564f, 0.831469536f, 0.831469536f, + -0.980785251f, -0.980785251f, -0.195090353f, -0.195090353f, -0.555570245f, + -0.555570245f, 0.956940353f, 0.956940353f, -0.881921172f, -0.881921172f, + 0.098017156f, 0.098017156f, -0.773010492f, -0.773010492f, 0.634393334f, + 0.634393334f, -0.995184720f, -0.995184720f, -0.471396863f, -0.471396863f, + -0.290284693f, -0.290284693f, +}; +ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = { + -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f, -0.382683456f, + 0.382683456f, -0.923879564f, 0.923879564f, -0.195090324f, 0.195090324f, + -0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f, -0.980785251f, + 0.980785251f, -0.098017141f, 0.098017141f, -0.773010433f, 0.773010433f, + -0.471396744f, 0.471396744f, -0.956940353f, 0.956940353f, -0.290284663f, + 0.290284663f, -0.881921291f, 0.881921291f, -0.634393334f, 0.634393334f, + -0.995184720f, 0.995184720f, +}; +ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = { + -0.000000000f, 0.000000000f, -1.000000000f, 1.000000000f, -0.707106769f, + 0.707106769f, -0.707106769f, 0.707106769f, -0.382683456f, 0.382683456f, + -0.923879564f, 0.923879564f, -0.923879564f, 0.923879564f, -0.382683456f, + 0.382683456f, -0.195090324f, 0.195090324f, -0.980785251f, 0.980785251f, + -0.831469595f, 0.831469595f, -0.555570245f, 0.555570245f, -0.555570245f, + 0.555570245f, -0.831469595f, 0.831469595f, -0.980785251f, 0.980785251f, + -0.195090324f, 0.195090324f, +}; +ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = { + -0.000000000f, 0.000000000f, -0.707106769f, 0.707106769f, -0.923879564f, + 0.923879564f, 0.382683456f, -0.382683456f, -0.555570245f, 0.555570245f, + -0.195090353f, 0.195090353f, -0.980785251f, 0.980785251f, 0.831469536f, + -0.831469536f, -0.290284693f, 0.290284693f, -0.471396863f, 0.471396863f, + -0.995184720f, 0.995184720f, 0.634393334f, -0.634393334f, -0.773010492f, + 0.773010492f, 0.098017156f, -0.098017156f, -0.881921172f, 0.881921172f, + 0.956940353f, -0.956940353f, +}; +ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = { + 0.707106769f, + 0.707106769f, + 0.707106769f, + -0.707106769f, +}; +#endif + +} // namespace webrtc + +#endif // MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_