add neon code.

2024-09-06 11:01:01 +08:00 · 2024-09-06 11:01:01 +08:00 · c6debcc62a
commit c6debcc62a
parent fe7ab957c8
11 changed files with 1312 additions and 21 deletions
--- a/Record/EchoRecord.cpp
+++ b/Record/EchoRecord.cpp
@ -10,19 +10,18 @@

 class EchoRecordPrivate {  // Private state held by EchoRecordTask: the echo canceller plus the audio buffers it works on.
 public:
-    EchoRecordPrivate() {
-
+    void initialize(int sampleRate, int channels, int period) {  // Builds the canceller and buffers. NOTE(review): `period` is not used in this body.
        std::unique_ptr<webrtc::EchoCanceller3Factory> factory = std::make_unique<webrtc::EchoCanceller3Factory>();
-
-        echoCanceller = factory->Create(16000, 1, 1);
-
-        // nearendBuffer = std::make_unique<webrtc::AudioBuffer>(16000, 1, 16000, 1, 16000, 1);
-        // farendBuffer = std::make_unique<webrtc::AudioBuffer>(16000, 1, 16000, 1, 16000, 1);
+        echoCanceller = factory->Create(sampleRate, channels, channels);  // presumably render and capture channel counts — confirm against the factory's Create() signature
+        nearendBuffer = std::make_unique<webrtc::AudioBuffer>(sampleRate, channels, sampleRate, channels, sampleRate, channels);  // same rate/channel pair repeated three times: no conversion is requested
+        farendBuffer = std::make_unique<webrtc::AudioBuffer>(sampleRate, channels, sampleRate, channels, sampleRate, channels);
+        linearOutputBuffer = std::make_unique<webrtc::AudioBuffer>(sampleRate, channels, sampleRate, channels, sampleRate, channels);  // NOTE(review): the visible Aec3 path no longer passes this buffer to ProcessCapture — confirm it is still needed
    }

    std::unique_ptr<webrtc::EchoControl> echoCanceller;
-    // std::unique_ptr<webrtc::AudioBuffer> nearendBuffer;
-    // std::unique_ptr<webrtc::AudioBuffer> farendBuffer;
+    std::unique_ptr<webrtc::AudioBuffer> nearendBuffer;  // null until initialize() has run
+    std::unique_ptr<webrtc::AudioBuffer> farendBuffer;  // null until initialize() has run
+    std::unique_ptr<webrtc::AudioBuffer> linearOutputBuffer;  // null until initialize() has run
 };

 EchoRecordTask::EchoRecordTask() : m_d{new EchoRecordPrivate()} {
@ -56,7 +55,7 @@ void EchoRecordTask::run() {

    RkAudio::Format format;
    format.channels = m_channels;
-    format.period = 20;
+    format.period = 10;

    m_speex = std::make_shared<SpeexDsp>();
    m_speex->start(format.sampleRate, m_channels, format.period);
@ -66,6 +65,8 @@ void EchoRecordTask::run() {
    m_webRtcAecm = std::make_shared<WebRtcAecm>();
    m_webRtcAecm->start(format.sampleRate, format.channels, format.period);

+    m_d->initialize(format.sampleRate, m_channels, format.period);
+
    m_output = std::make_shared<RkAudio::Output>();
    if (!m_output->open(sizeof(uint16_t), format.sampleRate, 2, format.period, m_dsp == Vqe)) {
        LOG(error) << "audio output open failed.";
@ -87,18 +88,16 @@ void EchoRecordTask::run() {
                                           reinterpret_cast<int16_t *>(m_outBuffer.data()), frame.frameSize);
        } else if (m_dsp == Aec3) {
            webrtc::StreamConfig config(format.sampleRate, format.channels); // 单声道
-            webrtc::AudioBuffer nearendBuffer(format.sampleRate, 1, format.sampleRate, 1, format.sampleRate, 1);
-            webrtc::AudioBuffer farendBuffer(format.sampleRate, 1, format.sampleRate, 1, format.sampleRate, 1);
-            webrtc::AudioBuffer linearOutputBuffer(format.sampleRate, 1, format.sampleRate, 1, format.sampleRate, 1);
-            nearendBuffer.CopyFrom(reinterpret_cast<const int16_t *>(frame.data), config);
+            m_d->nearendBuffer->CopyFrom(reinterpret_cast<const int16_t *>(frame.data), config);

-            farendBuffer.CopyFrom(reinterpret_cast<const int16_t *>(m_farendBuffer.data()), config);
+            m_d->farendBuffer->CopyFrom(reinterpret_cast<const int16_t *>(m_farendBuffer.data()), config);

-            m_d->echoCanceller->AnalyzeRender(&farendBuffer);
-            m_d->echoCanceller->AnalyzeCapture(&nearendBuffer);
-            m_d->echoCanceller->ProcessCapture(&nearendBuffer, &linearOutputBuffer, /*level_change=*/false);
+            m_d->echoCanceller->AnalyzeRender(m_d->farendBuffer.get());
+            m_d->echoCanceller->AnalyzeCapture(m_d->nearendBuffer.get());
+            m_d->echoCanceller->ProcessCapture(m_d->nearendBuffer.get(), false);
+            // m_d->echoCanceller->ProcessCapture(&nearendBuffer, &linearOutputBuffer, /*level_change=*/false);

-            linearOutputBuffer.CopyTo(config, reinterpret_cast<int16_t *>(m_outBuffer.data()));
+            m_d->nearendBuffer->CopyTo(config, reinterpret_cast<int16_t *>(m_outBuffer.data()));
        }

        if (m_channels == 2) {
--- a/VocieProcess/CMakeLists.txt
+++ b/VocieProcess/CMakeLists.txt
@ -28,25 +28,31 @@ add_library(VocieProcess

    common_audio/audio_util.cc
    common_audio/channel_buffer.h common_audio/channel_buffer.cc
+    common_audio/fir_filter_neon.h common_audio/fir_filter_neon.cc
    common_audio/ring_buffer.h common_audio/ring_buffer.c

    common_audio/resampler/push_sinc_resampler.h common_audio/resampler/push_sinc_resampler.cc
-    common_audio/resampler/sinc_resampler.h common_audio/resampler/sinc_resampler.cc
+    common_audio/resampler/sinc_resampler.h common_audio/resampler/sinc_resampler_neon.cc
+    common_audio/resampler/sinc_resampler.cc

    common_audio/signal_processing/complex_bit_reverse.c
    common_audio/signal_processing/complex_fft.c
+    common_audio/signal_processing/cross_correlation_neon.c
    common_audio/signal_processing/cross_correlation.c
    common_audio/signal_processing/division_operations.c
    common_audio/signal_processing/dot_product_with_scale.h common_audio/signal_processing/dot_product_with_scale.cc
    common_audio/signal_processing/downsample_fast.c
+    common_audio/signal_processing/downsample_fast_neon.c
    common_audio/signal_processing/min_max_operations.c
+    common_audio/signal_processing/min_max_operations_neon.c
    common_audio/signal_processing/randomization_functions.c
    common_audio/signal_processing/real_fft.c
    common_audio/signal_processing/spl_init.c
    common_audio/signal_processing/splitting_filter.c
    common_audio/signal_processing/vector_scaling_operations.c

-    common_audio/third_party/ooura/fft_size_128/ooura_fft.h common_audio/third_party/ooura/fft_size_128/ooura_fft.cc
+    common_audio/third_party/ooura/fft_size_128/ooura_fft.h common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc 
+    common_audio/third_party/ooura/fft_size_128/ooura_fft.cc
    common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.h common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c

    rtc_base/checks.h rtc_base/checks.cc
@ -132,6 +138,7 @@ add_library(VocieProcess
    modules/audio_processing/aec3/transparent_mode.h modules/audio_processing/aec3/transparent_mode.cc

    modules/audio_processing/aecm/aecm_core.h modules/audio_processing/aecm/aecm_core.cc modules/audio_processing/aecm/aecm_core_c.cc
+    modules/audio_processing/aecm/aecm_core_neon.cc
    modules/audio_processing/aecm/echo_control_mobile.h  modules/audio_processing/aecm/echo_control_mobile.cc

    modules/audio_processing/logging/apm_data_dumper.h modules/audio_processing/logging/apm_data_dumper.cc
@ -148,6 +155,7 @@ target_compile_definitions(VocieProcess
    PRIVATE NOMINMAX # <windows.h>
    PRIVATE RTC_DISABLE_LOGGING
    PUBLIC RTC_DISABLE_METRICS
+    PUBLIC WEBRTC_HAS_NEON
    PUBLIC WEBRTC_APM_DEBUG_DUMP=0
    $<$<PLATFORM_ID:Windows>:WEBRTC_WIN>
    $<$<PLATFORM_ID:Linux>:WEBRTC_POSIX WEBRTC_LINUX>
--- a/VocieProcess/common_audio/fir_filter.h
+++ b/VocieProcess/common_audio/fir_filter.h
@ -0,0 +1,30 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef COMMON_AUDIO_FIR_FILTER_H_
+#define COMMON_AUDIO_FIR_FILTER_H_
+
+#include <string.h>
+
+namespace webrtc {
+
+/* Finite Impulse Response filter using floating-point arithmetic.
+ *
+ * Abstract interface: Filter() is pure virtual, so the actual filtering is
+ * supplied by a derived class.
+ */
+class FIRFilter {
+ public:
+  virtual ~FIRFilter() {}  // Virtual so a derived filter can be destroyed through a FIRFilter pointer.
+
+  // Filters the `in` data supplied; `length` is the number of samples in `in`.
+  // `out` must be previously allocated and it must be at least of `length`.
+  virtual void Filter(const float* in, size_t length, float* out) = 0;
+};
+
+}  // namespace webrtc
+
+#endif  // COMMON_AUDIO_FIR_FILTER_H_
--- a/VocieProcess/common_audio/fir_filter_neon.cc
+++ b/VocieProcess/common_audio/fir_filter_neon.cc
@ -0,0 +1,73 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "common_audio/fir_filter_neon.h"
+
+#include <arm_neon.h>
+#include <string.h>
+
+#include "rtc_base/checks.h"
+#include "rtc_base/memory/aligned_malloc.h"
+
+namespace webrtc {
+
+// The aligned buffers are released by the AlignedFreeDeleter attached to the
+// unique_ptr members, so nothing has to be done by hand here.
+FIRFilterNEON::~FIRFilterNEON() = default;
+
+// Builds a filter from the `coefficients_length` taps in `coefficients`.
+//
+// The tap count is rounded up to a multiple of four so that Filter() can
+// always process four taps per NEON operation; the extra taps are zero. The
+// state buffer holds `state_length_` samples of history followed by room for
+// `max_input_length` new samples, which is the largest `length` that may be
+// passed to Filter().
+FIRFilterNEON::FIRFilterNEON(const float* coefficients,
+                             size_t coefficients_length,
+                             size_t max_input_length)
+    :  // Closest higher multiple of four.
+      coefficients_length_((coefficients_length + 3) & ~0x03),
+      state_length_(coefficients_length_ - 1),
+      coefficients_(static_cast<float*>(
+          AlignedMalloc(sizeof(float) * coefficients_length_, 16))),
+      state_(static_cast<float*>(
+          AlignedMalloc(sizeof(float) * (max_input_length + state_length_),
+                        16))) {
+  // Zero the padding taps, which sit at the front of the reversed kernel.
+  // memset() takes its fill value as an int, so pass 0 rather than 0.f.
+  size_t padding = coefficients_length_ - coefficients_length;
+  memset(coefficients_.get(), 0, padding * sizeof(coefficients_[0]));
+  // The coefficients are reversed to compensate for the order in which the
+  // input samples are acquired (most recent last).
+  for (size_t i = 0; i < coefficients_length; ++i) {
+    coefficients_[i + padding] = coefficients[coefficients_length - i - 1];
+  }
+  // Start from silence: the whole state buffer is cleared.
+  memset(state_.get(), 0,
+         (max_input_length + state_length_) * sizeof(state_[0]));
+}
+
+// Filters `length` samples from `in` into `out`, keeping the trailing
+// `state_length_` samples as history for the next call.
+//
+// The new samples are copied behind the stored history, so `length` must not
+// exceed the `max_input_length` the state buffer was allocated for.
+void FIRFilterNEON::Filter(const float* in, size_t length, float* out) {
+  RTC_DCHECK_GT(length, 0);
+
+  float* const history = state_.get();
+  float* const taps = coefficients_.get();
+
+  // Append the fresh input right after the retained history.
+  memcpy(history + state_length_, in, length * sizeof(*in));
+
+  // One output per input sample: dot product of the (reversed, zero-padded)
+  // kernel with the window of samples that starts at that position.
+  for (size_t n = 0; n < length; ++n) {
+    float* const window = history + n;
+    float32x4_t acc = vmovq_n_f32(0);
+
+    // coefficients_length_ is a multiple of four, so four taps per step.
+    for (size_t k = 0; k < coefficients_length_; k += 4) {
+      const float32x4_t samples = vld1q_f32(window + k);
+      const float32x4_t weights = vld1q_f32(taps + k);
+      acc = vmlaq_f32(acc, samples, weights);
+    }
+
+    // Horizontal sum of the four partial accumulators.
+    const float32x2_t pair = vadd_f32(vget_high_f32(acc), vget_low_f32(acc));
+    out[n] = vget_lane_f32(vpadd_f32(pair, pair), 0);
+  }
+
+  // Slide the newest state_length_ samples to the front for the next call.
+  memmove(history, history + length, state_length_ * sizeof(state_[0]));
+}
+
+}  // namespace webrtc
--- a/VocieProcess/common_audio/fir_filter_neon.h
+++ b/VocieProcess/common_audio/fir_filter_neon.h
@ -0,0 +1,39 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef COMMON_AUDIO_FIR_FILTER_NEON_H_
+#define COMMON_AUDIO_FIR_FILTER_NEON_H_
+
+#include <memory>
+
+#include "common_audio/fir_filter.h"
+#include "rtc_base/memory/aligned_malloc.h"
+
+namespace webrtc {
+
+class FIRFilterNEON : public FIRFilter {  // FIRFilter implementation written with ARM NEON intrinsics.
+ public:
+  FIRFilterNEON(const float* coefficients,
+                size_t coefficients_length,
+                size_t max_input_length);  // `max_input_length` sizes the internal state buffer.
+  ~FIRFilterNEON() override;
+
+  void Filter(const float* in, size_t length, float* out) override;  // Writes `length` filtered samples to `out`.
+
+ private:
+  size_t coefficients_length_;  // Tap count rounded up to a multiple of four.
+  size_t state_length_;  // coefficients_length_ - 1 samples of history kept between calls.
+  std::unique_ptr<float[], AlignedFreeDeleter> coefficients_;  // Reversed taps, zero-padded at the front; 16-byte aligned.
+  std::unique_ptr<float[], AlignedFreeDeleter> state_;  // History followed by room for the newest input; 16-byte aligned.
+};
+
+}  // namespace webrtc
+
+#endif  // COMMON_AUDIO_FIR_FILTER_NEON_H_
--- a/VocieProcess/common_audio/resampler/sinc_resampler_neon.cc
+++ b/VocieProcess/common_audio/resampler/sinc_resampler_neon.cc
@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Modified from the Chromium original:
+// src/media/base/sinc_resampler.cc
+
+#include <arm_neon.h>
+
+#include "common_audio/resampler/sinc_resampler.h"
+
+namespace webrtc {
+
+// Convolves kKernelSize input samples with the two kernels `k1` and `k2`,
+// four samples per step, and returns the two results linearly blended by
+// `kernel_interpolation_factor` (0 selects `k1`, 1 selects `k2`).
+float SincResampler::Convolve_NEON(const float* input_ptr,
+                                   const float* k1,
+                                   const float* k2,
+                                   double kernel_interpolation_factor) {
+  float32x4_t acc1 = vmovq_n_f32(0);
+  float32x4_t acc2 = vmovq_n_f32(0);
+
+  // Both kernels are applied to the same input, so each block of four
+  // samples is loaded once and reused.
+  const float* const input_end = input_ptr + kKernelSize;
+  while (input_ptr < input_end) {
+    const float32x4_t samples = vld1q_f32(input_ptr);
+    acc1 = vmlaq_f32(acc1, samples, vld1q_f32(k1));
+    acc2 = vmlaq_f32(acc2, samples, vld1q_f32(k2));
+    input_ptr += 4;
+    k1 += 4;
+    k2 += 4;
+  }
+
+  // Linearly interpolate the two "convolutions".
+  const float32x4_t weight1 = vmovq_n_f32(1.0 - kernel_interpolation_factor);
+  const float32x4_t weight2 = vmovq_n_f32(kernel_interpolation_factor);
+  const float32x4_t blended =
+      vmlaq_f32(vmulq_f32(acc1, weight1), acc2, weight2);
+
+  // Sum the four lanes into a single value.
+  const float32x2_t pair =
+      vadd_f32(vget_high_f32(blended), vget_low_f32(blended));
+  return vget_lane_f32(vpadd_f32(pair, pair), 0);
+}
+
+}  // namespace webrtc
--- a/VocieProcess/common_audio/signal_processing/cross_correlation_neon.c
+++ b/VocieProcess/common_audio/signal_processing/cross_correlation_neon.c
@ -0,0 +1,88 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "common_audio/signal_processing/include/signal_processing_library.h"
+#include "rtc_base/system/arch.h"
+
+#include <arm_neon.h>
+
+/* Computes the dot product of `length` samples of `vector1` and `vector2`
+ * in 64-bit precision, shifts it right by `scaling` bits and stores the low
+ * 32 bits of the result in `*cross_correlation`. */
+static inline void DotProductWithScaleNeon(int32_t* cross_correlation,
+                                           const int16_t* vector1,
+                                           const int16_t* vector2,
+                                           size_t length,
+                                           int scaling) {
+  const size_t full_groups = length >> 3;  /* blocks of eight samples */
+  const size_t leftover = length & 7;      /* samples handled one by one */
+  int64x2_t acc_low = vdupq_n_s64(0);
+  int64x2_t acc_high = vdupq_n_s64(0);
+
+  for (size_t g = 0; g < full_groups; ++g) {
+    const int16x8_t a = vld1q_s16(vector1);
+    const int16x8_t b = vld1q_s16(vector2);
+    /* Widening multiplies: low four lanes, then high four lanes. */
+    const int32x4_t prod_low = vmull_s16(vget_low_s16(a), vget_low_s16(b));
+#if defined(WEBRTC_ARCH_ARM64)
+    const int32x4_t prod_high = vmull_high_s16(a, b);
+#else
+    const int32x4_t prod_high = vmull_s16(vget_high_s16(a),
+                                          vget_high_s16(b));
+#endif
+    /* Pairwise add the 32-bit products into the 64-bit accumulators. */
+    acc_low = vpadalq_s32(acc_low, prod_low);
+    acc_high = vpadalq_s32(acc_high, prod_high);
+    vector1 += 8;
+    vector2 += 8;
+  }
+
+  /* Scalar tail for the samples that do not fill a block of eight. */
+  int64_t tail_sum = 0;
+  for (size_t t = 0; t < leftover; ++t) {
+    tail_sum += WEBRTC_SPL_MUL_16_16(vector1[t], vector2[t]);
+  }
+
+  const int64x2_t total = vaddq_s64(acc_low, acc_high);
+#if defined(WEBRTC_ARCH_ARM64)
+  const int64_t lanes_sum = vaddvq_s64(total);
+  *cross_correlation = (int32_t)((lanes_sum + tail_sum) >> scaling);
+#else
+  /* A negative shift count makes vshl_s64 shift to the right. */
+  const int64x1_t neg_shift = vdup_n_s64(-scaling);
+  int64x1_t folded = vadd_s64(vget_low_s64(total), vget_high_s64(total));
+  folded = vadd_s64(folded, vdup_n_s64(tail_sum));
+  folded = vshl_s64(folded, neg_shift);
+  vst1_lane_s32(cross_correlation, vreinterpret_s32_s64(folded), 0);
+#endif
+}
+
+/* NEON version of WebRtcSpl_CrossCorrelation() for ARM32/64 platforms.
+ *
+ * Fills cross_correlation[0 .. dim_cross_correlation-1]. Entry i is the dot
+ * product of `dim_seq` samples of `seq1` with `seq2` offset by
+ * `step_seq2 * i` samples, shifted right by `right_shifts` bits. The offset
+ * is computed in signed arithmetic, so `step_seq2` may be negative. */
+void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
+                                    const int16_t* seq1,
+                                    const int16_t* seq2,
+                                    size_t dim_seq,
+                                    size_t dim_cross_correlation,
+                                    int right_shifts,
+                                    int step_seq2) {
+  const int lag_count = (int)dim_cross_correlation;
+
+  for (int lag = 0; lag < lag_count; ++lag) {
+    DotProductWithScaleNeon(&cross_correlation[lag],
+                            seq1,
+                            seq2 + (step_seq2 * lag),
+                            dim_seq,
+                            right_shifts);
+  }
+}
--- a/VocieProcess/common_audio/signal_processing/downsample_fast_neon.c
+++ b/VocieProcess/common_audio/signal_processing/downsample_fast_neon.c
@ -0,0 +1,224 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "common_audio/signal_processing/include/signal_processing_library.h"
+
+#include "rtc_base/checks.h"
+
+/* NEON intrinsics version of WebRtcSpl_DownsampleFast()
+ * for ARM 32-bit/64-bit platforms.
+ *
+ * Output sample k is
+ *   sat16((2048 + sum_j coefficients[j] * data_in[delay + k*factor - j]) >> 12)
+ * for j in [0, coefficients_length). Outputs are produced eight at a time
+ * with NEON; the last (data_out_length & 7) outputs use a scalar loop.
+ *
+ * data_in is indexed with i - j, so samples before index `delay` are read;
+ * presumably callers guarantee delay >= coefficients_length - 1 — confirm.
+ */
+// Returns 0 on success, or -1 when data_out_length or coefficients_length is 0 or data_in is shorter than the last index needed.
+int WebRtcSpl_DownsampleFastNeon(const int16_t* data_in,
+                                 size_t data_in_length,
+                                 int16_t* data_out,
+                                 size_t data_out_length,
+                                 const int16_t* __restrict coefficients,
+                                 size_t coefficients_length,
+                                 int factor,
+                                 size_t delay) {
+  // Using signed indexes to be able to compute negative i-j that
+  // is used to index data_in.
+  int i = 0;
+  int j = 0;
+  int32_t out_s32 = 0;
+  int endpos = delay + factor * (data_out_length - 1) + 1;  // One past the input index of the last output sample.
+  size_t res = data_out_length & 0x7;  // Outputs left over after the 8-wide NEON loop.
+  int endpos1 = endpos - factor * res;  // End bound for the NEON loop.
+
+  // Return error if any of the running conditions doesn't meet.
+  if (data_out_length == 0 || coefficients_length == 0
+                           || (int)data_in_length < endpos) {
+    return -1;
+  }
+
+  RTC_DCHECK_GE(endpos, 0);
+  RTC_DCHECK_GE(endpos1, 0);
+
+  // First part, unroll the loop 8 times, with 3 subcases
+  // (factor == 2, 4, others).
+  switch (factor) {
+    case 2: {
+      for (i = delay; i < endpos1; i += 16) {  // 8 outputs * factor 2 = 16 input samples per iteration.
+        // Round value, 0.5 in Q12.
+        int32x4_t out32x4_0 = vdupq_n_s32(2048);
+        int32x4_t out32x4_1 = vdupq_n_s32(2048);
+
+#if defined(WEBRTC_ARCH_ARM64)  // NOTE(review): this file does not include rtc_base/system/arch.h directly; confirm the macro is defined by an included header or the build, otherwise the #else path is always compiled.
+        // Unroll the loop 2 times.
+        for (j = 0; j < (int)coefficients_length - 1; j += 2) {
+          int32x2_t coeff32 = vld1_dup_s32((int32_t*)&coefficients[j]);  // Two adjacent 16-bit coefficients loaded as one 32-bit element.
+          int16x4_t coeff16x4 = vreinterpret_s16_s32(coeff32);
+          int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j - 1]);  // De-interleaving load: val[0] = even offsets, val[1] = odd offsets.
+
+          // Mul and accumulate low 64-bit data.
+          int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
+          int16x4_t in16x4_1 = vget_low_s16(in16x8x2.val[1]);
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 1);
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_1, coeff16x4, 0);
+
+          // Mul and accumulate high 64-bit data.
+          // TODO: vget_high_s16 need extra cost on ARM64. This could be
+          // replaced by vmlal_high_lane_s16. But for the interface of
+          // vmlal_high_lane_s16, there is a bug in gcc 4.9.
+          // This issue need to be tracked in the future.
+          int16x4_t in16x4_2 = vget_high_s16(in16x8x2.val[0]);
+          int16x4_t in16x4_3 = vget_high_s16(in16x8x2.val[1]);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_2, coeff16x4, 1);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 0);
+        }
+
+        for (; j < (int)coefficients_length; j++) {  // Picks up the last coefficient when the count is odd.
+          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
+          int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]);
+
+          // Mul and accumulate low 64-bit data.
+          int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
+
+          // Mul and accumulate high 64-bit data.
+          // TODO: vget_high_s16 need extra cost on ARM64. This could be
+          // replaced by vmlal_high_lane_s16. But for the interface of
+          // vmlal_high_lane_s16, there is a bug in gcc 4.9.
+          // This issue need to be tracked in the future.
+          int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
+        }
+#else
+        // On ARMv7, the loop unrolling 2 times results in performance
+        // regression.
+        for (j = 0; j < (int)coefficients_length; j++) {
+          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
+          int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]);
+
+          // Mul and accumulate.
+          int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
+          int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]);
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
+        }
+#endif
+
+        // Saturate and store the output.
+        int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
+        int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
+        vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
+        data_out += 8;
+      }
+      break;
+    }
+    case 4: {
+      for (i = delay; i < endpos1; i += 32) {  // 8 outputs * factor 4 = 32 input samples per iteration.
+        // Round value, 0.5 in Q12.
+        int32x4_t out32x4_0 = vdupq_n_s32(2048);
+        int32x4_t out32x4_1 = vdupq_n_s32(2048);
+
+        // Unroll the loop 4 times.
+        for (j = 0; j < (int)coefficients_length - 3; j += 4) {
+          int16x4_t coeff16x4 = vld1_s16(&coefficients[j]);
+          int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j - 3]);  // De-interleaving load: val[k] holds offsets k, k+4, k+8, ...
+
+          // Mul and accumulate low 64-bit data.
+          int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]);
+          int16x4_t in16x4_2 = vget_low_s16(in16x8x4.val[1]);
+          int16x4_t in16x4_4 = vget_low_s16(in16x8x4.val[2]);
+          int16x4_t in16x4_6 = vget_low_s16(in16x8x4.val[3]);
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 3);
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_2, coeff16x4, 2);
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_4, coeff16x4, 1);
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_6, coeff16x4, 0);
+
+          // Mul and accumulate high 64-bit data.
+          // TODO: vget_high_s16 need extra cost on ARM64. This could be
+          // replaced by vmlal_high_lane_s16. But for the interface of
+          // vmlal_high_lane_s16, there is a bug in gcc 4.9.
+          // This issue need to be tracked in the future.
+          int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]);
+          int16x4_t in16x4_3 = vget_high_s16(in16x8x4.val[1]);
+          int16x4_t in16x4_5 = vget_high_s16(in16x8x4.val[2]);
+          int16x4_t in16x4_7 = vget_high_s16(in16x8x4.val[3]);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 3);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 2);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_5, coeff16x4, 1);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_7, coeff16x4, 0);
+        }
+
+        for (; j < (int)coefficients_length; j++) {  // Remaining coefficients when the count is not a multiple of four.
+          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
+          int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j]);
+
+          // Mul and accumulate low 64-bit data.
+          int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]);
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
+
+          // Mul and accumulate high 64-bit data.
+          // TODO: vget_high_s16 need extra cost on ARM64. This could be
+          // replaced by vmlal_high_lane_s16. But for the interface of
+          // vmlal_high_lane_s16, there is a bug in gcc 4.9.
+          // This issue need to be tracked in the future.
+          int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
+        }
+
+        // Saturate and store the output.
+        int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
+        int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
+        vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
+        data_out += 8;
+      }
+      break;
+    }
+    default: {
+      for (i = delay; i < endpos1; i += factor * 8) {  // Generic factor: the eight inputs are gathered lane by lane.
+        // Round value, 0.5 in Q12.
+        int32x4_t out32x4_0 = vdupq_n_s32(2048);
+        int32x4_t out32x4_1 = vdupq_n_s32(2048);
+
+        for (j = 0; j < (int)coefficients_length; j++) {
+          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
+          int16x4_t in16x4_0 = vld1_dup_s16(&data_in[i - j]);
+          in16x4_0 = vld1_lane_s16(&data_in[i + factor - j], in16x4_0, 1);
+          in16x4_0 = vld1_lane_s16(&data_in[i + factor * 2 - j], in16x4_0, 2);
+          in16x4_0 = vld1_lane_s16(&data_in[i + factor * 3 - j], in16x4_0, 3);
+          int16x4_t in16x4_1 = vld1_dup_s16(&data_in[i + factor * 4 - j]);
+          in16x4_1 = vld1_lane_s16(&data_in[i + factor * 5 - j], in16x4_1, 1);
+          in16x4_1 = vld1_lane_s16(&data_in[i + factor * 6 - j], in16x4_1, 2);
+          in16x4_1 = vld1_lane_s16(&data_in[i + factor * 7 - j], in16x4_1, 3);
+
+          // Mul and accumulate.
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
+        }
+
+        // Saturate and store the output.
+        int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
+        int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
+        vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
+        data_out += 8;
+      }
+      break;
+    }
+  }
+
+  // Second part, do the rest iterations (if any).
+  for (; i < endpos; i += factor) {  // `i` continues from where the NEON loop stopped.
+    out_s32 = 2048;  // Round value, 0.5 in Q12.
+
+    for (j = 0; j < (int)coefficients_length; j++) {
+      out_s32 = WebRtc_MulAccumW16(coefficients[j], data_in[i - j], out_s32);
+    }
+
+    // Saturate and store the output.
+    out_s32 >>= 12;
+    *data_out++ = WebRtcSpl_SatW32ToW16(out_s32);
+  }
+
+  return 0;
+}
--- a/VocieProcess/common_audio/signal_processing/min_max_operations_neon.c
+++ b/VocieProcess/common_audio/signal_processing/min_max_operations_neon.c
@ -0,0 +1,333 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <stdlib.h>
+
+#include "rtc_base/checks.h"
+#include "common_audio/signal_processing/include/signal_processing_library.h"
+
+// Maximum absolute value of word16 vector. NEON intrinsics version for
+// ARM 32-bit/64-bit platforms.
+//
+// Returns the largest magnitude among the `length` elements of `vector`,
+// clamped to WEBRTC_SPL_WORD16_MAX so that an input of -32768 still fits in
+// the int16_t return type. `length` must be greater than zero (DCHECKed).
+int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, size_t length) {
+  RTC_DCHECK_GT(length, 0);
+
+  const size_t tail_count = length & 7;
+  const int16_t* cursor = vector;
+  const int16_t* const simd_end = vector + length - tail_count;
+  const int16_t* const vector_end = vector + length;
+
+  // Accumulate as u16: vabsq_s16 leaves -32768 unchanged, and reading that
+  // bit pattern as unsigned gives 32768 instead of losing it.
+  uint16x8_t lane_max = vdupq_n_u16(0);
+  for (; cursor < simd_end; cursor += 8) {
+    const int16x8_t magnitudes = vabsq_s16(vld1q_s16(cursor));
+    lane_max = vmaxq_u16(lane_max, vreinterpretq_u16_s16(magnitudes));
+  }
+
+  // Fold the eight lanes down to a single scalar.
+#ifdef WEBRTC_ARCH_ARM64
+  int largest = (int)vmaxvq_u16(lane_max);
+#else
+  uint16x4_t folded =
+      vmax_u16(vget_low_u16(lane_max), vget_high_u16(lane_max));
+  folded = vpmax_u16(folded, folded);
+  folded = vpmax_u16(folded, folded);
+  int largest = (int)vget_lane_u16(folded, 0);
+#endif
+
+  // Scalar pass over the trailing (length & 7) samples.
+  for (; cursor < vector_end; ++cursor) {
+    const int magnitude = abs((int)(*cursor));
+    if (magnitude > largest) {
+      largest = magnitude;
+    }
+  }
+
+  // Guard the case for abs(-32768), which does not fit in int16_t.
+  if (largest > WEBRTC_SPL_WORD16_MAX) {
+    return (int16_t)WEBRTC_SPL_WORD16_MAX;
+  }
+  return (int16_t)largest;
+}
+
+// Maximum absolute value of word32 vector. NEON intrinsics version for
+// ARM 32-bit/64-bit platforms.
+//
+// Returns the largest magnitude among the `length` elements of `vector`.
+// `length` must be greater than zero (DCHECKed). The result is clamped to
+// WEBRTC_SPL_WORD32_MAX, so an input of 0x80000000 yields 0x7FFFFFFF.
+int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, size_t length) {
+  // Use uint32_t for the local variables, to accommodate the magnitude of
+  // 0x80000000, which is 0x80000000 and does not fit in int32_t.
+
+  uint32_t absolute = 0, maximum = 0;
+  size_t i = 0;
+  size_t residual = length & 0x7;
+
+  RTC_DCHECK_GT(length, 0);
+
+  const int32_t* p_start = vector;
+  uint32x4_t max32x4_0 = vdupq_n_u32(0);
+  uint32x4_t max32x4_1 = vdupq_n_u32(0);
+
+  // First part, unroll the loop 8 times.
+  for (i = 0; i < length - residual; i += 8) {
+    int32x4_t in32x4_0 = vld1q_s32(p_start);
+    p_start += 4;
+    int32x4_t in32x4_1 = vld1q_s32(p_start);
+    p_start += 4;
+    in32x4_0 = vabsq_s32(in32x4_0);
+    in32x4_1 = vabsq_s32(in32x4_1);
+    // vabs doesn't change the value of 0x80000000.
+    // Use u32 so we don't lose the value 0x80000000.
+    max32x4_0 = vmaxq_u32(max32x4_0, vreinterpretq_u32_s32(in32x4_0));
+    max32x4_1 = vmaxq_u32(max32x4_1, vreinterpretq_u32_s32(in32x4_1));
+  }
+
+  // Merge the two accumulators, then fold the four lanes into one scalar.
+  uint32x4_t max32x4 = vmaxq_u32(max32x4_0, max32x4_1);
+#if defined(WEBRTC_ARCH_ARM64)
+  maximum = vmaxvq_u32(max32x4);
+#else
+  uint32x2_t max32x2 = vmax_u32(vget_low_u32(max32x4), vget_high_u32(max32x4));
+  max32x2 = vpmax_u32(max32x2, max32x2);
+
+  maximum = vget_lane_u32(max32x2, 0);
+#endif
+
+  // Second part, do the remaining iterations (if any).
+  for (i = residual; i > 0; i--) {
+    // Negate in unsigned arithmetic: abs() is undefined for 0x80000000,
+    // whereas 0u - (uint32_t)x is well defined and yields 0x80000000, which
+    // matches what the vector path produces for that input.
+    const int32_t sample = *p_start;
+    absolute = (sample < 0) ? (0u - (uint32_t)sample) : (uint32_t)sample;
+    if (absolute > maximum) {
+      maximum = absolute;
+    }
+    p_start++;
+  }
+
+  // Guard against the case for 0x80000000.
+  maximum = WEBRTC_SPL_MIN(maximum, WEBRTC_SPL_WORD32_MAX);
+
+  return (int32_t)maximum;
+}
+
+// Maximum value of word16 vector. NEON intrinsics version for
+// ARM 32-bit/64-bit platforms.
+//
+// Returns the largest of the `length` elements of `vector`. `length` must be
+// greater than zero (DCHECKed).
+int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, size_t length) {
+  RTC_DCHECK_GT(length, 0);
+
+  const size_t tail_count = length & 0x7;
+  const int16_t* cursor = vector;
+  const int16_t* const simd_end = vector + (length - tail_count);
+  const int16_t* const vector_end = vector + length;
+
+  // Vector pass: eight samples per step, starting from the identity value.
+  int16x8_t lane_max = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN);
+  while (cursor < simd_end) {
+    lane_max = vmaxq_s16(lane_max, vld1q_s16(cursor));
+    cursor += 8;
+  }
+
+  // Fold the eight lanes down to a single scalar.
+#if defined(WEBRTC_ARCH_ARM64)
+  int16_t largest = vmaxvq_s16(lane_max);
+#else
+  int16x4_t folded = vmax_s16(vget_low_s16(lane_max), vget_high_s16(lane_max));
+  folded = vpmax_s16(folded, folded);
+  folded = vpmax_s16(folded, folded);
+  int16_t largest = vget_lane_s16(folded, 0);
+#endif
+
+  // Scalar pass over the trailing (length & 7) samples.
+  for (; cursor < vector_end; ++cursor) {
+    if (largest < *cursor) {
+      largest = *cursor;
+    }
+  }
+  return largest;
+}
+
+// Maximum value of word32 vector. NEON intrinsics version for
+// ARM 32-bit/64-bit platforms.
+//
+// Returns the largest of the `length` elements of `vector`. `length` must be
+// greater than zero (DCHECKed).
+int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, size_t length) {
+  RTC_DCHECK_GT(length, 0);
+
+  const size_t tail_count = length & 0x7;
+  const int32_t* cursor = vector;
+  const int32_t* const simd_end = vector + (length - tail_count);
+  const int32_t* const vector_end = vector + length;
+
+  // Vector pass: eight samples per step, split over two accumulators (one
+  // for each group of four lanes).
+  int32x4_t lane_max_a = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
+  int32x4_t lane_max_b = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
+  while (cursor < simd_end) {
+    lane_max_a = vmaxq_s32(lane_max_a, vld1q_s32(cursor));
+    lane_max_b = vmaxq_s32(lane_max_b, vld1q_s32(cursor + 4));
+    cursor += 8;
+  }
+
+  // Merge the accumulators, then fold the four lanes into one scalar.
+  const int32x4_t lane_max = vmaxq_s32(lane_max_a, lane_max_b);
+#if defined(WEBRTC_ARCH_ARM64)
+  int32_t largest = vmaxvq_s32(lane_max);
+#else
+  int32x2_t folded = vmax_s32(vget_low_s32(lane_max), vget_high_s32(lane_max));
+  folded = vpmax_s32(folded, folded);
+  int32_t largest = vget_lane_s32(folded, 0);
+#endif
+
+  // Scalar pass over the trailing (length & 7) samples.
+  for (; cursor < vector_end; ++cursor) {
+    if (largest < *cursor) {
+      largest = *cursor;
+    }
+  }
+  return largest;
+}
+
+// Minimum value of word16 vector. NEON intrinsics version for
+// ARM 32-bit/64-bit platforms.
+//
+// Returns the smallest of the `length` elements of `vector`. `length` must be
+// greater than zero (DCHECKed).
+int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, size_t length) {
+  RTC_DCHECK_GT(length, 0);
+
+  const size_t tail_count = length & 0x7;
+  const int16_t* cursor = vector;
+  const int16_t* const simd_end = vector + (length - tail_count);
+  const int16_t* const vector_end = vector + length;
+
+  // Vector pass: eight samples per step, starting from the identity value.
+  int16x8_t lane_min = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX);
+  while (cursor < simd_end) {
+    lane_min = vminq_s16(lane_min, vld1q_s16(cursor));
+    cursor += 8;
+  }
+
+  // Fold the eight lanes down to a single scalar.
+#if defined(WEBRTC_ARCH_ARM64)
+  int16_t smallest = vminvq_s16(lane_min);
+#else
+  int16x4_t folded = vmin_s16(vget_low_s16(lane_min), vget_high_s16(lane_min));
+  folded = vpmin_s16(folded, folded);
+  folded = vpmin_s16(folded, folded);
+  int16_t smallest = vget_lane_s16(folded, 0);
+#endif
+
+  // Scalar pass over the trailing (length & 7) samples.
+  for (; cursor < vector_end; ++cursor) {
+    if (smallest > *cursor) {
+      smallest = *cursor;
+    }
+  }
+  return smallest;
+}
+
+// Minimum value of word32 vector. NEON intrinsics version for
+// ARM 32-bit/64-bit platforms.
+//
+// Returns the smallest of the `length` elements of `vector`. `length` must be
+// greater than zero (DCHECKed).
+int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, size_t length) {
+  RTC_DCHECK_GT(length, 0);
+
+  const size_t tail_count = length & 0x7;
+  const int32_t* cursor = vector;
+  const int32_t* const simd_end = vector + (length - tail_count);
+  const int32_t* const vector_end = vector + length;
+
+  // Vector pass: eight samples per step, split over two accumulators (one
+  // for each group of four lanes).
+  int32x4_t lane_min_a = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
+  int32x4_t lane_min_b = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
+  while (cursor < simd_end) {
+    lane_min_a = vminq_s32(lane_min_a, vld1q_s32(cursor));
+    lane_min_b = vminq_s32(lane_min_b, vld1q_s32(cursor + 4));
+    cursor += 8;
+  }
+
+  // Merge the accumulators, then fold the four lanes into one scalar.
+  const int32x4_t lane_min = vminq_s32(lane_min_a, lane_min_b);
+#if defined(WEBRTC_ARCH_ARM64)
+  int32_t smallest = vminvq_s32(lane_min);
+#else
+  int32x2_t folded = vmin_s32(vget_low_s32(lane_min), vget_high_s32(lane_min));
+  folded = vpmin_s32(folded, folded);
+  int32_t smallest = vget_lane_s32(folded, 0);
+#endif
+
+  // Scalar pass over the trailing (length & 7) samples.
+  for (; cursor < vector_end; ++cursor) {
+    if (smallest > *cursor) {
+      smallest = *cursor;
+    }
+  }
+  return smallest;
+}
+
+// Finds both the minimum and maximum elements in an array of 16-bit integers.
+// NEON intrinsics version.
+//
+// Writes the smallest of the `length` elements of `vector` to `*min_val` and
+// the largest to `*max_val`. `length` must be greater than zero (DCHECKed).
+void WebRtcSpl_MinMaxW16Neon(const int16_t* vector, size_t length,
+                             int16_t* min_val, int16_t* max_val) {
+  RTC_DCHECK_GT(length, 0);
+
+  const size_t tail_count = length & 0x7;
+  const int16_t* cursor = vector;
+  const int16_t* const simd_end = vector + (length - tail_count);
+  const int16_t* const vector_end = vector + length;
+
+  // Vector pass: each 8-sample load feeds both accumulators.
+  int16x8_t lane_min = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX);
+  int16x8_t lane_max = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN);
+  while (cursor < simd_end) {
+    const int16x8_t samples = vld1q_s16(cursor);
+    lane_min = vminq_s16(lane_min, samples);
+    lane_max = vmaxq_s16(lane_max, samples);
+    cursor += 8;
+  }
+
+  // Fold each accumulator's eight lanes down to a single scalar.
+#if defined(WEBRTC_ARCH_ARM64)
+  int16_t smallest = vminvq_s16(lane_min);
+  int16_t largest = vmaxvq_s16(lane_max);
+#else
+  int16x4_t min_fold =
+      vmin_s16(vget_low_s16(lane_min), vget_high_s16(lane_min));
+  min_fold = vpmin_s16(min_fold, min_fold);
+  min_fold = vpmin_s16(min_fold, min_fold);
+  int16_t smallest = vget_lane_s16(min_fold, 0);
+
+  int16x4_t max_fold =
+      vmax_s16(vget_low_s16(lane_max), vget_high_s16(lane_max));
+  max_fold = vpmax_s16(max_fold, max_fold);
+  max_fold = vpmax_s16(max_fold, max_fold);
+  int16_t largest = vget_lane_s16(max_fold, 0);
+#endif
+
+  // Scalar pass over the trailing (length & 7) samples.
+  for (; cursor < vector_end; ++cursor) {
+    if (*cursor < smallest) {
+      smallest = *cursor;
+    }
+    if (*cursor > largest) {
+      largest = *cursor;
+    }
+  }
+
+  *min_val = smallest;
+  *max_val = largest;
+}
--- a/VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc
+++ b/VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc
@ -0,0 +1,351 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * The rdft AEC algorithm, neon version of speed-critical functions.
+ *
+ * Based on the sse2 version.
+ */
+
+#include <arm_neon.h>
+
+#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
+#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_common.h"
+#include "common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h"
+
+namespace webrtc {
+
+#if defined(WEBRTC_HAS_NEON)
+// NEON kernel that transforms the 128-float buffer `a` in place, working on
+// eight groups of 16 consecutive floats. Group j uses the four-entry slices
+// of the rdft_wk{1,2,3}{r,i} tables that start at index k2 = j / 4.
+// NOTE(review): by its name this is the NEON counterpart of the scalar
+// cft1st_128 pass of the Ooura FFT -- confirm against ooura_fft.cc.
+void cft1st_128_neon(float* a) {
+  // k_swap_sign is declared const and vld1q_f32 takes a pointer-to-const, so
+  // it is loaded directly rather than through a const-stripping cast.
+  const float32x4_t vec_swap_sign = vld1q_f32(k_swap_sign);
+  int j, k2;
+
+  for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
+    // Load the 16 floats of this group.
+    float32x4_t a00v = vld1q_f32(&a[j + 0]);
+    float32x4_t a04v = vld1q_f32(&a[j + 4]);
+    float32x4_t a08v = vld1q_f32(&a[j + 8]);
+    float32x4_t a12v = vld1q_f32(&a[j + 12]);
+    // Regroup into float pairs taken 8 apart:
+    //   a01v = a[j+0..1],  a[j+8..9]     a23v = a[j+2..3],  a[j+10..11]
+    //   a45v = a[j+4..5],  a[j+12..13]   a67v = a[j+6..7],  a[j+14..15]
+    float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v));
+    float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v));
+    float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v));
+    float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v));
+    const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]);
+    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]);
+    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]);
+    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]);
+    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]);
+    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]);
+    // Sums and differences of the regrouped pairs.
+    float32x4_t x0v = vaddq_f32(a01v, a23v);
+    const float32x4_t x1v = vsubq_f32(a01v, a23v);
+    const float32x4_t x2v = vaddq_f32(a45v, a67v);
+    const float32x4_t x3v = vsubq_f32(a45v, a67v);
+    // x3w is x3v with the two floats of each pair swapped.
+    const float32x4_t x3w = vrev64q_f32(x3v);
+    float32x4_t x0w;
+    a01v = vaddq_f32(x0v, x2v);
+    // Each block below forms wkNr * x + wkNi * swap(x) for one table pair.
+    x0v = vsubq_f32(x0v, x2v);
+    x0w = vrev64q_f32(x0v);
+    a45v = vmulq_f32(wk2rv, x0v);
+    a45v = vmlaq_f32(a45v, wk2iv, x0w);
+    // x1v + x3w * {-1, 1, -1, 1}.
+    x0v = vmlaq_f32(x1v, x3w, vec_swap_sign);
+    x0w = vrev64q_f32(x0v);
+    a23v = vmulq_f32(wk1rv, x0v);
+    a23v = vmlaq_f32(a23v, wk1iv, x0w);
+    // x1v - x3w * {-1, 1, -1, 1}.
+    x0v = vmlsq_f32(x1v, x3w, vec_swap_sign);
+    x0w = vrev64q_f32(x0v);
+    a67v = vmulq_f32(wk3rv, x0v);
+    a67v = vmlaq_f32(a67v, wk3iv, x0w);
+    // Undo the regrouping and store the group back in its original layout.
+    a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v));
+    a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v));
+    a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v));
+    a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v));
+    vst1q_f32(&a[j + 0], a00v);
+    vst1q_f32(&a[j + 4], a04v);
+    vst1q_f32(&a[j + 8], a08v);
+    vst1q_f32(&a[j + 12], a12v);
+  }
+}
+
+// NEON kernel that transforms the 128-float buffer `a` in place in two
+// halves. Each half is walked in four steps of two floats (j advances by 2),
+// and every step combines the eight float pairs at offsets j + 0, 8, ..., 56.
+//   - First half (j = 0..6): uses only the cftmdl_wk1r constants.
+//   - Second half (j = 64..70): uses the rdft_wk{1,2,3}{r,i} table slices
+//     starting at index 4.
+// NOTE(review): by its name this is the NEON counterpart of the scalar
+// cftmdl_128 pass of the Ooura FFT -- confirm against ooura_fft.cc.
+void cftmdl_128_neon(float* a) {
+  int j;
+  const int l = 8;
+  // k_swap_sign is declared const and vld1q_f32 takes a pointer-to-const, so
+  // it is loaded directly rather than through a const-stripping cast.
+  const float32x4_t vec_swap_sign = vld1q_f32(k_swap_sign);
+  float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);
+
+  for (j = 0; j < l; j += 2) {
+    // Low half of each combined vector comes from offsets 0..24, high half
+    // from the matching offsets 32..56.
+    const float32x2_t a_00 = vld1_f32(&a[j + 0]);
+    const float32x2_t a_08 = vld1_f32(&a[j + 8]);
+    const float32x2_t a_32 = vld1_f32(&a[j + 32]);
+    const float32x2_t a_40 = vld1_f32(&a[j + 40]);
+    const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
+    const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
+    const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
+    const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
+    const float32x2_t a_16 = vld1_f32(&a[j + 16]);
+    const float32x2_t a_24 = vld1_f32(&a[j + 24]);
+    const float32x2_t a_48 = vld1_f32(&a[j + 48]);
+    const float32x2_t a_56 = vld1_f32(&a[j + 56]);
+    const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
+    const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
+    const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
+    const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
+    const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+    const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+    // x3 with the two floats of each pair swapped.
+    const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
+    // x1 +/- {-1, 1, -1, 1} * swap(x3).
+    const float32x4_t x1_x3_add =
+        vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
+    const float32x4_t x1_x3_sub =
+        vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
+    // Broadcast lane 0 / lane 1 of the high halves and recombine them as
+    // {add, add, sub, sub} before applying the sign mask and cftmdl_wk1r.
+    const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0);
+    const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0);
+    const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s);
+    const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1);
+    const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1);
+    const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s);
+    const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as);
+    const float32x4_t yy4 = vmulq_f32(wk1rv, yy0);
+    const float32x4_t xx1_rev = vrev64q_f32(xx1);
+    const float32x4_t yy4_rev = vrev64q_f32(yy4);
+
+    vst1_f32(&a[j + 0], vget_low_f32(xx0));
+    vst1_f32(&a[j + 32], vget_high_f32(xx0));
+    vst1_f32(&a[j + 16], vget_low_f32(xx1));
+    vst1_f32(&a[j + 48], vget_high_f32(xx1_rev));
+
+    // Only the first float of the pair just stored at j + 48 is negated.
+    a[j + 48] = -a[j + 48];
+
+    vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add));
+    vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub));
+    vst1_f32(&a[j + 40], vget_low_f32(yy4));
+    vst1_f32(&a[j + 56], vget_high_f32(yy4_rev));
+  }
+
+  {
+    // Second half of the buffer: same load pattern, but every output is
+    // formed as wkNr * x + wkNi * swap(x) using table entries 4..7.
+    const int k = 64;
+    const int k1 = 2;
+    const int k2 = 2 * k1;
+    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]);
+    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]);
+    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]);
+    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]);
+    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]);
+    wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]);
+    for (j = k; j < l + k; j += 2) {
+      const float32x2_t a_00 = vld1_f32(&a[j + 0]);
+      const float32x2_t a_08 = vld1_f32(&a[j + 8]);
+      const float32x2_t a_32 = vld1_f32(&a[j + 32]);
+      const float32x2_t a_40 = vld1_f32(&a[j + 40]);
+      const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
+      const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
+      const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
+      const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
+      const float32x2_t a_16 = vld1_f32(&a[j + 16]);
+      const float32x2_t a_24 = vld1_f32(&a[j + 24]);
+      const float32x2_t a_48 = vld1_f32(&a[j + 48]);
+      const float32x2_t a_56 = vld1_f32(&a[j + 56]);
+      const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
+      const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
+      const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
+      const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
+      const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+      const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
+      const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
+      const float32x4_t x1_x3_add =
+          vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
+      const float32x4_t x1_x3_sub =
+          vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
+      float32x4_t xx4 = vmulq_f32(wk2rv, xx1);
+      float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add);
+      float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub);
+      xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1));
+      xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add));
+      xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub));
+
+      // Low halves go to offsets 0..24, high halves to offsets 32..56.
+      vst1_f32(&a[j + 0], vget_low_f32(xx));
+      vst1_f32(&a[j + 32], vget_high_f32(xx));
+      vst1_f32(&a[j + 16], vget_low_f32(xx4));
+      vst1_f32(&a[j + 48], vget_high_f32(xx4));
+      vst1_f32(&a[j + 8], vget_low_f32(xx12));
+      vst1_f32(&a[j + 40], vget_high_f32(xx12));
+      vst1_f32(&a[j + 24], vget_low_f32(xx22));
+      vst1_f32(&a[j + 56], vget_high_f32(xx22));
+    }
+  }
+}
+
+// Returns `in` with its four lanes in reverse order: A B C D -> D C B A.
+__inline static float32x4_t reverse_order_f32x4(float32x4_t in) {
+  // Swap the lanes inside each 64-bit half: A B C D -> B A D C
+  const float32x4_t pairs_swapped = vrev64q_f32(in);
+  // Exchange the two halves: B A D C -> D C B A
+  return vcombine_f32(vget_high_f32(pairs_swapped),
+                      vget_low_f32(pairs_swapped));
+}
+
+// NEON kernel that updates the 128-float buffer `a` in place, using the
+// coefficients c = rdft_w + 32.
+//
+// For every index pair (j2, k2 = 128 - j2), j2 = 2, 4, ..., 62, with
+// j1 = j2 / 2, wkr = 0.5 - c[32 - j1] and wki = c[j1], it computes
+//    xr = a[j2] - a[k2];          xi = a[j2 + 1] + a[k2 + 1];
+//    yr = wkr * xr - wki * xi;    yi = wkr * xi + wki * xr;
+// and applies
+//    a[j2] -= yr;  a[j2 + 1] -= yi;  a[k2] += yr;  a[k2 + 1] -= yi;
+// The vector loop handles four pairs per iteration (j2 = 2..56); the scalar
+// loop finishes the remaining pairs (j2 = 58, 60, 62).
+void rftfsub_128_neon(float* a) {
+  const float* c = rdft_w + 32;
+  int j1, j2;
+  const float32x4_t mm_half = vdupq_n_f32(0.5f);
+
+  // Vectorized code (four at once).
+  // Note: commented number are indexes for the first iteration of the loop.
+  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
+    // Load 'wk'.
+    const float32x4_t c_j1 = vld1q_f32(&c[j1]);          //  1,  2,  3,  4,
+    const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]);     // 28, 29, 30, 31,
+    const float32x4_t wkrt = vsubq_f32(mm_half, c_k1);   // 28, 29, 30, 31,
+    const float32x4_t wkr_ = reverse_order_f32x4(wkrt);  // 31, 30, 29, 28,
+    const float32x4_t wki_ = c_j1;                       //  1,  2,  3,  4,
+    // Load and shuffle 'a'. vld2q de-interleaves: val[0] receives the
+    // even-index floats and val[1] the odd-index floats.
+    //   2,   4,   6,   8,   3,   5,   7,   9
+    float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
+    // 120, 122, 124, 126, 121, 123, 125, 127,
+    const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
+    // 126, 124, 122, 120
+    const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
+    // 127, 125, 123, 121
+    const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
+    // Calculate 'x'. The "m-n" labels below name the (j2, k2) index pair
+    // held in each lane; they are not arithmetic.
+    const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
+    // 2-126, 4-124, 6-122, 8-120,
+    const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
+    // 3-127, 5-125, 7-123, 9-121,
+    // Calculate product into 'y'.
+    //    yr = wkr * xr - wki * xi;
+    //    yi = wkr * xi + wki * xr;
+    const float32x4_t a_ = vmulq_f32(wkr_, xr_);
+    const float32x4_t b_ = vmulq_f32(wki_, xi_);
+    const float32x4_t c_ = vmulq_f32(wkr_, xi_);
+    const float32x4_t d_ = vmulq_f32(wki_, xr_);
+    const float32x4_t yr_ = vsubq_f32(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
+    const float32x4_t yi_ = vaddq_f32(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
+    // Update 'a'.
+    //    a[j2 + 0] -= yr;
+    //    a[j2 + 1] -= yi;
+    //    a[k2 + 0] += yr;
+    //    a[k2 + 1] -= yi;
+    // 126, 124, 122, 120,
+    const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
+    // 127, 125, 123, 121,
+    const float32x4_t a_k2_p1n = vsubq_f32(a_k2_p1, yi_);
+    // Shuffle in right order and store.
+    const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
+    const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
+    // 124, 125, 126, 127, 120, 121, 122, 123
+    const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
+    //   2,   4,   6,   8,
+    a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
+    //   3,   5,   7,   9,
+    a_j2_p.val[1] = vsubq_f32(a_j2_p.val[1], yi_);
+    //   2,   3,   4,   5,   6,   7,   8,   9,
+    vst2q_f32(&a[0 + j2], a_j2_p);
+
+    // val[1] holds the lower-addressed four floats, val[0] the upper four.
+    vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
+    vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
+  }
+
+  // Scalar code for the remaining items.
+  for (; j2 < 64; j1 += 1, j2 += 2) {
+    const int k2 = 128 - j2;
+    const int k1 = 32 - j1;
+    const float wkr = 0.5f - c[k1];
+    const float wki = c[j1];
+    const float xr = a[j2 + 0] - a[k2 + 0];
+    const float xi = a[j2 + 1] + a[k2 + 1];
+    const float yr = wkr * xr - wki * xi;
+    const float yi = wkr * xi + wki * xr;
+    a[j2 + 0] -= yr;
+    a[j2 + 1] -= yi;
+    a[k2 + 0] += yr;
+    a[k2 + 1] -= yi;
+  }
+}
+
+// NEON kernel that updates the 128-float buffer `a` in place, using the
+// coefficients c = rdft_w + 32.
+//
+// It first negates a[1]. Then, for every index pair (j2, k2 = 128 - j2),
+// j2 = 2, 4, ..., 62, with j1 = j2 / 2, wkr = 0.5 - c[32 - j1] and
+// wki = c[j1], it computes
+//    xr = a[j2] - a[k2];          xi = a[j2 + 1] + a[k2 + 1];
+//    yr = wkr * xr + wki * xi;    yi = wkr * xi - wki * xr;
+// and applies
+//    a[j2] = a[j2] - yr;          a[j2 + 1] = yi - a[j2 + 1];
+//    a[k2] = yr + a[k2];          a[k2 + 1] = yi - a[k2 + 1];
+// Finally it negates a[65]. The vector loop handles four pairs per iteration
+// (j2 = 2..56); the scalar loop finishes the rest (j2 = 58, 60, 62).
+void rftbsub_128_neon(float* a) {
+  const float* c = rdft_w + 32;
+  int j1, j2;
+  const float32x4_t mm_half = vdupq_n_f32(0.5f);
+
+  a[1] = -a[1];
+  // Vectorized code (four at once).
+  //    Note: commented number are indexes for the first iteration of the loop.
+  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
+    // Load 'wk'.
+    const float32x4_t c_j1 = vld1q_f32(&c[j1]);          //  1,  2,  3,  4,
+    const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]);     // 28, 29, 30, 31,
+    const float32x4_t wkrt = vsubq_f32(mm_half, c_k1);   // 28, 29, 30, 31,
+    const float32x4_t wkr_ = reverse_order_f32x4(wkrt);  // 31, 30, 29, 28,
+    const float32x4_t wki_ = c_j1;                       //  1,  2,  3,  4,
+    // Load and shuffle 'a'. vld2q de-interleaves: val[0] receives the
+    // even-index floats and val[1] the odd-index floats.
+    //   2,   4,   6,   8,   3,   5,   7,   9
+    float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
+    // 120, 122, 124, 126, 121, 123, 125, 127,
+    const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
+    // 126, 124, 122, 120
+    const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
+    // 127, 125, 123, 121
+    const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
+    // Calculate 'x'. The "m-n" labels below name the (j2, k2) index pair
+    // held in each lane; they are not arithmetic.
+    const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
+    // 2-126, 4-124, 6-122, 8-120,
+    const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
+    // 3-127, 5-125, 7-123, 9-121,
+    // Calculate product into 'y'.
+    //    yr = wkr * xr + wki * xi;
+    //    yi = wkr * xi - wki * xr;
+    const float32x4_t a_ = vmulq_f32(wkr_, xr_);
+    const float32x4_t b_ = vmulq_f32(wki_, xi_);
+    const float32x4_t c_ = vmulq_f32(wkr_, xi_);
+    const float32x4_t d_ = vmulq_f32(wki_, xr_);
+    const float32x4_t yr_ = vaddq_f32(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
+    const float32x4_t yi_ = vsubq_f32(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
+    // Update 'a'.
+    //    a[j2 + 0] = a[j2 + 0] - yr;
+    //    a[j2 + 1] = yi - a[j2 + 1];
+    //    a[k2 + 0] = yr + a[k2 + 0];
+    //    a[k2 + 1] = yi - a[k2 + 1];
+    // 126, 124, 122, 120,
+    const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
+    // 127, 125, 123, 121,
+    const float32x4_t a_k2_p1n = vsubq_f32(yi_, a_k2_p1);
+    // Shuffle in right order and store.
+    const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
+    const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
+    // 124, 125, 126, 127, 120, 121, 122, 123
+    const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
+    //   2,   4,   6,   8,
+    a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
+    //   3,   5,   7,   9,
+    a_j2_p.val[1] = vsubq_f32(yi_, a_j2_p.val[1]);
+    //   2,   3,   4,   5,   6,   7,   8,   9,
+    vst2q_f32(&a[0 + j2], a_j2_p);
+
+    // val[1] holds the lower-addressed four floats, val[0] the upper four.
+    vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
+    vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
+  }
+
+  // Scalar code for the remaining items.
+  for (; j2 < 64; j1 += 1, j2 += 2) {
+    const int k2 = 128 - j2;
+    const int k1 = 32 - j1;
+    const float wkr = 0.5f - c[k1];
+    const float wki = c[j1];
+    const float xr = a[j2 + 0] - a[k2 + 0];
+    const float xi = a[j2 + 1] + a[k2 + 1];
+    const float yr = wkr * xr + wki * xi;
+    const float yi = wkr * xi - wki * xr;
+    a[j2 + 0] = a[j2 + 0] - yr;
+    a[j2 + 1] = yi - a[j2 + 1];
+    a[k2 + 0] = yr + a[k2 + 0];
+    a[k2 + 1] = yi - a[k2 + 1];
+  }
+  a[65] = -a[65];
+}
+#endif
+
+}  // namespace webrtc
--- a/VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h
+++ b/VocieProcess/common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h
@ -0,0 +1,98 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_
+#define MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_
+
+#include "common_audio/third_party/ooura/fft_size_128/ooura_fft.h"
+#include "rtc_base/system/arch.h"
+
+#ifdef _MSC_VER /* visual c++ */
+#define ALIGN16_BEG __declspec(align(16))
+#define ALIGN16_END
+#else /* gcc or icc */
+#define ALIGN16_BEG
+#define ALIGN16_END __attribute__((aligned(16)))
+#endif
+
+namespace webrtc {
+
+// These tables used to be computed at run-time. For example, refer to:
+// https://code.google.com/p/webrtc/source/browse/trunk/webrtc/modules/audio_processing/utility/apm_rdft.c?r=6564
+// to see the initialization code.
+#if defined(WEBRTC_ARCH_X86_FAMILY) || defined(WEBRTC_HAS_NEON)
+// Constants shared by the SSE2 and NEON paths.
+// Sign mask {-1, +1, -1, +1}, 16-byte aligned. The NEON kernels pass it to
+// vmlaq_f32 / vmlsq_f32 as a multiplier, which flips the sign of lanes 0 and
+// 2 of the other operand.
+const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};
+
+// 16-byte-aligned coefficient table with each value stored twice in adjacent
+// slots. The NEON kernels load it four floats at a time and use it as the
+// multiplier of the un-swapped operand.
+// NOTE(review): presumably the real parts of the first twiddle-factor set of
+// the 128-point Ooura FFT -- confirm against the scalar tables.
+ALIGN16_BEG const float ALIGN16_END rdft_wk1r[32] = {
+    1.000000000f, 1.000000000f, 0.707106769f, 0.707106769f, 0.923879564f,
+    0.923879564f, 0.382683456f, 0.382683456f, 0.980785251f, 0.980785251f,
+    0.555570245f, 0.555570245f, 0.831469595f, 0.831469595f, 0.195090324f,
+    0.195090324f, 0.995184720f, 0.995184720f, 0.634393334f, 0.634393334f,
+    0.881921291f, 0.881921291f, 0.290284663f, 0.290284663f, 0.956940353f,
+    0.956940353f, 0.471396744f, 0.471396744f, 0.773010433f, 0.773010433f,
+    0.098017141f, 0.098017141f,
+};
+// 16-byte-aligned coefficient table with each value stored twice in adjacent
+// slots. The NEON kernels load it four floats at a time and use it as the
+// multiplier of the un-swapped operand.
+// NOTE(review): presumably the real parts of the second twiddle-factor set
+// of the 128-point Ooura FFT -- confirm against the scalar tables.
+ALIGN16_BEG const float ALIGN16_END rdft_wk2r[32] = {
+    1.000000000f,  1.000000000f,  -0.000000000f, -0.000000000f, 0.707106769f,
+    0.707106769f,  -0.707106769f, -0.707106769f, 0.923879564f,  0.923879564f,
+    -0.382683456f, -0.382683456f, 0.382683456f,  0.382683456f,  -0.923879564f,
+    -0.923879564f, 0.980785251f,  0.980785251f,  -0.195090324f, -0.195090324f,
+    0.555570245f,  0.555570245f,  -0.831469595f, -0.831469595f, 0.831469595f,
+    0.831469595f,  -0.555570245f, -0.555570245f, 0.195090324f,  0.195090324f,
+    -0.980785251f, -0.980785251f,
+};
+// 16-byte-aligned coefficient table with each value stored twice in adjacent
+// slots. The NEON kernels load it four floats at a time and use it as the
+// multiplier of the un-swapped operand.
+// NOTE(review): presumably the real parts of the third twiddle-factor set of
+// the 128-point Ooura FFT -- confirm against the scalar tables.
+ALIGN16_BEG const float ALIGN16_END rdft_wk3r[32] = {
+    1.000000000f,  1.000000000f,  -0.707106769f, -0.707106769f, 0.382683456f,
+    0.382683456f,  -0.923879564f, -0.923879564f, 0.831469536f,  0.831469536f,
+    -0.980785251f, -0.980785251f, -0.195090353f, -0.195090353f, -0.555570245f,
+    -0.555570245f, 0.956940353f,  0.956940353f,  -0.881921172f, -0.881921172f,
+    0.098017156f,  0.098017156f,  -0.773010492f, -0.773010492f, 0.634393334f,
+    0.634393334f,  -0.995184720f, -0.995184720f, -0.471396863f, -0.471396863f,
+    -0.290284693f, -0.290284693f,
+};
+// 16-byte-aligned coefficient table stored as adjacent pairs of opposite
+// sign. The NEON kernels load it four floats at a time and use it as the
+// multiplier of the pair-swapped (vrev64q_f32) operand.
+// NOTE(review): presumably the imaginary parts matching rdft_wk1r -- confirm
+// against the scalar tables.
+ALIGN16_BEG const float ALIGN16_END rdft_wk1i[32] = {
+    -0.000000000f, 0.000000000f,  -0.707106769f, 0.707106769f,  -0.382683456f,
+    0.382683456f,  -0.923879564f, 0.923879564f,  -0.195090324f, 0.195090324f,
+    -0.831469595f, 0.831469595f,  -0.555570245f, 0.555570245f,  -0.980785251f,
+    0.980785251f,  -0.098017141f, 0.098017141f,  -0.773010433f, 0.773010433f,
+    -0.471396744f, 0.471396744f,  -0.956940353f, 0.956940353f,  -0.290284663f,
+    0.290284663f,  -0.881921291f, 0.881921291f,  -0.634393334f, 0.634393334f,
+    -0.995184720f, 0.995184720f,
+};
+// 16-byte-aligned coefficient table stored as adjacent pairs of opposite
+// sign. The NEON kernels load it four floats at a time and use it as the
+// multiplier of the pair-swapped (vrev64q_f32) operand.
+// NOTE(review): presumably the imaginary parts matching rdft_wk2r -- confirm
+// against the scalar tables.
+ALIGN16_BEG const float ALIGN16_END rdft_wk2i[32] = {
+    -0.000000000f, 0.000000000f,  -1.000000000f, 1.000000000f,  -0.707106769f,
+    0.707106769f,  -0.707106769f, 0.707106769f,  -0.382683456f, 0.382683456f,
+    -0.923879564f, 0.923879564f,  -0.923879564f, 0.923879564f,  -0.382683456f,
+    0.382683456f,  -0.195090324f, 0.195090324f,  -0.980785251f, 0.980785251f,
+    -0.831469595f, 0.831469595f,  -0.555570245f, 0.555570245f,  -0.555570245f,
+    0.555570245f,  -0.831469595f, 0.831469595f,  -0.980785251f, 0.980785251f,
+    -0.195090324f, 0.195090324f,
+};
+// 16-byte-aligned coefficient table stored as adjacent pairs of opposite
+// sign (some pairs are ordered +/-, others -/+). The NEON kernels load it
+// four floats at a time and use it as the multiplier of the pair-swapped
+// (vrev64q_f32) operand.
+// NOTE(review): presumably the imaginary parts matching rdft_wk3r -- confirm
+// against the scalar tables.
+ALIGN16_BEG const float ALIGN16_END rdft_wk3i[32] = {
+    -0.000000000f, 0.000000000f,  -0.707106769f, 0.707106769f,  -0.923879564f,
+    0.923879564f,  0.382683456f,  -0.382683456f, -0.555570245f, 0.555570245f,
+    -0.195090353f, 0.195090353f,  -0.980785251f, 0.980785251f,  0.831469536f,
+    -0.831469536f, -0.290284693f, 0.290284693f,  -0.471396863f, 0.471396863f,
+    -0.995184720f, 0.995184720f,  0.634393334f,  -0.634393334f, -0.773010492f,
+    0.773010492f,  0.098017156f,  -0.098017156f, -0.881921172f, 0.881921172f,
+    0.956940353f,  -0.956940353f,
+};
+// 16-byte-aligned constant {w, w, w, -w} with w = 0.707106769f. It is loaded
+// by cftmdl_128_neon and multiplied lane-wise into the combined add/sub
+// vector in that function's first loop.
+ALIGN16_BEG const float ALIGN16_END cftmdl_wk1r[4] = {
+    0.707106769f,
+    0.707106769f,
+    0.707106769f,
+    -0.707106769f,
+};
+#endif
+
+}  // namespace webrtc
+
+#endif  // MODULES_AUDIO_PROCESSING_UTILITY_OOURA_FFT_TABLES_NEON_SSE2_H_