add neon code.

2024-09-06 11:01:01 +08:00
parent fe7ab957c8
commit c6debcc62a
11 changed files with 1312 additions and 21 deletions
--- a/VocieProcess/common_audio/signal_processing/cross_correlation_neon.c
+++ b/VocieProcess/common_audio/signal_processing/cross_correlation_neon.c
@ -0,0 +1,88 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "common_audio/signal_processing/include/signal_processing_library.h"
+#include "rtc_base/system/arch.h"
+
+#include <arm_neon.h>
+
+static inline void DotProductWithScaleNeon(int32_t* cross_correlation,
+                                           const int16_t* vector1,
+                                           const int16_t* vector2,
+                                           size_t length,
+                                           int scaling) {
+  size_t i = 0;
+  size_t len1 = length >> 3;
+  size_t len2 = length & 7;
+  int64x2_t sum0 = vdupq_n_s64(0);
+  int64x2_t sum1 = vdupq_n_s64(0);
+
+  for (i = len1; i > 0; i -= 1) {
+    int16x8_t seq1_16x8 = vld1q_s16(vector1);
+    int16x8_t seq2_16x8 = vld1q_s16(vector2);
+#if defined(WEBRTC_ARCH_ARM64)
+    int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
+                               vget_low_s16(seq2_16x8));
+    int32x4_t tmp1 = vmull_high_s16(seq1_16x8, seq2_16x8);
+#else
+    int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
+                               vget_low_s16(seq2_16x8));
+    int32x4_t tmp1 = vmull_s16(vget_high_s16(seq1_16x8),
+                               vget_high_s16(seq2_16x8));
+#endif
+    sum0 = vpadalq_s32(sum0, tmp0);
+    sum1 = vpadalq_s32(sum1, tmp1);
+    vector1 += 8;
+    vector2 += 8;
+  }
+
+  // Calculate the rest of the samples.
+  int64_t sum_res = 0;
+  for (i = len2; i > 0; i -= 1) {
+    sum_res += WEBRTC_SPL_MUL_16_16(*vector1, *vector2);
+    vector1++;
+    vector2++;
+  }
+
+  sum0 = vaddq_s64(sum0, sum1);
+#if defined(WEBRTC_ARCH_ARM64)
+  int64_t sum2 = vaddvq_s64(sum0);
+  *cross_correlation = (int32_t)((sum2 + sum_res) >> scaling);
+#else
+  int64x1_t shift = vdup_n_s64(-scaling);
+  int64x1_t sum2 = vadd_s64(vget_low_s64(sum0), vget_high_s64(sum0));
+  sum2 = vadd_s64(sum2, vdup_n_s64(sum_res));
+  sum2 = vshl_s64(sum2, shift);
+  vst1_lane_s32(cross_correlation, vreinterpret_s32_s64(sum2), 0);
+#endif
+}
+
+/* NEON version of WebRtcSpl_CrossCorrelation() for ARM32/64 platforms. */
+void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
+                                    const int16_t* seq1,
+                                    const int16_t* seq2,
+                                    size_t dim_seq,
+                                    size_t dim_cross_correlation,
+                                    int right_shifts,
+                                    int step_seq2) {
+  int i = 0;
+
+  for (i = 0; i < (int)dim_cross_correlation; i++) {
+    const int16_t* seq1_ptr = seq1;
+    const int16_t* seq2_ptr = seq2 + (step_seq2 * i);
+
+    DotProductWithScaleNeon(cross_correlation,
+                            seq1_ptr,
+                            seq2_ptr,
+                            dim_seq,
+                            right_shifts);
+    cross_correlation++;
+  }
+}
--- a/VocieProcess/common_audio/signal_processing/downsample_fast_neon.c
+++ b/VocieProcess/common_audio/signal_processing/downsample_fast_neon.c
@ -0,0 +1,224 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "common_audio/signal_processing/include/signal_processing_library.h"
+
+#include "rtc_base/checks.h"
+
+// NEON intrinsics version of WebRtcSpl_DownsampleFast()
+// for ARM 32-bit/64-bit platforms.
+int WebRtcSpl_DownsampleFastNeon(const int16_t* data_in,
+                                 size_t data_in_length,
+                                 int16_t* data_out,
+                                 size_t data_out_length,
+                                 const int16_t* __restrict coefficients,
+                                 size_t coefficients_length,
+                                 int factor,
+                                 size_t delay) {
+  // Using signed indexes to be able to compute negative i-j that
+  // is used to index data_in.
+  int i = 0;
+  int j = 0;
+  int32_t out_s32 = 0;
+  int endpos = delay + factor * (data_out_length - 1) + 1;
+  size_t res = data_out_length & 0x7;
+  int endpos1 = endpos - factor * res;
+
+  // Return error if any of the running conditions doesn't meet.
+  if (data_out_length == 0 || coefficients_length == 0
+                           || (int)data_in_length < endpos) {
+    return -1;
+  }
+
+  RTC_DCHECK_GE(endpos, 0);
+  RTC_DCHECK_GE(endpos1, 0);
+
+  // First part, unroll the loop 8 times, with 3 subcases
+  // (factor == 2, 4, others).
+  switch (factor) {
+    case 2: {
+      for (i = delay; i < endpos1; i += 16) {
+        // Round value, 0.5 in Q12.
+        int32x4_t out32x4_0 = vdupq_n_s32(2048);
+        int32x4_t out32x4_1 = vdupq_n_s32(2048);
+
+#if defined(WEBRTC_ARCH_ARM64)
+        // Unroll the loop 2 times.
+        for (j = 0; j < (int)coefficients_length - 1; j += 2) {
+          int32x2_t coeff32 = vld1_dup_s32((int32_t*)&coefficients[j]);
+          int16x4_t coeff16x4 = vreinterpret_s16_s32(coeff32);
+          int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j - 1]);
+
+          // Mul and accumulate low 64-bit data.
+          int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
+          int16x4_t in16x4_1 = vget_low_s16(in16x8x2.val[1]);
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 1);
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_1, coeff16x4, 0);
+
+          // Mul and accumulate high 64-bit data.
+          // TODO: vget_high_s16 need extra cost on ARM64. This could be
+          // replaced by vmlal_high_lane_s16. But for the interface of
+          // vmlal_high_lane_s16, there is a bug in gcc 4.9.
+          // This issue need to be tracked in the future.
+          int16x4_t in16x4_2 = vget_high_s16(in16x8x2.val[0]);
+          int16x4_t in16x4_3 = vget_high_s16(in16x8x2.val[1]);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_2, coeff16x4, 1);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 0);
+        }
+
+        for (; j < (int)coefficients_length; j++) {
+          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
+          int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]);
+
+          // Mul and accumulate low 64-bit data.
+          int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
+
+          // Mul and accumulate high 64-bit data.
+          // TODO: vget_high_s16 need extra cost on ARM64. This could be
+          // replaced by vmlal_high_lane_s16. But for the interface of
+          // vmlal_high_lane_s16, there is a bug in gcc 4.9.
+          // This issue need to be tracked in the future.
+          int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
+        }
+#else
+        // On ARMv7, the loop unrolling 2 times results in performance
+        // regression.
+        for (j = 0; j < (int)coefficients_length; j++) {
+          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
+          int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]);
+
+          // Mul and accumulate.
+          int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
+          int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]);
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
+        }
+#endif
+
+        // Saturate and store the output.
+        int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
+        int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
+        vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
+        data_out += 8;
+      }
+      break;
+    }
+    case 4: {
+      for (i = delay; i < endpos1; i += 32) {
+        // Round value, 0.5 in Q12.
+        int32x4_t out32x4_0 = vdupq_n_s32(2048);
+        int32x4_t out32x4_1 = vdupq_n_s32(2048);
+
+        // Unroll the loop 4 times.
+        for (j = 0; j < (int)coefficients_length - 3; j += 4) {
+          int16x4_t coeff16x4 = vld1_s16(&coefficients[j]);
+          int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j - 3]);
+
+          // Mul and accumulate low 64-bit data.
+          int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]);
+          int16x4_t in16x4_2 = vget_low_s16(in16x8x4.val[1]);
+          int16x4_t in16x4_4 = vget_low_s16(in16x8x4.val[2]);
+          int16x4_t in16x4_6 = vget_low_s16(in16x8x4.val[3]);
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 3);
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_2, coeff16x4, 2);
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_4, coeff16x4, 1);
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_6, coeff16x4, 0);
+
+          // Mul and accumulate high 64-bit data.
+          // TODO: vget_high_s16 need extra cost on ARM64. This could be
+          // replaced by vmlal_high_lane_s16. But for the interface of
+          // vmlal_high_lane_s16, there is a bug in gcc 4.9.
+          // This issue need to be tracked in the future.
+          int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]);
+          int16x4_t in16x4_3 = vget_high_s16(in16x8x4.val[1]);
+          int16x4_t in16x4_5 = vget_high_s16(in16x8x4.val[2]);
+          int16x4_t in16x4_7 = vget_high_s16(in16x8x4.val[3]);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 3);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 2);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_5, coeff16x4, 1);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_7, coeff16x4, 0);
+        }
+
+        for (; j < (int)coefficients_length; j++) {
+          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
+          int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j]);
+
+          // Mul and accumulate low 64-bit data.
+          int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]);
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
+
+          // Mul and accumulate high 64-bit data.
+          // TODO: vget_high_s16 need extra cost on ARM64. This could be
+          // replaced by vmlal_high_lane_s16. But for the interface of
+          // vmlal_high_lane_s16, there is a bug in gcc 4.9.
+          // This issue need to be tracked in the future.
+          int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
+        }
+
+        // Saturate and store the output.
+        int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
+        int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
+        vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
+        data_out += 8;
+      }
+      break;
+    }
+    default: {
+      for (i = delay; i < endpos1; i += factor * 8) {
+        // Round value, 0.5 in Q12.
+        int32x4_t out32x4_0 = vdupq_n_s32(2048);
+        int32x4_t out32x4_1 = vdupq_n_s32(2048);
+
+        for (j = 0; j < (int)coefficients_length; j++) {
+          int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
+          int16x4_t in16x4_0 = vld1_dup_s16(&data_in[i - j]);
+          in16x4_0 = vld1_lane_s16(&data_in[i + factor - j], in16x4_0, 1);
+          in16x4_0 = vld1_lane_s16(&data_in[i + factor * 2 - j], in16x4_0, 2);
+          in16x4_0 = vld1_lane_s16(&data_in[i + factor * 3 - j], in16x4_0, 3);
+          int16x4_t in16x4_1 = vld1_dup_s16(&data_in[i + factor * 4 - j]);
+          in16x4_1 = vld1_lane_s16(&data_in[i + factor * 5 - j], in16x4_1, 1);
+          in16x4_1 = vld1_lane_s16(&data_in[i + factor * 6 - j], in16x4_1, 2);
+          in16x4_1 = vld1_lane_s16(&data_in[i + factor * 7 - j], in16x4_1, 3);
+
+          // Mul and accumulate.
+          out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
+          out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
+        }
+
+        // Saturate and store the output.
+        int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
+        int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
+        vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
+        data_out += 8;
+      }
+      break;
+    }
+  }
+
+  // Second part, do the rest iterations (if any).
+  for (; i < endpos; i += factor) {
+    out_s32 = 2048;  // Round value, 0.5 in Q12.
+
+    for (j = 0; j < (int)coefficients_length; j++) {
+      out_s32 = WebRtc_MulAccumW16(coefficients[j], data_in[i - j], out_s32);
+    }
+
+    // Saturate and store the output.
+    out_s32 >>= 12;
+    *data_out++ = WebRtcSpl_SatW32ToW16(out_s32);
+  }
+
+  return 0;
+}
--- a/VocieProcess/common_audio/signal_processing/min_max_operations_neon.c
+++ b/VocieProcess/common_audio/signal_processing/min_max_operations_neon.c
@ -0,0 +1,333 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <stdlib.h>
+
+#include "rtc_base/checks.h"
+#include "common_audio/signal_processing/include/signal_processing_library.h"
+
+// Maximum absolute value of word16 vector. C version for generic platforms.
+int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, size_t length) {
+  int absolute = 0, maximum = 0;
+
+  RTC_DCHECK_GT(length, 0);
+
+  const int16_t* p_start = vector;
+  size_t rest = length & 7;
+  const int16_t* p_end = vector + length - rest;
+
+  int16x8_t v;
+  uint16x8_t max_qv;
+  max_qv = vdupq_n_u16(0);
+
+  while (p_start < p_end) {
+    v = vld1q_s16(p_start);
+    // Note vabs doesn't change the value of -32768.
+    v = vabsq_s16(v);
+    // Use u16 so we don't lose the value -32768.
+    max_qv = vmaxq_u16(max_qv, vreinterpretq_u16_s16(v));
+    p_start += 8;
+  }
+
+#ifdef WEBRTC_ARCH_ARM64
+  maximum = (int)vmaxvq_u16(max_qv);
+#else
+  uint16x4_t max_dv;
+  max_dv = vmax_u16(vget_low_u16(max_qv), vget_high_u16(max_qv));
+  max_dv = vpmax_u16(max_dv, max_dv);
+  max_dv = vpmax_u16(max_dv, max_dv);
+
+  maximum = (int)vget_lane_u16(max_dv, 0);
+#endif
+
+  p_end = vector + length;
+  while (p_start < p_end) {
+    absolute = abs((int)(*p_start));
+
+    if (absolute > maximum) {
+      maximum = absolute;
+    }
+    p_start++;
+  }
+
+  // Guard the case for abs(-32768).
+  if (maximum > WEBRTC_SPL_WORD16_MAX) {
+    maximum = WEBRTC_SPL_WORD16_MAX;
+  }
+
+  return (int16_t)maximum;
+}
+
+// Maximum absolute value of word32 vector. NEON intrinsics version for
+// ARM 32-bit/64-bit platforms.
+int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, size_t length) {
+  // Use uint32_t for the local variables, to accommodate the return value
+  // of abs(0x80000000), which is 0x80000000.
+
+  uint32_t absolute = 0, maximum = 0;
+  size_t i = 0;
+  size_t residual = length & 0x7;
+
+  RTC_DCHECK_GT(length, 0);
+
+  const int32_t* p_start = vector;
+  uint32x4_t max32x4_0 = vdupq_n_u32(0);
+  uint32x4_t max32x4_1 = vdupq_n_u32(0);
+
+  // First part, unroll the loop 8 times.
+  for (i = 0; i < length - residual; i += 8) {
+    int32x4_t in32x4_0 = vld1q_s32(p_start);
+    p_start += 4;
+    int32x4_t in32x4_1 = vld1q_s32(p_start);
+    p_start += 4;
+    in32x4_0 = vabsq_s32(in32x4_0);
+    in32x4_1 = vabsq_s32(in32x4_1);
+    // vabs doesn't change the value of 0x80000000.
+    // Use u32 so we don't lose the value 0x80000000.
+    max32x4_0 = vmaxq_u32(max32x4_0, vreinterpretq_u32_s32(in32x4_0));
+    max32x4_1 = vmaxq_u32(max32x4_1, vreinterpretq_u32_s32(in32x4_1));
+  }
+
+  uint32x4_t max32x4 = vmaxq_u32(max32x4_0, max32x4_1);
+#if defined(WEBRTC_ARCH_ARM64)
+  maximum = vmaxvq_u32(max32x4);
+#else
+  uint32x2_t max32x2 = vmax_u32(vget_low_u32(max32x4), vget_high_u32(max32x4));
+  max32x2 = vpmax_u32(max32x2, max32x2);
+
+  maximum = vget_lane_u32(max32x2, 0);
+#endif
+
+  // Second part, do the remaining iterations (if any).
+  for (i = residual; i > 0; i--) {
+    absolute = abs((int)(*p_start));
+    if (absolute > maximum) {
+      maximum = absolute;
+    }
+    p_start++;
+  }
+
+  // Guard against the case for 0x80000000.
+  maximum = WEBRTC_SPL_MIN(maximum, WEBRTC_SPL_WORD32_MAX);
+
+  return (int32_t)maximum;
+}
+
+// Maximum value of word16 vector. NEON intrinsics version for
+// ARM 32-bit/64-bit platforms.
+int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, size_t length) {
+  int16_t maximum = WEBRTC_SPL_WORD16_MIN;
+  size_t i = 0;
+  size_t residual = length & 0x7;
+
+  RTC_DCHECK_GT(length, 0);
+
+  const int16_t* p_start = vector;
+  int16x8_t max16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN);
+
+  // First part, unroll the loop 8 times.
+  for (i = 0; i < length - residual; i += 8) {
+    int16x8_t in16x8 = vld1q_s16(p_start);
+    max16x8 = vmaxq_s16(max16x8, in16x8);
+    p_start += 8;
+  }
+
+#if defined(WEBRTC_ARCH_ARM64)
+  maximum = vmaxvq_s16(max16x8);
+#else
+  int16x4_t max16x4 = vmax_s16(vget_low_s16(max16x8), vget_high_s16(max16x8));
+  max16x4 = vpmax_s16(max16x4, max16x4);
+  max16x4 = vpmax_s16(max16x4, max16x4);
+
+  maximum = vget_lane_s16(max16x4, 0);
+#endif
+
+  // Second part, do the remaining iterations (if any).
+  for (i = residual; i > 0; i--) {
+    if (*p_start > maximum)
+      maximum = *p_start;
+    p_start++;
+  }
+  return maximum;
+}
+
+// Maximum value of word32 vector. NEON intrinsics version for
+// ARM 32-bit/64-bit platforms.
+int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, size_t length) {
+  int32_t maximum = WEBRTC_SPL_WORD32_MIN;
+  size_t i = 0;
+  size_t residual = length & 0x7;
+
+  RTC_DCHECK_GT(length, 0);
+
+  const int32_t* p_start = vector;
+  int32x4_t max32x4_0 = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
+  int32x4_t max32x4_1 = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
+
+  // First part, unroll the loop 8 times.
+  for (i = 0; i < length - residual; i += 8) {
+    int32x4_t in32x4_0 = vld1q_s32(p_start);
+    p_start += 4;
+    int32x4_t in32x4_1 = vld1q_s32(p_start);
+    p_start += 4;
+    max32x4_0 = vmaxq_s32(max32x4_0, in32x4_0);
+    max32x4_1 = vmaxq_s32(max32x4_1, in32x4_1);
+  }
+
+  int32x4_t max32x4 = vmaxq_s32(max32x4_0, max32x4_1);
+#if defined(WEBRTC_ARCH_ARM64)
+  maximum = vmaxvq_s32(max32x4);
+#else
+  int32x2_t max32x2 = vmax_s32(vget_low_s32(max32x4), vget_high_s32(max32x4));
+  max32x2 = vpmax_s32(max32x2, max32x2);
+
+  maximum = vget_lane_s32(max32x2, 0);
+#endif
+
+  // Second part, do the remaining iterations (if any).
+  for (i = residual; i > 0; i--) {
+    if (*p_start > maximum)
+      maximum = *p_start;
+    p_start++;
+  }
+  return maximum;
+}
+
+// Minimum value of word16 vector. NEON intrinsics version for
+// ARM 32-bit/64-bit platforms.
+int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, size_t length) {
+  int16_t minimum = WEBRTC_SPL_WORD16_MAX;
+  size_t i = 0;
+  size_t residual = length & 0x7;
+
+  RTC_DCHECK_GT(length, 0);
+
+  const int16_t* p_start = vector;
+  int16x8_t min16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX);
+
+  // First part, unroll the loop 8 times.
+  for (i = 0; i < length - residual; i += 8) {
+    int16x8_t in16x8 = vld1q_s16(p_start);
+    min16x8 = vminq_s16(min16x8, in16x8);
+    p_start += 8;
+  }
+
+#if defined(WEBRTC_ARCH_ARM64)
+  minimum = vminvq_s16(min16x8);
+#else
+  int16x4_t min16x4 = vmin_s16(vget_low_s16(min16x8), vget_high_s16(min16x8));
+  min16x4 = vpmin_s16(min16x4, min16x4);
+  min16x4 = vpmin_s16(min16x4, min16x4);
+
+  minimum = vget_lane_s16(min16x4, 0);
+#endif
+
+  // Second part, do the remaining iterations (if any).
+  for (i = residual; i > 0; i--) {
+    if (*p_start < minimum)
+      minimum = *p_start;
+    p_start++;
+  }
+  return minimum;
+}
+
+// Minimum value of word32 vector. NEON intrinsics version for
+// ARM 32-bit/64-bit platforms.
+int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, size_t length) {
+  int32_t minimum = WEBRTC_SPL_WORD32_MAX;
+  size_t i = 0;
+  size_t residual = length & 0x7;
+
+  RTC_DCHECK_GT(length, 0);
+
+  const int32_t* p_start = vector;
+  int32x4_t min32x4_0 = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
+  int32x4_t min32x4_1 = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
+
+  // First part, unroll the loop 8 times.
+  for (i = 0; i < length - residual; i += 8) {
+    int32x4_t in32x4_0 = vld1q_s32(p_start);
+    p_start += 4;
+    int32x4_t in32x4_1 = vld1q_s32(p_start);
+    p_start += 4;
+    min32x4_0 = vminq_s32(min32x4_0, in32x4_0);
+    min32x4_1 = vminq_s32(min32x4_1, in32x4_1);
+  }
+
+  int32x4_t min32x4 = vminq_s32(min32x4_0, min32x4_1);
+#if defined(WEBRTC_ARCH_ARM64)
+  minimum = vminvq_s32(min32x4);
+#else
+  int32x2_t min32x2 = vmin_s32(vget_low_s32(min32x4), vget_high_s32(min32x4));
+  min32x2 = vpmin_s32(min32x2, min32x2);
+
+  minimum = vget_lane_s32(min32x2, 0);
+#endif
+
+  // Second part, do the remaining iterations (if any).
+  for (i = residual; i > 0; i--) {
+    if (*p_start < minimum)
+      minimum = *p_start;
+    p_start++;
+  }
+  return minimum;
+}
+
+// Finds both the minimum and maximum elements in an array of 16-bit integers.
+void WebRtcSpl_MinMaxW16Neon(const int16_t* vector, size_t length,
+                             int16_t* min_val, int16_t* max_val) {
+  int16_t minimum = WEBRTC_SPL_WORD16_MAX;
+  int16_t maximum = WEBRTC_SPL_WORD16_MIN;
+  size_t i = 0;
+  size_t residual = length & 0x7;
+
+  RTC_DCHECK_GT(length, 0);
+
+  const int16_t* p_start = vector;
+  int16x8_t min16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX);
+  int16x8_t max16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN);
+
+  // First part, unroll the loop 8 times.
+  for (i = 0; i < length - residual; i += 8) {
+    int16x8_t in16x8 = vld1q_s16(p_start);
+    min16x8 = vminq_s16(min16x8, in16x8);
+    max16x8 = vmaxq_s16(max16x8, in16x8);
+    p_start += 8;
+  }
+
+#if defined(WEBRTC_ARCH_ARM64)
+  minimum = vminvq_s16(min16x8);
+  maximum = vmaxvq_s16(max16x8);
+#else
+  int16x4_t min16x4 = vmin_s16(vget_low_s16(min16x8), vget_high_s16(min16x8));
+  min16x4 = vpmin_s16(min16x4, min16x4);
+  min16x4 = vpmin_s16(min16x4, min16x4);
+
+  minimum = vget_lane_s16(min16x4, 0);
+
+  int16x4_t max16x4 = vmax_s16(vget_low_s16(max16x8), vget_high_s16(max16x8));
+  max16x4 = vpmax_s16(max16x4, max16x4);
+  max16x4 = vpmax_s16(max16x4, max16x4);
+
+  maximum = vget_lane_s16(max16x4, 0);
+#endif
+
+  // Second part, do the remaining iterations (if any).
+  for (i = residual; i > 0; i--) {
+    if (*p_start < minimum)
+      minimum = *p_start;
+    if (*p_start > maximum)
+      maximum = *p_start;
+    p_start++;
+  }
+  *min_val = minimum;
+  *max_val = maximum;
+}