add neon code.
This commit is contained in:
@ -0,0 +1,88 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "common_audio/signal_processing/include/signal_processing_library.h"
|
||||
#include "rtc_base/system/arch.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
static inline void DotProductWithScaleNeon(int32_t* cross_correlation,
|
||||
const int16_t* vector1,
|
||||
const int16_t* vector2,
|
||||
size_t length,
|
||||
int scaling) {
|
||||
size_t i = 0;
|
||||
size_t len1 = length >> 3;
|
||||
size_t len2 = length & 7;
|
||||
int64x2_t sum0 = vdupq_n_s64(0);
|
||||
int64x2_t sum1 = vdupq_n_s64(0);
|
||||
|
||||
for (i = len1; i > 0; i -= 1) {
|
||||
int16x8_t seq1_16x8 = vld1q_s16(vector1);
|
||||
int16x8_t seq2_16x8 = vld1q_s16(vector2);
|
||||
#if defined(WEBRTC_ARCH_ARM64)
|
||||
int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
|
||||
vget_low_s16(seq2_16x8));
|
||||
int32x4_t tmp1 = vmull_high_s16(seq1_16x8, seq2_16x8);
|
||||
#else
|
||||
int32x4_t tmp0 = vmull_s16(vget_low_s16(seq1_16x8),
|
||||
vget_low_s16(seq2_16x8));
|
||||
int32x4_t tmp1 = vmull_s16(vget_high_s16(seq1_16x8),
|
||||
vget_high_s16(seq2_16x8));
|
||||
#endif
|
||||
sum0 = vpadalq_s32(sum0, tmp0);
|
||||
sum1 = vpadalq_s32(sum1, tmp1);
|
||||
vector1 += 8;
|
||||
vector2 += 8;
|
||||
}
|
||||
|
||||
// Calculate the rest of the samples.
|
||||
int64_t sum_res = 0;
|
||||
for (i = len2; i > 0; i -= 1) {
|
||||
sum_res += WEBRTC_SPL_MUL_16_16(*vector1, *vector2);
|
||||
vector1++;
|
||||
vector2++;
|
||||
}
|
||||
|
||||
sum0 = vaddq_s64(sum0, sum1);
|
||||
#if defined(WEBRTC_ARCH_ARM64)
|
||||
int64_t sum2 = vaddvq_s64(sum0);
|
||||
*cross_correlation = (int32_t)((sum2 + sum_res) >> scaling);
|
||||
#else
|
||||
int64x1_t shift = vdup_n_s64(-scaling);
|
||||
int64x1_t sum2 = vadd_s64(vget_low_s64(sum0), vget_high_s64(sum0));
|
||||
sum2 = vadd_s64(sum2, vdup_n_s64(sum_res));
|
||||
sum2 = vshl_s64(sum2, shift);
|
||||
vst1_lane_s32(cross_correlation, vreinterpret_s32_s64(sum2), 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* NEON version of WebRtcSpl_CrossCorrelation() for ARM32/64 platforms. */
|
||||
void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
|
||||
const int16_t* seq1,
|
||||
const int16_t* seq2,
|
||||
size_t dim_seq,
|
||||
size_t dim_cross_correlation,
|
||||
int right_shifts,
|
||||
int step_seq2) {
|
||||
int i = 0;
|
||||
|
||||
for (i = 0; i < (int)dim_cross_correlation; i++) {
|
||||
const int16_t* seq1_ptr = seq1;
|
||||
const int16_t* seq2_ptr = seq2 + (step_seq2 * i);
|
||||
|
||||
DotProductWithScaleNeon(cross_correlation,
|
||||
seq1_ptr,
|
||||
seq2_ptr,
|
||||
dim_seq,
|
||||
right_shifts);
|
||||
cross_correlation++;
|
||||
}
|
||||
}
|
@ -0,0 +1,224 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "common_audio/signal_processing/include/signal_processing_library.h"
|
||||
|
||||
#include "rtc_base/checks.h"
|
||||
|
||||
// NEON intrinsics version of WebRtcSpl_DownsampleFast()
|
||||
// for ARM 32-bit/64-bit platforms.
|
||||
int WebRtcSpl_DownsampleFastNeon(const int16_t* data_in,
|
||||
size_t data_in_length,
|
||||
int16_t* data_out,
|
||||
size_t data_out_length,
|
||||
const int16_t* __restrict coefficients,
|
||||
size_t coefficients_length,
|
||||
int factor,
|
||||
size_t delay) {
|
||||
// Using signed indexes to be able to compute negative i-j that
|
||||
// is used to index data_in.
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
int32_t out_s32 = 0;
|
||||
int endpos = delay + factor * (data_out_length - 1) + 1;
|
||||
size_t res = data_out_length & 0x7;
|
||||
int endpos1 = endpos - factor * res;
|
||||
|
||||
// Return error if any of the running conditions doesn't meet.
|
||||
if (data_out_length == 0 || coefficients_length == 0
|
||||
|| (int)data_in_length < endpos) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
RTC_DCHECK_GE(endpos, 0);
|
||||
RTC_DCHECK_GE(endpos1, 0);
|
||||
|
||||
// First part, unroll the loop 8 times, with 3 subcases
|
||||
// (factor == 2, 4, others).
|
||||
switch (factor) {
|
||||
case 2: {
|
||||
for (i = delay; i < endpos1; i += 16) {
|
||||
// Round value, 0.5 in Q12.
|
||||
int32x4_t out32x4_0 = vdupq_n_s32(2048);
|
||||
int32x4_t out32x4_1 = vdupq_n_s32(2048);
|
||||
|
||||
#if defined(WEBRTC_ARCH_ARM64)
|
||||
// Unroll the loop 2 times.
|
||||
for (j = 0; j < (int)coefficients_length - 1; j += 2) {
|
||||
int32x2_t coeff32 = vld1_dup_s32((int32_t*)&coefficients[j]);
|
||||
int16x4_t coeff16x4 = vreinterpret_s16_s32(coeff32);
|
||||
int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j - 1]);
|
||||
|
||||
// Mul and accumulate low 64-bit data.
|
||||
int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
|
||||
int16x4_t in16x4_1 = vget_low_s16(in16x8x2.val[1]);
|
||||
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 1);
|
||||
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_1, coeff16x4, 0);
|
||||
|
||||
// Mul and accumulate high 64-bit data.
|
||||
// TODO: vget_high_s16 need extra cost on ARM64. This could be
|
||||
// replaced by vmlal_high_lane_s16. But for the interface of
|
||||
// vmlal_high_lane_s16, there is a bug in gcc 4.9.
|
||||
// This issue need to be tracked in the future.
|
||||
int16x4_t in16x4_2 = vget_high_s16(in16x8x2.val[0]);
|
||||
int16x4_t in16x4_3 = vget_high_s16(in16x8x2.val[1]);
|
||||
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_2, coeff16x4, 1);
|
||||
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 0);
|
||||
}
|
||||
|
||||
for (; j < (int)coefficients_length; j++) {
|
||||
int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
|
||||
int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]);
|
||||
|
||||
// Mul and accumulate low 64-bit data.
|
||||
int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
|
||||
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
|
||||
|
||||
// Mul and accumulate high 64-bit data.
|
||||
// TODO: vget_high_s16 need extra cost on ARM64. This could be
|
||||
// replaced by vmlal_high_lane_s16. But for the interface of
|
||||
// vmlal_high_lane_s16, there is a bug in gcc 4.9.
|
||||
// This issue need to be tracked in the future.
|
||||
int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]);
|
||||
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
|
||||
}
|
||||
#else
|
||||
// On ARMv7, the loop unrolling 2 times results in performance
|
||||
// regression.
|
||||
for (j = 0; j < (int)coefficients_length; j++) {
|
||||
int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
|
||||
int16x8x2_t in16x8x2 = vld2q_s16(&data_in[i - j]);
|
||||
|
||||
// Mul and accumulate.
|
||||
int16x4_t in16x4_0 = vget_low_s16(in16x8x2.val[0]);
|
||||
int16x4_t in16x4_1 = vget_high_s16(in16x8x2.val[0]);
|
||||
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
|
||||
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Saturate and store the output.
|
||||
int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
|
||||
int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
|
||||
vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
|
||||
data_out += 8;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 4: {
|
||||
for (i = delay; i < endpos1; i += 32) {
|
||||
// Round value, 0.5 in Q12.
|
||||
int32x4_t out32x4_0 = vdupq_n_s32(2048);
|
||||
int32x4_t out32x4_1 = vdupq_n_s32(2048);
|
||||
|
||||
// Unroll the loop 4 times.
|
||||
for (j = 0; j < (int)coefficients_length - 3; j += 4) {
|
||||
int16x4_t coeff16x4 = vld1_s16(&coefficients[j]);
|
||||
int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j - 3]);
|
||||
|
||||
// Mul and accumulate low 64-bit data.
|
||||
int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]);
|
||||
int16x4_t in16x4_2 = vget_low_s16(in16x8x4.val[1]);
|
||||
int16x4_t in16x4_4 = vget_low_s16(in16x8x4.val[2]);
|
||||
int16x4_t in16x4_6 = vget_low_s16(in16x8x4.val[3]);
|
||||
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 3);
|
||||
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_2, coeff16x4, 2);
|
||||
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_4, coeff16x4, 1);
|
||||
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_6, coeff16x4, 0);
|
||||
|
||||
// Mul and accumulate high 64-bit data.
|
||||
// TODO: vget_high_s16 need extra cost on ARM64. This could be
|
||||
// replaced by vmlal_high_lane_s16. But for the interface of
|
||||
// vmlal_high_lane_s16, there is a bug in gcc 4.9.
|
||||
// This issue need to be tracked in the future.
|
||||
int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]);
|
||||
int16x4_t in16x4_3 = vget_high_s16(in16x8x4.val[1]);
|
||||
int16x4_t in16x4_5 = vget_high_s16(in16x8x4.val[2]);
|
||||
int16x4_t in16x4_7 = vget_high_s16(in16x8x4.val[3]);
|
||||
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 3);
|
||||
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_3, coeff16x4, 2);
|
||||
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_5, coeff16x4, 1);
|
||||
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_7, coeff16x4, 0);
|
||||
}
|
||||
|
||||
for (; j < (int)coefficients_length; j++) {
|
||||
int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
|
||||
int16x8x4_t in16x8x4 = vld4q_s16(&data_in[i - j]);
|
||||
|
||||
// Mul and accumulate low 64-bit data.
|
||||
int16x4_t in16x4_0 = vget_low_s16(in16x8x4.val[0]);
|
||||
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
|
||||
|
||||
// Mul and accumulate high 64-bit data.
|
||||
// TODO: vget_high_s16 need extra cost on ARM64. This could be
|
||||
// replaced by vmlal_high_lane_s16. But for the interface of
|
||||
// vmlal_high_lane_s16, there is a bug in gcc 4.9.
|
||||
// This issue need to be tracked in the future.
|
||||
int16x4_t in16x4_1 = vget_high_s16(in16x8x4.val[0]);
|
||||
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
|
||||
}
|
||||
|
||||
// Saturate and store the output.
|
||||
int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
|
||||
int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
|
||||
vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
|
||||
data_out += 8;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
for (i = delay; i < endpos1; i += factor * 8) {
|
||||
// Round value, 0.5 in Q12.
|
||||
int32x4_t out32x4_0 = vdupq_n_s32(2048);
|
||||
int32x4_t out32x4_1 = vdupq_n_s32(2048);
|
||||
|
||||
for (j = 0; j < (int)coefficients_length; j++) {
|
||||
int16x4_t coeff16x4 = vld1_dup_s16(&coefficients[j]);
|
||||
int16x4_t in16x4_0 = vld1_dup_s16(&data_in[i - j]);
|
||||
in16x4_0 = vld1_lane_s16(&data_in[i + factor - j], in16x4_0, 1);
|
||||
in16x4_0 = vld1_lane_s16(&data_in[i + factor * 2 - j], in16x4_0, 2);
|
||||
in16x4_0 = vld1_lane_s16(&data_in[i + factor * 3 - j], in16x4_0, 3);
|
||||
int16x4_t in16x4_1 = vld1_dup_s16(&data_in[i + factor * 4 - j]);
|
||||
in16x4_1 = vld1_lane_s16(&data_in[i + factor * 5 - j], in16x4_1, 1);
|
||||
in16x4_1 = vld1_lane_s16(&data_in[i + factor * 6 - j], in16x4_1, 2);
|
||||
in16x4_1 = vld1_lane_s16(&data_in[i + factor * 7 - j], in16x4_1, 3);
|
||||
|
||||
// Mul and accumulate.
|
||||
out32x4_0 = vmlal_lane_s16(out32x4_0, in16x4_0, coeff16x4, 0);
|
||||
out32x4_1 = vmlal_lane_s16(out32x4_1, in16x4_1, coeff16x4, 0);
|
||||
}
|
||||
|
||||
// Saturate and store the output.
|
||||
int16x4_t out16x4_0 = vqshrn_n_s32(out32x4_0, 12);
|
||||
int16x4_t out16x4_1 = vqshrn_n_s32(out32x4_1, 12);
|
||||
vst1q_s16(data_out, vcombine_s16(out16x4_0, out16x4_1));
|
||||
data_out += 8;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Second part, do the rest iterations (if any).
|
||||
for (; i < endpos; i += factor) {
|
||||
out_s32 = 2048; // Round value, 0.5 in Q12.
|
||||
|
||||
for (j = 0; j < (int)coefficients_length; j++) {
|
||||
out_s32 = WebRtc_MulAccumW16(coefficients[j], data_in[i - j], out_s32);
|
||||
}
|
||||
|
||||
// Saturate and store the output.
|
||||
out_s32 >>= 12;
|
||||
*data_out++ = WebRtcSpl_SatW32ToW16(out_s32);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
@ -0,0 +1,333 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "rtc_base/checks.h"
|
||||
#include "common_audio/signal_processing/include/signal_processing_library.h"
|
||||
|
||||
// Maximum absolute value of word16 vector. C version for generic platforms.
|
||||
int16_t WebRtcSpl_MaxAbsValueW16Neon(const int16_t* vector, size_t length) {
|
||||
int absolute = 0, maximum = 0;
|
||||
|
||||
RTC_DCHECK_GT(length, 0);
|
||||
|
||||
const int16_t* p_start = vector;
|
||||
size_t rest = length & 7;
|
||||
const int16_t* p_end = vector + length - rest;
|
||||
|
||||
int16x8_t v;
|
||||
uint16x8_t max_qv;
|
||||
max_qv = vdupq_n_u16(0);
|
||||
|
||||
while (p_start < p_end) {
|
||||
v = vld1q_s16(p_start);
|
||||
// Note vabs doesn't change the value of -32768.
|
||||
v = vabsq_s16(v);
|
||||
// Use u16 so we don't lose the value -32768.
|
||||
max_qv = vmaxq_u16(max_qv, vreinterpretq_u16_s16(v));
|
||||
p_start += 8;
|
||||
}
|
||||
|
||||
#ifdef WEBRTC_ARCH_ARM64
|
||||
maximum = (int)vmaxvq_u16(max_qv);
|
||||
#else
|
||||
uint16x4_t max_dv;
|
||||
max_dv = vmax_u16(vget_low_u16(max_qv), vget_high_u16(max_qv));
|
||||
max_dv = vpmax_u16(max_dv, max_dv);
|
||||
max_dv = vpmax_u16(max_dv, max_dv);
|
||||
|
||||
maximum = (int)vget_lane_u16(max_dv, 0);
|
||||
#endif
|
||||
|
||||
p_end = vector + length;
|
||||
while (p_start < p_end) {
|
||||
absolute = abs((int)(*p_start));
|
||||
|
||||
if (absolute > maximum) {
|
||||
maximum = absolute;
|
||||
}
|
||||
p_start++;
|
||||
}
|
||||
|
||||
// Guard the case for abs(-32768).
|
||||
if (maximum > WEBRTC_SPL_WORD16_MAX) {
|
||||
maximum = WEBRTC_SPL_WORD16_MAX;
|
||||
}
|
||||
|
||||
return (int16_t)maximum;
|
||||
}
|
||||
|
||||
// Maximum absolute value of word32 vector. NEON intrinsics version for
|
||||
// ARM 32-bit/64-bit platforms.
|
||||
int32_t WebRtcSpl_MaxAbsValueW32Neon(const int32_t* vector, size_t length) {
|
||||
// Use uint32_t for the local variables, to accommodate the return value
|
||||
// of abs(0x80000000), which is 0x80000000.
|
||||
|
||||
uint32_t absolute = 0, maximum = 0;
|
||||
size_t i = 0;
|
||||
size_t residual = length & 0x7;
|
||||
|
||||
RTC_DCHECK_GT(length, 0);
|
||||
|
||||
const int32_t* p_start = vector;
|
||||
uint32x4_t max32x4_0 = vdupq_n_u32(0);
|
||||
uint32x4_t max32x4_1 = vdupq_n_u32(0);
|
||||
|
||||
// First part, unroll the loop 8 times.
|
||||
for (i = 0; i < length - residual; i += 8) {
|
||||
int32x4_t in32x4_0 = vld1q_s32(p_start);
|
||||
p_start += 4;
|
||||
int32x4_t in32x4_1 = vld1q_s32(p_start);
|
||||
p_start += 4;
|
||||
in32x4_0 = vabsq_s32(in32x4_0);
|
||||
in32x4_1 = vabsq_s32(in32x4_1);
|
||||
// vabs doesn't change the value of 0x80000000.
|
||||
// Use u32 so we don't lose the value 0x80000000.
|
||||
max32x4_0 = vmaxq_u32(max32x4_0, vreinterpretq_u32_s32(in32x4_0));
|
||||
max32x4_1 = vmaxq_u32(max32x4_1, vreinterpretq_u32_s32(in32x4_1));
|
||||
}
|
||||
|
||||
uint32x4_t max32x4 = vmaxq_u32(max32x4_0, max32x4_1);
|
||||
#if defined(WEBRTC_ARCH_ARM64)
|
||||
maximum = vmaxvq_u32(max32x4);
|
||||
#else
|
||||
uint32x2_t max32x2 = vmax_u32(vget_low_u32(max32x4), vget_high_u32(max32x4));
|
||||
max32x2 = vpmax_u32(max32x2, max32x2);
|
||||
|
||||
maximum = vget_lane_u32(max32x2, 0);
|
||||
#endif
|
||||
|
||||
// Second part, do the remaining iterations (if any).
|
||||
for (i = residual; i > 0; i--) {
|
||||
absolute = abs((int)(*p_start));
|
||||
if (absolute > maximum) {
|
||||
maximum = absolute;
|
||||
}
|
||||
p_start++;
|
||||
}
|
||||
|
||||
// Guard against the case for 0x80000000.
|
||||
maximum = WEBRTC_SPL_MIN(maximum, WEBRTC_SPL_WORD32_MAX);
|
||||
|
||||
return (int32_t)maximum;
|
||||
}
|
||||
|
||||
// Maximum value of word16 vector. NEON intrinsics version for
|
||||
// ARM 32-bit/64-bit platforms.
|
||||
int16_t WebRtcSpl_MaxValueW16Neon(const int16_t* vector, size_t length) {
|
||||
int16_t maximum = WEBRTC_SPL_WORD16_MIN;
|
||||
size_t i = 0;
|
||||
size_t residual = length & 0x7;
|
||||
|
||||
RTC_DCHECK_GT(length, 0);
|
||||
|
||||
const int16_t* p_start = vector;
|
||||
int16x8_t max16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN);
|
||||
|
||||
// First part, unroll the loop 8 times.
|
||||
for (i = 0; i < length - residual; i += 8) {
|
||||
int16x8_t in16x8 = vld1q_s16(p_start);
|
||||
max16x8 = vmaxq_s16(max16x8, in16x8);
|
||||
p_start += 8;
|
||||
}
|
||||
|
||||
#if defined(WEBRTC_ARCH_ARM64)
|
||||
maximum = vmaxvq_s16(max16x8);
|
||||
#else
|
||||
int16x4_t max16x4 = vmax_s16(vget_low_s16(max16x8), vget_high_s16(max16x8));
|
||||
max16x4 = vpmax_s16(max16x4, max16x4);
|
||||
max16x4 = vpmax_s16(max16x4, max16x4);
|
||||
|
||||
maximum = vget_lane_s16(max16x4, 0);
|
||||
#endif
|
||||
|
||||
// Second part, do the remaining iterations (if any).
|
||||
for (i = residual; i > 0; i--) {
|
||||
if (*p_start > maximum)
|
||||
maximum = *p_start;
|
||||
p_start++;
|
||||
}
|
||||
return maximum;
|
||||
}
|
||||
|
||||
// Maximum value of word32 vector. NEON intrinsics version for
|
||||
// ARM 32-bit/64-bit platforms.
|
||||
int32_t WebRtcSpl_MaxValueW32Neon(const int32_t* vector, size_t length) {
|
||||
int32_t maximum = WEBRTC_SPL_WORD32_MIN;
|
||||
size_t i = 0;
|
||||
size_t residual = length & 0x7;
|
||||
|
||||
RTC_DCHECK_GT(length, 0);
|
||||
|
||||
const int32_t* p_start = vector;
|
||||
int32x4_t max32x4_0 = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
|
||||
int32x4_t max32x4_1 = vdupq_n_s32(WEBRTC_SPL_WORD32_MIN);
|
||||
|
||||
// First part, unroll the loop 8 times.
|
||||
for (i = 0; i < length - residual; i += 8) {
|
||||
int32x4_t in32x4_0 = vld1q_s32(p_start);
|
||||
p_start += 4;
|
||||
int32x4_t in32x4_1 = vld1q_s32(p_start);
|
||||
p_start += 4;
|
||||
max32x4_0 = vmaxq_s32(max32x4_0, in32x4_0);
|
||||
max32x4_1 = vmaxq_s32(max32x4_1, in32x4_1);
|
||||
}
|
||||
|
||||
int32x4_t max32x4 = vmaxq_s32(max32x4_0, max32x4_1);
|
||||
#if defined(WEBRTC_ARCH_ARM64)
|
||||
maximum = vmaxvq_s32(max32x4);
|
||||
#else
|
||||
int32x2_t max32x2 = vmax_s32(vget_low_s32(max32x4), vget_high_s32(max32x4));
|
||||
max32x2 = vpmax_s32(max32x2, max32x2);
|
||||
|
||||
maximum = vget_lane_s32(max32x2, 0);
|
||||
#endif
|
||||
|
||||
// Second part, do the remaining iterations (if any).
|
||||
for (i = residual; i > 0; i--) {
|
||||
if (*p_start > maximum)
|
||||
maximum = *p_start;
|
||||
p_start++;
|
||||
}
|
||||
return maximum;
|
||||
}
|
||||
|
||||
// Minimum value of word16 vector. NEON intrinsics version for
|
||||
// ARM 32-bit/64-bit platforms.
|
||||
int16_t WebRtcSpl_MinValueW16Neon(const int16_t* vector, size_t length) {
|
||||
int16_t minimum = WEBRTC_SPL_WORD16_MAX;
|
||||
size_t i = 0;
|
||||
size_t residual = length & 0x7;
|
||||
|
||||
RTC_DCHECK_GT(length, 0);
|
||||
|
||||
const int16_t* p_start = vector;
|
||||
int16x8_t min16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX);
|
||||
|
||||
// First part, unroll the loop 8 times.
|
||||
for (i = 0; i < length - residual; i += 8) {
|
||||
int16x8_t in16x8 = vld1q_s16(p_start);
|
||||
min16x8 = vminq_s16(min16x8, in16x8);
|
||||
p_start += 8;
|
||||
}
|
||||
|
||||
#if defined(WEBRTC_ARCH_ARM64)
|
||||
minimum = vminvq_s16(min16x8);
|
||||
#else
|
||||
int16x4_t min16x4 = vmin_s16(vget_low_s16(min16x8), vget_high_s16(min16x8));
|
||||
min16x4 = vpmin_s16(min16x4, min16x4);
|
||||
min16x4 = vpmin_s16(min16x4, min16x4);
|
||||
|
||||
minimum = vget_lane_s16(min16x4, 0);
|
||||
#endif
|
||||
|
||||
// Second part, do the remaining iterations (if any).
|
||||
for (i = residual; i > 0; i--) {
|
||||
if (*p_start < minimum)
|
||||
minimum = *p_start;
|
||||
p_start++;
|
||||
}
|
||||
return minimum;
|
||||
}
|
||||
|
||||
// Minimum value of word32 vector. NEON intrinsics version for
|
||||
// ARM 32-bit/64-bit platforms.
|
||||
int32_t WebRtcSpl_MinValueW32Neon(const int32_t* vector, size_t length) {
|
||||
int32_t minimum = WEBRTC_SPL_WORD32_MAX;
|
||||
size_t i = 0;
|
||||
size_t residual = length & 0x7;
|
||||
|
||||
RTC_DCHECK_GT(length, 0);
|
||||
|
||||
const int32_t* p_start = vector;
|
||||
int32x4_t min32x4_0 = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
|
||||
int32x4_t min32x4_1 = vdupq_n_s32(WEBRTC_SPL_WORD32_MAX);
|
||||
|
||||
// First part, unroll the loop 8 times.
|
||||
for (i = 0; i < length - residual; i += 8) {
|
||||
int32x4_t in32x4_0 = vld1q_s32(p_start);
|
||||
p_start += 4;
|
||||
int32x4_t in32x4_1 = vld1q_s32(p_start);
|
||||
p_start += 4;
|
||||
min32x4_0 = vminq_s32(min32x4_0, in32x4_0);
|
||||
min32x4_1 = vminq_s32(min32x4_1, in32x4_1);
|
||||
}
|
||||
|
||||
int32x4_t min32x4 = vminq_s32(min32x4_0, min32x4_1);
|
||||
#if defined(WEBRTC_ARCH_ARM64)
|
||||
minimum = vminvq_s32(min32x4);
|
||||
#else
|
||||
int32x2_t min32x2 = vmin_s32(vget_low_s32(min32x4), vget_high_s32(min32x4));
|
||||
min32x2 = vpmin_s32(min32x2, min32x2);
|
||||
|
||||
minimum = vget_lane_s32(min32x2, 0);
|
||||
#endif
|
||||
|
||||
// Second part, do the remaining iterations (if any).
|
||||
for (i = residual; i > 0; i--) {
|
||||
if (*p_start < minimum)
|
||||
minimum = *p_start;
|
||||
p_start++;
|
||||
}
|
||||
return minimum;
|
||||
}
|
||||
|
||||
// Finds both the minimum and maximum elements in an array of 16-bit integers.
|
||||
void WebRtcSpl_MinMaxW16Neon(const int16_t* vector, size_t length,
|
||||
int16_t* min_val, int16_t* max_val) {
|
||||
int16_t minimum = WEBRTC_SPL_WORD16_MAX;
|
||||
int16_t maximum = WEBRTC_SPL_WORD16_MIN;
|
||||
size_t i = 0;
|
||||
size_t residual = length & 0x7;
|
||||
|
||||
RTC_DCHECK_GT(length, 0);
|
||||
|
||||
const int16_t* p_start = vector;
|
||||
int16x8_t min16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MAX);
|
||||
int16x8_t max16x8 = vdupq_n_s16(WEBRTC_SPL_WORD16_MIN);
|
||||
|
||||
// First part, unroll the loop 8 times.
|
||||
for (i = 0; i < length - residual; i += 8) {
|
||||
int16x8_t in16x8 = vld1q_s16(p_start);
|
||||
min16x8 = vminq_s16(min16x8, in16x8);
|
||||
max16x8 = vmaxq_s16(max16x8, in16x8);
|
||||
p_start += 8;
|
||||
}
|
||||
|
||||
#if defined(WEBRTC_ARCH_ARM64)
|
||||
minimum = vminvq_s16(min16x8);
|
||||
maximum = vmaxvq_s16(max16x8);
|
||||
#else
|
||||
int16x4_t min16x4 = vmin_s16(vget_low_s16(min16x8), vget_high_s16(min16x8));
|
||||
min16x4 = vpmin_s16(min16x4, min16x4);
|
||||
min16x4 = vpmin_s16(min16x4, min16x4);
|
||||
|
||||
minimum = vget_lane_s16(min16x4, 0);
|
||||
|
||||
int16x4_t max16x4 = vmax_s16(vget_low_s16(max16x8), vget_high_s16(max16x8));
|
||||
max16x4 = vpmax_s16(max16x4, max16x4);
|
||||
max16x4 = vpmax_s16(max16x4, max16x4);
|
||||
|
||||
maximum = vget_lane_s16(max16x4, 0);
|
||||
#endif
|
||||
|
||||
// Second part, do the remaining iterations (if any).
|
||||
for (i = residual; i > 0; i--) {
|
||||
if (*p_start < minimum)
|
||||
minimum = *p_start;
|
||||
if (*p_start > maximum)
|
||||
maximum = *p_start;
|
||||
p_start++;
|
||||
}
|
||||
*min_val = minimum;
|
||||
*max_val = maximum;
|
||||
}
|
Reference in New Issue
Block a user