2024-09-05 09:59:28 +08:00

380 lines
14 KiB

/*
 * Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
#include "modules/audio_processing/aec3/residual_echo_estimator.h"
#include <stddef.h>
#include <algorithm>
#include <vector>
#include "api/array_view.h"
#include "modules/audio_processing/aec3/reverb_model.h"
#include "rtc_base/checks.h"
#include "system_wrappers/include/field_trial.h"
namespace webrtc {
namespace {
// Default value of the transparent-mode gain.
constexpr float kDefaultTransparentModeGain = 0.01f;

// Returns the transparent-mode gain. The value is currently fixed to
// `kDefaultTransparentModeGain`.
float GetTransparentModeGain() {
  return kDefaultTransparentModeGain;
}
// Returns the default-mode gain for early reflections.
//
// When the field trial "WebRTC-Aec3UseLowEarlyReflectionsDefaultGain" is
// enabled a fixed gain of 0.1 is returned; otherwise `config.default_gain` is
// used.
float GetEarlyReflectionsDefaultModeGain(
    const EchoCanceller3Config::EpStrength& config) {
  if (field_trial::IsEnabled("WebRTC-Aec3UseLowEarlyReflectionsDefaultGain")) {
    return 0.1f;
  }
  return config.default_gain;
}
// Returns the default-mode gain for late reflections.
//
// When the field trial "WebRTC-Aec3UseLowLateReflectionsDefaultGain" is
// enabled a fixed gain of 0.1 is returned; otherwise `config.default_gain` is
// used.
float GetLateReflectionsDefaultModeGain(
    const EchoCanceller3Config::EpStrength& config) {
  if (field_trial::IsEnabled("WebRTC-Aec3UseLowLateReflectionsDefaultGain")) {
    return 0.1f;
  }
  return config.default_gain;
}
// Decides whether ERLE onset compensation is used during dominant nearend.
// The visible part of the result is the configuration flag
// `erle_onset_compensation_in_dominant_nearend`, OR-ed with a second operand.
// NOTE(review): the return expression ends in a dangling `||`; the right-hand
// operand and the closing brace of this function are not present in this view
// and must be restored before the file can compile.
bool UseErleOnsetCompensationInDominantNearend(
const EchoCanceller3Config::EpStrength& config) {
return config.erle_onset_compensation_in_dominant_nearend ||
// Computes the indexes that will be used for computing spectral power over
// the blocks surrounding the delay.
//
// Writes to `*idx_start` and `*idx_stop` the spectrum-buffer indexes obtained
// by offsetting `spectrum_buffer.read` by `window_start` and by
// `window_end + 1`, respectively. Both pointers are dereferenced
// unconditionally, so they must be non-null.
// NOTE(review): the expressions assigning `window_start` and `window_end` are
// truncated in this view (each ends in a dangling operator), and the closing
// brace of the function is missing; `echo_model` is presumably used in the
// missing operands -- confirm against the upstream file.
void GetRenderIndexesToAnalyze(
const SpectrumBuffer& spectrum_buffer,
const EchoCanceller3Config::EchoModel& echo_model,
int filter_delay_blocks,
int* idx_start,
int* idx_stop) {
size_t window_start;
size_t window_end;
// Lower window edge, clamped at zero (remainder of the expression missing).
window_start =
std::max(0, filter_delay_blocks -
// Upper window edge (remainder of the expression missing).
window_end = filter_delay_blocks +
*idx_start = spectrum_buffer.OffsetIndex(spectrum_buffer.read, window_start);
// The stop index is one past the last window position.
*idx_stop = spectrum_buffer.OffsetIndex(spectrum_buffer.read, window_end + 1);
// Estimates the residual echo power based on the echo return loss enhancement
// (ERLE) and the linear power estimate.
//
// For every capture channel `ch` and frequency bin `k` this writes
//   R2[ch][k] = S2_linear[ch][k] / erle[ch][k].
// `S2_linear`, `erle` and `R2` must contain the same number of channels, and
// every ERLE value must be strictly positive; both conditions are verified by
// debug checks only.
void LinearEstimate(
    rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> S2_linear,
    rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> erle,
    rtc::ArrayView<std::array<float, kFftLengthBy2Plus1>> R2) {
  RTC_DCHECK_EQ(S2_linear.size(), erle.size());
  RTC_DCHECK_EQ(S2_linear.size(), R2.size());

  const size_t num_capture_channels = R2.size();
  for (size_t ch = 0; ch < num_capture_channels; ++ch) {
    for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
      RTC_DCHECK_LT(0.f, erle[ch][k]);
      R2[ch][k] = S2_linear[ch][k] / erle[ch][k];
    }
  }
}
// Estimates the residual echo power based on the estimate of the echo path
// gain.
//
// For every capture channel `ch` and frequency bin `k` this writes
//   R2[ch][k] = X2[k] * echo_path_gain,
// i.e. the same scaled render power is stored for all capture channels.
void NonLinearEstimate(
    float echo_path_gain,
    const std::array<float, kFftLengthBy2Plus1>& X2,
    rtc::ArrayView<std::array<float, kFftLengthBy2Plus1>> R2) {
  const size_t num_capture_channels = R2.size();
  for (size_t ch = 0; ch < num_capture_channels; ++ch) {
    for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
      R2[ch][k] = X2[k] * echo_path_gain;
    }
  }
}
// Applies a soft noise gate to the echo generating power.
//
// Bins whose power is below `config.noise_gate_power` are reduced in place by
// `config.noise_gate_slope` times their distance to that threshold, and are
// clamped so that they never become negative. Bins at or above the threshold
// are left untouched.
void ApplyNoiseGate(const EchoCanceller3Config::EchoModel& config,
                    rtc::ArrayView<float, kFftLengthBy2Plus1> X2) {
  for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
    if (config.noise_gate_power > X2[k]) {
      X2[k] = std::max(0.f, X2[k] - config.noise_gate_slope *
                                        (config.noise_gate_power - X2[k]));
    }
  }
}
// Estimates the echo generating signal power as gated maximal power over a
// time window.
//
// `X2` is first cleared and then, for every frequency bin, set to the largest
// render power found among the spectrum-buffer entries from `idx_start` up to
// (but excluding) `idx_stop`, as computed by GetRenderIndexesToAnalyze(). With
// more than one render channel, the per-channel powers of each entry are
// summed before the maximum is taken.
void EchoGeneratingPower(size_t num_render_channels,
                         const SpectrumBuffer& spectrum_buffer,
                         const EchoCanceller3Config::EchoModel& echo_model,
                         int filter_delay_blocks,
                         rtc::ArrayView<float, kFftLengthBy2Plus1> X2) {
  int idx_stop;
  int idx_start;
  GetRenderIndexesToAnalyze(spectrum_buffer, echo_model, filter_delay_blocks,
                            &idx_start, &idx_stop);

  std::fill(X2.begin(), X2.end(), 0.f);
  if (num_render_channels == 1) {
    for (int k = idx_start; k != idx_stop; k = spectrum_buffer.IncIndex(k)) {
      for (size_t j = 0; j < kFftLengthBy2Plus1; ++j) {
        X2[j] = std::max(X2[j], spectrum_buffer.buffer[k][/*channel=*/0][j]);
      }
    }
  } else {
    for (int k = idx_start; k != idx_stop; k = spectrum_buffer.IncIndex(k)) {
      // Value-initialized to zero: the channel sum below accumulates with
      // `+=`, so the array must not start out with indeterminate values.
      std::array<float, kFftLengthBy2Plus1> render_power{};
      for (size_t ch = 0; ch < num_render_channels; ++ch) {
        const auto& channel_power = spectrum_buffer.buffer[k][ch];
        for (size_t j = 0; j < kFftLengthBy2Plus1; ++j) {
          render_power[j] += channel_power[j];
        }
      }
      for (size_t j = 0; j < kFftLengthBy2Plus1; ++j) {
        X2[j] = std::max(X2[j], render_power[j]);
      }
    }
  }
}
} // namespace
// Constructs the estimator, storing a copy of/reference to `config` in
// `config_` (which of the two depends on the member's declared type, not
// visible here).
// NOTE(review): the member initializer list is truncated in this view. Only
// the `config_` initializer and the argument of what appears to be the
// `erle_onset_compensation_in_dominant_nearend_` initializer are visible;
// the remaining initializers, the constructor body and its closing brace are
// missing and must be restored from the upstream file.
ResidualEchoEstimator::ResidualEchoEstimator(const EchoCanceller3Config& config,
size_t num_render_channels)
: config_(config),
UseErleOnsetCompensationInDominantNearend(config_.ep_strength)) {
// Compiler-generated destructor, defined out of line.
ResidualEchoEstimator::~ResidualEchoEstimator() = default;
// Estimates the residual echo power for every capture channel and writes it
// to `R2` and `R2_unbounded`.
//
// Two paths are visible, selected by `aec_state.UsableLinearEstimate()`:
//  * Linear: `S2_linear` divided by the ERLE (LinearEstimate()).
//  * Non-linear: gated render power scaled by the echo path gain
//    (NonLinearEstimate()).
// In both paths, saturated echo short-circuits the estimate by copying the
// microphone spectrum `Y2` into both outputs. When stationarity properties
// are in use, both outputs are finally multiplied by a per-bin scaling.
//
// `R2`, `Y2` and `S2_linear` must have the same number of channels (debug
// checked). `R2_unbounded` is indexed with the same channel count.
//
// NOTE(review): this function is incomplete in this view -- closing braces
// and several statements are missing. Each gap is flagged inline below;
// restore them from the upstream file rather than guessing.
void ResidualEchoEstimator::Estimate(
const AecState& aec_state,
const RenderBuffer& render_buffer,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> S2_linear,
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> Y2,
bool dominant_nearend,
rtc::ArrayView<std::array<float, kFftLengthBy2Plus1>> R2,
rtc::ArrayView<std::array<float, kFftLengthBy2Plus1>> R2_unbounded) {
RTC_DCHECK_EQ(R2.size(), Y2.size());
RTC_DCHECK_EQ(R2.size(), S2_linear.size());
const size_t num_capture_channels = R2.size();
// Estimate the power of the stationary noise in the render signal.
// NOTE(review): no statement follows the comment above in this view; the
// call it describes (presumably UpdateRenderNoisePower) appears to be missing.
// Estimate the residual echo power.
if (aec_state.UsableLinearEstimate()) {
// When there is saturated echo, assume the same spectral content as is
// present in the microphone signal.
if (aec_state.SaturatedEcho()) {
for (size_t ch = 0; ch < num_capture_channels; ++ch) {
std::copy(Y2[ch].begin(), Y2[ch].end(), R2[ch].begin());
std::copy(Y2[ch].begin(), Y2[ch].end(), R2_unbounded[ch].begin());
} else {
// Onset-compensated ERLE is requested unless nearend is dominant and the
// compensation-in-dominant-nearend option is off.
const bool onset_compensated =
erle_onset_compensation_in_dominant_nearend_ || !dominant_nearend;
LinearEstimate(S2_linear, aec_state.Erle(onset_compensated), R2);
LinearEstimate(S2_linear, aec_state.ErleUnbounded(), R2_unbounded);
// NOTE(review): the UpdateReverb call below is truncated (argument list not
// closed); whatever followed it in the linear branch is not visible.
UpdateReverb(ReverbType::kLinear, aec_state, render_buffer,
} else {
// Non-linear path: the gain for early reflections is used.
const float echo_path_gain =
GetEchoPathGain(aec_state, /*gain_for_early_reflections=*/true);
// When there is saturated echo, assume the same spectral content as is
// present in the microphone signal.
if (aec_state.SaturatedEcho()) {
for (size_t ch = 0; ch < num_capture_channels; ++ch) {
std::copy(Y2[ch].begin(), Y2[ch].end(), R2[ch].begin());
std::copy(Y2[ch].begin(), Y2[ch].end(), R2_unbounded[ch].begin());
} else {
// Estimate the echo generating signal power.
std::array<float, kFftLengthBy2Plus1> X2;
// NOTE(review): the next two lines are the tail of a call whose first line
// is missing (presumably EchoGeneratingPower, judging by the arguments).
render_buffer.GetSpectrumBuffer(), config_.echo_model,
aec_state.MinDirectPathFilterDelay(), X2);
// The noise gate is applied only when stationarity properties are not used.
if (!aec_state.UseStationarityProperties()) {
ApplyNoiseGate(config_.echo_model, X2);
// Subtract the stationary noise power to avoid stationary noise causing
// excessive echo suppression.
for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
X2[k] -= config_.echo_model.stationary_gate_slope * X2_noise_floor_[k];
X2[k] = std::max(0.f, X2[k]);
// Both outputs receive the same non-linear estimate.
NonLinearEstimate(echo_path_gain, X2, R2);
NonLinearEstimate(echo_path_gain, X2, R2_unbounded);
// Reverb is modelled in non-linear mode only when configured and when
// transparent mode is not active.
if (config_.echo_model.model_reverb_in_nonlinear_mode &&
!aec_state.TransparentModeActive()) {
// NOTE(review): truncated call -- the remaining arguments are not visible.
UpdateReverb(ReverbType::kNonLinear, aec_state, render_buffer,
if (aec_state.UseStationarityProperties()) {
// Scale the echo according to echo audibility.
// NOTE(review): `residual_scaling` is read below without any visible write;
// the statement that fills it appears to be missing from this view.
std::array<float, kFftLengthBy2Plus1> residual_scaling;
for (size_t ch = 0; ch < num_capture_channels; ++ch) {
for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
R2[ch][k] *= residual_scaling[k];
R2_unbounded[ch][k] *= residual_scaling[k];
// Resets the estimator state.
// NOTE(review): only the signature is present in this view; the body and the
// closing brace are missing, so what exactly is reset cannot be determined
// from here.
void ResidualEchoEstimator::Reset() {
// Updates the per-bin render noise floor `X2_noise_floor_` from the current
// render power.
//
// With more than one render channel, the channel powers in `X2` are summed
// into `render_power_data`, which then becomes the render power. For each bin
// the floor drops immediately to the render power when the power is below the
// floor (and the hold counter is cleared); otherwise it is only raised once
// the counter has reached `config_.echo_model.noise_floor_hold`.
//
// NOTE(review): this function is incomplete in this view: the initializers of
// `X2` and `render_power`, the second argument of the std::max call, the body
// of the final else branch and all closing braces are missing. In addition,
// `render_power_data` is accumulated with `+=` without a visible
// initialization.
void ResidualEchoEstimator::UpdateRenderNoisePower(
const RenderBuffer& render_buffer) {
std::array<float, kFftLengthBy2Plus1> render_power_data;
// NOTE(review): initializer missing (declaration ends in `=`).
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> X2 =
// NOTE(review): initializer missing (declaration ends in `=`).
rtc::ArrayView<const float, kFftLengthBy2Plus1> render_power =
if (num_render_channels_ > 1) {
// Sum the power over all render channels.
for (size_t ch = 0; ch < num_render_channels_; ++ch) {
const auto& channel_power = X2[ch];
for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
render_power_data[k] += channel_power[k];
render_power = render_power_data;
// Estimate the stationary noise power in a minimum statistics manner.
for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
// Decrease rapidly.
if (render_power[k] < X2_noise_floor_[k]) {
X2_noise_floor_[k] = render_power[k];
X2_noise_floor_counter_[k] = 0;
} else {
// Increase in a delayed, leaky manner.
if (X2_noise_floor_counter_[k] >=
static_cast<int>(config_.echo_model.noise_floor_hold)) {
// NOTE(review): truncated -- the second std::max argument is not visible.
X2_noise_floor_[k] = std::max(X2_noise_floor_[k] * 1.1f,
} else {
// Updates the reverb estimation.
//
// The first reverb partition is one past the filter length (linear model) or
// one past the minimum direct-path filter delay (non-linear model). The
// render power passed to the reverb model is the sum over all render
// channels when there is more than one. The reverb decay is obtained from
// `aec_state.ReverbDecay()`, with the mild variant requested when
// `dominant_nearend` is true. For the non-linear model the update uses the
// echo path gain for late reflections and no frequency shaping.
//
// NOTE(review): this function is incomplete in this view: the initializers of
// `X2` and `render_power`, the head of the linear-model update call, the tail
// of the non-linear update call and all closing braces are missing.
// `first_reverb_partition` is therefore not visibly used, and
// `render_power_data` is accumulated with `+=` without a visible
// initialization.
void ResidualEchoEstimator::UpdateReverb(ReverbType reverb_type,
const AecState& aec_state,
const RenderBuffer& render_buffer,
bool dominant_nearend) {
// Choose reverb partition based on what type of echo power model is used.
const size_t first_reverb_partition =
reverb_type == ReverbType::kLinear
? aec_state.FilterLengthBlocks() + 1
: aec_state.MinDirectPathFilterDelay() + 1;
// Compute render power for the reverb.
std::array<float, kFftLengthBy2Plus1> render_power_data;
// NOTE(review): initializer missing (declaration ends in `=`).
rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> X2 =
// NOTE(review): initializer missing (declaration ends in `=`).
rtc::ArrayView<const float, kFftLengthBy2Plus1> render_power =
if (num_render_channels_ > 1) {
for (size_t ch = 0; ch < num_render_channels_; ++ch) {
const auto& channel_power = X2[ch];
for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
render_power_data[k] += channel_power[k];
render_power = render_power_data;
// Update the reverb estimate.
float reverb_decay = aec_state.ReverbDecay(/*mild=*/dominant_nearend);
if (reverb_type == ReverbType::kLinear) {
// NOTE(review): the line below is the tail of a call whose head is missing.
render_power, aec_state.GetReverbFrequencyResponse(), reverb_decay);
} else {
const float echo_path_gain =
GetEchoPathGain(aec_state, /*gain_for_early_reflections=*/false);
// NOTE(review): truncated call -- the remaining arguments are not visible.
echo_reverb_.UpdateReverbNoFreqShaping(render_power, echo_path_gain,
// Adds the estimated power of the reverb to the residual echo power.
//
// For every capture channel in `R2`, the per-bin `reverb_power` is added in
// place; the same reverb power is used for all channels.
// NOTE(review): the initializer of `reverb_power` and the closing braces are
// missing from this view; the value presumably comes from `echo_reverb_` --
// confirm against the upstream file.
void ResidualEchoEstimator::AddReverb(
rtc::ArrayView<std::array<float, kFftLengthBy2Plus1>> R2) const {
const size_t num_capture_channels = R2.size();
// Add the reverb power.
// NOTE(review): initializer missing (declaration ends in `=`).
rtc::ArrayView<const float, kFftLengthBy2Plus1> reverb_power =
for (size_t ch = 0; ch < num_capture_channels; ++ch) {
for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
R2[ch][k] += reverb_power[k];
// Chooses the echo path gain to use.
//
// An amplitude gain is selected from four members, depending on whether
// `aec_state` reports transparent mode as active and on whether the gain is
// requested for early (`gain_for_early_reflections == true`) or late
// reflections. The returned value is the square of that amplitude gain.
float ResidualEchoEstimator::GetEchoPathGain(
    const AecState& aec_state,
    bool gain_for_early_reflections) const {
  float gain_amplitude;
  if (aec_state.TransparentModeActive()) {
    gain_amplitude = gain_for_early_reflections
                         ? early_reflections_transparent_mode_gain_
                         : late_reflections_transparent_mode_gain_;
  } else {
    gain_amplitude = gain_for_early_reflections
                         ? early_reflections_general_gain_
                         : late_reflections_general_gain_;
  }
  return gain_amplitude * gain_amplitude;
}
} // namespace webrtc