add vad code.

This commit is contained in:
luocai
2024-09-06 18:26:45 +08:00
parent 35bf68338f
commit 2bed1dacf2
93 changed files with 12362 additions and 2 deletions

View File

@ -0,0 +1,219 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "common_audio/audio_converter.h"
#include <cstring>
#include <memory>
#include <utility>
#include <vector>
#include "common_audio/channel_buffer.h"
#include "common_audio/resampler/push_sinc_resampler.h"
#include "rtc_base/checks.h"
#include "rtc_base/numerics/safe_conversions.h"
namespace webrtc {
class CopyConverter : public AudioConverter {
public:
CopyConverter(size_t src_channels,
size_t src_frames,
size_t dst_channels,
size_t dst_frames)
: AudioConverter(src_channels, src_frames, dst_channels, dst_frames) {}
~CopyConverter() override {}
void Convert(const float* const* src,
size_t src_size,
float* const* dst,
size_t dst_capacity) override {
CheckSizes(src_size, dst_capacity);
if (src != dst) {
for (size_t i = 0; i < src_channels(); ++i)
std::memcpy(dst[i], src[i], dst_frames() * sizeof(*dst[i]));
}
}
};
class UpmixConverter : public AudioConverter {
public:
UpmixConverter(size_t src_channels,
size_t src_frames,
size_t dst_channels,
size_t dst_frames)
: AudioConverter(src_channels, src_frames, dst_channels, dst_frames) {}
~UpmixConverter() override {}
void Convert(const float* const* src,
size_t src_size,
float* const* dst,
size_t dst_capacity) override {
CheckSizes(src_size, dst_capacity);
for (size_t i = 0; i < dst_frames(); ++i) {
const float value = src[0][i];
for (size_t j = 0; j < dst_channels(); ++j)
dst[j][i] = value;
}
}
};
class DownmixConverter : public AudioConverter {
public:
DownmixConverter(size_t src_channels,
size_t src_frames,
size_t dst_channels,
size_t dst_frames)
: AudioConverter(src_channels, src_frames, dst_channels, dst_frames) {}
~DownmixConverter() override {}
void Convert(const float* const* src,
size_t src_size,
float* const* dst,
size_t dst_capacity) override {
CheckSizes(src_size, dst_capacity);
float* dst_mono = dst[0];
for (size_t i = 0; i < src_frames(); ++i) {
float sum = 0;
for (size_t j = 0; j < src_channels(); ++j)
sum += src[j][i];
dst_mono[i] = sum / src_channels();
}
}
};
class ResampleConverter : public AudioConverter {
public:
ResampleConverter(size_t src_channels,
size_t src_frames,
size_t dst_channels,
size_t dst_frames)
: AudioConverter(src_channels, src_frames, dst_channels, dst_frames) {
resamplers_.reserve(src_channels);
for (size_t i = 0; i < src_channels; ++i)
resamplers_.push_back(std::unique_ptr<PushSincResampler>(
new PushSincResampler(src_frames, dst_frames)));
}
~ResampleConverter() override {}
void Convert(const float* const* src,
size_t src_size,
float* const* dst,
size_t dst_capacity) override {
CheckSizes(src_size, dst_capacity);
for (size_t i = 0; i < resamplers_.size(); ++i)
resamplers_[i]->Resample(src[i], src_frames(), dst[i], dst_frames());
}
private:
std::vector<std::unique_ptr<PushSincResampler>> resamplers_;
};
// Apply a vector of converters in serial, in the order given. At least two
// converters must be provided.
class CompositionConverter : public AudioConverter {
public:
explicit CompositionConverter(
std::vector<std::unique_ptr<AudioConverter>> converters)
: converters_(std::move(converters)) {
RTC_CHECK_GE(converters_.size(), 2);
// We need an intermediate buffer after every converter.
for (auto it = converters_.begin(); it != converters_.end() - 1; ++it)
buffers_.push_back(
std::unique_ptr<ChannelBuffer<float>>(new ChannelBuffer<float>(
(*it)->dst_frames(), (*it)->dst_channels())));
}
~CompositionConverter() override {}
void Convert(const float* const* src,
size_t src_size,
float* const* dst,
size_t dst_capacity) override {
converters_.front()->Convert(src, src_size, buffers_.front()->channels(),
buffers_.front()->size());
for (size_t i = 2; i < converters_.size(); ++i) {
auto& src_buffer = buffers_[i - 2];
auto& dst_buffer = buffers_[i - 1];
converters_[i]->Convert(src_buffer->channels(), src_buffer->size(),
dst_buffer->channels(), dst_buffer->size());
}
converters_.back()->Convert(buffers_.back()->channels(),
buffers_.back()->size(), dst, dst_capacity);
}
private:
std::vector<std::unique_ptr<AudioConverter>> converters_;
std::vector<std::unique_ptr<ChannelBuffer<float>>> buffers_;
};
std::unique_ptr<AudioConverter> AudioConverter::Create(size_t src_channels,
size_t src_frames,
size_t dst_channels,
size_t dst_frames) {
std::unique_ptr<AudioConverter> sp;
if (src_channels > dst_channels) {
if (src_frames != dst_frames) {
std::vector<std::unique_ptr<AudioConverter>> converters;
converters.push_back(std::unique_ptr<AudioConverter>(new DownmixConverter(
src_channels, src_frames, dst_channels, src_frames)));
converters.push_back(
std::unique_ptr<AudioConverter>(new ResampleConverter(
dst_channels, src_frames, dst_channels, dst_frames)));
sp.reset(new CompositionConverter(std::move(converters)));
} else {
sp.reset(new DownmixConverter(src_channels, src_frames, dst_channels,
dst_frames));
}
} else if (src_channels < dst_channels) {
if (src_frames != dst_frames) {
std::vector<std::unique_ptr<AudioConverter>> converters;
converters.push_back(
std::unique_ptr<AudioConverter>(new ResampleConverter(
src_channels, src_frames, src_channels, dst_frames)));
converters.push_back(std::unique_ptr<AudioConverter>(new UpmixConverter(
src_channels, dst_frames, dst_channels, dst_frames)));
sp.reset(new CompositionConverter(std::move(converters)));
} else {
sp.reset(new UpmixConverter(src_channels, src_frames, dst_channels,
dst_frames));
}
} else if (src_frames != dst_frames) {
sp.reset(new ResampleConverter(src_channels, src_frames, dst_channels,
dst_frames));
} else {
sp.reset(
new CopyConverter(src_channels, src_frames, dst_channels, dst_frames));
}
return sp;
}
// For CompositionConverter.
AudioConverter::AudioConverter()
: src_channels_(0), src_frames_(0), dst_channels_(0), dst_frames_(0) {}
AudioConverter::AudioConverter(size_t src_channels,
size_t src_frames,
size_t dst_channels,
size_t dst_frames)
: src_channels_(src_channels),
src_frames_(src_frames),
dst_channels_(dst_channels),
dst_frames_(dst_frames) {
RTC_CHECK(dst_channels == src_channels || dst_channels == 1 ||
src_channels == 1);
}
// Validates the sizes passed to Convert() against the configured format:
// `src_size` must equal src_channels() * src_frames() exactly, and
// `dst_capacity` must be at least dst_channels() * dst_frames(). Either
// violation trips an RTC_CHECK.
void AudioConverter::CheckSizes(size_t src_size, size_t dst_capacity) const {
RTC_CHECK_EQ(src_size, src_channels() * src_frames());
RTC_CHECK_GE(dst_capacity, dst_channels() * dst_frames());
}
} // namespace webrtc

View File

@ -0,0 +1,72 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef COMMON_AUDIO_AUDIO_CONVERTER_H_
#define COMMON_AUDIO_AUDIO_CONVERTER_H_
#include <stddef.h>
#include <memory>
namespace webrtc {
// Format conversion (remixing and resampling) for audio. Only simple remixing
// conversions are supported: downmix to mono (i.e. `dst_channels` == 1) or
// upmix from mono (i.e. `src_channels` == 1).
//
// The source and destination chunks have the same duration in time; specifying
// the number of frames is equivalent to specifying the sample rates.
class AudioConverter {
public:
// Returns a new AudioConverter, which will use the supplied format for its
// lifetime. Ownership is transferred to the caller through the returned
// std::unique_ptr.
static std::unique_ptr<AudioConverter> Create(size_t src_channels,
size_t src_frames,
size_t dst_channels,
size_t dst_frames);
virtual ~AudioConverter() {}
// Not copyable.
AudioConverter(const AudioConverter&) = delete;
AudioConverter& operator=(const AudioConverter&) = delete;
// Convert `src`, containing `src_size` samples, to `dst`, having a sample
// capacity of `dst_capacity`. Both point to a series of buffers containing
// the samples for each channel. The sizes must correspond to the format
// passed to Create().
virtual void Convert(const float* const* src,
size_t src_size,
float* const* dst,
size_t dst_capacity) = 0;
// Accessors for the format this converter was constructed with.
size_t src_channels() const { return src_channels_; }
size_t src_frames() const { return src_frames_; }
size_t dst_channels() const { return dst_channels_; }
size_t dst_frames() const { return dst_frames_; }
protected:
// Default constructor; leaves all four format fields at zero.
AudioConverter();
// Stores the given format. The channel counts must be equal, or one of them
// must be 1.
AudioConverter(size_t src_channels,
size_t src_frames,
size_t dst_channels,
size_t dst_frames);
// Helper to RTC_CHECK that inputs are correctly sized: `src_size` must equal
// src_channels() * src_frames() and `dst_capacity` must be at least
// dst_channels() * dst_frames().
void CheckSizes(size_t src_size, size_t dst_capacity) const;
private:
// Conversion format, fixed at construction.
const size_t src_channels_;
const size_t src_frames_;
const size_t dst_channels_;
const size_t dst_frames_;
};
} // namespace webrtc
#endif // COMMON_AUDIO_AUDIO_CONVERTER_H_

View File

@ -0,0 +1,58 @@
/*
* Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef COMMON_AUDIO_RESAMPLER_INCLUDE_PUSH_RESAMPLER_H_
#define COMMON_AUDIO_RESAMPLER_INCLUDE_PUSH_RESAMPLER_H_
#include <memory>
#include <vector>
#include "api/audio/audio_view.h"
namespace webrtc {
// Forward declaration; only pointers to PushSincResampler are stored below.
class PushSincResampler;
// Wraps PushSincResampler to provide stereo support.
// Note: This implementation assumes 10ms buffer sizes throughout.
//
// `T` is the sample type of the audio views passed to Resample().
template <typename T>
class PushResampler final {
public:
PushResampler();
// NOTE(review): presumably equivalent to default construction followed by
// the internal EnsureInitialized() with the same arguments — confirm against
// the implementation.
PushResampler(size_t src_samples_per_channel,
size_t dst_samples_per_channel,
size_t num_channels);
~PushResampler();
// Resamples interleaved audio from `src` into `dst`.
// Returns the total number of samples provided in destination (e.g. 32 kHz,
// 2 channel audio gives 640 samples).
int Resample(InterleavedView<const T> src, InterleavedView<T> dst);
// For when a deinterleaved/mono channel already exists and we can skip the
// deinterleaving operation.
int Resample(MonoView<const T> src, MonoView<T> dst);
private:
// Ensures that source and destination buffers for deinterleaving are
// correctly configured prior to resampling that requires deinterleaving.
void EnsureInitialized(size_t src_samples_per_channel,
size_t dst_samples_per_channel,
size_t num_channels);
// Buffers used for when a deinterleaving step is necessary.
std::unique_ptr<T[]> source_;
std::unique_ptr<T[]> destination_;
// Views used together with the buffers above.
// NOTE(review): presumably these view `source_` / `destination_` — confirm
// against the implementation.
DeinterleavedView<T> source_view_;
DeinterleavedView<T> destination_view_;
// Underlying resamplers.
// NOTE(review): presumably one per channel — confirm against the
// implementation.
std::vector<std::unique_ptr<PushSincResampler>> resamplers_;
};
} // namespace webrtc
#endif // COMMON_AUDIO_RESAMPLER_INCLUDE_PUSH_RESAMPLER_H_

View File

@ -0,0 +1,99 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* A wrapper for resampling between numerous combinations of sampling rates.
*/
#ifndef COMMON_AUDIO_RESAMPLER_INCLUDE_RESAMPLER_H_
#define COMMON_AUDIO_RESAMPLER_INCLUDE_RESAMPLER_H_
#include <stddef.h>
#include <stdint.h>
namespace webrtc {
// Resampler for 16-bit integer audio between a fixed set of rate ratios.
// All methods return 0 on success and -1 on failure.
class Resampler {
public:
Resampler();
// NOTE(review): presumably equivalent to default construction followed by
// Reset(inFreq, outFreq, num_channels) — confirm against the implementation.
Resampler(int inFreq, int outFreq, size_t num_channels);
~Resampler();
// Reset all states
int Reset(int inFreq, int outFreq, size_t num_channels);
// Reset all states if any parameter has changed
int ResetIfNeeded(int inFreq, int outFreq, size_t num_channels);
// Resample samplesIn to samplesOut.
//
// - samplesIn  : input samples.
// - lengthIn   : number of input samples.
// - samplesOut : output buffer.
// - maxLen     : capacity of `samplesOut`, in samples.
// - outLen     : set to the number of samples written to `samplesOut`.
// NOTE(review): the parameter meanings above are inferred from their names —
// confirm against the implementation.
int Push(const int16_t* samplesIn,
size_t lengthIn,
int16_t* samplesOut,
size_t maxLen,
size_t& outLen); // NOLINT: to avoid changing APIs
private:
// Supported conversion modes.
// NOTE(review): kResamplerModeXToY presumably denotes an input:output rate
// ratio of X:Y — confirm against ComputeResamplerMode().
enum ResamplerMode {
kResamplerMode1To1,
kResamplerMode1To2,
kResamplerMode1To3,
kResamplerMode1To4,
kResamplerMode1To6,
kResamplerMode1To12,
kResamplerMode2To3,
kResamplerMode2To11,
kResamplerMode4To11,
kResamplerMode8To11,
kResamplerMode11To16,
kResamplerMode11To32,
kResamplerMode2To1,
kResamplerMode3To1,
kResamplerMode4To1,
kResamplerMode6To1,
kResamplerMode12To1,
kResamplerMode3To2,
kResamplerMode11To2,
kResamplerMode11To4,
kResamplerMode11To8
};
// Computes the resampler mode for a given sampling frequency pair.
// Returns -1 for unsupported frequency pairs.
// The selected mode is written to `mode`.
static int ComputeResamplerMode(int in_freq_hz,
int out_freq_hz,
ResamplerMode* mode);
// Generic pointers since we don't know what states we'll need
void* state1_;
void* state2_;
void* state3_;
// Storage if needed
int16_t* in_buffer_;
int16_t* out_buffer_;
size_t in_buffer_size_;
size_t out_buffer_size_;
size_t in_buffer_size_max_;
size_t out_buffer_size_max_;
// Currently configured rates and mode.
// NOTE(review): the `_khz_` suffix suggests kHz while the public API takes
// `inFreq` / `outFreq` with no stated unit — confirm the unit conversion in
// the implementation.
int my_in_frequency_khz_;
int my_out_frequency_khz_;
ResamplerMode my_mode_;
size_t num_channels_;
// Extra instance for stereo
Resampler* helper_left_;
Resampler* helper_right_;
};
} // namespace webrtc
#endif // COMMON_AUDIO_RESAMPLER_INCLUDE_RESAMPLER_H_

View File

@ -0,0 +1,87 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* This header file includes the VAD API calls. Specific function calls are
* given below.
*/
#ifndef COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_ // NOLINT
#define COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_
#include <stddef.h>
#include <stdint.h>
// Opaque handle type for a VAD instance; the struct is not defined in this
// header, so callers only ever hold a pointer to it.
typedef struct WebRtcVadInst VadInst;
#ifdef __cplusplus
extern "C" {
#endif
// Creates a VAD instance. The instance must be initialized with
// WebRtcVad_Init() before it is passed to WebRtcVad_Process(), and released
// with WebRtcVad_Free().
//
// returns            : Pointer to the new VAD instance.
// NOTE(review): presumably NULL when allocation fails — confirm against the
// implementation.
VadInst* WebRtcVad_Create(void);
// Frees the dynamic memory of a specified VAD instance.
//
// - handle [i]       : Pointer to VAD instance that should be freed.
void WebRtcVad_Free(VadInst* handle);
// Initializes a VAD instance.
//
// - handle [i/o]     : Instance that should be initialized.
//
// returns            :  0 - (OK),
//                      -1 - (null pointer or Default mode could not be set).
int WebRtcVad_Init(VadInst* handle);
// Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
// restrictive in reporting speech. In other words, the probability that a
// frame really is speech when the VAD returns 1 increases with increasing
// mode. As a consequence, the missed detection rate also goes up.
//
// - handle [i/o]     : VAD instance.
// - mode   [i]       : Aggressiveness mode (0, 1, 2, or 3).
//
// returns            :  0 - (OK),
//                      -1 - (null pointer, mode could not be set or the VAD
//                            instance has not been initialized).
int WebRtcVad_set_mode(VadInst* handle, int mode);
// Calculates a VAD decision for the `audio_frame`. For valid sampling rates
// and frame lengths, see the description of
// WebRtcVad_ValidRateAndFrameLength().
//
// - handle       [i/o] : VAD Instance. Needs to be initialized by
//                        WebRtcVad_Init() before call.
// - fs           [i]   : Sampling frequency (Hz): 8000, 16000, or 32000
// - audio_frame  [i]   : Audio frame buffer.
// - frame_length [i]   : Length of audio frame buffer in number of samples.
//
// returns              :  1 - (Active Voice),
//                         0 - (Non-active Voice),
//                        -1 - (Error)
int WebRtcVad_Process(VadInst* handle,
int fs,
const int16_t* audio_frame,
size_t frame_length);
// Checks for valid combinations of `rate` and `frame_length`. We support 10,
// 20 and 30 ms frames and the rates 8000, 16000 and 32000 Hz.
//
// - rate         [i] : Sampling frequency (Hz).
// - frame_length [i] : Speech frame buffer length in number of samples.
//
// returns            :  0 - (valid combination),
//                      -1 - (invalid combination)
int WebRtcVad_ValidRateAndFrameLength(int rate, size_t frame_length);
#ifdef __cplusplus
}
#endif
#endif // COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_ // NOLINT