add vad code.

2024-09-06 18:26:45 +08:00
parent 35bf68338f
commit 2bed1dacf2
93 changed files with 12362 additions and 2 deletions
--- a/VocieProcess/common_audio/audio_converter.cc
+++ b/VocieProcess/common_audio/audio_converter.cc
@ -0,0 +1,219 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "common_audio/audio_converter.h"
+
+#include <cstring>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "common_audio/channel_buffer.h"
+#include "common_audio/resampler/push_sinc_resampler.h"
+#include "rtc_base/checks.h"
+#include "rtc_base/numerics/safe_conversions.h"
+
+namespace webrtc {
+
+// Pass-through converter: copies each source channel verbatim into the
+// destination channel with the same index. AudioConverter::Create() picks it
+// when neither the channel count nor the frame count differs.
+class CopyConverter : public AudioConverter {
+ public:
+  CopyConverter(size_t src_channels,
+                size_t src_frames,
+                size_t dst_channels,
+                size_t dst_frames)
+      : AudioConverter(src_channels, src_frames, dst_channels, dst_frames) {}
+  ~CopyConverter() override = default;
+
+  // Copies dst_frames() samples for each of the src_channels() channels.
+  // Does nothing beyond the size check when `src` and `dst` are the same
+  // pointer array.
+  void Convert(const float* const* src,
+               size_t src_size,
+               float* const* dst,
+               size_t dst_capacity) override {
+    CheckSizes(src_size, dst_capacity);
+    if (src == dst) {
+      return;
+    }
+    const size_t bytes_per_channel = dst_frames() * sizeof(float);
+    for (size_t ch = 0; ch < src_channels(); ++ch) {
+      std::memcpy(dst[ch], src[ch], bytes_per_channel);
+    }
+  }
+};
+
+// Upmixes by replicating the first source channel into every destination
+// channel.
+class UpmixConverter : public AudioConverter {
+ public:
+  UpmixConverter(size_t src_channels,
+                 size_t src_frames,
+                 size_t dst_channels,
+                 size_t dst_frames)
+      : AudioConverter(src_channels, src_frames, dst_channels, dst_frames) {}
+  ~UpmixConverter() override = default;
+
+  // For each of the dst_frames() frames, writes the sample from src[0] into
+  // all dst_channels() destination channels.
+  void Convert(const float* const* src,
+               size_t src_size,
+               float* const* dst,
+               size_t dst_capacity) override {
+    CheckSizes(src_size, dst_capacity);
+    const size_t num_frames = dst_frames();
+    const size_t num_channels = dst_channels();
+    for (size_t frame = 0; frame < num_frames; ++frame) {
+      // Read the sample before writing so that in-place use stays correct.
+      const float sample = src[0][frame];
+      for (size_t ch = 0; ch < num_channels; ++ch) {
+        dst[ch][frame] = sample;
+      }
+    }
+  }
+};
+
+// Downmixes to a single channel by averaging all source channels frame by
+// frame; only dst[0] is written.
+class DownmixConverter : public AudioConverter {
+ public:
+  DownmixConverter(size_t src_channels,
+                   size_t src_frames,
+                   size_t dst_channels,
+                   size_t dst_frames)
+      : AudioConverter(src_channels, src_frames, dst_channels, dst_frames) {}
+  ~DownmixConverter() override = default;
+
+  // Writes src_frames() samples to dst[0], each being the arithmetic mean of
+  // that frame across the src_channels() source channels.
+  void Convert(const float* const* src,
+               size_t src_size,
+               float* const* dst,
+               size_t dst_capacity) override {
+    CheckSizes(src_size, dst_capacity);
+    float* const mono_out = dst[0];
+    const size_t num_channels = src_channels();
+    const size_t num_frames = src_frames();
+    for (size_t frame = 0; frame < num_frames; ++frame) {
+      float accum = 0;
+      for (size_t ch = 0; ch < num_channels; ++ch) {
+        accum += src[ch][frame];
+      }
+      mono_out[frame] = accum / num_channels;
+    }
+  }
+};
+
+// Changes the number of frames per channel. Channel i of `src` is resampled
+// into channel i of `dst` by a PushSincResampler dedicated to that channel.
+class ResampleConverter : public AudioConverter {
+ public:
+  ResampleConverter(size_t src_channels,
+                    size_t src_frames,
+                    size_t dst_channels,
+                    size_t dst_frames)
+      : AudioConverter(src_channels, src_frames, dst_channels, dst_frames) {
+    // One resampler per source channel, allocated up front.
+    resamplers_.reserve(src_channels);
+    for (size_t ch = 0; ch < src_channels; ++ch) {
+      std::unique_ptr<PushSincResampler> resampler(
+          new PushSincResampler(src_frames, dst_frames));
+      resamplers_.push_back(std::move(resampler));
+    }
+  }
+  ~ResampleConverter() override = default;
+
+  // Pushes src_frames() samples of every channel through its resampler,
+  // asking for dst_frames() samples of output per channel.
+  void Convert(const float* const* src,
+               size_t src_size,
+               float* const* dst,
+               size_t dst_capacity) override {
+    CheckSizes(src_size, dst_capacity);
+    size_t ch = 0;
+    for (const auto& resampler : resamplers_) {
+      resampler->Resample(src[ch], src_frames(), dst[ch], dst_frames());
+      ++ch;
+    }
+  }
+
+ private:
+  // resamplers_[i] handles channel i.
+  std::vector<std::unique_ptr<PushSincResampler>> resamplers_;
+};
+
+// Applies a vector of converters in serial, in the order given. At least two
+// converters must be provided. The output of every stage except the last is
+// staged in an intermediate ChannelBuffer that is then fed to the next stage.
+//
+// The object is built with the default AudioConverter constructor, so its own
+// src/dst channel and frame accessors all report 0; Convert() here does not
+// call CheckSizes() and leaves size validation to the individual stages.
+class CompositionConverter : public AudioConverter {
+ public:
+  // Takes ownership of `converters`. RTC_CHECKs that at least two were given.
+  explicit CompositionConverter(
+      std::vector<std::unique_ptr<AudioConverter>> converters)
+      : converters_(std::move(converters)) {
+    RTC_CHECK_GE(converters_.size(), 2);
+    // We need an intermediate buffer after every converter except the last;
+    // buffers_[i] is sized for the output format of converters_[i].
+    buffers_.reserve(converters_.size() - 1);
+    for (auto it = converters_.begin(); it != converters_.end() - 1; ++it)
+      buffers_.push_back(
+          std::unique_ptr<ChannelBuffer<float>>(new ChannelBuffer<float>(
+              (*it)->dst_frames(), (*it)->dst_channels())));
+  }
+  ~CompositionConverter() override {}
+
+  // Runs the chain: src -> buffers_[0] -> ... -> buffers_[n-2] -> dst, where
+  // n is the number of converters.
+  void Convert(const float* const* src,
+               size_t src_size,
+               float* const* dst,
+               size_t dst_capacity) override {
+    // First stage: caller's source into the first intermediate buffer.
+    converters_.front()->Convert(src, src_size, buffers_.front()->channels(),
+                                 buffers_.front()->size());
+    // Middle stages 1..n-2: converters_[i] reads the buffer written by
+    // converters_[i - 1] and writes its own buffer. (The previous indexing
+    // started at 2 and used converters_[i] with buffers_[i - 2], which skipped
+    // converters_[1] and ran the last converter twice for chains of three or
+    // more stages.) With exactly two converters this loop does not execute.
+    for (size_t i = 1; i + 1 < converters_.size(); ++i) {
+      auto& src_buffer = buffers_[i - 1];
+      auto& dst_buffer = buffers_[i];
+      converters_[i]->Convert(src_buffer->channels(), src_buffer->size(),
+                              dst_buffer->channels(), dst_buffer->size());
+    }
+    // Last stage: final intermediate buffer into the caller's destination.
+    converters_.back()->Convert(buffers_.back()->channels(),
+                                buffers_.back()->size(), dst, dst_capacity);
+  }
+
+ private:
+  std::vector<std::unique_ptr<AudioConverter>> converters_;
+  // buffers_.size() == converters_.size() - 1.
+  std::vector<std::unique_ptr<ChannelBuffer<float>>> buffers_;
+};
+
+// Factory: chooses the converter (or two-stage chain of converters) that maps
+// the source format onto the destination format.
+//
+//   fewer dst channels -> downmix, followed by a resample if frames differ
+//   more dst channels  -> resample first if frames differ, then upmix
+//   same channel count -> resample if frames differ, otherwise plain copy
+std::unique_ptr<AudioConverter> AudioConverter::Create(size_t src_channels,
+                                                       size_t src_frames,
+                                                       size_t dst_channels,
+                                                       size_t dst_frames) {
+  using ConverterPtr = std::unique_ptr<AudioConverter>;
+  const bool needs_resampling = src_frames != dst_frames;
+
+  if (src_channels > dst_channels) {
+    if (!needs_resampling) {
+      return ConverterPtr(new DownmixConverter(src_channels, src_frames,
+                                               dst_channels, dst_frames));
+    }
+    // Downmix at the source frame count, then resample the dst_channels
+    // channels that remain.
+    std::vector<ConverterPtr> stages;
+    stages.push_back(ConverterPtr(new DownmixConverter(
+        src_channels, src_frames, dst_channels, src_frames)));
+    stages.push_back(ConverterPtr(new ResampleConverter(
+        dst_channels, src_frames, dst_channels, dst_frames)));
+    return ConverterPtr(new CompositionConverter(std::move(stages)));
+  }
+
+  if (src_channels < dst_channels) {
+    if (!needs_resampling) {
+      return ConverterPtr(new UpmixConverter(src_channels, src_frames,
+                                             dst_channels, dst_frames));
+    }
+    // Resample the src_channels source channels first, then upmix at the
+    // destination frame count.
+    std::vector<ConverterPtr> stages;
+    stages.push_back(ConverterPtr(new ResampleConverter(
+        src_channels, src_frames, src_channels, dst_frames)));
+    stages.push_back(ConverterPtr(new UpmixConverter(
+        src_channels, dst_frames, dst_channels, dst_frames)));
+    return ConverterPtr(new CompositionConverter(std::move(stages)));
+  }
+
+  // Channel counts match.
+  if (needs_resampling) {
+    return ConverterPtr(new ResampleConverter(src_channels, src_frames,
+                                              dst_channels, dst_frames));
+  }
+  return ConverterPtr(
+      new CopyConverter(src_channels, src_frames, dst_channels, dst_frames));
+}
+
+// For CompositionConverter, which has no single format of its own: all four
+// format fields are set to 0.
+AudioConverter::AudioConverter()
+    : src_channels_(0), src_frames_(0), dst_channels_(0), dst_frames_(0) {}
+
+// Records the source and destination format for this converter.
+// RTC_CHECKs that the conversion is one of the supported remixes: equal
+// channel counts, a mono destination, or a mono source.
+AudioConverter::AudioConverter(size_t src_channels,
+                               size_t src_frames,
+                               size_t dst_channels,
+                               size_t dst_frames)
+    : src_channels_(src_channels),
+      src_frames_(src_frames),
+      dst_channels_(dst_channels),
+      dst_frames_(dst_frames) {
+  RTC_CHECK(dst_channels == src_channels || dst_channels == 1 ||
+            src_channels == 1);
+}
+
+// Validates the sample counts passed to Convert() against the stored format:
+// `src_size` must equal src_channels() * src_frames() exactly, while
+// `dst_capacity` only has to be at least dst_channels() * dst_frames().
+void AudioConverter::CheckSizes(size_t src_size, size_t dst_capacity) const {
+  RTC_CHECK_EQ(src_size, src_channels() * src_frames());
+  RTC_CHECK_GE(dst_capacity, dst_channels() * dst_frames());
+}
+
+}  // namespace webrtc
--- a/VocieProcess/common_audio/audio_converter.h
+++ b/VocieProcess/common_audio/audio_converter.h
@ -0,0 +1,72 @@
+/*
+ *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef COMMON_AUDIO_AUDIO_CONVERTER_H_
+#define COMMON_AUDIO_AUDIO_CONVERTER_H_
+
+#include <stddef.h>
+
+#include <memory>
+
+namespace webrtc {
+
+// Format conversion (remixing and resampling) for audio. Only simple remixing
+// conversions are supported: downmix to mono (i.e. `dst_channels` == 1) or
+// upmix from mono (i.e. `src_channels` == 1).
+//
+// The source and destination chunks have the same duration in time; specifying
+// the number of frames is equivalent to specifying the sample rates.
+class AudioConverter {
+ public:
+  // Returns a new AudioConverter, which will use the supplied format for its
+  // lifetime. Ownership of the converter is handed to the caller through the
+  // returned unique_ptr.
+  static std::unique_ptr<AudioConverter> Create(size_t src_channels,
+                                                size_t src_frames,
+                                                size_t dst_channels,
+                                                size_t dst_frames);
+  virtual ~AudioConverter() {}
+
+  // Not copyable.
+  AudioConverter(const AudioConverter&) = delete;
+  AudioConverter& operator=(const AudioConverter&) = delete;
+
+  // Convert `src`, containing `src_size` samples, to `dst`, having a sample
+  // capacity of `dst_capacity`. Both point to a series of buffers containing
+  // the samples for each channel. The sizes must correspond to the format
+  // passed to Create().
+  virtual void Convert(const float* const* src,
+                       size_t src_size,
+                       float* const* dst,
+                       size_t dst_capacity) = 0;
+
+  // Format accessors. The underlying fields are const, so the values never
+  // change after construction.
+  size_t src_channels() const { return src_channels_; }
+  size_t src_frames() const { return src_frames_; }
+  size_t dst_channels() const { return dst_channels_; }
+  size_t dst_frames() const { return dst_frames_; }
+
+ protected:
+  // Default construction is reserved for subclasses that have no single
+  // format of their own.
+  AudioConverter();
+  AudioConverter(size_t src_channels,
+                 size_t src_frames,
+                 size_t dst_channels,
+                 size_t dst_frames);
+
+  // Helper to RTC_CHECK that inputs are correctly sized.
+  void CheckSizes(size_t src_size, size_t dst_capacity) const;
+
+ private:
+  const size_t src_channels_;
+  const size_t src_frames_;
+  const size_t dst_channels_;
+  const size_t dst_frames_;
+};
+
+}  // namespace webrtc
+
+#endif  // COMMON_AUDIO_AUDIO_CONVERTER_H_
--- a/VocieProcess/common_audio/resampler/include/push_resampler.h
+++ b/VocieProcess/common_audio/resampler/include/push_resampler.h
@ -0,0 +1,58 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef COMMON_AUDIO_RESAMPLER_INCLUDE_PUSH_RESAMPLER_H_
+#define COMMON_AUDIO_RESAMPLER_INCLUDE_PUSH_RESAMPLER_H_
+
+#include <memory>
+#include <vector>
+
+#include "api/audio/audio_view.h"
+
+namespace webrtc {
+
+class PushSincResampler;
+
+// Wraps PushSincResampler to provide stereo support.
+// Note: This implementation assumes 10ms buffer sizes throughout.
+// NOTE(review): `size_t` is used below, but this header includes only
+// <memory>, <vector> and api/audio/audio_view.h; presumably the type arrives
+// transitively - consider including <stddef.h> directly.
+template <typename T>
+class PushResampler final {
+ public:
+  PushResampler();
+  // Takes the per-channel sample counts of the source and destination and the
+  // channel count up front.
+  PushResampler(size_t src_samples_per_channel,
+                size_t dst_samples_per_channel,
+                size_t num_channels);
+  ~PushResampler();
+
+  // Returns the total number of samples provided in destination (e.g. 32 kHz,
+  // 2 channel audio gives 640 samples).
+  int Resample(InterleavedView<const T> src, InterleavedView<T> dst);
+  // For when a deinterleaved/mono channel already exists and we can skip the
+  // deinterleaving operation.
+  int Resample(MonoView<const T> src, MonoView<T> dst);
+
+ private:
+  // Ensures that source and destination buffers for deinterleaving are
+  // correctly configured prior to resampling that requires deinterleaving.
+  void EnsureInitialized(size_t src_samples_per_channel,
+                         size_t dst_samples_per_channel,
+                         size_t num_channels);
+
+  // Buffers used for when a deinterleaving step is necessary.
+  std::unique_ptr<T[]> source_;
+  std::unique_ptr<T[]> destination_;
+  // Presumably views over `source_` / `destination_` - confirm in the
+  // implementation file.
+  DeinterleavedView<T> source_view_;
+  DeinterleavedView<T> destination_view_;
+
+  std::vector<std::unique_ptr<PushSincResampler>> resamplers_;
+};
+}  // namespace webrtc
+
+#endif  // COMMON_AUDIO_RESAMPLER_INCLUDE_PUSH_RESAMPLER_H_
--- a/VocieProcess/common_audio/resampler/include/resampler.h
+++ b/VocieProcess/common_audio/resampler/include/resampler.h
@ -0,0 +1,99 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * A wrapper for resampling between many sample-rate combinations.
+ */
+
+#ifndef COMMON_AUDIO_RESAMPLER_INCLUDE_RESAMPLER_H_
+#define COMMON_AUDIO_RESAMPLER_INCLUDE_RESAMPLER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace webrtc {
+
+// All methods return 0 on success and -1 on failure.
+//
+// NOTE(review): this class holds raw pointers (state1_..state3_, in_buffer_,
+// out_buffer_, helper_left_, helper_right_) and declares a destructor, but
+// neither deletes nor declares copy operations. If the destructor releases
+// those pointers (not visible in this header), copying a Resampler would be
+// unsafe - confirm against the implementation file.
+class Resampler {
+ public:
+  Resampler();
+  Resampler(int inFreq, int outFreq, size_t num_channels);
+  ~Resampler();
+
+  // Reset all states
+  int Reset(int inFreq, int outFreq, size_t num_channels);
+
+  // Reset all states if any parameter has changed
+  int ResetIfNeeded(int inFreq, int outFreq, size_t num_channels);
+
+  // Resample samplesIn to samplesOut.
+  // Presumably `lengthIn` is the number of input samples, `maxLen` the
+  // capacity of `samplesOut`, and `outLen` receives the number of samples
+  // written - TODO confirm against the implementation.
+  int Push(const int16_t* samplesIn,
+           size_t lengthIn,
+           int16_t* samplesOut,
+           size_t maxLen,
+           size_t& outLen);  // NOLINT: to avoid changing APIs
+
+ private:
+  // Resampling modes. The enumerator names read as input:output rate ratios
+  // (e.g. 1To2, 11To16) - presumably upsampling ratios first, then the
+  // corresponding downsampling ratios.
+  enum ResamplerMode {
+    kResamplerMode1To1,
+    kResamplerMode1To2,
+    kResamplerMode1To3,
+    kResamplerMode1To4,
+    kResamplerMode1To6,
+    kResamplerMode1To12,
+    kResamplerMode2To3,
+    kResamplerMode2To11,
+    kResamplerMode4To11,
+    kResamplerMode8To11,
+    kResamplerMode11To16,
+    kResamplerMode11To32,
+    kResamplerMode2To1,
+    kResamplerMode3To1,
+    kResamplerMode4To1,
+    kResamplerMode6To1,
+    kResamplerMode12To1,
+    kResamplerMode3To2,
+    kResamplerMode11To2,
+    kResamplerMode11To4,
+    kResamplerMode11To8
+  };
+
+  // Computes the resampler mode for a given sampling frequency pair.
+  // Returns -1 for unsupported frequency pairs.
+  static int ComputeResamplerMode(int in_freq_hz,
+                                  int out_freq_hz,
+                                  ResamplerMode* mode);
+
+  // Generic pointers since we don't know what states we'll need
+  void* state1_;
+  void* state2_;
+  void* state3_;
+
+  // Storage if needed
+  int16_t* in_buffer_;
+  int16_t* out_buffer_;
+  size_t in_buffer_size_;
+  size_t out_buffer_size_;
+  size_t in_buffer_size_max_;
+  size_t out_buffer_size_max_;
+
+  // Currently configured format and the mode selected for it.
+  int my_in_frequency_khz_;
+  int my_out_frequency_khz_;
+  ResamplerMode my_mode_;
+  size_t num_channels_;
+
+  // Extra instance for stereo
+  Resampler* helper_left_;
+  Resampler* helper_right_;
+};
+
+}  // namespace webrtc
+
+#endif  // COMMON_AUDIO_RESAMPLER_INCLUDE_RESAMPLER_H_
--- a/VocieProcess/common_audio/vad/include/webrtc_vad.h
+++ b/VocieProcess/common_audio/vad/include/webrtc_vad.h
@ -0,0 +1,87 @@
+/*
+ *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * This header file includes the VAD API calls. Specific function calls are
+ * given below.
+ */
+
+#ifndef COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_  // NOLINT
+#define COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+// Opaque VAD instance handle; `struct WebRtcVadInst` is only declared here,
+// not defined.
+typedef struct WebRtcVadInst VadInst;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Creates an instance of the VAD structure. Release it with WebRtcVad_Free().
+VadInst* WebRtcVad_Create(void);
+
+// Frees the dynamic memory of a specified VAD instance.
+//
+// - handle [i] : Pointer to VAD instance that should be freed (presumably one
+//                obtained from WebRtcVad_Create()).
+void WebRtcVad_Free(VadInst* handle);
+
+// Initializes a VAD instance. Must be called before WebRtcVad_Process().
+//
+// - handle [i/o] : Instance that should be initialized.
+//
+// returns        : 0 - (OK),
+//                 -1 - (null pointer or Default mode could not be set).
+int WebRtcVad_Init(VadInst* handle);
+
+// Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
+// restrictive in reporting speech. In other words, the probability that the
+// input really is speech when the VAD returns 1 increases with increasing
+// mode. As a consequence, the missed detection rate also goes up.
+//
+// - handle [i/o] : VAD instance.
+// - mode   [i]   : Aggressiveness mode (0, 1, 2, or 3).
+//
+// returns        : 0 - (OK),
+//                 -1 - (null pointer, mode could not be set or the VAD instance
+//                       has not been initialized).
+int WebRtcVad_set_mode(VadInst* handle, int mode);
+
+// Calculates a VAD decision for the `audio_frame`. For valid sampling rates
+// and frame lengths, see the description of
+// WebRtcVad_ValidRateAndFrameLength().
+//
+// - handle       [i/o] : VAD Instance. Needs to be initialized by
+//                        WebRtcVad_Init() before call.
+// - fs           [i]   : Sampling frequency (Hz): 8000, 16000, or 32000
+// - audio_frame  [i]   : Audio frame buffer.
+// - frame_length [i]   : Length of audio frame buffer in number of samples.
+//
+// returns              : 1 - (Active Voice),
+//                        0 - (Non-active Voice),
+//                       -1 - (Error)
+int WebRtcVad_Process(VadInst* handle,
+                      int fs,
+                      const int16_t* audio_frame,
+                      size_t frame_length);
+
+// Checks for valid combinations of `rate` and `frame_length`. We support 10,
+// 20 and 30 ms frames and the rates 8000, 16000 and 32000 Hz.
+// Takes no VAD instance, so it can be called before one is created.
+//
+// - rate         [i] : Sampling frequency (Hz).
+// - frame_length [i] : Speech frame buffer length in number of samples.
+//
+// returns            : 0 - (valid combination), -1 - (invalid combination)
+int WebRtcVad_ValidRateAndFrameLength(int rate, size_t frame_length);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // COMMON_AUDIO_VAD_INCLUDE_WEBRTC_VAD_H_  // NOLINT