DRAFT STT

pschatzmann · pschatzmann · commit 2257b5b162ad · 2025-12-01T09:41:35.000+01:00
diff --git a/src/AudioTools/STT/EchoCancellation.h b/src/AudioTools/STT/EchoCancellation.h
@@ -0,0 +1,127 @@
+#pragma once
+
+#include "AudioTools/CoreAudio/AudioStreams.h"
+#include "AudioTools/CoreAudio/Buffers.h"
+
+namespace audio_tools {
+/**
+ * @class EchoCancellation
+ * @brief Echo cancellation with adaptive LMS filtering for microcontrollers.
+ *
+ * This class implements echo cancellation using an adaptive FIR filter (LMS
+ * algorithm). It estimates the echo path and subtracts the estimated echo from
+ * the microphone input.
+ */
+template <typename T = int16_t>
+class EchoCancellation : public AudioStream {
+ public:
+  /**
+   * @brief Constructor
+   * @param in Reference to the input stream (microphone or audio input)
+   * @param lag_samples Reference delay in samples (default: 0)
+   * @param buffer_size Size of the internal ring buffer (default: 512)
+   * @param filter_len Number of taps of the adaptive FIR filter (default: 32)
+   * @param mu Adaptation rate of the LMS update (default: 0.001)
+   */
+  EchoCancellation(Stream& in, size_t lag_samples = 0, size_t buffer_size = 512,
+                   size_t filter_len = 32, float mu = 0.001f)
+      : buffer_size(buffer_size),  // initializers follow declaration order
+        lag(lag_samples),
+        filter_len(filter_len),
+        adaptation_rate(mu) {
+    p_io = &in;
+    filter.resize(filter_len, 0.0f);
+    reset();
+  }
+
+  /**
+   * @brief Store the output signal (sent to speaker)
+   * @param buf Pointer to PCM data sent to the speaker (T*)
+   * @param len Number of bytes in buf
+   * @return Number of bytes processed
+   */
+  size_t write(const uint8_t* buf, size_t len) override {
+    // Store output signal in queue for echo estimation
+    return ring_buffer.writeArray((T*)buf, len / sizeof(T)) *
+           sizeof(T);
+  }
+
+  /**
+   * @brief Read input and remove echo (subtract the LMS echo estimate)
+   * @param buf Pointer to buffer to store processed input (T*)
+   * @param len Number of bytes to read
+   * @return Number of bytes read from input
+   */
+  size_t readBytes(uint8_t* buf, size_t len) override {
+    size_t read = p_io->readBytes(buf, len);
+    size_t actual_samples = read / sizeof(T);
+    T* data = (T*)buf;
+    Vector<T> ref_vec(filter_len, 0);
+    for (size_t i = 0; i < actual_samples; ++i) {
+      // Re-peek the reference window for every sample so that it slides by
+      // exactly one sample; taps that are not queued yet count as silence.
+      int queued = ring_buffer.peekArray(ref_vec.data(), filter_len);
+      for (size_t k = queued > 0 ? (size_t)queued : 0; k < filter_len; ++k) {
+        ref_vec[k] = 0;
+      }
+      float echo_est = 0.0f;
+      for (size_t k = 0; k < filter_len; ++k) {
+        echo_est += filter[k] * ref_vec[k];
+      }
+      float error = (float)data[i] - echo_est;
+      data[i] = (T)error;
+      // LMS update
+      for (size_t k = 0; k < filter_len; ++k) {
+        filter[k] += adaptation_rate * error * ref_vec[k];
+      }
+      T consumed = 0;
+      ring_buffer.read(consumed);  // Advance the queue by one sample
+    }
+    return read;
+  }
+
+  /**
+   * @brief Set the lag (delay) in samples; it is applied by the next reset().
+   * @param lag_samples Number of samples to delay the echo subtraction
+   */
+  void setLag(size_t lag_samples) { lag = lag_samples; }
+
+  /**
+   * @brief Set the adaptation rate (mu) for the LMS algorithm.
+   * @param mu Adaptation rate
+   */
+  void setMu(float mu) { adaptation_rate = mu; }
+
+  /**
+   * @brief Set the filter length for the adaptive filter.
+   * @param len Length of the adaptive filter
+   */
+  void setFilterLen(size_t len) {
+    filter_len = len;
+    filter.resize(filter_len, 0.0f);
+  }
+
+  /**
+   * @brief Reset the internal buffer and lag state.
+   */
+  void reset() {
+    ring_buffer.resize(buffer_size + lag);
+    ring_buffer.reset();  // drop reference samples left from an earlier run
+    for (size_t j = 0; j < lag; j++) {
+      ring_buffer.write(0);
+    }
+    filter.assign(filter_len, 0.0f);
+  }
+
+ protected:
+  Stream* p_io = nullptr;
+  RingBuffer<T> ring_buffer{0};
+  size_t buffer_size;
+  size_t lag;  // lag in samples
+  // Adaptive filter
+  size_t filter_len;
+  float adaptation_rate = 0.01f;
+  Vector<float> filter;
+};
+
+}  // namespace audio_tools
diff --git a/src/AudioTools/STT/EchoCanellation.h b/src/AudioTools/STT/EchoCanellation.h
diff --git a/src/AudioTools/STT/WakeWordDetector.h b/src/AudioTools/STT/WakeWordDetector.h
@@ -1,12 +1,12 @@
-namespace audio_tools {
-
 #pragma once
 
 #include <algorithm>
 #include <cmath>
 
-#include "AudioOutput.h"
-#include "Vector.h"
+#include "AudioTools/CoreAudio/AudioOutput.h"
+#include "AudioTools/CoreAudio/AudioBasic/Collections/Vector.h"
+#include "AudioTools/CoreAudio/Buffers.h"
+#include "AudioTools/AudioLibs/AudioFFT.h"
 
 namespace audio_tools {
 
@@ -41,12 +41,13 @@ struct FrequencyFrame {
  *
  * Example:
  * @code
- * audio_tools::WakeWordDetector<3> detector(fft, fft_size, frame_size);
- *detector.addTemplate(my_template_frames, 80.0f, "hello");
- *detector.setWakeWordCallback([](const char* name) { Serial.println(name); });
-  ... (file header and includes)
-*/
-template <size_t N = 3>
+ * audio_tools::WakeWordDetector<3> detector(fft);
+ * detector.addTemplate(my_template_frames, 80.0f, "hello");
+ * detector.setWakeWordCallback([](const char* name) { Serial.println(name); });
+ * // ...
+ * @endcode
+ */
+template <typename T = int16_t, size_t N = 3>
 class WakeWordDetector : public AudioOutput {
  public:
   struct Template {
@@ -61,12 +62,12 @@ class WakeWordDetector : public AudioOutput {
 
   using WakeWordCallback = void (*)(const char* name);
 
-  WakeWordDetector(AudioFFTBase& fft, size_t fft_size, size_t frame_size)
-      : _fft_size(fft_size), _frame_size(frame_size), p_fft(&fft) {
-    _buffer.resize(_frame_size, 0);
+  WakeWordDetector(AudioFFTBase& fft)
+      : p_fft(&fft) {
     _frame_pos = 0;
-    fft.config().ref = this;
-    fft.callback = fftResult;
+    auto& fft_cfg = fft.config();
+    fft_cfg.ref = this;
+    fft_cfg.callback = fftResult;
   }
 
   void startRecording() {
@@ -94,17 +95,17 @@ class WakeWordDetector : public AudioOutput {
 
   void setWakeWordCallback(WakeWordCallback cb) { _callback = cb; }
 
-  size_t write(const void* buf, size_t count) override {
-    return p_fft->write((const uint8_t*)buf, count);
+  size_t write(const uint8_t* buf, size_t size) override {
+    return p_fft->write(buf, size);
   }
 
   static void fftResult(AudioFFTBase& fft) {
     // This static method must access instance data via fft.config().ref
-    auto* self = static_cast<WakeWordDetector<N>*>(fft.config().ref);
+    auto* self = static_cast<WakeWordDetector<T,N>*>(fft.config().ref);
     if (!self) return;
     FrequencyFrame<N> frame;
     AudioFFTResult result[N];
-    self->p_fft->resultArray(result, N);
+      fft.resultArray(result);
     for (size_t j = 0; j < N; j++) {
       frame.top_freqs[j] = result[j].frequency;
     }
@@ -130,11 +131,9 @@ class WakeWordDetector : public AudioOutput {
  protected:
   Vector<Template> _templates;               ///< List of wake word templates
   Vector<FrequencyFrame<N>> _recent_frames;  ///< Recent frames for comparison
-  Vector<int16_t> _buffer;  ///< Buffer for incoming PCM samples
+  Vector<T> _buffer;  ///< Buffer for incoming PCM samples
   AudioFFTBase* p_fft = nullptr;
   bool _is_recording = false;    ///< True if currently recording a template
-  size_t _fft_size;              ///< FFT size per frame
-  size_t _frame_size;            ///< Number of PCM samples per frame
   size_t _frame_pos;             ///< Current position in frame buffer
   size_t _max_template_len = 0;  ///< Length of the longest template
   WakeWordCallback _callback = nullptr;