Play TTS files in response to TTS prompts, but it's legible now

- input files are upsampled and padded to stereo before playback
- any in-progress playback is cancelled before playing a new file
10 months ago · 542ebc6531
parent d0b739c66e
commit 542ebc6531
5 changed files with 162 additions and 61 deletions
--- a/src/tangara/audio/processor.cpp
+++ b/src/tangara/audio/processor.cpp
@ -347,34 +347,39 @@ auto SampleProcessor::discardCommand(Args& command) -> void {
  // End of stream commands can just be dropped without further action.
 }
-SampleProcessor::Buffer::Buffer()
+Buffer::Buffer(std::span<sample::Sample> storage)
-    : buffer_(reinterpret_cast<sample::Sample*>(
+    : storage_(nullptr), buffer_(storage), samples_in_buffer_() {}
-                  heap_caps_calloc(kSampleBufferLength,
+
-                                   sizeof(sample::Sample),
+Buffer::Buffer()
-                                   MALLOC_CAP_DMA)),
+    : storage_(reinterpret_cast<sample::Sample*>(
-              kSampleBufferLength),
+          heap_caps_calloc(kSampleBufferLength,
                           sizeof(sample::Sample),
                           MALLOC_CAP_DMA))),
      // The default constructor allocates its own backing storage and keeps
      // the raw pointer in storage_ so the destructor can release it. The
      // span-taking constructor instead leaves storage_ null and only borrows
      // the caller's memory.
      // NOTE(review): the heap_caps_calloc result is not checked. If it can
      // return null on allocation failure, buffer_ would be a span over a null
      // pointer with a non-zero length -- confirm how callers cope with that.
      buffer_(storage_, kSampleBufferLength),
      samples_in_buffer_() {}
-SampleProcessor::Buffer::~Buffer() {
+Buffer::~Buffer() {
-  heap_caps_free(buffer_.data());
+  if (storage_) {
    // storage_ is only non-null when this Buffer allocated its own memory;
    // storage handed in through the span constructor is never freed here.
    heap_caps_free(storage_);
  }
 }
-auto SampleProcessor::Buffer::writeAcquire() -> std::span<sample::Sample> {
+auto Buffer::writeAcquire() -> std::span<sample::Sample> {
  // Skip past as many elements as are currently stored; whatever remains of
  // buffer_ is the space still available for writing.
  return buffer_.subspan(samples_in_buffer_.size());
 }
-auto SampleProcessor::Buffer::writeCommit(size_t samples) -> void {
+auto Buffer::writeCommit(size_t samples) -> void {
  // Nothing was written, so the stored view is already correct.
  if (samples == 0) {
    return;
  }
  // Extend the stored view to cover the old contents plus the new samples,
  // measured from the start of buffer_.
  // NOTE(review): there is no bounds check here. std::span::first requires
  // its count to be <= size(), so this relies on callers never committing
  // more than the span returned by writeAcquire() -- confirm.
  samples_in_buffer_ = buffer_.first(samples + samples_in_buffer_.size());
 }
-auto SampleProcessor::Buffer::readAcquire() -> std::span<sample::Sample> {
+auto Buffer::readAcquire() -> std::span<sample::Sample> {
  // Hands out the view of stored samples directly; no copy is made.
  return samples_in_buffer_;
 }
-auto SampleProcessor::Buffer::readCommit(size_t samples) -> void {
+auto Buffer::readCommit(size_t samples) -> void {
  if (samples == 0) {
    return;
  }
@ -389,11 +394,11 @@ auto SampleProcessor::Buffer::readCommit(size_t samples) -> void {
  }
 }
-auto SampleProcessor::Buffer::isEmpty() -> bool {
+auto Buffer::isEmpty() -> bool {
  // True when the view of stored samples has zero length.
  return samples_in_buffer_.empty();
 }
-auto SampleProcessor::Buffer::clear() -> void {
+auto Buffer::clear() -> void {
  // Forget all stored samples by resetting the view to an empty span. The
  // backing storage itself is neither freed nor overwritten.
  samples_in_buffer_ = {};
 }
--- a/src/tangara/audio/processor.hpp
+++ b/src/tangara/audio/processor.hpp
@ -22,6 +22,35 @@
 namespace audio {
 /* Utility for managing buffering samples between digital filters. */
 class Buffer {
 public:
  /*
   * Wraps caller-provided storage. The Buffer only borrows this memory: it
   * leaves storage_ null, so the destructor will not free it. The storage
   * must therefore outlive the Buffer.
   */
  Buffer(std::span<sample::Sample> storage);
  /*
   * Allocates kSampleBufferLength samples of backing storage via
   * heap_caps_calloc with MALLOC_CAP_DMA. This storage is owned by the Buffer
   * and released by the destructor.
   */
  Buffer();
  /* Frees the backing storage, but only if this Buffer allocated it. */
  ~Buffer();
  /* Returns a span of the unused space within the buffer. */
  auto writeAcquire() -> std::span<sample::Sample>;
  /* Signals how many samples were just added to the writeAcquire span. */
  auto writeCommit(size_t) -> void;
  /* Returns a span of the samples stored within the buffer. */
  auto readAcquire() -> std::span<sample::Sample>;
  /* Signals how many samples from the readAcquire span were consumed. */
  auto readCommit(size_t) -> void;
  /* Returns true when no samples are currently stored. */
  auto isEmpty() -> bool;
  /* Discards all stored samples without touching the backing storage. */
  auto clear() -> void;
  /* Not copyable; a copy would alias (and could double-free) the storage. */
  Buffer(const Buffer&) = delete;
  Buffer& operator=(const Buffer&) = delete;
 private:
  /*
   * Non-null only when this Buffer allocated its own storage. Must stay
   * declared before buffer_: the default constructor initialises buffer_ from
   * storage_, and members are initialised in declaration order.
   */
  sample::Sample* storage_;
  /* The whole backing region, whether owned or borrowed. */
  std::span<sample::Sample> buffer_;
  /* View of the samples currently held; empty when nothing is stored. */
  std::span<sample::Sample> samples_in_buffer_;
 };
 /*
 * Handle to a persistent task that converts samples between formats (sample
 * rate, channels, bits per sample), in order to put samples in the preferred
@ -87,33 +116,6 @@ class SampleProcessor {
  StreamBufferHandle_t source_;
  drivers::PcmBuffer& sink_;
  /* Internal utility for managing buffering samples between our filters. */
  // NOTE(review): this nested declaration has the same name and interface as
  // the namespace-scope Buffer declared earlier in this header, minus the
  // span-taking constructor and the storage_ member -- confirm which of the
  // two declarations is current.
  class Buffer {
   public:
    /* Sets up the backing storage that buffer_ refers to. */
    Buffer();
    ~Buffer();
    /* Returns a span of the unused space within the buffer. */
    auto writeAcquire() -> std::span<sample::Sample>;
    /* Signals how many samples were just added to the writeAcquire span. */
    auto writeCommit(size_t) -> void;
    /* Returns a span of the samples stored within the buffer. */
    auto readAcquire() -> std::span<sample::Sample>;
    /* Signals how many samples from the readAcquire span were consumed. */
    auto readCommit(size_t) -> void;
    /* Returns true when no samples are currently stored. */
    auto isEmpty() -> bool;
    /* Discards all stored samples. */
    auto clear() -> void;
    /* Not copyable. */
    Buffer(const Buffer&) = delete;
    Buffer& operator=(const Buffer&) = delete;
   private:
    /* The whole backing region. */
    std::span<sample::Sample> buffer_;
    /* View of the samples currently held. */
    std::span<sample::Sample> samples_in_buffer_;
  };
  Buffer input_buffer_;
  Buffer resampled_buffer_;
  Buffer output_buffer_;
--- a/src/tangara/tts/player.cpp
+++ b/src/tangara/tts/player.cpp
@ -6,8 +6,12 @@
 #include "tts/player.hpp"
 #include "audio/processor.hpp"
 #include "audio/resample.hpp"
 #include "codec.hpp"
 #include "esp_log.h"
 #include "freertos/projdefs.h"
 #include "portmacro.h"
 #include "sample.hpp"
 #include "types.hpp"
@ -18,57 +22,140 @@ namespace tts {
 // Binds the worker pool, output buffer and stream factory by reference, so
 // all three must outlive the Player. The play counter starts at zero.
 Player::Player(tasks::WorkerPool& worker,
               drivers::PcmBuffer& output,
               audio::FatfsStreamFactory& factory)
-    : bg_(worker), stream_factory_(factory), output_(output) {}
+    : bg_(worker), stream_factory_(factory), output_(output), play_count_(0) {}
 // Queues playback of the file at `path` on the background worker pool and
 // returns immediately. Each call increments play_count_ and hands the new
 // value to decodeToSink, which abandons its work once play_count_ no longer
 // matches -- this is how a newer request cancels one still in progress.
 // Failures (stream creation, unsupported type, decoder creation, opening the
 // stream) are logged and playback is silently skipped.
 auto Player::playFile(const std::string& path) -> void {
  ESP_LOGI(kTag, "playing '%s'", path.c_str());
-  bg_.Dispatch<void>([=]() {
+  int this_play = ++play_count_;
  // `path` and `this_play` are captured by copy, so the job does not depend
  // on the caller's string staying alive.
  // NOTE(review): `this` is captured by pointer; this assumes the Player
  // outlives every job it dispatches -- confirm.
  bg_.Dispatch<void>([=, this]() {
    auto stream = stream_factory_.create(path);
    if (!stream) {
      ESP_LOGE(kTag, "creating stream failed");
      return;
    }
    // FIXME: Rather than hardcoding WAV support only, we should work out a
    // proper subset of 'low memory' decoders that can all be used for TTS
    // playback.
    if (stream->type() != codecs::StreamType::kWav) {
      ESP_LOGE(kTag, "stream was unsupported type");
      return;
    }
    auto decoder = codecs::CreateCodecForType(stream->type());
    if (!decoder) {
      ESP_LOGE(kTag, "creating decoder failed");
      return;
    }
    // Take ownership of the decoder so it is destroyed on every exit path.
    std::unique_ptr<codecs::ICodec> codec{*decoder};
    auto open_res = codec->OpenStream(stream, 0);
    if (open_res.has_error()) {
      ESP_LOGE(kTag, "opening stream failed");
      return;
    }
-    // if (open_res->sample_rate_hz != 48000 || open_res->num_channels != 2) {
+
-    // ESP_LOGE(kTag, "stream format is wrong (was %u channels @ %lu hz)",
+    decodeToSink(*open_res, std::move(codec), this_play);
-    // open_res->num_channels, open_res->sample_rate_hz);
+  });
-    // return;
+}
-    // }
+
-    sample::Sample decode_buf[4096];
+auto Player::decodeToSink(const codecs::ICodec::OutputFormat& format,
-    for (;;) {
+                          std::unique_ptr<codecs::ICodec> codec,
-      auto decode_res = codec->DecodeTo(decode_buf);
+                          int play_count) -> void {
  // Decodes everything `codec` produces and pushes it into output_, passing
  // the samples through three stages: decode -> resample (only when the
  // source is not 48000 Hz) -> mono-to-stereo duplication (only when the
  // source has one channel). `play_count` is the ticket issued for this
  // playback; the loop gives up as soon as play_count_ holds another value.
  //
  // Set up buffers to hold samples between the intermediary parts of
  // processing. We can just use the stack for these, since this method is
  // called only from background workers, which have enormous stacks.
  sample::Sample decode_storage[4096];
  audio::Buffer decode_buf(decode_storage);
  sample::Sample resample_storage[4096];
  audio::Buffer resample_buf(resample_storage);
  sample::Sample stereo_storage[4096];
  audio::Buffer stereo_buf(stereo_storage);
  // Work out what processing the codec's output needs.
  std::unique_ptr<audio::Resampler> resampler;
  if (format.sample_rate_hz != 48000) {
    resampler = std::make_unique<audio::Resampler>(format.sample_rate_hz, 48000,
                                                   format.num_channels);
  }
  // Mono input gets each sample written twice to fill both output channels.
  // NOTE(review): any other channel count is copied through unchanged --
  // presumably only mono and stereo sources are expected; confirm.
  bool double_samples = format.num_channels == 1;
  // FIXME: This decode-and-process loop is substantially the same as the audio
  // processor's filter loop. Ideally we should refactor both of these loops to
  // reuse code, however I'm holding off on doing this until we've implemented
  // more advanced audio processing features in the audio processor (EQ, tempo
  // shifting, etc.) as it's not clear to me yet how much the two codepaths will
  // be diverging later anyway.
  //
  // Keep going while the codec is still alive or any stage still holds
  // samples; `codec` is reset to null once the stream reports it is finished.
  while (codec || !decode_buf.isEmpty() || !resample_buf.isEmpty() ||
         !stereo_buf.isEmpty()) {
    // A newer playFile() call has bumped the counter: drop what is queued in
    // the sink and stop.
    if (play_count != play_count_) {
      // FIXME: This is a little unsafe and could maybe take out the first few
      // samples of the next file.
      output_.clear();
      break;
    }
    // Stage 1: decode into whatever free space decode_buf has left.
    if (codec) {
      auto decode_res = codec->DecodeTo(decode_buf.writeAcquire());
      if (decode_res.has_error()) {
        ESP_LOGE(kTag, "decoding error");
        // NOTE(review): leaving the loop here discards any samples still
        // sitting in the intermediate buffers -- confirm that is intended.
-        return;
+        break;
      }
      decode_buf.writeCommit(decode_res->samples_written);
      if (decode_res->is_stream_finished) {
-        break;
+        codec.reset();
      }
    }
    // Stage 2: resample, or copy straight through when no resampler exists.
    // NOTE(review): the third Process() argument is always false, even after
    // the codec has finished -- presumably an end-of-stream flag; confirm the
    // resampler does not need a final flush.
    if (!decode_buf.isEmpty()) {
      auto resample_input = decode_buf.readAcquire();
      auto resample_output = resample_buf.writeAcquire();
-      std::span<sample::Sample> decode_span{decode_buf,
+      size_t read, wrote;
-                                            decode_res->samples_written};
+      if (resampler) {
-      while (!decode_span.empty()) {
+        std::tie(read, wrote) =
-        size_t sent = output_.send(decode_span);
+            resampler->Process(resample_input, resample_output, false);
-        decode_span = decode_span.subspan(sent);
+      } else {
        // Pass-through: move as many samples as fit in both spans.
        read = wrote = std::min(resample_input.size(), resample_output.size());
        std::copy_n(resample_input.begin(), read, resample_output.begin());
      }
      decode_buf.readCommit(read);
      resample_buf.writeCommit(wrote);
    }
-    ESP_LOGI(kTag, "finished playing okay");
+    if (!resample_buf.isEmpty()) {
-  });
+      auto channels_input = resample_buf.readAcquire();
      // Stage 3: expand mono to stereo, or copy through unchanged.
      auto channels_output = stereo_buf.writeAcquire();
      size_t read, wrote;
      if (double_samples) {
        // Each input sample produces two output samples, so the output space
        // caps the input at half its size; the input length caps it further.
        // NOTE(review): when the output space is the limit, `wrote` keeps the
        // full output size. If that size were ever odd, one more sample would
        // be committed than the loop below writes -- confirm the free space
        // is always even here.
        wrote = channels_output.size();
        read = wrote / 2;
        if (read > channels_input.size()) {
          read = channels_input.size();
          wrote = read * 2;
        }
        for (size_t i = 0; i < read; i++) {
          channels_output[i * 2] = channels_input[i];
          channels_output[(i * 2) + 1] = channels_input[i];
        }
      } else {
        read = wrote = std::min(channels_input.size(), channels_output.size());
        std::copy_n(channels_input.begin(), read, channels_output.begin());
      }
      resample_buf.readCommit(read);
      stereo_buf.writeCommit(wrote);
    }
    // The mixin PcmBuffer should almost always be draining, so we can force
    // samples into it more aggressively than with the main music PcmBuffer.
    // NOTE(review): this inner loop does not re-check play_count_ and does
    // not yield, so if send() can accept zero samples it spins, and a newer
    // playback request cannot cancel until stereo_buf is empty -- confirm.
    while (!stereo_buf.isEmpty()) {
      size_t sent = output_.send(stereo_buf.readAcquire());
      stereo_buf.readCommit(sent);
    }
  }
 }
 }  // namespace tts
--- a/src/tangara/tts/player.hpp
+++ b/src/tangara/tts/player.hpp
@ -9,6 +9,7 @@
 #include <string>
 #include "audio/fatfs_stream_factory.hpp"
 #include "codec.hpp"
 #include "drivers/pcm_buffer.hpp"
 #include "tasks.hpp"
@ -33,6 +34,12 @@ class Player {
  tasks::WorkerPool& bg_;
  audio::FatfsStreamFactory& stream_factory_;
  drivers::PcmBuffer& output_;
  std::atomic<int> play_count_;
  auto decodeToSink(const codecs::ICodec::OutputFormat&,
                    std::unique_ptr<codecs::ICodec>,
                    int play_count) -> void;
 };
 }  // namespace tts
--- a/src/tangara/tts/provider.cpp
+++ b/src/tangara/tts/provider.cpp
@ -28,7 +28,7 @@ static const char* kTtsPath = "/.tangara-tts/";
 // Maps prompt text to the path of its pre-rendered audio file: kTtsPath
 // followed by the hexadecimal komihash of the text's bytes. The third
 // komihash argument is 0 (presumably the seed -- confirm against the komihash
 // API). As written this always returns a value; the optional is never empty.
 static auto textToFile(const std::string& text) -> std::optional<std::string> {
  uint64_t hash = komihash(text.data(), text.size(), 0);
  std::stringstream stream;
-  stream << kTtsPath << std::hex << hash << ".wav";
+  stream << kTtsPath << std::hex << hash;
  return stream.str();
 }