Pause and unpause the current audio output in response to TTS

10 months ago · c51709f99f
parent 542ebc6531
commit c51709f99f
7 changed files with 119 additions and 43 deletions
--- a/src/drivers/include/drivers/pcm_buffer.hpp
+++ b/src/drivers/include/drivers/pcm_buffer.hpp
@ -49,6 +49,7 @@ class PcmBuffer {

  auto clear() -> void;
  auto isEmpty() -> bool;
+  auto suspend(bool) -> void;

  /*
   * How many samples have been added to this buffer since it was created. This
@ -75,6 +76,8 @@ class PcmBuffer {

  std::atomic<uint32_t> sent_;
  std::atomic<uint32_t> received_;
+  std::atomic<bool> suspended_;
+
  RingbufHandle_t ringbuf_;
 };

--- a/src/drivers/pcm_buffer.cpp
+++ b/src/drivers/pcm_buffer.cpp
@ -25,7 +25,8 @@ namespace drivers {

 [[maybe_unused]] static const char kTag[] = "pcmbuf";

-PcmBuffer::PcmBuffer(size_t size_in_samples) : sent_(0), received_(0) {
+PcmBuffer::PcmBuffer(size_t size_in_samples)
+    : sent_(0), received_(0), suspended_(false) {
  size_t size_in_bytes = size_in_samples * sizeof(int16_t);
  ESP_LOGI(kTag, "allocating pcm buffer of size %u (%uKiB)", size_in_samples,
           size_in_bytes / 1024);
@ -51,6 +52,13 @@ auto PcmBuffer::send(std::span<const int16_t> data) -> size_t {

 IRAM_ATTR auto PcmBuffer::receive(std::span<int16_t> dest, bool mix, bool isr)
    -> BaseType_t {
+  if (suspended_) {
+    if (!mix) {
+      std::fill_n(dest.begin(), dest.size(), 0);
+    }
+    return false;
+  }
+
  size_t first_read = 0, second_read = 0;
  BaseType_t ret1 = false, ret2 = false;
  std::tie(first_read, ret1) = readSingle(dest, mix, isr);
@ -86,6 +94,10 @@ auto PcmBuffer::isEmpty() -> bool {
         xRingbufferGetCurFreeSize(ringbuf_);
 }

+auto PcmBuffer::suspend(bool s) -> void {  // Enables (true) or lifts (false) suspension.
+  suspended_ = s;  // While set, receive() returns early without reading the ring buffer.
+}
+
 auto PcmBuffer::totalSent() -> uint32_t {  // Returns a snapshot of the atomic sent_ counter.
  return sent_;
 }
--- a/src/tangara/audio/audio_events.hpp
+++ b/src/tangara/audio/audio_events.hpp
@ -144,8 +144,11 @@ struct OutputModeChanged : tinyfsm::Event {
  std::optional<drivers::NvsStorage::Output> set_to;
 };

-namespace internal {
+struct TtsPlaybackChanged : tinyfsm::Event {  // Dispatched by tts::Player around TTS playback.
+  bool is_playing;  // true just before decoding starts; false once an uncancelled stream ends.
+};

+namespace internal {
 struct DecodingStarted : tinyfsm::Event {  // NOTE(review): presumably sent when decoding of `track` begins — confirm.
  std::shared_ptr<TrackInfo> track;
 };
--- a/src/tangara/audio/audio_fsm.cpp
+++ b/src/tangara/audio/audio_fsm.cpp
@ -76,6 +76,7 @@ std::optional<IAudioOutput::Format> AudioState::sDrainFormat;
 StreamCues AudioState::sStreamCues;

 bool AudioState::sIsPaused = true;
+bool AudioState::sIsTtsPlaying = false;

 auto AudioState::emitPlaybackUpdate(bool paused) -> void {
  std::optional<uint32_t> position;
@ -191,6 +192,11 @@ void AudioState::react(const TogglePlayPause& ev) {
  }
 }

+void AudioState::react(const TtsPlaybackChanged& ev) {  // Records TTS activity.
+  sIsTtsPlaying = ev.is_playing;
+  updateOutputMode();  // Re-evaluate now: TTS alone is enough to select kOnPlaying.
+}
+
 void AudioState::react(const internal::DecodingFinished& ev) {
  // If we just finished playing whatever's at the front of the queue, then we
  // need to advance and start playing the next one ASAP in order to continue
@ -369,8 +375,8 @@ void AudioState::react(const OutputModeChanged& ev) {
      sOutput = sI2SOutput;
      break;
  }
-  sOutput->mode(IAudioOutput::Modes::kOnPaused);
  sSampleProcessor->SetOutput(sOutput);
+  updateOutputMode();

  // Bluetooth volume isn't 'changed' until we've connected to a device.
  if (new_mode == drivers::NvsStorage::Output::kHeadphones) {
@ -381,6 +387,14 @@ void AudioState::react(const OutputModeChanged& ev) {
  }
 }

+auto AudioState::updateOutputMode() -> void {  // Sets sOutput's mode from the FSM state and TTS flag.
+  if (is_in_state<states::Playback>() || sIsTtsPlaying) {  // Either track playback or TTS is active.
+    sOutput->mode(IAudioOutput::Modes::kOnPlaying);  // NOTE(review): sOutput is dereferenced unchecked — confirm it is always set by now.
+  } else {
+    sOutput->mode(IAudioOutput::Modes::kOnPaused);
+  }
+}
+
 auto AudioState::commitVolume() -> void {
  auto mode = sServices->nvs().OutputMode();
  auto vol = sOutput->GetVolume();
@ -402,6 +416,7 @@ void Uninitialised::react(const system_fsm::BootComplete& ev) {

  sDrainBuffers = std::make_unique<drivers::OutputBuffers>(
      kTrackDrainLatencySamples, kSystemDrainLatencySamples);
+  sDrainBuffers->first.suspend(true);

  sStreamFactory.reset(
      new FatfsStreamFactory(sServices->database(), sServices->tag_parser()));
@ -454,6 +469,10 @@ void Uninitialised::react(const system_fsm::BootComplete& ev) {
 static const char kQueueKey[] = "audio:queue";
 static const char kCurrentFileKey[] = "audio:current";

+auto Standby::entry() -> void {  // Re-evaluates the output mode when Standby is entered.
+  updateOutputMode();  // NOTE(review): presumably yields kOnPaused unless TTS is active — confirm.
+}
+
 void Standby::react(const system_fsm::KeyLockChanged& ev) {
  if (!ev.locking) {
    return;
@ -539,7 +558,8 @@ static void heartbeat(TimerHandle_t) {

 void Playback::entry() {
  ESP_LOGI(kTag, "audio output resumed");
-  sOutput->mode(IAudioOutput::Modes::kOnPlaying);
+  sDrainBuffers->first.suspend(false);
+  updateOutputMode();
  emitPlaybackUpdate(false);

  if (!sHeartbeatTimer) {
@ -552,7 +572,7 @@ void Playback::entry() {
 void Playback::exit() {  // Runs when the FSM leaves the Playback state.
  ESP_LOGI(kTag, "audio output paused");
  xTimerStop(sHeartbeatTimer, portMAX_DELAY);
-  sOutput->mode(IAudioOutput::Modes::kOnPaused);
+  sDrainBuffers->first.suspend(true);  // Mode is no longer set here; updateOutputMode() decides it (e.g. from Standby::entry()).
  emitPlaybackUpdate(true);
 }

--- a/src/tangara/audio/audio_fsm.hpp
+++ b/src/tangara/audio/audio_fsm.hpp
@ -48,6 +48,7 @@ class AudioState : public tinyfsm::Fsm<AudioState> {
  void react(const PlaySineWave&);
  void react(const SetTrack&);
  void react(const TogglePlayPause&);
+  void react(const TtsPlaybackChanged&);

  void react(const internal::DecodingFinished&);
  void react(const internal::StreamStarted&);
@ -70,6 +71,7 @@ class AudioState : public tinyfsm::Fsm<AudioState> {
  virtual void react(const system_fsm::HasPhonesChanged&);

 protected:
+  auto updateOutputMode() -> void;
  auto emitPlaybackUpdate(bool paused) -> void;
  auto commitVolume() -> void;

@ -88,6 +90,7 @@ class AudioState : public tinyfsm::Fsm<AudioState> {
  static std::optional<IAudioOutput::Format> sDrainFormat;

  static bool sIsPaused;
+  static bool sIsTtsPlaying;
 };

 namespace states {
@ -102,6 +105,7 @@ class Uninitialised : public AudioState {

 class Standby : public AudioState {
 public:
+  void entry() override;
  void react(const system_fsm::KeyLockChanged&) override;
  void react(const system_fsm::SdStateChanged&) override;

--- a/src/tangara/tts/player.cpp
+++ b/src/tangara/tts/player.cpp
@ -5,11 +5,14 @@
 */

 #include "tts/player.hpp"
+#include <mutex>

+#include "audio/audio_events.hpp"
 #include "audio/processor.hpp"
 #include "audio/resample.hpp"
 #include "codec.hpp"
 #include "esp_log.h"
+#include "events/event_queue.hpp"
 #include "freertos/projdefs.h"
 #include "portmacro.h"
 #include "sample.hpp"
@ -22,47 +25,70 @@ namespace tts {
 Player::Player(tasks::WorkerPool& worker,
               drivers::PcmBuffer& output,
               audio::FatfsStreamFactory& factory)
-    : bg_(worker), stream_factory_(factory), output_(output), play_count_(0) {}
+    : bg_(worker),
+      stream_factory_(factory),
+      output_(output),
+      stream_playing_(false),  // No playback task is active initially.
+      stream_cancelled_(false) {}  // No cancellation is pending initially.

 auto Player::playFile(const std::string& path) -> void {
  ESP_LOGI(kTag, "playing '%s'", path.c_str());
-  int this_play = ++play_count_;

  bg_.Dispatch<void>([=, this]() {
-    auto stream = stream_factory_.create(path);
-    if (!stream) {
-      ESP_LOGE(kTag, "creating stream failed");
-      return;
+    // Interrupt any playback already in progress, then claim the player for
+    // this task.
+    {
+      std::scoped_lock<std::mutex> lock{new_stream_mutex_};  // Held only for this handover block.
+      if (stream_playing_) {
+        stream_cancelled_ = true;  // Seen by the decode and drain loops in decodeToSink().
+        stream_playing_.wait(true);  // Blocks until the running task clears the flag and notifies.
+      }
+      stream_cancelled_ = false;
+      stream_playing_ = true;
    }

-    // FIXME: Rather than hardcoding WAV support only, we should work out a
-    // proper subset of 'low memory' decoders that can all be used for TTS
-    // playback.
-    if (stream->type() != codecs::StreamType::kWav) {
-      ESP_LOGE(kTag, "stream was unsupported type");
-      return;
-    }
+    openAndDecode(path);

-    auto decoder = codecs::CreateCodecForType(stream->type());
-    if (!decoder) {
-      ESP_LOGE(kTag, "creating decoder failed");
-      return;
+    if (!stream_cancelled_) {  // If cancelled, a successor is waiting, so skip the "stopped" event.
+      events::Audio().Dispatch(audio::TtsPlaybackChanged{.is_playing = false});
    }
+    stream_playing_ = false;
+    stream_playing_.notify_all();  // Wakes a task blocked in stream_playing_.wait(true) above.
+  });
+}

-    std::unique_ptr<codecs::ICodec> codec{*decoder};
-    auto open_res = codec->OpenStream(stream, 0);
-    if (open_res.has_error()) {
-      ESP_LOGE(kTag, "opening stream failed");
-      return;
-    }
+auto Player::openAndDecode(const std::string& path) -> void {  // Opens `path`, then decodes it via decodeToSink().
+  auto stream = stream_factory_.create(path);
+  if (!stream) {  // Every failure below is logged and ends the function early.
+    ESP_LOGE(kTag, "creating stream failed");
+    return;
+  }

-    decodeToSink(*open_res, std::move(codec), this_play);
-  });
+  // FIXME: Rather than hardcoding WAV support only, we should work out a
+  // proper subset of 'low memory' decoders that can all be used for TTS
+  // playback.
+  if (stream->type() != codecs::StreamType::kWav) {
+    ESP_LOGE(kTag, "stream was unsupported type");
+    return;
+  }
+
+  auto decoder = codecs::CreateCodecForType(stream->type());
+  if (!decoder) {
+    ESP_LOGE(kTag, "creating decoder failed");
+    return;
+  }
+
+  std::unique_ptr<codecs::ICodec> codec{*decoder};  // presumably takes ownership of the codec returned by the factory — confirm.
+  auto open_res = codec->OpenStream(stream, 0);
+  if (open_res.has_error()) {
+    ESP_LOGE(kTag, "opening stream failed");
+    return;
+  }
+
+  decodeToSink(*open_res, std::move(codec));  // Returns once output_ has drained or been cleared on cancel.
 }

 auto Player::decodeToSink(const codecs::ICodec::OutputFormat& format,
-                          std::unique_ptr<codecs::ICodec> codec,
-                          int play_count) -> void {
+                          std::unique_ptr<codecs::ICodec> codec) -> void {
  // Set up buffers to hold samples between the intermediary parts of
  // processing. We can just use the stack for these, since this method is
  // called only from background workers, which have enormous stacks.
@ -83,20 +109,18 @@ auto Player::decodeToSink(const codecs::ICodec::OutputFormat& format,
  }
  bool double_samples = format.num_channels == 1;

+  // Start our playback (wait for previous to end?)
+  events::Audio().Dispatch(audio::TtsPlaybackChanged{.is_playing = true});
+
  // FIXME: This decode-and-process loop is substantially the same as the audio
  // processor's filter loop. Ideally we should refactor both of these loops to
  // reuse code, however I'm holding off on doing this until we've implemented
  // more advanced audio processing features in the audio processor (EQ, tempo
  // shifting, etc.) as it's not clear to me yet how much the two codepaths will
  // be diverging later anyway.
-  while (codec || !decode_buf.isEmpty() || !resample_buf.isEmpty() ||
-         !stereo_buf.isEmpty()) {
-    if (play_count != play_count_) {
-      // FIXME: This is a little unsafe and could maybe take out the first few
-      // samples of the next file.
-      output_.clear();
-      break;
-    }
+  while ((codec || !decode_buf.isEmpty() || !resample_buf.isEmpty() ||
+          !stereo_buf.isEmpty()) &&
+         !stream_cancelled_) {
    if (codec) {
      auto decode_res = codec->DecodeTo(decode_buf.writeAcquire());
      if (decode_res.has_error()) {
@ -156,6 +180,14 @@ auto Player::decodeToSink(const codecs::ICodec::OutputFormat& format,
      stereo_buf.readCommit(sent);
    }
  }
+
+  while (!output_.isEmpty()) {
+    if (stream_cancelled_) {
+      output_.clear();
+    } else {
+      vTaskDelay(pdMS_TO_TICKS(100));
+    }
+  }
 }

 }  // namespace tts
--- a/src/tangara/tts/player.hpp
+++ b/src/tangara/tts/player.hpp
@ -35,11 +35,13 @@ class Player {
  audio::FatfsStreamFactory& stream_factory_;
  drivers::PcmBuffer& output_;

-  std::atomic<int> play_count_;
+  std::mutex new_stream_mutex_;
+  std::atomic<bool> stream_playing_;
+  std::atomic<bool> stream_cancelled_;

+  auto openAndDecode(const std::string& path) -> void;
  auto decodeToSink(const codecs::ICodec::OutputFormat&,
-                    std::unique_ptr<codecs::ICodec>,
-                    int play_count) -> void;
+                    std::unique_ptr<codecs::ICodec>) -> void;
 };

 }  // namespace tts