From 9b1b401dcb986a26d10bcc898be670653acc2d3f Mon Sep 17 00:00:00 2001
From: jacqueline <me@jacqueline.id.au>
Date: Wed, 26 Jul 2023 17:11:23 +1000
Subject: [PATCH] big cleanup of new encoder + stream buffer types

---
 src/audio/CMakeLists.txt                |   2 +-
 src/audio/audio_task.cpp                | 341 +++++++++++++++---------
 src/audio/fatfs_audio_input.cpp         |  76 +++---
 src/audio/i2s_audio_output.cpp          |   9 +-
 src/audio/include/audio_sink.hpp        |   2 +-
 src/audio/include/audio_source.hpp      |  20 +-
 src/audio/include/audio_task.hpp        |  18 +-
 src/audio/include/fatfs_audio_input.hpp |  13 +-
 src/audio/include/i2s_audio_output.hpp  |   2 +-
 src/audio/include/stream_info.hpp       |  72 ++---
 src/audio/stream_info.cpp               |  75 +++---
 src/main/main.cpp                       |   4 +
 12 files changed, 361 insertions(+), 273 deletions(-)

diff --git a/src/audio/CMakeLists.txt b/src/audio/CMakeLists.txt
index 6ef144ac..428ea691 100644
--- a/src/audio/CMakeLists.txt
+++ b/src/audio/CMakeLists.txt
@@ -5,7 +5,7 @@
 idf_component_register(
   SRCS "audio_task.cpp" "chunk.cpp" "fatfs_audio_input.cpp"
   "stream_message.cpp" "i2s_audio_output.cpp" "stream_buffer.cpp" "track_queue.cpp"
-  "stream_event.cpp" "pipeline.cpp" "stream_info.cpp" "audio_fsm.cpp"
+  "stream_event.cpp" "stream_info.cpp" "audio_fsm.cpp"
   INCLUDE_DIRS "include"
   REQUIRES "codecs" "drivers" "cbor" "result" "tasks" "span" "memory" "tinyfsm" "database" "system_fsm" "playlist")
 
diff --git a/src/audio/audio_task.cpp b/src/audio/audio_task.cpp
index ae4964a6..6079e077 100644
--- a/src/audio/audio_task.cpp
+++ b/src/audio/audio_task.cpp
@@ -21,6 +21,7 @@
 #include "audio_events.hpp"
 #include "audio_fsm.hpp"
 #include "audio_sink.hpp"
+#include "audio_source.hpp"
 #include "cbor.h"
 #include "codec.hpp"
 #include "esp_err.h"
@@ -43,6 +44,7 @@
 #include "stream_message.hpp"
 #include "sys/_stdint.h"
 #include "tasks.hpp"
+#include "types.hpp"
 #include "ui_fsm.hpp"
 
 namespace audio {
@@ -62,7 +64,7 @@ auto Timer::SetLengthSeconds(uint32_t len) -> void {
 }
 
 auto Timer::SetLengthBytes(uint32_t len) -> void {
-  total_duration_seconds_ = 0;
+  total_duration_seconds_ = bytes_to_samples(len) / format_.sample_rate;
 }
 
 auto Timer::AddBytes(std::size_t bytes) -> void {
@@ -84,14 +86,29 @@ auto Timer::AddBytes(std::size_t bytes) -> void {
   }
 
   if (incremented) {
-    // ESP_LOGI("timer", "new time %lu", current_seconds_);
+    if (total_duration_seconds_ < current_seconds_) {
+      total_duration_seconds_ = current_seconds_;
+    }
+
     events::Audio().Dispatch(PlaybackUpdate{
         .seconds_elapsed = current_seconds_,
-        .seconds_total = 0,
+        .seconds_total = total_duration_seconds_,
     });
   }
 }
 
+// Converts a length in bytes of PCM data into samples per channel.
+auto Timer::bytes_to_samples(uint32_t bytes) -> uint32_t {
+  // Samples must be aligned to 16 bits. The number of actual bytes per
+  // sample is therefore the bps divided by 16, rounded up (align to word),
+  // times two (convert to bytes).
+  uint32_t bytes_per_sample = ((format_.bits_per_sample + 16 - 1) / 16) * 2;
+  uint32_t bytes_per_frame = bytes_per_sample * format_.channels;
+  // A format with no channels or a zero bit depth would otherwise make the
+  // division below divide by zero.
+  return bytes_per_frame == 0 ? 0 : bytes / bytes_per_frame;
+}
+
 auto AudioTask::Start(IAudioSource* source, IAudioSink* sink) -> AudioTask* {
   AudioTask* task = new AudioTask(source, sink);
   tasks::StartPersistent<tasks::Type::kAudio>([=]() { task->Main(); });
@@ -103,7 +120,7 @@ AudioTask::AudioTask(IAudioSource* source, IAudioSink* sink)
       sink_(sink),
       codec_(),
       timer_(),
-      is_new_stream_(false),
+      has_begun_decoding_(false),
       current_input_format_(),
       current_output_format_(),
       sample_buffer_(reinterpret_cast<std::byte*>(
@@ -114,147 +131,213 @@ AudioTask::AudioTask(IAudioSource* source, IAudioSink* sink)
 void AudioTask::Main() {
   for (;;) {
     source_->Read(
-        [this](StreamInfo::Format format) -> bool {
-          if (current_input_format_ && format == *current_input_format_) {
-            // This is the continuation of previous data. We can handle it if
-            // we are able to decode it, or if it doesn't need decoding.
-            return current_output_format_ == format || codec_ != nullptr;
-          }
-          // This must be a new stream of data. Reset everything to prepare to
-          // handle it.
-          current_input_format_ = format;
-          is_new_stream_ = true;
-          codec_.reset();
-          timer_.reset();
-
-          // What kind of data does this new stream contain?
-          if (std::holds_alternative<StreamInfo::Pcm>(format)) {
-            // It's already decoded! We can handle this immediately if it
-            // matches what we're currently sending to the sink. Otherwise, we
-            // will need to wait for the sink to drain before we can reconfigure
-            // it.
-            if (current_output_format_ && format == *current_output_format_) {
-              return true;
-            } else if (xStreamBufferIsEmpty(sink_->stream())) {
-              return true;
-            } else {
-              return false;
-            }
-          } else if (std::holds_alternative<StreamInfo::Encoded>(format)) {
-            // The stream has some kind of encoding. Whether or not we can
-            // handle it is entirely down to whether or not we have a codec for
-            // it.
-            auto encoding = std::get<StreamInfo::Encoded>(format);
-            auto codec = codecs::CreateCodecForType(encoding.type);
-            if (codec) {
-              ESP_LOGI(kTag, "successfully created codec for stream");
-              codec_.reset(*codec);
-              return true;
-            } else {
-              ESP_LOGE(kTag, "stream has unknown encoding");
-              return false;
+        [this](IAudioSource::Flags flags, InputStream& stream) -> void {
+          if (flags.is_start()) {
+            has_begun_decoding_ = false;
+            if (!HandleNewStream(stream)) {
+              return;
             }
-          } else {
-            // programmer error / skill issue :(
-            ESP_LOGE(kTag, "stream has unknown format");
-            current_input_format_ = format;
-            return false;
           }
-        },
-        [this](cpp::span<const std::byte> bytes) -> size_t {
-          // PCM streams are simple, so handle them first.
-          if (std::holds_alternative<StreamInfo::Pcm>(*current_input_format_)) {
-            // First we need to reconfigure the sink for this sample format.
-            // TODO(jacqueline): We should verify whether or not the sink can
-            // actually deal with this format first.
-            if (current_input_format_ != current_output_format_) {
-              current_output_format_ = current_input_format_;
-              sink_->Configure(*current_output_format_);
-              timer_.reset(new Timer(
-                  std::get<StreamInfo::Pcm>(*current_output_format_)));
+
+          auto pcm = stream.info().format_as<StreamInfo::Pcm>();
+          if (pcm) {
+            if (ForwardPcmStream(*pcm, stream.data())) {
+              stream.consume(stream.data().size_bytes());
             }
-            // Stream the raw samples directly to the sink.
-            xStreamBufferSend(sink_->stream(), bytes.data(), bytes.size_bytes(),
-                              portMAX_DELAY);
-            timer_->AddBytes(bytes.size_bytes());
-            return bytes.size_bytes();
+            timer_->SetLengthBytes(
+                stream.info().total_length_bytes().value_or(0));
+            return;
           }
-          // Else, assume it's an encoded stream.
-
-          size_t bytes_used = 0;
-          if (is_new_stream_) {
-            // This is a new stream! First order of business is verifying that
-            // we can indeed decode it.
-            auto res = codec_->BeginStream(bytes);
-            bytes_used += res.first;
-
-            if (res.second.has_error()) {
-              if (res.second.error() != codecs::ICodec::Error::kOutOfInput) {
-                // Decoding the header failed, so we can't actually deal with
-                // this stream after all. It could be malformed.
-                ESP_LOGE(kTag, "error beginning stream");
-                codec_.reset();
-              }
-              return bytes_used;
-            }
-            is_new_stream_ = false;
-
-            codecs::ICodec::OutputFormat format = res.second.value();
-            StreamInfo::Pcm pcm{
-                .channels = format.num_channels,
-                .bits_per_sample = format.bits_per_sample,
-                .sample_rate = format.sample_rate_hz,
-            };
-            StreamInfo::Format new_format{pcm};
-            timer_.reset(new Timer{pcm});
-            if (format.duration_seconds) {
-              timer_->SetLengthSeconds(*format.duration_seconds);
-            }
 
-            // Now that we have the output format for decoded samples from this
-            // stream, we need to see if they are compatible with what's already
-            // in the sink stream.
-            if (new_format != current_output_format_) {
-              // The new format is different to the old one. Wait for the sink
-              // to drain before continuing.
-              while (!xStreamBufferIsEmpty(sink_->stream())) {
-                ESP_LOGI(kTag, "waiting for sink stream to drain...");
-                // TODO(jacqueline): Get the sink drain ISR to notify us of this
-                // via semaphore instead of busy-ish waiting.
-                vTaskDelay(pdMS_TO_TICKS(100));
-              }
-            }
+          if (!stream.info().format_as<StreamInfo::Encoded>() || !codec_) {
+            // Either unknown stream format, or it's encoded but we don't have
+            // a decoder that supports it. Either way, bail out.
+            return;
+          }
 
-            ESP_LOGI(kTag, "configuring sink");
-            current_output_format_ = new_format;
-            sink_->Configure(new_format);
-            timer_.reset(
-                new Timer(std::get<StreamInfo::Pcm>(*current_output_format_)));
+          if (!has_begun_decoding_) {
+            if (BeginDecoding(stream)) {
+              has_begun_decoding_ = true;
+            } else {
+              return;
+            }
           }
 
           // At this point the decoder has been initialised, and the sink has
           // been correctly configured. All that remains is to throw samples
           // into the sink as fast as possible.
-          while (bytes_used < bytes.size_bytes()) {
-            auto res =
-                codec_->ContinueStream(bytes.subspan(bytes_used),
-                                       {sample_buffer_, sample_buffer_len_});
-
-            bytes_used += res.first;
-
-            if (res.second.has_error()) {
-              return bytes_used;
-            } else {
-              xStreamBufferSend(sink_->stream(), sample_buffer_,
-                                res.second->bytes_written, portMAX_DELAY);
-              timer_->AddBytes(res.second->bytes_written);
-            }
+          if (!ContinueDecoding(stream)) {
+            codec_.reset();
           }
 
-          return bytes_used;
+          if (flags.is_end()) {
+            FinishDecoding(stream);
+            events::Audio().Dispatch(internal::InputFileFinished{});
+          }
         },
         portMAX_DELAY);
   }
 }
 
+auto AudioTask::HandleNewStream(const InputStream& stream) -> bool {
+  // A new stream invalidates everything we knew about the previous one, so
+  // remember its format and drop any codec left over from the last stream.
+  const StreamInfo& info = stream.info();
+  current_input_format_ = info.format();
+  codec_.reset();
+
+  // Raw PCM needs no decoder, so it can always be handled.
+  if (info.format_as<StreamInfo::Pcm>()) {
+    return true;
+  }
+
+  auto encoded = info.format_as<StreamInfo::Encoded>();
+  if (!encoded) {
+    // programmer error / skill issue :(
+    ESP_LOGE(kTag, "stream has unknown format");
+    return false;
+  }
+
+  // The stream has some kind of encoding. Whether or not we can handle it is
+  // entirely down to whether or not we have a codec for it.
+  has_begun_decoding_ = false;
+  auto codec = codecs::CreateCodecForType(encoded->type);
+  if (!codec) {
+    ESP_LOGE(kTag, "stream has unknown encoding");
+    return false;
+  }
+
+  ESP_LOGI(kTag, "successfully created codec for stream");
+  codec_.reset(*codec);
+  return true;
+}
+
+auto AudioTask::BeginDecoding(InputStream& stream) -> bool {
+  // Let the codec parse the start of the stream. Whatever it consumed is
+  // dropped from the input regardless of whether it succeeded.
+  auto [bytes_used, result] = codec_->BeginStream(stream.data());
+  stream.consume(bytes_used);
+  if (result.has_error()) {
+    // Running out of input is fine; just return and we will try beginning the
+    // stream again when we have more data. Any other failure means we can't
+    // deal with this stream after all (it could be malformed), so give up on
+    // the codec entirely.
+    if (result.error() != codecs::ICodec::Error::kOutOfInput) {
+      ESP_LOGE(kTag, "error beginning stream");
+      codec_.reset();
+    }
+    return false;
+  }
+
+  const codecs::ICodec::OutputFormat format = result.value();
+  const StreamInfo::Pcm new_format{
+      .channels = format.num_channels,
+      .bits_per_sample = format.bits_per_sample,
+      .sample_rate = format.sample_rate_hz,
+  };
+  if (!ConfigureSink(new_format)) {
+    return false;
+  }
+
+  // Prefer the duration reported by the codec; otherwise derive one from the
+  // stream's total length in bytes (0 when that is unknown).
+  if (format.duration_seconds) {
+    timer_->SetLengthSeconds(*format.duration_seconds);
+  } else {
+    timer_->SetLengthBytes(stream.info().total_length_bytes().value_or(0));
+  }
+  return true;
+}
+
+auto AudioTask::ContinueDecoding(InputStream& stream) -> bool {
+  // Keep decoding until the input is exhausted or the codec reports an error.
+  // Returns false only for errors other than running out of input.
+  for (;;) {
+    if (stream.data().empty()) {
+      return true;
+    }
+    auto [bytes_used, result] = codec_->ContinueStream(
+        stream.data(), {sample_buffer_, sample_buffer_len_});
+    stream.consume(bytes_used);
+
+    if (result.has_error()) {
+      return result.error() == codecs::ICodec::Error::kOutOfInput;
+    }
+
+    // Push the freshly decoded samples to the sink and advance the timer.
+    const auto written = result->bytes_written;
+    xStreamBufferSend(sink_->stream(), sample_buffer_, written, portMAX_DELAY);
+    timer_->AddBytes(written);
+  }
+}
+
+auto AudioTask::FinishDecoding(InputStream& stream) -> void {
+  // HACK: libmad requires each frame passed to it to have an additional
+  // MAD_HEADER_GUARD (8) bytes after the end of the frame. Without these extra
+  // bytes, it will not decode the frame.
+  // This is fine for most of the stream, but at the end of the stream we don't
+  // get a trailing 8 bytes for free, so decode a zero-padded copy instead.
+  auto encoded = stream.info().format_as<StreamInfo::Encoded>();
+  if (codec_ && encoded && encoded->type == codecs::StreamType::kMp3) {
+    ESP_LOGI(kTag, "applying MAD_HEADER_GUARD fix");
+    cpp::span<const std::byte> rest = stream.data();
+    auto mad_buffer = std::make_unique<RawStream>(rest.size_bytes() + 8);
+    // Copy the leftover input, zero the guard bytes after it, and commit both
+    // so that the padded reader sees them. NOTE(review): assumes a fresh
+    // OutputStream::data() spans the whole buffer -- confirm.
+    OutputStream writer{mad_buffer.get()};
+    cpp::span<std::byte> dest = writer.data();
+    auto guard = std::copy(rest.begin(), rest.end(), dest.begin());
+    std::fill(guard, dest.end(), std::byte{0});
+    writer.add(rest.size_bytes() + 8);
+    InputStream padded_stream{mad_buffer.get()};
+    auto res = codec_->ContinueStream(padded_stream.data(),
+                                      {sample_buffer_, sample_buffer_len_});
+    if (res.second.has_error()) {
+      return;
+    }
+    xStreamBufferSend(sink_->stream(), sample_buffer_,
+                      res.second->bytes_written, portMAX_DELAY);
+    timer_->AddBytes(res.second->bytes_written);
+  }
+}
+
+auto AudioTask::ForwardPcmStream(StreamInfo::Pcm& format,
+                                 cpp::span<const std::byte> samples) -> bool {
+  // Reconfigure the sink only when the format differs from the one currently
+  // in use (ConfigureSink also replaces the timer).
+  const bool needs_reconfigure = format != current_output_format_;
+  if (needs_reconfigure && !ConfigureSink(format)) {
+    return false;
+  }
+  // Samples are already decoded, so hand them to the sink untouched and
+  // account for them in the playback timer.
+  const std::size_t len = samples.size_bytes();
+  xStreamBufferSend(sink_->stream(), samples.data(), len, portMAX_DELAY);
+  timer_->AddBytes(len);
+  return true;
+}
+
+auto AudioTask::ConfigureSink(const StreamInfo::Pcm& format) -> bool {
+  const bool format_changed = format != current_output_format_;
+  if (format_changed) {
+    // The new format differs from the current one. Wait for the sink to
+    // drain before switching over.
+    // TODO(jacqueline): Get the sink drain ISR to notify us of this
+    // via semaphore instead of busy-ish waiting.
+    while (xStreamBufferIsEmpty(sink_->stream()) == pdFALSE) {
+      ESP_LOGI(kTag, "waiting for sink stream to drain...");
+      vTaskDelay(pdMS_TO_TICKS(100));
+    }
+    ESP_LOGI(kTag, "configuring sink");
+    if (!sink_->Configure(format)) {
+      return false;
+    }
+  }
+  // The format is recorded and a fresh timer installed even if nothing changed.
+  current_output_format_ = format;
+  timer_.reset(new Timer(format));
+  return true;
+}
+
 }  // namespace audio
diff --git a/src/audio/fatfs_audio_input.cpp b/src/audio/fatfs_audio_input.cpp
index 811c2702..d5df61df 100644
--- a/src/audio/fatfs_audio_input.cpp
+++ b/src/audio/fatfs_audio_input.cpp
@@ -145,21 +145,15 @@ FatfsAudioInput::FatfsAudioInput(
       has_data_(xSemaphoreCreateBinary()),
       streamer_buffer_(xStreamBufferCreate(kStreamerBufferSize, 1)),
       streamer_(new FileStreamer(streamer_buffer_, has_data_)),
-      file_buffer_info_(),
-      file_buffer_len_(kFileBufferSize),
-      file_buffer_(reinterpret_cast<std::byte*>(
-          heap_caps_malloc(file_buffer_len_,
-                           MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL))),
-      file_buffer_stream_(&file_buffer_info_, {file_buffer_, file_buffer_len_}),
+      input_buffer_(new RawStream(kFileBufferSize)),
       source_mutex_(),
       pending_path_(),
-      current_format_() {}
+      is_first_read_(false) {}
 
 FatfsAudioInput::~FatfsAudioInput() {
   streamer_.reset();
   vStreamBufferDelete(streamer_buffer_);
   vSemaphoreDelete(has_data_);
-  free(file_buffer_);
 }
 
 auto FatfsAudioInput::SetPath(std::future<std::optional<std::string>> fut)
@@ -185,10 +179,8 @@ auto FatfsAudioInput::SetPath() -> void {
   CloseCurrentFile();
 }
 
-auto FatfsAudioInput::Read(
-    std::function<bool(StreamInfo::Format)> can_read,
-    std::function<size_t(cpp::span<const std::byte>)> read,
-    TickType_t max_wait) -> void {
+auto FatfsAudioInput::Read(std::function<void(Flags, InputStream&)> read_cb,
+                           TickType_t max_wait) -> void {
   // Wait until we have data to return.
   xSemaphoreTake(has_data_, portMAX_DELAY);
 
@@ -205,7 +197,7 @@ auto FatfsAudioInput::Read(
     auto res = pending_path_->Result();
     pending_path_.reset();
 
-    if (res || *res) {
+    if (res && *res) {
       OpenFile(**res);
     }
 
@@ -217,28 +209,22 @@ auto FatfsAudioInput::Read(
   // Move data from the file streamer's buffer into our file buffer. We need our
   // own buffer so that we can handle concatenating smaller file chunks into
   // complete frames for the decoder.
-  OutputStream writer{&file_buffer_stream_};
+  OutputStream writer{input_buffer_.get()};
   std::size_t bytes_added =
       xStreamBufferReceive(streamer_buffer_, writer.data().data(),
                            writer.data().size_bytes(), pdMS_TO_TICKS(0));
   writer.add(bytes_added);
 
-  // HACK: libmad needs at least MAD_HEADER_GUARD (= 8) extra bytes following a
-  // frame, or else it refuses to decode it.
-  if (IsCurrentFormatMp3() && !HasDataRemaining()) {
-    ESP_LOGI(kTag, "applying MAD_HEADER_GUARD fix");
-    cpp::span<std::byte> buf = writer.data();
-    size_t pad_amount = std::min<size_t>(buf.size_bytes(), 8);
-    std::fill_n(buf.begin(), pad_amount, static_cast<std::byte>(0));
-  }
+  bool has_data_remaining = HasDataRemaining();
 
-  InputStream reader{&file_buffer_stream_};
+  InputStream reader{input_buffer_.get()};
   auto data_for_cb = reader.data();
-  if (!data_for_cb.empty() && std::invoke(can_read, *current_format_)) {
-    reader.consume(std::invoke(read, reader.data()));
+  if (!data_for_cb.empty()) {
+    std::invoke(read_cb, Flags{is_first_read_, !has_data_remaining}, reader);
+    is_first_read_ = false;
   }
 
-  if (!HasDataRemaining()) {
+  if (!has_data_remaining) {
     // Out of data. We're finished. Note we don't care about anything left in
     // the file buffer at this point; the callback as seen it, so if it didn't
     // consume it then presumably whatever is left isn't enough to form a
@@ -273,18 +259,19 @@ auto FatfsAudioInput::OpenFile(const std::string& path) -> void {
     return;
   }
 
-  if (*stream_type == codecs::StreamType::kPcm && tags.channels &&
-      tags.bits_per_sample && tags.channels) {
-    current_format_ = StreamInfo::Pcm{
-        .channels = static_cast<uint8_t>(*tags.channels),
-        .bits_per_sample = static_cast<uint8_t>(*tags.bits_per_sample),
-        .sample_rate = static_cast<uint32_t>(*tags.sample_rate),
-    };
+  StreamInfo::Format format;
+  if (*stream_type == codecs::StreamType::kPcm) {
+    // Every tag dereferenced below must be present, sample rate included.
+    if (!tags.channels || !tags.bits_per_sample || !tags.sample_rate) {
+      ESP_LOGW(kTag, "pcm stream missing format info");
+      return;
+    }
+    format = StreamInfo::Pcm{
+        .channels = static_cast<uint8_t>(*tags.channels),
+        .bits_per_sample = static_cast<uint8_t>(*tags.bits_per_sample),
+        .sample_rate = static_cast<uint32_t>(*tags.sample_rate)};
   } else {
-    current_format_ = StreamInfo::Encoded{
-        .type = *stream_type,
-        .duration_bytes = info.fsize,
-    };
+    format = StreamInfo::Encoded{.type = *stream_type};
   }
 
   std::unique_ptr<FIL> file = std::make_unique<FIL>();
@@ -294,15 +281,17 @@ auto FatfsAudioInput::OpenFile(const std::string& path) -> void {
     return;
   }
 
-  streamer_->Restart(std::move(file));
+  OutputStream writer{input_buffer_.get()};
+  writer.prepare(format, info.fsize);
 
+  streamer_->Restart(std::move(file));
+  is_first_read_ = true;
   events::Audio().Dispatch(internal::InputFileOpened{});
 }
 
 auto FatfsAudioInput::CloseCurrentFile() -> void {
   streamer_->Restart({});
   xStreamBufferReset(streamer_buffer_);
-  current_format_ = {};
 }
 
 auto FatfsAudioInput::HasDataRemaining() -> bool {
@@ -327,14 +316,11 @@ auto FatfsAudioInput::ContainerToStreamType(database::Encoding enc)
 }
 
 auto FatfsAudioInput::IsCurrentFormatMp3() -> bool {
-  if (!current_format_) {
-    return false;
-  }
-  if (!std::holds_alternative<StreamInfo::Encoded>(*current_format_)) {
+  auto format = input_buffer_->info().format_as<StreamInfo::Encoded>();
+  if (!format) {
     return false;
   }
-  return std::get<StreamInfo::Encoded>(*current_format_).type ==
-         codecs::StreamType::kMp3;
+  return format->type == codecs::StreamType::kMp3;
 }
 
 }  // namespace audio
diff --git a/src/audio/i2s_audio_output.cpp b/src/audio/i2s_audio_output.cpp
index 57b5e071..8ce43336 100644
--- a/src/audio/i2s_audio_output.cpp
+++ b/src/audio/i2s_audio_output.cpp
@@ -114,14 +114,7 @@ auto I2SAudioOutput::AdjustVolumeDown() -> bool {
   return true;
 }
 
-auto I2SAudioOutput::Configure(const StreamInfo::Format& format) -> bool {
-  if (!std::holds_alternative<StreamInfo::Pcm>(format)) {
-    ESP_LOGI(kTag, "ignoring non-pcm stream (%d)", format.index());
-    return false;
-  }
-
-  StreamInfo::Pcm pcm = std::get<StreamInfo::Pcm>(format);
-
+auto I2SAudioOutput::Configure(const StreamInfo::Pcm& pcm) -> bool {
   if (current_config_ && pcm == *current_config_) {
     ESP_LOGI(kTag, "ignoring unchanged format");
     return true;
diff --git a/src/audio/include/audio_sink.hpp b/src/audio/include/audio_sink.hpp
index c9124688..261f7c79 100644
--- a/src/audio/include/audio_sink.hpp
+++ b/src/audio/include/audio_sink.hpp
@@ -38,7 +38,7 @@ class IAudioSink {
   virtual auto AdjustVolumeUp() -> bool = 0;
   virtual auto AdjustVolumeDown() -> bool = 0;
 
-  virtual auto Configure(const StreamInfo::Format& format) -> bool = 0;
+  virtual auto Configure(const StreamInfo::Pcm& format) -> bool = 0;
   virtual auto Send(const cpp::span<std::byte>& data) -> void = 0;
 
   auto stream() -> StreamBufferHandle_t { return stream_; }
diff --git a/src/audio/include/audio_source.hpp b/src/audio/include/audio_source.hpp
index e062fd1a..115f8bf4 100644
--- a/src/audio/include/audio_source.hpp
+++ b/src/audio/include/audio_source.hpp
@@ -8,6 +8,7 @@
 
 #include <stdint.h>
 
+#include <bitset>
 #include <memory>
 
 #include "freertos/FreeRTOS.h"
@@ -22,12 +23,25 @@ class IAudioSource {
  public:
   virtual ~IAudioSource() {}
 
+  // Marks whether a chunk is the first and/or the last of its stream.
+  class Flags {
+   public:
+    Flags(bool is_start, bool is_end) {
+      flags_[0] = is_start;
+      flags_[1] = is_end;
+    }
+    auto is_start() const -> bool { return flags_[0]; }
+    auto is_end() const -> bool { return flags_[1]; }
+
+   private:
+    std::bitset<2> flags_;
+  };
+
   /*
    * Synchronously fetches data from this source.
    */
-  virtual auto Read(std::function<bool(StreamInfo::Format)>,
-                    std::function<size_t(cpp::span<const std::byte>)>,
-                    TickType_t) -> void = 0;
+  virtual auto Read(std::function<void(Flags, InputStream&)>, TickType_t)
+      -> void = 0;
 };
 
 }  // namespace audio
diff --git a/src/audio/include/audio_task.hpp b/src/audio/include/audio_task.hpp
index f80c8878..ae4c2221 100644
--- a/src/audio/include/audio_task.hpp
+++ b/src/audio/include/audio_task.hpp
@@ -14,6 +14,7 @@
 #include "audio_source.hpp"
 #include "codec.hpp"
 #include "pipeline.hpp"
+#include "stream_info.hpp"
 
 namespace audio {
 
@@ -27,10 +28,13 @@ class Timer {
   auto AddBytes(std::size_t) -> void;
 
  private:
+  auto bytes_to_samples(uint32_t) -> uint32_t;
+
   StreamInfo::Pcm format_;
 
   uint32_t current_seconds_;
   uint32_t current_sample_in_second_;
+
   uint32_t total_duration_seconds_;
 };
 
@@ -43,14 +47,24 @@ class AudioTask {
  private:
   AudioTask(IAudioSource* source, IAudioSink* sink);
 
+  auto HandleNewStream(const InputStream&) -> bool;
+
+  auto BeginDecoding(InputStream&) -> bool;
+  auto ContinueDecoding(InputStream&) -> bool;
+  auto FinishDecoding(InputStream&) -> void;
+
+  auto ForwardPcmStream(StreamInfo::Pcm&, cpp::span<const std::byte>) -> bool;
+
+  auto ConfigureSink(const StreamInfo::Pcm&) -> bool;
+
   IAudioSource* source_;
   IAudioSink* sink_;
   std::unique_ptr<codecs::ICodec> codec_;
   std::unique_ptr<Timer> timer_;
 
-  bool is_new_stream_;
+  bool has_begun_decoding_;
   std::optional<StreamInfo::Format> current_input_format_;
-  std::optional<StreamInfo::Format> current_output_format_;
+  std::optional<StreamInfo::Pcm> current_output_format_;
 
   std::byte* sample_buffer_;
   std::size_t sample_buffer_len_;
diff --git a/src/audio/include/fatfs_audio_input.hpp b/src/audio/include/fatfs_audio_input.hpp
index a1b9689b..e13e49e2 100644
--- a/src/audio/include/fatfs_audio_input.hpp
+++ b/src/audio/include/fatfs_audio_input.hpp
@@ -89,9 +89,8 @@ class FatfsAudioInput : public IAudioSource {
   auto SetPath(const std::string&) -> void;
   auto SetPath() -> void;
 
-  auto Read(std::function<bool(StreamInfo::Format)>,
-            std::function<size_t(cpp::span<const std::byte>)>,
-            TickType_t) -> void override;
+  auto Read(std::function<void(Flags, InputStream&)>, TickType_t)
+      -> void override;
 
   FatfsAudioInput(const FatfsAudioInput&) = delete;
   FatfsAudioInput& operator=(const FatfsAudioInput&) = delete;
@@ -118,11 +117,7 @@ class FatfsAudioInput : public IAudioSource {
   StreamBufferHandle_t streamer_buffer_;
   std::unique_ptr<FileStreamer> streamer_;
 
-  StreamInfo file_buffer_info_;
-  std::size_t file_buffer_len_;
-  std::byte* file_buffer_;
-
-  RawStream file_buffer_stream_;
+  std::unique_ptr<RawStream> input_buffer_;
 
   // Mutex guarding the current file/stream associated with this source. Must be
   // held during readings, and before altering the current file.
@@ -130,7 +125,7 @@ class FatfsAudioInput : public IAudioSource {
 
   std::unique_ptr<database::FutureFetcher<std::optional<std::string>>>
       pending_path_;
-  std::optional<StreamInfo::Format> current_format_;
+  bool is_first_read_;
 };
 
 }  // namespace audio
diff --git a/src/audio/include/i2s_audio_output.hpp b/src/audio/include/i2s_audio_output.hpp
index 583a5d6a..d42efc42 100644
--- a/src/audio/include/i2s_audio_output.hpp
+++ b/src/audio/include/i2s_audio_output.hpp
@@ -34,7 +34,7 @@ class I2SAudioOutput : public IAudioSink {
   auto AdjustVolumeUp() -> bool override;
   auto AdjustVolumeDown() -> bool override;
 
-  auto Configure(const StreamInfo::Format& format) -> bool override;
+  auto Configure(const StreamInfo::Pcm& format) -> bool override;
   auto Send(const cpp::span<std::byte>& data) -> void override;
 
   I2SAudioOutput(const I2SAudioOutput&) = delete;
diff --git a/src/audio/include/stream_info.hpp b/src/audio/include/stream_info.hpp
index 00aa1110..77789c24 100644
--- a/src/audio/include/stream_info.hpp
+++ b/src/audio/include/stream_info.hpp
@@ -7,6 +7,7 @@
 #pragma once
 
 #include <stdint.h>
+#include <sys/_stdint.h>
 #include <cstdint>
 #include <optional>
 #include <string>
@@ -25,25 +26,26 @@
 
 namespace audio {
 
-struct StreamInfo {
+class StreamInfo {
+ public:
+  StreamInfo() : bytes_in_stream_(0), total_length_bytes_(), format_() {}
+
   // The number of bytes that are available for consumption within this
   // stream's buffer.
-  std::size_t bytes_in_stream{0};
-
-  bool is_producer_finished = true;
-
-  bool is_consumer_finished = true;
-
-  std::optional<std::uint32_t> duration_seconds;
+  auto bytes_in_stream() -> std::size_t& { return bytes_in_stream_; }
+  auto bytes_in_stream() const -> std::size_t { return bytes_in_stream_; }
 
-  std::optional<std::uint32_t> seek_to_seconds{};
+  auto total_length_bytes() -> std::optional<std::uint32_t>& {
+    return total_length_bytes_;
+  }
+  auto total_length_bytes() const -> std::optional<std::uint32_t> {
+    return total_length_bytes_;
+  }
 
   struct Encoded {
     // The codec that this stream is associated with.
     codecs::StreamType type;
 
-    std::optional<std::size_t> duration_bytes;
-
     bool operator==(const Encoded&) const = default;
   };
 
@@ -59,33 +61,48 @@ struct StreamInfo {
   };
 
   typedef std::variant<std::monostate, Encoded, Pcm> Format;
-  Format format{};
+  auto format() const -> const Format& { return format_; }
+  auto set_format(Format f) -> void { format_ = f; }
+
+  template <typename T>
+  auto format_as() const -> std::optional<T> {
+    if (std::holds_alternative<T>(format_)) {
+      return std::get<T>(format_);
+    }
+    return {};
+  }
 
   bool operator==(const StreamInfo&) const = default;
+
+ private:
+  std::size_t bytes_in_stream_;
+  std::optional<std::uint32_t> total_length_bytes_;
+  Format format_{};
 };
 
+class InputStream;
+class OutputStream;
+
 class RawStream {
  public:
-  StreamInfo* info;
-  cpp::span<std::byte> data;
+  explicit RawStream(std::size_t size);
+  ~RawStream();
 
-  RawStream(StreamInfo* i, cpp::span<std::byte> d) : info(i), data(d) {}
+  auto info() -> StreamInfo& { return info_; }
+  auto data() -> cpp::span<std::byte>;
+
+ private:
+  StreamInfo info_;
+  std::size_t buffer_size_;
+  std::byte* buffer_;
 };
 
-/*
- * A byte buffer + associated metadata, which is not allowed to modify any of
- * the underlying data.
- */
 class InputStream {
  public:
   explicit InputStream(RawStream* s) : raw_(s) {}
 
   void consume(std::size_t bytes) const;
 
-  bool is_producer_finished() const;
-
-  void mark_consumer_finished() const;
-
   const StreamInfo& info() const;
 
   cpp::span<const std::byte> data() const;
@@ -100,18 +117,13 @@ class OutputStream {
 
   void add(std::size_t bytes) const;
 
-  bool prepare(const StreamInfo::Format& new_format);
-
-  void set_duration(std::size_t);
+  void prepare(const StreamInfo::Format& new_format,
+               std::optional<uint32_t> length);
 
   const StreamInfo& info() const;
 
   cpp::span<std::byte> data() const;
 
-  bool is_consumer_finished() const;
-
-  void mark_producer_finished() const;
-
  private:
   RawStream* raw_;
 };
diff --git a/src/audio/stream_info.cpp b/src/audio/stream_info.cpp
index 3927e5f8..6efe297e 100644
--- a/src/audio/stream_info.cpp
+++ b/src/audio/stream_info.cpp
@@ -5,6 +5,7 @@
  */
 
 #include "stream_info.hpp"
+#include <sys/_stdint.h>
 
 #include <cstdint>
 #include <optional>
@@ -14,77 +15,63 @@
 #include <utility>
 #include <variant>
 
+#include "esp_heap_caps.h"
 #include "result.hpp"
 #include "span.hpp"
 #include "types.hpp"
 
 namespace audio {
 
-void InputStream::consume(std::size_t bytes) const {
-  assert(raw_->info->bytes_in_stream >= bytes);
-  auto new_data =
-      raw_->data.subspan(bytes, raw_->info->bytes_in_stream - bytes);
-  std::move(new_data.begin(), new_data.end(), raw_->data.begin());
-  raw_->info->bytes_in_stream = new_data.size_bytes();
+RawStream::RawStream(std::size_t size)  // |size|: buffer capacity in bytes.
+    : info_(),
+      buffer_size_(size),
+      buffer_(static_cast<std::byte*>(
+          heap_caps_malloc(size, MALLOC_CAP_INTERNAL | MALLOC_CAP_8BIT))) {
+  assert(buffer_ != nullptr);  // Trips (when asserts are on) if malloc failed.
+}
+
+RawStream::~RawStream() {
+  heap_caps_free(buffer_);
 }
 
-void InputStream::mark_consumer_finished() const {
-  raw_->info->is_consumer_finished = true;
-  if (is_producer_finished()) {
-    raw_->info->format = std::monostate();
-  }
+auto RawStream::data() -> cpp::span<std::byte> {
+  return {buffer_, buffer_size_};
 }
 
-bool InputStream::is_producer_finished() const {
-  return raw_->info->is_producer_finished;
+void InputStream::consume(std::size_t bytes) const {
+  assert(raw_->info().bytes_in_stream() >= bytes);
+  auto new_data =
+      raw_->data().subspan(bytes, raw_->info().bytes_in_stream() - bytes);
+  std::move(new_data.begin(), new_data.end(), raw_->data().begin());
+  raw_->info().bytes_in_stream() = new_data.size_bytes();
 }
 
 const StreamInfo& InputStream::info() const {
-  return *raw_->info;
+  return raw_->info();
 }
 
 cpp::span<const std::byte> InputStream::data() const {
-  return raw_->data.first(raw_->info->bytes_in_stream);
+  return raw_->data().first(raw_->info().bytes_in_stream());
 }
 
 void OutputStream::add(std::size_t bytes) const {
-  assert(raw_->info->bytes_in_stream + bytes <= raw_->data.size_bytes());
-  raw_->info->bytes_in_stream += bytes;
+  assert(raw_->info().bytes_in_stream() + bytes <= raw_->data().size_bytes());
+  raw_->info().bytes_in_stream() += bytes;
 }
 
-bool OutputStream::prepare(const StreamInfo::Format& new_format) {
-  if (std::holds_alternative<std::monostate>(raw_->info->format) ||
-      raw_->info->is_consumer_finished) {
-    raw_->info->format = new_format;
-    raw_->info->bytes_in_stream = 0;
-    raw_->info->is_producer_finished = false;
-    raw_->info->is_consumer_finished = false;
-    return true;
-  }
-  return false;
-}
-
-void OutputStream::set_duration(std::size_t seconds) {
-  raw_->info->duration_seconds = seconds;
+void OutputStream::prepare(const StreamInfo::Format& new_format,
+                           std::optional<uint32_t> length) {
+  raw_->info().set_format(new_format);
+  raw_->info().bytes_in_stream() = 0;
+  raw_->info().total_length_bytes() = length;
 }
 
 const StreamInfo& OutputStream::info() const {
-  return *raw_->info;
+  return raw_->info();
 }
 
 cpp::span<std::byte> OutputStream::data() const {
-  return raw_->data.subspan(raw_->info->bytes_in_stream);
-}
-
-void OutputStream::mark_producer_finished() const {
-  raw_->info->is_producer_finished = true;
-  if (is_consumer_finished()) {
-    raw_->info->format = std::monostate();
-  }
-}
-
-bool OutputStream::is_consumer_finished() const {
-  return raw_->info->is_consumer_finished;
+  return raw_->data().subspan(raw_->info().bytes_in_stream());
 }
 
 }  // namespace audio
diff --git a/src/main/main.cpp b/src/main/main.cpp
index d283b01d..685e1fc1 100644
--- a/src/main/main.cpp
+++ b/src/main/main.cpp
@@ -20,6 +20,10 @@ extern "C" void app_main(void) {
   ESP_ERROR_CHECK(drivers::init_i2c());
   drivers::Gpios* gpios = system_fsm::SystemState::early_init_gpios();
 
+  // Semaphores must be empty before being added to a queue set. Hence all this
+  // weird early init stuff; by being explicit about initialisation order, we're
+  // able to handle GPIO ISR notifications + system events from the same task,
+  // and a little mess is worth not needing to allocate a whole extra stack.
   QueueSetHandle_t set = xQueueCreateSet(2);
   auto* event_queue = events::queues::SystemAndAudio();
   xQueueAddToSet(event_queue->has_events(), set);