Add vorbis and flac decoders, flesh out codec interface

Vorbis doesn't quite work yet; I'm not sure why. Will pick it up again later.
2 years ago · a2c1dfbabd
parent 1238437717
commit a2c1dfbabd
17 changed files with 577 additions and 137 deletions
--- a/src/audio/audio_decoder.cpp
+++ b/src/audio/audio_decoder.cpp
@ -14,6 +14,7 @@
 #include <memory>
 #include <variant>

+#include "codec.hpp"
 #include "freertos/FreeRTOS.h"

 #include "esp_heap_caps.h"
@ -50,6 +51,9 @@ auto AudioDecoder::ProcessStreamInfo(const StreamInfo& info) -> bool {
  // Reuse the existing codec if we can. This will help with gapless playback,
  // since we can potentially just continue to decode as we were before,
  // without any setup overhead.
+  // TODO(jacqueline): Reconsider this. Smashing streams together at this layer
+  // makes a lot of things harder.
+  /*
  if (current_codec_ != nullptr && current_input_format_) {
    auto cur_encoding = std::get<StreamInfo::Encoded>(*current_input_format_);
    if (cur_encoding.type == encoded.type) {
@ -58,6 +62,7 @@ auto AudioDecoder::ProcessStreamInfo(const StreamInfo& info) -> bool {
      return true;
    }
  }
+  */
  current_input_format_ = info.format;

  ESP_LOGI(kTag, "creating new decoder");
@ -80,68 +85,94 @@ auto AudioDecoder::Process(const std::vector<InputStream>& inputs,
                           OutputStream* output) -> void {
  auto input = inputs.begin();
  const StreamInfo& info = input->info();
-  if (std::holds_alternative<std::monostate>(info.format) ||
-      info.bytes_in_stream == 0) {
-    // TODO(jacqueline): should we clear the stream format?
-    // output->prepare({});
-    return;
-  }

+  // Check whether the input stream's format has changed (or, by extension,
+  // whether this is the first stream).
  if (!current_input_format_ || *current_input_format_ != info.format) {
-    // The input stream has changed! Immediately throw everything away and
-    // start from scratch.
+    ESP_LOGI(kTag, "beginning new stream");
    has_samples_to_send_ = false;
    ProcessStreamInfo(info);
+    auto res = current_codec_->BeginStream(input->data());
+    input->consume(res.first);
+    if (res.second.has_error()) {
+      // TODO(jacqueline): Handle errors.
+      return;
+    }
+
+    // The stream started successfully. Record what format the samples are in.
+    codecs::ICodec::OutputFormat format = res.second.value();
+    current_output_format_ = StreamInfo::Pcm{
+        .channels = format.num_channels,
+        .bits_per_sample = format.bits_per_sample,
+        .sample_rate = format.sample_rate_hz,
+    };
+
+    if (info.seek_to_seconds) {
+      seek_to_sample_ = *info.seek_to_seconds * format.sample_rate_hz;
+    } else {
+      seek_to_sample_.reset();
+    }
  }

-  current_codec_->SetInput(input->data());
+  while (seek_to_sample_) {
+    ESP_LOGI(kTag, "seeking forwards...");
+    auto res = current_codec_->SeekStream(input->data(), *seek_to_sample_);
+    input->consume(res.first);
+    if (res.second.has_error()) {
+      auto err = res.second.error();
+      if (err == codecs::ICodec::Error::kOutOfInput) {
+        return;
+      } else {
+        // TODO(jacqueline): Handle errors.
+        seek_to_sample_.reset();
+      }
+    } else {
+      seek_to_sample_.reset();
+    }
+  }

+  has_input_remaining_ = true;
  while (true) {
-    if (has_samples_to_send_) {
-      auto format = current_codec_->GetOutputFormat();
-      if (format.has_value()) {
-        current_output_format_ = StreamInfo::Pcm{
-            .channels = format->num_channels,
-            .bits_per_sample = format->bits_per_sample,
-            .sample_rate = format->sample_rate_hz,
-        };
-
-        if (!output->prepare(*current_output_format_)) {
-          break;
-        }
-
-        auto write_res = current_codec_->WriteOutputSamples(output->data());
-        output->add(write_res.first);
-        has_samples_to_send_ = !write_res.second;
-
-        if (has_samples_to_send_) {
-          // We weren't able to fit all the generated samples into the output
-          // buffer. Stop trying; we'll finish up during the next pass.
-          break;
-        }
-      }
+    // TODO(jacqueline): Pass through seek info here?
+    if (!output->prepare(*current_output_format_)) {
+      ESP_LOGI(kTag, "waiting for buffer to become free");
+      break;
    }

-    auto res = current_codec_->ProcessNextFrame();
-    if (res.has_error()) {
-      // TODO(jacqueline): Handle errors.
+    auto res = current_codec_->ContinueStream(input->data(), output->data());
+    input->consume(res.first);
+    if (res.second.has_error()) {
+      if (res.second.error() == codecs::ICodec::Error::kOutOfInput) {
+        ESP_LOGW(kTag, "out of input");
+        ESP_LOGW(kTag, "(%u bytes left)", input->data().size_bytes());
+        has_input_remaining_ = false;
+        // We can't be halfway through sending samples if the codec is asking
+        // for more input.
+        has_samples_to_send_ = false;
+        input->mark_incomplete();
+      } else {
+        // TODO(jacqueline): Handle errors.
+        ESP_LOGE(kTag, "codec return fatal error");
+      }
      return;
    }

-    has_input_remaining_ = !res.value();
-    if (!has_input_remaining_) {
-      // We're out of useable data in this buffer. Finish immediately; there's
-      // nothing to send.
-      input->mark_incomplete();
-      break;
-    } else {
-      has_samples_to_send_ = true;
+    ESP_LOGI(kTag, "enc read: %u", res.first);
+
+    codecs::ICodec::OutputInfo out_info = res.second.value();
+    output->add(out_info.bytes_written);
+    has_samples_to_send_ = !out_info.is_finished_writing;
+
+    ESP_LOGI(kTag, "enc wrote: %u", out_info.bytes_written);
+    if (out_info.is_finished_writing) {
+      ESP_LOGI(kTag, "(write finished)");
    }
-  }

-  std::size_t pos = current_codec_->GetInputPosition();
-  if (pos > 0) {
-    input->consume(pos - 1);
+    if (has_samples_to_send_) {
+      // We weren't able to fit all the generated samples into the output
+      // buffer. Stop trying; we'll finish up during the next pass.
+      break;
+    }
  }
 }

--- a/src/audio/audio_task.cpp
+++ b/src/audio/audio_task.cpp
@ -126,7 +126,7 @@ void AudioTaskMain(std::unique_ptr<Pipeline> pipeline, IAudioSink* sink) {

    if (sink_stream.info().bytes_in_stream == 0) {
      // No new bytes to sink, so skip sinking completely.
-      ESP_LOGI(kTag, "no bytes to sink");
+      ESP_LOGW(kTag, "no bytes to sink");
      continue;
    }

--- a/src/audio/fatfs_audio_input.cpp
+++ b/src/audio/fatfs_audio_input.cpp
@ -56,11 +56,13 @@ auto FatfsAudioInput::OpenFile(const std::string& path) -> bool {
  database::SongTags tags;
  if (!tag_parser.ReadAndParseTags(path, &tags)) {
    ESP_LOGE(kTag, "failed to read tags");
-    return false;
+    tags.encoding = database::Encoding::kFlac;
+    // return false;
  }

  auto stream_type = ContainerToStreamType(tags.encoding);
  if (!stream_type.has_value()) {
+    ESP_LOGE(kTag, "couldn't match container to stream");
    return false;
  }

@ -144,8 +146,8 @@ auto FatfsAudioInput::ContainerToStreamType(database::Encoding enc)
      return codecs::StreamType::kPcm;
    case database::Encoding::kFlac:
      return codecs::StreamType::kFlac;
-    case database::Encoding::kOgg:
-      return codecs::StreamType::kOgg;
+    case database::Encoding::kOgg:  // Misnamed; this is Ogg Vorbis.
+      return codecs::StreamType::kVorbis;
    case database::Encoding::kUnsupported:
    default:
      return {};
--- a/src/audio/include/audio_decoder.hpp
+++ b/src/audio/include/audio_decoder.hpp
@ -42,6 +42,7 @@ class AudioDecoder : public IAudioElement {
  std::unique_ptr<codecs::ICodec> current_codec_;
  std::optional<StreamInfo::Format> current_input_format_;
  std::optional<StreamInfo::Format> current_output_format_;
+  std::optional<std::size_t> seek_to_sample_;
  bool has_samples_to_send_;
  bool has_input_remaining_;

--- a/src/audio/include/stream_info.hpp
+++ b/src/audio/include/stream_info.hpp
@ -6,6 +6,7 @@

 #pragma once

+#include <stdint.h>
 #include <cstdint>
 #include <optional>
 #include <string>
@ -30,6 +31,9 @@ struct StreamInfo {
  // generated audio, etc.)
  std::optional<std::size_t> length_bytes{};

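+  // The position, in seconds, that the decoder should seek to within this
+  // stream before it starts producing samples, if any.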
+  std::optional<uint32_t> seek_to_seconds{};
+
  struct Encoded {
    // The codec that this stream is associated with.
    codecs::StreamType type;
--- a/src/codecs/CMakeLists.txt
+++ b/src/codecs/CMakeLists.txt
@ -3,7 +3,7 @@
 # SPDX-License-Identifier: GPL-3.0-only

 idf_component_register(
-  SRCS "codec.cpp" "mad.cpp"
+  SRCS "codec.cpp" "mad.cpp" "foxenflac.cpp" "stbvorbis.cpp"
  INCLUDE_DIRS "include"
  REQUIRES "result" "span" "libmad" "libfoxenflac" "stb_vorbis")

--- a/src/codecs/codec.cpp
+++ b/src/codecs/codec.cpp
@ -8,7 +8,10 @@

 #include <memory>
 #include <optional>
+
+#include "foxenflac.hpp"
 #include "mad.hpp"
+#include "stbvorbis.hpp"
 #include "types.hpp"

 namespace codecs {
@ -17,6 +20,10 @@ auto CreateCodecForType(StreamType type) -> std::optional<ICodec*> {
  switch (type) {
    case StreamType::kMp3:
      return new MadMp3Decoder();
+    case StreamType::kFlac:
+      return new FoxenFlacDecoder();
+    case StreamType::kVorbis:
+      return new StbVorbisDecoder();
    default:
      return {};
  }
--- a/src/codecs/foxenflac.cpp
+++ b/src/codecs/foxenflac.cpp
@ -0,0 +1,80 @@
+/*
+ * Copyright 2023 jacqueline <me@jacqueline.id.au>
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+#include "foxenflac.hpp"
+#include <stdint.h>
+
+#include <cstdlib>
+
+#include "esp_log.h"
+#include "foxen/flac.h"
+
+namespace codecs {
+
+// Allocates the libfoxenflac decoder state via FX_FLAC_ALLOC. The second
+// argument (2) is presumably the maximum channel count -- TODO confirm against
+// the libfoxenflac headers. The result of the allocation is not checked here.
+FoxenFlacDecoder::FoxenFlacDecoder() : flac_(nullptr) {
+  flac_ = FX_FLAC_ALLOC(FLAC_MAX_BLOCK_SIZE, 2);
+}
+
+// Releases the decoder state that was allocated by the constructor.
+FoxenFlacDecoder::~FoxenFlacDecoder() {
+  std::free(flac_);
+}
+
+/*
+ * Feeds the start of the stream to libfoxenflac so that it can parse the
+ * stream's metadata, then reports the channel count and sample rate it found.
+ *
+ * Returns the number of input bytes consumed, alongside one of:
+ *  - the output format, if the decoder reached FLAC_END_OF_METADATA and the
+ *    channel count and sample rate could be read;
+ *  - kOutOfInput, if the decoder stopped without an error but before reaching
+ *    FLAC_END_OF_METADATA;
+ *  - kMalformedData, if the decoder reported FLAC_ERR or the stream info keys
+ *    are unavailable;
+ *  - kInternalError, if the decoder state was never allocated.
+ */
+auto FoxenFlacDecoder::BeginStream(const cpp::span<const std::byte> input)
+    -> Result<OutputFormat> {
+  if (flac_ == nullptr) {
+    // The allocation in the constructor failed; there is nothing to decode
+    // with, and passing a null instance to the library is not safe.
+    return {0, cpp::fail(Error::kInternalError)};
+  }
+
+  // On entry this holds the number of bytes available. After the call it is
+  // used as the number of bytes that the decoder consumed.
+  uint32_t bytes_used = input.size_bytes();
+  // No output buffer is supplied here; only the metadata is of interest.
+  fx_flac_state_t state =
+      fx_flac_process(flac_, reinterpret_cast<const uint8_t*>(input.data()),
+                      &bytes_used, NULL, NULL);
+  if (state == FLAC_ERR) {
+    return {bytes_used, cpp::fail(Error::kMalformedData)};
+  }
+  if (state != FLAC_END_OF_METADATA) {
+    // Not an error, but the metadata wasn't completely parsed either.
+    // Presumably the buffer ended part way through a metadata block, so ask
+    // for more input rather than declaring the stream broken.
+    return {bytes_used, cpp::fail(Error::kOutOfInput)};
+  }
+
+  int64_t channels = fx_flac_get_streaminfo(flac_, FLAC_KEY_N_CHANNELS);
+  int64_t fs = fx_flac_get_streaminfo(flac_, FLAC_KEY_SAMPLE_RATE);
+  if (channels == FLAC_INVALID_METADATA_KEY ||
+      fs == FLAC_INVALID_METADATA_KEY) {
+    return {bytes_used, cpp::fail(Error::kMalformedData)};
+  }
+
+  return {bytes_used,
+          OutputFormat{
+              .num_channels = static_cast<uint8_t>(channels),
+              .bits_per_sample = 32,  // libfoxenflac output is fixed-size.
+              .sample_rate_hz = static_cast<uint32_t>(fs),
+          }};
+}
+
+/*
+ * Hands `input` to libfoxenflac and lets it write decoded samples directly
+ * into `output`, which is treated as an array of 32 bit samples.
+ *
+ * Returns the number of input bytes consumed, alongside either the number of
+ * bytes written (and whether the decoder stopped at the end of a frame),
+ * kMalformedData if the decoder reported an error, or kOutOfInput if no
+ * samples were produced.
+ */
+auto FoxenFlacDecoder::ContinueStream(cpp::span<const std::byte> input,
+                                      cpp::span<std::byte> output)
+    -> Result<OutputInfo> {
+  auto* samples_out = reinterpret_cast<int32_t*>(output.data());
+
+  // Both counters are in/out: they start as the capacity of each buffer, and
+  // are read back afterwards as the amount consumed / produced.
+  uint32_t bytes_read = input.size_bytes();
+  uint32_t samples_written = output.size_bytes() / 4;
+
+  const fx_flac_state_t state = fx_flac_process(
+      flac_, reinterpret_cast<const uint8_t*>(input.data()), &bytes_read,
+      samples_out, &samples_written);
+
+  if (state == FLAC_ERR) {
+    return {bytes_read, cpp::fail(Error::kMalformedData)};
+  }
+
+  if (samples_written == 0) {
+    // No error, but no samples written. We must be out of data.
+    return {bytes_read, cpp::fail(Error::kOutOfInput)};
+  }
+
+  const bool frame_complete = (state == FLAC_END_OF_FRAME);
+  return {bytes_read, OutputInfo{.bytes_written = samples_written * 4,
+                                 .is_finished_writing = frame_complete}};
+}
+
+/*
+ * Seeking is not implemented for FLAC yet. Neither argument is inspected; this
+ * reports success with zero bytes consumed.
+ */
+auto FoxenFlacDecoder::SeekStream(cpp::span<const std::byte> /*input*/,
+                                  std::size_t /*target_sample*/)
+    -> Result<void> {
+  // TODO(jacqueline): Implement me.
+  return Result<void>{0, {}};
+}
+
+}  // namespace codecs
--- a/src/codecs/include/codec.hpp
+++ b/src/codecs/include/codec.hpp
@ -21,48 +21,58 @@

 namespace codecs {

+/*
+ * Common interface to be implemented by all audio decoders.
+ */
 class ICodec {
 public:
  virtual ~ICodec() {}

+  /* Errors that may be returned by codecs. */
+  enum class Error {
+    // Indicates that more data is required before this codec can finish its
+    // operation. E.g. the input buffer ends with a truncated frame.
+    kOutOfInput,
+    // Indicates that the data within the input buffer is fatally malformed.
+    kMalformedData,
+
+    kInternalError,
+  };
+
+  /*
+   * Alias for more readable return types. All codec methods, success or
+   * failure, should also return the number of bytes they consumed.
+   */
+  template <typename T>
+  using Result = std::pair<std::size_t, cpp::result<T, Error>>;
+
  struct OutputFormat {
    uint8_t num_channels;
    uint8_t bits_per_sample;
    uint32_t sample_rate_hz;
  };

-  virtual auto GetOutputFormat() -> std::optional<OutputFormat> = 0;
-
-  enum ProcessingError { MALFORMED_DATA };
-
-  virtual auto SetInput(cpp::span<const std::byte> input) -> void = 0;
-
  /*
-   * Returns the codec's next read position within the input buffer. If the
-   * codec is out of usable data, but there is still some data left in the
-   * stream, that data should be prepended to the next input buffer.
+   * Decodes metadata or headers from the given input stream, and returns the
+   * format for the samples that will be decoded from it.
   */
-  virtual auto GetInputPosition() -> std::size_t = 0;
+  virtual auto BeginStream(cpp::span<const std::byte> input)
+      -> Result<OutputFormat> = 0;

-  /*
-   * Read one frame (or equivalent discrete chunk) from the input, and
-   * synthesize output samples for it.
-   *
-   * Returns true if we are out of usable data from the input stream, or false
-   * otherwise.
-   */
-  virtual auto ProcessNextFrame() -> cpp::result<bool, ProcessingError> = 0;
+  struct OutputInfo {
+    std::size_t bytes_written;
+    bool is_finished_writing;
+  };

  /*
   * Writes PCM samples to the given output buffer.
-   *
-   * Returns the number of bytes that were written, and true if all of the
-   * samples synthesized from the last call to `ProcessNextFrame` have been
-   * written. If this returns false, then this method should be called again
-   * after flushing the output buffer.
   */
-  virtual auto WriteOutputSamples(cpp::span<std::byte> output)
-      -> std::pair<std::size_t, bool> = 0;
+  virtual auto ContinueStream(cpp::span<const std::byte> input,
+                              cpp::span<std::byte> output)
+      -> Result<OutputInfo> = 0;
+
+  virtual auto SeekStream(cpp::span<const std::byte> input,
+                          std::size_t target_sample) -> Result<void> = 0;
 };

 auto CreateCodecForType(StreamType type) -> std::optional<ICodec*>;
--- a/src/codecs/include/foxenflac.hpp
+++ b/src/codecs/include/foxenflac.hpp
@ -0,0 +1,38 @@
+/*
+ * Copyright 2023 jacqueline <me@jacqueline.id.au>
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+
+#include "foxen/flac.h"
+#include "span.hpp"
+
+#include "codec.hpp"
+
+namespace codecs {
+
+/*
+ * ICodec implementation for FLAC streams, backed by libfoxenflac.
+ */
+class FoxenFlacDecoder : public ICodec {
+ public:
+  FoxenFlacDecoder();
+  ~FoxenFlacDecoder();
+
+  // This class holds a raw pointer to the decoder state and releases it on
+  // destruction. Copying would leave two instances releasing the same state.
+  FoxenFlacDecoder(const FoxenFlacDecoder&) = delete;
+  FoxenFlacDecoder& operator=(const FoxenFlacDecoder&) = delete;
+
+  /*
+   * Parses the stream's metadata, and returns the format of the samples that
+   * will be decoded from it.
+   */
+  auto BeginStream(cpp::span<const std::byte>) -> Result<OutputFormat> override;
+
+  /*
+   * Decodes samples from the given input into the given output buffer.
+   */
+  auto ContinueStream(cpp::span<const std::byte>, cpp::span<std::byte>)
+      -> Result<OutputInfo> override;
+
+  /*
+   * Not yet implemented for this codec.
+   */
+  auto SeekStream(cpp::span<const std::byte> input, std::size_t target_sample)
+      -> Result<void> override;
+
+ private:
+  // Decoder state; owned by this instance.
+  fx_flac_t* flac_;
+};
+
+}  // namespace codecs
--- a/src/codecs/include/mad.hpp
+++ b/src/codecs/include/mad.hpp
@ -24,12 +24,22 @@ class MadMp3Decoder : public ICodec {
  MadMp3Decoder();
  ~MadMp3Decoder();

-  auto GetOutputFormat() -> std::optional<OutputFormat> override;
-  auto SetInput(cpp::span<const std::byte> input) -> void override;
-  auto GetInputPosition() -> std::size_t override;
-  auto ProcessNextFrame() -> cpp::result<bool, ProcessingError> override;
-  auto WriteOutputSamples(cpp::span<std::byte> output)
-      -> std::pair<std::size_t, bool> override;
+  /*
+   * Returns the output format for the next frame in the stream. MP3 streams
+   * may represent multiple distinct tracks, with different bitrates, and so we
+   * handle the stream only on a frame-by-frame basis.
+   */
+  auto BeginStream(cpp::span<const std::byte>) -> Result<OutputFormat> override;
+
+  /*
+   * Writes samples for the current frame.
+   */
+  auto ContinueStream(cpp::span<const std::byte> input,
+                      cpp::span<std::byte> output)
+      -> Result<OutputInfo> override;
+
+  auto SeekStream(cpp::span<const std::byte> input, std::size_t target_sample)
+      -> Result<void> override;

 private:
  mad_stream stream_;
@ -37,6 +47,8 @@ class MadMp3Decoder : public ICodec {
  mad_synth synth_;

  int current_sample_;
+
+  auto GetInputPosition() -> std::size_t;
 };

 }  // namespace codecs
--- a/src/codecs/include/stbvorbis.hpp
+++ b/src/codecs/include/stbvorbis.hpp
@ -0,0 +1,42 @@
+/*
+ * Copyright 2023 jacqueline <me@jacqueline.id.au>
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+
+#include "stb_vorbis.h"
+
+#include "codec.hpp"
+
+namespace codecs {
+
+/*
+ * ICodec implementation for Vorbis streams, backed by stb_vorbis.
+ */
+class StbVorbisDecoder : public ICodec {
+ public:
+  StbVorbisDecoder();
+  ~StbVorbisDecoder();
+
+  // This class holds a raw stb_vorbis handle and closes it on destruction.
+  // Copying would leave two instances closing the same handle.
+  StbVorbisDecoder(const StbVorbisDecoder&) = delete;
+  StbVorbisDecoder& operator=(const StbVorbisDecoder&) = delete;
+
+  /*
+   * Opens a new decoder for the given stream, and returns the format of the
+   * samples that will be decoded from it.
+   */
+  auto BeginStream(cpp::span<const std::byte>) -> Result<OutputFormat> override;
+
+  /*
+   * Decodes a frame if needed, then writes its samples to the output buffer.
+   */
+  auto ContinueStream(cpp::span<const std::byte>, cpp::span<std::byte>)
+      -> Result<OutputInfo> override;
+
+  /*
+   * Not yet implemented for this codec.
+   */
+  auto SeekStream(cpp::span<const std::byte> input, std::size_t target_sample)
+      -> Result<void> override;
+
+ private:
+  // Decoder handle; null until a stream has been successfully begun.
+  stb_vorbis* vorbis_;
+
+  // Index of the next sample to write out of the current frame, or negative if
+  // a new frame needs to be decoded first.
+  int current_sample_;
+  // Details of the most recently decoded frame, as reported by stb_vorbis.
+  int num_channels_;
+  int num_samples_;
+  float** samples_array_;
+};
+
+}  // namespace codecs
--- a/src/codecs/include/types.hpp
+++ b/src/codecs/include/types.hpp
@ -13,7 +13,7 @@ namespace codecs {
 enum class StreamType {
  kMp3,
  kPcm,
-  kOgg,
+  kVorbis,
  kFlac,
 };

--- a/src/codecs/mad.cpp
+++ b/src/codecs/mad.cpp
@ -13,11 +13,12 @@
 #include "mad.h"

 #include "codec.hpp"
+#include "result.hpp"
 #include "types.hpp"

 namespace codecs {

-static uint32_t scaleToBits(mad_fixed_t sample, uint8_t bits) {
+static uint32_t mad_fixed_to_pcm(mad_fixed_t sample, uint8_t bits) {
  // Round the bottom bits.
  sample += (1L << (MAD_F_FRACBITS - bits));

@ -42,93 +43,167 @@ MadMp3Decoder::~MadMp3Decoder() {
  mad_synth_finish(&synth_);
 }

-auto MadMp3Decoder::GetOutputFormat() -> std::optional<OutputFormat> {
-  if (synth_.pcm.channels == 0 || synth_.pcm.samplerate == 0) {
-    return {};
-  }
-  return std::optional<OutputFormat>({
-      .num_channels = static_cast<uint8_t>(synth_.pcm.channels),
-      .bits_per_sample = 24,
-      .sample_rate_hz = synth_.pcm.samplerate,
-  });
+auto MadMp3Decoder::GetInputPosition() -> std::size_t {
+  return stream_.next_frame - stream_.buffer;
 }

-auto MadMp3Decoder::SetInput(cpp::span<const std::byte> input) -> void {
+auto MadMp3Decoder::BeginStream(const cpp::span<const std::byte> input)
+    -> Result<OutputFormat> {
  mad_stream_buffer(&stream_,
                    reinterpret_cast<const unsigned char*>(input.data()),
                    input.size());
-}
-
-auto MadMp3Decoder::GetInputPosition() -> std::size_t {
-  return stream_.next_frame - stream_.buffer;
-}
-
-auto MadMp3Decoder::ProcessNextFrame() -> cpp::result<bool, ProcessingError> {
  // Whatever was last synthesized is now invalid, so ensure we don't try to
  // send it.
  current_sample_ = -1;

-  // Decode the next frame. To signal errors, this returns -1 and
-  // stashes an error code in the stream structure.
-  if (mad_frame_decode(&frame_, &stream_) < 0) {
+  // To get the output format for MP3 streams, we simply need to decode the
+  // first frame header.
+  mad_header header;
+  mad_header_init(&header);
+  while (mad_header_decode(&header, &stream_) < 0) {
    if (MAD_RECOVERABLE(stream_.error)) {
      // Recoverable errors are usually malformed parts of the stream.
      // We can recover from them by just retrying the decode.
-      return false;
+      continue;
+    } else {
+      // Don't bother checking for other errors; if the first part of the stream
+      // doesn't even contain a header then something's gone wrong.
+      return {GetInputPosition(), cpp::fail(Error::kMalformedData)};
    }
-
-    if (stream_.error == MAD_ERROR_BUFLEN) {
-      // The decoder ran out of bytes before it completed a frame. We
-      // need to return back to the caller to give us more data.
-      return true;
-    }
-
-    // The error is unrecoverable. Give up.
-    return cpp::fail(MALFORMED_DATA);
  }

-  // We've successfully decoded a frame!
-  // Now we need to synthesize PCM samples based on the frame, and send
-  // them downstream.
-  mad_synth_frame(&synth_, &frame_);
-  current_sample_ = 0;
-  return false;
+  uint8_t channels = MAD_NCHANNELS(&header);
+  return {GetInputPosition(),
+          OutputFormat{
+              .num_channels = channels,
+              .bits_per_sample = 24,  // We always scale to 24 bits
+              .sample_rate_hz = header.samplerate,
+          }};
 }

-auto MadMp3Decoder::WriteOutputSamples(cpp::span<std::byte> output)
-    -> std::pair<std::size_t, bool> {
-  size_t output_byte = 0;
-  // First ensure that we actually have some samples to send off.
+auto MadMp3Decoder::ContinueStream(cpp::span<const std::byte> input,
+                                   cpp::span<std::byte> output)
+    -> Result<OutputInfo> {
  if (current_sample_ < 0) {
-    return std::make_pair(output_byte, true);
+    mad_stream_buffer(&stream_,
+                      reinterpret_cast<const unsigned char*>(input.data()),
+                      input.size());
+
+    // Decode the next frame. To signal errors, this returns -1 and
+    // stashes an error code in the stream structure.
+    while (mad_frame_decode(&frame_, &stream_) < 0) {
+      if (MAD_RECOVERABLE(stream_.error)) {
+        // Recoverable errors are usually malformed parts of the stream.
+        // We can recover from them by just retrying the decode.
+        continue;
+      }
+      if (stream_.error == MAD_ERROR_BUFLEN) {
+        // The decoder ran out of bytes before it completed a frame. We
+        // need to return back to the caller to give us more data.
+        return {GetInputPosition(), cpp::fail(Error::kOutOfInput)};
+      }
+      // The error is unrecoverable. Give up.
+      return {GetInputPosition(), cpp::fail(Error::kMalformedData)};
+    }
+
+    // We've successfully decoded a frame! Now synthesize samples to write out.
+    mad_synth_frame(&synth_, &frame_);
+    current_sample_ = 0;
  }

+  size_t output_byte = 0;
  while (current_sample_ < synth_.pcm.length) {
-    if (output_byte + (2 * synth_.pcm.channels) >= output.size()) {
-      return std::make_pair(output_byte, false);
+    if (output_byte + (4 * synth_.pcm.channels) >= output.size()) {
+      // We can't fit the next sample into the buffer. Stop now, and also avoid
+      // writing the sample for only half the channels.
+      return {GetInputPosition(), OutputInfo{.bytes_written = output_byte,
+                                             .is_finished_writing = false}};
    }

    for (int channel = 0; channel < synth_.pcm.channels; channel++) {
      uint32_t sample_24 =
-          scaleToBits(synth_.pcm.samples[channel][current_sample_], 24);
+          mad_fixed_to_pcm(synth_.pcm.samples[channel][current_sample_], 24);
      output[output_byte++] = static_cast<std::byte>((sample_24 >> 16) & 0xFF);
      output[output_byte++] = static_cast<std::byte>((sample_24 >> 8) & 0xFF);
      output[output_byte++] = static_cast<std::byte>((sample_24)&0xFF);
      // 24 bit samples must still be aligned to 32 bits. The LSB is ignored.
      output[output_byte++] = static_cast<std::byte>(0);
-      /*
-      uint16_t sample_16 =
-          scaleToBits(synth_.pcm.samples[channel][current_sample_], 16);
-      output[output_byte++] = static_cast<std::byte>((sample_16 >> 8) & 0xFF);
-      output[output_byte++] = static_cast<std::byte>((sample_16)&0xFF);
-      */
    }
    current_sample_++;
  }

  // We wrote everything! Reset, ready for the next frame.
  current_sample_ = -1;
-  return std::make_pair(output_byte, true);
+  return {GetInputPosition(), OutputInfo{.bytes_written = output_byte,
+                                         .is_finished_writing = true}};
+}
+
+/*
+ * Skips forward through `input`, one frame at a time, until the frame that
+ * contains `target_sample` is reached. Frames that are far from the target
+ * only have their headers decoded; the last few are decoded in full.
+ *
+ * Samples are counted from the start of `input`; the count is local to each
+ * call.
+ *
+ * Returns the decoder's position within `input`, alongside kOutOfInput if a
+ * full frame decode ran out of bytes, or kMalformedData for any other
+ * unrecoverable error.
+ */
+auto MadMp3Decoder::SeekStream(cpp::span<const std::byte> input,
+                               std::size_t target_sample) -> Result<void> {
+  mad_stream_buffer(&stream_,
+                    reinterpret_cast<const unsigned char*>(input.data()),
+                    input.size());
+
+  std::size_t samples_passed = 0;
+  std::size_t frame_length = 0;
+  // Each pass of this loop handles a single frame. Once a pass is done, that
+  // frame's samples are added to the running total.
+  for (;; samples_passed += frame_length) {
+    // Begin with just the header of this frame.
+    mad_header header;
+    mad_header_init(&header);
+    while (mad_header_decode(&header, &stream_) < 0) {
+      // Recoverable errors are usually malformed parts of the stream; simply
+      // retry the decode. Anything else is treated as fatal, without checking
+      // which error it was.
+      if (!MAD_RECOVERABLE(stream_.error)) {
+        return {GetInputPosition(), cpp::fail(Error::kMalformedData)};
+      }
+    }
+
+    // The frame length is only worked out once, from the first header.
+    if (frame_length == 0) {
+      frame_length = 32 * MAD_NSBSAMPLES(&header);
+    }
+
+    // How many whole frames lie between here and the target?
+    const std::size_t frames_left =
+        (target_sample - samples_passed) / frame_length;
+    if (frames_left > 3) {
+      // Still a long way off. Keep skipping through headers only.
+      continue;
+    }
+
+    // We're close. Decode these frames in full, to give the decoder a chance
+    // to sync with the stream.
+    while (mad_frame_decode(&frame_, &stream_) < 0) {
+      if (MAD_RECOVERABLE(stream_.error)) {
+        continue;
+      }
+      // Running out of buffer means the caller needs to supply more data; any
+      // other error is unrecoverable.
+      return {GetInputPosition(),
+              cpp::fail(stream_.error == MAD_ERROR_BUFLEN
+                            ? Error::kOutOfInput
+                            : Error::kMalformedData)};
+    }
+
+    if (frames_left > 1) {
+      continue;
+    }
+
+    // The target is within the next couple of frames. We should start
+    // synthesizing a frame early because this guy says so:
+    // https://lists.mars.org/hyperkitty/list/mad-dev@lists.mars.org/message/UZSHXZTIZEF7FZ4KFOR65DUCKAY2OCUT/
+    mad_synth_frame(&synth_, &frame_);
+
+    if (frames_left == 0) {
+      // The target is inside this frame. Record the offset into the frame so
+      // that ContinueStream starts writing from there.
+      current_sample_ = (target_sample > samples_passed)
+                            ? target_sample - samples_passed
+                            : 0;
+      return {GetInputPosition(), {}};
+    }
+  }
+}

 }  // namespace codecs
--- a/src/codecs/stbvorbis.cpp
+++ b/src/codecs/stbvorbis.cpp
@ -0,0 +1,128 @@
+/*
+ * Copyright 2023 jacqueline <me@jacqueline.id.au>
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+#include "stbvorbis.hpp"
+#include <stdint.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <optional>
+
+#include "stb_vorbis.h"
+
+namespace codecs {
+
+// Starts with no open decoder and no decoded frame. A negative current sample
+// marks that there are no samples waiting to be written out.
+StbVorbisDecoder::StbVorbisDecoder()
+    : vorbis_{nullptr},
+      current_sample_{-1},
+      num_channels_{0},
+      num_samples_{0},
+      samples_array_{nullptr} {}
+
+// Closes the decoder, if one was ever opened.
+StbVorbisDecoder::~StbVorbisDecoder() {
+  if (vorbis_ == nullptr) {
+    return;
+  }
+  stb_vorbis_close(vorbis_);
+}
+
+/*
+ * Converts a floating point sample into a signed fixed point sample that is
+ * `bits` wide, where +/-1.0 is full scale. Samples outside of that range are
+ * clamped.
+ *
+ * The result is returned as the two's complement bit pattern of the 32 bit
+ * signed value, so negative samples have their upper bits set.
+ */
+static uint32_t scaleToBits(float sample, uint8_t bits) {
+  // A signed value of this width covers [-2^(bits-1), 2^(bits-1) - 1].
+  const int32_t max_val = (1 << (bits - 1));
+
+  // Clamp before converting. Converting a float that doesn't fit into the
+  // destination integer type is undefined behaviour.
+  const float in_range = std::clamp(sample, -1.0f, 1.0f);
+  int32_t fixed_point = static_cast<int32_t>(in_range * max_val);
+
+  // Positive full scale lands on 2^(bits-1), which is one more than the
+  // largest representable value and would read back as the most negative value
+  // once cut down to `bits` bits.
+  fixed_point = std::clamp(fixed_point, -max_val, max_val - 1);
+
+  // Remove sign. The conversion is modular, so the bit pattern is preserved.
+  return static_cast<uint32_t>(fixed_point);
+}
+
+/*
+ * Closes any decoder left over from a previous stream, then opens a new one
+ * using the headers at the start of `input`.
+ *
+ * Returns the number of bytes consumed by the headers alongside the output
+ * format. If the decoder could not be opened, zero bytes are consumed, and the
+ * error is kOutOfInput if stb_vorbis asked for more data, or kMalformedData
+ * otherwise.
+ */
+auto StbVorbisDecoder::BeginStream(const cpp::span<const std::byte> input)
+    -> Result<OutputFormat> {
+  if (vorbis_ != nullptr) {
+    stb_vorbis_close(vorbis_);
+    vorbis_ = nullptr;
+  }
+  // Any samples left over from the previous stream are no longer valid.
+  current_sample_ = -1;
+
+  int bytes_read = 0;
+  int error = 0;
+  vorbis_ =
+      stb_vorbis_open_pushdata(reinterpret_cast<const uint8_t*>(input.data()),
+                               input.size_bytes(), &bytes_read, &error, nullptr);
+  if (vorbis_ == nullptr || error != 0) {
+    // Needing more data is not the same thing as the data being broken; the
+    // headers may simply be longer than the buffer we were given.
+    const bool needs_more_data = (error == VORBIS_need_more_data);
+    return {0, cpp::fail(needs_more_data ? Error::kOutOfInput
+                                         : Error::kMalformedData)};
+  }
+
+  stb_vorbis_info info = stb_vorbis_get_info(vorbis_);
+  return {static_cast<std::size_t>(bytes_read),
+          OutputFormat{.num_channels = static_cast<uint8_t>(info.channels),
+                       .bits_per_sample = 24,
+                       .sample_rate_hz = info.sample_rate}};
+}
+
+/*
+ * Decodes the next frame from `input`, unless samples from a previously
+ * decoded frame are still waiting to be written. Then writes as many samples
+ * as will fit into `output`, with channels interleaved, each as a 24 bit value
+ * followed by one byte of padding.
+ *
+ * Returns the number of input bytes consumed by this call, alongside the
+ * number of bytes written and whether the whole frame has now been written.
+ * If it has not, call again with a fresh output buffer to get the rest.
+ *
+ * Fails with kOutOfInput if the decoder could not make progress with the
+ * given input, and kInternalError if no stream has been begun or the decoder
+ * reported samples without any channel data.
+ */
+auto StbVorbisDecoder::ContinueStream(cpp::span<const std::byte> input,
+                                      cpp::span<std::byte> output)
+    -> Result<OutputInfo> {
+  if (vorbis_ == nullptr) {
+    // BeginStream was never called, or it failed. There is no decoder to use.
+    return {0, cpp::fail(Error::kInternalError)};
+  }
+
+  std::size_t bytes_used = 0;
+  if (current_sample_ < 0) {
+    num_channels_ = 0;
+    num_samples_ = 0;
+    samples_array_ = nullptr;
+
+    while (true) {
+      auto cropped = input.subspan(bytes_used);
+      const int consumed = stb_vorbis_decode_frame_pushdata(
+          vorbis_, reinterpret_cast<const uint8_t*>(cropped.data()),
+          cropped.size_bytes(), &num_channels_, &samples_array_, &num_samples_);
+      if (consumed <= 0) {
+        // The decoder couldn't do anything with what is left of the input.
+        return {bytes_used, cpp::fail(Error::kOutOfInput)};
+      }
+      bytes_used += static_cast<std::size_t>(consumed);
+
+      if (num_samples_ == 0) {
+        // Decoder is synchronising. Decode more bytes.
+        continue;
+      }
+      if (num_channels_ == 0 || samples_array_ == nullptr) {
+        // The decoder isn't satisfying its contract.
+        return {bytes_used, cpp::fail(Error::kInternalError)};
+      }
+      current_sample_ = 0;
+      break;
+    }
+  }
+
+  // We have a decoded frame. Time to write out the samples. Each sample takes
+  // up 4 bytes per channel in the output.
+  const std::size_t bytes_per_sample_set =
+      4 * static_cast<std::size_t>(num_channels_);
+  std::size_t output_byte = 0;
+  while (current_sample_ < num_samples_) {
+    if (output.size() - output_byte < bytes_per_sample_set) {
+      // We can't fit the next sample into the buffer. Stop now, and also avoid
+      // writing the sample for only some of the channels. The bytes consumed
+      // by the decode above must still be reported, since the next call will
+      // skip decoding and so will not report them.
+      return {bytes_used, OutputInfo{.bytes_written = output_byte,
+                                     .is_finished_writing = false}};
+    }
+
+    for (int channel = 0; channel < num_channels_; channel++) {
+      float raw_sample = samples_array_[channel][current_sample_];
+
+      // This must be wide enough to hold all 24 bits.
+      const uint32_t sample_24 = scaleToBits(raw_sample, 24);
+      output[output_byte++] = static_cast<std::byte>((sample_24 >> 16) & 0xFF);
+      output[output_byte++] = static_cast<std::byte>((sample_24 >> 8) & 0xFF);
+      output[output_byte++] = static_cast<std::byte>((sample_24)&0xFF);
+      // Pad to 32 bits for alignment.
+      output[output_byte++] = static_cast<std::byte>(0);
+    }
+    current_sample_++;
+  }
+
+  // We wrote everything! Reset, ready for the next frame.
+  current_sample_ = -1;
+  return {bytes_used, OutputInfo{.bytes_written = output_byte,
+                                 .is_finished_writing = true}};
+}
+
+/*
+ * Seeking is not implemented for Vorbis yet. Neither argument is inspected;
+ * this reports success with zero bytes consumed.
+ */
+auto StbVorbisDecoder::SeekStream(cpp::span<const std::byte> /*input*/,
+                                  std::size_t /*target_sample*/)
+    -> Result<void> {
+  // TODO(jacqueline): Implement me.
+  return Result<void>{0, {}};
+}
+
+}  // namespace codecs
--- a/src/database/tag_parser.cpp
+++ b/src/database/tag_parser.cpp
@ -96,6 +96,7 @@ auto TagParserImpl::ReadAndParseTags(const std::string& path, SongTags* out)

  if (res != 0) {
    // Parsing failed.
+    ESP_LOGE(kTag, "tag parsing failed, reason %d", res);
    return false;
  }

@ -103,6 +104,15 @@ auto TagParserImpl::ReadAndParseTags(const std::string& path, SongTags* out)
    case Fmp3:
      out->encoding = Encoding::kMp3;
      break;
+    case Fogg:
+      out->encoding = Encoding::kOgg;
+      break;
+    case Fflac:
+      out->encoding = Encoding::kFlac;
+      break;
+    case Fwav:
+      out->encoding = Encoding::kWav;
+      break;
    default:
      out->encoding = Encoding::kUnsupported;
  }
--- a/src/tasks/tasks.cpp
+++ b/src/tasks/tasks.cpp
@ -39,7 +39,7 @@ auto AllocateStack() -> cpp::span<StackType_t>;
 // amount of stack space.
 template <>
 auto AllocateStack<Type::kAudio>() -> cpp::span<StackType_t> {
-  std::size_t size = 32 * 1024;
+  std::size_t size = 48 * 1024;
  return {static_cast<StackType_t*>(heap_caps_malloc(size, MALLOC_CAP_DEFAULT)),
          size};
 }