Add vorbis support whilst we're here

3 years ago · 578c3737f8
parent f277bd5d0c
commit 578c3737f8
11 changed files with 391 additions and 190 deletions
--- a/src/audio/fatfs_audio_input.cpp
+++ b/src/audio/fatfs_audio_input.cpp
@ -311,6 +311,8 @@ auto FatfsAudioInput::ContainerToStreamType(database::Encoding enc)
      return codecs::StreamType::kMp3;
    case database::Encoding::kWav:
      return codecs::StreamType::kPcm;
+    case database::Encoding::kOgg:
+      return codecs::StreamType::kVorbis;
    case database::Encoding::kFlac:
      return codecs::StreamType::kFlac;
    case database::Encoding::kOpus:
--- a/src/codecs/CMakeLists.txt
+++ b/src/codecs/CMakeLists.txt
@ -3,7 +3,7 @@
 # SPDX-License-Identifier: GPL-3.0-only

 idf_component_register(
-  SRCS "codec.cpp" "mad.cpp" "foxenflac.cpp" "opus.cpp"
+  SRCS "codec.cpp" "mad.cpp" "foxenflac.cpp" "opus.cpp" "ogg.cpp" "vorbis.cpp"
  INCLUDE_DIRS "include"
  REQUIRES "result" "span" "libmad" "libfoxenflac" "tremor" "ogg")

--- a/src/codecs/codec.cpp
+++ b/src/codecs/codec.cpp
@ -12,6 +12,7 @@
 #include "foxenflac.hpp"
 #include "opus.hpp"
 #include "mad.hpp"
+#include "vorbis.hpp"
 #include "types.hpp"

 namespace codecs {
@ -20,6 +21,8 @@ auto CreateCodecForType(StreamType type) -> std::optional<ICodec*> {
  switch (type) {
    case StreamType::kMp3:
      return new MadMp3Decoder();
+    case StreamType::kVorbis:
+      return new TremorVorbisDecoder();
    case StreamType::kFlac:
      return new FoxenFlacDecoder();
    case StreamType::kOpus:
--- a/src/codecs/include/ogg.hpp
+++ b/src/codecs/include/ogg.hpp
@ -21,16 +21,23 @@ class OggContainer {
  OggContainer();
  ~OggContainer();

-  auto AddBytes(cpp::span<const std::byte>) -> void;
-  auto HasNextPacket() -> bool;
-  auto NextPacket() -> cpp::span<uint8_t>;
-  auto PeekPacket() -> cpp::span<uint8_t>;
+  auto AddBytes(cpp::span<const std::byte>) -> bool;
+
+  auto Next() -> bool;
+  auto Current() -> cpp::span<uint8_t>;
+  auto HasPacket() -> bool;

 private:
+  auto AdvancePage() -> bool;
+  auto AdvancePacket() -> bool;
+
  ogg_sync_state sync_;
  ogg_stream_state stream_;
  ogg_page page_;
  ogg_packet packet_;
+
+  bool has_stream_;
+  bool has_packet_;
 };

 }  // namespace codecs
--- a/src/codecs/include/stbvorbis.hpp
+++ b/src/codecs/include/stbvorbis.hpp
@ -1,42 +0,0 @@
-/*
- * Copyright 2023 jacqueline <me@jacqueline.id.au>
- *
- * SPDX-License-Identifier: GPL-3.0-only
- */
-
-#pragma once
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-#include <optional>
-#include <string>
-#include <utility>
-
-#include "stb_vorbis.h"
-
-#include "codec.hpp"
-
-namespace codecs {
-
-class StbVorbisDecoder : public ICodec {
- public:
-  StbVorbisDecoder();
-  ~StbVorbisDecoder();
-
-  auto BeginStream(cpp::span<const std::byte>) -> Result<OutputFormat> override;
-  auto ContinueStream(cpp::span<const std::byte>, cpp::span<std::byte>)
-      -> Result<OutputInfo> override;
-  auto SeekStream(cpp::span<const std::byte> input, std::size_t target_sample)
-      -> Result<void> override;
-
- private:
-  stb_vorbis* vorbis_;
-
-  int current_sample_;
-  int num_channels_;
-  int num_samples_;
-  float** samples_array_;
-};
-
-}  // namespace codecs
--- a/src/codecs/include/types.hpp
+++ b/src/codecs/include/types.hpp
@ -13,6 +13,7 @@ namespace codecs {
 enum class StreamType {
  kMp3,
  kPcm,
+  kVorbis,
  kFlac,
  kOpus,
 };
--- a/src/codecs/include/vorbis.hpp
+++ b/src/codecs/include/vorbis.hpp
@ -0,0 +1,58 @@
+/*
+ * Copyright 2023 jacqueline <me@jacqueline.id.au>
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+
+#include "ivorbisfile.h"
+#include "ogg.hpp"
+#include "ogg/ogg.h"
+#include "opus.h"
+#include "sample.hpp"
+#include "span.hpp"
+
+#include "codec.hpp"
+
+namespace codecs {
+
+/*
+ * ICodec implementation for Vorbis streams, built on the ivorbisfile API.
+ */
+class TremorVorbisDecoder : public ICodec {
+ public:
+  TremorVorbisDecoder();
+  ~TremorVorbisDecoder();
+
+  /*
+   * Opens the Vorbis stream using the given bytes as its initial data, and
+   * returns the output format (channel count, sample rate and nominal
+   * bitrate) reported for that stream.
+   */
+  auto BeginStream(cpp::span<const std::byte>) -> Result<OutputFormat> override;
+
+  /*
+   * Decodes from `input` and writes the resulting samples into `output`.
+   */
+  auto ContinueStream(cpp::span<const std::byte> input,
+                      cpp::span<sample::Sample> output)
+      -> Result<OutputInfo> override;
+
+  auto SeekStream(cpp::span<const std::byte> input, std::size_t target_sample)
+      -> Result<void> override;
+
+  /*
+   * Hooks used by the decoder's read callback. ReadCallback returns the part
+   * of the current input that has not been consumed yet; AfterReadCallback
+   * records that `bytes_read` more bytes of it have been consumed.
+   */
+  auto ReadCallback() -> cpp::span<const std::byte>;
+  auto AfterReadCallback(size_t bytes_read) -> void;
+
+ private:
+  OggVorbis_File vorbis_;
+  // Input handed to the most recent ContinueStream call. This is a non-owning
+  // view.
+  cpp::span<const std::byte> input_;
+  // Offset into input_ of the first byte not yet given to the decoder.
+  size_t pos_in_input_;
+};
+
+}  // namespace codecs
--- a/src/codecs/ogg.cpp
+++ b/src/codecs/ogg.cpp
@ -0,0 +1,109 @@
+/*
+ * Copyright 2023 jacqueline <me@jacqueline.id.au>
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+#include "ogg.hpp"
+#include <cstring>
+
+#include "esp_log.h"
+#include "ogg/ogg.h"
+
+namespace codecs {
+
+static constexpr char kTag[] = "ogg";
+
+// Value-initialises the libogg structs and prepares the sync layer. The stream
+// state is left untouched here; AdvancePage initialises it once the first page
+// is available.
+OggContainer::OggContainer()
+    : sync_(),
+      stream_(),
+      page_(),
+      packet_(),
+      has_stream_(false),
+      has_packet_(false) {
+  ogg_sync_init(&sync_);
+  // NOTE(review): nothing has been buffered yet and the result is ignored, so
+  // this call looks redundant -- confirm whether it can be dropped.
+  ogg_sync_pageout(&sync_, &page_);
+}
+
+// Releases the sync state, plus the stream state if one was ever initialised.
+OggContainer::~OggContainer() {
+  ogg_sync_clear(&sync_);
+  // stream_ is only set up by AdvancePage, so only clear it if that happened.
+  if (has_stream_) {
+    ogg_stream_clear(&stream_);
+  }
+}
+
+// Copies `in` into libogg's sync buffer, then tries to pull a page and a packet
+// out of it. Returns true only if a packet is available afterwards.
+auto OggContainer::AddBytes(cpp::span<const std::byte> in) -> bool {
+  const auto num_bytes = in.size();
+  ESP_LOGI(kTag, "adding %u bytes to buffer", num_bytes);
+
+  char* dest = ogg_sync_buffer(&sync_, num_bytes);
+  if (dest == nullptr) {
+    ESP_LOGE(kTag, "failed to allocate sync buffer");
+    return false;
+  }
+  std::memcpy(dest, in.data(), num_bytes);
+
+  const bool committed = ogg_sync_wrote(&sync_, num_bytes) >= 0;
+  if (!committed) {
+    ESP_LOGE(kTag, "failed to write to sync buffer");
+    return false;
+  }
+
+  // A packet can only be read once a whole page has been submitted.
+  if (!AdvancePage()) {
+    return false;
+  }
+  return AdvancePacket();
+}
+
+// Reports whether the most recent AdvancePacket call produced a packet.
+auto OggContainer::HasPacket() -> bool {
+  return has_packet_;
+}
+
+// Moves on to the next packet. If none can be read from the pages already
+// submitted, tries to submit one more page and read from that. Returns whether
+// a packet is available afterwards.
+auto OggContainer::Next() -> bool {
+  // Short-circuit evaluation preserves the call order: a new page is only
+  // requested when the first packet read comes back empty.
+  return AdvancePacket() || (AdvancePage() && AdvancePacket());
+}
+
+// Returns a view of the current packet's bytes, or an empty span when there is
+// no current packet.
+auto OggContainer::Current() -> cpp::span<uint8_t> {
+  if (has_packet_) {
+    ESP_LOGI(kTag, "getting packet, location %p size %li", packet_.packet,
+             packet_.bytes);
+    const auto length = static_cast<size_t>(packet_.bytes);
+    return {packet_.packet, length};
+  }
+  return {};
+}
+
+// Pulls the next complete page out of the sync buffer and submits it to the
+// stream. The stream is initialised on the first page, using that page's
+// serial number. Returns false if no page could be assembled, or if stream
+// initialisation or page submission failed.
+auto OggContainer::AdvancePage() -> bool {
+  int err;
+  if ((err = ogg_sync_pageout(&sync_, &page_)) != 1) {
+    ESP_LOGE(kTag, "failed to assemble page, res %i", err);
+    return false;
+  }
+  if (!has_stream_) {
+    int serialno = ogg_page_serialno(&page_);
+    ESP_LOGI(kTag, "beginning ogg stream, serial number %i", serialno);
+    // The comparison must sit outside the assignment's parentheses; otherwise
+    // `err` receives the boolean result of `< 0` instead of the return code.
+    if ((err = ogg_stream_init(&stream_, serialno)) < 0) {
+      ESP_LOGE(kTag, "failed to init stream page, res %i", err);
+      return false;
+    }
+    has_stream_ = true;
+  }
+  if (ogg_stream_pagein(&stream_, &page_) < 0) {
+    ESP_LOGE(kTag, "failed to read in page");
+    return false;
+  }
+  return true;
+}
+
+// Tries to read the next packet out of the stream into packet_, and updates
+// has_packet_ to match. Returns whether a packet was read.
+auto OggContainer::AdvancePacket() -> bool {
+  has_packet_ = false;
+  int status = ogg_stream_packetout(&stream_, &packet_);
+  while (status == -1) {
+    // Retry until we sync, or run out of data.
+    ESP_LOGW(kTag, "trying to sync stream...");
+    status = ogg_stream_packetout(&stream_, &packet_);
+  }
+  has_packet_ = (status != 0);
+  if (has_packet_) {
+    return true;
+  }
+  ESP_LOGE(kTag, "failed to read out packet");
+  return false;
+}
+
+}  // namespace codecs
--- a/src/codecs/opus.cpp
+++ b/src/codecs/opus.cpp
@ -20,6 +20,7 @@
 #include "esp_log.h"
 #include "ogg/ogg.h"
 #include "opus.h"
+#include "opus_defines.h"
 #include "opus_types.h"
 #include "result.hpp"
 #include "sample.hpp"
@ -49,12 +50,13 @@ XiphOpusDecoder::~XiphOpusDecoder() {

 auto XiphOpusDecoder::BeginStream(const cpp::span<const std::byte> input)
    -> Result<OutputFormat> {
-  ogg_.AddBytes(input);
-  if (!ogg_.HasNextPacket()) {
+  if (!ogg_.AddBytes(input)) {
+    ESP_LOGI(kTag, "need more input to begin");
    return {input.size(), cpp::fail(Error::kOutOfInput)};
  }
-  auto packet = ogg_.NextPacket();
+  auto packet = ogg_.Current();
  int num_channels = opus_packet_get_nb_channels(packet.data());
+  ESP_LOGI(kTag, "opus stream has %i channels", num_channels);
  if (num_channels > 2) {
    // Too many channels; we can't handle this.
    // TODO: better error
@ -78,24 +80,51 @@ auto XiphOpusDecoder::ContinueStream(cpp::span<const std::byte> input,
    -> Result<OutputInfo> {
  size_t bytes_used = 0;
  if (pos_in_buffer_ >= samples_in_buffer_) {
-    ESP_LOGI(kTag, "sample buffer is empty. parsing more.");
-    if (!ogg_.HasNextPacket()) {
+    if (!ogg_.HasPacket()) {
      bytes_used = input.size();
-      ogg_.AddBytes(input);
+      if (!ogg_.AddBytes(input)) {
+        return {bytes_used, cpp::fail(Error::kOutOfInput)};
+      }
    }
-    if (!ogg_.HasNextPacket()) {
-      return {bytes_used, cpp::fail(Error::kOutOfInput)};
-    }
-
-    auto packet = ogg_.NextPacket();

+    auto packet = ogg_.Current();
    pos_in_buffer_ = 0;
-    samples_in_buffer_ =
-        opus_decode(opus_, packet.data(), packet.size_bytes(),
-                    sample_buffer_.data(), sample_buffer_.size(), 0);
+    samples_in_buffer_ = 0;
+    while (samples_in_buffer_ <= 0 && ogg_.HasPacket()) {
+      samples_in_buffer_ =
+          opus_decode(opus_, packet.data(), packet.size_bytes(),
+                      sample_buffer_.data(), sample_buffer_.size(), 0);
+      ogg_.Next();
+    }

    if (samples_in_buffer_ < 0) {
-      ESP_LOGE(kTag, "error decoding stream");
+      std::string err_str;
+      switch (samples_in_buffer_) {
+        case OPUS_BAD_ARG:
+          err_str = "OPUS_BAD_ARG";
+          break;
+        case OPUS_BUFFER_TOO_SMALL:
+          err_str = "OPUS_BUFFER_TOO_SMALL";
+          break;
+        case OPUS_INTERNAL_ERROR:
+          err_str = "OPUS_INTERNAL_ERROR";
+          break;
+        case OPUS_INVALID_PACKET:
+          err_str = "OPUS_INVALID_PACKET";
+          break;
+        case OPUS_UNIMPLEMENTED:
+          err_str = "OPUS_UNIMPLEMENTED";
+          break;
+        case OPUS_INVALID_STATE:
+          err_str = "OPUS_INVALID_STATE";
+          break;
+        case OPUS_ALLOC_FAIL:
+          err_str = "OPUS_ALLOC_FAIL";
+          break;
+        default:
+          err_str = "unknown";
+      }
+      ESP_LOGE(kTag, "error decoding stream, err %s", err_str.c_str());
      return {bytes_used, cpp::fail(Error::kMalformedData)};
    }
  }
--- a/src/codecs/stbvorbis.cpp
+++ b/src/codecs/stbvorbis.cpp
@ -1,128 +0,0 @@
-/*
- * Copyright 2023 jacqueline <me@jacqueline.id.au>
- *
- * SPDX-License-Identifier: GPL-3.0-only
- */
-
-#include "stbvorbis.hpp"
-#include <stdint.h>
-
-#include <cstdint>
-#include <optional>
-
-#include "stb_vorbis.h"
-
-namespace codecs {
-
-StbVorbisDecoder::StbVorbisDecoder()
-    : vorbis_(nullptr),
-      current_sample_(-1),
-      num_channels_(0),
-      num_samples_(0),
-      samples_array_(NULL) {}
-
-StbVorbisDecoder::~StbVorbisDecoder() {
-  if (vorbis_ != nullptr) {
-    stb_vorbis_close(vorbis_);
-  }
-}
-
-static uint32_t scaleToBits(float sample, uint8_t bits) {
-  // Scale to range.
-  int32_t max_val = (1 << (bits - 1));
-  int32_t fixed_point = sample * max_val;
-
-  // Clamp within bounds.
-  fixed_point = std::clamp(fixed_point, -max_val, max_val);
-
-  // Remove sign.
-  return *reinterpret_cast<uint32_t*>(&fixed_point);
-}
-
-auto StbVorbisDecoder::BeginStream(const cpp::span<const std::byte> input)
-    -> Result<OutputFormat> {
-  if (vorbis_ != nullptr) {
-    stb_vorbis_close(vorbis_);
-    vorbis_ = nullptr;
-  }
-  current_sample_ = -1;
-  int bytes_read = 0;
-  int error = 0;
-  vorbis_ =
-      stb_vorbis_open_pushdata(reinterpret_cast<const uint8_t*>(input.data()),
-                               input.size_bytes(), &bytes_read, &error, NULL);
-  if (error != 0) {
-    return {0, cpp::fail(Error::kMalformedData)};
-  }
-  stb_vorbis_info info = stb_vorbis_get_info(vorbis_);
-  return {bytes_read,
-          OutputFormat{.num_channels = static_cast<uint8_t>(info.channels),
-                       .bits_per_sample = 24,
-                       .sample_rate_hz = info.sample_rate}};
-}
-
-auto StbVorbisDecoder::ContinueStream(cpp::span<const std::byte> input,
-                                      cpp::span<std::byte> output)
-    -> Result<OutputInfo> {
-  std::size_t bytes_used = 0;
-  if (current_sample_ < 0) {
-    num_channels_ = 0;
-    num_samples_ = 0;
-    samples_array_ = NULL;
-
-    while (true) {
-      auto cropped = input.subspan(bytes_used);
-      std::size_t b = stb_vorbis_decode_frame_pushdata(
-          vorbis_, reinterpret_cast<const uint8_t*>(cropped.data()),
-          cropped.size_bytes(), &num_channels_, &samples_array_, &num_samples_);
-      if (b == 0) {
-        return {bytes_used, cpp::fail(Error::kOutOfInput)};
-      }
-      bytes_used += b;
-
-      if (num_samples_ == 0) {
-        // Decoder is synchronising. Decode more bytes.
-        continue;
-      }
-      if (num_channels_ == 0 || samples_array_ == NULL) {
-        // The decoder isn't satisfying its contract.
-        return {bytes_used, cpp::fail(Error::kInternalError)};
-      }
-      current_sample_ = 0;
-      break;
-    }
-  }
-
-  // We successfully decoded a frame. Time to write out the samples.
-  std::size_t output_byte = 0;
-  while (current_sample_ < num_samples_) {
-    if (output_byte + (2 * num_channels_) >= output.size()) {
-      return {0, OutputInfo{.bytes_written = output_byte,
-                            .is_finished_writing = false}};
-    }
-
-    for (int channel = 0; channel < num_channels_; channel++) {
-      float raw_sample = samples_array_[channel][current_sample_];
-
-      uint16_t sample_24 = scaleToBits(raw_sample, 24);
-      output[output_byte++] = static_cast<std::byte>((sample_24 >> 16) & 0xFF);
-      output[output_byte++] = static_cast<std::byte>((sample_24 >> 8) & 0xFF);
-      output[output_byte++] = static_cast<std::byte>((sample_24)&0xFF);
-      // Pad to 32 bits for alignment.
-      output[output_byte++] = static_cast<std::byte>(0);
-    }
-    current_sample_++;
-  }
-
-  current_sample_ = -1;
-  return {bytes_used, OutputInfo{.bytes_written = output_byte,
-                                 .is_finished_writing = true}};
-}
-
-auto StbVorbisDecoder::SeekStream(cpp::span<const std::byte> input,
-                                  std::size_t target_sample) -> Result<void> {
-  // TODO(jacqueline): Implement me.
-  return {0, {}};
-}
-
-}  // namespace codecs
--- a/src/codecs/vorbis.cpp
+++ b/src/codecs/vorbis.cpp
@ -0,0 +1,162 @@
+/*
+ * Copyright 2023 jacqueline <me@jacqueline.id.au>
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+#include "ivorbiscodec.h"
+#include "ivorbisfile.h"
+#include "ogg/config_types.h"
+#include "opus.hpp"
+
+#include <stdint.h>
+#include <sys/_stdint.h>
+
+#include <cstdint>
+#include <cstring>
+#include <optional>
+
+#include "esp_heap_caps.h"
+#include "mad.h"
+
+#include "codec.hpp"
+#include "esp_log.h"
+#include "ogg/ogg.h"
+#include "opus.h"
+#include "opus_defines.h"
+#include "opus_types.h"
+#include "result.hpp"
+#include "sample.hpp"
+#include "types.hpp"
+#include "vorbis.hpp"
+
+namespace codecs {
+
+static constexpr char kTag[] = "vorbis";
+
+// Read hook registered in kCallbacks. `instance` is the TremorVorbisDecoder
+// that opened the stream. Copies up to `size * nmemb` bytes of that decoder's
+// unconsumed input into `ptr`, marks them as consumed, and returns the number
+// of bytes copied.
+size_t read_cb(void* ptr, size_t size, size_t nmemb, void* instance) {
+  auto* decoder = reinterpret_cast<TremorVorbisDecoder*>(instance);
+  const auto pending = decoder->ReadCallback();
+  const size_t requested = size * nmemb;
+  const size_t copied = std::min<size_t>(requested, pending.size_bytes());
+  std::memcpy(ptr, pending.data(), copied);
+  decoder->AfterReadCallback(copied);
+  return copied;
+}
+
+// Seek hook registered in kCallbacks. Ignores its arguments and always returns
+// -1.
+int seek_cb(void* instance, ogg_int64_t offset, int whence) {
+  // Seeking is handled separately.
+  return -1;
+}
+
+// Close hook registered in kCallbacks. Does nothing and returns 0.
+int close_cb(void* instance) {
+  return 0;
+}
+
+// Callback table passed to ov_open_callbacks by BeginStream. Reads are served
+// by read_cb; seek_cb and close_cb are stubs.
+static const ov_callbacks kCallbacks{
+    .read_func = read_cb,
+    .seek_func = seek_cb,
+    .close_func = close_cb,
+    .tell_func = NULL,  // Not seekable
+};
+
+// Starts with value-initialised decoder state, no input, and a read position
+// of zero. The stream itself is opened later, by BeginStream.
+TremorVorbisDecoder::TremorVorbisDecoder()
+    : vorbis_(), input_(), pos_in_input_(0) {}
+
+// Clears the decoder state unconditionally.
+// NOTE(review): this also runs when BeginStream was never called or failed, in
+// which case vorbis_ was never successfully opened -- confirm ov_clear
+// tolerates that.
+TremorVorbisDecoder::~TremorVorbisDecoder() {
+  ov_clear(&vorbis_);
+}
+
+// Opens the Vorbis stream, handing `input` to ov_open_callbacks as its initial
+// data, and reports the stream's channel count, sample rate and nominal
+// bitrate. All of `input` is reported as consumed, whether or not opening
+// succeeds.
+//
+// NOTE(review): input_ is not assigned here, so any reads issued through
+// read_cb while opening only see what an earlier ContinueStream call left in
+// input_ (nothing, for a fresh decoder).
+auto TremorVorbisDecoder::BeginStream(const cpp::span<const std::byte> input)
+    -> Result<OutputFormat> {
+  const auto* initial = reinterpret_cast<const char*>(input.data());
+  const int res =
+      ov_open_callbacks(this, &vorbis_, initial, input.size(), kCallbacks);
+  if (res < 0) {
+    // Translate the error code into its symbolic name for the log.
+    const char* err = "unknown";
+    switch (res) {
+      case OV_EREAD:
+        err = "OV_EREAD";
+        break;
+      case OV_ENOTVORBIS:
+        err = "OV_ENOTVORBIS";
+        break;
+      case OV_EVERSION:
+        err = "OV_EVERSION";
+        break;
+      case OV_EBADHEADER:
+        err = "OV_EBADHEADER";
+        break;
+      case OV_EFAULT:
+        err = "OV_EFAULT";
+        break;
+    }
+    ESP_LOGE(kTag, "error beginning stream: %s", err);
+    return {input.size(), cpp::fail(Error::kMalformedData)};
+  }
+
+  vorbis_info* const info = ov_info(&vorbis_, -1);
+  if (info == NULL) {
+    ESP_LOGE(kTag, "failed to get stream info");
+    return {input.size(), cpp::fail(Error::kMalformedData)};
+  }
+
+  const OutputFormat format{
+      .num_channels = static_cast<uint8_t>(info->channels),
+      .sample_rate_hz = static_cast<uint32_t>(info->rate),
+      .bits_per_second = info->bitrate_nominal,
+  };
+  return {input.size(), format};
+}
+
+// Decodes the next chunk of audio from `input` into `output`.
+//
+// ov_read's output is treated as 16 bit samples. It is pointed at the back
+// half of `output` as scratch space, and the result is then converted sample
+// by sample into the front of `output`. Returns the number of input bytes
+// consumed through the read callback, together with either the number of
+// samples written or an error (kOutOfInput when nothing was decoded,
+// kMalformedData when ov_read reported a failure).
+auto TremorVorbisDecoder::ContinueStream(cpp::span<const std::byte> input,
+                                         cpp::span<sample::Sample> output)
+    -> Result<OutputInfo> {
+  // Size the staging area from the tail it actually occupies. Sizing it from
+  // the whole of `output` would let ov_read write past the end of the buffer.
+  auto tail = output.subspan(output.size() / 2);
+  cpp::span<int16_t> staging_buffer{reinterpret_cast<int16_t*>(tail.data()),
+                                    tail.size_bytes() / sizeof(int16_t)};
+
+  // Expose this call's input to read_cb, starting from its first byte.
+  input_ = input;
+  pos_in_input_ = 0;
+
+  int bitstream;
+  long bytes_written =
+      ov_read(&vorbis_, reinterpret_cast<char*>(staging_buffer.data()),
+              staging_buffer.size_bytes(), &bitstream);
+  if (bytes_written == OV_HOLE) {
+    ESP_LOGE(kTag, "got OV_HOLE");
+    return {pos_in_input_, cpp::fail(Error::kMalformedData)};
+  } else if (bytes_written == OV_EBADLINK) {
+    ESP_LOGE(kTag, "got OV_EBADLINK");
+    return {pos_in_input_, cpp::fail(Error::kMalformedData)};
+  } else if (bytes_written < 0) {
+    // Any other negative result is also a failure; it must not be treated as
+    // a byte count below.
+    ESP_LOGE(kTag, "ov_read failed, err %li", bytes_written);
+    return {pos_in_input_, cpp::fail(Error::kMalformedData)};
+  } else if (bytes_written == 0) {
+    return {pos_in_input_, cpp::fail(Error::kOutOfInput)};
+  }
+
+  // Convert front-to-back. Each staged value is read before the output slot
+  // that may overlap it is written.
+  const size_t samples_decoded =
+      static_cast<size_t>(bytes_written) / sizeof(int16_t);
+  for (size_t i = 0; i < samples_decoded; i++) {
+    output[i] = sample::FromSigned(staging_buffer[i], 16);
+  }
+
+  // Zero-byte reads return early above, so `bytes_written == 0` could never be
+  // true at this point.
+  // NOTE(review): confirm whether callers expect `true` here once a read has
+  // been fully written out.
+  return {pos_in_input_,
+          OutputInfo{
+              .samples_written = samples_decoded,
+              .is_finished_writing = false,
+          }};
+}
+
+// Seeking is not implemented here: both arguments are ignored and a
+// default-constructed result is returned.
+auto TremorVorbisDecoder::SeekStream(cpp::span<const std::byte> input,
+                                     std::size_t target_sample)
+    -> Result<void> {
+  return {};
+}
+
+// Returns the portion of the current input that has not been consumed yet,
+// i.e. everything from pos_in_input_ onwards.
+auto TremorVorbisDecoder::ReadCallback() -> cpp::span<const std::byte> {
+  return input_.subspan(pos_in_input_);
+}
+
+// Records that `bytes_read` more bytes of the current input have been consumed.
+auto TremorVorbisDecoder::AfterReadCallback(size_t bytes_read) -> void {
+  pos_in_input_ += bytes_read;
+}
+
+}  // namespace codecs