From 245d9ff4b9cde1f487beed76085a52f3f2d6d26c Mon Sep 17 00:00:00 2001 From: jacqueline Date: Fri, 23 Jun 2023 15:32:11 +1000 Subject: [PATCH] add indexing to the database idk man i wrote most of this in a fugue state whilst high on the couch with my cat --- src/app_console/app_console.cpp | 86 ++++++++++- src/audio/fatfs_audio_input.cpp | 4 +- src/database/CMakeLists.txt | 4 +- src/database/database.cpp | 249 ++++++++++++++++++++++-------- src/database/include/database.hpp | 26 +++- src/database/include/index.hpp | 72 +++++++++ src/database/include/records.hpp | 32 ++-- src/database/include/track.hpp | 37 +++-- src/database/index.cpp | 88 +++++++++++ src/database/records.cpp | 211 ++++++++++++++++++++++++- src/database/tag_parser.cpp | 36 +++-- src/database/track.cpp | 39 ++++- 12 files changed, 774 insertions(+), 110 deletions(-) create mode 100644 src/database/include/index.hpp create mode 100644 src/database/index.cpp diff --git a/src/app_console/app_console.cpp b/src/app_console/app_console.cpp index 1a549653..539fea00 100644 --- a/src/app_console/app_console.cpp +++ b/src/app_console/app_console.cpp @@ -8,10 +8,12 @@ #include +#include #include #include #include #include +#include #include #include @@ -22,6 +24,8 @@ #include "esp_log.h" #include "event_queue.hpp" #include "ff.h" +#include "index.hpp" +#include "track.hpp" namespace console { @@ -158,7 +162,8 @@ int CmdDbTracks(int argc, char** argv) { db->GetTracks(20).get()); while (true) { for (database::Track s : res->values()) { - std::cout << s.tags().title.value_or("[BLANK]") << std::endl; + std::cout << s.tags()[database::Tag::kTitle].value_or("[BLANK]") + << std::endl; } if (res->next_page()) { auto continuation = res->next_page().value(); @@ -180,6 +185,84 @@ void RegisterDbTracks() { esp_console_cmd_register(&cmd); } +int CmdDbIndex(int argc, char** argv) { + std::cout << std::endl; + vTaskDelay(1); + static const std::string usage = "usage: db_index [id] [choices ...]"; + + auto db = AppConsole::sDatabase.lock(); + if (!db) { + std::cout << "no database open" << std::endl; + return 1; + } + + auto indexes = db->GetIndexes(); + if (argc <= 1) { + std::cout << usage << std::endl; + std::cout << "available indexes:" << std::endl; + std::cout << "id\tname" << std::endl; + for (const database::IndexInfo& info : indexes) { + std::cout << static_cast(info.id) << '\t' << info.name << std::endl; + } + return 0; + } + + int index_id = std::atoi(argv[1]); + auto index = std::find_if(indexes.begin(), indexes.end(), + [=](const auto& i) { return i.id == index_id; }); + if (index == indexes.end()) { + std::cout << "bad index id" << std::endl; + return -1; + } + + std::unique_ptr> res( + db->GetTracksByIndex(*index, 20).get()); + int choice_index = 2; + + if (res->values().empty()) { + std::cout << "no entries for this index" << std::endl; + return 1; + } + + while (choice_index < argc) { + int choice = std::atoi(argv[choice_index]); + if (choice >= res->values().size()) { + std::cout << "choice out of range" << std::endl; + return -1; + } + auto cont = res->values().at(choice).Expand(20); + if (!cont) { + std::cout << "more choices than levels" << std::endl; + return 0; + } + res.reset(db->GetPage(&*cont).get()); + choice_index++; + } + + for (database::IndexRecord r : res->values()) { + std::cout << r.text().value_or(""); + if (r.track()) { + std::cout << "\t(id:" << r.track()->data().id() << ")"; + } + std::cout << std::endl; + } + + if (res->next_page()) { + std::cout << "(more results not shown)" << std::endl; + } + + return 0; +} + 
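+// Example session (values illustrative): running "db_index" with no arguments
+// lists the available indexes, "db_index 2" lists the genres in "Tracks by
+// Genre", and "db_index 2 0" expands the first genre and lists its tracks.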
+void RegisterDbIndex() { + esp_console_cmd_t cmd{.command = "db_index", + .help = "queries the database by index", + .hint = NULL, + .func = &CmdDbIndex, + .argtable = NULL}; + esp_console_cmd_register(&cmd); +} + int CmdDbDump(int argc, char** argv) { static const std::string usage = "usage: db_dump"; if (argc != 1) { @@ -232,6 +315,7 @@ auto AppConsole::RegisterExtraComponents() -> void { */ RegisterDbInit(); RegisterDbTracks(); + RegisterDbIndex(); RegisterDbDump(); } diff --git a/src/audio/fatfs_audio_input.cpp b/src/audio/fatfs_audio_input.cpp index ca5b02a1..894ac842 100644 --- a/src/audio/fatfs_audio_input.cpp +++ b/src/audio/fatfs_audio_input.cpp @@ -75,13 +75,13 @@ auto FatfsAudioInput::OpenFile(const std::string& path) -> bool { return false; } - auto stream_type = ContainerToStreamType(tags.encoding); + auto stream_type = ContainerToStreamType(tags.encoding()); if (!stream_type.has_value()) { ESP_LOGE(kTag, "couldn't match container to stream"); return false; } - current_container_ = tags.encoding; + current_container_ = tags.encoding(); if (*stream_type == codecs::StreamType::kPcm && tags.channels && tags.bits_per_sample && tags.channels) { diff --git a/src/database/CMakeLists.txt b/src/database/CMakeLists.txt index e7b1f62c..04e1d5d8 100644 --- a/src/database/CMakeLists.txt +++ b/src/database/CMakeLists.txt @@ -3,9 +3,9 @@ # SPDX-License-Identifier: GPL-3.0-only idf_component_register( - SRCS "env_esp.cpp" "database.cpp" "track.cpp" "records.cpp" "file_gatherer.cpp" "tag_parser.cpp" + SRCS "env_esp.cpp" "database.cpp" "track.cpp" "records.cpp" "file_gatherer.cpp" "tag_parser.cpp" "index.cpp" INCLUDE_DIRS "include" - REQUIRES "result" "span" "esp_psram" "fatfs" "libtags" "komihash" "cbor" "tasks") + REQUIRES "result" "span" "esp_psram" "fatfs" "libtags" "komihash" "cbor" "tasks" "shared_string") target_compile_options(${COMPONENT_LIB} PRIVATE ${EXTRA_WARNINGS}) diff --git a/src/database/database.cpp b/src/database/database.cpp index ac5e4873..1ac5d729 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -13,11 +13,13 @@ #include #include #include +#include #include #include "esp_log.h" #include "ff.h" #include "freertos/projdefs.h" +#include "index.hpp" #include "leveldb/cache.h" #include "leveldb/db.h" #include "leveldb/iterator.h" @@ -130,72 +132,91 @@ Database::~Database() { auto Database::Update() -> std::future { return worker_task_->Dispatch([&]() -> void { - // Stage 1: verify all existing tracks are still valid. - ESP_LOGI(kTag, "verifying existing tracks"); - const leveldb::Snapshot* snapshot = db_->GetSnapshot(); leveldb::ReadOptions read_options; read_options.fill_cache = false; - read_options.snapshot = snapshot; - leveldb::Iterator* it = db_->NewIterator(read_options); - OwningSlice prefix = CreateDataPrefix(); - it->Seek(prefix.slice); - while (it->Valid() && it->key().starts_with(prefix.slice)) { - std::optional track = ParseDataValue(it->value()); - if (!track) { - // The value was malformed. Drop this record. - ESP_LOGW(kTag, "dropping malformed metadata"); + + // Stage 0: discard indexes + // TODO(jacqueline): I think it should be possible to incrementally update + // indexes, but my brain hurts. 
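+    // For now we simply delete every key under the index prefix ('I/'); the
+    // indexes are then rebuilt from scratch as tracks are verified and
+    // discovered below.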
+ ESP_LOGI(kTag, "dropping stale indexes"); + { + leveldb::Iterator* it = db_->NewIterator(read_options); + OwningSlice prefix = EncodeAllIndexesPrefix(); + it->Seek(prefix.slice); + while (it->Valid() && it->key().starts_with(prefix.slice)) { db_->Delete(leveldb::WriteOptions(), it->key()); it->Next(); - continue; } + } - if (track->is_tombstoned()) { - ESP_LOGW(kTag, "skipping tombstoned %lx", track->id()); - it->Next(); - continue; - } + // Stage 1: verify all existing tracks are still valid. + ESP_LOGI(kTag, "verifying existing tracks"); + { + leveldb::Iterator* it = db_->NewIterator(read_options); + OwningSlice prefix = EncodeDataPrefix(); + it->Seek(prefix.slice); + while (it->Valid() && it->key().starts_with(prefix.slice)) { + std::optional track = ParseDataValue(it->value()); + if (!track) { + // The value was malformed. Drop this record. + ESP_LOGW(kTag, "dropping malformed metadata"); + db_->Delete(leveldb::WriteOptions(), it->key()); + it->Next(); + continue; + } - TrackTags tags; - if (!tag_parser_->ReadAndParseTags(track->filepath(), &tags) || - tags.encoding == Encoding::kUnsupported) { - // We couldn't read the tags for this track. Either they were - // malformed, or perhaps the file is missing. Either way, tombstone - // this record. - ESP_LOGW(kTag, "entombing missing #%lx", track->id()); - dbPutTrackData(track->Entomb()); - it->Next(); - continue; - } + if (track->is_tombstoned()) { + ESP_LOGW(kTag, "skipping tombstoned %lx", track->id()); + it->Next(); + continue; + } - uint64_t new_hash = tags.Hash(); - if (new_hash != track->tags_hash()) { - // This track's tags have changed. Since the filepath is exactly the - // same, we assume this is a legitimate correction. Update the - // database. - ESP_LOGI(kTag, "updating hash (%llx -> %llx)", track->tags_hash(), - new_hash); - dbPutTrackData(track->UpdateHash(new_hash)); - dbPutHash(new_hash, track->id()); - } + TrackTags tags{}; + if (!tag_parser_->ReadAndParseTags(track->filepath(), &tags) || + tags.encoding() == Encoding::kUnsupported) { + // We couldn't read the tags for this track. Either they were + // malformed, or perhaps the file is missing. Either way, tombstone + // this record. + ESP_LOGW(kTag, "entombing missing #%lx", track->id()); + dbPutTrackData(track->Entomb()); + it->Next(); + continue; + } - it->Next(); + // At this point, we know that the track still exists in its original + // location. All that's left to do is update any metadata about it. + + uint64_t new_hash = tags.Hash(); + if (new_hash != track->tags_hash()) { + // This track's tags have changed. Since the filepath is exactly the + // same, we assume this is a legitimate correction. Update the + // database. + ESP_LOGI(kTag, "updating hash (%llx -> %llx)", track->tags_hash(), + new_hash); + dbPutTrackData(track->UpdateHash(new_hash)); + dbPutHash(new_hash, track->id()); + } + + dbCreateIndexesForTrack({*track, tags}); + + it->Next(); + } + delete it; } - delete it; - db_->ReleaseSnapshot(snapshot); // Stage 2: search for newly added files. ESP_LOGI(kTag, "scanning for new tracks"); file_gatherer_->FindFiles("", [&](const std::string& path) { TrackTags tags; if (!tag_parser_->ReadAndParseTags(path, &tags) || - tags.encoding == Encoding::kUnsupported) { + tags.encoding() == Encoding::kUnsupported) { // No parseable tags; skip this fiile. return; } // Check for any existing record with the same hash. 
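+      // ('H/<tags hash>' keys map a hash of a track's tags to its id, so a
+      // match here means we have seen this track before, possibly at a
+      // different path.)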
uint64_t hash = tags.Hash(); - OwningSlice key = CreateHashKey(hash); + OwningSlice key = EncodeHashKey(hash); std::optional existing_hash; std::string raw_entry; if (db_->Get(leveldb::ReadOptions(), key.slice, &raw_entry).ok()) { @@ -207,7 +228,11 @@ auto Database::Update() -> std::future { // malformed. Either way, record this as a new track. TrackId id = dbMintNewTrackId(); ESP_LOGI(kTag, "recording new 0x%lx", id); - dbPutTrack(id, path, hash); + + TrackData data(id, path, hash); + dbPutTrackData(data); + dbPutHash(hash, id); + dbCreateIndexesForTrack({data, tags}); return; } @@ -216,12 +241,14 @@ auto Database::Update() -> std::future { // We found a hash that matches, but there's no data record? Weird. TrackData new_data(*existing_hash, path, hash); dbPutTrackData(new_data); + dbCreateIndexesForTrack({*existing_data, tags}); return; } if (existing_data->is_tombstoned()) { ESP_LOGI(kTag, "exhuming track %lu", existing_data->id()); dbPutTrackData(existing_data->Exhume(path)); + dbCreateIndexesForTrack({*existing_data, tags}); } else if (existing_data->filepath() != path) { ESP_LOGW(kTag, "tag hash collision"); } @@ -241,11 +268,41 @@ auto Database::GetTrackPath(TrackId id) }); } +auto Database::GetIndexes() -> std::vector { + // TODO(jacqueline): This probably needs to be async? When we have runtime + // configurable indexes, they will need to come from somewhere. + return { + kAllTracks, + kAlbumsByArtist, + kTracksByGenre, + }; +} + +auto Database::GetTracksByIndex(const IndexInfo& index, std::size_t page_size) + -> std::future*> { + return worker_task_->Dispatch*>( + [=, this]() -> Result* { + IndexKey::Header header{ + .id = index.id, + .depth = 0, + .components_hash = 0, + }; + OwningSlice prefix = EncodeIndexPrefix(header); + Continuation c{.iterator = nullptr, + .prefix = prefix.data, + .start_key = prefix.data, + .forward = true, + .was_prev_forward = true, + .page_size = page_size}; + return dbGetPage(c); + }); +} + auto Database::GetTracks(std::size_t page_size) -> std::future*> { return worker_task_->Dispatch*>([=, this]() -> Result* { Continuation c{.iterator = nullptr, - .prefix = CreateDataPrefix().data, - .start_key = CreateDataPrefix().data, + .prefix = EncodeDataPrefix().data, + .start_key = EncodeDataPrefix().data, .forward = true, .was_prev_forward = true, .page_size = page_size}; @@ -276,6 +333,8 @@ auto Database::GetPage(Continuation* c) -> std::future*> { template auto Database::GetPage(Continuation* c) -> std::future*>; +template auto Database::GetPage(Continuation* c) + -> std::future*>; template auto Database::GetPage(Continuation* c) -> std::future*>; @@ -300,23 +359,23 @@ auto Database::dbMintNewTrackId() -> TrackId { } auto Database::dbEntomb(TrackId id, uint64_t hash) -> void { - OwningSlice key = CreateHashKey(hash); - OwningSlice val = CreateHashValue(id); + OwningSlice key = EncodeHashKey(hash); + OwningSlice val = EncodeHashValue(id); if (!db_->Put(leveldb::WriteOptions(), key.slice, val.slice).ok()) { ESP_LOGE(kTag, "failed to entomb #%llx (id #%lx)", hash, id); } } auto Database::dbPutTrackData(const TrackData& s) -> void { - OwningSlice key = CreateDataKey(s.id()); - OwningSlice val = CreateDataValue(s); + OwningSlice key = EncodeDataKey(s.id()); + OwningSlice val = EncodeDataValue(s); if (!db_->Put(leveldb::WriteOptions(), key.slice, val.slice).ok()) { ESP_LOGE(kTag, "failed to write data for #%lx", s.id()); } } auto Database::dbGetTrackData(TrackId id) -> std::optional { - OwningSlice key = CreateDataKey(id); + OwningSlice key = EncodeDataKey(id); 
std::string raw_val; if (!db_->Get(leveldb::ReadOptions(), key.slice, &raw_val).ok()) { ESP_LOGW(kTag, "no key found for #%lx", id); @@ -326,15 +385,15 @@ auto Database::dbGetTrackData(TrackId id) -> std::optional { } auto Database::dbPutHash(const uint64_t& hash, TrackId i) -> void { - OwningSlice key = CreateHashKey(hash); - OwningSlice val = CreateHashValue(i); + OwningSlice key = EncodeHashKey(hash); + OwningSlice val = EncodeHashValue(i); if (!db_->Put(leveldb::WriteOptions(), key.slice, val.slice).ok()) { ESP_LOGE(kTag, "failed to write hash for #%lx", i); } } auto Database::dbGetHash(const uint64_t& hash) -> std::optional { - OwningSlice key = CreateHashKey(hash); + OwningSlice key = EncodeHashKey(hash); std::string raw_val; if (!db_->Get(leveldb::ReadOptions(), key.slice, &raw_val).ok()) { ESP_LOGW(kTag, "no key found for hash #%llx", hash); @@ -343,11 +402,13 @@ auto Database::dbGetHash(const uint64_t& hash) -> std::optional { return ParseHashValue(raw_val); } -auto Database::dbPutTrack(TrackId id, - const std::string& path, - const uint64_t& hash) -> void { - dbPutTrackData(TrackData(id, path, hash)); - dbPutHash(hash, id); +auto Database::dbCreateIndexesForTrack(Track track) -> void { + for (const IndexInfo& index : GetIndexes()) { + leveldb::WriteBatch writes; + if (Index(index, track, &writes)) { + db_->Write(leveldb::WriteOptions(), &writes); + } + } } template @@ -474,6 +535,31 @@ template auto Database::dbGetPage(const Continuation& c) template auto Database::dbGetPage( const Continuation& c) -> Result*; +template <> +auto Database::ParseRecord(const leveldb::Slice& key, + const leveldb::Slice& val) + -> std::optional { + std::optional data = ParseIndexKey(key); + if (!data) { + return {}; + } + + // If there was a track id included for this key, then this is a leaf record. + // Fetch the actual track data instead of relying on the information in the + // key. 
+ std::optional track; + if (data->track) { + std::optional track_data = dbGetTrackData(*data->track); + TrackTags track_tags; + if (track_data && + tag_parser_->ReadAndParseTags(track_data->filepath(), &track_tags)) { + track.emplace(*track_data, track_tags); + } + } + + return IndexRecord(*data, track); +} + template <> auto Database::ParseRecord(const leveldb::Slice& key, const leveldb::Slice& val) @@ -510,13 +596,46 @@ auto Database::ParseRecord(const leveldb::Slice& key, } } } - stream << "\tval: 0x"; - std::string str = val.ToString(); - for (int i = 0; i < val.size(); i++) { - stream << std::hex << std::setfill('0') << std::setw(2) - << static_cast(str[i]); + if (!val.empty()) { + stream << "\tval: 0x"; + std::string str = val.ToString(); + for (int i = 0; i < val.size(); i++) { + stream << std::hex << std::setfill('0') << std::setw(2) + << static_cast(str[i]); + } } return stream.str(); } +IndexRecord::IndexRecord(const IndexKey& key, std::optional track) + : key_(key), track_(track) {} + +auto IndexRecord::text() const -> std::optional { + if (track_) { + return track_->TitleOrFilename(); + } + return key_.item; +} + +auto IndexRecord::track() const -> std::optional { + return track_; +} + +auto IndexRecord::Expand(std::size_t page_size) const + -> std::optional> { + if (track_) { + return {}; + } + IndexKey::Header new_header = ExpandHeader(key_.header, key_.item); + OwningSlice new_prefix = EncodeIndexPrefix(new_header); + return Continuation{ + .iterator = nullptr, + .prefix = new_prefix.data, + .start_key = new_prefix.data, + .forward = true, + .was_prev_forward = true, + .page_size = page_size, + }; +} + } // namespace database diff --git a/src/database/include/database.hpp b/src/database/include/database.hpp index 8fecc5f6..77a17b75 100644 --- a/src/database/include/database.hpp +++ b/src/database/include/database.hpp @@ -16,6 +16,7 @@ #include #include "file_gatherer.hpp" +#include "index.hpp" #include "leveldb/cache.h" #include "leveldb/db.h" #include "leveldb/iterator.h" @@ -23,6 +24,7 @@ #include "leveldb/slice.h" #include "records.hpp" #include "result.hpp" +#include "shared_string.h" #include "tag_parser.hpp" #include "tasks.hpp" #include "track.hpp" @@ -66,6 +68,20 @@ class Result { std::optional> prev_page_; }; +class IndexRecord { + public: + explicit IndexRecord(const IndexKey&, std::optional); + + auto text() const -> std::optional; + auto track() const -> std::optional; + + auto Expand(std::size_t) const -> std::optional>; + + private: + IndexKey key_; + std::optional track_; +}; + class Database { public: enum DatabaseError { @@ -84,6 +100,9 @@ class Database { auto GetTrackPath(TrackId id) -> std::future>; + auto GetIndexes() -> std::vector; + auto GetTracksByIndex(const IndexInfo& index, std::size_t page_size) + -> std::future*>; auto GetTracks(std::size_t page_size) -> std::future*>; auto GetDump(std::size_t page_size) -> std::future*>; @@ -118,8 +137,7 @@ class Database { auto dbGetTrackData(TrackId id) -> std::optional; auto dbPutHash(const uint64_t& hash, TrackId i) -> void; auto dbGetHash(const uint64_t& hash) -> std::optional; - auto dbPutTrack(TrackId id, const std::string& path, const uint64_t& hash) - -> void; + auto dbCreateIndexesForTrack(Track track) -> void; template auto dbGetPage(const Continuation& c) -> Result*; @@ -129,6 +147,10 @@ class Database { -> std::optional; }; +template <> +auto Database::ParseRecord(const leveldb::Slice& key, + const leveldb::Slice& val) + -> std::optional; template <> auto Database::ParseRecord(const 
leveldb::Slice& key,
                                            const leveldb::Slice& val)
diff --git a/src/database/include/index.hpp b/src/database/include/index.hpp
new file mode 100644
index 00000000..17229164
--- /dev/null
+++ b/src/database/include/index.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright 2023 jacqueline
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+#pragma once
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include "leveldb/db.h"
+#include "leveldb/slice.h"
+
+#include "leveldb/write_batch.h"
+#include "shared_string.h"
+#include "track.hpp"
+
+namespace database {
+
+typedef uint8_t IndexId;
+
+struct IndexInfo {
+  // Unique id for this index
+  IndexId id;
+  // Localised, user-friendly description of this index. e.g. "Albums by Artist"
+  // or "All Tracks".
+  std::string name;
+  // Specifier for how this index breaks down the database.
+  std::vector<Tag> components;
+};
+
+struct IndexKey {
+  struct Header {
+    // The index that this key was created for.
+    IndexId id;
+    // The number of components of IndexInfo that have already been filtered.
+    // For example, if an index consists of { kGenre, kArtist }, and this key
+    // represents an artist, then depth = 1.
+    std::uint8_t depth;
+    // The cumulative hash of all filtered components, in order. For example, if
+    // an index consists of { kArtist, kAlbum, kTitle }, and we are at depth = 2
+    // then this may contain hash(hash("Jacqueline"), "My Cool Album").
+    std::uint64_t components_hash;
+  };
+  Header header;
+
+  // The filterable / selectable item that this key represents. "Jacqueline" for
+  // kArtist, "My Cool Album" for kAlbum, etc.
+  std::optional<shared_string> item;
+  // If this is a leaf component, the track id for this record.
+  // This could reasonably be the value for a record, but we keep it as a part
+  // of the key to help with disambiguation.
+  std::optional<TrackId> track;
+};
+
+auto Index(const IndexInfo&, const Track&, leveldb::WriteBatch*) -> bool;
+auto ExpandHeader(const IndexKey::Header&, const std::optional<shared_string>&)
+    -> IndexKey::Header;
+
+// Predefined indexes
+// TODO(jacqueline): Make these defined at runtime! :)
+
+extern const IndexInfo kAlbumsByArtist;
+extern const IndexInfo kTracksByGenre;
+extern const IndexInfo kAllTracks;
+
+} // namespace database
diff --git a/src/database/include/records.hpp b/src/database/include/records.hpp
index 95a1a1e8..58f29b20 100644
--- a/src/database/include/records.hpp
+++ b/src/database/include/records.hpp
@@ -9,10 +9,14 @@
 #include
 #include
+#include
+#include
 
 #include "leveldb/db.h"
 #include "leveldb/slice.h"
 
+#include "index.hpp"
+#include "shared_string.h"
 #include "track.hpp"
 
 namespace database {
@@ -34,39 +38,49 @@ class OwningSlice {
  * Returns the prefix added to every TrackData key. This can be used to iterate
  * over every data record in the database.
  */
-auto CreateDataPrefix() -> OwningSlice;
+auto EncodeDataPrefix() -> OwningSlice;
 
-/* Creates a data key for a track with the specified id. */
-auto CreateDataKey(const TrackId& id) -> OwningSlice;
+/* Encodes a data key for a track with the specified id. */
+auto EncodeDataKey(const TrackId& id) -> OwningSlice;
 
 /*
  * Encodes a TrackData instance into bytes, in preparation for storing it within
  * the database. This encoding is consistent, and will remain stable over time.
 */
-auto CreateDataValue(const TrackData& track) -> OwningSlice;
+auto EncodeDataValue(const TrackData& track) -> OwningSlice;
 
 /*
- * Parses bytes previously encoded via CreateDataValue back into a TrackData.
+ * Parses bytes previously encoded via EncodeDataValue back into a TrackData.
* May return nullopt if parsing fails. */ auto ParseDataValue(const leveldb::Slice& slice) -> std::optional; -/* Creates a hash key for the specified hash. */ -auto CreateHashKey(const uint64_t& hash) -> OwningSlice; +/* Encodes a hash key for the specified hash. */ +auto EncodeHashKey(const uint64_t& hash) -> OwningSlice; /* * Encodes a hash value (at this point just a track id) into bytes, in * preparation for storing within the database. This encoding is consistent, and * will remain stable over time. */ -auto CreateHashValue(TrackId id) -> OwningSlice; +auto EncodeHashValue(TrackId id) -> OwningSlice; /* - * Parses bytes previously encoded via CreateHashValue back into a track id. May + * Parses bytes previously encoded via EncodeHashValue back into a track id. May * return nullopt if parsing fails. */ auto ParseHashValue(const leveldb::Slice&) -> std::optional; +/* Encodes a prefix that matches all index keys, of all ids and depths. */ +auto EncodeAllIndexesPrefix() -> OwningSlice; + +/* + */ +auto EncodeIndexPrefix(const IndexKey::Header&) -> OwningSlice; + +auto EncodeIndexKey(const IndexKey&) -> OwningSlice; +auto ParseIndexKey(const leveldb::Slice&) -> std::optional; + /* Encodes a TrackId as bytes. */ auto TrackIdToBytes(TrackId id) -> OwningSlice; diff --git a/src/database/include/track.hpp b/src/database/include/track.hpp index 5a0c0ca8..e3f94db4 100644 --- a/src/database/include/track.hpp +++ b/src/database/include/track.hpp @@ -8,11 +8,14 @@ #include +#include +#include #include #include #include #include "leveldb/db.h" +#include "shared_string.h" #include "span.hpp" namespace database { @@ -41,25 +44,33 @@ enum class Encoding { kFlac = 4, }; +enum class Tag { + kTitle = 0, + kArtist = 1, + kAlbum = 2, + kAlbumTrack = 3, + kGenre = 4, +}; + /* * Owning container for tag-related track metadata that was extracted from a * file. */ -struct TrackTags { - Encoding encoding; - std::optional title; - - // TODO(jacqueline): It would be nice to use shared_ptr's for the artist and - // album, since there's likely a fair number of duplicates for each - // (especially the former). +class TrackTags { + public: + auto encoding() const -> Encoding { return encoding_; }; + auto encoding(Encoding e) -> void { encoding_ = e; }; - std::optional artist; - std::optional album; + TrackTags() : encoding_(Encoding::kUnsupported) {} std::optional channels; std::optional sample_rate; std::optional bits_per_sample; + auto set(const Tag& key, const std::string& val) -> void; + auto at(const Tag& key) const -> std::optional; + auto operator[](const Tag& key) const -> std::optional; + /* * Returns a hash of the 'identifying' tags of this track. 
That is, a hash * that can be used to determine if one track is likely the same as another, @@ -69,6 +80,12 @@ struct TrackTags { auto Hash() const -> uint64_t; bool operator==(const TrackTags&) const = default; + TrackTags& operator=(const TrackTags&) = default; + TrackTags(const TrackTags&) = default; + + private: + Encoding encoding_; + std::map tags_; }; /* @@ -156,6 +173,8 @@ class Track { auto data() const -> const TrackData& { return data_; } auto tags() const -> const TrackTags& { return tags_; } + auto TitleOrFilename() const -> shared_string; + bool operator==(const Track&) const = default; Track operator=(const Track& other) const { return Track(other); } diff --git a/src/database/index.cpp b/src/database/index.cpp new file mode 100644 index 00000000..a828578d --- /dev/null +++ b/src/database/index.cpp @@ -0,0 +1,88 @@ +/* + * Copyright 2023 jacqueline + * + * SPDX-License-Identifier: GPL-3.0-only + */ + +#include "index.hpp" +#include +#include +#include "komihash.h" +#include "leveldb/write_batch.h" +#include "records.hpp" + +namespace database { + +const IndexInfo kAlbumsByArtist{ + .id = 1, + .name = "Albums by Artist", + .components = {Tag::kArtist, Tag::kAlbum, Tag::kAlbumTrack}, +}; + +const IndexInfo kTracksByGenre{ + .id = 2, + .name = "Tracks by Genre", + .components = {Tag::kGenre, Tag::kTitle}, +}; + +const IndexInfo kAllTracks{ + .id = 3, + .name = "All Tracks", + .components = {Tag::kTitle}, +}; + +auto Index(const IndexInfo& info, const Track& t, leveldb::WriteBatch* batch) + -> bool { + IndexKey key{ + .header{ + .id = info.id, + .depth = 0, + .components_hash = 0, + }, + .item = {}, + .track = {}, + }; + + for (std::uint8_t i = 0; i < info.components.size(); i++) { + // Fill in the text for this depth. + auto text = t.tags().at(info.components.at(i)); + if (text) { + key.item = *text; + } else { + key.item = {}; + } + + // If this is the last component, then we should also fill in the track id. + if (i == info.components.size() - 1) { + key.track = t.data().id(); + } else { + key.track = {}; + } + + auto encoded = EncodeIndexKey(key); + batch->Put(encoded.slice, leveldb::Slice{}); + + // If there are more components after this, then we need to finish by + // narrowing the header with the current title. + if (i < info.components.size() - 1) { + key.header = ExpandHeader(key.header, key.item); + } + } + return true; +} + +auto ExpandHeader(const IndexKey::Header& header, + const std::optional& component) + -> IndexKey::Header { + IndexKey::Header ret{header}; + ret.depth++; + if (component) { + ret.components_hash = + komihash(component->data(), component->size(), ret.components_hash); + } else { + ret.components_hash = komihash(NULL, 0, ret.components_hash); + } + return ret; +} + +} // namespace database diff --git a/src/database/records.cpp b/src/database/records.cpp index 49e5db0b..72608eb0 100644 --- a/src/database/records.cpp +++ b/src/database/records.cpp @@ -8,20 +8,43 @@ #include +#include #include +#include #include #include "cbor.h" #include "esp_log.h" +#include "index.hpp" +#include "komihash.h" +#include "shared_string.h" #include "track.hpp" +// As LevelDB is a key-value store, each record in the database consists of a +// key and an optional value. +// +// Values, when present, are always cbor-encoded. This is fast, compact, and +// very easy to evolve over time due to its inclusion of type information. +// +// Keys have a more complicated scheme, as for performance we rely heavily on +// LevelDB's sorted storage format. 
We must therefore worry about clustering of
+// similar records, and the sortability of our encoding format.
+// Each kind of key consists of a single-byte prefix, then one or more
+// fields separated by null (0) bytes. Each field may be cbor-encoded, or may
+// use some bespoke encoding; it depends on whether we want to be able to sort
+// by that field.
+// For debugging and discussion purposes, we represent field separators
+// textually as '/', and write each field as its hex encoding. e.g. a data key
+// for the track with id 17 would be written as 'D / 0x11'.
+
 namespace database {
 
 static const char* kTag = "RECORDS";
 
 static const char kDataPrefix = 'D';
 static const char kHashPrefix = 'H';
+static const char kIndexPrefix = 'I';
 static const char kFieldSeparator = '\0';
 
 /*
@@ -39,6 +62,8 @@ static const char kFieldSeparator = '\0';
 template <typename T>
 auto cbor_encode(uint8_t** out_buf, T fn) -> std::size_t {
   // First pass: work out how many bytes we will encode into.
+  // FIXME: With benchmarking to help, we could consider preallocating a small
+  // buffer here to do the whole encoding in one pass.
   CborEncoder size_encoder;
   cbor_encoder_init(&size_encoder, NULL, 0, 0);
   std::invoke(fn, &size_encoder);
@@ -55,19 +80,21 @@ auto cbor_encode(uint8_t** out_buf, T fn) -> std::size_t {
 
 OwningSlice::OwningSlice(std::string d) : data(d), slice(data) {}
 
-auto CreateDataPrefix() -> OwningSlice {
+/* 'D/' */
+auto EncodeDataPrefix() -> OwningSlice {
   char data[2] = {kDataPrefix, kFieldSeparator};
   return OwningSlice({data, 2});
 }
 
-auto CreateDataKey(const TrackId& id) -> OwningSlice {
+/* 'D/ 0xACAB' */
+auto EncodeDataKey(const TrackId& id) -> OwningSlice {
   std::ostringstream output;
   output.put(kDataPrefix).put(kFieldSeparator);
   output << TrackIdToBytes(id).data;
   return OwningSlice(output.str());
 }
 
-auto CreateDataValue(const TrackData& track) -> OwningSlice {
+auto EncodeDataValue(const TrackData& track) -> OwningSlice {
   uint8_t* buf;
   std::size_t buf_len = cbor_encode(&buf, [&](CborEncoder* enc) {
     CborEncoder array_encoder;
@@ -179,7 +206,8 @@ auto ParseDataValue(const leveldb::Slice& slice) -> std::optional<TrackData> {
   return TrackData(id, path, hash, play_count, is_tombstoned);
 }
 
-auto CreateHashKey(const uint64_t& hash) -> OwningSlice {
+/* 'H/ 0xBEEF' */
+auto EncodeHashKey(const uint64_t& hash) -> OwningSlice {
   std::ostringstream output;
   output.put(kHashPrefix).put(kFieldSeparator);
 
@@ -197,10 +225,183 @@ auto ParseHashValue(const leveldb::Slice& slice) -> std::optional<TrackId> {
   return BytesToTrackId(slice.ToString());
 }
 
-auto CreateHashValue(TrackId id) -> OwningSlice {
+auto EncodeHashValue(TrackId id) -> OwningSlice {
   return TrackIdToBytes(id);
 }
 
+/* 'I/' */
+auto EncodeAllIndexesPrefix() -> OwningSlice {
+  char data[2] = {kIndexPrefix, kFieldSeparator};
+  return OwningSlice({data, 2});
+}
+
+auto AppendIndexHeader(const IndexKey::Header& header, std::ostringstream* out)
+    -> void {
+  *out << kIndexPrefix << kFieldSeparator;
+
+  // Construct the header.
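+  // The header is a cbor array of [id, depth, components_hash], followed by a
+  // field separator.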
+ uint8_t* buf; + std::size_t buf_len = cbor_encode(&buf, [&](CborEncoder* enc) { + CborEncoder array_encoder; + CborError err; + err = cbor_encoder_create_array(enc, &array_encoder, 3); + if (err != CborNoError && err != CborErrorOutOfMemory) { + ESP_LOGE(kTag, "encoding err %u", err); + return; + } + err = cbor_encode_uint(&array_encoder, header.id); + if (err != CborNoError && err != CborErrorOutOfMemory) { + ESP_LOGE(kTag, "encoding err %u", err); + return; + } + err = cbor_encode_uint(&array_encoder, header.depth); + if (err != CborNoError && err != CborErrorOutOfMemory) { + ESP_LOGE(kTag, "encoding err %u", err); + return; + } + err = cbor_encode_uint(&array_encoder, header.components_hash); + if (err != CborNoError && err != CborErrorOutOfMemory) { + ESP_LOGE(kTag, "encoding err %u", err); + return; + } + err = cbor_encoder_close_container(enc, &array_encoder); + if (err != CborNoError && err != CborErrorOutOfMemory) { + ESP_LOGE(kTag, "encoding err %u", err); + return; + } + }); + std::string encoded{reinterpret_cast(buf), buf_len}; + delete buf; + *out << encoded << kFieldSeparator; +} + +auto EncodeIndexPrefix(const IndexKey::Header& header) -> OwningSlice { + std::ostringstream out; + AppendIndexHeader(header, &out); + return OwningSlice(out.str()); +} + +/* + * 'I/0xa2/0x686921/0xb9' + * ^ --- trailer + * ^ --- component ("hi!") + * ^ -------- header + * + * The components *must* be encoded in a way that is easy to sort + * lexicographically. The header and footer do not have this restriction, so + * cbor is fine. + * + * We store grouping information within the header; which index, filtered + * components. We store disambiguation information in the trailer; just a track + * id for now, but could reasonably be something like 'release year' as well. + */ +auto EncodeIndexKey(const IndexKey& key) -> OwningSlice { + std::ostringstream out; + + // Construct the header. + AppendIndexHeader(key.header, &out); + + // The component should already be UTF-8 encoded, so just write it. + if (key.item) { + out << *key.item; + } + + // Construct the footer. 
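+  // The footer is a field separator followed by the track id (when this is a
+  // leaf record), which keeps keys for tracks with otherwise identical
+  // components distinct.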
+ out << kFieldSeparator; + if (key.track) { + out << TrackIdToBytes(*key.track).data; + } + return OwningSlice(out.str()); +} + +auto ParseIndexKey(const leveldb::Slice& slice) -> std::optional { + IndexKey result{}; + + auto prefix = EncodeAllIndexesPrefix(); + if (!slice.starts_with(prefix.data)) { + return {}; + } + + std::string key_data = slice.ToString().substr(prefix.data.size()); + std::size_t header_length = 0; + { + CborParser parser; + CborValue container; + CborError err; + err = cbor_parser_init(reinterpret_cast(key_data.data()), + key_data.size(), 0, &parser, &container); + if (err != CborNoError || !cbor_value_is_container(&container)) { + return {}; + } + + CborValue val; + err = cbor_value_enter_container(&container, &val); + if (err != CborNoError || !cbor_value_is_unsigned_integer(&val)) { + return {}; + } + + uint64_t raw_int; + err = cbor_value_get_uint64(&val, &raw_int); + if (err != CborNoError) { + return {}; + } + result.header.id = raw_int; + err = cbor_value_advance(&val); + if (err != CborNoError || !cbor_value_is_unsigned_integer(&val)) { + return {}; + } + + err = cbor_value_get_uint64(&val, &raw_int); + if (err != CborNoError) { + return {}; + } + result.header.depth = raw_int; + err = cbor_value_advance(&val); + if (err != CborNoError || !cbor_value_is_unsigned_integer(&val)) { + return {}; + } + + err = cbor_value_get_uint64(&val, &raw_int); + if (err != CborNoError) { + return {}; + } + result.header.components_hash = raw_int; + err = cbor_value_advance(&val); + if (err != CborNoError || !cbor_value_at_end(&val)) { + return {}; + } + + const uint8_t* next_byte = cbor_value_get_next_byte(&val); + header_length = + next_byte - reinterpret_cast(key_data.data()); + } + + if (header_length == 0) { + return {}; + } + + if (header_length >= key_data.size()) { + return {}; + } + + std::istringstream in(key_data.substr(header_length + 1)); + std::stringbuf buffer{}; + + in.get(buffer, kFieldSeparator); + if (buffer.str().size() > 0) { + result.item = buffer.str(); + } + + buffer = {}; + in.get(buffer); + if (buffer.str().size() > 1) { + std::string raw_id = buffer.str().substr(1); + result.track = BytesToTrackId(raw_id); + } + + return result; +} + auto TrackIdToBytes(TrackId id) -> OwningSlice { uint8_t buf[8]; CborEncoder enc; diff --git a/src/database/tag_parser.cpp b/src/database/tag_parser.cpp index 83b0a796..49febe27 100644 --- a/src/database/tag_parser.cpp +++ b/src/database/tag_parser.cpp @@ -12,6 +12,23 @@ namespace database { +auto convert_tag(int tag) -> std::optional { + switch (tag) { + case Ttitle: + return Tag::kTitle; + case Tartist: + return Tag::kArtist; + case Talbum: + return Tag::kAlbum; + case Ttrack: + return Tag::kAlbumTrack; + case Tgenre: + return Tag::kGenre; + default: + return {}; + } +} + namespace libtags { struct Aux { @@ -55,12 +72,9 @@ static void tag(Tagctx* ctx, int size, Tagread f) { Aux* aux = reinterpret_cast(ctx->aux); - if (t == Ttitle) { - aux->tags->title = v; - } else if (t == Tartist) { - aux->tags->artist = v; - } else if (t == Talbum) { - aux->tags->album = v; + auto tag = convert_tag(t); + if (tag) { + aux->tags->set(*tag, v); } } @@ -108,19 +122,19 @@ auto TagParserImpl::ReadAndParseTags(const std::string& path, TrackTags* out) switch (ctx.format) { case Fmp3: - out->encoding = Encoding::kMp3; + out->encoding(Encoding::kMp3); break; case Fogg: - out->encoding = Encoding::kOgg; + out->encoding(Encoding::kOgg); break; case Fflac: - out->encoding = Encoding::kFlac; + out->encoding(Encoding::kFlac); break; case Fwav: - 
out->encoding = Encoding::kWav; + out->encoding(Encoding::kWav); break; default: - out->encoding = Encoding::kUnsupported; + out->encoding(Encoding::kUnsupported); } if (ctx.channels > 0) { diff --git a/src/database/track.cpp b/src/database/track.cpp index 00acc1f6..dc33701d 100644 --- a/src/database/track.cpp +++ b/src/database/track.cpp @@ -7,11 +7,28 @@ #include "track.hpp" #include +#include "shared_string.h" namespace database { +auto TrackTags::set(const Tag& key, const std::string& val) -> void { + tags_[key] = val; +} + +auto TrackTags::at(const Tag& key) const -> std::optional { + if (tags_.contains(key)) { + return tags_.at(key); + } + return {}; +} + +auto TrackTags::operator[](const Tag& key) const + -> std::optional { + return at(key); +} + /* Helper function to update a komihash stream with a std::string. */ -auto HashString(komihash_stream_t* stream, std::string str) -> void { +auto HashString(komihash_stream_t* stream, const std::string& str) -> void { komihash_stream_update(stream, str.c_str(), str.length()); } @@ -24,9 +41,11 @@ auto TrackTags::Hash() const -> uint64_t { // tags at all. komihash_stream_t stream; komihash_stream_init(&stream, 0); - HashString(&stream, title.value_or("")); - HashString(&stream, artist.value_or("")); - HashString(&stream, album.value_or("")); + + HashString(&stream, at(Tag::kTitle).value_or("")); + HashString(&stream, at(Tag::kArtist).value_or("")); + HashString(&stream, at(Tag::kAlbum).value_or("")); + return komihash_stream_final(&stream); } @@ -48,4 +67,16 @@ void swap(Track& first, Track& second) { second = temp; } +auto Track::TitleOrFilename() const -> shared_string { + auto title = tags().at(Tag::kTitle); + if (title) { + return *title; + } + auto start = data().filepath().find_last_of('/'); + if (start == std::string::npos) { + return data().filepath(); + } + return data().filepath().substr(start); +} + } // namespace database
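Not part of the patch: a rough sketch of how a caller might drive the new index API end to end, mirroring what CmdDbIndex does above. The function name and the page size of 20 are made up for illustration; the types and calls are the ones added in this change.

#include <iostream>
#include <memory>

#include "database.hpp"
#include "index.hpp"

// Walks the "Albums by Artist" index, always picking the first record at each
// level, until it reaches a leaf record that names a concrete track.
auto BrowseFirstAlbumByArtist(database::Database* db) -> void {
  std::unique_ptr<database::Result<database::IndexRecord>> page(
      db->GetTracksByIndex(database::kAlbumsByArtist, 20).get());

  while (!page->values().empty()) {
    database::IndexRecord first = page->values().at(0);
    std::cout << first.text().value_or("[BLANK]") << std::endl;

    if (first.track()) {
      // Leaf record: the index has been narrowed down to a single track.
      std::cout << "track id: " << first.track()->data().id() << std::endl;
      return;
    }

    // Interior record (an artist or an album): expand it into a continuation
    // for the next level down, then fetch that page.
    auto next_level = first.Expand(20);
    if (!next_level) {
      return;
    }
    page.reset(db->GetPage(&*next_level).get());
  }
}

A browsing UI would presumably run this same Expand()/GetPage() loop, one level per screen.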