From 2086ab09b8d89c27f524d82a68b9a2035ea02436 Mon Sep 17 00:00:00 2001 From: jacqueline Date: Tue, 24 Oct 2023 16:29:46 +1100 Subject: [PATCH] Implement incremental updates of database indexes This makes rescanning the library *so* much faster. Yay! --- dependencies.lock | 2 +- src/database/database.cpp | 107 ++++++++++++++++++++++-------- src/database/include/database.hpp | 6 ++ src/database/include/index.hpp | 6 +- src/database/include/records.hpp | 3 + src/database/include/track.hpp | 6 ++ src/database/index.cpp | 10 +-- src/database/records.cpp | 24 ++++++- 8 files changed, 127 insertions(+), 37 deletions(-) diff --git a/dependencies.lock b/dependencies.lock index 7f376ce3..a9723d1e 100644 --- a/dependencies.lock +++ b/dependencies.lock @@ -4,6 +4,6 @@ dependencies: source: type: idf version: 5.1.1 -manifest_hash: 7e6103d8e34e5eabd5a6a51c49836c58f1686c3aa287f2e288b1ad76243aa61a +manifest_hash: b9761e0028130d307b778c710e5dd39fb3c942d8084ed429d448d938957fb0e6 target: esp32 version: 1.0.0 diff --git a/src/database/database.cpp b/src/database/database.cpp index 0a092774..a0bd4fd2 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -21,6 +21,7 @@ #include "ff.h" #include "freertos/projdefs.h" #include "index.hpp" +#include "komihash.h" #include "leveldb/cache.h" #include "leveldb/db.h" #include "leveldb/iterator.h" @@ -48,7 +49,7 @@ static const char* kTag = "DB"; static const char kDbPath[] = "/.tangara-db"; static const char kKeyDbVersion[] = "schema_version"; -static const uint8_t kCurrentDbVersion = 1; +static const uint8_t kCurrentDbVersion = 2; static const char kKeyTrackId[] = "next_track_id"; @@ -163,20 +164,6 @@ auto Database::Update() -> std::future { leveldb::ReadOptions read_options; read_options.fill_cache = false; - // Stage 0: discard indexes - // TODO(jacqueline): I think it should be possible to incrementally update - // indexes, but my brain hurts. 
- ESP_LOGI(kTag, "dropping stale indexes"); - { - std::unique_ptr it{db_->NewIterator(read_options)}; - std::string prefix = EncodeAllIndexesPrefix(); - it->Seek(prefix); - while (it->Valid() && it->key().starts_with(prefix)) { - db_->Delete(leveldb::WriteOptions(), it->key()); - it->Next(); - } - } - std::pair newest_track{0, 0}; // Stage 1: verify all existing tracks are still valid. @@ -185,8 +172,8 @@ auto Database::Update() -> std::future { uint64_t num_processed = 0; std::unique_ptr it{db_->NewIterator(read_options)}; std::string prefix = EncodeDataPrefix(); - it->Seek(prefix); - while (it->Valid() && it->key().starts_with(prefix)) { + for (it->Seek(prefix); it->Valid() && it->key().starts_with(prefix); + it->Next()) { num_processed++; events::Ui().Dispatch(event::UpdateProgress{ .stage = event::UpdateProgress::Stage::kVerifyingExistingTracks, @@ -198,13 +185,11 @@ auto Database::Update() -> std::future { // The value was malformed. Drop this record. ESP_LOGW(kTag, "dropping malformed metadata"); db_->Delete(leveldb::WriteOptions(), it->key()); - it->Next(); continue; } if (track->is_tombstoned) { ESP_LOGW(kTag, "skipping tombstoned %lx", track->id); - it->Next(); continue; } @@ -221,6 +206,7 @@ auto Database::Update() -> std::future { } if (modified_at == track->modified_at) { newest_track = std::max(modified_at, newest_track); + continue; } else { track->modified_at = modified_at; } @@ -232,9 +218,9 @@ auto Database::Update() -> std::future { // malformed, or perhaps the file is missing. Either way, tombstone // this record. ESP_LOGW(kTag, "entombing missing #%lx", track->id); + dbRemoveIndexes(track); track->is_tombstoned = true; dbPutTrackData(*track); - it->Next(); continue; } @@ -248,16 +234,13 @@ auto Database::Update() -> std::future { // database. 
ESP_LOGI(kTag, "updating hash (%llx -> %llx)", track->tags_hash, new_hash); + dbRemoveIndexes(track); + track->tags_hash = new_hash; + dbIngestTagHashes(*tags, track->individual_tag_hashes); dbPutTrackData(*track); dbPutHash(new_hash, track->id); } - - Track t{track, tags}; - - dbCreateIndexesForTrack(t); - - it->Next(); } } @@ -306,6 +289,7 @@ auto Database::Update() -> std::future { data->filepath = path; data->tags_hash = hash; data->modified_at = modified; + dbIngestTagHashes(*tags, data->individual_tag_hashes); dbPutTrackData(*data); dbPutHash(hash, id); @@ -322,6 +306,7 @@ auto Database::Update() -> std::future { new_data->filepath = path; new_data->tags_hash = hash; new_data->modified_at = modified; + dbIngestTagHashes(*tags, new_data->individual_tag_hashes); dbPutTrackData(*new_data); auto t = std::make_shared(new_data, tags); dbCreateIndexesForTrack(*t); @@ -554,10 +539,76 @@ auto Database::dbGetHash(const uint64_t& hash) -> std::optional { auto Database::dbCreateIndexesForTrack(const Track& track) -> void { for (const IndexInfo& index : GetIndexes()) { leveldb::WriteBatch writes; - if (Index(index, track, &writes)) { - db_->Write(leveldb::WriteOptions(), &writes); + auto entries = Index(index, track); + for (const auto& it : entries) { + writes.Put(EncodeIndexKey(it.first), + {it.second.data(), it.second.size()}); + } + db_->Write(leveldb::WriteOptions(), &writes); + } +} + +auto Database::dbRemoveIndexes(std::shared_ptr data) -> void { + auto tags = dbRecoverTagsFromHashes(data->individual_tag_hashes); + if (!tags) { + return; + } + Track track{data, tags}; + for (const IndexInfo& index : GetIndexes()) { + auto entries = Index(index, track); + for (auto it = entries.rbegin(); it != entries.rend(); it++) { + auto key = EncodeIndexKey(it->first); + auto status = db_->Delete(leveldb::WriteOptions{}, key); + if (!status.ok()) { + return; + } + + std::unique_ptr cursor{db_->NewIterator({})}; + cursor->Seek(key); + cursor->Prev(); + + auto prev_key = 
ParseIndexKey(cursor->key()); + if (prev_key && prev_key->header == it->first.header) { + break; + } + + cursor->Next(); + auto next_key = ParseIndexKey(cursor->key()); + if (next_key && next_key->header == it->first.header) { + break; + } + } + } +} + +auto Database::dbIngestTagHashes(const TrackTags& tags, + std::pmr::unordered_map& out) + -> void { + leveldb::WriteBatch batch{}; + for (auto& entry : tags.tags()) { + auto hash = + komihash_stream_oneshot(entry.second.data(), entry.second.size(), 0); + batch.Put(EncodeTagHashKey(hash), entry.second.c_str()); + out[entry.first] = hash; + } + db_->Write(leveldb::WriteOptions{}, &batch); +} + +auto Database::dbRecoverTagsFromHashes( + const std::pmr::unordered_map& hashes) + -> std::shared_ptr { + auto out = std::make_shared(); + for (const auto& entry : hashes) { + std::string value; + auto res = db_->Get(leveldb::ReadOptions{}, EncodeTagHashKey(entry.second), + &value); + if (!res.ok()) { + ESP_LOGI(kTag, "failed to retrieve tag!"); + continue; } + out->set(entry.first, {value.data(), value.size()}); } + return out; } template diff --git a/src/database/include/database.hpp b/src/database/include/database.hpp index 7cb1d09c..cdf69db0 100644 --- a/src/database/include/database.hpp +++ b/src/database/include/database.hpp @@ -7,6 +7,7 @@ #pragma once #include +#include #include #include #include @@ -150,6 +151,11 @@ class Database { auto dbPutHash(const uint64_t& hash, TrackId i) -> void; auto dbGetHash(const uint64_t& hash) -> std::optional; auto dbCreateIndexesForTrack(const Track& track) -> void; + auto dbRemoveIndexes(std::shared_ptr) -> void; + auto dbIngestTagHashes(const TrackTags&, + std::pmr::unordered_map&) -> void; + auto dbRecoverTagsFromHashes(const std::pmr::unordered_map&) + -> std::shared_ptr; template auto dbGetPage(const Continuation& c) -> Result*; diff --git a/src/database/include/index.hpp b/src/database/include/index.hpp index 838eff31..13de952d 100644 --- a/src/database/include/index.hpp +++ 
b/src/database/include/index.hpp @@ -46,6 +46,8 @@ struct IndexKey { // an index consists of { kArtist, kAlbum, kTitle }, and we are at depth = 2 // then this may contain hash(hash("Jacqueline"), "My Cool Album"). std::uint64_t components_hash; + + bool operator==(const Header&) const = default; }; Header header; @@ -58,7 +60,9 @@ struct IndexKey { std::optional track; }; -auto Index(const IndexInfo&, const Track&, leveldb::WriteBatch*) -> bool; +auto Index(const IndexInfo&, const Track&) + -> std::vector>; + auto ExpandHeader(const IndexKey::Header&, const std::optional&) -> IndexKey::Header; diff --git a/src/database/include/records.hpp b/src/database/include/records.hpp index e13c6568..09764ed0 100644 --- a/src/database/include/records.hpp +++ b/src/database/include/records.hpp @@ -52,6 +52,9 @@ auto EncodeHashKey(const uint64_t& hash) -> std::string; */ auto EncodeHashValue(TrackId id) -> std::string; +/* Encodes the key under which the tag value with this hash is stored. */ +auto EncodeTagHashKey(const uint64_t& hash) -> std::string; + /* * Parses bytes previously encoded via EncodeHashValue back into a track id. May * return nullopt if parsing fails. diff --git a/src/database/include/track.hpp b/src/database/include/track.hpp index b07da9ba..72296e8d 100644 --- a/src/database/include/track.hpp +++ b/src/database/include/track.hpp @@ -83,6 +83,10 @@ class TrackTags { auto at(const Tag& key) const -> std::optional; auto operator[](const Tag& key) const -> std::optional; + auto tags() const -> const std::pmr::unordered_map& { + return tags_; + } + /* * Returns a hash of the 'identifying' tags of this track. 
That is, a hash * that can be used to determine if one track is likely the same as another, @@ -119,12 +123,14 @@ struct TrackData { : id(0), filepath(&memory::kSpiRamResource), tags_hash(0), + individual_tag_hashes(&memory::kSpiRamResource), is_tombstoned(false), modified_at() {} TrackId id; std::pmr::string filepath; uint64_t tags_hash; + std::pmr::unordered_map individual_tag_hashes; bool is_tombstoned; std::pair modified_at; diff --git a/src/database/index.cpp b/src/database/index.cpp index 4d1f7b06..84ea050a 100644 --- a/src/database/index.cpp +++ b/src/database/index.cpp @@ -59,8 +59,9 @@ static auto missing_component_text(const Track& track, Tag tag) } } -auto Index(const IndexInfo& info, const Track& t, leveldb::WriteBatch* batch) - -> bool { +auto Index(const IndexInfo& info, const Track& t) + -> std::vector> { + std::vector> out; IndexKey key{ .header{ .id = info.id, @@ -93,8 +94,7 @@ auto Index(const IndexInfo& info, const Track& t, leveldb::WriteBatch* batch) value = t.TitleOrFilename(); } - auto encoded = EncodeIndexKey(key); - batch->Put(encoded, {value.data(), value.size()}); + out.push_back(std::make_pair(key, value)); // If there are more components after this, then we need to finish by // narrowing the header with the current title. 
@@ -102,7 +102,7 @@ auto Index(const IndexInfo& info, const Track& t, leveldb::WriteBatch* batch) key.header = ExpandHeader(key.header, key.item); } } - return true; + return out; } auto ExpandHeader(const IndexKey::Header& header, diff --git a/src/database/records.cpp b/src/database/records.cpp index 0619cd93..c9fafe08 100644 --- a/src/database/records.cpp +++ b/src/database/records.cpp @@ -48,6 +48,7 @@ static const char* kTag = "RECORDS"; static const char kDataPrefix = 'D'; static const char kHashPrefix = 'H'; +static const char kTagHashPrefix = 'T'; static const char kIndexPrefix = 'I'; static const char kFieldSeparator = '\0'; @@ -62,6 +63,11 @@ auto EncodeDataKey(const TrackId& id) -> std::string { } auto EncodeDataValue(const TrackData& track) -> std::string { + auto* tag_hashes = new cppbor::Map{}; // Freed by Array's dtor. + for (const auto& entry : track.individual_tag_hashes) { + tag_hashes->add(cppbor::Uint{static_cast(entry.first)}, + cppbor::Uint{entry.second}); + } cppbor::Array val{ cppbor::Uint{track.id}, cppbor::Tstr{track.filepath}, @@ -69,6 +75,7 @@ auto EncodeDataValue(const TrackData& track) -> std::string { cppbor::Bool{track.is_tombstoned}, cppbor::Uint{track.modified_at.first}, cppbor::Uint{track.modified_at.second}, + tag_hashes, }; return val.toString(); } @@ -80,12 +87,13 @@ auto ParseDataValue(const leveldb::Slice& slice) -> std::shared_ptr { return nullptr; } auto vals = item->asArray(); - if (vals->size() != 6 || vals->get(0)->type() != cppbor::UINT || + if (vals->size() != 7 || vals->get(0)->type() != cppbor::UINT || vals->get(1)->type() != cppbor::TSTR || vals->get(2)->type() != cppbor::UINT || vals->get(3)->type() != cppbor::SIMPLE || vals->get(4)->type() != cppbor::UINT || - vals->get(5)->type() != cppbor::UINT) { + vals->get(5)->type() != cppbor::UINT || + vals->get(6)->type() != cppbor::MAP) { return {}; } auto res = std::make_shared(); @@ -96,6 +104,12 @@ auto ParseDataValue(const leveldb::Slice& slice) -> std::shared_ptr { 
res->modified_at = std::make_pair( vals->get(4)->asUint()->unsignedValue(), vals->get(5)->asUint()->unsignedValue()); + + auto tag_hashes = vals->get(6)->asMap(); + for (const auto& entry : *tag_hashes) { + auto tag = static_cast(entry.first->asUint()->unsignedValue()); + res->individual_tag_hashes[tag] = entry.second->asUint()->unsignedValue(); + } return res; } @@ -113,6 +127,12 @@ auto EncodeHashValue(TrackId id) -> std::string { return TrackIdToBytes(id); } +/* 'T/ 0xBEEF' */ +auto EncodeTagHashKey(const uint64_t& hash) -> std::string { + return std::string{kTagHashPrefix, kFieldSeparator} + + cppbor::Uint{hash}.toString(); +} + /* 'I/' */ auto EncodeAllIndexesPrefix() -> std::string { return {kIndexPrefix, kFieldSeparator};