From 30aaefca64445efa421edb93403036d59382920f Mon Sep 17 00:00:00 2001 From: jacqueline Date: Thu, 8 Aug 2024 14:35:53 +1000 Subject: [PATCH] Batch up the db operations associated with adding new tracks This is ostensibly yet another 'prepare for multithreaded updates' commit, however it does actually save us another 60(!!) odd milliseconds per track. --- src/tangara/database/database.cpp | 183 ++++++++++++------------------ src/tangara/database/database.hpp | 14 ++- src/tangara/database/index.cpp | 57 ++++++---- src/tangara/database/index.hpp | 3 +- src/tangara/database/track.cpp | 11 -- src/tangara/database/track.hpp | 2 - 6 files changed, 122 insertions(+), 148 deletions(-) diff --git a/src/tangara/database/database.cpp b/src/tangara/database/database.cpp index c543b941..aec661d9 100644 --- a/src/tangara/database/database.cpp +++ b/src/tangara/database/database.cpp @@ -352,11 +352,19 @@ auto Database::updateIndexes() -> void { // We couldn't read the tags for this track. Either they were // malformed, or perhaps the file is missing. Either way, tombstone // this record. - ESP_LOGW(kTag, "entombing missing #%lx", track->id); + ESP_LOGI(kTag, "entombing missing #%lx", track->id); + + // Remove the indexes first, so that interrupted operations don't leave + // dangling index records. dbRemoveIndexes(track); + + // Do the rest of the tombstoning as one atomic write. + leveldb::WriteBatch batch; track->is_tombstoned = true; - dbPutTrackData(*track); - db_->Delete(leveldb::WriteOptions{}, EncodePathKey(track->filepath)); + batch.Put(EncodeDataKey(track->id), EncodeDataValue(*track)); + batch.Delete(EncodePathKey(track->filepath)); + + db_->Write(leveldb::WriteOptions(), &batch); continue; } @@ -370,12 +378,20 @@ auto Database::updateIndexes() -> void { // database. ESP_LOGI(kTag, "updating hash (%llx -> %llx)", track->tags_hash, new_hash); + + // Again, we remove the old index records first so has to avoid + // dangling references. dbRemoveIndexes(track); + // Atomically correct the hash + create the new index records. + leveldb::WriteBatch batch; track->tags_hash = new_hash; - dbIngestTagHashes(*tags, track->individual_tag_hashes); - dbPutTrackData(*track); - dbPutHash(new_hash, track->id); + dbIngestTagHashes(*tags, track->individual_tag_hashes, batch); + + dbCreateIndexesForTrack(*track, *tags, batch); + batch.Put(EncodeDataKey(track->id), EncodeDataValue(*track)); + batch.Put(EncodeHashKey(new_hash), EncodeHashValue(track->id)); + db_->Write(leveldb::WriteOptions(), &batch); } } } @@ -404,72 +420,56 @@ auto Database::updateIndexes() -> void { return; } - // Check for any existing record with the same hash. + // Check for any existing track with the same hash. uint64_t hash = tags->Hash(); - std::string key = EncodeHashKey(hash); - std::optional existing_hash; + std::optional existing_id; std::string raw_entry; - if (db_->Get(leveldb::ReadOptions(), key, &raw_entry).ok()) { - existing_hash = ParseHashValue(raw_entry); + if (db_->Get(leveldb::ReadOptions(), EncodeHashKey(hash), &raw_entry) + .ok()) { + existing_id = ParseHashValue(raw_entry); } - std::pair modified{info.fdate, info.ftime}; - if (!existing_hash) { - // We've never met this track before! Or we have, but the entry is - // malformed. Either way, record this as a new track. - TrackId id = dbMintNewTrackId(); - ESP_LOGD(kTag, "recording new 0x%lx", id); + std::shared_ptr data; + if (existing_id) { + // Do we have any existing data for this track? This could be the case if + // this is a tombstoned entry. In such as case, we want to reuse the + // previous TrackData so that any extra metadata is preserved. + data = dbGetTrackData(*existing_id); + if (!data) { + data = std::make_shared(); + data->id = *existing_id; + } else if (data->filepath != path) { + ESP_LOGW(kTag, "hash collision: %s, %s, %s", + tags->title().value_or("no title").c_str(), + tags->artist().value_or("no artist").c_str(), + tags->album().value_or("no album").c_str()); + // Don't commit anything if there's a hash collision, since we're + // likely to make a big mess. + return; + } + } else { num_new_tracks++; - - auto data = std::make_shared(); - data->id = id; - data->filepath = path; - data->tags_hash = hash; - data->modified_at = modified; - dbIngestTagHashes(*tags, data->individual_tag_hashes); - - dbPutTrackData(*data); - dbPutHash(hash, id); - auto t = std::make_shared(data, tags); - dbCreateIndexesForTrack(*t); - db_->Put(leveldb::WriteOptions{}, EncodePathKey(path), - TrackIdToBytes(id)); - return; + data = std::make_shared(); + data->id = dbMintNewTrackId(); } - std::shared_ptr existing_data = dbGetTrackData(*existing_hash); - if (!existing_data) { - // We found a hash that matches, but there's no data record? Weird. - auto new_data = std::make_shared(); - new_data->id = dbMintNewTrackId(); - new_data->filepath = path; - new_data->tags_hash = hash; - new_data->modified_at = modified; - dbIngestTagHashes(*tags, new_data->individual_tag_hashes); - dbPutTrackData(*new_data); - auto t = std::make_shared(new_data, tags); - dbCreateIndexesForTrack(*t); - db_->Put(leveldb::WriteOptions{}, EncodePathKey(path), - TrackIdToBytes(new_data->id)); - return; - } + // Make sure the file-based metadata on the TrackData is up to date. + data->filepath = path; + data->tags_hash = hash; + data->modified_at = {info.fdate, info.ftime}; - if (existing_data->is_tombstoned) { - ESP_LOGI(kTag, "exhuming track %lu", existing_data->id); - existing_data->is_tombstoned = false; - existing_data->modified_at = modified; - dbPutTrackData(*existing_data); - auto t = std::make_shared(existing_data, tags); - dbCreateIndexesForTrack(*t); - db_->Put(leveldb::WriteOptions{}, EncodePathKey(path), - TrackIdToBytes(existing_data->id)); - } else if (existing_data->filepath != - std::pmr::string{path.data(), path.size()}) { - ESP_LOGW(kTag, "hash collision: %s, %s, %s", - tags->title().value_or("no title").c_str(), - tags->artist().value_or("no artist").c_str(), - tags->album().value_or("no album").c_str()); - } + // Apply all the actual database changes as one atomic batch. This makes + // the whole 'new track' operation atomic, and also reduces the amount of + // lock contention when adding many tracks at once. + leveldb::WriteBatch batch; + dbIngestTagHashes(*tags, data->individual_tag_hashes, batch); + + dbCreateIndexesForTrack(*data, *tags, batch); + batch.Put(EncodeDataKey(data->id), EncodeDataValue(*data)); + batch.Put(EncodeHashKey(data->tags_hash), EncodeHashValue(data->id)); + batch.Put(EncodePathKey(path), TrackIdToBytes(data->id)); + + db_->Write(leveldb::WriteOptions(), &batch); }); uint64_t end_time = esp_timer_get_time(); @@ -536,22 +536,6 @@ auto Database::dbMintNewTrackId() -> TrackId { return next_track_id_++; } -auto Database::dbEntomb(TrackId id, uint64_t hash) -> void { - std::string key = EncodeHashKey(hash); - std::string val = EncodeHashValue(id); - if (!db_->Put(leveldb::WriteOptions(), key, val).ok()) { - ESP_LOGE(kTag, "failed to entomb #%llx (id #%lx)", hash, id); - } -} - -auto Database::dbPutTrackData(const TrackData& s) -> void { - std::string key = EncodeDataKey(s.id); - std::string val = EncodeDataValue(s); - if (!db_->Put(leveldb::WriteOptions(), key, val).ok()) { - ESP_LOGE(kTag, "failed to write data for #%lx", s.id); - } -} - auto Database::dbGetTrackData(TrackId id) -> std::shared_ptr { std::string key = EncodeDataKey(id); std::string raw_val; @@ -562,33 +546,19 @@ auto Database::dbGetTrackData(TrackId id) -> std::shared_ptr { return ParseDataValue(raw_val); } -auto Database::dbPutHash(const uint64_t& hash, TrackId i) -> void { - std::string key = EncodeHashKey(hash); - std::string val = EncodeHashValue(i); - if (!db_->Put(leveldb::WriteOptions(), key, val).ok()) { - ESP_LOGE(kTag, "failed to write hash for #%lx", i); - } -} - -auto Database::dbGetHash(const uint64_t& hash) -> std::optional { - std::string key = EncodeHashKey(hash); - std::string raw_val; - if (!db_->Get(leveldb::ReadOptions(), key, &raw_val).ok()) { - ESP_LOGW(kTag, "no key found for hash #%llx", hash); - return {}; - } - return ParseHashValue(raw_val); +auto Database::dbCreateIndexesForTrack(const Track& track, + leveldb::WriteBatch& batch) -> void { + dbCreateIndexesForTrack(track.data(), track.tags(), batch); } -auto Database::dbCreateIndexesForTrack(const Track& track) -> void { +auto Database::dbCreateIndexesForTrack(const TrackData& data, + const TrackTags& tags, + leveldb::WriteBatch& batch) -> void { for (const IndexInfo& index : getIndexes()) { - leveldb::WriteBatch writes; - auto entries = Index(collator_, index, track); + auto entries = Index(collator_, index, data, tags); for (const auto& it : entries) { - writes.Put(EncodeIndexKey(it.first), - {it.second.data(), it.second.size()}); + batch.Put(EncodeIndexKey(it.first), {it.second.data(), it.second.size()}); } - db_->Write(leveldb::WriteOptions(), &writes); } } @@ -597,9 +567,8 @@ auto Database::dbRemoveIndexes(std::shared_ptr data) -> void { if (!tags) { return; } - Track track{data, tags}; for (const IndexInfo& index : getIndexes()) { - auto entries = Index(collator_, index, track); + auto entries = Index(collator_, index, *data, *tags); for (auto it = entries.rbegin(); it != entries.rend(); it++) { auto key = EncodeIndexKey(it->first); auto status = db_->Delete(leveldb::WriteOptions{}, key); @@ -626,16 +595,14 @@ auto Database::dbRemoveIndexes(std::shared_ptr data) -> void { } auto Database::dbIngestTagHashes(const TrackTags& tags, - std::pmr::unordered_map& out) - -> void { - leveldb::WriteBatch batch{}; + std::pmr::unordered_map& out, + leveldb::WriteBatch& batch) -> void { for (const auto& tag : tags.allPresent()) { auto val = tags.get(tag); auto hash = tagHash(val); batch.Put(EncodeTagHashKey(hash), tagToString(val)); out[tag] = hash; } - db_->Write(leveldb::WriteOptions{}, &batch); } auto Database::dbRecoverTagsFromHashes( diff --git a/src/tangara/database/database.hpp b/src/tangara/database/database.hpp index 2b385013..39665dbf 100644 --- a/src/tangara/database/database.hpp +++ b/src/tangara/database/database.hpp @@ -29,6 +29,7 @@ #include "leveldb/iterator.h" #include "leveldb/options.h" #include "leveldb/slice.h" +#include "leveldb/write_batch.h" #include "memory_resource.hpp" #include "result.hpp" #include "tasks.hpp" @@ -111,17 +112,18 @@ class Database { auto dbCalculateNextTrackId() -> void; auto dbMintNewTrackId() -> TrackId; - auto dbEntomb(TrackId track, uint64_t hash) -> void; - auto dbPutTrackData(const TrackData& s) -> void; auto dbGetTrackData(TrackId id) -> std::shared_ptr; - auto dbPutHash(const uint64_t& hash, TrackId i) -> void; - auto dbGetHash(const uint64_t& hash) -> std::optional; - auto dbCreateIndexesForTrack(const Track& track) -> void; + auto dbCreateIndexesForTrack(const Track&, leveldb::WriteBatch&) -> void; + auto dbCreateIndexesForTrack(const TrackData&, + const TrackTags&, + leveldb::WriteBatch&) -> void; + auto dbRemoveIndexes(std::shared_ptr) -> void; auto dbIngestTagHashes(const TrackTags&, - std::pmr::unordered_map&) -> void; + std::pmr::unordered_map&, + leveldb::WriteBatch&) -> void; auto dbRecoverTagsFromHashes(const std::pmr::unordered_map&) -> std::shared_ptr; diff --git a/src/tangara/database/index.cpp b/src/tangara/database/index.cpp index 93a2b1c2..dec458f4 100644 --- a/src/tangara/database/index.cpp +++ b/src/tangara/database/index.cpp @@ -52,10 +52,29 @@ const IndexInfo kAllAlbums{ .components = {Tag::kAlbum, Tag::kAlbumOrder}, }; +static auto titleOrFilename(const TrackData& data, const TrackTags& tags) + -> std::pmr::string { + auto title = tags.title(); + if (title) { + return *title; + } + auto start = data.filepath.find_last_of('/'); + if (start == std::pmr::string::npos) { + return data.filepath; + } + return data.filepath.substr(start + 1); +} + class Indexer { public: - Indexer(locale::ICollator& collator, const Track& t, const IndexInfo& idx) - : collator_(collator), track_(t), index_(idx) {} + Indexer(locale::ICollator& collator, + const IndexInfo& idx, + const TrackData& data, + const TrackTags& tags) + : collator_(collator), + index_(idx), + track_data_(data), + track_tags_(tags) {} auto index() -> std::vector>; @@ -70,14 +89,13 @@ class Indexer { auto missing_value(Tag tag) -> TagValue { switch (tag) { case Tag::kTitle: - return track_.TitleOrFilename(); + return titleOrFilename(track_data_, track_tags_); case Tag::kArtist: return "Unknown Artist"; case Tag::kAlbum: return "Unknown Album"; case Tag::kAlbumArtist: - return track_.tags().artist().value_or("Unknown Artist"); - return "Unknown Album"; + return track_tags_.artist().value_or("Unknown Artist"); case Tag::kGenres: return std::pmr::vector{}; case Tag::kDisc: @@ -91,8 +109,9 @@ class Indexer { } locale::ICollator& collator_; - const Track& track_; const IndexInfo index_; + const TrackData& track_data_; + const TrackTags& track_tags_; std::vector> out_; }; @@ -113,7 +132,7 @@ auto Indexer::index() -> std::vector> { auto Indexer::handleLevel(const IndexKey::Header& header, std::span components) -> void { Tag component = components.front(); - TagValue value = track_.tags().get(component); + TagValue value = track_tags_.get(component); if (std::holds_alternative(value)) { value = missing_value(component); } @@ -157,21 +176,17 @@ auto Indexer::handleItem(const IndexKey::Header& header, auto xfrm = collator_.Transform(value); key.item = {xfrm.data(), xfrm.size()}; } else if constexpr (std::is_same_v) { - value = std::to_string(arg); - // FIXME: this sucks lol. we should just write the number directly, - // LSB-first, but then we need to be able to parse it back properly. - std::ostringstream str; - str << std::setw(8) << std::setfill('0') << arg; - std::string encoded = str.str(); - key.item = {encoded.data(), encoded.size()}; + // CBOR's varint encoding actually works great for lexicographical + // sorting. + key.item = cppbor::Uint{arg}.toString(); } }, item); std::optional next_level; if (components.size() == 1) { - value = track_.TitleOrFilename(); - key.track = track_.data().id; + value = titleOrFilename(track_data_, track_tags_); + key.track = track_data_.id; } else { next_level = ExpandHeader(key.header, key.item); } @@ -183,10 +198,12 @@ auto Indexer::handleItem(const IndexKey::Header& header, } } -auto Index(locale::ICollator& c, - const IndexInfo& i, - const Track& t) -> std::vector> { - Indexer indexer{c, t, i}; +auto Index(locale::ICollator& collator, + const IndexInfo& index, + const TrackData& data, + const TrackTags& tags) + -> std::vector> { + Indexer indexer{collator, index, data, tags}; return indexer.index(); } diff --git a/src/tangara/database/index.hpp b/src/tangara/database/index.hpp index 8f78439b..bc01ec2f 100644 --- a/src/tangara/database/index.hpp +++ b/src/tangara/database/index.hpp @@ -63,7 +63,8 @@ struct IndexKey { auto Index(locale::ICollator&, const IndexInfo&, - const Track&) -> std::vector>; + const TrackData&, + const TrackTags&) -> std::vector>; auto ExpandHeader(const IndexKey::Header&, const std::optional&) -> IndexKey::Header; diff --git a/src/tangara/database/track.cpp b/src/tangara/database/track.cpp index 5bf8c3e2..461f4561 100644 --- a/src/tangara/database/track.cpp +++ b/src/tangara/database/track.cpp @@ -293,15 +293,4 @@ auto TrackTags::Hash() const -> uint64_t { return komihash_stream_final(&stream); } -auto Track::TitleOrFilename() const -> std::pmr::string { - auto title = tags().title(); - if (title) { - return *title; - } - auto start = data().filepath.find_last_of('/'); - if (start == std::pmr::string::npos) { - return data().filepath; - } - return data().filepath.substr(start + 1); -} } // namespace database diff --git a/src/tangara/database/track.hpp b/src/tangara/database/track.hpp index b097ab52..6501e31f 100644 --- a/src/tangara/database/track.hpp +++ b/src/tangara/database/track.hpp @@ -195,8 +195,6 @@ class Track { auto data() const -> const TrackData& { return *data_; } auto tags() const -> const TrackTags& { return *tags_; } - auto TitleOrFilename() const -> std::pmr::string; - private: std::shared_ptr data_; std::shared_ptr tags_;