add locale-aware colation to db indexes

3 years ago · b58c081508
parent 18d90051c9
commit b58c081508
15 changed files with 1037 additions and 23 deletions
--- a/partitions.csv
+++ b/partitions.csv
@ -5,5 +5,5 @@ otadata,  data, ota,	0xd000,    0x2000,
 phy_init,  data, phy,    0xf000,    0x1000,
 ota_0,     app,  ota_0,  0x10000,   4M,
 ota_1,     app,  ota_1,  0x410000,  4M,
-assets,   data, fat,    0x810000,  6M,
+collate,   0x40, 0x00,   0x810000,  3M,
--- a/src/database/database.cpp
+++ b/src/database/database.cpp
@ -17,6 +17,7 @@
 #include <optional>
 #include <sstream>
 #include "collation.hpp"
 #include "esp_log.h"
 #include "ff.h"
 #include "freertos/projdefs.h"
@ -49,7 +50,7 @@ static SingletonEnv<leveldb::EspEnv> sEnv;
 static const char kDbPath[] = "/.tangara-db";
 static const char kKeyDbVersion[] = "schema_version";
-static const uint8_t kCurrentDbVersion = 2;
+static const uint8_t kCurrentDbVersion = 3;
 static const char kKeyTrackId[] = "next_track_id";
@ -73,7 +74,9 @@ static auto CreateNewDatabase(leveldb::Options& options) -> leveldb::DB* {
  return db;
 }
-auto Database::Open(IFileGatherer& gatherer, ITagParser& parser)
+auto Database::Open(IFileGatherer& gatherer,
                    ITagParser& parser,
                    locale::ICollator& collator)
    -> cpp::result<Database*, DatabaseError> {
  // TODO(jacqueline): Why isn't compare_and_exchange_* available?
  if (sIsDbOpen.exchange(true)) {
@ -127,7 +130,8 @@ auto Database::Open(IFileGatherer& gatherer, ITagParser& parser)
            }
            ESP_LOGI(kTag, "Database opened successfully");
-            return new Database(db, cache.release(), gatherer, parser, worker);
+            return new Database(db, cache.release(), gatherer, parser, collator,
                                worker);
          })
      .get();
 }
@ -142,12 +146,14 @@ Database::Database(leveldb::DB* db,
                   leveldb::Cache* cache,
                   IFileGatherer& file_gatherer,
                   ITagParser& tag_parser,
                   locale::ICollator& collator,
                   std::shared_ptr<tasks::Worker> worker)
    : db_(db),
      cache_(cache),
      worker_task_(worker),
      file_gatherer_(file_gatherer),
-      tag_parser_(tag_parser) {}
+      tag_parser_(tag_parser),
      collator_(collator) {}
 Database::~Database() {
  // Delete db_ first so that any outstanding background work finishes before
@ -539,7 +545,7 @@ auto Database::dbGetHash(const uint64_t& hash) -> std::optional<TrackId> {
 auto Database::dbCreateIndexesForTrack(const Track& track) -> void {
  for (const IndexInfo& index : GetIndexes()) {
    leveldb::WriteBatch writes;
-    auto entries = Index(index, track);
+    auto entries = Index(collator_, index, track);
    for (const auto& it : entries) {
      writes.Put(EncodeIndexKey(it.first),
                 {it.second.data(), it.second.size()});
@ -555,7 +561,7 @@ auto Database::dbRemoveIndexes(std::shared_ptr<TrackData> data) -> void {
  }
  Track track{data, tags};
  for (const IndexInfo& index : GetIndexes()) {
-    auto entries = Index(index, track);
+    auto entries = Index(collator_, index, track);
    for (auto it = entries.rbegin(); it != entries.rend(); it++) {
      auto key = EncodeIndexKey(it->first);
      auto status = db_->Delete(leveldb::WriteOptions{}, key);
--- a/src/database/include/database.hpp
+++ b/src/database/include/database.hpp
@ -16,6 +16,7 @@
 #include <utility>
 #include <vector>
 #include "collation.hpp"
 #include "file_gatherer.hpp"
 #include "index.hpp"
 #include "leveldb/cache.h"
@ -92,9 +93,10 @@ class Database {
    ALREADY_OPEN,
    FAILED_TO_OPEN,
  };
-  static auto Open(IFileGatherer& file_gatherer, ITagParser& tag_parser)
+  static auto Open(IFileGatherer& file_gatherer,
                   ITagParser& tag_parser,
                   locale::ICollator& collator)
      -> cpp::result<Database*, DatabaseError>;
  static auto Open() -> cpp::result<Database*, DatabaseError>;
  static auto Destroy() -> void;
@ -136,11 +138,13 @@ class Database {
  // Not owned.
  IFileGatherer& file_gatherer_;
  ITagParser& tag_parser_;
  locale::ICollator& collator_;
  Database(leveldb::DB* db,
           leveldb::Cache* cache,
           IFileGatherer& file_gatherer,
           ITagParser& tag_parser,
           locale::ICollator& collator,
           std::shared_ptr<tasks::Worker> worker);
  auto dbMintNewTrackId() -> TrackId;
--- a/src/database/include/index.hpp
+++ b/src/database/include/index.hpp
@ -13,6 +13,7 @@
 #include <variant>
 #include <vector>
 #include "collation.hpp"
 #include "leveldb/db.h"
 #include "leveldb/slice.h"
@ -60,7 +61,7 @@ struct IndexKey {
  std::optional<TrackId> track;
 };
-auto Index(const IndexInfo&, const Track&)
+auto Index(locale::ICollator&, const IndexInfo&, const Track&)
    -> std::vector<std::pair<IndexKey, std::pmr::string>>;
 auto ExpandHeader(const IndexKey::Header&,
--- a/src/database/index.cpp
+++ b/src/database/index.cpp
@ -7,8 +7,11 @@
 #include "index.hpp"
 #include <cstdint>
 #include <sstream>
 #include <variant>
 #include "collation.hpp"
 #include "esp_log.h"
 #include "komihash.h"
 #include "leveldb/write_batch.h"
@ -59,7 +62,7 @@ static auto missing_component_text(const Track& track, Tag tag)
  }
 }
-auto Index(const IndexInfo& info, const Track& t)
+auto Index(locale::ICollator& collator, const IndexInfo& info, const Track& t)
    -> std::vector<std::pair<IndexKey, std::pmr::string>> {
  std::vector<std::pair<IndexKey, std::pmr::string>> out;
  IndexKey key{
@ -72,15 +75,14 @@ auto Index(const IndexInfo& info, const Track& t)
      .track = {},
  };
  auto& col = std::use_facet<std::collate<char>>(std::locale());
  for (std::uint8_t i = 0; i < info.components.size(); i++) {
    // Fill in the text for this depth.
    auto text = t.tags().at(info.components.at(i));
    std::pmr::string value;
    if (text) {
      std::pmr::string orig = *text;
-      key.item = col.transform(&orig[0], &orig[0] + orig.size());
+      auto xfrm = collator.Transform({orig.data(), orig.size()});
      key.item = {xfrm.data(), xfrm.size()};
      value = *text;
    } else {
      key.item = {};
--- a/src/locale/CMakeLists.txt
+++ b/src/locale/CMakeLists.txt
@ -0,0 +1,11 @@
 # Copyright 2023 jacqueline <me@jacqueline.id.au>
 #
 # SPDX-License-Identifier: GPL-3.0-only
 idf_component_register(
  SRCS "collation.cpp" "strxfrm_l.c"
  INCLUDE_DIRS "include"
  PRIV_INCLUDE_DIRS "priv_include"
  REQUIRES "span" "esp_partition" "spi_flash")
 target_compile_options(${COMPONENT_LIB} PRIVATE ${EXTRA_WARNINGS})
--- a/src/locale/collation.cpp
+++ b/src/locale/collation.cpp
@ -0,0 +1,92 @@
 /*
 * Copyright 2023 jacqueline <me@jacqueline.id.au>
 *
 * SPDX-License-Identifier: GPL-3.0-only
 */
 #include "collation.hpp"
 #include <stdint.h>
 #include <memory>
 #include "esp_flash_spi_init.h"
 #include "esp_log.h"
 #include "esp_partition.h"
 #include "hal/spi_flash_types.h"
 #include "spi_flash_mmap.h"
 #include "strxfrm.h"
 namespace locale {
 static constexpr char kTag[] = "collate";
 static constexpr esp_partition_type_t kLocalePartitionType =
    static_cast<esp_partition_type_t>(0x40);
 static constexpr esp_partition_subtype_t kLcCollateSubtype =
    static_cast<esp_partition_subtype_t>(0x0);
 auto CreateCollator() -> std::unique_ptr<ICollator> {
  std::unique_ptr<ICollator> ret{GLibCollator::create()};
  if (!ret) {
    ret.reset(new NoopCollator());
  }
  return ret;
 }
 auto GLibCollator::create() -> GLibCollator* {
  uint32_t data_pages = spi_flash_mmap_get_free_pages(SPI_FLASH_MMAP_DATA);
  ESP_LOGI(kTag, "free data pages: %lu (%lu KiB)", data_pages, data_pages * 64);
  const esp_partition_t* partition =
      esp_partition_find_first(kLocalePartitionType, kLcCollateSubtype, NULL);
  if (partition == NULL) {
    ESP_LOGW(kTag, "no LC_COLLATE partition found");
  }
  ESP_LOGI(kTag, "found LC_COLLATE partition of size %lu", partition->size);
  if (partition->size > data_pages * 64 * 1024) {
    ESP_LOGE(kTag, "not enough free pages to map LC_COLLATE partition!");
    return nullptr;
  }
  const void* region;
  esp_partition_mmap_handle_t handle;
  esp_err_t err = esp_partition_mmap(partition, 0, partition->size,
                                     ESP_PARTITION_MMAP_DATA, &region, &handle);
  if (err != ESP_OK) {
    ESP_LOGE(kTag, "LC_COLLATE mmap failed");
    return nullptr;
  }
  auto data = std::make_unique<locale_data_t>();
  if (!parse_locale_data(region, partition->size, data.get())) {
    ESP_LOGE(kTag, "parsing locale data failed");
    esp_partition_munmap(handle);
    return nullptr;
  }
  return new GLibCollator(handle, std::move(data));
 }
 GLibCollator::GLibCollator(const esp_partition_mmap_handle_t handle,
                           std::unique_ptr<locale_data_t> locale)
    : handle_(handle), locale_data_(std::move(locale)) {}
 GLibCollator::~GLibCollator() {
  esp_partition_munmap(handle_);
 }
 auto GLibCollator::Transform(const std::string& in) -> std::string {
  char dest[256];
  size_t size = glib_strxfrm(dest, in.c_str(), 256, locale_data_.get());
  if (size >= 256) {
    char* larger_dest = new char[size + 1]{0};
    glib_strxfrm(larger_dest, in.c_str(), size, locale_data_.get());
    std::string out{larger_dest, size};
    delete[] larger_dest;
    return out;
  }
  return {dest, size};
 }
 }  // namespace locale
--- a/src/locale/include/collation.hpp
+++ b/src/locale/include/collation.hpp
@ -0,0 +1,49 @@
 /*
 * Copyright 2023 jacqueline <me@jacqueline.id.au>
 *
 * SPDX-License-Identifier: GPL-3.0-only
 */
 #pragma once
 #include <cstddef>
 #include <memory>
 #include <string>
 #include "esp_partition.h"
 #include "span.hpp"
 #include "strxfrm.h"
 namespace locale {
 class ICollator {
 public:
  virtual ~ICollator() {}
  virtual auto Transform(const std::string&) -> std::string = 0;
 };
 class NoopCollator : public ICollator {
 public:
  auto Transform(const std::string& in) -> std::string override { return in; }
 };
 auto CreateCollator() -> std::unique_ptr<ICollator>;
 class GLibCollator : public ICollator {
 public:
  static auto create() -> GLibCollator*;
  ~GLibCollator();
  auto Transform(const std::string& in) -> std::string override;
 private:
  GLibCollator(const esp_partition_mmap_handle_t,
               std::unique_ptr<locale_data_t>);
  const esp_partition_mmap_handle_t handle_;
  std::unique_ptr<locale_data_t> locale_data_;
 };
 }  // namespace locale
--- a/src/locale/include/strxfrm.h
+++ b/src/locale/include/strxfrm.h
@ -0,0 +1,35 @@
 /*
 * Copyright 2023 jacqueline <me@jacqueline.id.au>
 *
 * SPDX-License-Identifier: GPL-3.0-only
 */
 #pragma once
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 typedef struct {
  uint_fast32_t nrules;
  unsigned char* rulesets;
  unsigned char* weights;
  int32_t* table;
  unsigned char* extra;
  int32_t* indirect;
 } locale_data_t;
 bool parse_locale_data(const void* raw_data, size_t size, locale_data_t* out);
 size_t glib_strxfrm(char* dest,
                    const char* src,
                    size_t n,
                    locale_data_t* locale);
 #ifdef __cplusplus
 }
 #endif
--- a/src/locale/priv_include/weight.h
+++ b/src/locale/priv_include/weight.h
@ -0,0 +1,160 @@
 /* Copyright (C) 1996-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Written by Ulrich Drepper, <drepper@cygnus.com>.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
 #pragma once
 #include <stdint.h>
 #include <stddef.h>
 /* This alignment is used for 32-bit integers in locale files, both
   those that are explicitly int32_t or uint32_t and those that are
   wchar_t, regardless of the (possibly smaller) alignment required
   for such integers on a particular host.  */
 #define LOCFILE_ALIGN		sizeof (int32_t)
 #define LOCFILE_ALIGN_MASK	(LOCFILE_ALIGN - 1)
 #define LOCFILE_ALIGN_UP(x)	(((x) + LOCFILE_ALIGN - 1)	\
 				 & ~LOCFILE_ALIGN_MASK)
 #define LOCFILE_ALIGNED_P(x)	(((x) & LOCFILE_ALIGN_MASK) == 0)
 /* Find index of weight.  */
 static inline int32_t __attribute__ ((always_inline))
 findidx (const int32_t *table,
 	 const int32_t *indirect,
 	 const unsigned char *extra,
 	 const unsigned char **cpp, size_t len)
 {
  int_fast32_t i = table[*(*cpp)++];
  const unsigned char *cp;
  const unsigned char *usrc;
  if (i >= 0)
    /* This is an index into the weight table.  Cool.  */
    return i;
  /* Oh well, more than one sequence starting with this byte.
     Search for the correct one.  */
  cp = &extra[-i];
  usrc = *cpp;
  --len;
  while (1)
    {
      size_t nhere;
      /* The first thing is the index.  */
      i = *((const int32_t *) cp);
      cp += sizeof (int32_t);
      /* Next is the length of the byte sequence.  These are always
 	 short byte sequences so there is no reason to call any
 	 function (even if they are inlined).  */
      nhere = *cp++;
      if (i >= 0)
 	{
 	  /* It is a single character.  If it matches we found our
 	     index.  Note that at the end of each list there is an
 	     entry of length zero which represents the single byte
 	     sequence.  The first (and here only) byte was tested
 	     already.  */
 	  size_t cnt;
 	  /* With GCC 5.3 when compiling with -Os the compiler warns
 	     that seq2.back_us, which becomes usrc, might be used
 	     uninitialized.  This can't be true because we pass a length
 	     of -1 for len at the same time which means that this loop
 	     never executes.  */
 	  for (cnt = 0; cnt < nhere && cnt < len; ++cnt)
 	    if (cp[cnt] != usrc[cnt])
 	      break;
 	  if (cnt == nhere)
 	    {
 	      /* Found it.  */
 	      *cpp += nhere;
 	      return i;
 	    }
 	  /* Up to the next entry.  */
 	  cp += nhere;
 	  if (!LOCFILE_ALIGNED_P (1 + nhere))
 	    cp += LOCFILE_ALIGN - (1 + nhere) % LOCFILE_ALIGN;
 	}
      else
 	{
 	  /* This is a range of characters.  First decide whether the
 	     current byte sequence lies in the range.  */
 	  size_t cnt;
 	  size_t offset = 0;
 	  for (cnt = 0; cnt < nhere && cnt < len; ++cnt)
 	    if (cp[cnt] != usrc[cnt])
 	      break;
 	  if (cnt != nhere)
 	    {
 	      if (cnt == len || cp[cnt] > usrc[cnt])
 		{
 		  /* Cannot be in this range.  */
 		  cp += 2 * nhere;
 		  if (!LOCFILE_ALIGNED_P (1 + 2 * nhere))
 		    cp += (LOCFILE_ALIGN
 			   - (1 + 2 * nhere) % LOCFILE_ALIGN);
 		  continue;
 		}
 	      /* Test against the end of the range.  */
 	      for (cnt = 0; cnt < nhere; ++cnt)
 		if (cp[nhere + cnt] != usrc[cnt])
 		  break;
 	      if (cnt != nhere && cp[nhere + cnt] < usrc[cnt])
 		{
 		  /* Cannot be in this range.  */
 		  cp += 2 * nhere;
 		  if (!LOCFILE_ALIGNED_P (1 + 2 * nhere))
 		    cp += (LOCFILE_ALIGN
 			   - (1 + 2 * nhere) % LOCFILE_ALIGN);
 		  continue;
 		}
 	      /* This range matches the next characters.  Now find
 		 the offset in the indirect table.  */
 	      for (cnt = 0; cp[cnt] == usrc[cnt]; ++cnt);
 	      do
 		{
 		  offset <<= 8;
 		  /* With GCC 7 when compiling with -Os the compiler
 		     warns that seq1.back_us and seq2.back_us, which
 		     become usrc, might be used uninitialized.  This
 		     is impossible for the same reason as described
 		     above.  */
 		  offset += usrc[cnt] - cp[cnt];
 		}
 	      while (++cnt < nhere);
 	    }
 	  *cpp += nhere;
 	  return indirect[-i + offset];
 	}
    }
  /* NOTREACHED */
  return 0x43219876;
 }
--- a/src/locale/strxfrm_l.c
+++ b/src/locale/strxfrm_l.c
@ -0,0 +1,641 @@
 /* Copyright (C) 1995-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Written by Ulrich Drepper <drepper@gnu.org>, 1995.
   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.
   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.
   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
 #include "strxfrm.h"
 #include <assert.h>
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/param.h>
 #include "esp_log.h"
 #include "weight.h"
 static const char kTag[] = "strxfrm";
 #ifndef STRING_TYPE
 #define STRING_TYPE char
 #define USTRING_TYPE unsigned char
 #define STRLEN strlen
 #define STPNCPY stpncpy
 #define L(arg) arg
 #endif
 #define CONCAT(a, b) CONCAT1(a, b)
 #define CONCAT1(a, b) a##b
 /* Maximum string size that is calculated with cached indices.  Right now this
   is an arbitrary value open to optimizations.  SMALL_STR_SIZE * 4 has to be
   lower than __MAX_ALLOCA_CUTOFF.  Keep localedata/xfrm-test.c in sync.  */
 #define SMALL_STR_SIZE 4095
 /* We know three kinds of collation sorting rules.  */
 enum coll_sort_rule {
  illegal_0__,
  sort_forward,
  sort_backward,
  illegal_3__,
  sort_position,
  sort_forward_position,
  sort_backward_position,
  sort_mask
 };
 enum collate_element {
  COLLATE_NRULES = 0,
  COLLATE_RULESETS,
  COLLATE_TABLEMB,
  COLLATE_WEIGHTMB,
  COLLATE_EXTRAMB,
  COLLATE_INDIRECTMB,
  COLLATE_GAP1,
  COLLATE_GAP2,
  COLLATE_GAP3,
  COLLATE_TABLEWC,
  COLLATE_WEIGHTWC,
  COLLATE_EXTRAWC,
  COLLATE_INDIRECTWC,
  COLLATE_SYMB_HASH_SIZEMB,
  COLLATE_SYMB_TABLEMB,
  COLLATE_SYMB_EXTRAMB,
  COLLATE_COLLSEQMB,
  COLLATE_COLLSEQWC,
  COLLATE_CODESET,
  // Not a real element; used to know how many elements there are.
  COLLATE_LAST
 };
 typedef enum collate_element collate_element_t;
 bool parse_locale_data(const void* raw_data, size_t size, locale_data_t* out) {
  const struct {
    unsigned int magic;
    unsigned int nstrings;
    unsigned int strindex[0];
  }* const header = raw_data;
  if (header->magic != 0x20051017) {
    ESP_LOGE(kTag, "file magic incorrect (was %x)", header->magic);
    return false;
  }
  if (sizeof(*header) + header->nstrings * sizeof(unsigned int) >= size) {
    ESP_LOGE(kTag, "file was too small to contain header");
    return false;
  }
  if (header->nstrings != COLLATE_LAST) {
    ESP_LOGE(kTag, "file has incorrect number of elements (was %u, wanted %u)",
             header->nstrings, COLLATE_LAST);
    return false;
  }
  // The LC_COLLATE partition appears to contain data in the correct shape.
  // Pull out pointers to the various attributes it contains.
  const void* offsets[COLLATE_LAST];
  for (size_t i = 0; i < header->nstrings; i++) {
    size_t offset = header->strindex[i];
    if (offset > size) {
      ESP_LOGE(kTag, "element offset (%u) exceeds file size", offset);
      return false;
    }
    offsets[i] = (raw_data + offset);
  }
  // Now parse those pointers into the output struct.
  out->nrules = *(const unsigned int*)offsets[COLLATE_NRULES];
  out->rulesets = (unsigned char*)offsets[COLLATE_RULESETS];
  out->table = (int32_t*)offsets[COLLATE_TABLEMB];
  out->weights = (unsigned char*)offsets[COLLATE_WEIGHTMB];
  out->extra = (unsigned char*)offsets[COLLATE_EXTRAMB];
  out->indirect = (int32_t*)offsets[COLLATE_INDIRECTMB];
  assert(((uintptr_t)out->table) % __alignof__(out->table[0]) == 0);
  assert(((uintptr_t)out->weights) % __alignof__(out->weights[0]) == 0);
  assert(((uintptr_t)out->extra) % __alignof__(out->extra[0]) == 0);
  assert(((uintptr_t)out->indirect) % __alignof__(out->indirect[0]) == 0);
  return true;
 }
 /* We need UTF-8 encoding of numbers.  */
 static int utf8_encode(char* buf, int val) {
  int retval;
  if (val < 0x80) {
    *buf++ = (char)val;
    retval = 1;
  } else {
    int step;
    for (step = 2; step < 6; ++step)
      if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0)
        break;
    retval = step;
    *buf = (unsigned char)(~0xff >> step);
    --step;
    do {
      buf[step] = 0x80 | (val & 0x3f);
      val >>= 6;
    } while (--step > 0);
    *buf |= val;
  }
  return retval;
 }
 /* Find next weight and rule index.  Inlined since called for every char.  */
 static __always_inline size_t find_idx(const USTRING_TYPE** us,
                                       int32_t* weight_idx,
                                       unsigned char* rule_idx,
                                       const locale_data_t* l_data,
                                       const int pass) {
  int32_t tmp = findidx(l_data->table, l_data->indirect, l_data->extra, us, -1);
  *rule_idx = tmp >> 24;
  int32_t idx = tmp & 0xffffff;
  size_t len = l_data->weights[idx++];
  /* Skip over indices of previous levels.  */
  for (int i = 0; i < pass; i++) {
    idx += len;
    len = l_data->weights[idx++];
  }
  *weight_idx = idx;
  return len;
 }
 static int find_position(const USTRING_TYPE* us,
                         const locale_data_t* l_data,
                         const int pass) {
  int32_t weight_idx;
  unsigned char rule_idx;
  const USTRING_TYPE* usrc = us;
  find_idx(&usrc, &weight_idx, &rule_idx, l_data, pass);
  return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position;
 }
 /* Do the transformation.  */
 static size_t do_xfrm(const USTRING_TYPE* usrc,
                      STRING_TYPE* dest,
                      size_t n,
                      const locale_data_t* l_data) {
  int32_t weight_idx;
  unsigned char rule_idx;
  uint_fast32_t pass;
  size_t needed = 0;
  size_t last_needed;
  /* Now the passes over the weights.  */
  for (pass = 0; pass < l_data->nrules; ++pass) {
    size_t backw_len = 0;
    last_needed = needed;
    const USTRING_TYPE* cur = usrc;
    const USTRING_TYPE* backw_start = NULL;
    /* We assume that if a rule has defined `position' in one section
      this is true for all of them.  */
    int position = find_position(cur, l_data, pass);
    if (position == 0) {
      while (*cur != L('\0')) {
        const USTRING_TYPE* pos = cur;
        size_t len = find_idx(&cur, &weight_idx, &rule_idx, l_data, pass);
        int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
        if ((rule & sort_forward) != 0) {
          /* Handle the pushed backward sequence.  */
          if (backw_start != NULL) {
            for (size_t i = backw_len; i > 0;) {
              int32_t weight_idx;
              unsigned char rule_idx;
              size_t len =
                  find_idx(&backw_start, &weight_idx, &rule_idx, l_data, pass);
              if (needed + i < n)
                for (size_t j = len; j > 0; j--)
                  dest[needed + i - j] = l_data->weights[weight_idx++];
              i -= len;
            }
            needed += backw_len;
            backw_start = NULL;
            backw_len = 0;
          }
          /* Now handle the forward element.  */
          if (needed + len < n)
            while (len-- > 0)
              dest[needed++] = l_data->weights[weight_idx++];
          else
            /* No more characters fit into the buffer.  */
            needed += len;
        } else {
          /* Remember start of the backward sequence & track length.  */
          if (backw_start == NULL)
            backw_start = pos;
          backw_len += len;
        }
      }
      /* Handle the pushed backward sequence.  */
      if (backw_start != NULL) {
        for (size_t i = backw_len; i > 0;) {
          size_t len =
              find_idx(&backw_start, &weight_idx, &rule_idx, l_data, pass);
          if (needed + i < n)
            for (size_t j = len; j > 0; j--)
              dest[needed + i - j] = l_data->weights[weight_idx++];
          i -= len;
        }
        needed += backw_len;
      }
    } else {
      int val = 1;
      char buf[7];
      size_t buflen;
      size_t i;
      while (*cur != L('\0')) {
        const USTRING_TYPE* pos = cur;
        size_t len = find_idx(&cur, &weight_idx, &rule_idx, l_data, pass);
        int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
        if ((rule & sort_forward) != 0) {
          /* Handle the pushed backward sequence.  */
          if (backw_start != NULL) {
            for (size_t p = backw_len; p > 0; p--) {
              size_t len;
              int32_t weight_idx;
              unsigned char rule_idx;
              const USTRING_TYPE* backw_cur = backw_start;
              /* To prevent a warning init the used vars.  */
              len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass);
              for (i = 1; i < p; i++)
                len =
                    find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass);
              if (len != 0) {
                buflen = utf8_encode(buf, val);
                if (needed + buflen + len < n) {
                  for (i = 0; i < buflen; ++i)
                    dest[needed + i] = buf[i];
                  for (i = 0; i < len; ++i)
                    dest[needed + buflen + i] = l_data->weights[weight_idx + i];
                }
                needed += buflen + len;
                val = 1;
              } else
                ++val;
            }
            backw_start = NULL;
            backw_len = 0;
          }
          /* Now handle the forward element.  */
          if (len != 0) {
            buflen = utf8_encode(buf, val);
            if (needed + buflen + len < n) {
              for (i = 0; i < buflen; ++i)
                dest[needed + i] = buf[i];
              for (i = 0; i < len; ++i)
                dest[needed + buflen + i] = l_data->weights[weight_idx + i];
            }
            needed += buflen + len;
            val = 1;
          } else
            ++val;
        } else {
          /* Remember start of the backward sequence & track length.  */
          if (backw_start == NULL)
            backw_start = pos;
          backw_len++;
        }
      }
      /* Handle the pushed backward sequence.  */
      if (backw_start != NULL) {
        for (size_t p = backw_len; p > 0; p--) {
          size_t len;
          int32_t weight_idx;
          unsigned char rule_idx;
          const USTRING_TYPE* backw_cur = backw_start;
          /* To prevent a warning init the used vars.  */
          len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass);
          for (i = 1; i < p; i++)
            len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass);
          if (len != 0) {
            buflen = utf8_encode(buf, val);
            if (needed + buflen + len < n) {
              for (i = 0; i < buflen; ++i)
                dest[needed + i] = buf[i];
              for (i = 0; i < len; ++i)
                dest[needed + buflen + i] = l_data->weights[weight_idx + i];
            }
            needed += buflen + len;
            val = 1;
          } else
            ++val;
        }
      }
    }
    /* Finally store the byte to separate the passes or terminate
       the string.  */
    if (needed < n)
      dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
    ++needed;
  }
  /* This is a little optimization: many collation specifications have
     a `position' rule at the end and if no non-ignored character
     is found the last \1 byte is immediately followed by a \0 byte
     signalling this.  We can avoid the \1 byte(s).  */
  if (needed > 2 && needed == last_needed + 1) {
    /* Remove the \1 byte.  */
    if (--needed <= n)
      dest[needed - 1] = L('\0');
  }
  /* Return the number of bytes/words we need, but don't count the NUL
     byte/word at the end.  */
  return needed - 1;
 }
 /* Do the transformation using weight-index and rule cache.  */
 static size_t do_xfrm_cached(STRING_TYPE* dest,
                             size_t n,
                             const locale_data_t* l_data,
                             size_t idxmax,
                             int32_t* idxarr,
                             const unsigned char* rulearr) {
  uint_fast32_t nrules = l_data->nrules;
  unsigned char* rulesets = l_data->rulesets;
  USTRING_TYPE* weights = l_data->weights;
  uint_fast32_t pass;
  size_t needed = 0;
  size_t last_needed;
  size_t idxcnt;
  /* Now the passes over the weights.  */
  for (pass = 0; pass < nrules; ++pass) {
    size_t backw_stop = ~0ul;
    int rule = rulesets[rulearr[0] * nrules + pass];
    /* We assume that if a rule has defined `position' in one section
       this is true for all of them.  */
    int position = rule & sort_position;
    last_needed = needed;
    if (position == 0) {
      for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) {
        if ((rule & sort_forward) != 0) {
          size_t len;
          if (backw_stop != ~0ul) {
            /* Handle the pushed elements now.  */
            size_t backw;
            for (backw = idxcnt; backw > backw_stop;) {
              --backw;
              len = weights[idxarr[backw]++];
              if (needed + len < n)
                while (len-- > 0)
                  dest[needed++] = weights[idxarr[backw]++];
              else {
                /* No more characters fit into the buffer.  */
                needed += len;
                idxarr[backw] += len;
              }
            }
            backw_stop = ~0ul;
          }
          /* Now handle the forward element.  */
          len = weights[idxarr[idxcnt]++];
          if (needed + len < n)
            while (len-- > 0)
              dest[needed++] = weights[idxarr[idxcnt]++];
          else {
            /* No more characters fit into the buffer.  */
            needed += len;
            idxarr[idxcnt] += len;
          }
        } else {
          /* Remember where the backwards series started.  */
          if (backw_stop == ~0ul)
            backw_stop = idxcnt;
        }
        rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
      }
      if (backw_stop != ~0ul) {
        /* Handle the pushed elements now.  */
        size_t backw;
        backw = idxcnt;
        while (backw > backw_stop) {
          size_t len = weights[idxarr[--backw]++];
          if (needed + len < n)
            while (len-- > 0)
              dest[needed++] = weights[idxarr[backw]++];
          else {
            /* No more characters fit into the buffer.  */
            needed += len;
            idxarr[backw] += len;
          }
        }
      }
    } else {
      int val = 1;
      char buf[7];
      size_t buflen;
      size_t i;
      for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) {
        if ((rule & sort_forward) != 0) {
          size_t len;
          if (backw_stop != ~0ul) {
            /* Handle the pushed elements now.  */
            size_t backw;
            for (backw = idxcnt; backw > backw_stop;) {
              --backw;
              len = weights[idxarr[backw]++];
              if (len != 0) {
                buflen = utf8_encode(buf, val);
                if (needed + buflen + len < n) {
                  for (i = 0; i < buflen; ++i)
                    dest[needed + i] = buf[i];
                  for (i = 0; i < len; ++i)
                    dest[needed + buflen + i] = weights[idxarr[backw] + i];
                }
                needed += buflen + len;
                idxarr[backw] += len;
                val = 1;
              } else
                ++val;
            }
            backw_stop = ~0ul;
          }
          /* Now handle the forward element.  */
          len = weights[idxarr[idxcnt]++];
          if (len != 0) {
            buflen = utf8_encode(buf, val);
            if (needed + buflen + len < n) {
              for (i = 0; i < buflen; ++i)
                dest[needed + i] = buf[i];
              for (i = 0; i < len; ++i)
                dest[needed + buflen + i] = weights[idxarr[idxcnt] + i];
            }
            needed += buflen + len;
            idxarr[idxcnt] += len;
            val = 1;
          } else
            /* Note that we don't have to increment `idxarr[idxcnt]'
               since the length is zero.  */
            ++val;
        } else {
          /* Remember where the backwards series started.  */
          if (backw_stop == ~0ul)
            backw_stop = idxcnt;
        }
        rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
      }
      if (backw_stop != ~0ul) {
        /* Handle the pushed elements now.  */
        size_t backw;
        backw = idxmax - 1;
        while (backw > backw_stop) {
          size_t len = weights[idxarr[--backw]++];
          if (len != 0) {
            buflen = utf8_encode(buf, val);
            if (needed + buflen + len < n) {
              for (i = 0; i < buflen; ++i)
                dest[needed + i] = buf[i];
              for (i = 0; i < len; ++i)
                dest[needed + buflen + i] = weights[idxarr[backw] + i];
            }
            needed += buflen + len;
            idxarr[backw] += len;
            val = 1;
          } else
            ++val;
        }
      }
    }
    /* Finally store the byte to separate the passes or terminate
       the string.  */
    if (needed < n)
      dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
    ++needed;
  }
  /* This is a little optimization: many collation specifications have
     a `position' rule at the end and if no non-ignored character
     is found the last \1 byte is immediately followed by a \0 byte
     signalling this.  We can avoid the \1 byte(s).  */
  if (needed > 2 && needed == last_needed + 1) {
    /* Remove the \1 byte.  */
    if (--needed <= n)
      dest[needed - 1] = L('\0');
  }
  /* Return the number of bytes/words we need, but don't count the NUL
     byte/word at the end.  */
  return needed - 1;
 }
 size_t glib_strxfrm(char* dest,
                    const char* src,
                    size_t n,
                    locale_data_t* locale) {
  /* Handle byte comparison case.  */
  if (locale->nrules == 0) {
    size_t srclen = strlen(src);
    if (n != 0) {
      strncpy(dest, src, MIN(srclen + 1, n));
    }
    return srclen;
  }
  /* Handle an empty string, code hereafter relies on strlen (src) > 0.  */
  if (*src == L('\0')) {
    if (n != 0)
      *dest = L('\0');
    return 0;
  }
  /* We need the elements of the string as unsigned values since they
     are used as indeces.  */
  const USTRING_TYPE* usrc = (const USTRING_TYPE*)src;
  /* Allocate cache for small strings on the stack and fill it with weight and
     rule indices.  If the cache size is not sufficient, continue with the
     uncached xfrm version.  */
  size_t idxmax = 0;
  const USTRING_TYPE* cur = usrc;
  int32_t* idxarr = alloca(SMALL_STR_SIZE * sizeof(int32_t));
  unsigned char* rulearr = alloca(SMALL_STR_SIZE + 1);
  do {
    int32_t tmp =
        findidx(locale->table, locale->indirect, locale->extra, &cur, -1);
    rulearr[idxmax] = tmp >> 24;
    idxarr[idxmax] = tmp & 0xffffff;
    ++idxmax;
  } while (*cur != L('\0') && idxmax < SMALL_STR_SIZE);
  /* This element is only read, the value never used but to determine
     another value which then is ignored.  */
  rulearr[idxmax] = '\0';
  /* Do the transformation.  */
  if (*cur == L('\0'))
    return do_xfrm_cached(dest, n, locale, idxmax, idxarr, rulearr);
  else
    return do_xfrm(usrc, dest, n, locale);
 }
--- a/src/system_fsm/CMakeLists.txt
+++ b/src/system_fsm/CMakeLists.txt
@ -5,5 +5,5 @@
 idf_component_register(
  SRCS "system_fsm.cpp" "running.cpp" "booting.cpp" "idle.cpp" "service_locator.cpp"
  INCLUDE_DIRS "include"
-  REQUIRES "tinyfsm" "drivers" "database" "ui" "result" "events" "audio" "app_console" "battery")
+  REQUIRES "tinyfsm" "drivers" "database" "ui" "result" "events" "audio" "app_console" "battery" "locale")
 target_compile_options(${COMPONENT_LIB} PRIVATE ${EXTRA_WARNINGS})
--- a/src/system_fsm/booting.cpp
+++ b/src/system_fsm/booting.cpp
@ -4,6 +4,7 @@
 * SPDX-License-Identifier: GPL-3.0-only
 */
 #include "collation.hpp"
 #include "system_fsm.hpp"
 #include <stdint.h>
@ -76,6 +77,7 @@ auto Booting::entry() -> void {
  sServices->track_queue(std::make_unique<audio::TrackQueue>());
  sServices->tag_parser(std::make_unique<database::TagParserImpl>());
  sServices->collator(locale::CreateCollator());
  ESP_LOGI(kTag, "init bluetooth");
  sServices->bluetooth(std::make_unique<drivers::Bluetooth>(sServices->nvs()));
--- a/src/system_fsm/include/service_locator.hpp
+++ b/src/system_fsm/include/service_locator.hpp
@ -10,6 +10,7 @@
 #include "battery.hpp"
 #include "bluetooth.hpp"
 #include "collation.hpp"
 #include "database.hpp"
 #include "gpios.hpp"
 #include "nvs.hpp"
@ -101,6 +102,15 @@ class ServiceLocator {
    queue_ = std::move(i);
  }
  auto collator() -> locale::ICollator& {
    assert(collator_ != nullptr);
    return *collator_;
  }
  auto collator(std::unique_ptr<locale::ICollator> i) {
    collator_ = std::move(i);
  }
  // Not copyable or movable.
  ServiceLocator(const ServiceLocator&) = delete;
  ServiceLocator& operator=(const ServiceLocator&) = delete;
@ -117,6 +127,7 @@ class ServiceLocator {
  std::shared_ptr<database::Database> database_;
  std::unique_ptr<database::ITagParser> tag_parser_;
  std::unique_ptr<locale::ICollator> collator_;
  drivers::SdState sd_;
 };
--- a/src/system_fsm/running.cpp
+++ b/src/system_fsm/running.cpp
@ -55,8 +55,8 @@ void Running::entry() {
  ESP_LOGI(kTag, "opening database");
  sFileGatherer = new database::FileGathererImpl();
-  auto database_res =
+  auto database_res = database::Database::Open(
-      database::Database::Open(*sFileGatherer, sServices->tag_parser());
+      *sFileGatherer, sServices->tag_parser(), sServices->collator());
  if (database_res.has_error()) {
    ESP_LOGW(kTag, "failed to open!");
    events::System().Dispatch(StorageError{});