From b58c08150853b8055093dc116d407ffd543f8ec8 Mon Sep 17 00:00:00 2001 From: jacqueline Date: Mon, 30 Oct 2023 15:47:38 +1100 Subject: [PATCH] add locale-aware colation to db indexes --- partitions.csv | 14 +- src/database/database.cpp | 18 +- src/database/include/database.hpp | 8 +- src/database/include/index.hpp | 3 +- src/database/index.cpp | 10 +- src/locale/CMakeLists.txt | 11 + src/locale/collation.cpp | 92 +++ src/locale/include/collation.hpp | 49 ++ src/locale/include/strxfrm.h | 35 ++ src/locale/priv_include/weight.h | 160 +++++ src/locale/strxfrm_l.c | 641 +++++++++++++++++++++ src/system_fsm/CMakeLists.txt | 2 +- src/system_fsm/booting.cpp | 2 + src/system_fsm/include/service_locator.hpp | 11 + src/system_fsm/running.cpp | 4 +- 15 files changed, 1037 insertions(+), 23 deletions(-) create mode 100644 src/locale/CMakeLists.txt create mode 100644 src/locale/collation.cpp create mode 100644 src/locale/include/collation.hpp create mode 100644 src/locale/include/strxfrm.h create mode 100644 src/locale/priv_include/weight.h create mode 100644 src/locale/strxfrm_l.c diff --git a/partitions.csv b/partitions.csv index 3cc251a7..dec2348c 100644 --- a/partitions.csv +++ b/partitions.csv @@ -1,9 +1,9 @@ # ESP-IDF Partition Table -# Name, Type, SubType, Offset, Size, Flags -nvs, data, nvs, 0x9000, 0x4000, -otadata, data, ota, 0xd000, 0x2000, -phy_init, data, phy, 0xf000, 0x1000, -ota_0, app, ota_0, 0x10000, 4M, -ota_1, app, ota_1, 0x410000, 4M, -assets, data, fat, 0x810000, 6M, +# Name, Type, SubType, Offset, Size, Flags +nvs, data, nvs, 0x9000, 0x4000, +otadata, data, ota, 0xd000, 0x2000, +phy_init, data, phy, 0xf000, 0x1000, +ota_0, app, ota_0, 0x10000, 4M, +ota_1, app, ota_1, 0x410000, 4M, +collate, 0x40, 0x00, 0x810000, 3M, diff --git a/src/database/database.cpp b/src/database/database.cpp index e05fa26a..9b978b8c 100644 --- a/src/database/database.cpp +++ b/src/database/database.cpp @@ -17,6 +17,7 @@ #include #include +#include "collation.hpp" #include "esp_log.h" #include "ff.h" #include "freertos/projdefs.h" @@ -49,7 +50,7 @@ static SingletonEnv sEnv; static const char kDbPath[] = "/.tangara-db"; static const char kKeyDbVersion[] = "schema_version"; -static const uint8_t kCurrentDbVersion = 2; +static const uint8_t kCurrentDbVersion = 3; static const char kKeyTrackId[] = "next_track_id"; @@ -73,7 +74,9 @@ static auto CreateNewDatabase(leveldb::Options& options) -> leveldb::DB* { return db; } -auto Database::Open(IFileGatherer& gatherer, ITagParser& parser) +auto Database::Open(IFileGatherer& gatherer, + ITagParser& parser, + locale::ICollator& collator) -> cpp::result { // TODO(jacqueline): Why isn't compare_and_exchange_* available? if (sIsDbOpen.exchange(true)) { @@ -127,7 +130,8 @@ auto Database::Open(IFileGatherer& gatherer, ITagParser& parser) } ESP_LOGI(kTag, "Database opened successfully"); - return new Database(db, cache.release(), gatherer, parser, worker); + return new Database(db, cache.release(), gatherer, parser, collator, + worker); }) .get(); } @@ -142,12 +146,14 @@ Database::Database(leveldb::DB* db, leveldb::Cache* cache, IFileGatherer& file_gatherer, ITagParser& tag_parser, + locale::ICollator& collator, std::shared_ptr worker) : db_(db), cache_(cache), worker_task_(worker), file_gatherer_(file_gatherer), - tag_parser_(tag_parser) {} + tag_parser_(tag_parser), + collator_(collator) {} Database::~Database() { // Delete db_ first so that any outstanding background work finishes before @@ -539,7 +545,7 @@ auto Database::dbGetHash(const uint64_t& hash) -> std::optional { auto Database::dbCreateIndexesForTrack(const Track& track) -> void { for (const IndexInfo& index : GetIndexes()) { leveldb::WriteBatch writes; - auto entries = Index(index, track); + auto entries = Index(collator_, index, track); for (const auto& it : entries) { writes.Put(EncodeIndexKey(it.first), {it.second.data(), it.second.size()}); @@ -555,7 +561,7 @@ auto Database::dbRemoveIndexes(std::shared_ptr data) -> void { } Track track{data, tags}; for (const IndexInfo& index : GetIndexes()) { - auto entries = Index(index, track); + auto entries = Index(collator_, index, track); for (auto it = entries.rbegin(); it != entries.rend(); it++) { auto key = EncodeIndexKey(it->first); auto status = db_->Delete(leveldb::WriteOptions{}, key); diff --git a/src/database/include/database.hpp b/src/database/include/database.hpp index cdf69db0..5eb3a8e9 100644 --- a/src/database/include/database.hpp +++ b/src/database/include/database.hpp @@ -16,6 +16,7 @@ #include #include +#include "collation.hpp" #include "file_gatherer.hpp" #include "index.hpp" #include "leveldb/cache.h" @@ -92,9 +93,10 @@ class Database { ALREADY_OPEN, FAILED_TO_OPEN, }; - static auto Open(IFileGatherer& file_gatherer, ITagParser& tag_parser) + static auto Open(IFileGatherer& file_gatherer, + ITagParser& tag_parser, + locale::ICollator& collator) -> cpp::result; - static auto Open() -> cpp::result; static auto Destroy() -> void; @@ -136,11 +138,13 @@ class Database { // Not owned. IFileGatherer& file_gatherer_; ITagParser& tag_parser_; + locale::ICollator& collator_; Database(leveldb::DB* db, leveldb::Cache* cache, IFileGatherer& file_gatherer, ITagParser& tag_parser, + locale::ICollator& collator, std::shared_ptr worker); auto dbMintNewTrackId() -> TrackId; diff --git a/src/database/include/index.hpp b/src/database/include/index.hpp index 13de952d..15b21ee8 100644 --- a/src/database/include/index.hpp +++ b/src/database/include/index.hpp @@ -13,6 +13,7 @@ #include #include +#include "collation.hpp" #include "leveldb/db.h" #include "leveldb/slice.h" @@ -60,7 +61,7 @@ struct IndexKey { std::optional track; }; -auto Index(const IndexInfo&, const Track&) +auto Index(locale::ICollator&, const IndexInfo&, const Track&) -> std::vector>; auto ExpandHeader(const IndexKey::Header&, diff --git a/src/database/index.cpp b/src/database/index.cpp index 84ea050a..7d556192 100644 --- a/src/database/index.cpp +++ b/src/database/index.cpp @@ -7,8 +7,11 @@ #include "index.hpp" #include +#include #include +#include "collation.hpp" +#include "esp_log.h" #include "komihash.h" #include "leveldb/write_batch.h" @@ -59,7 +62,7 @@ static auto missing_component_text(const Track& track, Tag tag) } } -auto Index(const IndexInfo& info, const Track& t) +auto Index(locale::ICollator& collator, const IndexInfo& info, const Track& t) -> std::vector> { std::vector> out; IndexKey key{ @@ -72,15 +75,14 @@ auto Index(const IndexInfo& info, const Track& t) .track = {}, }; - auto& col = std::use_facet>(std::locale()); - for (std::uint8_t i = 0; i < info.components.size(); i++) { // Fill in the text for this depth. auto text = t.tags().at(info.components.at(i)); std::pmr::string value; if (text) { std::pmr::string orig = *text; - key.item = col.transform(&orig[0], &orig[0] + orig.size()); + auto xfrm = collator.Transform({orig.data(), orig.size()}); + key.item = {xfrm.data(), xfrm.size()}; value = *text; } else { key.item = {}; diff --git a/src/locale/CMakeLists.txt b/src/locale/CMakeLists.txt new file mode 100644 index 00000000..627ca314 --- /dev/null +++ b/src/locale/CMakeLists.txt @@ -0,0 +1,11 @@ +# Copyright 2023 jacqueline +# +# SPDX-License-Identifier: GPL-3.0-only + +idf_component_register( + SRCS "collation.cpp" "strxfrm_l.c" + INCLUDE_DIRS "include" + PRIV_INCLUDE_DIRS "priv_include" + REQUIRES "span" "esp_partition" "spi_flash") + +target_compile_options(${COMPONENT_LIB} PRIVATE ${EXTRA_WARNINGS}) diff --git a/src/locale/collation.cpp b/src/locale/collation.cpp new file mode 100644 index 00000000..f5e8272a --- /dev/null +++ b/src/locale/collation.cpp @@ -0,0 +1,92 @@ +/* + * Copyright 2023 jacqueline + * + * SPDX-License-Identifier: GPL-3.0-only + */ + +#include "collation.hpp" + +#include +#include + +#include "esp_flash_spi_init.h" +#include "esp_log.h" +#include "esp_partition.h" +#include "hal/spi_flash_types.h" +#include "spi_flash_mmap.h" +#include "strxfrm.h" + +namespace locale { + +static constexpr char kTag[] = "collate"; + +static constexpr esp_partition_type_t kLocalePartitionType = + static_cast(0x40); +static constexpr esp_partition_subtype_t kLcCollateSubtype = + static_cast(0x0); + +auto CreateCollator() -> std::unique_ptr { + std::unique_ptr ret{GLibCollator::create()}; + if (!ret) { + ret.reset(new NoopCollator()); + } + return ret; +} + +auto GLibCollator::create() -> GLibCollator* { + uint32_t data_pages = spi_flash_mmap_get_free_pages(SPI_FLASH_MMAP_DATA); + ESP_LOGI(kTag, "free data pages: %lu (%lu KiB)", data_pages, data_pages * 64); + + const esp_partition_t* partition = + esp_partition_find_first(kLocalePartitionType, kLcCollateSubtype, NULL); + if (partition == NULL) { + ESP_LOGW(kTag, "no LC_COLLATE partition found"); + } + + ESP_LOGI(kTag, "found LC_COLLATE partition of size %lu", partition->size); + if (partition->size > data_pages * 64 * 1024) { + ESP_LOGE(kTag, "not enough free pages to map LC_COLLATE partition!"); + return nullptr; + } + + const void* region; + esp_partition_mmap_handle_t handle; + esp_err_t err = esp_partition_mmap(partition, 0, partition->size, + ESP_PARTITION_MMAP_DATA, ®ion, &handle); + if (err != ESP_OK) { + ESP_LOGE(kTag, "LC_COLLATE mmap failed"); + return nullptr; + } + + auto data = std::make_unique(); + if (!parse_locale_data(region, partition->size, data.get())) { + ESP_LOGE(kTag, "parsing locale data failed"); + esp_partition_munmap(handle); + return nullptr; + } + + return new GLibCollator(handle, std::move(data)); +} + +GLibCollator::GLibCollator(const esp_partition_mmap_handle_t handle, + std::unique_ptr locale) + : handle_(handle), locale_data_(std::move(locale)) {} + +GLibCollator::~GLibCollator() { + esp_partition_munmap(handle_); +} + +auto GLibCollator::Transform(const std::string& in) -> std::string { + char dest[256]; + size_t size = glib_strxfrm(dest, in.c_str(), 256, locale_data_.get()); + if (size >= 256) { + char* larger_dest = new char[size + 1]{0}; + glib_strxfrm(larger_dest, in.c_str(), size, locale_data_.get()); + std::string out{larger_dest, size}; + delete[] larger_dest; + return out; + } + return {dest, size}; +} + +} // namespace locale diff --git a/src/locale/include/collation.hpp b/src/locale/include/collation.hpp new file mode 100644 index 00000000..e8b6fa17 --- /dev/null +++ b/src/locale/include/collation.hpp @@ -0,0 +1,49 @@ +/* + * Copyright 2023 jacqueline + * + * SPDX-License-Identifier: GPL-3.0-only + */ + +#pragma once + +#include +#include +#include + +#include "esp_partition.h" +#include "span.hpp" + +#include "strxfrm.h" + +namespace locale { + +class ICollator { + public: + virtual ~ICollator() {} + + virtual auto Transform(const std::string&) -> std::string = 0; +}; + +class NoopCollator : public ICollator { + public: + auto Transform(const std::string& in) -> std::string override { return in; } +}; + +auto CreateCollator() -> std::unique_ptr; + +class GLibCollator : public ICollator { + public: + static auto create() -> GLibCollator*; + ~GLibCollator(); + + auto Transform(const std::string& in) -> std::string override; + + private: + GLibCollator(const esp_partition_mmap_handle_t, + std::unique_ptr); + + const esp_partition_mmap_handle_t handle_; + std::unique_ptr locale_data_; +}; + +} // namespace locale diff --git a/src/locale/include/strxfrm.h b/src/locale/include/strxfrm.h new file mode 100644 index 00000000..a40f7596 --- /dev/null +++ b/src/locale/include/strxfrm.h @@ -0,0 +1,35 @@ +/* + * Copyright 2023 jacqueline + * + * SPDX-License-Identifier: GPL-3.0-only + */ + +#pragma once + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + uint_fast32_t nrules; + unsigned char* rulesets; + unsigned char* weights; + int32_t* table; + unsigned char* extra; + int32_t* indirect; +} locale_data_t; + +bool parse_locale_data(const void* raw_data, size_t size, locale_data_t* out); + +size_t glib_strxfrm(char* dest, + const char* src, + size_t n, + locale_data_t* locale); + +#ifdef __cplusplus +} +#endif diff --git a/src/locale/priv_include/weight.h b/src/locale/priv_include/weight.h new file mode 100644 index 00000000..34465d90 --- /dev/null +++ b/src/locale/priv_include/weight.h @@ -0,0 +1,160 @@ +/* Copyright (C) 1996-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Ulrich Drepper, . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#pragma once + +#include +#include + +/* This alignment is used for 32-bit integers in locale files, both + those that are explicitly int32_t or uint32_t and those that are + wchar_t, regardless of the (possibly smaller) alignment required + for such integers on a particular host. */ +#define LOCFILE_ALIGN sizeof (int32_t) +#define LOCFILE_ALIGN_MASK (LOCFILE_ALIGN - 1) +#define LOCFILE_ALIGN_UP(x) (((x) + LOCFILE_ALIGN - 1) \ + & ~LOCFILE_ALIGN_MASK) +#define LOCFILE_ALIGNED_P(x) (((x) & LOCFILE_ALIGN_MASK) == 0) + +/* Find index of weight. */ +static inline int32_t __attribute__ ((always_inline)) +findidx (const int32_t *table, + const int32_t *indirect, + const unsigned char *extra, + const unsigned char **cpp, size_t len) +{ + int_fast32_t i = table[*(*cpp)++]; + const unsigned char *cp; + const unsigned char *usrc; + + if (i >= 0) + /* This is an index into the weight table. Cool. */ + return i; + + /* Oh well, more than one sequence starting with this byte. + Search for the correct one. */ + cp = &extra[-i]; + usrc = *cpp; + --len; + while (1) + { + size_t nhere; + + /* The first thing is the index. */ + i = *((const int32_t *) cp); + cp += sizeof (int32_t); + + /* Next is the length of the byte sequence. These are always + short byte sequences so there is no reason to call any + function (even if they are inlined). */ + nhere = *cp++; + + if (i >= 0) + { + /* It is a single character. If it matches we found our + index. Note that at the end of each list there is an + entry of length zero which represents the single byte + sequence. The first (and here only) byte was tested + already. */ + size_t cnt; + + /* With GCC 5.3 when compiling with -Os the compiler warns + that seq2.back_us, which becomes usrc, might be used + uninitialized. This can't be true because we pass a length + of -1 for len at the same time which means that this loop + never executes. */ + for (cnt = 0; cnt < nhere && cnt < len; ++cnt) + if (cp[cnt] != usrc[cnt]) + break; + + if (cnt == nhere) + { + /* Found it. */ + *cpp += nhere; + return i; + } + + /* Up to the next entry. */ + cp += nhere; + if (!LOCFILE_ALIGNED_P (1 + nhere)) + cp += LOCFILE_ALIGN - (1 + nhere) % LOCFILE_ALIGN; + } + else + { + /* This is a range of characters. First decide whether the + current byte sequence lies in the range. */ + size_t cnt; + size_t offset = 0; + + for (cnt = 0; cnt < nhere && cnt < len; ++cnt) + if (cp[cnt] != usrc[cnt]) + break; + + if (cnt != nhere) + { + if (cnt == len || cp[cnt] > usrc[cnt]) + { + /* Cannot be in this range. */ + cp += 2 * nhere; + if (!LOCFILE_ALIGNED_P (1 + 2 * nhere)) + cp += (LOCFILE_ALIGN + - (1 + 2 * nhere) % LOCFILE_ALIGN); + continue; + } + + /* Test against the end of the range. */ + for (cnt = 0; cnt < nhere; ++cnt) + if (cp[nhere + cnt] != usrc[cnt]) + break; + + if (cnt != nhere && cp[nhere + cnt] < usrc[cnt]) + { + /* Cannot be in this range. */ + cp += 2 * nhere; + if (!LOCFILE_ALIGNED_P (1 + 2 * nhere)) + cp += (LOCFILE_ALIGN + - (1 + 2 * nhere) % LOCFILE_ALIGN); + continue; + } + + /* This range matches the next characters. Now find + the offset in the indirect table. */ + for (cnt = 0; cp[cnt] == usrc[cnt]; ++cnt); + + do + { + offset <<= 8; + /* With GCC 7 when compiling with -Os the compiler + warns that seq1.back_us and seq2.back_us, which + become usrc, might be used uninitialized. This + is impossible for the same reason as described + above. */ + offset += usrc[cnt] - cp[cnt]; + } + while (++cnt < nhere); + } + + *cpp += nhere; + return indirect[-i + offset]; + } + } + + /* NOTREACHED */ + return 0x43219876; +} + diff --git a/src/locale/strxfrm_l.c b/src/locale/strxfrm_l.c new file mode 100644 index 00000000..198b59b3 --- /dev/null +++ b/src/locale/strxfrm_l.c @@ -0,0 +1,641 @@ +/* Copyright (C) 1995-2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Ulrich Drepper , 1995. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include "strxfrm.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "esp_log.h" + +#include "weight.h" + +static const char kTag[] = "strxfrm"; + +#ifndef STRING_TYPE +#define STRING_TYPE char +#define USTRING_TYPE unsigned char +#define STRLEN strlen +#define STPNCPY stpncpy +#define L(arg) arg +#endif + +#define CONCAT(a, b) CONCAT1(a, b) +#define CONCAT1(a, b) a##b + +/* Maximum string size that is calculated with cached indices. Right now this + is an arbitrary value open to optimizations. SMALL_STR_SIZE * 4 has to be + lower than __MAX_ALLOCA_CUTOFF. Keep localedata/xfrm-test.c in sync. */ +#define SMALL_STR_SIZE 4095 + +/* We know three kinds of collation sorting rules. */ +enum coll_sort_rule { + illegal_0__, + sort_forward, + sort_backward, + illegal_3__, + sort_position, + sort_forward_position, + sort_backward_position, + sort_mask +}; + +enum collate_element { + COLLATE_NRULES = 0, + COLLATE_RULESETS, + COLLATE_TABLEMB, + COLLATE_WEIGHTMB, + COLLATE_EXTRAMB, + COLLATE_INDIRECTMB, + COLLATE_GAP1, + COLLATE_GAP2, + COLLATE_GAP3, + COLLATE_TABLEWC, + COLLATE_WEIGHTWC, + COLLATE_EXTRAWC, + COLLATE_INDIRECTWC, + COLLATE_SYMB_HASH_SIZEMB, + COLLATE_SYMB_TABLEMB, + COLLATE_SYMB_EXTRAMB, + COLLATE_COLLSEQMB, + COLLATE_COLLSEQWC, + COLLATE_CODESET, + + // Not a real element; used to know how many elements there are. + COLLATE_LAST +}; +typedef enum collate_element collate_element_t; + +bool parse_locale_data(const void* raw_data, size_t size, locale_data_t* out) { + const struct { + unsigned int magic; + unsigned int nstrings; + unsigned int strindex[0]; + }* const header = raw_data; + + if (header->magic != 0x20051017) { + ESP_LOGE(kTag, "file magic incorrect (was %x)", header->magic); + return false; + } + + if (sizeof(*header) + header->nstrings * sizeof(unsigned int) >= size) { + ESP_LOGE(kTag, "file was too small to contain header"); + return false; + } + + if (header->nstrings != COLLATE_LAST) { + ESP_LOGE(kTag, "file has incorrect number of elements (was %u, wanted %u)", + header->nstrings, COLLATE_LAST); + return false; + } + + // The LC_COLLATE partition appears to contain data in the correct shape. + // Pull out pointers to the various attributes it contains. + const void* offsets[COLLATE_LAST]; + for (size_t i = 0; i < header->nstrings; i++) { + size_t offset = header->strindex[i]; + if (offset > size) { + ESP_LOGE(kTag, "element offset (%u) exceeds file size", offset); + return false; + } + offsets[i] = (raw_data + offset); + } + + // Now parse those pointers into the output struct. + out->nrules = *(const unsigned int*)offsets[COLLATE_NRULES]; + + out->rulesets = (unsigned char*)offsets[COLLATE_RULESETS]; + out->table = (int32_t*)offsets[COLLATE_TABLEMB]; + out->weights = (unsigned char*)offsets[COLLATE_WEIGHTMB]; + out->extra = (unsigned char*)offsets[COLLATE_EXTRAMB]; + out->indirect = (int32_t*)offsets[COLLATE_INDIRECTMB]; + + assert(((uintptr_t)out->table) % __alignof__(out->table[0]) == 0); + assert(((uintptr_t)out->weights) % __alignof__(out->weights[0]) == 0); + assert(((uintptr_t)out->extra) % __alignof__(out->extra[0]) == 0); + assert(((uintptr_t)out->indirect) % __alignof__(out->indirect[0]) == 0); + + return true; +} + +/* We need UTF-8 encoding of numbers. */ +static int utf8_encode(char* buf, int val) { + int retval; + + if (val < 0x80) { + *buf++ = (char)val; + retval = 1; + } else { + int step; + + for (step = 2; step < 6; ++step) + if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0) + break; + retval = step; + + *buf = (unsigned char)(~0xff >> step); + --step; + do { + buf[step] = 0x80 | (val & 0x3f); + val >>= 6; + } while (--step > 0); + *buf |= val; + } + + return retval; +} + +/* Find next weight and rule index. Inlined since called for every char. */ +static __always_inline size_t find_idx(const USTRING_TYPE** us, + int32_t* weight_idx, + unsigned char* rule_idx, + const locale_data_t* l_data, + const int pass) { + int32_t tmp = findidx(l_data->table, l_data->indirect, l_data->extra, us, -1); + *rule_idx = tmp >> 24; + int32_t idx = tmp & 0xffffff; + size_t len = l_data->weights[idx++]; + + /* Skip over indices of previous levels. */ + for (int i = 0; i < pass; i++) { + idx += len; + len = l_data->weights[idx++]; + } + + *weight_idx = idx; + return len; +} + +static int find_position(const USTRING_TYPE* us, + const locale_data_t* l_data, + const int pass) { + int32_t weight_idx; + unsigned char rule_idx; + const USTRING_TYPE* usrc = us; + + find_idx(&usrc, &weight_idx, &rule_idx, l_data, pass); + return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position; +} + +/* Do the transformation. */ +static size_t do_xfrm(const USTRING_TYPE* usrc, + STRING_TYPE* dest, + size_t n, + const locale_data_t* l_data) { + int32_t weight_idx; + unsigned char rule_idx; + uint_fast32_t pass; + size_t needed = 0; + size_t last_needed; + + /* Now the passes over the weights. */ + for (pass = 0; pass < l_data->nrules; ++pass) { + size_t backw_len = 0; + last_needed = needed; + const USTRING_TYPE* cur = usrc; + const USTRING_TYPE* backw_start = NULL; + + /* We assume that if a rule has defined `position' in one section + this is true for all of them. */ + int position = find_position(cur, l_data, pass); + + if (position == 0) { + while (*cur != L('\0')) { + const USTRING_TYPE* pos = cur; + size_t len = find_idx(&cur, &weight_idx, &rule_idx, l_data, pass); + int rule = l_data->rulesets[rule_idx * l_data->nrules + pass]; + + if ((rule & sort_forward) != 0) { + /* Handle the pushed backward sequence. */ + if (backw_start != NULL) { + for (size_t i = backw_len; i > 0;) { + int32_t weight_idx; + unsigned char rule_idx; + size_t len = + find_idx(&backw_start, &weight_idx, &rule_idx, l_data, pass); + if (needed + i < n) + for (size_t j = len; j > 0; j--) + dest[needed + i - j] = l_data->weights[weight_idx++]; + + i -= len; + } + + needed += backw_len; + backw_start = NULL; + backw_len = 0; + } + + /* Now handle the forward element. */ + if (needed + len < n) + while (len-- > 0) + dest[needed++] = l_data->weights[weight_idx++]; + else + /* No more characters fit into the buffer. */ + needed += len; + } else { + /* Remember start of the backward sequence & track length. */ + if (backw_start == NULL) + backw_start = pos; + backw_len += len; + } + } + + /* Handle the pushed backward sequence. */ + if (backw_start != NULL) { + for (size_t i = backw_len; i > 0;) { + size_t len = + find_idx(&backw_start, &weight_idx, &rule_idx, l_data, pass); + if (needed + i < n) + for (size_t j = len; j > 0; j--) + dest[needed + i - j] = l_data->weights[weight_idx++]; + + i -= len; + } + + needed += backw_len; + } + } else { + int val = 1; + char buf[7]; + size_t buflen; + size_t i; + + while (*cur != L('\0')) { + const USTRING_TYPE* pos = cur; + size_t len = find_idx(&cur, &weight_idx, &rule_idx, l_data, pass); + int rule = l_data->rulesets[rule_idx * l_data->nrules + pass]; + + if ((rule & sort_forward) != 0) { + /* Handle the pushed backward sequence. */ + if (backw_start != NULL) { + for (size_t p = backw_len; p > 0; p--) { + size_t len; + int32_t weight_idx; + unsigned char rule_idx; + const USTRING_TYPE* backw_cur = backw_start; + + /* To prevent a warning init the used vars. */ + len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass); + + for (i = 1; i < p; i++) + len = + find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass); + + if (len != 0) { + buflen = utf8_encode(buf, val); + if (needed + buflen + len < n) { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = l_data->weights[weight_idx + i]; + } + needed += buflen + len; + val = 1; + } else + ++val; + } + + backw_start = NULL; + backw_len = 0; + } + + /* Now handle the forward element. */ + if (len != 0) { + buflen = utf8_encode(buf, val); + if (needed + buflen + len < n) { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = l_data->weights[weight_idx + i]; + } + needed += buflen + len; + val = 1; + } else + ++val; + } else { + /* Remember start of the backward sequence & track length. */ + if (backw_start == NULL) + backw_start = pos; + backw_len++; + } + } + + /* Handle the pushed backward sequence. */ + if (backw_start != NULL) { + for (size_t p = backw_len; p > 0; p--) { + size_t len; + int32_t weight_idx; + unsigned char rule_idx; + const USTRING_TYPE* backw_cur = backw_start; + + /* To prevent a warning init the used vars. */ + len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass); + + for (i = 1; i < p; i++) + len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass); + + if (len != 0) { + buflen = utf8_encode(buf, val); + if (needed + buflen + len < n) { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = l_data->weights[weight_idx + i]; + } + needed += buflen + len; + val = 1; + } else + ++val; + } + } + } + + /* Finally store the byte to separate the passes or terminate + the string. */ + if (needed < n) + dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0'); + ++needed; + } + + /* This is a little optimization: many collation specifications have + a `position' rule at the end and if no non-ignored character + is found the last \1 byte is immediately followed by a \0 byte + signalling this. We can avoid the \1 byte(s). */ + if (needed > 2 && needed == last_needed + 1) { + /* Remove the \1 byte. */ + if (--needed <= n) + dest[needed - 1] = L('\0'); + } + + /* Return the number of bytes/words we need, but don't count the NUL + byte/word at the end. */ + return needed - 1; +} + +/* Do the transformation using weight-index and rule cache. */ +static size_t do_xfrm_cached(STRING_TYPE* dest, + size_t n, + const locale_data_t* l_data, + size_t idxmax, + int32_t* idxarr, + const unsigned char* rulearr) { + uint_fast32_t nrules = l_data->nrules; + unsigned char* rulesets = l_data->rulesets; + USTRING_TYPE* weights = l_data->weights; + uint_fast32_t pass; + size_t needed = 0; + size_t last_needed; + size_t idxcnt; + + /* Now the passes over the weights. */ + for (pass = 0; pass < nrules; ++pass) { + size_t backw_stop = ~0ul; + int rule = rulesets[rulearr[0] * nrules + pass]; + /* We assume that if a rule has defined `position' in one section + this is true for all of them. */ + int position = rule & sort_position; + + last_needed = needed; + if (position == 0) { + for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) { + if ((rule & sort_forward) != 0) { + size_t len; + + if (backw_stop != ~0ul) { + /* Handle the pushed elements now. */ + size_t backw; + + for (backw = idxcnt; backw > backw_stop;) { + --backw; + len = weights[idxarr[backw]++]; + + if (needed + len < n) + while (len-- > 0) + dest[needed++] = weights[idxarr[backw]++]; + else { + /* No more characters fit into the buffer. */ + needed += len; + idxarr[backw] += len; + } + } + + backw_stop = ~0ul; + } + + /* Now handle the forward element. */ + len = weights[idxarr[idxcnt]++]; + if (needed + len < n) + while (len-- > 0) + dest[needed++] = weights[idxarr[idxcnt]++]; + else { + /* No more characters fit into the buffer. */ + needed += len; + idxarr[idxcnt] += len; + } + } else { + /* Remember where the backwards series started. */ + if (backw_stop == ~0ul) + backw_stop = idxcnt; + } + + rule = rulesets[rulearr[idxcnt + 1] * nrules + pass]; + } + + if (backw_stop != ~0ul) { + /* Handle the pushed elements now. */ + size_t backw; + + backw = idxcnt; + while (backw > backw_stop) { + size_t len = weights[idxarr[--backw]++]; + + if (needed + len < n) + while (len-- > 0) + dest[needed++] = weights[idxarr[backw]++]; + else { + /* No more characters fit into the buffer. */ + needed += len; + idxarr[backw] += len; + } + } + } + } else { + int val = 1; + char buf[7]; + size_t buflen; + size_t i; + + for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) { + if ((rule & sort_forward) != 0) { + size_t len; + + if (backw_stop != ~0ul) { + /* Handle the pushed elements now. */ + size_t backw; + + for (backw = idxcnt; backw > backw_stop;) { + --backw; + len = weights[idxarr[backw]++]; + if (len != 0) { + buflen = utf8_encode(buf, val); + if (needed + buflen + len < n) { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = weights[idxarr[backw] + i]; + } + needed += buflen + len; + idxarr[backw] += len; + val = 1; + } else + ++val; + } + + backw_stop = ~0ul; + } + + /* Now handle the forward element. */ + len = weights[idxarr[idxcnt]++]; + if (len != 0) { + buflen = utf8_encode(buf, val); + if (needed + buflen + len < n) { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = weights[idxarr[idxcnt] + i]; + } + needed += buflen + len; + idxarr[idxcnt] += len; + val = 1; + } else + /* Note that we don't have to increment `idxarr[idxcnt]' + since the length is zero. */ + ++val; + } else { + /* Remember where the backwards series started. */ + if (backw_stop == ~0ul) + backw_stop = idxcnt; + } + + rule = rulesets[rulearr[idxcnt + 1] * nrules + pass]; + } + + if (backw_stop != ~0ul) { + /* Handle the pushed elements now. */ + size_t backw; + + backw = idxmax - 1; + while (backw > backw_stop) { + size_t len = weights[idxarr[--backw]++]; + if (len != 0) { + buflen = utf8_encode(buf, val); + if (needed + buflen + len < n) { + for (i = 0; i < buflen; ++i) + dest[needed + i] = buf[i]; + for (i = 0; i < len; ++i) + dest[needed + buflen + i] = weights[idxarr[backw] + i]; + } + needed += buflen + len; + idxarr[backw] += len; + val = 1; + } else + ++val; + } + } + } + + /* Finally store the byte to separate the passes or terminate + the string. */ + if (needed < n) + dest[needed] = pass + 1 < nrules ? L('\1') : L('\0'); + ++needed; + } + + /* This is a little optimization: many collation specifications have + a `position' rule at the end and if no non-ignored character + is found the last \1 byte is immediately followed by a \0 byte + signalling this. We can avoid the \1 byte(s). */ + if (needed > 2 && needed == last_needed + 1) { + /* Remove the \1 byte. */ + if (--needed <= n) + dest[needed - 1] = L('\0'); + } + + /* Return the number of bytes/words we need, but don't count the NUL + byte/word at the end. */ + return needed - 1; +} + +size_t glib_strxfrm(char* dest, + const char* src, + size_t n, + locale_data_t* locale) { + /* Handle byte comparison case. */ + if (locale->nrules == 0) { + size_t srclen = strlen(src); + + if (n != 0) { + strncpy(dest, src, MIN(srclen + 1, n)); + } + + return srclen; + } + + /* Handle an empty string, code hereafter relies on strlen (src) > 0. */ + if (*src == L('\0')) { + if (n != 0) + *dest = L('\0'); + return 0; + } + + /* We need the elements of the string as unsigned values since they + are used as indeces. */ + const USTRING_TYPE* usrc = (const USTRING_TYPE*)src; + + /* Allocate cache for small strings on the stack and fill it with weight and + rule indices. If the cache size is not sufficient, continue with the + uncached xfrm version. */ + size_t idxmax = 0; + const USTRING_TYPE* cur = usrc; + int32_t* idxarr = alloca(SMALL_STR_SIZE * sizeof(int32_t)); + unsigned char* rulearr = alloca(SMALL_STR_SIZE + 1); + + do { + int32_t tmp = + findidx(locale->table, locale->indirect, locale->extra, &cur, -1); + rulearr[idxmax] = tmp >> 24; + idxarr[idxmax] = tmp & 0xffffff; + + ++idxmax; + } while (*cur != L('\0') && idxmax < SMALL_STR_SIZE); + + /* This element is only read, the value never used but to determine + another value which then is ignored. */ + rulearr[idxmax] = '\0'; + + /* Do the transformation. */ + if (*cur == L('\0')) + return do_xfrm_cached(dest, n, locale, idxmax, idxarr, rulearr); + else + return do_xfrm(usrc, dest, n, locale); +} diff --git a/src/system_fsm/CMakeLists.txt b/src/system_fsm/CMakeLists.txt index 449e14cc..8535a0e7 100644 --- a/src/system_fsm/CMakeLists.txt +++ b/src/system_fsm/CMakeLists.txt @@ -5,5 +5,5 @@ idf_component_register( SRCS "system_fsm.cpp" "running.cpp" "booting.cpp" "idle.cpp" "service_locator.cpp" INCLUDE_DIRS "include" - REQUIRES "tinyfsm" "drivers" "database" "ui" "result" "events" "audio" "app_console" "battery") + REQUIRES "tinyfsm" "drivers" "database" "ui" "result" "events" "audio" "app_console" "battery" "locale") target_compile_options(${COMPONENT_LIB} PRIVATE ${EXTRA_WARNINGS}) diff --git a/src/system_fsm/booting.cpp b/src/system_fsm/booting.cpp index 82d83836..893a4560 100644 --- a/src/system_fsm/booting.cpp +++ b/src/system_fsm/booting.cpp @@ -4,6 +4,7 @@ * SPDX-License-Identifier: GPL-3.0-only */ +#include "collation.hpp" #include "system_fsm.hpp" #include @@ -76,6 +77,7 @@ auto Booting::entry() -> void { sServices->track_queue(std::make_unique()); sServices->tag_parser(std::make_unique()); + sServices->collator(locale::CreateCollator()); ESP_LOGI(kTag, "init bluetooth"); sServices->bluetooth(std::make_unique(sServices->nvs())); diff --git a/src/system_fsm/include/service_locator.hpp b/src/system_fsm/include/service_locator.hpp index 1dcf0f5e..24dc1eb9 100644 --- a/src/system_fsm/include/service_locator.hpp +++ b/src/system_fsm/include/service_locator.hpp @@ -10,6 +10,7 @@ #include "battery.hpp" #include "bluetooth.hpp" +#include "collation.hpp" #include "database.hpp" #include "gpios.hpp" #include "nvs.hpp" @@ -101,6 +102,15 @@ class ServiceLocator { queue_ = std::move(i); } + auto collator() -> locale::ICollator& { + assert(collator_ != nullptr); + return *collator_; + } + + auto collator(std::unique_ptr i) { + collator_ = std::move(i); + } + // Not copyable or movable. ServiceLocator(const ServiceLocator&) = delete; ServiceLocator& operator=(const ServiceLocator&) = delete; @@ -117,6 +127,7 @@ class ServiceLocator { std::shared_ptr database_; std::unique_ptr tag_parser_; + std::unique_ptr collator_; drivers::SdState sd_; }; diff --git a/src/system_fsm/running.cpp b/src/system_fsm/running.cpp index 567553a9..91cd46af 100644 --- a/src/system_fsm/running.cpp +++ b/src/system_fsm/running.cpp @@ -55,8 +55,8 @@ void Running::entry() { ESP_LOGI(kTag, "opening database"); sFileGatherer = new database::FileGathererImpl(); - auto database_res = - database::Database::Open(*sFileGatherer, sServices->tag_parser()); + auto database_res = database::Database::Open( + *sFileGatherer, sServices->tag_parser(), sServices->collator()); if (database_res.has_error()) { ESP_LOGW(kTag, "failed to open!"); events::System().Dispatch(StorageError{});