parent
18d90051c9
commit
b58c081508
|
@ -0,0 +1,11 @@ |
|||||||
|
# Copyright 2023 jacqueline <me@jacqueline.id.au> |
||||||
|
# |
||||||
|
# SPDX-License-Identifier: GPL-3.0-only |
||||||
|
|
||||||
|
idf_component_register( |
||||||
|
SRCS "collation.cpp" "strxfrm_l.c" |
||||||
|
INCLUDE_DIRS "include" |
||||||
|
PRIV_INCLUDE_DIRS "priv_include" |
||||||
|
REQUIRES "span" "esp_partition" "spi_flash") |
||||||
|
|
||||||
|
target_compile_options(${COMPONENT_LIB} PRIVATE ${EXTRA_WARNINGS}) |
@ -0,0 +1,92 @@ |
|||||||
|
/*
|
||||||
|
* Copyright 2023 jacqueline <me@jacqueline.id.au> |
||||||
|
* |
||||||
|
* SPDX-License-Identifier: GPL-3.0-only |
||||||
|
*/ |
||||||
|
|
||||||
|
#include "collation.hpp" |
||||||
|
|
||||||
|
#include <stdint.h> |
||||||
|
#include <memory> |
||||||
|
|
||||||
|
#include "esp_flash_spi_init.h" |
||||||
|
#include "esp_log.h" |
||||||
|
#include "esp_partition.h" |
||||||
|
#include "hal/spi_flash_types.h" |
||||||
|
#include "spi_flash_mmap.h" |
||||||
|
#include "strxfrm.h" |
||||||
|
|
||||||
|
namespace locale { |
||||||
|
|
||||||
|
static constexpr char kTag[] = "collate"; |
||||||
|
|
||||||
|
static constexpr esp_partition_type_t kLocalePartitionType = |
||||||
|
static_cast<esp_partition_type_t>(0x40); |
||||||
|
static constexpr esp_partition_subtype_t kLcCollateSubtype = |
||||||
|
static_cast<esp_partition_subtype_t>(0x0); |
||||||
|
|
||||||
|
auto CreateCollator() -> std::unique_ptr<ICollator> { |
||||||
|
std::unique_ptr<ICollator> ret{GLibCollator::create()}; |
||||||
|
if (!ret) { |
||||||
|
ret.reset(new NoopCollator()); |
||||||
|
} |
||||||
|
return ret; |
||||||
|
} |
||||||
|
|
||||||
|
auto GLibCollator::create() -> GLibCollator* { |
||||||
|
uint32_t data_pages = spi_flash_mmap_get_free_pages(SPI_FLASH_MMAP_DATA); |
||||||
|
ESP_LOGI(kTag, "free data pages: %lu (%lu KiB)", data_pages, data_pages * 64); |
||||||
|
|
||||||
|
const esp_partition_t* partition = |
||||||
|
esp_partition_find_first(kLocalePartitionType, kLcCollateSubtype, NULL); |
||||||
|
if (partition == NULL) { |
||||||
|
ESP_LOGW(kTag, "no LC_COLLATE partition found"); |
||||||
|
} |
||||||
|
|
||||||
|
ESP_LOGI(kTag, "found LC_COLLATE partition of size %lu", partition->size); |
||||||
|
if (partition->size > data_pages * 64 * 1024) { |
||||||
|
ESP_LOGE(kTag, "not enough free pages to map LC_COLLATE partition!"); |
||||||
|
return nullptr; |
||||||
|
} |
||||||
|
|
||||||
|
const void* region; |
||||||
|
esp_partition_mmap_handle_t handle; |
||||||
|
esp_err_t err = esp_partition_mmap(partition, 0, partition->size, |
||||||
|
ESP_PARTITION_MMAP_DATA, ®ion, &handle); |
||||||
|
if (err != ESP_OK) { |
||||||
|
ESP_LOGE(kTag, "LC_COLLATE mmap failed"); |
||||||
|
return nullptr; |
||||||
|
} |
||||||
|
|
||||||
|
auto data = std::make_unique<locale_data_t>(); |
||||||
|
if (!parse_locale_data(region, partition->size, data.get())) { |
||||||
|
ESP_LOGE(kTag, "parsing locale data failed"); |
||||||
|
esp_partition_munmap(handle); |
||||||
|
return nullptr; |
||||||
|
} |
||||||
|
|
||||||
|
return new GLibCollator(handle, std::move(data)); |
||||||
|
} |
||||||
|
|
||||||
|
GLibCollator::GLibCollator(const esp_partition_mmap_handle_t handle, |
||||||
|
std::unique_ptr<locale_data_t> locale) |
||||||
|
: handle_(handle), locale_data_(std::move(locale)) {} |
||||||
|
|
||||||
|
GLibCollator::~GLibCollator() { |
||||||
|
esp_partition_munmap(handle_); |
||||||
|
} |
||||||
|
|
||||||
|
auto GLibCollator::Transform(const std::string& in) -> std::string { |
||||||
|
char dest[256]; |
||||||
|
size_t size = glib_strxfrm(dest, in.c_str(), 256, locale_data_.get()); |
||||||
|
if (size >= 256) { |
||||||
|
char* larger_dest = new char[size + 1]{0}; |
||||||
|
glib_strxfrm(larger_dest, in.c_str(), size, locale_data_.get()); |
||||||
|
std::string out{larger_dest, size}; |
||||||
|
delete[] larger_dest; |
||||||
|
return out; |
||||||
|
} |
||||||
|
return {dest, size}; |
||||||
|
} |
||||||
|
|
||||||
|
} // namespace locale
|
@ -0,0 +1,49 @@ |
|||||||
|
/*
|
||||||
|
* Copyright 2023 jacqueline <me@jacqueline.id.au> |
||||||
|
* |
||||||
|
* SPDX-License-Identifier: GPL-3.0-only |
||||||
|
*/ |
||||||
|
|
||||||
|
#pragma once |
||||||
|
|
||||||
|
#include <cstddef> |
||||||
|
#include <memory> |
||||||
|
#include <string> |
||||||
|
|
||||||
|
#include "esp_partition.h" |
||||||
|
#include "span.hpp" |
||||||
|
|
||||||
|
#include "strxfrm.h" |
||||||
|
|
||||||
|
namespace locale { |
||||||
|
|
||||||
|
class ICollator { |
||||||
|
public: |
||||||
|
virtual ~ICollator() {} |
||||||
|
|
||||||
|
virtual auto Transform(const std::string&) -> std::string = 0; |
||||||
|
}; |
||||||
|
|
||||||
|
class NoopCollator : public ICollator { |
||||||
|
public: |
||||||
|
auto Transform(const std::string& in) -> std::string override { return in; } |
||||||
|
}; |
||||||
|
|
||||||
|
auto CreateCollator() -> std::unique_ptr<ICollator>; |
||||||
|
|
||||||
|
class GLibCollator : public ICollator { |
||||||
|
public: |
||||||
|
static auto create() -> GLibCollator*; |
||||||
|
~GLibCollator(); |
||||||
|
|
||||||
|
auto Transform(const std::string& in) -> std::string override; |
||||||
|
|
||||||
|
private: |
||||||
|
GLibCollator(const esp_partition_mmap_handle_t, |
||||||
|
std::unique_ptr<locale_data_t>); |
||||||
|
|
||||||
|
const esp_partition_mmap_handle_t handle_; |
||||||
|
std::unique_ptr<locale_data_t> locale_data_; |
||||||
|
}; |
||||||
|
|
||||||
|
} // namespace locale
|
@ -0,0 +1,35 @@ |
|||||||
|
/*
|
||||||
|
* Copyright 2023 jacqueline <me@jacqueline.id.au> |
||||||
|
* |
||||||
|
* SPDX-License-Identifier: GPL-3.0-only |
||||||
|
*/ |
||||||
|
|
||||||
|
#pragma once |
||||||
|
|
||||||
|
#include <stddef.h> |
||||||
|
#include <stdint.h> |
||||||
|
#include <stdbool.h> |
||||||
|
|
||||||
|
#ifdef __cplusplus |
||||||
|
extern "C" { |
||||||
|
#endif |
||||||
|
|
||||||
|
typedef struct { |
||||||
|
uint_fast32_t nrules; |
||||||
|
unsigned char* rulesets; |
||||||
|
unsigned char* weights; |
||||||
|
int32_t* table; |
||||||
|
unsigned char* extra; |
||||||
|
int32_t* indirect; |
||||||
|
} locale_data_t; |
||||||
|
|
||||||
|
bool parse_locale_data(const void* raw_data, size_t size, locale_data_t* out); |
||||||
|
|
||||||
|
size_t glib_strxfrm(char* dest, |
||||||
|
const char* src, |
||||||
|
size_t n, |
||||||
|
locale_data_t* locale); |
||||||
|
|
||||||
|
#ifdef __cplusplus |
||||||
|
} |
||||||
|
#endif |
@ -0,0 +1,160 @@ |
|||||||
|
/* Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library. |
||||||
|
Written by Ulrich Drepper, <drepper@cygnus.com>. |
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or |
||||||
|
modify it under the terms of the GNU Lesser General Public |
||||||
|
License as published by the Free Software Foundation; either |
||||||
|
version 2.1 of the License, or (at your option) any later version. |
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful, |
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||||
|
Lesser General Public License for more details. |
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public |
||||||
|
License along with the GNU C Library; if not, see |
||||||
|
<http://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#pragma once |
||||||
|
|
||||||
|
#include <stdint.h> |
||||||
|
#include <stddef.h> |
||||||
|
|
||||||
|
/* This alignment is used for 32-bit integers in locale files, both
|
||||||
|
those that are explicitly int32_t or uint32_t and those that are |
||||||
|
wchar_t, regardless of the (possibly smaller) alignment required |
||||||
|
for such integers on a particular host. */ |
||||||
|
#define LOCFILE_ALIGN sizeof (int32_t) |
||||||
|
#define LOCFILE_ALIGN_MASK (LOCFILE_ALIGN - 1) |
||||||
|
#define LOCFILE_ALIGN_UP(x) (((x) + LOCFILE_ALIGN - 1) \ |
||||||
|
& ~LOCFILE_ALIGN_MASK) |
||||||
|
#define LOCFILE_ALIGNED_P(x) (((x) & LOCFILE_ALIGN_MASK) == 0) |
||||||
|
|
||||||
|
/* Find index of weight. */ |
||||||
|
static inline int32_t __attribute__ ((always_inline)) |
||||||
|
findidx (const int32_t *table, |
||||||
|
const int32_t *indirect, |
||||||
|
const unsigned char *extra, |
||||||
|
const unsigned char **cpp, size_t len) |
||||||
|
{ |
||||||
|
int_fast32_t i = table[*(*cpp)++]; |
||||||
|
const unsigned char *cp; |
||||||
|
const unsigned char *usrc; |
||||||
|
|
||||||
|
if (i >= 0) |
||||||
|
/* This is an index into the weight table. Cool. */ |
||||||
|
return i; |
||||||
|
|
||||||
|
/* Oh well, more than one sequence starting with this byte.
|
||||||
|
Search for the correct one. */ |
||||||
|
cp = &extra[-i]; |
||||||
|
usrc = *cpp; |
||||||
|
--len; |
||||||
|
while (1) |
||||||
|
{ |
||||||
|
size_t nhere; |
||||||
|
|
||||||
|
/* The first thing is the index. */ |
||||||
|
i = *((const int32_t *) cp); |
||||||
|
cp += sizeof (int32_t); |
||||||
|
|
||||||
|
/* Next is the length of the byte sequence. These are always
|
||||||
|
short byte sequences so there is no reason to call any |
||||||
|
function (even if they are inlined). */ |
||||||
|
nhere = *cp++; |
||||||
|
|
||||||
|
if (i >= 0) |
||||||
|
{ |
||||||
|
/* It is a single character. If it matches we found our
|
||||||
|
index. Note that at the end of each list there is an |
||||||
|
entry of length zero which represents the single byte |
||||||
|
sequence. The first (and here only) byte was tested |
||||||
|
already. */ |
||||||
|
size_t cnt; |
||||||
|
|
||||||
|
/* With GCC 5.3 when compiling with -Os the compiler warns
|
||||||
|
that seq2.back_us, which becomes usrc, might be used |
||||||
|
uninitialized. This can't be true because we pass a length |
||||||
|
of -1 for len at the same time which means that this loop |
||||||
|
never executes. */ |
||||||
|
for (cnt = 0; cnt < nhere && cnt < len; ++cnt) |
||||||
|
if (cp[cnt] != usrc[cnt]) |
||||||
|
break; |
||||||
|
|
||||||
|
if (cnt == nhere) |
||||||
|
{ |
||||||
|
/* Found it. */ |
||||||
|
*cpp += nhere; |
||||||
|
return i; |
||||||
|
} |
||||||
|
|
||||||
|
/* Up to the next entry. */ |
||||||
|
cp += nhere; |
||||||
|
if (!LOCFILE_ALIGNED_P (1 + nhere)) |
||||||
|
cp += LOCFILE_ALIGN - (1 + nhere) % LOCFILE_ALIGN; |
||||||
|
} |
||||||
|
else |
||||||
|
{ |
||||||
|
/* This is a range of characters. First decide whether the
|
||||||
|
current byte sequence lies in the range. */ |
||||||
|
size_t cnt; |
||||||
|
size_t offset = 0; |
||||||
|
|
||||||
|
for (cnt = 0; cnt < nhere && cnt < len; ++cnt) |
||||||
|
if (cp[cnt] != usrc[cnt]) |
||||||
|
break; |
||||||
|
|
||||||
|
if (cnt != nhere) |
||||||
|
{ |
||||||
|
if (cnt == len || cp[cnt] > usrc[cnt]) |
||||||
|
{ |
||||||
|
/* Cannot be in this range. */ |
||||||
|
cp += 2 * nhere; |
||||||
|
if (!LOCFILE_ALIGNED_P (1 + 2 * nhere)) |
||||||
|
cp += (LOCFILE_ALIGN |
||||||
|
- (1 + 2 * nhere) % LOCFILE_ALIGN); |
||||||
|
continue; |
||||||
|
} |
||||||
|
|
||||||
|
/* Test against the end of the range. */ |
||||||
|
for (cnt = 0; cnt < nhere; ++cnt) |
||||||
|
if (cp[nhere + cnt] != usrc[cnt]) |
||||||
|
break; |
||||||
|
|
||||||
|
if (cnt != nhere && cp[nhere + cnt] < usrc[cnt]) |
||||||
|
{ |
||||||
|
/* Cannot be in this range. */ |
||||||
|
cp += 2 * nhere; |
||||||
|
if (!LOCFILE_ALIGNED_P (1 + 2 * nhere)) |
||||||
|
cp += (LOCFILE_ALIGN |
||||||
|
- (1 + 2 * nhere) % LOCFILE_ALIGN); |
||||||
|
continue; |
||||||
|
} |
||||||
|
|
||||||
|
/* This range matches the next characters. Now find
|
||||||
|
the offset in the indirect table. */ |
||||||
|
for (cnt = 0; cp[cnt] == usrc[cnt]; ++cnt); |
||||||
|
|
||||||
|
do |
||||||
|
{ |
||||||
|
offset <<= 8; |
||||||
|
/* With GCC 7 when compiling with -Os the compiler
|
||||||
|
warns that seq1.back_us and seq2.back_us, which |
||||||
|
become usrc, might be used uninitialized. This |
||||||
|
is impossible for the same reason as described |
||||||
|
above. */ |
||||||
|
offset += usrc[cnt] - cp[cnt]; |
||||||
|
} |
||||||
|
while (++cnt < nhere); |
||||||
|
} |
||||||
|
|
||||||
|
*cpp += nhere; |
||||||
|
return indirect[-i + offset]; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
/* NOTREACHED */ |
||||||
|
return 0x43219876; |
||||||
|
} |
||||||
|
|
@ -0,0 +1,641 @@ |
|||||||
|
/* Copyright (C) 1995-2018 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library. |
||||||
|
Written by Ulrich Drepper <drepper@gnu.org>, 1995. |
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or |
||||||
|
modify it under the terms of the GNU Lesser General Public |
||||||
|
License as published by the Free Software Foundation; either |
||||||
|
version 2.1 of the License, or (at your option) any later version. |
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful, |
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||||
|
Lesser General Public License for more details. |
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public |
||||||
|
License along with the GNU C Library; if not, see |
||||||
|
<http://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
|
#include "strxfrm.h" |
||||||
|
|
||||||
|
#include <assert.h> |
||||||
|
#include <stdbool.h> |
||||||
|
#include <stddef.h> |
||||||
|
#include <stdint.h> |
||||||
|
#include <stdlib.h> |
||||||
|
#include <string.h> |
||||||
|
#include <sys/param.h> |
||||||
|
|
||||||
|
#include "esp_log.h" |
||||||
|
|
||||||
|
#include "weight.h" |
||||||
|
|
||||||
|
static const char kTag[] = "strxfrm"; |
||||||
|
|
||||||
|
#ifndef STRING_TYPE |
||||||
|
#define STRING_TYPE char |
||||||
|
#define USTRING_TYPE unsigned char |
||||||
|
#define STRLEN strlen |
||||||
|
#define STPNCPY stpncpy |
||||||
|
#define L(arg) arg |
||||||
|
#endif |
||||||
|
|
||||||
|
#define CONCAT(a, b) CONCAT1(a, b) |
||||||
|
#define CONCAT1(a, b) a##b |
||||||
|
|
||||||
|
/* Maximum string size that is calculated with cached indices. Right now this
|
||||||
|
is an arbitrary value open to optimizations. SMALL_STR_SIZE * 4 has to be |
||||||
|
lower than __MAX_ALLOCA_CUTOFF. Keep localedata/xfrm-test.c in sync. */ |
||||||
|
#define SMALL_STR_SIZE 4095 |
||||||
|
|
||||||
|
/* We know three kinds of collation sorting rules. */ |
||||||
|
enum coll_sort_rule { |
||||||
|
illegal_0__, |
||||||
|
sort_forward, |
||||||
|
sort_backward, |
||||||
|
illegal_3__, |
||||||
|
sort_position, |
||||||
|
sort_forward_position, |
||||||
|
sort_backward_position, |
||||||
|
sort_mask |
||||||
|
}; |
||||||
|
|
||||||
|
enum collate_element { |
||||||
|
COLLATE_NRULES = 0, |
||||||
|
COLLATE_RULESETS, |
||||||
|
COLLATE_TABLEMB, |
||||||
|
COLLATE_WEIGHTMB, |
||||||
|
COLLATE_EXTRAMB, |
||||||
|
COLLATE_INDIRECTMB, |
||||||
|
COLLATE_GAP1, |
||||||
|
COLLATE_GAP2, |
||||||
|
COLLATE_GAP3, |
||||||
|
COLLATE_TABLEWC, |
||||||
|
COLLATE_WEIGHTWC, |
||||||
|
COLLATE_EXTRAWC, |
||||||
|
COLLATE_INDIRECTWC, |
||||||
|
COLLATE_SYMB_HASH_SIZEMB, |
||||||
|
COLLATE_SYMB_TABLEMB, |
||||||
|
COLLATE_SYMB_EXTRAMB, |
||||||
|
COLLATE_COLLSEQMB, |
||||||
|
COLLATE_COLLSEQWC, |
||||||
|
COLLATE_CODESET, |
||||||
|
|
||||||
|
// Not a real element; used to know how many elements there are.
|
||||||
|
COLLATE_LAST |
||||||
|
}; |
||||||
|
typedef enum collate_element collate_element_t; |
||||||
|
|
||||||
|
bool parse_locale_data(const void* raw_data, size_t size, locale_data_t* out) { |
||||||
|
const struct { |
||||||
|
unsigned int magic; |
||||||
|
unsigned int nstrings; |
||||||
|
unsigned int strindex[0]; |
||||||
|
}* const header = raw_data; |
||||||
|
|
||||||
|
if (header->magic != 0x20051017) { |
||||||
|
ESP_LOGE(kTag, "file magic incorrect (was %x)", header->magic); |
||||||
|
return false; |
||||||
|
} |
||||||
|
|
||||||
|
if (sizeof(*header) + header->nstrings * sizeof(unsigned int) >= size) { |
||||||
|
ESP_LOGE(kTag, "file was too small to contain header"); |
||||||
|
return false; |
||||||
|
} |
||||||
|
|
||||||
|
if (header->nstrings != COLLATE_LAST) { |
||||||
|
ESP_LOGE(kTag, "file has incorrect number of elements (was %u, wanted %u)", |
||||||
|
header->nstrings, COLLATE_LAST); |
||||||
|
return false; |
||||||
|
} |
||||||
|
|
||||||
|
// The LC_COLLATE partition appears to contain data in the correct shape.
|
||||||
|
// Pull out pointers to the various attributes it contains.
|
||||||
|
const void* offsets[COLLATE_LAST]; |
||||||
|
for (size_t i = 0; i < header->nstrings; i++) { |
||||||
|
size_t offset = header->strindex[i]; |
||||||
|
if (offset > size) { |
||||||
|
ESP_LOGE(kTag, "element offset (%u) exceeds file size", offset); |
||||||
|
return false; |
||||||
|
} |
||||||
|
offsets[i] = (raw_data + offset); |
||||||
|
} |
||||||
|
|
||||||
|
// Now parse those pointers into the output struct.
|
||||||
|
out->nrules = *(const unsigned int*)offsets[COLLATE_NRULES]; |
||||||
|
|
||||||
|
out->rulesets = (unsigned char*)offsets[COLLATE_RULESETS]; |
||||||
|
out->table = (int32_t*)offsets[COLLATE_TABLEMB]; |
||||||
|
out->weights = (unsigned char*)offsets[COLLATE_WEIGHTMB]; |
||||||
|
out->extra = (unsigned char*)offsets[COLLATE_EXTRAMB]; |
||||||
|
out->indirect = (int32_t*)offsets[COLLATE_INDIRECTMB]; |
||||||
|
|
||||||
|
assert(((uintptr_t)out->table) % __alignof__(out->table[0]) == 0); |
||||||
|
assert(((uintptr_t)out->weights) % __alignof__(out->weights[0]) == 0); |
||||||
|
assert(((uintptr_t)out->extra) % __alignof__(out->extra[0]) == 0); |
||||||
|
assert(((uintptr_t)out->indirect) % __alignof__(out->indirect[0]) == 0); |
||||||
|
|
||||||
|
return true; |
||||||
|
} |
||||||
|
|
||||||
|
/* We need UTF-8 encoding of numbers. */ |
||||||
|
static int utf8_encode(char* buf, int val) { |
||||||
|
int retval; |
||||||
|
|
||||||
|
if (val < 0x80) { |
||||||
|
*buf++ = (char)val; |
||||||
|
retval = 1; |
||||||
|
} else { |
||||||
|
int step; |
||||||
|
|
||||||
|
for (step = 2; step < 6; ++step) |
||||||
|
if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0) |
||||||
|
break; |
||||||
|
retval = step; |
||||||
|
|
||||||
|
*buf = (unsigned char)(~0xff >> step); |
||||||
|
--step; |
||||||
|
do { |
||||||
|
buf[step] = 0x80 | (val & 0x3f); |
||||||
|
val >>= 6; |
||||||
|
} while (--step > 0); |
||||||
|
*buf |= val; |
||||||
|
} |
||||||
|
|
||||||
|
return retval; |
||||||
|
} |
||||||
|
|
||||||
|
/* Find next weight and rule index. Inlined since called for every char. */ |
||||||
|
static __always_inline size_t find_idx(const USTRING_TYPE** us, |
||||||
|
int32_t* weight_idx, |
||||||
|
unsigned char* rule_idx, |
||||||
|
const locale_data_t* l_data, |
||||||
|
const int pass) { |
||||||
|
int32_t tmp = findidx(l_data->table, l_data->indirect, l_data->extra, us, -1); |
||||||
|
*rule_idx = tmp >> 24; |
||||||
|
int32_t idx = tmp & 0xffffff; |
||||||
|
size_t len = l_data->weights[idx++]; |
||||||
|
|
||||||
|
/* Skip over indices of previous levels. */ |
||||||
|
for (int i = 0; i < pass; i++) { |
||||||
|
idx += len; |
||||||
|
len = l_data->weights[idx++]; |
||||||
|
} |
||||||
|
|
||||||
|
*weight_idx = idx; |
||||||
|
return len; |
||||||
|
} |
||||||
|
|
||||||
|
static int find_position(const USTRING_TYPE* us, |
||||||
|
const locale_data_t* l_data, |
||||||
|
const int pass) { |
||||||
|
int32_t weight_idx; |
||||||
|
unsigned char rule_idx; |
||||||
|
const USTRING_TYPE* usrc = us; |
||||||
|
|
||||||
|
find_idx(&usrc, &weight_idx, &rule_idx, l_data, pass); |
||||||
|
return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position; |
||||||
|
} |
||||||
|
|
||||||
|
/* Do the transformation. */ |
||||||
|
static size_t do_xfrm(const USTRING_TYPE* usrc, |
||||||
|
STRING_TYPE* dest, |
||||||
|
size_t n, |
||||||
|
const locale_data_t* l_data) { |
||||||
|
int32_t weight_idx; |
||||||
|
unsigned char rule_idx; |
||||||
|
uint_fast32_t pass; |
||||||
|
size_t needed = 0; |
||||||
|
size_t last_needed; |
||||||
|
|
||||||
|
/* Now the passes over the weights. */ |
||||||
|
for (pass = 0; pass < l_data->nrules; ++pass) { |
||||||
|
size_t backw_len = 0; |
||||||
|
last_needed = needed; |
||||||
|
const USTRING_TYPE* cur = usrc; |
||||||
|
const USTRING_TYPE* backw_start = NULL; |
||||||
|
|
||||||
|
/* We assume that if a rule has defined `position' in one section
|
||||||
|
this is true for all of them. */ |
||||||
|
int position = find_position(cur, l_data, pass); |
||||||
|
|
||||||
|
if (position == 0) { |
||||||
|
while (*cur != L('\0')) { |
||||||
|
const USTRING_TYPE* pos = cur; |
||||||
|
size_t len = find_idx(&cur, &weight_idx, &rule_idx, l_data, pass); |
||||||
|
int rule = l_data->rulesets[rule_idx * l_data->nrules + pass]; |
||||||
|
|
||||||
|
if ((rule & sort_forward) != 0) { |
||||||
|
/* Handle the pushed backward sequence. */ |
||||||
|
if (backw_start != NULL) { |
||||||
|
for (size_t i = backw_len; i > 0;) { |
||||||
|
int32_t weight_idx; |
||||||
|
unsigned char rule_idx; |
||||||
|
size_t len = |
||||||
|
find_idx(&backw_start, &weight_idx, &rule_idx, l_data, pass); |
||||||
|
if (needed + i < n) |
||||||
|
for (size_t j = len; j > 0; j--) |
||||||
|
dest[needed + i - j] = l_data->weights[weight_idx++]; |
||||||
|
|
||||||
|
i -= len; |
||||||
|
} |
||||||
|
|
||||||
|
needed += backw_len; |
||||||
|
backw_start = NULL; |
||||||
|
backw_len = 0; |
||||||
|
} |
||||||
|
|
||||||
|
/* Now handle the forward element. */ |
||||||
|
if (needed + len < n) |
||||||
|
while (len-- > 0) |
||||||
|
dest[needed++] = l_data->weights[weight_idx++]; |
||||||
|
else |
||||||
|
/* No more characters fit into the buffer. */ |
||||||
|
needed += len; |
||||||
|
} else { |
||||||
|
/* Remember start of the backward sequence & track length. */ |
||||||
|
if (backw_start == NULL) |
||||||
|
backw_start = pos; |
||||||
|
backw_len += len; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
/* Handle the pushed backward sequence. */ |
||||||
|
if (backw_start != NULL) { |
||||||
|
for (size_t i = backw_len; i > 0;) { |
||||||
|
size_t len = |
||||||
|
find_idx(&backw_start, &weight_idx, &rule_idx, l_data, pass); |
||||||
|
if (needed + i < n) |
||||||
|
for (size_t j = len; j > 0; j--) |
||||||
|
dest[needed + i - j] = l_data->weights[weight_idx++]; |
||||||
|
|
||||||
|
i -= len; |
||||||
|
} |
||||||
|
|
||||||
|
needed += backw_len; |
||||||
|
} |
||||||
|
} else { |
||||||
|
int val = 1; |
||||||
|
char buf[7]; |
||||||
|
size_t buflen; |
||||||
|
size_t i; |
||||||
|
|
||||||
|
while (*cur != L('\0')) { |
||||||
|
const USTRING_TYPE* pos = cur; |
||||||
|
size_t len = find_idx(&cur, &weight_idx, &rule_idx, l_data, pass); |
||||||
|
int rule = l_data->rulesets[rule_idx * l_data->nrules + pass]; |
||||||
|
|
||||||
|
if ((rule & sort_forward) != 0) { |
||||||
|
/* Handle the pushed backward sequence. */ |
||||||
|
if (backw_start != NULL) { |
||||||
|
for (size_t p = backw_len; p > 0; p--) { |
||||||
|
size_t len; |
||||||
|
int32_t weight_idx; |
||||||
|
unsigned char rule_idx; |
||||||
|
const USTRING_TYPE* backw_cur = backw_start; |
||||||
|
|
||||||
|
/* To prevent a warning init the used vars. */ |
||||||
|
len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass); |
||||||
|
|
||||||
|
for (i = 1; i < p; i++) |
||||||
|
len = |
||||||
|
find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass); |
||||||
|
|
||||||
|
if (len != 0) { |
||||||
|
buflen = utf8_encode(buf, val); |
||||||
|
if (needed + buflen + len < n) { |
||||||
|
for (i = 0; i < buflen; ++i) |
||||||
|
dest[needed + i] = buf[i]; |
||||||
|
for (i = 0; i < len; ++i) |
||||||
|
dest[needed + buflen + i] = l_data->weights[weight_idx + i]; |
||||||
|
} |
||||||
|
needed += buflen + len; |
||||||
|
val = 1; |
||||||
|
} else |
||||||
|
++val; |
||||||
|
} |
||||||
|
|
||||||
|
backw_start = NULL; |
||||||
|
backw_len = 0; |
||||||
|
} |
||||||
|
|
||||||
|
/* Now handle the forward element. */ |
||||||
|
if (len != 0) { |
||||||
|
buflen = utf8_encode(buf, val); |
||||||
|
if (needed + buflen + len < n) { |
||||||
|
for (i = 0; i < buflen; ++i) |
||||||
|
dest[needed + i] = buf[i]; |
||||||
|
for (i = 0; i < len; ++i) |
||||||
|
dest[needed + buflen + i] = l_data->weights[weight_idx + i]; |
||||||
|
} |
||||||
|
needed += buflen + len; |
||||||
|
val = 1; |
||||||
|
} else |
||||||
|
++val; |
||||||
|
} else { |
||||||
|
/* Remember start of the backward sequence & track length. */ |
||||||
|
if (backw_start == NULL) |
||||||
|
backw_start = pos; |
||||||
|
backw_len++; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
/* Handle the pushed backward sequence. */ |
||||||
|
if (backw_start != NULL) { |
||||||
|
for (size_t p = backw_len; p > 0; p--) { |
||||||
|
size_t len; |
||||||
|
int32_t weight_idx; |
||||||
|
unsigned char rule_idx; |
||||||
|
const USTRING_TYPE* backw_cur = backw_start; |
||||||
|
|
||||||
|
/* To prevent a warning init the used vars. */ |
||||||
|
len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass); |
||||||
|
|
||||||
|
for (i = 1; i < p; i++) |
||||||
|
len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass); |
||||||
|
|
||||||
|
if (len != 0) { |
||||||
|
buflen = utf8_encode(buf, val); |
||||||
|
if (needed + buflen + len < n) { |
||||||
|
for (i = 0; i < buflen; ++i) |
||||||
|
dest[needed + i] = buf[i]; |
||||||
|
for (i = 0; i < len; ++i) |
||||||
|
dest[needed + buflen + i] = l_data->weights[weight_idx + i]; |
||||||
|
} |
||||||
|
needed += buflen + len; |
||||||
|
val = 1; |
||||||
|
} else |
||||||
|
++val; |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
/* Finally store the byte to separate the passes or terminate
|
||||||
|
the string. */ |
||||||
|
if (needed < n) |
||||||
|
dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0'); |
||||||
|
++needed; |
||||||
|
} |
||||||
|
|
||||||
|
/* This is a little optimization: many collation specifications have
|
||||||
|
a `position' rule at the end and if no non-ignored character |
||||||
|
is found the last \1 byte is immediately followed by a \0 byte |
||||||
|
signalling this. We can avoid the \1 byte(s). */ |
||||||
|
if (needed > 2 && needed == last_needed + 1) { |
||||||
|
/* Remove the \1 byte. */ |
||||||
|
if (--needed <= n) |
||||||
|
dest[needed - 1] = L('\0'); |
||||||
|
} |
||||||
|
|
||||||
|
/* Return the number of bytes/words we need, but don't count the NUL
|
||||||
|
byte/word at the end. */ |
||||||
|
return needed - 1; |
||||||
|
} |
||||||
|
|
||||||
|
/* Do the transformation using weight-index and rule cache. */ |
||||||
|
static size_t do_xfrm_cached(STRING_TYPE* dest, |
||||||
|
size_t n, |
||||||
|
const locale_data_t* l_data, |
||||||
|
size_t idxmax, |
||||||
|
int32_t* idxarr, |
||||||
|
const unsigned char* rulearr) { |
||||||
|
uint_fast32_t nrules = l_data->nrules; |
||||||
|
unsigned char* rulesets = l_data->rulesets; |
||||||
|
USTRING_TYPE* weights = l_data->weights; |
||||||
|
uint_fast32_t pass; |
||||||
|
size_t needed = 0; |
||||||
|
size_t last_needed; |
||||||
|
size_t idxcnt; |
||||||
|
|
||||||
|
/* Now the passes over the weights. */ |
||||||
|
for (pass = 0; pass < nrules; ++pass) { |
||||||
|
size_t backw_stop = ~0ul; |
||||||
|
int rule = rulesets[rulearr[0] * nrules + pass]; |
||||||
|
/* We assume that if a rule has defined `position' in one section
|
||||||
|
this is true for all of them. */ |
||||||
|
int position = rule & sort_position; |
||||||
|
|
||||||
|
last_needed = needed; |
||||||
|
if (position == 0) { |
||||||
|
for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) { |
||||||
|
if ((rule & sort_forward) != 0) { |
||||||
|
size_t len; |
||||||
|
|
||||||
|
if (backw_stop != ~0ul) { |
||||||
|
/* Handle the pushed elements now. */ |
||||||
|
size_t backw; |
||||||
|
|
||||||
|
for (backw = idxcnt; backw > backw_stop;) { |
||||||
|
--backw; |
||||||
|
len = weights[idxarr[backw]++]; |
||||||
|
|
||||||
|
if (needed + len < n) |
||||||
|
while (len-- > 0) |
||||||
|
dest[needed++] = weights[idxarr[backw]++]; |
||||||
|
else { |
||||||
|
/* No more characters fit into the buffer. */ |
||||||
|
needed += len; |
||||||
|
idxarr[backw] += len; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
backw_stop = ~0ul; |
||||||
|
} |
||||||
|
|
||||||
|
/* Now handle the forward element. */ |
||||||
|
len = weights[idxarr[idxcnt]++]; |
||||||
|
if (needed + len < n) |
||||||
|
while (len-- > 0) |
||||||
|
dest[needed++] = weights[idxarr[idxcnt]++]; |
||||||
|
else { |
||||||
|
/* No more characters fit into the buffer. */ |
||||||
|
needed += len; |
||||||
|
idxarr[idxcnt] += len; |
||||||
|
} |
||||||
|
} else { |
||||||
|
/* Remember where the backwards series started. */ |
||||||
|
if (backw_stop == ~0ul) |
||||||
|
backw_stop = idxcnt; |
||||||
|
} |
||||||
|
|
||||||
|
rule = rulesets[rulearr[idxcnt + 1] * nrules + pass]; |
||||||
|
} |
||||||
|
|
||||||
|
if (backw_stop != ~0ul) { |
||||||
|
/* Handle the pushed elements now. */ |
||||||
|
size_t backw; |
||||||
|
|
||||||
|
backw = idxcnt; |
||||||
|
while (backw > backw_stop) { |
||||||
|
size_t len = weights[idxarr[--backw]++]; |
||||||
|
|
||||||
|
if (needed + len < n) |
||||||
|
while (len-- > 0) |
||||||
|
dest[needed++] = weights[idxarr[backw]++]; |
||||||
|
else { |
||||||
|
/* No more characters fit into the buffer. */ |
||||||
|
needed += len; |
||||||
|
idxarr[backw] += len; |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
} else { |
||||||
|
int val = 1; |
||||||
|
char buf[7]; |
||||||
|
size_t buflen; |
||||||
|
size_t i; |
||||||
|
|
||||||
|
for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) { |
||||||
|
if ((rule & sort_forward) != 0) { |
||||||
|
size_t len; |
||||||
|
|
||||||
|
if (backw_stop != ~0ul) { |
||||||
|
/* Handle the pushed elements now. */ |
||||||
|
size_t backw; |
||||||
|
|
||||||
|
for (backw = idxcnt; backw > backw_stop;) { |
||||||
|
--backw; |
||||||
|
len = weights[idxarr[backw]++]; |
||||||
|
if (len != 0) { |
||||||
|
buflen = utf8_encode(buf, val); |
||||||
|
if (needed + buflen + len < n) { |
||||||
|
for (i = 0; i < buflen; ++i) |
||||||
|
dest[needed + i] = buf[i]; |
||||||
|
for (i = 0; i < len; ++i) |
||||||
|
dest[needed + buflen + i] = weights[idxarr[backw] + i]; |
||||||
|
} |
||||||
|
needed += buflen + len; |
||||||
|
idxarr[backw] += len; |
||||||
|
val = 1; |
||||||
|
} else |
||||||
|
++val; |
||||||
|
} |
||||||
|
|
||||||
|
backw_stop = ~0ul; |
||||||
|
} |
||||||
|
|
||||||
|
/* Now handle the forward element. */ |
||||||
|
len = weights[idxarr[idxcnt]++]; |
||||||
|
if (len != 0) { |
||||||
|
buflen = utf8_encode(buf, val); |
||||||
|
if (needed + buflen + len < n) { |
||||||
|
for (i = 0; i < buflen; ++i) |
||||||
|
dest[needed + i] = buf[i]; |
||||||
|
for (i = 0; i < len; ++i) |
||||||
|
dest[needed + buflen + i] = weights[idxarr[idxcnt] + i]; |
||||||
|
} |
||||||
|
needed += buflen + len; |
||||||
|
idxarr[idxcnt] += len; |
||||||
|
val = 1; |
||||||
|
} else |
||||||
|
/* Note that we don't have to increment `idxarr[idxcnt]'
|
||||||
|
since the length is zero. */ |
||||||
|
++val; |
||||||
|
} else { |
||||||
|
/* Remember where the backwards series started. */ |
||||||
|
if (backw_stop == ~0ul) |
||||||
|
backw_stop = idxcnt; |
||||||
|
} |
||||||
|
|
||||||
|
rule = rulesets[rulearr[idxcnt + 1] * nrules + pass]; |
||||||
|
} |
||||||
|
|
||||||
|
if (backw_stop != ~0ul) { |
||||||
|
/* Handle the pushed elements now. */ |
||||||
|
size_t backw; |
||||||
|
|
||||||
|
backw = idxmax - 1; |
||||||
|
while (backw > backw_stop) { |
||||||
|
size_t len = weights[idxarr[--backw]++]; |
||||||
|
if (len != 0) { |
||||||
|
buflen = utf8_encode(buf, val); |
||||||
|
if (needed + buflen + len < n) { |
||||||
|
for (i = 0; i < buflen; ++i) |
||||||
|
dest[needed + i] = buf[i]; |
||||||
|
for (i = 0; i < len; ++i) |
||||||
|
dest[needed + buflen + i] = weights[idxarr[backw] + i]; |
||||||
|
} |
||||||
|
needed += buflen + len; |
||||||
|
idxarr[backw] += len; |
||||||
|
val = 1; |
||||||
|
} else |
||||||
|
++val; |
||||||
|
} |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
/* Finally store the byte to separate the passes or terminate
|
||||||
|
the string. */ |
||||||
|
if (needed < n) |
||||||
|
dest[needed] = pass + 1 < nrules ? L('\1') : L('\0'); |
||||||
|
++needed; |
||||||
|
} |
||||||
|
|
||||||
|
/* This is a little optimization: many collation specifications have
|
||||||
|
a `position' rule at the end and if no non-ignored character |
||||||
|
is found the last \1 byte is immediately followed by a \0 byte |
||||||
|
signalling this. We can avoid the \1 byte(s). */ |
||||||
|
if (needed > 2 && needed == last_needed + 1) { |
||||||
|
/* Remove the \1 byte. */ |
||||||
|
if (--needed <= n) |
||||||
|
dest[needed - 1] = L('\0'); |
||||||
|
} |
||||||
|
|
||||||
|
/* Return the number of bytes/words we need, but don't count the NUL
|
||||||
|
byte/word at the end. */ |
||||||
|
return needed - 1; |
||||||
|
} |
||||||
|
|
||||||
|
size_t glib_strxfrm(char* dest, |
||||||
|
const char* src, |
||||||
|
size_t n, |
||||||
|
locale_data_t* locale) { |
||||||
|
/* Handle byte comparison case. */ |
||||||
|
if (locale->nrules == 0) { |
||||||
|
size_t srclen = strlen(src); |
||||||
|
|
||||||
|
if (n != 0) { |
||||||
|
strncpy(dest, src, MIN(srclen + 1, n)); |
||||||
|
} |
||||||
|
|
||||||
|
return srclen; |
||||||
|
} |
||||||
|
|
||||||
|
/* Handle an empty string, code hereafter relies on strlen (src) > 0. */ |
||||||
|
if (*src == L('\0')) { |
||||||
|
if (n != 0) |
||||||
|
*dest = L('\0'); |
||||||
|
return 0; |
||||||
|
} |
||||||
|
|
||||||
|
/* We need the elements of the string as unsigned values since they
|
||||||
|
are used as indeces. */ |
||||||
|
const USTRING_TYPE* usrc = (const USTRING_TYPE*)src; |
||||||
|
|
||||||
|
/* Allocate cache for small strings on the stack and fill it with weight and
|
||||||
|
rule indices. If the cache size is not sufficient, continue with the |
||||||
|
uncached xfrm version. */ |
||||||
|
size_t idxmax = 0; |
||||||
|
const USTRING_TYPE* cur = usrc; |
||||||
|
int32_t* idxarr = alloca(SMALL_STR_SIZE * sizeof(int32_t)); |
||||||
|
unsigned char* rulearr = alloca(SMALL_STR_SIZE + 1); |
||||||
|
|
||||||
|
do { |
||||||
|
int32_t tmp = |
||||||
|
findidx(locale->table, locale->indirect, locale->extra, &cur, -1); |
||||||
|
rulearr[idxmax] = tmp >> 24; |
||||||
|
idxarr[idxmax] = tmp & 0xffffff; |
||||||
|
|
||||||
|
++idxmax; |
||||||
|
} while (*cur != L('\0') && idxmax < SMALL_STR_SIZE); |
||||||
|
|
||||||
|
/* This element is only read, the value never used but to determine
|
||||||
|
another value which then is ignored. */ |
||||||
|
rulearr[idxmax] = '\0'; |
||||||
|
|
||||||
|
/* Do the transformation. */ |
||||||
|
if (*cur == L('\0')) |
||||||
|
return do_xfrm_cached(dest, n, locale, idxmax, idxarr, rulearr); |
||||||
|
else |
||||||
|
return do_xfrm(usrc, dest, n, locale); |
||||||
|
} |
Loading…
Reference in new issue