parent
18d90051c9
commit
b58c081508
|
@ -0,0 +1,11 @@ |
||||
# Copyright 2023 jacqueline <me@jacqueline.id.au> |
||||
# |
||||
# SPDX-License-Identifier: GPL-3.0-only |
||||
|
||||
# Registers this directory as an ESP-IDF component. The sources are the C++
# collator wrapper and the C strxfrm implementation; "include" is exposed to
# dependants while "priv_include" is only visible to this component.
idf_component_register( |
||||
SRCS "collation.cpp" "strxfrm_l.c" |
||||
INCLUDE_DIRS "include" |
||||
PRIV_INCLUDE_DIRS "priv_include" |
||||
REQUIRES "span" "esp_partition" "spi_flash") |
||||
|
||||
# Apply the EXTRA_WARNINGS flags (defined outside this file) to this
# component's own sources only.
target_compile_options(${COMPONENT_LIB} PRIVATE ${EXTRA_WARNINGS}) |
@ -0,0 +1,92 @@ |
||||
/*
|
||||
* Copyright 2023 jacqueline <me@jacqueline.id.au> |
||||
* |
||||
* SPDX-License-Identifier: GPL-3.0-only |
||||
*/ |
||||
|
||||
#include "collation.hpp" |
||||
|
||||
#include <stdint.h> |
||||
#include <memory> |
||||
|
||||
#include "esp_flash_spi_init.h" |
||||
#include "esp_log.h" |
||||
#include "esp_partition.h" |
||||
#include "hal/spi_flash_types.h" |
||||
#include "spi_flash_mmap.h" |
||||
#include "strxfrm.h" |
||||
|
||||
namespace locale { |
||||
|
||||
static constexpr char kTag[] = "collate"; |
||||
|
||||
static constexpr esp_partition_type_t kLocalePartitionType = |
||||
static_cast<esp_partition_type_t>(0x40); |
||||
static constexpr esp_partition_subtype_t kLcCollateSubtype = |
||||
static_cast<esp_partition_subtype_t>(0x0); |
||||
|
||||
// Builds the collator used for sorting. Prefers the locale-aware
// GLibCollator; when that cannot be created (GLibCollator::create() returns
// null) a NoopCollator, which leaves strings untouched, is returned instead.
// The result is never null.
auto CreateCollator() -> std::unique_ptr<ICollator> {
  if (GLibCollator* glib = GLibCollator::create()) {
    return std::unique_ptr<ICollator>{glib};
  }
  return std::unique_ptr<ICollator>{new NoopCollator()};
}
||||
|
||||
// Locates the LC_COLLATE partition, memory-maps it and parses the locale
// data it contains.
//
// Returns a heap-allocated collator owned by the caller, or nullptr when:
//   - no LC_COLLATE partition exists,
//   - there are not enough free MMU data pages to map the partition,
//   - mapping fails, or
//   - the partition contents do not parse as locale data.
// On every failure path after a successful mmap the mapping is released.
auto GLibCollator::create() -> GLibCollator* {
  uint32_t data_pages = spi_flash_mmap_get_free_pages(SPI_FLASH_MMAP_DATA);
  ESP_LOGI(kTag, "free data pages: %lu (%lu KiB)", data_pages, data_pages * 64);

  const esp_partition_t* partition =
      esp_partition_find_first(kLocalePartitionType, kLcCollateSubtype, NULL);
  if (partition == NULL) {
    ESP_LOGW(kTag, "no LC_COLLATE partition found");
    // Without a partition there is nothing to map; bail out before the
    // partition pointer is dereferenced below.
    return nullptr;
  }

  ESP_LOGI(kTag, "found LC_COLLATE partition of size %lu", partition->size);
  // Each page is counted as 64 KiB. Do the multiplication in 64 bits so the
  // byte count cannot wrap.
  const uint64_t mappable_bytes = static_cast<uint64_t>(data_pages) * 64 * 1024;
  if (partition->size > mappable_bytes) {
    ESP_LOGE(kTag, "not enough free pages to map LC_COLLATE partition!");
    return nullptr;
  }

  const void* region = nullptr;
  esp_partition_mmap_handle_t handle;
  esp_err_t err = esp_partition_mmap(partition, 0, partition->size,
                                     ESP_PARTITION_MMAP_DATA, &region, &handle);
  if (err != ESP_OK) {
    ESP_LOGE(kTag, "LC_COLLATE mmap failed");
    return nullptr;
  }

  auto data = std::make_unique<locale_data_t>();
  if (!parse_locale_data(region, partition->size, data.get())) {
    ESP_LOGE(kTag, "parsing locale data failed");
    esp_partition_munmap(handle);
    return nullptr;
  }

  // The collator takes over the mapping; its destructor unmaps it.
  return new GLibCollator(handle, std::move(data));
}
||||
|
||||
GLibCollator::GLibCollator(const esp_partition_mmap_handle_t handle, |
||||
std::unique_ptr<locale_data_t> locale) |
||||
: handle_(handle), locale_data_(std::move(locale)) {} |
||||
|
||||
// Releases the partition mapping identified by the handle this collator was
// constructed with.
GLibCollator::~GLibCollator() {
  esp_partition_munmap(handle_);
}
||||
|
||||
auto GLibCollator::Transform(const std::string& in) -> std::string { |
||||
char dest[256]; |
||||
size_t size = glib_strxfrm(dest, in.c_str(), 256, locale_data_.get()); |
||||
if (size >= 256) { |
||||
char* larger_dest = new char[size + 1]{0}; |
||||
glib_strxfrm(larger_dest, in.c_str(), size, locale_data_.get()); |
||||
std::string out{larger_dest, size}; |
||||
delete[] larger_dest; |
||||
return out; |
||||
} |
||||
return {dest, size}; |
||||
} |
||||
|
||||
} // namespace locale
|
@ -0,0 +1,49 @@ |
||||
/*
|
||||
* Copyright 2023 jacqueline <me@jacqueline.id.au> |
||||
* |
||||
* SPDX-License-Identifier: GPL-3.0-only |
||||
*/ |
||||
|
||||
#pragma once |
||||
|
||||
#include <cstddef> |
||||
#include <memory> |
||||
#include <string> |
||||
|
||||
#include "esp_partition.h" |
||||
#include "span.hpp" |
||||
|
||||
#include "strxfrm.h" |
||||
|
||||
namespace locale { |
||||
|
||||
// Interface for objects that turn a string into a collation key.
class ICollator {
 public:
  virtual ~ICollator() = default;

  // Returns the transformed form of the given string.
  virtual auto Transform(const std::string&) -> std::string = 0;
};
||||
|
||||
class NoopCollator : public ICollator { |
||||
public: |
||||
auto Transform(const std::string& in) -> std::string override { return in; } |
||||
}; |
||||
|
||||
auto CreateCollator() -> std::unique_ptr<ICollator>; |
||||
|
||||
class GLibCollator : public ICollator { |
||||
public: |
||||
static auto create() -> GLibCollator*; |
||||
~GLibCollator(); |
||||
|
||||
auto Transform(const std::string& in) -> std::string override; |
||||
|
||||
private: |
||||
GLibCollator(const esp_partition_mmap_handle_t, |
||||
std::unique_ptr<locale_data_t>); |
||||
|
||||
const esp_partition_mmap_handle_t handle_; |
||||
std::unique_ptr<locale_data_t> locale_data_; |
||||
}; |
||||
|
||||
} // namespace locale
|
@ -0,0 +1,35 @@ |
||||
/*
|
||||
* Copyright 2023 jacqueline <me@jacqueline.id.au> |
||||
* |
||||
* SPDX-License-Identifier: GPL-3.0-only |
||||
*/ |
||||
|
||||
#pragma once |
||||
|
||||
#include <stddef.h> |
||||
#include <stdint.h> |
||||
#include <stdbool.h> |
||||
|
||||
#ifdef __cplusplus |
||||
extern "C" { |
||||
#endif |
||||
|
||||
typedef struct { |
||||
uint_fast32_t nrules; |
||||
unsigned char* rulesets; |
||||
unsigned char* weights; |
||||
int32_t* table; |
||||
unsigned char* extra; |
||||
int32_t* indirect; |
||||
} locale_data_t; |
||||
|
||||
/*
 * Checks that the `size` bytes at `raw_data` look like LC_COLLATE data and,
 * if so, fills `out` with pointers into that buffer. Returns true on success
 * and false (after logging the reason) otherwise. `raw_data` must stay valid
 * for as long as `out` is used, since `out` points into it.
 */
bool parse_locale_data(const void* raw_data, size_t size, locale_data_t* out); |
||||
|
||||
/*
 * Transforms the NUL-terminated string `src` into its collation key using
 * the tables in `locale`. At most `n` bytes, including the terminating NUL,
 * are stored in `dest`. Returns the length of the complete key excluding the
 * NUL; a return value >= n means the key did not fit and the contents of
 * `dest` are incomplete.
 */
size_t glib_strxfrm(char* dest, |
||||
const char* src, |
||||
size_t n, |
||||
locale_data_t* locale); |
||||
|
||||
#ifdef __cplusplus |
||||
} |
||||
#endif |
@ -0,0 +1,160 @@ |
||||
/* Copyright (C) 1996-2018 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library. |
||||
Written by Ulrich Drepper, <drepper@cygnus.com>. |
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or |
||||
modify it under the terms of the GNU Lesser General Public |
||||
License as published by the Free Software Foundation; either |
||||
version 2.1 of the License, or (at your option) any later version. |
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful, |
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
Lesser General Public License for more details. |
||||
|
||||
You should have received a copy of the GNU Lesser General Public |
||||
License along with the GNU C Library; if not, see |
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#pragma once |
||||
|
||||
#include <stdint.h> |
||||
#include <stddef.h> |
||||
|
||||
/* This alignment is used for 32-bit integers in locale files, both
|
||||
those that are explicitly int32_t or uint32_t and those that are |
||||
wchar_t, regardless of the (possibly smaller) alignment required |
||||
for such integers on a particular host. */ |
||||
#define LOCFILE_ALIGN sizeof (int32_t) |
||||
#define LOCFILE_ALIGN_MASK (LOCFILE_ALIGN - 1) |
||||
#define LOCFILE_ALIGN_UP(x) (((x) + LOCFILE_ALIGN - 1) \ |
||||
& ~LOCFILE_ALIGN_MASK) |
||||
#define LOCFILE_ALIGNED_P(x) (((x) & LOCFILE_ALIGN_MASK) == 0) |
||||
|
||||
/* Find index of weight. */ |
||||
/*
 * Looks up the weight index for the byte sequence starting at *cpp.
 *
 * table    - indexed by the first byte of the sequence; a non-negative entry
 *            is the result, a negative entry is the negated offset into
 *            `extra` of a list of candidate sequences.
 * indirect - indexed with (-i + offset) when the matching `extra` entry
 *            describes a range of sequences.
 * extra    - list entries: an int32_t index, one length byte `nhere`, then
 *            `nhere` bytes (single sequence) or 2 * `nhere` bytes (range
 *            start followed by range end), padded to LOCFILE_ALIGN.
 * cpp      - in/out cursor; advanced past the first byte and, on a match,
 *            past the `nhere` further bytes of the matched sequence.
 * len      - limit on how many input bytes may be compared; it is
 *            decremented once to account for the first byte.
 *
 * Returns the index found. The loop only exits through one of the returns
 * inside it; the final return is never reached.
 */
static inline int32_t __attribute__ ((always_inline)) |
||||
findidx (const int32_t *table, |
||||
const int32_t *indirect, |
||||
const unsigned char *extra, |
||||
const unsigned char **cpp, size_t len) |
||||
{ |
||||
int_fast32_t i = table[*(*cpp)++]; |
||||
const unsigned char *cp; |
||||
const unsigned char *usrc; |
||||
|
||||
if (i >= 0) |
||||
/* This is an index into the weight table. Cool. */ |
||||
return i; |
||||
|
||||
/* Oh well, more than one sequence starting with this byte.
|
||||
Search for the correct one. */ |
||||
cp = &extra[-i]; |
||||
usrc = *cpp; |
||||
--len; |
||||
while (1) |
||||
{ |
||||
size_t nhere; |
||||
|
||||
/* The first thing is the index. */ |
||||
i = *((const int32_t *) cp); |
||||
cp += sizeof (int32_t); |
||||
|
||||
/* Next is the length of the byte sequence. These are always
|
||||
short byte sequences so there is no reason to call any |
||||
function (even if they are inlined). */ |
||||
nhere = *cp++; |
||||
|
||||
if (i >= 0) |
||||
{ |
||||
/* It is a single character. If it matches we found our
|
||||
index. Note that at the end of each list there is an |
||||
entry of length zero which represents the single byte |
||||
sequence. The first (and here only) byte was tested |
||||
already. */ |
||||
size_t cnt; |
||||
|
||||
/* With GCC 5.3 when compiling with -Os the compiler warns
|
||||
that seq2.back_us, which becomes usrc, might be used |
||||
uninitialized. This can't be true because we pass a length |
||||
of -1 for len at the same time which means that this loop |
||||
never executes. */ |
||||
for (cnt = 0; cnt < nhere && cnt < len; ++cnt) |
||||
if (cp[cnt] != usrc[cnt]) |
||||
break; |
||||
|
||||
if (cnt == nhere) |
||||
{ |
||||
/* Found it. */ |
||||
*cpp += nhere; |
||||
return i; |
||||
} |
||||
|
||||
/* Up to the next entry. */ |
||||
cp += nhere; |
||||
if (!LOCFILE_ALIGNED_P (1 + nhere)) |
||||
cp += LOCFILE_ALIGN - (1 + nhere) % LOCFILE_ALIGN; |
||||
} |
||||
else |
||||
{ |
||||
/* This is a range of characters. First decide whether the
|
||||
current byte sequence lies in the range. */ |
||||
size_t cnt; |
||||
size_t offset = 0; |
||||
|
||||
for (cnt = 0; cnt < nhere && cnt < len; ++cnt) |
||||
if (cp[cnt] != usrc[cnt]) |
||||
break; |
||||
|
||||
if (cnt != nhere) |
||||
{ |
||||
if (cnt == len || cp[cnt] > usrc[cnt]) |
||||
{ |
||||
/* Cannot be in this range. */ |
||||
cp += 2 * nhere; |
||||
if (!LOCFILE_ALIGNED_P (1 + 2 * nhere)) |
||||
cp += (LOCFILE_ALIGN |
||||
- (1 + 2 * nhere) % LOCFILE_ALIGN); |
||||
continue; |
||||
} |
||||
|
||||
/* Test against the end of the range. */ |
||||
for (cnt = 0; cnt < nhere; ++cnt) |
||||
if (cp[nhere + cnt] != usrc[cnt]) |
||||
break; |
||||
|
||||
if (cnt != nhere && cp[nhere + cnt] < usrc[cnt]) |
||||
{ |
||||
/* Cannot be in this range. */ |
||||
cp += 2 * nhere; |
||||
if (!LOCFILE_ALIGNED_P (1 + 2 * nhere)) |
||||
cp += (LOCFILE_ALIGN |
||||
- (1 + 2 * nhere) % LOCFILE_ALIGN); |
||||
continue; |
||||
} |
||||
|
||||
/* This range matches the next characters. Now find
|
||||
the offset in the indirect table. */ |
||||
for (cnt = 0; cp[cnt] == usrc[cnt]; ++cnt); |
||||
|
||||
do |
||||
{ |
||||
offset <<= 8; |
||||
/* With GCC 7 when compiling with -Os the compiler
|
||||
warns that seq1.back_us and seq2.back_us, which |
||||
become usrc, might be used uninitialized. This |
||||
is impossible for the same reason as described |
||||
above. */ |
||||
offset += usrc[cnt] - cp[cnt]; |
||||
} |
||||
while (++cnt < nhere); |
||||
} |
||||
|
||||
*cpp += nhere; |
||||
return indirect[-i + offset]; |
||||
} |
||||
} |
||||
|
||||
/* NOTREACHED */ |
||||
return 0x43219876; |
||||
} |
||||
|
@ -0,0 +1,641 @@ |
||||
/* Copyright (C) 1995-2018 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library. |
||||
Written by Ulrich Drepper <drepper@gnu.org>, 1995. |
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or |
||||
modify it under the terms of the GNU Lesser General Public |
||||
License as published by the Free Software Foundation; either |
||||
version 2.1 of the License, or (at your option) any later version. |
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful, |
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of |
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||||
Lesser General Public License for more details. |
||||
|
||||
You should have received a copy of the GNU Lesser General Public |
||||
License along with the GNU C Library; if not, see |
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "strxfrm.h" |
||||
|
||||
#include <assert.h> |
||||
#include <stdbool.h> |
||||
#include <stddef.h> |
||||
#include <stdint.h> |
||||
#include <stdlib.h> |
||||
#include <string.h> |
||||
#include <sys/param.h> |
||||
|
||||
#include "esp_log.h" |
||||
|
||||
#include "weight.h" |
||||
|
||||
static const char kTag[] = "strxfrm"; |
||||
|
||||
#ifndef STRING_TYPE |
||||
#define STRING_TYPE char |
||||
#define USTRING_TYPE unsigned char |
||||
#define STRLEN strlen |
||||
#define STPNCPY stpncpy |
||||
#define L(arg) arg |
||||
#endif |
||||
|
||||
#define CONCAT(a, b) CONCAT1(a, b) |
||||
#define CONCAT1(a, b) a##b |
||||
|
||||
/* Maximum string size that is calculated with cached indices. Right now this
|
||||
is an arbitrary value open to optimizations. SMALL_STR_SIZE * 4 has to be |
||||
lower than __MAX_ALLOCA_CUTOFF. Keep localedata/xfrm-test.c in sync. */ |
||||
#define SMALL_STR_SIZE 4095 |
||||
|
||||
/* The three kinds of collation sorting rule (forward, backward, position)
   are bit flags; the remaining enumerators name combinations of them. The
   values are tested against the rule bytes stored in the locale data. */
enum coll_sort_rule {
  illegal_0__ = 0,
  sort_forward = 1,
  sort_backward = 2,
  illegal_3__ = 3,
  sort_position = 4,
  sort_forward_position = 5,
  sort_backward_position = 6,
  sort_mask = 7
};
||||
|
||||
/* Positions of the individual elements inside the LC_COLLATE data. The
   values index the header's offset table, so their order must not change. */
enum collate_element {
  COLLATE_NRULES = 0,
  COLLATE_RULESETS = 1,
  COLLATE_TABLEMB = 2,
  COLLATE_WEIGHTMB = 3,
  COLLATE_EXTRAMB = 4,
  COLLATE_INDIRECTMB = 5,
  COLLATE_GAP1 = 6,
  COLLATE_GAP2 = 7,
  COLLATE_GAP3 = 8,
  COLLATE_TABLEWC = 9,
  COLLATE_WEIGHTWC = 10,
  COLLATE_EXTRAWC = 11,
  COLLATE_INDIRECTWC = 12,
  COLLATE_SYMB_HASH_SIZEMB = 13,
  COLLATE_SYMB_TABLEMB = 14,
  COLLATE_SYMB_EXTRAMB = 15,
  COLLATE_COLLSEQMB = 16,
  COLLATE_COLLSEQWC = 17,
  COLLATE_CODESET = 18,

  /* Not a real element; used to know how many elements there are. */
  COLLATE_LAST = 19
};
typedef enum collate_element collate_element_t;
||||
|
||||
/*
 * Validates a raw LC_COLLATE image and fills `out` with pointers into it.
 *
 * raw_data - start of the image; `out` keeps pointers into this buffer, so
 *            it must outlive `out`.
 * size     - number of readable bytes at `raw_data`.
 * out      - receives nrules and the ruleset/table/weight/extra/indirect
 *            pointers. Only written once every check has passed.
 *
 * Returns true on success. Returns false, after logging the reason, when the
 * image is too small, has the wrong magic number or element count, or holds
 * an element offset that lies outside the image.
 */
bool parse_locale_data(const void* raw_data, size_t size, locale_data_t* out) {
  const struct {
    unsigned int magic;
    unsigned int nstrings;
    unsigned int strindex[0];
  }* const header = raw_data;

  /* The fixed part of the header must be present before any of it is read. */
  if (size < sizeof(*header)) {
    ESP_LOGE(kTag, "file was too small to contain header");
    return false;
  }

  if (header->magic != 0x20051017) {
    ESP_LOGE(kTag, "file magic incorrect (was %x)", header->magic);
    return false;
  }

  /* Validate the element count before it takes part in any size
     arithmetic, so an arbitrary value cannot make that arithmetic wrap. */
  if (header->nstrings != COLLATE_LAST) {
    ESP_LOGE(kTag, "file has incorrect number of elements (was %u, wanted %u)",
             header->nstrings, COLLATE_LAST);
    return false;
  }

  if (sizeof(*header) + COLLATE_LAST * sizeof(unsigned int) >= size) {
    ESP_LOGE(kTag, "file was too small to contain header");
    return false;
  }

  // The LC_COLLATE partition appears to contain data in the correct shape.
  // Pull out pointers to the various attributes it contains.
  const unsigned char* const base = raw_data;
  const void* offsets[COLLATE_LAST];
  for (size_t i = 0; i < COLLATE_LAST; i++) {
    const unsigned int offset = header->strindex[i];
    if (offset > size) {
      ESP_LOGE(kTag, "element offset (%u) exceeds file size", offset);
      return false;
    }
    offsets[i] = base + offset;
  }

  /* nrules is read from the image right here, so the whole integer has to
     lie inside it. */
  const unsigned int nrules_offset = header->strindex[COLLATE_NRULES];
  if (size - nrules_offset < sizeof(unsigned int)) {
    ESP_LOGE(kTag, "element offset (%u) exceeds file size", nrules_offset);
    return false;
  }

  // Now parse those pointers into the output struct.
  /* memcpy rather than a cast-and-dereference: nothing checked above
     guarantees that this offset is suitably aligned. */
  unsigned int nrules;
  memcpy(&nrules, offsets[COLLATE_NRULES], sizeof(nrules));
  out->nrules = nrules;

  out->rulesets = (unsigned char*)offsets[COLLATE_RULESETS];
  out->table = (int32_t*)offsets[COLLATE_TABLEMB];
  out->weights = (unsigned char*)offsets[COLLATE_WEIGHTMB];
  out->extra = (unsigned char*)offsets[COLLATE_EXTRAMB];
  out->indirect = (int32_t*)offsets[COLLATE_INDIRECTMB];

  assert(((uintptr_t)out->table) % __alignof__(out->table[0]) == 0);
  assert(((uintptr_t)out->weights) % __alignof__(out->weights[0]) == 0);
  assert(((uintptr_t)out->extra) % __alignof__(out->extra[0]) == 0);
  assert(((uintptr_t)out->indirect) % __alignof__(out->indirect[0]) == 0);

  return true;
}
||||
|
||||
/* Encode VAL as a UTF-8 byte sequence in BUF and return the number of bytes
   written. Values below 0x80 are stored as a single byte; larger values use
   between two and six bytes. BUF is not NUL-terminated. */
static int utf8_encode(char* buf, int val) {
  if (val < 0x80) {
    buf[0] = (char)val;
    return 1;
  }

  /* Find the shortest sequence length whose payload bits can hold VAL. */
  int nbytes = 2;
  while (nbytes < 6 && (val & (~(uint32_t)0 << (5 * nbytes + 1))) != 0)
    ++nbytes;

  /* The lead byte starts out as the length marker: NBYTES high bits set. */
  buf[0] = (unsigned char)(~0xff >> nbytes);

  /* Fill the continuation bytes from the last one backwards, six payload
     bits each. */
  for (int i = nbytes - 1; i > 0; --i) {
    buf[i] = 0x80 | (val & 0x3f);
    val >>= 6;
  }

  /* Whatever is left of VAL belongs in the lead byte. */
  buf[0] |= val;
  return nbytes;
}
||||
|
||||
/* Find next weight and rule index. Inlined since called for every char. */ |
||||
static __always_inline size_t find_idx(const USTRING_TYPE** us, |
||||
int32_t* weight_idx, |
||||
unsigned char* rule_idx, |
||||
const locale_data_t* l_data, |
||||
const int pass) { |
||||
int32_t tmp = findidx(l_data->table, l_data->indirect, l_data->extra, us, -1); |
||||
*rule_idx = tmp >> 24; |
||||
int32_t idx = tmp & 0xffffff; |
||||
size_t len = l_data->weights[idx++]; |
||||
|
||||
/* Skip over indices of previous levels. */ |
||||
for (int i = 0; i < pass; i++) { |
||||
idx += len; |
||||
len = l_data->weights[idx++]; |
||||
} |
||||
|
||||
*weight_idx = idx; |
||||
return len; |
||||
} |
||||
|
||||
static int find_position(const USTRING_TYPE* us, |
||||
const locale_data_t* l_data, |
||||
const int pass) { |
||||
int32_t weight_idx; |
||||
unsigned char rule_idx; |
||||
const USTRING_TYPE* usrc = us; |
||||
|
||||
find_idx(&usrc, &weight_idx, &rule_idx, l_data, pass); |
||||
return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position; |
||||
} |
||||
|
||||
/* Do the transformation. */ |
||||
/*
 * Transformation without an index cache: every pass walks USRC again and
 * looks each element up with find_idx().
 *
 * usrc   - NUL-terminated input, as unsigned bytes.
 * dest   - output buffer; bytes are stored only while they fit below N.
 * n      - capacity of DEST in bytes.
 * l_data - collation tables; l_data->nrules passes are made.
 *
 * For each pass the weights of forward-sorted elements are emitted in input
 * order, while runs of non-forward elements are remembered and emitted in
 * reverse. In a `position' pass every non-empty weight is prefixed with a
 * UTF-8 encoded counter (see utf8_encode). Passes are separated by a \1
 * byte and the result ends with \0.
 *
 * Returns the number of bytes the full result needs, not counting the
 * terminating NUL; this is computed even when DEST is too small.
 */
static size_t do_xfrm(const USTRING_TYPE* usrc, |
||||
STRING_TYPE* dest, |
||||
size_t n, |
||||
const locale_data_t* l_data) { |
||||
int32_t weight_idx; |
||||
unsigned char rule_idx; |
||||
uint_fast32_t pass; |
||||
size_t needed = 0; |
||||
size_t last_needed; |
||||
|
||||
/* Now the passes over the weights. */ |
||||
for (pass = 0; pass < l_data->nrules; ++pass) { |
||||
size_t backw_len = 0; |
||||
last_needed = needed; |
||||
const USTRING_TYPE* cur = usrc; |
||||
const USTRING_TYPE* backw_start = NULL; |
||||
|
||||
/* We assume that if a rule has defined `position' in one section
|
||||
this is true for all of them. */ |
||||
int position = find_position(cur, l_data, pass); |
||||
|
||||
if (position == 0) { |
||||
while (*cur != L('\0')) { |
||||
const USTRING_TYPE* pos = cur; |
||||
size_t len = find_idx(&cur, &weight_idx, &rule_idx, l_data, pass); |
||||
int rule = l_data->rulesets[rule_idx * l_data->nrules + pass]; |
||||
|
||||
if ((rule & sort_forward) != 0) { |
||||
/* Handle the pushed backward sequence. */ |
||||
if (backw_start != NULL) { |
||||
for (size_t i = backw_len; i > 0;) { |
||||
int32_t weight_idx; |
||||
unsigned char rule_idx; |
||||
size_t len = |
||||
find_idx(&backw_start, &weight_idx, &rule_idx, l_data, pass); |
||||
if (needed + i < n) |
||||
for (size_t j = len; j > 0; j--) |
||||
dest[needed + i - j] = l_data->weights[weight_idx++]; |
||||
|
||||
i -= len; |
||||
} |
||||
|
||||
needed += backw_len; |
||||
backw_start = NULL; |
||||
backw_len = 0; |
||||
} |
||||
|
||||
/* Now handle the forward element. */ |
||||
if (needed + len < n) |
||||
while (len-- > 0) |
||||
dest[needed++] = l_data->weights[weight_idx++]; |
||||
else |
||||
/* No more characters fit into the buffer. */ |
||||
needed += len; |
||||
} else { |
||||
/* Remember start of the backward sequence & track length. */ |
||||
if (backw_start == NULL) |
||||
backw_start = pos; |
||||
backw_len += len; |
||||
} |
||||
} |
||||
|
||||
/* Handle the pushed backward sequence. */ |
||||
if (backw_start != NULL) { |
||||
for (size_t i = backw_len; i > 0;) { |
||||
size_t len = |
||||
find_idx(&backw_start, &weight_idx, &rule_idx, l_data, pass); |
||||
if (needed + i < n) |
||||
for (size_t j = len; j > 0; j--) |
||||
dest[needed + i - j] = l_data->weights[weight_idx++]; |
||||
|
||||
i -= len; |
||||
} |
||||
|
||||
needed += backw_len; |
||||
} |
||||
} else { |
||||
int val = 1; |
||||
char buf[7]; |
||||
size_t buflen; |
||||
size_t i; |
||||
|
||||
while (*cur != L('\0')) { |
||||
const USTRING_TYPE* pos = cur; |
||||
size_t len = find_idx(&cur, &weight_idx, &rule_idx, l_data, pass); |
||||
int rule = l_data->rulesets[rule_idx * l_data->nrules + pass]; |
||||
|
||||
if ((rule & sort_forward) != 0) { |
||||
/* Handle the pushed backward sequence. */ |
||||
if (backw_start != NULL) { |
||||
for (size_t p = backw_len; p > 0; p--) { |
||||
size_t len; |
||||
int32_t weight_idx; |
||||
unsigned char rule_idx; |
||||
const USTRING_TYPE* backw_cur = backw_start; |
||||
|
||||
/* To prevent a warning init the used vars. */ |
||||
len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass); |
||||
|
||||
for (i = 1; i < p; i++) |
||||
len = |
||||
find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass); |
||||
|
||||
if (len != 0) { |
||||
buflen = utf8_encode(buf, val); |
||||
if (needed + buflen + len < n) { |
||||
for (i = 0; i < buflen; ++i) |
||||
dest[needed + i] = buf[i]; |
||||
for (i = 0; i < len; ++i) |
||||
dest[needed + buflen + i] = l_data->weights[weight_idx + i]; |
||||
} |
||||
needed += buflen + len; |
||||
val = 1; |
||||
} else |
||||
++val; |
||||
} |
||||
|
||||
backw_start = NULL; |
||||
backw_len = 0; |
||||
} |
||||
|
||||
/* Now handle the forward element. */ |
||||
if (len != 0) { |
||||
buflen = utf8_encode(buf, val); |
||||
if (needed + buflen + len < n) { |
||||
for (i = 0; i < buflen; ++i) |
||||
dest[needed + i] = buf[i]; |
||||
for (i = 0; i < len; ++i) |
||||
dest[needed + buflen + i] = l_data->weights[weight_idx + i]; |
||||
} |
||||
needed += buflen + len; |
||||
val = 1; |
||||
} else |
||||
++val; |
||||
} else { |
||||
/* Remember start of the backward sequence & track length. */ |
||||
if (backw_start == NULL) |
||||
backw_start = pos; |
||||
backw_len++; |
||||
} |
||||
} |
||||
|
||||
/* Handle the pushed backward sequence. */ |
||||
if (backw_start != NULL) { |
||||
for (size_t p = backw_len; p > 0; p--) { |
||||
size_t len; |
||||
int32_t weight_idx; |
||||
unsigned char rule_idx; |
||||
const USTRING_TYPE* backw_cur = backw_start; |
||||
|
||||
/* To prevent a warning init the used vars. */ |
||||
len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass); |
||||
|
||||
for (i = 1; i < p; i++) |
||||
len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass); |
||||
|
||||
if (len != 0) { |
||||
buflen = utf8_encode(buf, val); |
||||
if (needed + buflen + len < n) { |
||||
for (i = 0; i < buflen; ++i) |
||||
dest[needed + i] = buf[i]; |
||||
for (i = 0; i < len; ++i) |
||||
dest[needed + buflen + i] = l_data->weights[weight_idx + i]; |
||||
} |
||||
needed += buflen + len; |
||||
val = 1; |
||||
} else |
||||
++val; |
||||
} |
||||
} |
||||
} |
||||
|
||||
/* Finally store the byte to separate the passes or terminate
|
||||
the string. */ |
||||
if (needed < n) |
||||
dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0'); |
||||
++needed; |
||||
} |
||||
|
||||
/* This is a little optimization: many collation specifications have
|
||||
a `position' rule at the end and if no non-ignored character |
||||
is found the last \1 byte is immediately followed by a \0 byte |
||||
signalling this. We can avoid the \1 byte(s). */ |
||||
if (needed > 2 && needed == last_needed + 1) { |
||||
/* Remove the \1 byte. */ |
||||
if (--needed <= n) |
||||
dest[needed - 1] = L('\0'); |
||||
} |
||||
|
||||
/* Return the number of bytes/words we need, but don't count the NUL
|
||||
byte/word at the end. */ |
||||
return needed - 1; |
||||
} |
||||
|
||||
/* Do the transformation using weight-index and rule cache. */ |
||||
/*
 * Transformation driven by a pre-computed cache of IDXMAX elements instead
 * of repeated table lookups.
 *
 * dest    - output buffer; bytes are stored only while they fit below N.
 * n       - capacity of DEST in bytes.
 * l_data  - collation tables; l_data->nrules passes are made.
 * idxmax  - number of cached elements.
 * idxarr  - per-element position in the weights table. The entries are
 *           modified here: each one is advanced past the weights consumed by
 *           a pass, so that the next pass continues from there.
 * rulearr - per-element rule index. Entry [idxcnt + 1] is read after each
 *           element, so the array needs IDXMAX + 1 readable entries.
 *
 * Returns the number of bytes the full result needs, not counting the
 * terminating NUL; this is computed even when DEST is too small.
 */
static size_t do_xfrm_cached(STRING_TYPE* dest, |
||||
size_t n, |
||||
const locale_data_t* l_data, |
||||
size_t idxmax, |
||||
int32_t* idxarr, |
||||
const unsigned char* rulearr) { |
||||
uint_fast32_t nrules = l_data->nrules; |
||||
unsigned char* rulesets = l_data->rulesets; |
||||
USTRING_TYPE* weights = l_data->weights; |
||||
uint_fast32_t pass; |
||||
size_t needed = 0; |
||||
size_t last_needed; |
||||
size_t idxcnt; |
||||
|
||||
/* Now the passes over the weights. */ |
||||
for (pass = 0; pass < nrules; ++pass) { |
||||
size_t backw_stop = ~0ul; |
||||
int rule = rulesets[rulearr[0] * nrules + pass]; |
||||
/* We assume that if a rule has defined `position' in one section
|
||||
this is true for all of them. */ |
||||
int position = rule & sort_position; |
||||
|
||||
last_needed = needed; |
||||
if (position == 0) { |
||||
for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) { |
||||
if ((rule & sort_forward) != 0) { |
||||
size_t len; |
||||
|
||||
if (backw_stop != ~0ul) { |
||||
/* Handle the pushed elements now. */ |
||||
size_t backw; |
||||
|
||||
for (backw = idxcnt; backw > backw_stop;) { |
||||
--backw; |
||||
len = weights[idxarr[backw]++]; |
||||
|
||||
if (needed + len < n) |
||||
while (len-- > 0) |
||||
dest[needed++] = weights[idxarr[backw]++]; |
||||
else { |
||||
/* No more characters fit into the buffer. */ |
||||
needed += len; |
||||
idxarr[backw] += len; |
||||
} |
||||
} |
||||
|
||||
backw_stop = ~0ul; |
||||
} |
||||
|
||||
/* Now handle the forward element. */ |
||||
len = weights[idxarr[idxcnt]++]; |
||||
if (needed + len < n) |
||||
while (len-- > 0) |
||||
dest[needed++] = weights[idxarr[idxcnt]++]; |
||||
else { |
||||
/* No more characters fit into the buffer. */ |
||||
needed += len; |
||||
idxarr[idxcnt] += len; |
||||
} |
||||
} else { |
||||
/* Remember where the backwards series started. */ |
||||
if (backw_stop == ~0ul) |
||||
backw_stop = idxcnt; |
||||
} |
||||
|
||||
rule = rulesets[rulearr[idxcnt + 1] * nrules + pass]; |
||||
} |
||||
|
||||
if (backw_stop != ~0ul) { |
||||
/* Handle the pushed elements now. */ |
||||
size_t backw; |
||||
|
||||
backw = idxcnt; |
||||
while (backw > backw_stop) { |
||||
size_t len = weights[idxarr[--backw]++]; |
||||
|
||||
if (needed + len < n) |
||||
while (len-- > 0) |
||||
dest[needed++] = weights[idxarr[backw]++]; |
||||
else { |
||||
/* No more characters fit into the buffer. */ |
||||
needed += len; |
||||
idxarr[backw] += len; |
||||
} |
||||
} |
||||
} |
||||
} else { |
||||
int val = 1; |
||||
char buf[7]; |
||||
size_t buflen; |
||||
size_t i; |
||||
|
||||
for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) { |
||||
if ((rule & sort_forward) != 0) { |
||||
size_t len; |
||||
|
||||
if (backw_stop != ~0ul) { |
||||
/* Handle the pushed elements now. */ |
||||
size_t backw; |
||||
|
||||
for (backw = idxcnt; backw > backw_stop;) { |
||||
--backw; |
||||
len = weights[idxarr[backw]++]; |
||||
if (len != 0) { |
||||
buflen = utf8_encode(buf, val); |
||||
if (needed + buflen + len < n) { |
||||
for (i = 0; i < buflen; ++i) |
||||
dest[needed + i] = buf[i]; |
||||
for (i = 0; i < len; ++i) |
||||
dest[needed + buflen + i] = weights[idxarr[backw] + i]; |
||||
} |
||||
needed += buflen + len; |
||||
idxarr[backw] += len; |
||||
val = 1; |
||||
} else |
||||
++val; |
||||
} |
||||
|
||||
backw_stop = ~0ul; |
||||
} |
||||
|
||||
/* Now handle the forward element. */ |
||||
len = weights[idxarr[idxcnt]++]; |
||||
if (len != 0) { |
||||
buflen = utf8_encode(buf, val); |
||||
if (needed + buflen + len < n) { |
||||
for (i = 0; i < buflen; ++i) |
||||
dest[needed + i] = buf[i]; |
||||
for (i = 0; i < len; ++i) |
||||
dest[needed + buflen + i] = weights[idxarr[idxcnt] + i]; |
||||
} |
||||
needed += buflen + len; |
||||
idxarr[idxcnt] += len; |
||||
val = 1; |
||||
} else |
||||
/* Note that we don't have to increment `idxarr[idxcnt]'
|
||||
since the length is zero. */ |
||||
++val; |
||||
} else { |
||||
/* Remember where the backwards series started. */ |
||||
if (backw_stop == ~0ul) |
||||
backw_stop = idxcnt; |
||||
} |
||||
|
||||
rule = rulesets[rulearr[idxcnt + 1] * nrules + pass]; |
||||
} |
||||
|
||||
if (backw_stop != ~0ul) { |
||||
/* Handle the pushed elements now. */ |
||||
size_t backw; |
||||
|
||||
backw = idxmax - 1; |
||||
while (backw > backw_stop) { |
||||
size_t len = weights[idxarr[--backw]++]; |
||||
if (len != 0) { |
||||
buflen = utf8_encode(buf, val); |
||||
if (needed + buflen + len < n) { |
||||
for (i = 0; i < buflen; ++i) |
||||
dest[needed + i] = buf[i]; |
||||
for (i = 0; i < len; ++i) |
||||
dest[needed + buflen + i] = weights[idxarr[backw] + i]; |
||||
} |
||||
needed += buflen + len; |
||||
idxarr[backw] += len; |
||||
val = 1; |
||||
} else |
||||
++val; |
||||
} |
||||
} |
||||
} |
||||
|
||||
/* Finally store the byte to separate the passes or terminate
|
||||
the string. */ |
||||
if (needed < n) |
||||
dest[needed] = pass + 1 < nrules ? L('\1') : L('\0'); |
||||
++needed; |
||||
} |
||||
|
||||
/* This is a little optimization: many collation specifications have
|
||||
a `position' rule at the end and if no non-ignored character |
||||
is found the last \1 byte is immediately followed by a \0 byte |
||||
signalling this. We can avoid the \1 byte(s). */ |
||||
if (needed > 2 && needed == last_needed + 1) { |
||||
/* Remove the \1 byte. */ |
||||
if (--needed <= n) |
||||
dest[needed - 1] = L('\0'); |
||||
} |
||||
|
||||
/* Return the number of bytes/words we need, but don't count the NUL
|
||||
byte/word at the end. */ |
||||
return needed - 1; |
||||
} |
||||
|
||||
size_t glib_strxfrm(char* dest, |
||||
const char* src, |
||||
size_t n, |
||||
locale_data_t* locale) { |
||||
/* Handle byte comparison case. */ |
||||
if (locale->nrules == 0) { |
||||
size_t srclen = strlen(src); |
||||
|
||||
if (n != 0) { |
||||
strncpy(dest, src, MIN(srclen + 1, n)); |
||||
} |
||||
|
||||
return srclen; |
||||
} |
||||
|
||||
/* Handle an empty string, code hereafter relies on strlen (src) > 0. */ |
||||
if (*src == L('\0')) { |
||||
if (n != 0) |
||||
*dest = L('\0'); |
||||
return 0; |
||||
} |
||||
|
||||
/* We need the elements of the string as unsigned values since they
|
||||
are used as indeces. */ |
||||
const USTRING_TYPE* usrc = (const USTRING_TYPE*)src; |
||||
|
||||
/* Allocate cache for small strings on the stack and fill it with weight and
|
||||
rule indices. If the cache size is not sufficient, continue with the |
||||
uncached xfrm version. */ |
||||
size_t idxmax = 0; |
||||
const USTRING_TYPE* cur = usrc; |
||||
int32_t* idxarr = alloca(SMALL_STR_SIZE * sizeof(int32_t)); |
||||
unsigned char* rulearr = alloca(SMALL_STR_SIZE + 1); |
||||
|
||||
do { |
||||
int32_t tmp = |
||||
findidx(locale->table, locale->indirect, locale->extra, &cur, -1); |
||||
rulearr[idxmax] = tmp >> 24; |
||||
idxarr[idxmax] = tmp & 0xffffff; |
||||
|
||||
++idxmax; |
||||
} while (*cur != L('\0') && idxmax < SMALL_STR_SIZE); |
||||
|
||||
/* This element is only read, the value never used but to determine
|
||||
another value which then is ignored. */ |
||||
rulearr[idxmax] = '\0'; |
||||
|
||||
/* Do the transformation. */ |
||||
if (*cur == L('\0')) |
||||
return do_xfrm_cached(dest, n, locale, idxmax, idxarr, rulearr); |
||||
else |
||||
return do_xfrm(usrc, dest, n, locale); |
||||
} |
Loading…
Reference in new issue