/* Copyright (C) 1995-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Written by Ulrich Drepper, 1995.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include "strxfrm.h"

/* NOTE(review): the names of the seven system headers were lost from the
   copy of this file under review.  The list below is reconstructed from the
   identifiers the file uses (alloca, assert, bool, size_t, int32_t/SIZE_MAX,
   strlen/memcpy, MIN) -- confirm against the build.  */
#include <alloca.h>
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <sys/param.h>

#include "esp_log.h"
#include "weight.h"

static const char kTag[] = "strxfrm";

#ifndef STRING_TYPE
#define STRING_TYPE char
#define USTRING_TYPE unsigned char
#define STRLEN strlen
#define STPNCPY stpncpy
#define L(arg) arg
#endif

#define CONCAT(a, b) CONCAT1(a, b)
#define CONCAT1(a, b) a##b

/* Maximum string size that is calculated with cached indices.  Right now this
   is an arbitrary value open to optimizations.  SMALL_STR_SIZE * 4 has to be
   lower than __MAX_ALLOCA_CUTOFF.  Keep localedata/xfrm-test.c in sync.  */
#define SMALL_STR_SIZE 4095

/* We know three kinds of collation sorting rules (forward, backward and
   position).  The enumerators are bit flags: forward = 1, backward = 2,
   position = 4; the remaining names are their combinations.  */
enum coll_sort_rule {
  illegal_0__,
  sort_forward,
  sort_backward,
  illegal_3__,
  sort_position,
  sort_forward_position,
  sort_backward_position,
  sort_mask
};

/* Index of each element in the string-index table that follows the header of
   the raw collation data (see parse_locale_data).  */
enum collate_element {
  COLLATE_NRULES = 0,
  COLLATE_RULESETS,
  COLLATE_TABLEMB,
  COLLATE_WEIGHTMB,
  COLLATE_EXTRAMB,
  COLLATE_INDIRECTMB,
  COLLATE_GAP1,
  COLLATE_GAP2,
  COLLATE_GAP3,
  COLLATE_TABLEWC,
  COLLATE_WEIGHTWC,
  COLLATE_EXTRAWC,
  COLLATE_INDIRECTWC,
  COLLATE_SYMB_HASH_SIZEMB,
  COLLATE_SYMB_TABLEMB,
  COLLATE_SYMB_EXTRAMB,
  COLLATE_COLLSEQMB,
  COLLATE_COLLSEQWC,
  COLLATE_CODESET,
  // Not a real element; used to know how many elements there are.
  COLLATE_LAST
};
typedef enum collate_element collate_element_t;

/* Validate a raw collation blob and fill OUT with pointers into it.

   RAW_DATA points to SIZE bytes laid out as: a magic word, an element count,
   then one offset (relative to RAW_DATA) per element.  OUT receives the rule
   count plus pointers to the rulesets, table, weights, extra and indirect
   elements; those pointers alias RAW_DATA, so the blob must outlive OUT.

   Returns true on success.  Returns false (after logging the reason) when the
   magic is wrong, the element count is not COLLATE_LAST, or the blob is too
   small for its header or for an element that is read here.  */
bool parse_locale_data(const void* raw_data, size_t size, locale_data_t* out) {
  const struct {
    unsigned int magic;
    unsigned int nstrings;
    unsigned int strindex[];
  }* const header = raw_data;

  // Make sure the fixed part of the header is there before reading it.
  if (size < sizeof(*header)) {
    ESP_LOGE(kTag, "file was too small to contain header");
    return false;
  }
  if (header->magic != 0x20051017) {
    ESP_LOGE(kTag, "file magic incorrect (was %x)", header->magic);
    return false;
  }
  // Check the count before using it in arithmetic, so that a bogus value
  // cannot make the size computation below wrap around.
  if (header->nstrings != COLLATE_LAST) {
    ESP_LOGE(kTag, "file has incorrect number of elements (was %u, wanted %u)",
             header->nstrings, (unsigned int)COLLATE_LAST);
    return false;
  }
  if (sizeof(*header) + COLLATE_LAST * sizeof(unsigned int) >= size) {
    ESP_LOGE(kTag, "file was too small to contain header");
    return false;
  }

  // The LC_COLLATE partition appears to contain data in the correct shape.
  // Pull out pointers to the various attributes it contains.
  const unsigned char* const base = raw_data;
  const void* offsets[COLLATE_LAST];
  for (size_t i = 0; i < COLLATE_LAST; i++) {
    size_t offset = header->strindex[i];
    // offset == size is tolerated: it is what an empty trailing element
    // would look like, and nothing is read through such a pointer here.
    if (offset > size) {
      ESP_LOGE(kTag, "element offset (%u) exceeds file size",
               (unsigned int)offset);
      return false;
    }
    offsets[i] = base + offset;
  }

  // The rule count is the one element dereferenced here, so it must lie
  // completely inside the blob.
  unsigned int nrules;
  if (size - header->strindex[COLLATE_NRULES] < sizeof(nrules)) {
    ESP_LOGE(kTag, "nrules element is truncated");
    return false;
  }
  // memcpy rather than a pointer cast: the offset is not known to be aligned.
  memcpy(&nrules, offsets[COLLATE_NRULES], sizeof(nrules));

  // Now parse those pointers into the output struct.
  out->nrules = nrules;
  out->rulesets = (unsigned char*)offsets[COLLATE_RULESETS];
  out->table = (int32_t*)offsets[COLLATE_TABLEMB];
  out->weights = (unsigned char*)offsets[COLLATE_WEIGHTMB];
  out->extra = (unsigned char*)offsets[COLLATE_EXTRAMB];
  out->indirect = (int32_t*)offsets[COLLATE_INDIRECTMB];

  assert(((uintptr_t)out->table) % __alignof__(out->table[0]) == 0);
  assert(((uintptr_t)out->weights) % __alignof__(out->weights[0]) == 0);
  assert(((uintptr_t)out->extra) % __alignof__(out->extra[0]) == 0);
  assert(((uintptr_t)out->indirect) % __alignof__(out->indirect[0]) == 0);
  return true;
}

/* We need UTF-8 encoding of numbers.

   Writes VAL to BUF as a multi-byte sequence and returns the number of bytes
   written (1 for values below 0x80, otherwise 2 to 6).  BUF must have room
   for 6 bytes.  No terminator is written.  */
static int utf8_encode(char* buf, int val) {
  int retval;

  if (val < 0x80) {
    *buf++ = (char)val;
    retval = 1;
  } else {
    int step;

    /* Smallest sequence length whose payload (5 * step + 1 bits) holds VAL;
       falls through with step == 6 when none of 2..5 is enough.  */
    for (step = 2; step < 6; ++step)
      if ((val & (~(uint32_t)0 << (5 * step + 1))) == 0) break;
    retval = step;

    /* Lead byte: the top STEP bits set.  */
    *buf = (unsigned char)(~0xff >> step);
    --step;
    /* Continuation bytes, filled from the last one backwards, six payload
       bits each.  */
    do {
      buf[step] = 0x80 | (val & 0x3f);
      val >>= 6;
    } while (--step > 0);
    /* Whatever is left of VAL goes into the lead byte.  */
    *buf |= val;
  }

  return retval;
}

/* Find next weight and rule index.  Inlined since called for every char.

   Looks up the collation element at *US via findidx (which advances *US).
   The lookup result carries the rule index in its top 8 bits and an index
   into the weight table in its low 24 bits.  The weight table stores, per
   element, one length byte followed by that many weights for each level in
   turn; the groups of the PASS earlier levels are skipped.

   On return *RULE_IDX is the rule index, *WEIGHT_IDX is the index of the
   first weight for level PASS, and the return value is how many weights
   that level has.  */
static __always_inline size_t find_idx(const USTRING_TYPE** us,
                                       int32_t* weight_idx,
                                       unsigned char* rule_idx,
                                       const locale_data_t* l_data,
                                       const int pass) {
  int32_t tmp =
      findidx(l_data->table, l_data->indirect, l_data->extra, us, -1);
  *rule_idx = tmp >> 24;
  int32_t idx = tmp & 0xffffff;
  size_t len = l_data->weights[idx++];

  /* Skip over indices of previous levels.  */
  for (int i = 0; i < pass; i++) {
    idx += len;
    len = l_data->weights[idx++];
  }

  *weight_idx = idx;
  return len;
}

/* Return non-zero when the rule that applies to the first collation element
   of US at level PASS has the `position' flag set.  US is not advanced.  */
static int find_position(const USTRING_TYPE* us, const locale_data_t* l_data,
                         const int pass) {
  int32_t weight_idx;
  unsigned char rule_idx;
  const USTRING_TYPE* usrc = us;

  find_idx(&usrc, &weight_idx, &rule_idx, l_data, pass);
  return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position;
}

/* Emit one element of a `position' level: the UTF-8 encoding of VAL followed
   by the LEN weights starting at WEIGHTS[WEIGHT_IDX].  Bytes are stored only
   when the whole group fits below N; the returned value (the new NEEDED) is
   advanced either way so the caller still learns the required size.  */
static size_t emit_position_weights(STRING_TYPE* dest, size_t n, size_t needed,
                                    int val, const USTRING_TYPE* weights,
                                    int32_t weight_idx, size_t len) {
  char buf[7];
  size_t buflen = utf8_encode(buf, val);

  if (needed + buflen + len < n) {
    for (size_t i = 0; i < buflen; ++i) dest[needed + i] = buf[i];
    for (size_t i = 0; i < len; ++i)
      dest[needed + buflen + i] = weights[weight_idx + i];
  }
  return needed + buflen + len;
}

/* Emit a pushed backward run for a level without `position'.  The run starts
   at BACKW_START and holds BACKW_LEN weights in total.  Elements are written
   in reverse element order; the weights inside each element keep their
   order.  Returns the new NEEDED.  */
static size_t flush_backward(const USTRING_TYPE* backw_start, size_t backw_len,
                             STRING_TYPE* dest, size_t n, size_t needed,
                             const locale_data_t* l_data, int pass) {
  for (size_t i = backw_len; i > 0;) {
    int32_t weight_idx;
    unsigned char rule_idx;
    size_t len = find_idx(&backw_start, &weight_idx, &rule_idx, l_data, pass);

    /* The element read first lands at the end of the reserved span.  */
    if (needed + i < n)
      for (size_t j = len; j > 0; j--)
        dest[needed + i - j] = l_data->weights[weight_idx++];

    i -= len;
  }
  return needed + backw_len;
}

/* Emit a pushed backward run for a `position' level.  The run starts at
   BACKW_START and holds BACKW_LEN elements, which are visited last to first.
   *VAL is the running position counter: it is written in front of every
   non-empty element and then reset to 1, and incremented for every empty
   one.  Returns the new NEEDED.  */
static size_t flush_backward_position(const USTRING_TYPE* backw_start,
                                      size_t backw_len, STRING_TYPE* dest,
                                      size_t n, size_t needed, int* val,
                                      const locale_data_t* l_data, int pass) {
  for (size_t p = backw_len; p > 0; p--) {
    int32_t weight_idx;
    unsigned char rule_idx;
    const USTRING_TYPE* backw_cur = backw_start;

    /* Walk forward from the start of the run to its P-th element.  */
    size_t len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass);
    for (size_t i = 1; i < p; i++)
      len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass);

    if (len != 0) {
      needed = emit_position_weights(dest, n, needed, *val, l_data->weights,
                                     weight_idx, len);
      *val = 1;
    } else {
      ++*val;
    }
  }
  return needed;
}

/* Shared tail of do_xfrm and do_xfrm_cached.

   This is a little optimization: many collation specifications have a
   `position' rule at the end and if no non-ignored character is found the
   last \1 byte is immediately followed by a \0 byte signalling this.  We can
   avoid the \1 byte(s).

   NEEDED counts everything emitted including the terminator; LAST_NEEDED is
   the value NEEDED had when the last pass began.  Returns the number of
   bytes/words we need, not counting the NUL byte/word at the end.  */
static size_t finish_key(STRING_TYPE* dest, size_t n, size_t needed,
                         size_t last_needed) {
  if (needed > 2 && needed == last_needed + 1) {
    /* Remove the \1 byte.  */
    if (--needed <= n) dest[needed - 1] = L('\0');
  }
  return needed - 1;
}

/* Do the transformation.

   Uncached variant: every pass walks USRC again and looks each element up
   afresh.  Writes at most N elements to DEST and returns the length the
   complete key needs, excluding the terminator.  The caller guarantees
   l_data->nrules > 0 and a non-empty USRC.  */
static size_t do_xfrm(const USTRING_TYPE* usrc, STRING_TYPE* dest, size_t n,
                      const locale_data_t* l_data) {
  int32_t weight_idx;
  unsigned char rule_idx;
  uint_fast32_t pass;
  size_t needed = 0;
  size_t last_needed = 0;

  /* Now the passes over the weights.  */
  for (pass = 0; pass < l_data->nrules; ++pass) {
    size_t backw_len = 0;
    last_needed = needed;
    const USTRING_TYPE* cur = usrc;
    const USTRING_TYPE* backw_start = NULL;

    /* We assume that if a rule has defined `position' in one section
       this is true for all of them.  */
    int position = find_position(cur, l_data, pass);

    if (position == 0) {
      while (*cur != L('\0')) {
        const USTRING_TYPE* pos = cur;
        size_t len = find_idx(&cur, &weight_idx, &rule_idx, l_data, pass);
        int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];

        if ((rule & sort_forward) != 0) {
          /* Handle the pushed backward sequence.  */
          if (backw_start != NULL) {
            needed = flush_backward(backw_start, backw_len, dest, n, needed,
                                    l_data, pass);
            backw_start = NULL;
            backw_len = 0;
          }

          /* Now handle the forward element.  */
          if (needed + len < n) {
            while (len-- > 0) dest[needed++] = l_data->weights[weight_idx++];
          } else {
            /* No more characters fit into the buffer.  */
            needed += len;
          }
        } else {
          /* Remember start of the backward sequence & track length
             (counted in weights here).  */
          if (backw_start == NULL) backw_start = pos;
          backw_len += len;
        }
      }

      /* Handle the pushed backward sequence.  */
      if (backw_start != NULL)
        needed = flush_backward(backw_start, backw_len, dest, n, needed,
                                l_data, pass);
    } else {
      int val = 1;

      while (*cur != L('\0')) {
        const USTRING_TYPE* pos = cur;
        size_t len = find_idx(&cur, &weight_idx, &rule_idx, l_data, pass);
        int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];

        if ((rule & sort_forward) != 0) {
          /* Handle the pushed backward sequence.  */
          if (backw_start != NULL) {
            needed = flush_backward_position(backw_start, backw_len, dest, n,
                                             needed, &val, l_data, pass);
            backw_start = NULL;
            backw_len = 0;
          }

          /* Now handle the forward element.  */
          if (len != 0) {
            needed = emit_position_weights(dest, n, needed, val,
                                           l_data->weights, weight_idx, len);
            val = 1;
          } else {
            ++val;
          }
        } else {
          /* Remember start of the backward sequence & track length
             (counted in elements here).  */
          if (backw_start == NULL) backw_start = pos;
          backw_len++;
        }
      }

      /* Handle the pushed backward sequence.  */
      if (backw_start != NULL)
        needed = flush_backward_position(backw_start, backw_len, dest, n,
                                         needed, &val, l_data, pass);
    }

    /* Finally store the byte to separate the passes or terminate
       the string.  */
    if (needed < n)
      dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
    ++needed;
  }

  return finish_key(dest, n, needed, last_needed);
}

/* Emit the current level of one cached element and advance *IDX (that
   element's cursor into WEIGHTS) past it, so that the next pass finds the
   next level there.

   With POSITION == 0 the weights are copied when they fit below N.
   Otherwise a non-empty element is written with the position counter *VAL
   in front of it and *VAL is reset to 1, while an empty element only
   increments *VAL.  Returns the new NEEDED.  */
static size_t emit_cached(STRING_TYPE* dest, size_t n, size_t needed,
                          const USTRING_TYPE* weights, int32_t* idx,
                          int position, int* val) {
  size_t len = weights[(*idx)++];

  if (position == 0) {
    if (needed + len < n) {
      while (len-- > 0) dest[needed++] = weights[(*idx)++];
    } else {
      /* No more characters fit into the buffer.  */
      needed += len;
      *idx += len;
    }
    return needed;
  }

  if (len == 0) {
    /* Note that we don't have to advance *IDX any further since the
       length is zero.  */
    ++*val;
    return needed;
  }

  needed = emit_position_weights(dest, n, needed, *val, weights, *idx, len);
  *idx += len;
  *val = 1;
  return needed;
}

/* Do the transformation using weight-index and rule cache.

   IDXARR[0..IDXMAX) holds, per collation element, a cursor into the weight
   table; it is modified: each pass moves every cursor past the level it just
   emitted.  RULEARR[0..IDXMAX] holds the rule index per element plus one
   trailing entry that is read but whose value does not matter.  Writes at
   most N elements to DEST and returns the length the complete key needs,
   excluding the terminator.  The caller guarantees l_data->nrules > 0 and
   IDXMAX > 0.  */
static size_t do_xfrm_cached(STRING_TYPE* dest, size_t n,
                             const locale_data_t* l_data, size_t idxmax,
                             int32_t* idxarr, const unsigned char* rulearr) {
  const uint_fast32_t nrules = l_data->nrules;
  const unsigned char* rulesets = l_data->rulesets;
  const USTRING_TYPE* weights = l_data->weights;
  size_t needed = 0;
  size_t last_needed = 0;

  /* Now the passes over the weights.  */
  for (uint_fast32_t pass = 0; pass < nrules; ++pass) {
    /* Index of the first element of the pending backward run, or SIZE_MAX
       when there is none.  */
    size_t backw_stop = SIZE_MAX;
    int rule = rulesets[rulearr[0] * nrules + pass];
    /* We assume that if a rule has defined `position' in one section
       this is true for all of them.  */
    int position = rule & sort_position;
    /* Position counter; only consulted when POSITION is set.  */
    int val = 1;

    last_needed = needed;

    for (size_t idxcnt = 0; idxcnt < idxmax; ++idxcnt) {
      if ((rule & sort_forward) != 0) {
        if (backw_stop != SIZE_MAX) {
          /* Handle the pushed elements now, last one first.  */
          for (size_t backw = idxcnt; backw > backw_stop;) {
            --backw;
            needed = emit_cached(dest, n, needed, weights, &idxarr[backw],
                                 position, &val);
          }
          backw_stop = SIZE_MAX;
        }

        /* Now handle the forward element.  */
        needed = emit_cached(dest, n, needed, weights, &idxarr[idxcnt],
                             position, &val);
      } else if (backw_stop == SIZE_MAX) {
        /* Remember where the backwards series started.  */
        backw_stop = idxcnt;
      }

      /* On the last iteration this reads the extra trailing entry.  */
      rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
    }

    if (backw_stop != SIZE_MAX) {
      /* Handle the pushed elements now.  The run extends to the very last
         element, so start from IDXMAX.  (The position variant used to start
         from IDXMAX - 1, which left the final element out -- unlike the
         in-loop flush above and unlike do_xfrm.)  */
      for (size_t backw = idxmax; backw > backw_stop;) {
        --backw;
        needed = emit_cached(dest, n, needed, weights, &idxarr[backw],
                             position, &val);
      }
    }

    /* Finally store the byte to separate the passes or terminate
       the string.  */
    if (needed < n) dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
    ++needed;
  }

  return finish_key(dest, n, needed, last_needed);
}

/* Transform SRC into a collation key for LOCALE.

   At most N bytes are stored in DEST.  The return value is the length of the
   complete key, not counting the terminating NUL; when it is N or more the
   key did not fit.  When LOCALE has no rules the key is SRC itself.  */
size_t glib_strxfrm(char* dest, const char* src, size_t n,
                    locale_data_t* locale) {
  /* Handle byte comparison case.  */
  if (locale->nrules == 0) {
    size_t srclen = strlen(src);

    if (n != 0) {
      memcpy(dest, src, MIN(srclen + 1, n));
    }
    return srclen;
  }

  /* Handle an empty string, code hereafter relies on strlen (src) > 0.  */
  if (*src == L('\0')) {
    if (n != 0) *dest = L('\0');
    return 0;
  }

  /* We need the elements of the string as unsigned values since they
     are used as indices.  */
  const USTRING_TYPE* usrc = (const USTRING_TYPE*)src;

  /* Size the cache to the input instead of always reserving the maximum:
     min (strlen (src), SMALL_STR_SIZE) slots.  This presumes every lookup
     consumes at least one byte, so a string never has more collation
     elements than bytes -- NOTE(review): confirm against findidx.  Should
     that not hold, the fill loop below still stops at the capacity and the
     uncached path takes over.  */
  size_t cache_cap = 0;
  while (cache_cap < SMALL_STR_SIZE && src[cache_cap] != L('\0')) ++cache_cap;

  /* Allocate cache for small strings on the stack and fill it with weight
     and rule indices.  If the cache size is not sufficient, continue with
     the uncached xfrm version.  */
  size_t idxmax = 0;
  const USTRING_TYPE* cur = usrc;
  int32_t* idxarr = alloca(cache_cap * sizeof(int32_t));
  unsigned char* rulearr = alloca(cache_cap + 1);

  do {
    int32_t tmp =
        findidx(locale->table, locale->indirect, locale->extra, &cur, -1);
    rulearr[idxmax] = tmp >> 24;
    idxarr[idxmax] = tmp & 0xffffff;
    ++idxmax;
  } while (*cur != L('\0') && idxmax < cache_cap);

  /* This element is only read, the value never used but to determine
     another value which then is ignored.  */
  rulearr[idxmax] = '\0';

  /* Do the transformation.  */
  if (*cur == L('\0'))
    return do_xfrm_cached(dest, n, locale, idxmax, idxarr, rulearr);
  else
    return do_xfrm(usrc, dest, n, locale);
}