tangara-fw/src/locale/strxfrm_l.c

/* Copyright (C) 1995-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Written by Ulrich Drepper <drepper@gnu.org>, 1995.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include "strxfrm.h"

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/param.h>

#include "esp_log.h"

#include "weight.h"

/* Tag handed to ESP_LOGE for every message logged from this file.  */
static const char kTag[] = "strxfrm";

/* Character-type configuration.  Unless STRING_TYPE was already defined
   before this point, the file is built for plain `char' strings: L(arg)
   expands to its argument unchanged, so L('\0') is an ordinary char
   literal.  */
#ifndef STRING_TYPE
#define STRING_TYPE char
#define USTRING_TYPE unsigned char
#define STRLEN strlen
#define STPNCPY stpncpy
#define L(arg) arg
#endif

/* Two-step token pasting so that macro arguments are expanded before they
   are joined.  NOTE(review): neither macro is referenced in the part of the
   file visible here.  */
#define CONCAT(a, b) CONCAT1(a, b)
#define CONCAT1(a, b) a##b

/* Maximum string size that is calculated with cached indices.  Right now this
   is an arbitrary value open to optimizations.  SMALL_STR_SIZE * 4 has to be
   lower than __MAX_ALLOCA_CUTOFF.  Keep localedata/xfrm-test.c in sync.
   NOTE(review): __MAX_ALLOCA_CUTOFF and localedata/xfrm-test.c are not
   defined or referenced anywhere else in this file; presumably they belong
   to the upstream tree this code came from -- confirm before relying on
   them.  glib_strxfrm sizes its two alloca buffers from this constant.  */
#define SMALL_STR_SIZE 4095

/* Collation sorting rules.  The transformation code tests the forward and
   position kinds as bit flags (`rule & sort_forward', `rule & sort_position'),
   so the values are spelled out explicitly here.  Values 0 and 3 are reserved
   as illegal.  */
enum coll_sort_rule {
  illegal_0__ = 0,
  sort_forward = 1,
  sort_backward = 2,
  illegal_3__ = 3,
  sort_position = 4,
  sort_forward_position = sort_position | sort_forward,
  sort_backward_position = sort_position | sort_backward,
  sort_mask = 7
};

/* Slots of the string index table found at the start of the collation data.
   parse_locale_data uses these values to index that table; only the first
   six slots are consumed in this file.  The enumerators rely on implicit
   numbering so that COLLATE_LAST always equals the number of slots.  */
typedef enum collate_element {
  COLLATE_NRULES = 0,
  COLLATE_RULESETS,
  COLLATE_TABLEMB,
  COLLATE_WEIGHTMB,
  COLLATE_EXTRAMB,
  COLLATE_INDIRECTMB,
  COLLATE_GAP1,
  COLLATE_GAP2,
  COLLATE_GAP3,
  COLLATE_TABLEWC,
  COLLATE_WEIGHTWC,
  COLLATE_EXTRAWC,
  COLLATE_INDIRECTWC,
  COLLATE_SYMB_HASH_SIZEMB,
  COLLATE_SYMB_TABLEMB,
  COLLATE_SYMB_EXTRAMB,
  COLLATE_COLLSEQMB,
  COLLATE_COLLSEQWC,
  COLLATE_CODESET,

  /* Not a real element; its value is the number of elements above.  */
  COLLATE_LAST
} collate_element_t;

/* Validate a raw collation data image and point OUT at the tables inside it.

   RAW_DATA  start of the image; it begins with a magic word, an element
             count and one offset per element (relative to RAW_DATA).
   SIZE      number of readable bytes at RAW_DATA.
   OUT       receives nrules plus pointers to the rulesets, table, weights,
             extra and indirect elements.  The pointers alias RAW_DATA, so
             the image must stay valid for as long as OUT is used.

   Returns true on success.  On any validation failure an error is logged,
   false is returned and OUT is left untouched.  */
bool parse_locale_data(const void* raw_data, size_t size, locale_data_t* out) {
  const struct {
    unsigned int magic;
    unsigned int nstrings;
    unsigned int strindex[];
  }* const header = raw_data;

  /* The fixed part of the header must be present before any field of it
     may be read.  */
  if (size < sizeof(*header)) {
    ESP_LOGE(kTag, "file was too small to contain header");
    return false;
  }

  if (header->magic != 0x20051017) {
    ESP_LOGE(kTag, "file magic incorrect (was %x)", header->magic);
    return false;
  }

  /* Check the element count before using it in arithmetic: once it is known
     to equal COLLATE_LAST the size computation below cannot wrap.  */
  if (header->nstrings != COLLATE_LAST) {
    ESP_LOGE(kTag, "file has incorrect number of elements (was %u, wanted %u)",
             header->nstrings, (unsigned int)COLLATE_LAST);
    return false;
  }

  if (sizeof(*header) + (size_t)header->nstrings * sizeof(unsigned int) >=
      size) {
    ESP_LOGE(kTag, "file was too small to contain header");
    return false;
  }

  // The LC_COLLATE partition appears to contain data in the correct shape.
  // Pull out pointers to the various attributes it contains.
  const unsigned char* const base = raw_data;
  const void* offsets[COLLATE_LAST];
  for (size_t i = 0; i < header->nstrings; i++) {
    const unsigned int offset = header->strindex[i];
    /* An offset equal to SIZE is tolerated (a one-past-the-end pointer may be
       formed, just not dereferenced); the element sizes are not known here. */
    if (offset > size) {
      ESP_LOGE(kTag, "element offset (%u) exceeds file size", offset);
      return false;
    }
    offsets[i] = base + offset;
  }

  /* nrules is the one element dereferenced here, so make sure all of its
     bytes lie inside the image.  SIZE is larger than the header at this
     point, so the subtraction cannot underflow.  */
  const unsigned int nrules_offset = header->strindex[COLLATE_NRULES];
  if (nrules_offset > size - sizeof(unsigned int)) {
    ESP_LOGE(kTag, "nrules element (offset %u) does not fit in file",
             nrules_offset);
    return false;
  }

  // Now parse those pointers into the output struct.
  /* memcpy avoids depending on the alignment of the nrules element.  */
  unsigned int nrules;
  memcpy(&nrules, offsets[COLLATE_NRULES], sizeof nrules);
  out->nrules = nrules;

  out->rulesets = (unsigned char*)offsets[COLLATE_RULESETS];
  out->table = (int32_t*)offsets[COLLATE_TABLEMB];
  out->weights = (unsigned char*)offsets[COLLATE_WEIGHTMB];
  out->extra = (unsigned char*)offsets[COLLATE_EXTRAMB];
  out->indirect = (int32_t*)offsets[COLLATE_INDIRECTMB];

  /* The tables are accessed in place through typed pointers, so they must
     be suitably aligned for their element types.  */
  assert(((uintptr_t)out->table) % __alignof__(out->table[0]) == 0);
  assert(((uintptr_t)out->weights) % __alignof__(out->weights[0]) == 0);
  assert(((uintptr_t)out->extra) % __alignof__(out->extra[0]) == 0);
  assert(((uintptr_t)out->indirect) % __alignof__(out->indirect[0]) == 0);

  return true;
}

/* Encode VAL as a UTF-8 style byte sequence (1 to 6 bytes) into BUF and
   return the number of bytes written.  BUF must have room for 6 bytes.
   Any value below 0x80 is stored as a single byte.  */
static int utf8_encode(char* buf, int val) {
  if (val < 0x80) {
    buf[0] = (char)val;
    return 1;
  }

  /* An N-byte sequence carries 5 * N + 1 payload bits; pick the smallest N
     (capped at 6) whose payload can hold VAL.  */
  int nbytes = 2;
  while (nbytes < 6 && ((uint32_t)val >> (5 * nbytes + 1)) != 0)
    ++nbytes;

  /* Fill the continuation bytes from the last one backwards, six payload
     bits at a time.  */
  for (int i = nbytes - 1; i > 0; --i) {
    buf[i] = 0x80 | (val & 0x3f);
    val >>= 6;
  }

  /* The lead byte has NBYTES leading one bits followed by whatever payload
     bits are left over.  */
  buf[0] = (char)((unsigned char)(0xff00 >> nbytes) | val);

  return nbytes;
}

/* Look up the next collation element of *US and locate its weights for
   level PASS.  *US is handed to findidx by address.  On return *RULE_IDX
   holds the element's rule-set index and *WEIGHT_IDX the index of the first
   weight for PASS inside l_data->weights; the return value is the number of
   weights at that level.  Inlined since it is called for every character.  */
static __always_inline size_t find_idx(const USTRING_TYPE** us,
                                       int32_t* weight_idx,
                                       unsigned char* rule_idx,
                                       const locale_data_t* l_data,
                                       const int pass) {
  const int32_t packed =
      findidx(l_data->table, l_data->indirect, l_data->extra, us, -1);

  /* The top byte is the rule-set index, the low 24 bits index the weights. */
  *rule_idx = packed >> 24;
  int32_t cursor = packed & 0xffffff;

  /* Each level is stored as a length followed by that many weights.  Read
     the first level's length, then step over the levels before PASS.  */
  size_t level_len = l_data->weights[cursor++];
  for (int to_skip = pass; to_skip > 0; --to_skip) {
    cursor += level_len;
    level_len = l_data->weights[cursor++];
  }

  *weight_idx = cursor;
  return level_len;
}

/* Report whether the rule that applies to the first collation element of US
   at level PASS has the `position' flag set.  Returns sort_position if so,
   0 otherwise.  US itself is not advanced.  */
static int find_position(const USTRING_TYPE* us,
                         const locale_data_t* l_data,
                         const int pass) {
  /* find_idx takes the string pointer by address, so give it a copy.  */
  const USTRING_TYPE* scratch = us;
  int32_t ignored_weight_idx;
  unsigned char ruleset;

  (void)find_idx(&scratch, &ignored_weight_idx, &ruleset, l_data, pass);

  const int rule = l_data->rulesets[ruleset * l_data->nrules + pass];
  return rule & sort_position;
}

/* Do the transformation without a cache.

   For each of the l_data->nrules passes the source string USRC is scanned
   again from the start with find_idx, and the weights of that pass are
   appended to DEST.  Passes are separated by a \1 element and the result is
   terminated by \0.

   N is the capacity of DEST.  Elements are stored only while they fit, but
   `needed' keeps counting regardless, so the return value is the length the
   complete result requires (not counting the terminating NUL) even when
   DEST is too small.

   Elements whose rule lacks the sort_forward flag are not emitted right
   away: the start of such a run is remembered and the run is emitted in
   reverse order once a forward element or the end of the string is reached.

   In a `position' pass every non-empty weight sequence is prefixed with the
   UTF-8 encoding of a counter that is one plus the number of elements with
   empty weights seen since the previous non-empty one.

   NOTE(review): `last_needed' is only assigned inside the pass loop; the
   caller visible in this file returns early when nrules is 0, so the loop
   runs at least once on that path.  */
static size_t do_xfrm(const USTRING_TYPE* usrc,
                      STRING_TYPE* dest,
                      size_t n,
                      const locale_data_t* l_data) {
  int32_t weight_idx;
  unsigned char rule_idx;
  uint_fast32_t pass;
  size_t needed = 0;
  size_t last_needed;

  /* Now the passes over the weights.  */
  for (pass = 0; pass < l_data->nrules; ++pass) {
    /* In a non-position pass backw_len counts pending weights; in a position
       pass it counts pending elements.  */
    size_t backw_len = 0;
    last_needed = needed;
    const USTRING_TYPE* cur = usrc;
    const USTRING_TYPE* backw_start = NULL;

    /* We assume that if a rule has defined `position' in one section
      this is true for all of them.  */
    int position = find_position(cur, l_data, pass);

    if (position == 0) {
      while (*cur != L('\0')) {
        const USTRING_TYPE* pos = cur;
        size_t len = find_idx(&cur, &weight_idx, &rule_idx, l_data, pass);
        int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];

        if ((rule & sort_forward) != 0) {
          /* Handle the pushed backward sequence.  */
          if (backw_start != NULL) {
            /* `i' is the number of pending weights not yet placed; each
               element is written just below offset needed + i, so the first
               element of the run ends up last.  */
            for (size_t i = backw_len; i > 0;) {
              /* These deliberately shadow the outer weight_idx, rule_idx and
                 len so the forward element's values survive the flush.  */
              int32_t weight_idx;
              unsigned char rule_idx;
              size_t len =
                  find_idx(&backw_start, &weight_idx, &rule_idx, l_data, pass);
              if (needed + i < n)
                for (size_t j = len; j > 0; j--)
                  dest[needed + i - j] = l_data->weights[weight_idx++];

              i -= len;
            }

            needed += backw_len;
            backw_start = NULL;
            backw_len = 0;
          }

          /* Now handle the forward element.  */
          if (needed + len < n)
            while (len-- > 0)
              dest[needed++] = l_data->weights[weight_idx++];
          else
            /* No more characters fit into the buffer.  */
            needed += len;
        } else {
          /* Remember start of the backward sequence & track length.  */
          if (backw_start == NULL)
            backw_start = pos;
          backw_len += len;
        }
      }

      /* Handle the pushed backward sequence.  */
      if (backw_start != NULL) {
        /* Same reversal as above; the outer weight_idx/rule_idx can be
           reused here because no forward element is pending.  */
        for (size_t i = backw_len; i > 0;) {
          size_t len =
              find_idx(&backw_start, &weight_idx, &rule_idx, l_data, pass);
          if (needed + i < n)
            for (size_t j = len; j > 0; j--)
              dest[needed + i - j] = l_data->weights[weight_idx++];

          i -= len;
        }

        needed += backw_len;
      }
    } else {
      /* Position counter emitted in front of the next non-empty weight
         sequence; it grows by one for every element with no weights.  */
      int val = 1;
      char buf[7];
      size_t buflen;
      size_t i;

      while (*cur != L('\0')) {
        const USTRING_TYPE* pos = cur;
        size_t len = find_idx(&cur, &weight_idx, &rule_idx, l_data, pass);
        int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];

        if ((rule & sort_forward) != 0) {
          /* Handle the pushed backward sequence.  */
          if (backw_start != NULL) {
            /* Emit the pending elements last-to-first.  The p-th element is
               found by walking p elements from the start of the run, so the
               run is re-scanned for every element.  */
            for (size_t p = backw_len; p > 0; p--) {
              /* Shadow the outer variables so the forward element's values
                 are still intact after the flush.  */
              size_t len;
              int32_t weight_idx;
              unsigned char rule_idx;
              const USTRING_TYPE* backw_cur = backw_start;

              /* To prevent a warning init the used vars.  */
              len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass);

              for (i = 1; i < p; i++)
                len =
                    find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass);

              if (len != 0) {
                buflen = utf8_encode(buf, val);
                /* Store the prefix and the weights only if both fit.  */
                if (needed + buflen + len < n) {
                  for (i = 0; i < buflen; ++i)
                    dest[needed + i] = buf[i];
                  for (i = 0; i < len; ++i)
                    dest[needed + buflen + i] = l_data->weights[weight_idx + i];
                }
                needed += buflen + len;
                val = 1;
              } else
                ++val;
            }

            backw_start = NULL;
            backw_len = 0;
          }

          /* Now handle the forward element.  */
          if (len != 0) {
            buflen = utf8_encode(buf, val);
            if (needed + buflen + len < n) {
              for (i = 0; i < buflen; ++i)
                dest[needed + i] = buf[i];
              for (i = 0; i < len; ++i)
                dest[needed + buflen + i] = l_data->weights[weight_idx + i];
            }
            needed += buflen + len;
            val = 1;
          } else
            ++val;
        } else {
          /* Remember start of the backward sequence & track length.  */
          if (backw_start == NULL)
            backw_start = pos;
          backw_len++;
        }
      }

      /* Handle the pushed backward sequence.  */
      if (backw_start != NULL) {
        /* Same last-to-first walk as inside the loop above.  */
        for (size_t p = backw_len; p > 0; p--) {
          size_t len;
          int32_t weight_idx;
          unsigned char rule_idx;
          const USTRING_TYPE* backw_cur = backw_start;

          /* To prevent a warning init the used vars.  */
          len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass);

          for (i = 1; i < p; i++)
            len = find_idx(&backw_cur, &weight_idx, &rule_idx, l_data, pass);

          if (len != 0) {
            buflen = utf8_encode(buf, val);
            if (needed + buflen + len < n) {
              for (i = 0; i < buflen; ++i)
                dest[needed + i] = buf[i];
              for (i = 0; i < len; ++i)
                dest[needed + buflen + i] = l_data->weights[weight_idx + i];
            }
            needed += buflen + len;
            val = 1;
          } else
            ++val;
        }
      }
    }

    /* Finally store the byte to separate the passes or terminate
       the string.  */
    if (needed < n)
      dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
    ++needed;
  }

  /* This is a little optimization: many collation specifications have
     a `position' rule at the end and if no non-ignored character
     is found the last \1 byte is immediately followed by a \0 byte
     signalling this.  We can avoid the \1 byte(s).  */
  if (needed > 2 && needed == last_needed + 1) {
    /* Remove the \1 byte.  */
    if (--needed <= n)
      dest[needed - 1] = L('\0');
  }

  /* Return the number of bytes/words we need, but don't count the NUL
     byte/word at the end.  */
  return needed - 1;
}

/* Do the transformation using the weight-index and rule cache.

   DEST     output buffer with room for N elements; elements are stored only
            while they fit, but counting continues so that the return value
            is the length the full result requires (without the NUL).
   IDXMAX   number of cached collation elements.
   IDXARR   one weight index per element, initially pointing at the length
            of the first level.  The entries are advanced IN PLACE as each
            pass consumes its level, so that the next pass finds the next
            level; this is why they are advanced even when nothing fits.
   RULEARR  one rule-set index per element, plus one extra readable entry at
            RULEARR[IDXMAX] (it is read at the end of each scan, its value is
            then ignored).

   Elements whose rule lacks sort_forward are deferred and emitted in
   reverse order when a forward element or the end of the cache is reached.
   In a `position' pass each non-empty weight sequence is prefixed with the
   UTF-8 encoded count of elements since the previous non-empty one.  */
static size_t do_xfrm_cached(STRING_TYPE* dest,
                             size_t n,
                             const locale_data_t* l_data,
                             size_t idxmax,
                             int32_t* idxarr,
                             const unsigned char* rulearr) {
  uint_fast32_t nrules = l_data->nrules;
  const unsigned char* rulesets = l_data->rulesets;
  const USTRING_TYPE* weights = l_data->weights;
  uint_fast32_t pass;
  size_t needed = 0;
  size_t last_needed;
  size_t idxcnt;

  /* Now the passes over the weights.  */
  for (pass = 0; pass < nrules; ++pass) {
    /* Index of the first element of a pending backward run, or ~0ul when
       there is none.  */
    size_t backw_stop = ~0ul;
    int rule = rulesets[rulearr[0] * nrules + pass];
    /* We assume that if a rule has defined `position' in one section
       this is true for all of them.  */
    int position = rule & sort_position;

    last_needed = needed;
    if (position == 0) {
      for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) {
        if ((rule & sort_forward) != 0) {
          size_t len;

          if (backw_stop != ~0ul) {
            /* Handle the pushed elements now.  */
            size_t backw;

            for (backw = idxcnt; backw > backw_stop;) {
              --backw;
              len = weights[idxarr[backw]++];

              if (needed + len < n)
                while (len-- > 0)
                  dest[needed++] = weights[idxarr[backw]++];
              else {
                /* No more characters fit into the buffer.  */
                needed += len;
                idxarr[backw] += len;
              }
            }

            backw_stop = ~0ul;
          }

          /* Now handle the forward element.  */
          len = weights[idxarr[idxcnt]++];
          if (needed + len < n)
            while (len-- > 0)
              dest[needed++] = weights[idxarr[idxcnt]++];
          else {
            /* No more characters fit into the buffer.  */
            needed += len;
            idxarr[idxcnt] += len;
          }
        } else {
          /* Remember where the backwards series started.  */
          if (backw_stop == ~0ul)
            backw_stop = idxcnt;
        }

        /* Fetch the rule of the next element; on the last iteration this
           reads the extra entry RULEARR[IDXMAX].  */
        rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
      }

      if (backw_stop != ~0ul) {
        /* Handle the pushed elements now.  */
        size_t backw;

        backw = idxcnt;
        while (backw > backw_stop) {
          size_t len = weights[idxarr[--backw]++];

          if (needed + len < n)
            while (len-- > 0)
              dest[needed++] = weights[idxarr[backw]++];
          else {
            /* No more characters fit into the buffer.  */
            needed += len;
            idxarr[backw] += len;
          }
        }
      }
    } else {
      /* Position counter emitted in front of the next non-empty weight
         sequence; it grows by one for every element with no weights.  */
      int val = 1;
      char buf[7];
      size_t buflen;
      size_t i;

      for (idxcnt = 0; idxcnt < idxmax; ++idxcnt) {
        if ((rule & sort_forward) != 0) {
          size_t len;

          if (backw_stop != ~0ul) {
            /* Handle the pushed elements now.  */
            size_t backw;

            for (backw = idxcnt; backw > backw_stop;) {
              --backw;
              len = weights[idxarr[backw]++];
              if (len != 0) {
                buflen = utf8_encode(buf, val);
                if (needed + buflen + len < n) {
                  for (i = 0; i < buflen; ++i)
                    dest[needed + i] = buf[i];
                  for (i = 0; i < len; ++i)
                    dest[needed + buflen + i] = weights[idxarr[backw] + i];
                }
                needed += buflen + len;
                idxarr[backw] += len;
                val = 1;
              } else
                ++val;
            }

            backw_stop = ~0ul;
          }

          /* Now handle the forward element.  */
          len = weights[idxarr[idxcnt]++];
          if (len != 0) {
            buflen = utf8_encode(buf, val);
            if (needed + buflen + len < n) {
              for (i = 0; i < buflen; ++i)
                dest[needed + i] = buf[i];
              for (i = 0; i < len; ++i)
                dest[needed + buflen + i] = weights[idxarr[idxcnt] + i];
            }
            needed += buflen + len;
            idxarr[idxcnt] += len;
            val = 1;
          } else
            /* Note that we don't have to increment `idxarr[idxcnt]'
               since the length is zero.  */
            ++val;
        } else {
          /* Remember where the backwards series started.  */
          if (backw_stop == ~0ul)
            backw_stop = idxcnt;
        }

        /* Fetch the rule of the next element; on the last iteration this
           reads the extra entry RULEARR[IDXMAX].  */
        rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
      }

      if (backw_stop != ~0ul) {
        /* Handle the pushed elements now.  */
        size_t backw;

        /* The pending run covers [backw_stop, idxmax - 1].  Because the loop
           pre-decrements, it has to start at idxcnt (== idxmax here), exactly
           like the non-position branch; starting at idxmax - 1 would drop
           the last element and leave its idxarr entry pointing at the wrong
           level for the following passes.  */
        backw = idxcnt;
        while (backw > backw_stop) {
          size_t len = weights[idxarr[--backw]++];
          if (len != 0) {
            buflen = utf8_encode(buf, val);
            if (needed + buflen + len < n) {
              for (i = 0; i < buflen; ++i)
                dest[needed + i] = buf[i];
              for (i = 0; i < len; ++i)
                dest[needed + buflen + i] = weights[idxarr[backw] + i];
            }
            needed += buflen + len;
            idxarr[backw] += len;
            val = 1;
          } else
            ++val;
        }
      }
    }

    /* Finally store the byte to separate the passes or terminate
       the string.  */
    if (needed < n)
      dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
    ++needed;
  }

  /* This is a little optimization: many collation specifications have
     a `position' rule at the end and if no non-ignored character
     is found the last \1 byte is immediately followed by a \0 byte
     signalling this.  We can avoid the \1 byte(s).  */
  if (needed > 2 && needed == last_needed + 1) {
    /* Remove the \1 byte.  */
    if (--needed <= n)
      dest[needed - 1] = L('\0');
  }

  /* Return the number of bytes/words we need, but don't count the NUL
     byte/word at the end.  */
  return needed - 1;
}

/* Transform SRC according to the collation data in LOCALE and store at most
   N bytes of the result in DEST.  Returns the length of the complete
   transformed string, not counting the terminating NUL; when that value is
   N or more the result did not fit.  DEST is not touched when N is 0.

   Note that two buffers sized by SMALL_STR_SIZE are taken from the stack
   with alloca whenever LOCALE has collation rules and SRC is non-empty,
   independent of the length of SRC.  */
size_t glib_strxfrm(char* dest,
                    const char* src,
                    size_t n,
                    locale_data_t* locale) {
  /* No collation rules: the transformation is a plain byte copy.  */
  if (locale->nrules == 0) {
    const size_t srclen = strlen(src);
    const size_t with_nul = srclen + 1;

    if (n != 0) {
      memcpy(dest, src, with_nul < n ? with_nul : n);
    }

    return srclen;
  }

  /* The code below relies on SRC holding at least one character.  */
  if (src[0] == L('\0')) {
    if (n != 0) {
      dest[0] = L('\0');
    }
    return 0;
  }

  /* The characters are used as table indices, so view them as unsigned.  */
  const USTRING_TYPE* const ustart = (const USTRING_TYPE*)src;
  const USTRING_TYPE* scan = ustart;

  /* Cache the weight index and rule-set index of up to SMALL_STR_SIZE
     collation elements.  The rule cache has one extra slot for the
     terminating entry written below.  */
  int32_t* weight_cache = alloca(SMALL_STR_SIZE * sizeof(int32_t));
  unsigned char* rule_cache = alloca(SMALL_STR_SIZE + 1);
  size_t cached = 0;

  for (;;) {
    const int32_t packed =
        findidx(locale->table, locale->indirect, locale->extra, &scan, -1);

    rule_cache[cached] = packed >> 24;
    weight_cache[cached] = packed & 0xffffff;
    ++cached;

    if (*scan == L('\0') || cached >= SMALL_STR_SIZE)
      break;
  }

  /* This entry is read by the cached transformation, but the value derived
     from it is never used.  */
  rule_cache[cached] = '\0';

  /* If the whole string went into the cache use it; otherwise fall back to
     the version that re-scans the source on every pass.  */
  const bool fully_cached = (*scan == L('\0'));
  if (!fully_cached)
    return do_xfrm(ustart, dest, n, locale);

  return do_xfrm_cached(dest, n, locale, cached, weight_cache, rule_cache);
}