You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
toaster-oven-bluepill/Lib/ufb/Src/utf8.c

130 lines
3.5 KiB

2 years ago
#include <stdint.h>
#include "ufb/utf8.h"
//
// Created by MightyPork on 2017/08/20.
//
// UTF-8 parser - collects bytes of a code point before writing them
// into a screen cell.
//
const struct Utf8Char EMPTY_CHAR = (struct Utf8Char) {.uint = 0};
// Code Points First Byte Second Byte Third Byte Fourth Byte
// U+0000 - U+007F 00 - 7F
// U+0080 - U+07FF C2 - DF 80 - BF
// U+0800 - U+0FFF E0 *A0 - BF 80 - BF
// U+1000 - U+CFFF E1 - EC 80 - BF 80 - BF
// U+D000 - U+D7FF ED 80 - *9F 80 - BF
// U+E000 - U+FFFF EE - EF 80 - BF 80 - BF
// U+10000 - U+3FFFF F0 *90 - BF 80 - BF 80 - BF
// U+40000 - U+FFFFF F1 - F3 80 - BF 80 - BF 80 - BF
// U+100000 - U+10FFFF F4 80 - *8F 80 - BF 80 - BF
size_t utf8_strlen(const char *text)
{
// TODO optimize
struct Utf8Iterator iter;
Utf8Iterator_Init(&iter, text);
size_t num = 0;
while ((Utf8Iterator_Next(&iter)).uint) {
num++;
}
return num;
}
/**
* Handle a received character
*/
struct Utf8Char Utf8Parser_Handle(struct Utf8Parser *self, char c)
{
uint8_t *bytes = self->buffer.bytes;
uint8_t uc = (uint8_t) c;
// collecting unicode glyphs...
if (uc & 0x80) {
if (self->utf_len == 0) {
bytes[0] = uc;
self->utf_j = 1;
// start
if (uc == 0xC0 || uc == 0xC1 || uc > 0xF4) {
// forbidden start codes
goto fail;
}
if ((uc & 0xE0) == 0xC0) {
self->utf_len = 2;
}
else if ((uc & 0xF0) == 0xE0) {
self->utf_len = 3;
}
else if ((uc & 0xF8) == 0xF0) {
self->utf_len = 4;
}
else {
// chars over 127 that don't start unicode sequences
goto fail;
}
}
else {
if ((uc & 0xC0) != 0x80) {
bytes[self->utf_j++] = uc;
goto fail;
}
else {
bytes[self->utf_j++] = uc;
if (self->utf_j >= self->utf_len) {
// check for bad sequences - overlong or some other problem
if (bytes[0] == 0xF4 && bytes[1] > 0x8F) goto fail;
if (bytes[0] == 0xF0 && bytes[1] < 0x90) goto fail;
if (bytes[0] == 0xED && bytes[1] > 0x9F) goto fail;
if (bytes[0] == 0xE0 && bytes[1] < 0xA0) goto fail;
// trap for surrogates - those break javascript
if (bytes[0] == 0xED && bytes[1] >= 0xA0 && bytes[1] <= 0xBF) goto fail;
goto success;
}
}
}
}
else {
bytes[0] = uc;
goto success;
}
return EMPTY_CHAR;
success:;
struct Utf8Char result = self->buffer;
self->buffer.uint = 0; // erase the buffer
self->utf_len = 0;
return result;
fail:
self->buffer.uint = 0; // erase the buffer
self->utf_len = 0;
return EMPTY_CHAR;
}
struct Utf8Char Utf8Iterator_Next(struct Utf8Iterator *self)
{
char c;
struct Utf8Char uchar;
while (1) {
if (self->is_progmem) {
c = pgm_read_byte(self->source++);
} else {
c = *self->source++;
}
if (!c) break;
uchar = Utf8Parser_Handle(&self->parser, c);
if (uchar.uint) {
return uchar;
}
}
return EMPTY_CHAR;
}