|
|
|
#include <stdint.h>
|
|
|
|
#include "utf8.h"
|
|
|
|
|
|
|
|
//
|
|
|
|
// Created by MightyPork on 2017/08/20.
|
|
|
|
//
|
|
|
|
// UTF-8 parser - collects bytes of a code point before writing them
|
|
|
|
// into a screen cell.
|
|
|
|
//
|
|
|
|
|
|
|
|
const struct Utf8Char EMPTY_CHAR = (struct Utf8Char) {.uint = 0};
|
|
|
|
|
|
|
|
// Code Points          First Byte  Second Byte  Third Byte  Fourth Byte
// U+0000   - U+007F    00 - 7F
// U+0080   - U+07FF    C2 - DF     80 - BF
// U+0800   - U+0FFF    E0          *A0 - BF     80 - BF
// U+1000   - U+CFFF    E1 - EC     80 - BF      80 - BF
// U+D000   - U+D7FF    ED          80 - *9F     80 - BF
// U+E000   - U+FFFF    EE - EF     80 - BF      80 - BF
// U+10000  - U+3FFFF   F0          *90 - BF     80 - BF     80 - BF
// U+40000  - U+FFFFF   F1 - F3     80 - BF      80 - BF     80 - BF
// U+100000 - U+10FFFF  F4          80 - *8F     80 - BF     80 - BF
// (* marks a second-byte bound that is narrower than the usual 80 - BF)
|
|
|
|
|
|
|
|
size_t utf8_strlen(const char *text)
|
|
|
|
{
|
|
|
|
// TODO optimize
|
|
|
|
struct Utf8Iterator iter;
|
|
|
|
Utf8Iterator_Init(&iter, text);
|
|
|
|
size_t num = 0;
|
|
|
|
while ((Utf8Iterator_Next(&iter)).uint) {
|
|
|
|
num++;
|
|
|
|
}
|
|
|
|
return num;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Handle a received character
|
|
|
|
*/
|
|
|
|
struct Utf8Char Utf8Parser_Handle(struct Utf8Parser *self, char c)
|
|
|
|
{
|
|
|
|
uint8_t *bytes = self->buffer.bytes;
|
|
|
|
|
|
|
|
uint8_t uc = (uint8_t) c;
|
|
|
|
// collecting unicode glyphs...
|
|
|
|
if (uc & 0x80) {
|
|
|
|
if (self->utf_len == 0) {
|
|
|
|
bytes[0] = uc;
|
|
|
|
self->utf_j = 1;
|
|
|
|
|
|
|
|
// start
|
|
|
|
if (uc == 0xC0 || uc == 0xC1 || uc > 0xF4) {
|
|
|
|
// forbidden start codes
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((uc & 0xE0) == 0xC0) {
|
|
|
|
self->utf_len = 2;
|
|
|
|
}
|
|
|
|
else if ((uc & 0xF0) == 0xE0) {
|
|
|
|
self->utf_len = 3;
|
|
|
|
}
|
|
|
|
else if ((uc & 0xF8) == 0xF0) {
|
|
|
|
self->utf_len = 4;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
// chars over 127 that don't start unicode sequences
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
if ((uc & 0xC0) != 0x80) {
|
|
|
|
bytes[self->utf_j++] = uc;
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
bytes[self->utf_j++] = uc;
|
|
|
|
if (self->utf_j >= self->utf_len) {
|
|
|
|
// check for bad sequences - overlong or some other problem
|
|
|
|
if (bytes[0] == 0xF4 && bytes[1] > 0x8F) goto fail;
|
|
|
|
if (bytes[0] == 0xF0 && bytes[1] < 0x90) goto fail;
|
|
|
|
if (bytes[0] == 0xED && bytes[1] > 0x9F) goto fail;
|
|
|
|
if (bytes[0] == 0xE0 && bytes[1] < 0xA0) goto fail;
|
|
|
|
|
|
|
|
// trap for surrogates - those break javascript
|
|
|
|
if (bytes[0] == 0xED && bytes[1] >= 0xA0 && bytes[1] <= 0xBF) goto fail;
|
|
|
|
|
|
|
|
goto success;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
bytes[0] = uc;
|
|
|
|
goto success;
|
|
|
|
}
|
|
|
|
|
|
|
|
return EMPTY_CHAR;
|
|
|
|
|
|
|
|
success:;
|
|
|
|
struct Utf8Char result = self->buffer;
|
|
|
|
self->buffer.uint = 0; // erase the buffer
|
|
|
|
self->utf_len = 0;
|
|
|
|
return result;
|
|
|
|
|
|
|
|
fail:
|
|
|
|
self->buffer.uint = 0; // erase the buffer
|
|
|
|
self->utf_len = 0;
|
|
|
|
return EMPTY_CHAR;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct Utf8Char Utf8Iterator_Next(struct Utf8Iterator *self)
|
|
|
|
{
|
|
|
|
char c;
|
|
|
|
struct Utf8Char uchar;
|
|
|
|
while ((c = *self->source++) != 0) {
|
|
|
|
uchar = Utf8Parser_Handle(&self->parser, c);
|
|
|
|
if (uchar.uint) {
|
|
|
|
return uchar;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return EMPTY_CHAR;
|
|
|
|
}
|