#include #include "utf8.h" // // Created by MightyPork on 2017/08/20. // // UTF-8 parser - collects bytes of a code point before writing them // into a screen cell. // const struct Utf8Char EMPTY_CHAR = (struct Utf8Char) {.uint = 0}; // Code Points First Byte Second Byte Third Byte Fourth Byte // U+0000 - U+007F 00 - 7F // U+0080 - U+07FF C2 - DF 80 - BF // U+0800 - U+0FFF E0 *A0 - BF 80 - BF // U+1000 - U+CFFF E1 - EC 80 - BF 80 - BF // U+D000 - U+D7FF ED 80 - *9F 80 - BF // U+E000 - U+FFFF EE - EF 80 - BF 80 - BF // U+10000 - U+3FFFF F0 *90 - BF 80 - BF 80 - BF // U+40000 - U+FFFFF F1 - F3 80 - BF 80 - BF 80 - BF // U+100000 - U+10FFFF F4 80 - *8F 80 - BF 80 - BF /** * Handle a received character */ struct Utf8Char Utf8Parser_Handle(struct Utf8Parser *self, char c) { uint8_t *bytes = self->buffer.bytes; uint8_t uc = (uint8_t) c; // collecting unicode glyphs... if (uc & 0x80) { if (self->utf_len == 0) { bytes[0] = uc; self->utf_j = 1; // start if (uc == 0xC0 || uc == 0xC1 || uc > 0xF4) { // forbidden start codes goto fail; } if ((uc & 0xE0) == 0xC0) { self->utf_len = 2; } else if ((uc & 0xF0) == 0xE0) { self->utf_len = 3; } else if ((uc & 0xF8) == 0xF0) { self->utf_len = 4; } else { // chars over 127 that don't start unicode sequences goto fail; } } else { if ((uc & 0xC0) != 0x80) { bytes[self->utf_j++] = uc; goto fail; } else { bytes[self->utf_j++] = uc; if (self->utf_j >= self->utf_len) { // check for bad sequences - overlong or some other problem if (bytes[0] == 0xF4 && bytes[1] > 0x8F) goto fail; if (bytes[0] == 0xF0 && bytes[1] < 0x90) goto fail; if (bytes[0] == 0xED && bytes[1] > 0x9F) goto fail; if (bytes[0] == 0xE0 && bytes[1] < 0xA0) goto fail; // trap for surrogates - those break javascript if (bytes[0] == 0xED && bytes[1] >= 0xA0 && bytes[1] <= 0xBF) goto fail; goto success; } } } } else { bytes[0] = uc; goto success; } return EMPTY_CHAR; success:; struct Utf8Char result = self->buffer; self->buffer.uint = 0; // erase the buffer self->utf_len = 0; return result; fail: self->buffer.uint = 0; // erase the buffer self->utf_len = 0; return EMPTY_CHAR; } struct Utf8Char Utf8Iterator_Next(struct Utf8Iterator *self) { char c; struct Utf8Char uchar; while ((c = *self->source++) != 0) { uchar = Utf8Parser_Handle(&self->parser, c); if (uchar.uint) { return uchar; } } return EMPTY_CHAR; }