ufb/utf8.c

#include <stdint.h>
#include "utf8.h"

//
// Created by MightyPork on 2017/08/20.
//
// UTF-8 parser - collects bytes of a code point before writing them
// into a screen cell.
//

const struct Utf8Char EMPTY_CHAR = (struct Utf8Char) {.uint = 0};

//      Code Points      First Byte Second Byte Third Byte Fourth Byte
//  U+0000 -   U+007F     00 - 7F
//  U+0080 -   U+07FF     C2 - DF    80 - BF
//	U+0800 -   U+0FFF     E0         *A0 - BF     80 - BF
//	U+1000 -   U+CFFF     E1 - EC    80 - BF     80 - BF
//	U+D000 -   U+D7FF     ED         80 - *9F     80 - BF
//	U+E000 -   U+FFFF     EE - EF    80 - BF     80 - BF
//	U+10000 -  U+3FFFF    F0         *90 - BF     80 - BF    80 - BF
//	U+40000 -  U+FFFFF    F1 - F3    80 - BF     80 - BF    80 - BF
//	U+100000 - U+10FFFF   F4         80 - *8F     80 - BF    80 - BF

size_t utf8_strlen(const char *text)
{
    // TODO optimize
    struct Utf8Iterator iter;
    Utf8Iterator_Init(&iter, text);
    size_t num = 0;
    while ((Utf8Iterator_Next(&iter)).uint) {
        num++;
    }
    return num;
}

/**
 * Handle a received character (one byte of a UTF-8 stream).
 *
 * Bytes of a multi-byte sequence are collected in self->buffer; the
 * character is returned once its last byte has arrived. ASCII bytes are
 * returned immediately.
 *
 * Invalid input is dropped: forbidden lead bytes (C0, C1, above F4), stray
 * continuation bytes, overlong encodings, surrogates and code points above
 * U+10FFFF all yield EMPTY_CHAR and reset the parser.
 *
 * If a sequence is cut short by a byte that is not a continuation byte, the
 * partial sequence is discarded and the interrupting byte is processed as
 * the beginning of a new character, so one truncated sequence never costs
 * the valid character that follows it.
 *
 * @param self  parser state (buffer, utf_len, utf_j); updated in place
 * @param c     the received byte
 * @return the completed character, or EMPTY_CHAR if more bytes are needed
 *         or the input was rejected
 */
struct Utf8Char Utf8Parser_Handle(struct Utf8Parser *self, char c)
{
    uint8_t *bytes = self->buffer.bytes;
    const uint8_t uc = (uint8_t) c;

    if ((uc & 0x80) == 0) {
        // Plain ASCII. It may interrupt an unfinished multi-byte sequence;
        // erase the buffer first (same .uint write used for erasing below)
        // so leftover bytes of that sequence don't end up in the result.
        self->buffer.uint = 0;
        bytes[0] = uc;
        goto success;
    }

    if (self->utf_len != 0) {
        if ((uc & 0xC0) == 0x80) {
            // continuation byte of the sequence being collected
            bytes[self->utf_j++] = uc;
            if (self->utf_j < self->utf_len) {
                return EMPTY_CHAR; // still incomplete
            }

            // Complete - check for bad sequences. Only the second byte needs
            // a range check (see the table at the top of this file).
            if (bytes[0] == 0xE0 && bytes[1] < 0xA0) {
                goto fail; // overlong 3-byte form
            }
            if (bytes[0] == 0xED && bytes[1] > 0x9F) {
                goto fail; // surrogates U+D800..U+DFFF - those break javascript
            }
            if (bytes[0] == 0xF0 && bytes[1] < 0x90) {
                goto fail; // overlong 4-byte form
            }
            if (bytes[0] == 0xF4 && bytes[1] > 0x8F) {
                goto fail; // above U+10FFFF
            }
            goto success;
        }

        // Not a continuation byte: the pending sequence is truncated.
        // Throw it away and fall through to treat this byte as a new start.
        self->buffer.uint = 0;
        self->utf_len = 0;
    }

    // start of a new sequence
    bytes[0] = uc;
    self->utf_j = 1;

    if (uc == 0xC0 || uc == 0xC1 || uc > 0xF4) {
        // forbidden start codes
        goto fail;
    }

    if ((uc & 0xE0) == 0xC0) {
        self->utf_len = 2;
    }
    else if ((uc & 0xF0) == 0xE0) {
        self->utf_len = 3;
    }
    else if ((uc & 0xF8) == 0xF0) {
        self->utf_len = 4;
    }
    else {
        // chars over 127 that don't start unicode sequences
        // (a continuation byte with no sequence in progress)
        goto fail;
    }

    return EMPTY_CHAR; // wait for the continuation bytes

success:;
    struct Utf8Char result = self->buffer;
    self->buffer.uint = 0; // erase the buffer
    self->utf_len = 0;
    return result;

fail:
    self->buffer.uint = 0; // erase the buffer
    self->utf_len = 0;
    return EMPTY_CHAR;
}

struct Utf8Char Utf8Iterator_Next(struct Utf8Iterator *self)
{
    char c;
    struct Utf8Char uchar;
    while (1) {
        if (self->is_progmem) {
            c = pgm_read_byte(self->source++);
        } else {
            c = *self->source++;
        }
        if (!c) break;

        uchar = Utf8Parser_Handle(&self->parser, c);
        if (uchar.uint) {
            return uchar;
        }
    }
    return EMPTY_CHAR;
}
tiny font ver 2 years ago			`#include <stdint.h>`
			`#include "utf8.h"`

			`//`
			`// Created by MightyPork on 2017/08/20.`
			`//`
			`// UTF-8 parser - collects bytes of a code point before writing them`
			`// into a screen cell.`
			`//`

			`const struct Utf8Char EMPTY_CHAR = (struct Utf8Char) {.uint = 0};`

			`// Code Points First Byte Second Byte Third Byte Fourth Byte`
			`// U+0000 - U+007F 00 - 7F`
			`// U+0080 - U+07FF C2 - DF 80 - BF`
			`// U+0800 - U+0FFF E0 *A0 - BF 80 - BF`
			`// U+1000 - U+CFFF E1 - EC 80 - BF 80 - BF`
			`// U+D000 - U+D7FF ED 80 - *9F 80 - BF`
			`// U+E000 - U+FFFF EE - EF 80 - BF 80 - BF`
			`// U+10000 - U+3FFFF F0 *90 - BF 80 - BF 80 - BF`
			`// U+40000 - U+FFFFF F1 - F3 80 - BF 80 - BF 80 - BF`
			`// U+100000 - U+10FFFF F4 80 - *8F 80 - BF 80 - BF`

			`size_t utf8_strlen(const char *text)`
			`{`
			`// TODO optimize`
			`struct Utf8Iterator iter;`
			`Utf8Iterator_Init(&iter, text);`
			`size_t num = 0;`
			`while ((Utf8Iterator_Next(&iter)).uint) {`
			`num++;`
			`}`
			`return num;`
			`}`

			`/**`
			`* Handle a received character`
			`*/`
			`struct Utf8Char Utf8Parser_Handle(struct Utf8Parser *self, char c)`
			`{`
			`uint8_t *bytes = self->buffer.bytes;`

			`uint8_t uc = (uint8_t) c;`
			`// collecting unicode glyphs...`
			`if (uc & 0x80) {`
			`if (self->utf_len == 0) {`
			`bytes[0] = uc;`
			`self->utf_j = 1;`

			`// start`
			`if (uc == 0xC0 || uc == 0xC1 || uc > 0xF4) {`
			`// forbidden start codes`
			`goto fail;`
			`}`

			`if ((uc & 0xE0) == 0xC0) {`
			`self->utf_len = 2;`
			`}`
			`else if ((uc & 0xF0) == 0xE0) {`
			`self->utf_len = 3;`
			`}`
			`else if ((uc & 0xF8) == 0xF0) {`
			`self->utf_len = 4;`
			`}`
			`else {`
			`// chars over 127 that don't start unicode sequences`
			`goto fail;`
			`}`
			`}`
			`else {`
			`if ((uc & 0xC0) != 0x80) {`
			`bytes[self->utf_j++] = uc;`
			`goto fail;`
			`}`
			`else {`
			`bytes[self->utf_j++] = uc;`
			`if (self->utf_j >= self->utf_len) {`
			`// check for bad sequences - overlong or some other problem`
			`if (bytes[0] == 0xF4 && bytes[1] > 0x8F) goto fail;`
			`if (bytes[0] == 0xF0 && bytes[1] < 0x90) goto fail;`
			`if (bytes[0] == 0xED && bytes[1] > 0x9F) goto fail;`
			`if (bytes[0] == 0xE0 && bytes[1] < 0xA0) goto fail;`

			`// trap for surrogates - those break javascript`
			`if (bytes[0] == 0xED && bytes[1] >= 0xA0 && bytes[1] <= 0xBF) goto fail;`

			`goto success;`
			`}`
			`}`
			`}`
			`}`
			`else {`
			`bytes[0] = uc;`
			`goto success;`
			`}`

			`return EMPTY_CHAR;`

			`success:;`
			`struct Utf8Char result = self->buffer;`
			`self->buffer.uint = 0; // erase the buffer`
			`self->utf_len = 0;`
			`return result;`

			`fail:`
			`self->buffer.uint = 0; // erase the buffer`
			`self->utf_len = 0;`
			`return EMPTY_CHAR;`
			`}`

			`struct Utf8Char Utf8Iterator_Next(struct Utf8Iterator *self)`
			`{`
			`char c;`
			`struct Utf8Char uchar;`
			`while (1) {`
			`if (self->is_progmem) {`
			`c = pgm_read_byte(self->source++);`
			`} else {`
			`c = *self->source++;`
			`}`
			`if (!c) break;`

			`uchar = Utf8Parser_Handle(&self->parser, c);`
			`if (uchar.uint) {`
			`return uchar;`
			`}`
			`}`
			`return EMPTY_CHAR;`
			`}`