// ufb/utf8.c

#include <stdint.h>
#include "utf8.h"

//
// Created by MightyPork on 2017/08/20.
//
// UTF-8 parser - collects bytes of a code point before writing them
// into a screen cell.
//

const struct Utf8Char EMPTY_CHAR = (struct Utf8Char) {.uint = 0};

//  Well-formed UTF-8 byte sequences:
//
//      Code Points      First Byte Second Byte Third Byte Fourth Byte
//  U+0000 -   U+007F     00 - 7F
//  U+0080 -   U+07FF     C2 - DF    80 - BF
//  U+0800 -   U+0FFF     E0        *A0 - BF     80 - BF
//  U+1000 -   U+CFFF     E1 - EC    80 - BF     80 - BF
//  U+D000 -   U+D7FF     ED         80 - *9F    80 - BF
//  U+E000 -   U+FFFF     EE - EF    80 - BF     80 - BF
//  U+10000 -  U+3FFFF    F0        *90 - BF     80 - BF    80 - BF
//  U+40000 -  U+FFFFF    F1 - F3    80 - BF     80 - BF    80 - BF
//  U+100000 - U+10FFFF   F4         80 - *8F    80 - BF    80 - BF
//
//  (*) marks a bound that is narrower than the usual 80 - BF continuation range

size_t utf8_strlen(const char *text)
{
    // TODO optimize
    struct Utf8Iterator iter;
    Utf8Iterator_Init(&iter, text);
    size_t num = 0;
    while ((Utf8Iterator_Next(&iter)).uint) {
        num++;
    }
    return num;
}

/**
 * Feed one received byte of a UTF-8 stream to the parser.
 *
 * ASCII bytes are returned immediately. Bytes of a multi-byte sequence are
 * collected in self->buffer and the character is returned once the last byte
 * of the sequence has arrived.
 *
 * If a sequence is interrupted by a byte that is not a continuation byte
 * (10xxxxxx), the unfinished sequence is discarded and the interrupting byte
 * is handled as the start of a new character, so it is neither lost nor mixed
 * with stale bytes of the abandoned sequence.
 *
 * @param self  parser state; its buffer, utf_len and utf_j are updated
 * @param c     the received byte
 * @return the completed character, or EMPTY_CHAR when more bytes are needed
 *         or the input was rejected (forbidden lead byte, stray continuation
 *         byte, overlong form, surrogate, or a value above U+10FFFF)
 */
struct Utf8Char Utf8Parser_Handle(struct Utf8Parser *self, char c)
{
    uint8_t *bytes = self->buffer.bytes;
    const uint8_t uc = (uint8_t) c;

    if (self->utf_len != 0) {
        // in the middle of a multi-byte sequence
        if ((uc & 0xC0) == 0x80) {
            bytes[self->utf_j++] = uc;
            if (self->utf_j < self->utf_len) {
                return EMPTY_CHAR; // sequence not complete yet
            }

            // Sequence complete - the second byte has a restricted range
            // for some lead bytes.
            const uint8_t lead = bytes[0];
            const uint8_t second = bytes[1];
            if ((lead == 0xE0 && second < 0xA0)       // overlong 3-byte form
                || (lead == 0xED && second > 0x9F)    // surrogates - those break javascript
                || (lead == 0xF0 && second < 0x90)    // overlong 4-byte form
                || (lead == 0xF4 && second > 0x8F)) { // above U+10FFFF
                goto fail;
            }
            goto success;
        }

        // Not a continuation byte: drop the unfinished sequence and fall
        // through to treat this byte as the start of a new character.
        self->buffer.uint = 0;
        self->utf_len = 0;
    }

    bytes[0] = uc;

    if ((uc & 0x80) == 0) {
        // plain ASCII - a complete character on its own
        goto success;
    }

    // start of a multi-byte sequence
    self->utf_j = 1;

    if (uc == 0xC0 || uc == 0xC1 || uc > 0xF4) {
        // forbidden start codes
        goto fail;
    }

    if ((uc & 0xE0) == 0xC0) {
        self->utf_len = 2;
    }
    else if ((uc & 0xF0) == 0xE0) {
        self->utf_len = 3;
    }
    else if ((uc & 0xF8) == 0xF0) {
        self->utf_len = 4;
    }
    else {
        // continuation byte with no sequence in progress
        goto fail;
    }

    return EMPTY_CHAR;

success:;
    struct Utf8Char result = self->buffer;
    self->buffer.uint = 0; // erase the buffer
    self->utf_len = 0;
    return result;

fail:
    self->buffer.uint = 0; // erase the buffer
    self->utf_len = 0;
    return EMPTY_CHAR;
}

struct Utf8Char Utf8Iterator_Next(struct Utf8Iterator *self)
{
    char c;
    struct Utf8Char uchar;
    while (1) {
        if (self->is_progmem) {
            c = pgm_read_byte(self->source++);
        } else {
            c = *self->source++;
        }
        if (!c) break;

        uchar = Utf8Parser_Handle(&self->parser, c);
        if (uchar.uint) {
            return uchar;
        }
    }
    return EMPTY_CHAR;
}