/** * UTF-8 string parsing and character iteration * * Created on 2020/01/04. */ #ifndef LIQUIDTYPE_UTF8_H #define LIQUIDTYPE_UTF8_H #include #include #include #include "progmem.h" /** * UTF-8 encoded character. */ struct Utf8Char { union { /** character bytes; padded by zero bytes if shorter than 4 */ uint8_t bytes[4]; /** u32 view of the bytes */ uint32_t uint; }; }; /** UTF8 string parser internal state */ struct Utf8Parser { /** UTF-8 bytes buffer */ struct Utf8Char buffer; /** Currently collected UTF-8 character length */ uint8_t utf_len; /** Position in the current character */ uint8_t utf_j; }; static inline void Utf8Parser_Clear(struct Utf8Parser *self) { self->buffer.uint = 0; self->utf_j = 0; self->utf_len = 0; } /** * Utf8 character iterator. * * Usage: * struct Utf8Iterator iter; * Utf8Iterator_Init(&iter, myString); * * union Utf8Char uchar; * while ((uchar = Utf8Iterator_Next(&iter)).uint) { * // do something with the char * } * * // Free myString if needed, it is not mutated. */ struct Utf8Iterator { /* Characters to parse. The pointer is advanced as the iterator progresses. */ const char *source; struct Utf8Parser parser; bool is_progmem; }; static inline void Utf8Iterator_Init(struct Utf8Iterator *self, const char *source) { Utf8Parser_Clear(&self->parser); self->source = source; self->is_progmem = false; } static inline void Utf8Iterator_Init_P(struct Utf8Iterator *self, const char *source) { Utf8Iterator_Init(self, source); self->is_progmem = true; } size_t utf8_strlen(const char *text); /** * Get the next character from the iterator; Returns empty character if there are no more characters to parse. * * Invalid characters are skipped. */ struct Utf8Char Utf8Iterator_Next(struct Utf8Iterator *self); /** * Parse a character. * * The returned struct contains NIL (uint == 0) if no character is yet available. * * ASCII is passed through, utf-8 is collected and returned in one piece. */ struct Utf8Char Utf8Parser_Handle(struct Utf8Parser *self, char c); #endif //LIQUIDTYPE_UTF8_H