/**
 * UTF-8 string parsing and character iteration
 *
 * Created on 2020/01/04.
 */

/* Include guard; closed by the matching #endif at the end of this header. */
#ifndef LIQUIDTYPE_UTF8_H
#define LIQUIDTYPE_UTF8_H

#include <stddef.h>
#include <stdint.h>

/**
 * UTF-8 encoded character.
 *
 * Holds the raw bytes of one encoded character. The same four bytes can be
 * read individually through `bytes` or all at once through `uint`.
 */
struct Utf8Char {
    union {
        /** Character bytes; padded by zero bytes if shorter than 4. */
        uint8_t bytes[4];
        /**
         * u32 view of the bytes. Its numeric value depends on the host byte
         * order, so use it for whole-character checks such as comparing
         * against 0 (no character) rather than as a code point.
         */
        uint32_t uint;
    };
};

/** UTF8 string parser internal state */
|
|
|
|
struct Utf8Parser {
|
|
|
|
/** UTF-8 bytes buffer */
|
|
|
|
struct Utf8Char buffer;
|
|
|
|
/** Currently collected UTF-8 character length */
|
|
|
|
uint8_t utf_len;
|
|
|
|
/** Position in the current character */
|
|
|
|
uint8_t utf_j;
|
|
|
|
};
|
|
|
|
|
|
|
|
static inline void Utf8Parser_Clear(struct Utf8Parser *self) {
|
|
|
|
self->buffer.uint = 0;
|
|
|
|
self->utf_j = 0;
|
|
|
|
self->utf_len = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Utf8 character iterator.
|
|
|
|
*
|
|
|
|
* Usage:
|
|
|
|
* struct Utf8Iterator iter;
|
|
|
|
* Utf8Iterator_Init(&iter, myString);
|
|
|
|
*
|
|
|
|
* union Utf8Char uchar;
|
|
|
|
* while ((uchar = Utf8Iterator_Next(&iter)).uint) {
|
|
|
|
* // do something with the char
|
|
|
|
* }
|
|
|
|
*
|
|
|
|
* // Free myString if needed, it is not mutated.
|
|
|
|
*/
|
|
|
|
struct Utf8Iterator {
|
|
|
|
/* Characters to parse. The pointer is advanced as the iterator progresses. */
|
|
|
|
const char *source;
|
|
|
|
struct Utf8Parser parser;
|
|
|
|
};
|
|
|
|
|
|
|
|
static inline void Utf8Iterator_Init(struct Utf8Iterator *self, const char *source) {
|
|
|
|
Utf8Parser_Clear(&self->parser);
|
|
|
|
self->source = source;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * Get the next character from the iterator.
 *
 * Returns an empty character (uint == 0) if there are no more characters
 * to parse.
 *
 * Invalid characters are skipped.
 */
struct Utf8Char Utf8Iterator_Next(struct Utf8Iterator *self);

/**
 * Parse a character.
 *
 * Feed the input one byte at a time. The returned struct contains NIL
 * (uint == 0) if no character is yet available.
 *
 * ASCII is passed through, utf-8 is collected and returned in one piece.
 */
struct Utf8Char Utf8Parser_Handle(struct Utf8Parser *self, char c);

#endif // LIQUIDTYPE_UTF8_H