/**
 * UTF-8 string parsing and character iteration
 *
 * Created on 2020/01/04.
 */

#ifndef LIQUIDTYPE_UTF8_H
#define LIQUIDTYPE_UTF8_H

#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#include "progmem.h"

/**
 * UTF-8 encoded character.
 *
 * Holds the encoded bytes of a single character (at most four). The anonymous
 * union lets the same storage be accessed byte-by-byte (`bytes`) or as one
 * 32-bit word (`uint`). Throughout this API a value with `uint == 0` means
 * "no character" (NIL).
 */
struct Utf8Char {
    union {
        /** character bytes; padded by zero bytes if shorter than 4 */
        uint8_t bytes[4];
        /**
         * u32 view of the bytes.
         *
         * The numeric value depends on the host byte order, so it is suited to
         * whole-value comparison and the zero test rather than to reading a
         * code point.
         */
        uint32_t uint;
    };
};

/**
 * UTF8 string parser internal state.
 *
 * Utf8Parser_Clear() zeroes every field; Utf8Parser_Handle() consumes input
 * one `char` at a time and updates this state.
 */
struct Utf8Parser {
    /** UTF-8 bytes buffer (bytes of the character being collected) */
    struct Utf8Char buffer;
    /**
     * Currently collected UTF-8 character length.
     * NOTE(review): presumably the byte count of the character in progress -
     * confirm against Utf8Parser_Handle().
     */
    uint8_t utf_len;
    /**
     * Position in the current character.
     * NOTE(review): presumably the index into buffer.bytes for the next
     * incoming byte - confirm against Utf8Parser_Handle().
     */
    uint8_t utf_j;
};

/**
 * Reset a parser to its empty state.
 *
 * Zeroes both counters and the byte buffer, discarding any partially
 * collected character.
 */
static inline void Utf8Parser_Clear(struct Utf8Parser *self) {
    self->utf_len = 0;
    self->utf_j = 0;
    self->buffer.uint = 0;
}

/**
 * Utf8 character iterator.
 *
 * Usage:
 * struct Utf8Iterator iter;
 * Utf8Iterator_Init(&iter, myString);
 *
 * struct Utf8Char uchar;
 * while ((uchar = Utf8Iterator_Next(&iter)).uint) {
 *     // do something with the char
 * }
 *
 * // Free myString if needed, it is not mutated.
 */
struct Utf8Iterator {
    /** Characters to parse. The pointer is advanced as the iterator progresses. */
    const char *source;
    /** Parser state used to assemble multi-byte characters. */
    struct Utf8Parser parser;
    /**
     * Set to true by Utf8Iterator_Init_P(), false by Utf8Iterator_Init().
     * NOTE(review): presumably selects program-memory reads of `source` -
     * confirm against Utf8Iterator_Next().
     */
    bool is_progmem;
};

/**
 * Prepare an iterator to walk `source` from its first byte.
 *
 * Stores the pointer (the string is not copied), marks the source as not
 * being in program memory, and resets the embedded parser.
 */
static inline void Utf8Iterator_Init(struct Utf8Iterator *self, const char *source) {
    self->source = source;
    self->is_progmem = false;
    Utf8Parser_Clear(&self->parser);
}

/**
 * Variant of Utf8Iterator_Init() that flags the source as program memory.
 *
 * Performs the regular initialisation, then overrides is_progmem to true.
 * NOTE(review): presumably `source` must point into program (flash) memory
 * as provided through progmem.h - confirm against Utf8Iterator_Next().
 */
static inline void Utf8Iterator_Init_P(struct Utf8Iterator *self, const char *source) {
    Utf8Iterator_Init(self, source);
    /* Must come after Utf8Iterator_Init(), which resets the flag to false. */
    self->is_progmem = true;
}

/**
 * Length of a UTF-8 string.
 *
 * NOTE(review): presumably counts characters (not bytes) of a NUL-terminated
 * string - the implementation is not visible here; confirm before relying on it.
 */
size_t utf8_strlen(const char *text);

/**
 * Get the next character from the iterator.
 *
 * Returns an empty character (uint == 0) if there are no more characters to
 * parse, which makes the result usable directly as a loop condition.
 *
 * Invalid characters are skipped.
 */
struct Utf8Char Utf8Iterator_Next(struct Utf8Iterator *self);

/**
 * Parse a character.
 *
 * Feeds one input `char` (`c`) into the parser state `self`.
 *
 * The returned struct contains NIL (uint == 0) if no character is yet available.
 *
 * ASCII is passed through, utf-8 is collected and returned in one piece.
 */
struct Utf8Char Utf8Parser_Handle(struct Utf8Parser *self, char c);

#endif //LIQUIDTYPE_UTF8_H