You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
208 lines
5.1 KiB
208 lines
5.1 KiB
//
|
|
// Created by MightyPork on 2017/09/10.
|
|
//
|
|
|
|
#include <esp8266.h>
|
|
#include "utf8.h"
|
|
|
|
typedef struct __attribute__((packed)) {
|
|
char bytes[4];
|
|
uint16_t count;
|
|
} UnicodeCacheSlot;
|
|
|
|
static UnicodeCacheSlot cache[UNICODE_CACHE_SIZE];
|
|
|
|
#define REF_TO_ID(c) (u8)((c) >= 127 ? (c) - 95 : (c))
|
|
#define ID_TO_REF(c) (UnicodeCacheRef)((c) > 31 ? (c) + 95 : c)
|
|
|
|
/**
|
|
* Clear the entire cache
|
|
* @return
|
|
*/
|
|
void ICACHE_FLASH_ATTR
|
|
unicode_cache_clear(void)
|
|
{
|
|
utfc_dbg("utf8 cache clear!");
|
|
for (int slot = 0; slot < UNICODE_CACHE_SIZE; slot++) {
|
|
cache[slot].count=0;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Add a code point to the cache. ASCII is passed through.
|
|
* If the code point is already stored, its use counter is incremented.
|
|
*
|
|
* @param bytes - utf8 bytes
|
|
* @return the obtained look-up reference
|
|
*/
|
|
UnicodeCacheRef ICACHE_FLASH_ATTR
|
|
unicode_cache_add(const u8 *bytes)
|
|
{
|
|
if (bytes[0] < 32) {
|
|
utfc_warn("utf8 cache bad char '%c'", bytes[0]);
|
|
return '?';
|
|
}
|
|
if (bytes[0] < 127) return bytes[0]; // ASCII, bypass
|
|
|
|
u8 slot;
|
|
for (slot = 0; slot < UNICODE_CACHE_SIZE; slot++) {
|
|
if (strneq(cache[slot].bytes, bytes, 4)) {
|
|
cache[slot].count++;
|
|
if (cache[slot].count == 1) {
|
|
utfc_dbg("utf8 cache new '%.4s' @ %d", bytes, slot);
|
|
} else {
|
|
utfc_dbg("utf8 cache inc '%.4s' @ %d, %d uses", bytes, slot, cache[slot].count);
|
|
}
|
|
goto suc;
|
|
}
|
|
}
|
|
for (slot = 0; slot < UNICODE_CACHE_SIZE; slot++) {
|
|
if (cache[slot].count==0) {
|
|
// empty slot, store it
|
|
strncpy(cache[slot].bytes, (const char *) bytes, 4); // this will zero out the remainder
|
|
cache[slot].count = 1;
|
|
utfc_dbg("utf8 cache new '%.4s' @ %d", bytes, slot);
|
|
goto suc;
|
|
}
|
|
}
|
|
error("utf8 cache full");
|
|
return '?'; // fallback to normal ASCII that will show to the user
|
|
suc:
|
|
return ID_TO_REF(slot);
|
|
}
|
|
|
|
/**
|
|
* Increment a reference
|
|
*
|
|
* @param ref - reference
|
|
* @return success
|
|
*/
|
|
bool ICACHE_FLASH_ATTR
|
|
unicode_cache_inc(UnicodeCacheRef ref)
|
|
{
|
|
if (!IS_UNICODE_CACHE_REF(ref)) return true; // ASCII
|
|
|
|
int slot = REF_TO_ID(ref);
|
|
if (cache[slot].count == 0) {
|
|
utfc_warn("utf8 cache inc-after-free ref @ %d", ref);
|
|
return false;
|
|
}
|
|
cache[slot].count++;
|
|
utfc_dbg("utf8 cache inc '%.4s' @ %d, %d uses", cache[slot].bytes, slot, cache[slot].count);
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Look up a code point in the cache by reference. Do not change the use counter.
|
|
*
|
|
* @param ref - reference obtained earlier using unicode_cache_add()
|
|
* @param target - buffer of size 4 to hold the result.
|
|
* @return true if the look-up succeeded
|
|
*/
|
|
bool ICACHE_FLASH_ATTR
|
|
unicode_cache_retrieve(UnicodeCacheRef ref, u8 *target)
|
|
{
|
|
if (!IS_UNICODE_CACHE_REF(ref)) {
|
|
// ASCII, bypass
|
|
target[0] = ref;
|
|
target[1] = 0;
|
|
return true;
|
|
}
|
|
|
|
u8 slot = REF_TO_ID(ref);
|
|
if (cache[slot].count == 0) {
|
|
// "use after free"
|
|
target[0] = '?';
|
|
target[1] = 0;
|
|
utfc_warn("utf8 cache use-after-free @ %d ('%.4s')", slot, cache[slot].bytes);
|
|
return false;
|
|
}
|
|
|
|
//utfc_dbg("utf8 cache hit '%.4s' @ %d", cache[slot].bytes, slot, cache[slot].count);
|
|
strncpy((char*)target, cache[slot].bytes, 4);
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Remove an occurence of a code point from the cache.
|
|
* If the code point is used more than once, the use counter is decremented.
|
|
*
|
|
* @param ref - reference to remove or reduce
|
|
* @return true if the code point was found in the cache
|
|
*/
|
|
bool ICACHE_FLASH_ATTR
|
|
unicode_cache_remove(UnicodeCacheRef ref)
|
|
{
|
|
if (!IS_UNICODE_CACHE_REF(ref)) return true; // ASCII, bypass
|
|
|
|
u8 slot = REF_TO_ID(ref);
|
|
|
|
if (cache[slot].count == 0) {
|
|
utfc_warn("utf8 cache double-free @ %d ('%.4s')", slot, cache[slot].bytes);
|
|
return false;
|
|
}
|
|
|
|
cache[slot].count--;
|
|
if (cache[slot].count) {
|
|
utfc_dbg("utf8 cache sub '%.4s' @ %d, %d uses remain", cache[slot].bytes, slot, cache[slot].count);
|
|
} else {
|
|
utfc_dbg("utf8 cache del '%.4s' @ %d", cache[slot].bytes, slot);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|
|
/**
|
|
* Encode a code point using UTF-8
|
|
*
|
|
* @param out - output buffer (min 4 characters), will be 0-terminated if shorten than 4
|
|
* @param utf - code point 0-0x10FFFF
|
|
* @return number of bytes on success, 0 on failure (also produces U+FFFD, which uses 3 bytes)
|
|
*/
|
|
int ICACHE_FLASH_ATTR
|
|
utf8_encode(char *out, uint32_t utf, bool surrogateFix)
|
|
{
|
|
// Skip the surrogate block (wtf, unicode???)
|
|
if (surrogateFix && utf >= 0xD800) {
|
|
utf += 0x800;
|
|
}
|
|
|
|
if (utf <= 0x7F) {
|
|
// Plain ASCII
|
|
out[0] = (char) utf;
|
|
out[1] = 0;
|
|
return 1;
|
|
}
|
|
else if (utf <= 0x07FF) {
|
|
// 2-byte unicode
|
|
out[0] = (char) (((utf >> 6) & 0x1F) | 0xC0);
|
|
out[1] = (char) (((utf >> 0) & 0x3F) | 0x80);
|
|
out[2] = 0;
|
|
return 2;
|
|
}
|
|
else if (utf <= 0xFFFF) {
|
|
// 3-byte unicode
|
|
out[0] = (char) (((utf >> 12) & 0x0F) | 0xE0);
|
|
out[1] = (char) (((utf >> 6) & 0x3F) | 0x80);
|
|
out[2] = (char) (((utf >> 0) & 0x3F) | 0x80);
|
|
out[3] = 0;
|
|
return 3;
|
|
}
|
|
else if (utf <= 0x10FFFF) {
|
|
// 4-byte unicode
|
|
out[0] = (char) (((utf >> 18) & 0x07) | 0xF0);
|
|
out[1] = (char) (((utf >> 12) & 0x3F) | 0x80);
|
|
out[2] = (char) (((utf >> 6) & 0x3F) | 0x80);
|
|
out[3] = (char) (((utf >> 0) & 0x3F) | 0x80);
|
|
// out[4] = 0;
|
|
return 4;
|
|
}
|
|
else {
|
|
// error - use replacement character
|
|
out[0] = (char) 0xEF;
|
|
out[1] = (char) 0xBF;
|
|
out[2] = (char) 0xBD;
|
|
out[3] = 0;
|
|
return 0;
|
|
}
|
|
}
|
|
|