From c4d399ccb0c2c58e2fdbf040826a7740a7c4f820 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ond=C5=99ej=20Hru=C5=A1ka?=
Date: Mon, 11 Sep 2017 02:07:17 +0200
Subject: [PATCH] added utf8 cache implementation, not yet used

---
 esphttpdconfig.mk |   1 +
 user/utf8.c       | 116 ++++++++++++++++++++++++++++++++++++++++++++--
 user/utf8.h       |  53 +++++++++++++++++++++
 3 files changed, 167 insertions(+), 3 deletions(-)

diff --git a/esphttpdconfig.mk b/esphttpdconfig.mk
index 56a017a..6672162 100644
--- a/esphttpdconfig.mk
+++ b/esphttpdconfig.mk
@@ -47,6 +47,7 @@ GLOBAL_CFLAGS = \
 	-DDEBUG_HTTP=0 \
 	-DDEBUG_ESPFS=0 \
 	-DDEBUG_PERSIST=0 \
+	-DDEBUG_UTFCACHE=1 \
 	-DDEBUG_CGI=0 \
 	-DDEBUG_WIFI=0 \
 	-DDEBUG_WS=0 \
diff --git a/user/utf8.c b/user/utf8.c
index 57a47df..71e0ed4 100644
--- a/user/utf8.c
+++ b/user/utf8.c
@@ -2,13 +2,123 @@
 // Created by MightyPork on 2017/09/10.
 //
 
+#include
 #include "utf8.h"
 
+typedef struct {
+	char bytes[4]; // UTF-8 sequence, zero-padded when shorter than 4 bytes
+	uint16_t count; // use counter; 0 means the slot is free
+} UnicodeCacheSlot;
+
+static UnicodeCacheSlot cache[UNICODE_CACHE_SIZE];
+
+#define REF_TO_ID(c) ((u8)((c) >= 127 ? (c) - 95 : (c))) // refs 0..31 -> slots 0..31, refs 127.. -> slots 32.. (ref 255 -> 160, never handed out)
+#define ID_TO_REF(c) ((UnicodeCacheRef)((c) > 31 ? (c) + 95 : (c))) // exact inverse of REF_TO_ID; skips printable ASCII 32..126
+#define IS_UNICODE_CACHE_REF(c) ((c) < 32 || (c) >= 127) // everything outside 32..126 is a cache reference
+
 /**
- * Encode a code point using UTF-8
+ * Add a code point to the cache. ASCII is passed through.
+ * If the code point is already stored, its use counter is incremented.
+ *
+ * @param bytes - utf8 bytes of one character (at most 4 are compared and stored)
+ * @return the obtained look-up reference, or '?' for a control byte or when the cache is full
+ */
+UnicodeCacheRef ICACHE_FLASH_ATTR
+unicode_cache_add(const u8 *bytes) {
+	if (bytes[0] < 32) {
+		utfc_warn("utf8 cache bad char '%c'", bytes[0]);
+		return '?';
+	}
+	if (bytes[0] < 127) return bytes[0]; // ASCII, bypass
+
+	u8 slot;
+	for (slot = 0; slot < UNICODE_CACHE_SIZE; slot++) { // first pass: reuse a matching entry, even one whose counter dropped to 0
+		if (strneq(cache[slot].bytes, bytes, 4)) {
+			cache[slot].count++;
+			if (cache[slot].count == 1) {
+				utfc_dbg("utf8 cache resurrect '%.4s' @ %d", bytes, slot);
+			} else {
+				utfc_dbg("utf8 cache inc '%.4s' @ %d, %d uses", bytes, slot, cache[slot].count);
+			}
+			goto suc;
+		}
+	}
+	for (slot = 0; slot < UNICODE_CACHE_SIZE; slot++) { // second pass: claim the first unused slot
+		if (cache[slot].count==0) {
+			// empty slot, store it
+			strncpy(cache[slot].bytes, (const char *)bytes, 4); // this will zero out the remainder
+			cache[slot].count = 1;
+			utfc_dbg("utf8 cache new '%.4s' @ %d", bytes, slot);
+			goto suc;
+		}
+	}
+	error("utf8 cache full");
+	return '?'; // fallback to normal ASCII that will show to the user
+	suc:
+	return ID_TO_REF(slot);
+}
+
+/**
+ * Look up a code point in the cache by reference. Do not change the use counter.
+ *
+ * @param ref - reference obtained earlier using unicode_cache_add()
+ * @param target - buffer of size 4 to hold the result (zero-padded; not terminated when all 4 bytes are used)
+ * @return true if the look-up succeeded; on failure target holds "?" and false is returned
+ */
+bool ICACHE_FLASH_ATTR
+unicode_cache_retrieve(UnicodeCacheRef ref, u8 *target) {
+	if (!IS_UNICODE_CACHE_REF(ref)) {
+		// ASCII, bypass
+		target[0] = ref;
+		target[1] = 0;
+		return true;
+	}
+
+	u8 slot = REF_TO_ID(ref);
+
+	if (slot >= UNICODE_CACHE_SIZE || cache[slot].count == 0) {
+		// reference outside the table, or "use after free"
+		target[0] = '?';
+		target[1] = 0;
+		utfc_warn("utf8 cache use-after-free @ %d (freed)", slot);
+		return false;
+	}
+
+	utfc_dbg("utf8 cache hit '%.4s' @ %d, uses %d", cache[slot].bytes, slot, cache[slot].count);
+	strncpy((char*)target, cache[slot].bytes, 4);
+	return true;
+}
+
+/**
+ * Remove an occurrence of a code point from the cache.
+ * If the code point is used more than once, the use counter is decremented.
  *
- * @author Ondřej Hruška
- * @license MIT
+ * @param ref - reference to remove or reduce
+ * @return true if the code point was found in the cache (always true for plain ASCII)
+ */
+bool ICACHE_FLASH_ATTR
+unicode_cache_remove(UnicodeCacheRef ref) {
+	if (!IS_UNICODE_CACHE_REF(ref)) return true; // ASCII, bypass
+
+	u8 slot = REF_TO_ID(ref);
+
+	if (slot >= UNICODE_CACHE_SIZE || cache[slot].count == 0) { // never touch cache[] with a slot outside the table
+		utfc_warn("utf8 cache double-free @ %d", slot);
+		return false;
+	}
+
+	cache[slot].count--;
+	if (cache[slot].count) {
+		utfc_dbg("utf8 cache sub '%.4s' @ %d, %d uses remain", cache[slot].bytes, slot, cache[slot].count);
+	} else {
+		utfc_dbg("utf8 cache del '%.4s' @ %d", cache[slot].bytes, slot); // bytes stay in the slot until it is reused
+	}
+	return true;
+}
+
+
+/**
+ * Encode a code point using UTF-8
  *
  * @param out - output buffer (min 4 characters), will be 0-terminated if shorten than 4
  * @param utf - code point 0-0x10FFFF
diff --git a/user/utf8.h b/user/utf8.h
index a0de3a8..866d2df 100644
--- a/user/utf8.h
+++ b/user/utf8.h
@@ -7,6 +7,59 @@
 
 #include
 
+// 160 is maximum possible
+#define UNICODE_CACHE_SIZE 160
+
+typedef u8 UnicodeCacheRef;
+
+/**
+ * Add a code point to the cache. ASCII is passed through.
+ * If the code point is already stored, its use counter is incremented.
+ *
+ * @param bytes - utf8 bytes
+ * @return the obtained look-up reference
+ */
+UnicodeCacheRef unicode_cache_add(const u8 *bytes);
+
+/**
+ * Look up a code point in the cache by reference. Do not change the use counter.
+ *
+ * @param ref - reference obtained earlier using unicode_cache_add()
+ * @param target - buffer of size 4 to hold the result.
+ * @return true if the look-up succeeded
+ */
+bool unicode_cache_retrieve(UnicodeCacheRef ref, u8 *target);
+
+/**
+ * Remove an occurrence of a code point from the cache.
+ * If the code point is used more than once, the use counter is decremented.
+ *
+ * @param ref - reference to remove or reduce
+ * @return true if the code point was found in the cache
+ */
+bool unicode_cache_remove(UnicodeCacheRef ref);
+
+
+/**
+ * Encode a code point using UTF-8
+ *
+ * @author Ondřej Hruška
+ * @license MIT
+ *
+ * @param out - output buffer (min 4 characters), will be 0-terminated if shorter than 4
+ * @param utf - code point 0-0x10FFFF
+ * @return number of bytes on success, 0 on failure (also produces U+FFFD, which uses 3 bytes)
+ */
 int utf8_encode(char *out, uint32_t utf);
 
+#if DEBUG_UTFCACHE // cache logging on: utfc_* are aliases of warn/dbg/info
+#define utfc_warn warn
+#define utfc_dbg dbg
+#define utfc_info info
+#else // cache logging off: utfc_* calls expand to nothing
+#define utfc_warn(fmt, ...)
+#define utfc_dbg(fmt, ...)
+#define utfc_info(fmt, ...)
+#endif
+
 #endif //ESPTERM_UTF8_H