From c4d399ccb0c2c58e2fdbf040826a7740a7c4f820 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ond=C5=99ej=20Hru=C5=A1ka?= <ondra@ondrovo.com>
Date: Mon, 11 Sep 2017 02:07:17 +0200
Subject: [PATCH] added utf8 cache implementation, not yet used

---
 esphttpdconfig.mk |   1 +
 user/utf8.c       | 116 ++++++++++++++++++++++++++++++++++++++++++++--
 user/utf8.h       |  53 +++++++++++++++++++++
 3 files changed, 167 insertions(+), 3 deletions(-)

diff --git a/esphttpdconfig.mk b/esphttpdconfig.mk
index 56a017a..6672162 100644
--- a/esphttpdconfig.mk
+++ b/esphttpdconfig.mk
@@ -47,6 +47,7 @@ GLOBAL_CFLAGS = \
     -DDEBUG_HTTP=0 \
     -DDEBUG_ESPFS=0 \
     -DDEBUG_PERSIST=0 \
+    -DDEBUG_UTFCACHE=1 \
     -DDEBUG_CGI=0 \
     -DDEBUG_WIFI=0 \
     -DDEBUG_WS=0 \
diff --git a/user/utf8.c b/user/utf8.c
index 57a47df..71e0ed4 100644
--- a/user/utf8.c
+++ b/user/utf8.c
@@ -2,13 +2,123 @@
 // Created by MightyPork on 2017/09/10.
 //
 
+#include <esp8266.h>
 #include "utf8.h"
 
+typedef struct {
+	char bytes[4]; // UTF-8 bytes of the stored character, zero-padded when shorter than 4 bytes
+	uint16_t count; // use counter; 0 means the slot is free and may be overwritten
+} UnicodeCacheSlot;
+
+static UnicodeCacheSlot cache[UNICODE_CACHE_SIZE]; // slot storage, indexed by the slot number computed by REF_TO_ID()
+
+#define REF_TO_ID(c) (u8)((c) >= 127 ? (c) - 95 : (c)) // ref -> slot: refs 0..31 are slots 0..31, refs 127.. are slots 32.. (inverse of ID_TO_REF)
+#define ID_TO_REF(c) (UnicodeCacheRef)((c) > 31 ? (c) + 95 : (c)) // slot -> ref: skips 32..126 so a ref never collides with printable ASCII
+#define IS_UNICODE_CACHE_REF(c) ((c) < 32 || (c) >= 127) // true when c lies outside printable ASCII, i.e. it denotes a cache slot
+
 /**
- * Encode a code point using UTF-8
+ * Add a code point to the cache. ASCII is passed through.
+ * If the code point is already stored, its use counter is incremented.
+ *
+ * @param bytes - utf8 bytes
+ * @return the obtained look-up reference
+ */
+UnicodeCacheRef ICACHE_FLASH_ATTR
+unicode_cache_add(const u8 *bytes) {
+	const u8 lead = bytes[0];
+	if (lead < 32) {
+		// control characters are neither stored nor passed through
+		utfc_warn("utf8 cache bad char '%c'", lead);
+		return '?';
+	}
+	if (lead < 127) return lead; // ASCII, bypass
+
+	u8 idx;
+	// first pass: look for a slot that already holds these bytes (in use, or freed and not yet overwritten)
+	for (idx = 0; idx < UNICODE_CACHE_SIZE; idx++) {
+		if (strneq(cache[idx].bytes, bytes, 4)) break;
+	}
+	if (idx < UNICODE_CACHE_SIZE) {
+		if (++cache[idx].count == 1) {
+			utfc_dbg("utf8 cache resurrect '%.4s' @ %d", bytes, idx);
+		} else {
+			utfc_dbg("utf8 cache inc '%.4s' @ %d, %d uses", bytes, idx, cache[idx].count);
+		}
+		return ID_TO_REF(idx);
+	}
+	// second pass: claim the first slot whose use counter is zero
+	for (idx = 0; idx < UNICODE_CACHE_SIZE && cache[idx].count != 0; idx++) {}
+	if (idx == UNICODE_CACHE_SIZE) {
+		error("utf8 cache full");
+		return '?'; // fallback to normal ASCII that will show to the user
+	}
+	strncpy(cache[idx].bytes, bytes, 4); // this will zero out the remainder
+	cache[idx].count = 1;
+	utfc_dbg("utf8 cache new '%.4s' @ %d", bytes, idx);
+	return ID_TO_REF(idx);
+}
+
+/**
+ * Look up a code point in the cache by reference. The use counter is not changed.
+ *
+ * @param ref - reference obtained earlier using unicode_cache_add()
+ * @param target - buffer of at least 4 bytes for the result; 0-terminated only if the result is shorter than 4 bytes
+ * @return true if the look-up succeeded; on failure "?" is stored in target and false is returned
+ */
+bool ICACHE_FLASH_ATTR
+unicode_cache_retrieve(UnicodeCacheRef ref, u8 *target) {
+	if (!IS_UNICODE_CACHE_REF(ref)) {
+		// ASCII, bypass
+		target[0] = ref;
+		target[1] = 0;
+		return true;
+	}
+
+	u8 slot = REF_TO_ID(ref);
+
+	if (slot >= UNICODE_CACHE_SIZE || cache[slot].count == 0) {
+		// reference outside the cache array, or "use after free"
+		target[0] = '?';
+		target[1] = 0;
+		utfc_warn("utf8 cache use-after-free @ %d (freed)", slot);
+		return false;
+	}
+
+	utfc_dbg("utf8 cache hit '%.4s' @ %d, uses %d", cache[slot].bytes, slot, cache[slot].count);
+	strncpy((char*)target, cache[slot].bytes, 4);
+	return true;
+}
+
+/**
+ * Remove an occurrence of a code point from the cache.
+ * If the code point is used more than once, the use counter is decremented.
  *
- * @author Ondřej Hruška <ondra@ondrovo.com>
- * @license MIT
+ * @param ref - reference to remove or reduce
+ * @return true if the code point was found in the cache
+ */
+bool ICACHE_FLASH_ATTR
+unicode_cache_remove(UnicodeCacheRef ref) {
+	if (!IS_UNICODE_CACHE_REF(ref)) return true; // ASCII, bypass
+
+	u8 slot = REF_TO_ID(ref);
+
+	if (slot >= UNICODE_CACHE_SIZE || cache[slot].count == 0) { // never index past the cache; a zero counter means already freed
+		utfc_warn("utf8 cache double-free @ %d", slot);
+		return false;
+	}
+
+	cache[slot].count--;
+	if (cache[slot].count) {
+		utfc_dbg("utf8 cache sub '%.4s' @ %d, %d uses remain", cache[slot].bytes, slot, cache[slot].count);
+	} else {
+		utfc_dbg("utf8 cache del '%.4s' @ %d", cache[slot].bytes, slot); // bytes are left in place, so a later add of the same bytes reuses this slot
+	}
+	return true;
+}
+
+
+/**
+ * Encode a code point using UTF-8
  *
  * @param out - output buffer (min 4 characters), will be 0-terminated if shorten than 4
  * @param utf - code point 0-0x10FFFF
diff --git a/user/utf8.h b/user/utf8.h
index a0de3a8..866d2df 100644
--- a/user/utf8.h
+++ b/user/utf8.h
@@ -7,6 +7,59 @@
 
 #include <c_types.h>
 
+// 160 is maximum possible
+#define UNICODE_CACHE_SIZE 160
+
+typedef u8 UnicodeCacheRef;
+
+/**
+ * Add a code point to the cache. ASCII is passed through.
+ * If the code point is already stored, its use counter is incremented.
+ *
+ * @param bytes - utf8 bytes
+ * @return the obtained look-up reference
+ */
+UnicodeCacheRef unicode_cache_add(const u8 *bytes);
+
+/**
+ * Look up a code point in the cache by reference. Do not change the use counter.
+ *
+ * @param ref - reference obtained earlier using unicode_cache_add()
+ * @param target - buffer of size 4 to hold the result.
+ * @return true if the look-up succeeded
+ */
+bool unicode_cache_retrieve(UnicodeCacheRef ref, u8 *target);
+
+/**
+ * Remove an occurrence of a code point from the cache.
+ * If the code point is used more than once, the use counter is decremented.
+ *
+ * @param ref - reference to remove or reduce
+ * @return true if the code point was found in the cache
+ */
+bool unicode_cache_remove(UnicodeCacheRef ref);
+
+
+/**
+ * Encode a code point using UTF-8
+ *
+ * @author Ondřej Hruška <ondra@ondrovo.com>
+ * @license MIT
+ *
+ * @param out - output buffer (min 4 characters), will be 0-terminated if shorter than 4
+ * @param utf - code point 0-0x10FFFF
+ * @return number of bytes on success, 0 on failure (also produces U+FFFD, which uses 3 bytes)
+ */
 int utf8_encode(char *out, uint32_t utf);
 
+#if DEBUG_UTFCACHE // cache logging enabled: utfc_* forward to warn/dbg/info (defined elsewhere)
+#define utfc_warn warn
+#define utfc_dbg dbg
+#define utfc_info info
+#else // cache logging disabled: utfc_* calls expand to nothing
+#define utfc_warn(fmt, ...)
+#define utfc_dbg(fmt, ...)
+#define utfc_info(fmt, ...)
+#endif
+
 #endif //ESPTERM_UTF8_H