added utf8 cache implementation, not yet used

http-comm
Ondřej Hruška 7 years ago
parent 521bc63c22
commit c4d399ccb0
  1. 1
      esphttpdconfig.mk
  2. 116
      user/utf8.c
  3. 53
      user/utf8.h

@ -47,6 +47,7 @@ GLOBAL_CFLAGS = \
-DDEBUG_HTTP=0 \
-DDEBUG_ESPFS=0 \
-DDEBUG_PERSIST=0 \
-DDEBUG_UTFCACHE=1 \
-DDEBUG_CGI=0 \
-DDEBUG_WIFI=0 \
-DDEBUG_WS=0 \

@ -2,13 +2,123 @@
// Created by MightyPork on 2017/09/10.
//
#include <esp8266.h>
#include "utf8.h"
// One cache entry: the UTF-8 bytes of a cached code point and its use counter.
typedef struct {
	// UTF-8 sequence of the code point; filled with strncpy(), so it is 0-padded
	// when shorter than 4 bytes and has no terminator when exactly 4 bytes long
	char bytes[4];
	// number of active uses; 0 means the slot is free and may be reused
	uint16_t count;
} UnicodeCacheSlot;
// The cache storage, indexed by slot number.
static UnicodeCacheSlot cache[UNICODE_CACHE_SIZE];
/*
 * Conversion between cache slot indices and look-up references.
 *
 * A reference is a single byte that must never look like printable ASCII
 * (32..126), because such bytes are passed through the cache unchanged.
 * Slots 0..31 therefore map to references 0..31 and slots 32..159 map to
 * references 127..254, so ID_TO_REF() and REF_TO_ID() are exact inverses.
 * Reference 255 decodes to slot 160, which does not exist when the cache has
 * 160 slots; callers are expected to range-check the decoded slot.
 *
 * Arguments and expansions are fully parenthesized so the macros stay correct
 * when given expressions or when used inside larger expressions.
 */
#define REF_TO_ID(c) ((u8)((c) >= 127 ? (c) - 95 : (c)))
#define ID_TO_REF(c) ((UnicodeCacheRef)((c) > 31 ? (c) + 95 : (c)))
#define IS_UNICODE_CACHE_REF(c) ((c) < 32 || (c) >= 127)
/**
 * Add a code point to the cache. ASCII is passed through.
 * If the code point is already stored, its use counter is incremented.
 *
 * Bytes below 32 are rejected with a warning and '?' is returned instead.
 * '?' is also returned when every slot is in use.
 *
 * @param bytes - utf8 bytes of one code point; at most 4 bytes are compared
 *                and copied, so a shorter sequence is expected to be
 *                0-terminated
 * @return the obtained look-up reference
 */
UnicodeCacheRef ICACHE_FLASH_ATTR
unicode_cache_add(const u8 *bytes) {
	if (bytes[0] < 32) {
		utfc_warn("utf8 cache bad char '%c'", bytes[0]);
		return '?';
	}
	if (bytes[0] < 127) return bytes[0]; // ASCII, bypass

	// The slot storage, the string helpers and "%.4s" all work with plain char,
	// so convert once here instead of mixing u8* and char* at every call.
	const char *seq = (const char *) bytes;
	u8 slot;

	// First pass: is the sequence already stored? This also matches a released
	// slot (count 0) that still holds the same bytes, which is then revived.
	for (slot = 0; slot < UNICODE_CACHE_SIZE; slot++) {
		if (strneq(cache[slot].bytes, seq, 4)) {
			cache[slot].count++;
			if (cache[slot].count == 1) {
				utfc_dbg("utf8 cache resurrect '%.4s' @ %d", seq, slot);
			} else {
				utfc_dbg("utf8 cache inc '%.4s' @ %d, %d uses", seq, slot, cache[slot].count);
			}
			return ID_TO_REF(slot);
		}
	}

	// Second pass: take the first free slot.
	for (slot = 0; slot < UNICODE_CACHE_SIZE; slot++) {
		if (cache[slot].count == 0) {
			// empty slot, store it
			strncpy(cache[slot].bytes, seq, 4); // this will zero out the remainder
			cache[slot].count = 1;
			utfc_dbg("utf8 cache new '%.4s' @ %d", seq, slot);
			return ID_TO_REF(slot);
		}
	}

	error("utf8 cache full");
	return '?'; // fallback to normal ASCII that will show to the user
}
/**
 * Look up a code point in the cache by reference. Do not change the use counter.
 *
 * A printable ASCII reference is written back unchanged as a 1-byte string.
 * A reference that decodes to a non-existent or unused slot yields "?" and
 * a false return value.
 *
 * @param ref - reference obtained earlier using unicode_cache_add()
 * @param target - buffer of size 4 to hold the result; it is 0-terminated
 *                 only when the result is shorter than 4 bytes
 * @return true if the look-up succeeded
 */
bool ICACHE_FLASH_ATTR
unicode_cache_retrieve(UnicodeCacheRef ref, u8 *target) {
	if (!IS_UNICODE_CACHE_REF(ref)) {
		// ASCII, bypass
		target[0] = ref;
		target[1] = 0;
		return true;
	}

	u8 slot = REF_TO_ID(ref);

	if (slot >= UNICODE_CACHE_SIZE) {
		// the reference does not correspond to any slot; never index the cache with it
		target[0] = '?';
		target[1] = 0;
		utfc_warn("utf8 cache bad ref %d", ref);
		return false;
	}

	if (cache[slot].count == 0) {
		// "use after free"
		target[0] = '?';
		target[1] = 0;
		utfc_warn("utf8 cache use-after-free @ %d (freed)", slot);
		return false;
	}

	utfc_dbg("utf8 cache hit '%.4s' @ %d, uses %d", cache[slot].bytes, slot, cache[slot].count);
	// copies up to 4 bytes; a shorter stored sequence brings its 0 padding along
	strncpy((char*)target, cache[slot].bytes, 4);
	return true;
}
/**
 * Remove an occurrence of a code point from the cache.
 * If the code point is used more than once, the use counter is decremented.
 * When the counter reaches zero the slot counts as free, but its bytes are
 * left in place.
 *
 * @author Ondřej Hruška <ondra@ondrovo.com>
 * @license MIT
 * @param ref - reference to remove or reduce
 * @return true if the code point was found in the cache (always true for ASCII)
 */
bool ICACHE_FLASH_ATTR
unicode_cache_remove(UnicodeCacheRef ref) {
	if (!IS_UNICODE_CACHE_REF(ref)) return true; // ASCII, bypass

	u8 slot = REF_TO_ID(ref);

	if (slot >= UNICODE_CACHE_SIZE) {
		// the reference does not correspond to any slot; never index the cache with it
		utfc_warn("utf8 cache bad ref %d", ref);
		return false;
	}

	if (cache[slot].count == 0) {
		utfc_warn("utf8 cache double-free @ %d", slot);
		return false;
	}

	cache[slot].count--;
	if (cache[slot].count) {
		utfc_dbg("utf8 cache sub '%.4s' @ %d, %d uses remain", cache[slot].bytes, slot, cache[slot].count);
	} else {
		utfc_dbg("utf8 cache del '%.4s' @ %d", cache[slot].bytes, slot);
	}
	return true;
}
/**
* Encode a code point using UTF-8
*
* @param out - output buffer (min 4 characters), will be 0-terminated if shorter than 4
* @param utf - code point 0-0x10FFFF

@ -7,6 +7,59 @@
#include <c_types.h>
// Number of slots in the UTF-8 cache.
// A look-up reference is a single byte that must not collide with printable
// ASCII (which is passed through uncached), so 160 is maximum possible.
#define UNICODE_CACHE_SIZE 160
// Look-up reference returned by unicode_cache_add(): either a plain ASCII
// character or a byte value identifying a cache slot.
typedef u8 UnicodeCacheRef;
/**
 * Add a code point to the cache. ASCII is passed through.
 * If the code point is already stored, its use counter is incremented.
 *
 * @param bytes - utf8 bytes
 * @return the obtained look-up reference; '?' if the first byte is below 32
 *         or the cache is full
 */
UnicodeCacheRef unicode_cache_add(const u8 *bytes);
/**
 * Look up a code point in the cache by reference. Do not change the use counter.
 *
 * @param ref - reference obtained earlier using unicode_cache_add()
 * @param target - buffer of size 4 to hold the result; receives "?" when the
 *                 referenced slot is not in use
 * @return true if the look-up succeeded
 */
bool unicode_cache_retrieve(UnicodeCacheRef ref, u8 *target);
/**
 * Remove an occurrence of a code point from the cache.
 * If the code point is used more than once, the use counter is decremented.
 *
 * @param ref - reference to remove or reduce
 * @return true if the code point was found in the cache
 */
bool unicode_cache_remove(UnicodeCacheRef ref);
/**
 * Encode a code point using UTF-8
 *
 * @author Ondřej Hruška <ondra@ondrovo.com>
 * @license MIT
 *
 * @param out - output buffer (min 4 characters), will be 0-terminated if shorter than 4
 * @param utf - code point 0-0x10FFFF
 * @return number of bytes on success, 0 on failure (also produces U+FFFD, which uses 3 bytes)
 */
int utf8_encode(char *out, uint32_t utf);
/*
 * Logging shortcuts for the UTF-8 cache.
 * With DEBUG_UTFCACHE unset or zero every call expands to nothing (and its
 * arguments are not evaluated); otherwise the calls are forwarded to the
 * generic warn / dbg / info loggers.
 */
#if !DEBUG_UTFCACHE
#define utfc_warn(fmt, ...)
#define utfc_dbg(fmt, ...)
#define utfc_info(fmt, ...)
#else
#define utfc_warn warn
#define utfc_dbg dbg
#define utfc_info info
#endif
#endif //ESPTERM_UTF8_H

Loading…
Cancel
Save