nostrdb

an unfairly fast embedded nostr database backed by lmdb
git clone git://jb55.com/nostrdb
Log | Files | Refs | Submodules | README | LICENSE

binmoji.c (7195B)


      1 #include <assert.h>
      2 #include <stddef.h>
      3 #include <stdint.h>
      4 #include <stdio.h>
      5 #include <stdlib.h>
      6 #include <string.h>
      7 
      8 #include "binmoji.h"
      9 
/*
 * Bit layout of the 64-bit id produced by binmoji_encode() and unpacked by
 * binmoji_decode(), as implied by the shift/mask pairs below:
 *
 *   bits 42..63  primary codepoint (22 bits)
 *   bits 10..41  component hash    (32 bits)
 *   bits  7..9   skin tone 1       (3 bits)
 *   bits  4..6   skin tone 2       (3 bits)
 *   bits  0..3   flags             (4 bits)
 */

/* Position of each field's least significant bit within the id. */
#define PRIMARY_CP_SHIFT 42
#define HASH_SHIFT 10
#define TONE1_SHIFT 7
#define TONE2_SHIFT 4
#define FLAGS_SHIFT 0

/* Field widths, expressed as masks applied to the unshifted value. */
#define PRIMARY_CP_MASK 0x3FFFFF
#define HASH_MASK 0xFFFFFFFF
#define TONE_MASK 0x7
#define FLAGS_MASK 0xF
     20 
/*
 * One row of the lookup table used to turn a component hash back into the
 * list of component codepoints.
 */
typedef struct {
	/* Search key; compared against the hash being looked up via bsearch. */
	uint32_t hash;
	/* Number of leading entries of components[] copied out on a match. */
	size_t count;
	/* Component codepoints; presumably the sequence the hash was computed
	 * from -- confirm against the table generator. */
	uint32_t components[16];
} EmojiHashEntry;
     26 
     27 #include "binmoji_table.h"
     28 
     29 const size_t num_hash_entries =
     30     sizeof(binmoji_table) / sizeof(binmoji_table[0]);
     31 
     32 static uint32_t crc32(const uint32_t *data, size_t length)
     33 {
     34 	uint32_t item, bit, crc = 0xFFFFFFFF;
     35 	size_t i;
     36 	int j;
     37 
     38 	if (data == NULL || length == 0)
     39 		return 0;
     40 	for (i = 0; i < length; ++i) {
     41 		item = data[i];
     42 		for (j = 0; j < 32; ++j) {
     43 			bit = (item >> (31 - j)) & 1;
     44 			if ((crc >> 31) ^ bit) {
     45 				crc = (crc << 1) ^ 0x04C11DB7;
     46 			} else {
     47 				crc = (crc << 1);
     48 			}
     49 		}
     50 	}
     51 	return crc;
     52 }
     53 
     54 static int is_base_emoji(uint32_t codepoint)
     55 {
     56 	if (codepoint >= 0x1F3FB && codepoint <= 0x1F3FF) /* Skin Tones */
     57 		return 0;
     58 	if (codepoint == 0x200D) /* Zero Width Joiner */
     59 		return 0;
     60 	return 1;
     61 }
     62 
     63 void binmoji_parse(const char *emoji_str, struct binmoji *binmoji)
     64 {
     65 	const unsigned char *s;
     66 	memset(binmoji, 0, sizeof(struct binmoji));
     67 	s = (const unsigned char *)emoji_str;
     68 
     69 	while (*s) {
     70 		uint32_t codepoint = 0;
     71 		int len = 0;
     72 		if (*s < 0x80) {
     73 			len = 1;
     74 			codepoint = s[0];
     75 		} else if ((*s & 0xE0) == 0xC0) {
     76 			len = 2;
     77 			codepoint = ((s[0] & 0x1F) << 6) | (s[1] & 0x3F);
     78 		} else if ((*s & 0xF0) == 0xE0) {
     79 			len = 3;
     80 			codepoint = ((s[0] & 0x0F) << 12) |
     81 				    ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
     82 		} else if ((*s & 0xF8) == 0xF0) {
     83 			len = 4;
     84 			codepoint = ((s[0] & 0x07) << 18) |
     85 				    ((s[1] & 0x3F) << 12) |
     86 				    ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
     87 		} else {
     88 			s++;
     89 			continue;
     90 		}
     91 		s += len;
     92 
     93 		if (codepoint >= 0x1F3FB && codepoint <= 0x1F3FF) {
     94 			uint8_t tone_val = (codepoint - 0x1F3FB) + 1;
     95 			if (binmoji->skin_tone1 == 0)
     96 				binmoji->skin_tone1 = tone_val;
     97 			else if (binmoji->skin_tone2 == 0)
     98 				binmoji->skin_tone2 = tone_val;
     99 		} else if (is_base_emoji(codepoint)) {
    100 			if (binmoji->primary_codepoint == 0) {
    101 				binmoji->primary_codepoint = codepoint;
    102 			} else if (binmoji->component_count < 16) {
    103 				binmoji->component_list
    104 				    [binmoji->component_count++] = codepoint;
    105 			}
    106 		}
    107 	}
    108 	binmoji->component_hash =
    109 	    crc32(binmoji->component_list, binmoji->component_count);
    110 }
    111 
    112 uint64_t binmoji_encode(const struct binmoji *binmoji)
    113 {
    114 	uint64_t id = 0;
    115 	id |= ((uint64_t)(binmoji->primary_codepoint & PRIMARY_CP_MASK)
    116 	       << PRIMARY_CP_SHIFT);
    117 	id |= ((uint64_t)(binmoji->component_hash & HASH_MASK) << HASH_SHIFT);
    118 	id |= ((uint64_t)(binmoji->skin_tone1 & TONE_MASK) << TONE1_SHIFT);
    119 	id |= ((uint64_t)(binmoji->skin_tone2 & TONE_MASK) << TONE2_SHIFT);
    120 	id |= ((uint64_t)(binmoji->flags & FLAGS_MASK) << FLAGS_SHIFT);
    121 	return id;
    122 }
    123 
    124 /**
    125  * @brief Comparison function for bsearch.
    126  *
    127  * Compares a target hash key against an EmojiHashEntry's hash.
    128  * @param key Pointer to the target uint32_t hash.
    129  * @param element Pointer to the EmojiHashEntry from the array.
    130  * @return <0 if key is less than element's hash, 0 if equal, >0 if greater.
    131  */
    132 static int compare_emoji_hash(const void *key, const void *element)
    133 {
    134 	const uint32_t hash_key = *(const uint32_t *)key;
    135 	const EmojiHashEntry *entry = (const EmojiHashEntry *)element;
    136 
    137 	if (hash_key < entry->hash) {
    138 		return -1;
    139 	} else if (hash_key > entry->hash) {
    140 		return 1;
    141 	} else {
    142 		return 0;
    143 	}
    144 }
    145 
    146 /**
    147  * @brief Optimized lookup using binary search.
    148  */
    149 static int lookup_binmoji_by_hash(uint32_t hash, uint32_t *out_binmoji,
    150 				  size_t *out_count)
    151 {
    152 	const EmojiHashEntry *result =
    153 	    bsearch(&hash, binmoji_table, num_hash_entries,
    154 		    sizeof(EmojiHashEntry), compare_emoji_hash);
    155 
    156 	if (result != NULL) {
    157 		*out_count = result->count;
    158 		memcpy(out_binmoji, result->components,
    159 		       (*out_count) * sizeof(uint32_t));
    160 		return 1; /* Found */
    161 	}
    162 
    163 	*out_count = 0;
    164 	return 0; /* Not found */
    165 }
    166 
    167 void binmoji_decode(uint64_t id, struct binmoji *binmoji)
    168 {
    169 	memset(binmoji, 0, sizeof(struct binmoji));
    170 	binmoji->primary_codepoint = (id >> PRIMARY_CP_SHIFT) & PRIMARY_CP_MASK;
    171 	binmoji->component_hash = (id >> HASH_SHIFT) & HASH_MASK;
    172 	binmoji->skin_tone1 = (id >> TONE1_SHIFT) & TONE_MASK;
    173 	binmoji->skin_tone2 = (id >> TONE2_SHIFT) & TONE_MASK;
    174 	binmoji->flags = (id >> FLAGS_SHIFT) & FLAGS_MASK;
    175 	if (binmoji->component_hash != 0) {
    176 		lookup_binmoji_by_hash(binmoji->component_hash,
    177 				       binmoji->component_list,
    178 				       &binmoji->component_count);
    179 	}
    180 }
    181 
    182 static int append_utf8(char *buf, size_t buf_size, size_t *offset,
    183 		       uint32_t codepoint)
    184 {
    185 	char *p;
    186 	int bytes_to_write = 0;
    187 
    188 	if (!buf)
    189 		return 0;
    190 	if (codepoint < 0x80)
    191 		bytes_to_write = 1;
    192 	else if (codepoint < 0x800)
    193 		bytes_to_write = 2;
    194 	else if (codepoint < 0x10000)
    195 		bytes_to_write = 3;
    196 	else if (codepoint < 0x110000)
    197 		bytes_to_write = 4;
    198 	else
    199 		return 0;
    200 	if (*offset + bytes_to_write >= buf_size)
    201 		return 0;
    202 
    203 	p = buf + *offset;
    204 	if (bytes_to_write == 1) {
    205 		*p = (char)codepoint;
    206 	} else if (bytes_to_write == 2) {
    207 		p[0] = 0xC0 | (codepoint >> 6);
    208 		p[1] = 0x80 | (codepoint & 0x3F);
    209 	} else if (bytes_to_write == 3) {
    210 		p[0] = 0xE0 | (codepoint >> 12);
    211 		p[1] = 0x80 | ((codepoint >> 6) & 0x3F);
    212 		p[2] = 0x80 | (codepoint & 0x3F);
    213 	} else {
    214 		p[0] = 0xF0 | (codepoint >> 18);
    215 		p[1] = 0x80 | ((codepoint >> 12) & 0x3F);
    216 		p[2] = 0x80 | ((codepoint >> 6) & 0x3F);
    217 		p[3] = 0x80 | (codepoint & 0x3F);
    218 	}
    219 	*offset += bytes_to_write;
    220 	return bytes_to_write;
    221 }
    222 
    223 void binmoji_to_string(const struct binmoji *binmoji, char *out_str,
    224 		       size_t out_str_size)
    225 {
    226 	size_t i, offset;
    227 	uint32_t comp;
    228 	int needs_zwj, is_country_flag, is_subdivision_flag, no_zwj_sequence;
    229 
    230 	if (!binmoji || !out_str || out_str_size == 0)
    231 		return;
    232 
    233 	offset = 0;
    234 	out_str[0] = '\0';
    235 
    236 	is_country_flag = (binmoji->primary_codepoint >= 0x1F1E6 &&
    237 			   binmoji->primary_codepoint <= 0x1F1FF);
    238 
    239 	is_subdivision_flag = (binmoji->primary_codepoint == 0x1F3F4 &&
    240 			       binmoji->component_count > 0 &&
    241 			       binmoji->component_list[0] >= 0xE0020 &&
    242 			       binmoji->component_list[0] <= 0xE007F);
    243 
    244 	no_zwj_sequence = is_country_flag || is_subdivision_flag;
    245 
    246 	if (binmoji->primary_codepoint > 0) {
    247 		append_utf8(out_str, out_str_size, &offset,
    248 			    binmoji->primary_codepoint);
    249 	}
    250 
    251 	if (binmoji->skin_tone1 > 0) {
    252 		append_utf8(out_str, out_str_size, &offset,
    253 			    0x1F3FB + binmoji->skin_tone1 - 1);
    254 	}
    255 
    256 	for (i = 0; i < binmoji->component_count; i++) {
    257 		comp = binmoji->component_list[i];
    258 		needs_zwj =
    259 		    (comp != 0xFE0F && comp != 0x20E3 && !no_zwj_sequence);
    260 
    261 		if (needs_zwj) {
    262 			append_utf8(out_str, out_str_size, &offset,
    263 				    0x200D); /* ZWJ */
    264 		}
    265 		append_utf8(out_str, out_str_size, &offset, comp);
    266 
    267 		if (i == binmoji->component_count - 1 &&
    268 		    binmoji->skin_tone2 > 0) {
    269 			append_utf8(out_str, out_str_size, &offset,
    270 				    0x1F3FB + binmoji->skin_tone2 - 1);
    271 		}
    272 	}
    273 
    274 	if (offset < out_str_size)
    275 		out_str[offset] = '\0';
    276 	else if (out_str_size > 0)
    277 		out_str[out_str_size - 1] = '\0';
    278 }