binmoji.c (7195B)
1 #include <assert.h> 2 #include <stddef.h> 3 #include <stdint.h> 4 #include <stdio.h> 5 #include <stdlib.h> 6 #include <string.h> 7 8 #include "binmoji.h" 9 10 #define PRIMARY_CP_SHIFT 42 11 #define HASH_SHIFT 10 12 #define TONE1_SHIFT 7 13 #define TONE2_SHIFT 4 14 #define FLAGS_SHIFT 0 15 16 #define PRIMARY_CP_MASK 0x3FFFFF 17 #define HASH_MASK 0xFFFFFFFF 18 #define TONE_MASK 0x7 19 #define FLAGS_MASK 0xF 20 21 typedef struct { 22 uint32_t hash; 23 size_t count; 24 uint32_t components[16]; 25 } EmojiHashEntry; 26 27 #include "binmoji_table.h" 28 29 const size_t num_hash_entries = 30 sizeof(binmoji_table) / sizeof(binmoji_table[0]); 31 32 static uint32_t crc32(const uint32_t *data, size_t length) 33 { 34 uint32_t item, bit, crc = 0xFFFFFFFF; 35 size_t i; 36 int j; 37 38 if (data == NULL || length == 0) 39 return 0; 40 for (i = 0; i < length; ++i) { 41 item = data[i]; 42 for (j = 0; j < 32; ++j) { 43 bit = (item >> (31 - j)) & 1; 44 if ((crc >> 31) ^ bit) { 45 crc = (crc << 1) ^ 0x04C11DB7; 46 } else { 47 crc = (crc << 1); 48 } 49 } 50 } 51 return crc; 52 } 53 54 static int is_base_emoji(uint32_t codepoint) 55 { 56 if (codepoint >= 0x1F3FB && codepoint <= 0x1F3FF) /* Skin Tones */ 57 return 0; 58 if (codepoint == 0x200D) /* Zero Width Joiner */ 59 return 0; 60 return 1; 61 } 62 63 void binmoji_parse(const char *emoji_str, struct binmoji *binmoji) 64 { 65 const unsigned char *s; 66 memset(binmoji, 0, sizeof(struct binmoji)); 67 s = (const unsigned char *)emoji_str; 68 69 while (*s) { 70 uint32_t codepoint = 0; 71 int len = 0; 72 if (*s < 0x80) { 73 len = 1; 74 codepoint = s[0]; 75 } else if ((*s & 0xE0) == 0xC0) { 76 len = 2; 77 codepoint = ((s[0] & 0x1F) << 6) | (s[1] & 0x3F); 78 } else if ((*s & 0xF0) == 0xE0) { 79 len = 3; 80 codepoint = ((s[0] & 0x0F) << 12) | 81 ((s[1] & 0x3F) << 6) | (s[2] & 0x3F); 82 } else if ((*s & 0xF8) == 0xF0) { 83 len = 4; 84 codepoint = ((s[0] & 0x07) << 18) | 85 ((s[1] & 0x3F) << 12) | 86 ((s[2] & 0x3F) << 6) | (s[3] & 0x3F); 87 } else { 88 s++; 89 continue; 90 } 91 s += len; 92 93 if (codepoint >= 0x1F3FB && codepoint <= 0x1F3FF) { 94 uint8_t tone_val = (codepoint - 0x1F3FB) + 1; 95 if (binmoji->skin_tone1 == 0) 96 binmoji->skin_tone1 = tone_val; 97 else if (binmoji->skin_tone2 == 0) 98 binmoji->skin_tone2 = tone_val; 99 } else if (is_base_emoji(codepoint)) { 100 if (binmoji->primary_codepoint == 0) { 101 binmoji->primary_codepoint = codepoint; 102 } else if (binmoji->component_count < 16) { 103 binmoji->component_list 104 [binmoji->component_count++] = codepoint; 105 } 106 } 107 } 108 binmoji->component_hash = 109 crc32(binmoji->component_list, binmoji->component_count); 110 } 111 112 uint64_t binmoji_encode(const struct binmoji *binmoji) 113 { 114 uint64_t id = 0; 115 id |= ((uint64_t)(binmoji->primary_codepoint & PRIMARY_CP_MASK) 116 << PRIMARY_CP_SHIFT); 117 id |= ((uint64_t)(binmoji->component_hash & HASH_MASK) << HASH_SHIFT); 118 id |= ((uint64_t)(binmoji->skin_tone1 & TONE_MASK) << TONE1_SHIFT); 119 id |= ((uint64_t)(binmoji->skin_tone2 & TONE_MASK) << TONE2_SHIFT); 120 id |= ((uint64_t)(binmoji->flags & FLAGS_MASK) << FLAGS_SHIFT); 121 return id; 122 } 123 124 /** 125 * @brief Comparison function for bsearch. 126 * 127 * Compares a target hash key against an EmojiHashEntry's hash. 128 * @param key Pointer to the target uint32_t hash. 129 * @param element Pointer to the EmojiHashEntry from the array. 130 * @return <0 if key is less than element's hash, 0 if equal, >0 if greater. 131 */ 132 static int compare_emoji_hash(const void *key, const void *element) 133 { 134 const uint32_t hash_key = *(const uint32_t *)key; 135 const EmojiHashEntry *entry = (const EmojiHashEntry *)element; 136 137 if (hash_key < entry->hash) { 138 return -1; 139 } else if (hash_key > entry->hash) { 140 return 1; 141 } else { 142 return 0; 143 } 144 } 145 146 /** 147 * @brief Optimized lookup using binary search. 148 */ 149 static int lookup_binmoji_by_hash(uint32_t hash, uint32_t *out_binmoji, 150 size_t *out_count) 151 { 152 const EmojiHashEntry *result = 153 bsearch(&hash, binmoji_table, num_hash_entries, 154 sizeof(EmojiHashEntry), compare_emoji_hash); 155 156 if (result != NULL) { 157 *out_count = result->count; 158 memcpy(out_binmoji, result->components, 159 (*out_count) * sizeof(uint32_t)); 160 return 1; /* Found */ 161 } 162 163 *out_count = 0; 164 return 0; /* Not found */ 165 } 166 167 void binmoji_decode(uint64_t id, struct binmoji *binmoji) 168 { 169 memset(binmoji, 0, sizeof(struct binmoji)); 170 binmoji->primary_codepoint = (id >> PRIMARY_CP_SHIFT) & PRIMARY_CP_MASK; 171 binmoji->component_hash = (id >> HASH_SHIFT) & HASH_MASK; 172 binmoji->skin_tone1 = (id >> TONE1_SHIFT) & TONE_MASK; 173 binmoji->skin_tone2 = (id >> TONE2_SHIFT) & TONE_MASK; 174 binmoji->flags = (id >> FLAGS_SHIFT) & FLAGS_MASK; 175 if (binmoji->component_hash != 0) { 176 lookup_binmoji_by_hash(binmoji->component_hash, 177 binmoji->component_list, 178 &binmoji->component_count); 179 } 180 } 181 182 static int append_utf8(char *buf, size_t buf_size, size_t *offset, 183 uint32_t codepoint) 184 { 185 char *p; 186 int bytes_to_write = 0; 187 188 if (!buf) 189 return 0; 190 if (codepoint < 0x80) 191 bytes_to_write = 1; 192 else if (codepoint < 0x800) 193 bytes_to_write = 2; 194 else if (codepoint < 0x10000) 195 bytes_to_write = 3; 196 else if (codepoint < 0x110000) 197 bytes_to_write = 4; 198 else 199 return 0; 200 if (*offset + bytes_to_write >= buf_size) 201 return 0; 202 203 p = buf + *offset; 204 if (bytes_to_write == 1) { 205 *p = (char)codepoint; 206 } else if (bytes_to_write == 2) { 207 p[0] = 0xC0 | (codepoint >> 6); 208 p[1] = 0x80 | (codepoint & 0x3F); 209 } else if (bytes_to_write == 3) { 210 p[0] = 0xE0 | (codepoint >> 12); 211 p[1] = 0x80 | ((codepoint >> 6) & 0x3F); 212 p[2] = 0x80 | (codepoint & 0x3F); 213 } else { 214 p[0] = 0xF0 | (codepoint >> 18); 215 p[1] = 0x80 | ((codepoint >> 12) & 0x3F); 216 p[2] = 0x80 | ((codepoint >> 6) & 0x3F); 217 p[3] = 0x80 | (codepoint & 0x3F); 218 } 219 *offset += bytes_to_write; 220 return bytes_to_write; 221 } 222 223 void binmoji_to_string(const struct binmoji *binmoji, char *out_str, 224 size_t out_str_size) 225 { 226 size_t i, offset; 227 uint32_t comp; 228 int needs_zwj, is_country_flag, is_subdivision_flag, no_zwj_sequence; 229 230 if (!binmoji || !out_str || out_str_size == 0) 231 return; 232 233 offset = 0; 234 out_str[0] = '\0'; 235 236 is_country_flag = (binmoji->primary_codepoint >= 0x1F1E6 && 237 binmoji->primary_codepoint <= 0x1F1FF); 238 239 is_subdivision_flag = (binmoji->primary_codepoint == 0x1F3F4 && 240 binmoji->component_count > 0 && 241 binmoji->component_list[0] >= 0xE0020 && 242 binmoji->component_list[0] <= 0xE007F); 243 244 no_zwj_sequence = is_country_flag || is_subdivision_flag; 245 246 if (binmoji->primary_codepoint > 0) { 247 append_utf8(out_str, out_str_size, &offset, 248 binmoji->primary_codepoint); 249 } 250 251 if (binmoji->skin_tone1 > 0) { 252 append_utf8(out_str, out_str_size, &offset, 253 0x1F3FB + binmoji->skin_tone1 - 1); 254 } 255 256 for (i = 0; i < binmoji->component_count; i++) { 257 comp = binmoji->component_list[i]; 258 needs_zwj = 259 (comp != 0xFE0F && comp != 0x20E3 && !no_zwj_sequence); 260 261 if (needs_zwj) { 262 append_utf8(out_str, out_str_size, &offset, 263 0x200D); /* ZWJ */ 264 } 265 append_utf8(out_str, out_str_size, &offset, comp); 266 267 if (i == binmoji->component_count - 1 && 268 binmoji->skin_tone2 > 0) { 269 append_utf8(out_str, out_str_size, &offset, 270 0x1F3FB + binmoji->skin_tone2 - 1); 271 } 272 } 273 274 if (offset < out_str_size) 275 out_str[offset] = '\0'; 276 else if (out_str_size > 0) 277 out_str[out_str_size - 1] = '\0'; 278 }