utf8.h (1667B)
/* MIT (BSD) license - see LICENSE file for details */
#ifndef CCAN_UTF8_H
#define CCAN_UTF8_H
/*
 * utf8_decode() and utf8_encode() report their outcome through errno, so
 * make errno and the E* constants available to anyone including this header.
 */
#include <errno.h>
#include <inttypes.h>
#include <stdbool.h>
#include <string.h>

/* Unicode is limited to 21 bits, so one encoded point needs at most 4 chars. */
#define UTF8_MAX_LEN 4

/**
 * struct utf8_state - incremental decoder state for utf8_decode().
 *
 * Must be zeroed (UTF8_STATE_INIT or utf8_state_init()) before the first
 * call to utf8_decode().
 */
struct utf8_state {
	/* How many chars (encoded bytes) we expect for this Unicode point. */
	uint16_t total_len;
	/* How many chars (encoded bytes) we have already seen. */
	uint16_t used_len;
	/* Compound character, aka Unicode point. */
	uint32_t c;
};

/* Static initializer: total_len, used_len and c all start at zero. */
#define UTF8_STATE_INIT { 0, 0, 0 }

/**
 * utf8_state_init - reset a decoder state at run time.
 * @utf8_state: state to clear; must not be NULL.
 *
 * Zeroes the whole structure, giving the same field values as
 * UTF8_STATE_INIT.
 */
static inline void utf8_state_init(struct utf8_state *utf8_state)
{
	memset(utf8_state, 0, sizeof(*utf8_state));
}

/**
 * utf8_decode - continue UTF8 decoding with this character.
 * @utf8_state - initialized UTF8 state.
 * @c - the character.
 *
 * Returns false if it needs another character to give results.
 * Otherwise returns true, @utf8_state can be reused without
 * reinitialization, and sets errno:
 * 0: success
 * EINVAL: bad encoding (including a NUL character).
 * EFBIG: not a minimal encoding.
 * ERANGE: encoding of invalid character.
 *
 * errno is only documented to be set when true is returned, so check it
 * only in that case.
 *
 * You can extract the character from @utf8_state->c; @utf8_state->used_len
 * indicates how many characters have been consumed.
 */
bool utf8_decode(struct utf8_state *utf8_state, char c);

/**
 * utf8_encode - encode a point into UTF8.
 * @point - Unicode point to encode.
 * @dest - buffer to fill; must have room for UTF8_MAX_LEN chars.
 *
 * Returns 0 if point was invalid, otherwise bytes of dest used.
 * Sets errno to ERANGE if point was invalid.
 */
size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN]);

/**
 * utf8_check - check for valid UTF-8.
 * @vbuf - buffer to check.
 * @buflen - length of @vbuf in bytes.
 *
 * Returns true if the buffer is valid UTF-8.
 */
bool utf8_check(const void *vbuf, size_t buflen);
#endif /* CCAN_UTF8_H */