utf8.c (6471B)
1 /* MIT (BSD) license - see LICENSE file for details - taken from ccan. thanks rusty! */ 2 3 #include "utf8.h" 4 #include <errno.h> 5 #include <stdlib.h> 6 7 /* I loved this table, so I stole it: */ 8 /* 9 * Copyright (c) 2017 Christian Hansen <chansen@cpan.org> 10 * <https://github.com/chansen/c-utf8-valid> 11 * All rights reserved. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions are met: 15 * 16 * 1. Redistributions of source code must retain the above copyright notice, this 17 * list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright notice, 19 * this list of conditions and the following disclaimer in the documentation 20 * and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 24 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 25 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 26 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 27 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 29 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 /* 34 * UTF-8 Encoding Form 35 * 36 * U+0000..U+007F 0xxxxxxx <= 7 bits 37 * U+0080..U+07FF 110xxxxx 10xxxxxx <= 11 bits 38 * U+0800..U+FFFF 1110xxxx 10xxxxxx 10xxxxxx <= 16 bits 39 * U+10000..U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx <= 21 bits 40 * 41 * 42 * U+0000..U+007F 00..7F 43 * N C0..C1 80..BF 1100000x 10xxxxxx 44 * U+0080..U+07FF C2..DF 80..BF 45 * N E0 80..9F 80..BF 11100000 100xxxxx 46 * U+0800..U+0FFF E0 A0..BF 80..BF 47 * U+1000..U+CFFF E1..EC 80..BF 80..BF 48 * U+D000..U+D7FF ED 80..9F 80..BF 49 * S ED A0..BF 80..BF 11101101 101xxxxx 50 * U+E000..U+FFFF EE..EF 80..BF 80..BF 51 * N F0 80..8F 80..BF 80..BF 11110000 1000xxxx 52 * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF 53 * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF 54 * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 11110100 1000xxxx 55 * 56 * Legend: 57 * N = Non-shortest form 58 * S = Surrogates 59 */ 60 bool utf8_decode(struct utf8_state *utf8_state, char c) 61 { 62 if (utf8_state->used_len == utf8_state->total_len) { 63 utf8_state->used_len = 1; 64 /* First character in sequence. */ 65 if (((unsigned char)c & 0x80) == 0) { 66 /* ASCII, easy. */ 67 if (c == 0) 68 goto bad_encoding; 69 utf8_state->total_len = 1; 70 utf8_state->c = c; 71 goto finished_decoding; 72 } else if (((unsigned char)c & 0xE0) == 0xC0) { 73 utf8_state->total_len = 2; 74 utf8_state->c = ((unsigned char)c & 0x1F); 75 return false; 76 } else if (((unsigned char)c & 0xF0) == 0xE0) { 77 utf8_state->total_len = 3; 78 utf8_state->c = ((unsigned char)c & 0x0F); 79 return false; 80 } else if (((unsigned char)c & 0xF8) == 0xF0) { 81 utf8_state->total_len = 4; 82 utf8_state->c = ((unsigned char)c & 0x07); 83 return false; 84 } 85 goto bad_encoding; 86 } 87 88 if (((unsigned char)c & 0xC0) != 0x80) 89 goto bad_encoding; 90 91 utf8_state->c <<= 6; 92 utf8_state->c |= ((unsigned char)c & 0x3F); 93 94 utf8_state->used_len++; 95 if (utf8_state->used_len == utf8_state->total_len) 96 goto finished_decoding; 97 return false; 98 99 finished_decoding: 100 if (utf8_state->c == 0 || utf8_state->c > 0x10FFFF) 101 errno = ERANGE; 102 /* The UTF-16 "surrogate range": illegal in UTF-8 */ 103 else if (utf8_state->total_len == 3 104 && (utf8_state->c & 0xFFFFF800) == 0x0000D800) 105 errno = ERANGE; 106 else { 107 int min_bits; 108 switch (utf8_state->total_len) { 109 case 1: 110 min_bits = 0; 111 break; 112 case 2: 113 min_bits = 7; 114 break; 115 case 3: 116 min_bits = 11; 117 break; 118 case 4: 119 min_bits = 16; 120 break; 121 default: 122 abort(); 123 } 124 if ((utf8_state->c >> min_bits) == 0) 125 errno = EFBIG; 126 else 127 errno = 0; 128 } 129 return true; 130 131 bad_encoding: 132 utf8_state->total_len = utf8_state->used_len; 133 errno = EINVAL; 134 return true; 135 } 136 137 size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN]) 138 { 139 if ((point >> 7) == 0) { 140 if (point == 0) { 141 errno = ERANGE; 142 return 0; 143 } 144 /* 0xxxxxxx */ 145 dest[0] = point; 146 return 1; 147 } 148 149 if ((point >> 11) == 0) { 150 /* 110xxxxx 10xxxxxx */ 151 dest[1] = 0x80 | (point & 0x3F); 152 dest[0] = 0xC0 | (point >> 6); 153 return 2; 154 } 155 156 if ((point >> 16) == 0) { 157 if (point >= 0xD800 && point <= 0xDFFF) { 158 errno = ERANGE; 159 return 0; 160 } 161 /* 1110xxxx 10xxxxxx 10xxxxxx */ 162 dest[2] = 0x80 | (point & 0x3F); 163 dest[1] = 0x80 | ((point >> 6) & 0x3F); 164 dest[0] = 0xE0 | (point >> 12); 165 return 3; 166 } 167 168 if (point > 0x10FFFF) { 169 errno = ERANGE; 170 return 0; 171 } 172 173 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 174 dest[3] = 0x80 | (point & 0x3F); 175 dest[2] = 0x80 | ((point >> 6) & 0x3F); 176 dest[1] = 0x80 | ((point >> 12) & 0x3F); 177 dest[0] = 0xF0 | (point >> 18); 178 return 4; 179 } 180 181 /* Check for valid UTF-8 */ 182 bool utf8_check(const void *vbuf, size_t buflen) 183 { 184 const unsigned char *buf = vbuf; 185 struct utf8_state utf8_state = UTF8_STATE_INIT; 186 bool need_more = false; 187 188 for (size_t i = 0; i < buflen; i++) { 189 if (!utf8_decode(&utf8_state, buf[i])) { 190 need_more = true; 191 continue; 192 } 193 need_more = false; 194 if (errno != 0) 195 return false; 196 } 197 return !need_more; 198 } 199