damus

nostr ios client
git clone git://jb55.com/damus
Log | Files | Refs | README | LICENSE

utf8.c (6471B)


      1 /* MIT (BSD) license - see LICENSE file for details - taken from ccan. thanks rusty! */
      2 
      3 #include "utf8.h"
      4 #include <errno.h>
      5 #include <stdlib.h>
      6 
      7 /* I loved this table, so I stole it: */
      8 /*
      9  * Copyright (c) 2017 Christian Hansen <chansen@cpan.org>
     10  * <https://github.com/chansen/c-utf8-valid>
     11  * All rights reserved.
     12  *
     13  * Redistribution and use in source and binary forms, with or without
     14  * modification, are permitted provided that the following conditions are met:
     15  *
     16  * 1. Redistributions of source code must retain the above copyright notice, this
     17  *    list of conditions and the following disclaimer.
     18  * 2. Redistributions in binary form must reproduce the above copyright notice,
     19  *    this list of conditions and the following disclaimer in the documentation
     20  *    and/or other materials provided with the distribution.
     21  *
     22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     24  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     25  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     26  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     27  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     29  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     31  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     32  */
     33 /*
     34  *    UTF-8 Encoding Form
     35  *
     36  *    U+0000..U+007F       0xxxxxxx                <= 7 bits
     37  *    U+0080..U+07FF       110xxxxx 10xxxxxx            <= 11 bits
     38  *    U+0800..U+FFFF       1110xxxx 10xxxxxx 10xxxxxx        <= 16 bits
     39  *   U+10000..U+10FFFF     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx    <= 21 bits
     40  *
     41  *
     42  *    U+0000..U+007F       00..7F
     43  *                      N  C0..C1  80..BF                   1100000x 10xxxxxx
     44  *    U+0080..U+07FF       C2..DF  80..BF
     45  *                      N  E0      80..9F  80..BF           11100000 100xxxxx
     46  *    U+0800..U+0FFF       E0      A0..BF  80..BF
     47  *    U+1000..U+CFFF       E1..EC  80..BF  80..BF
     48  *    U+D000..U+D7FF       ED      80..9F  80..BF
     49  *                      S  ED      A0..BF  80..BF           11101101 101xxxxx
     50  *    U+E000..U+FFFF       EE..EF  80..BF  80..BF
     51  *                      N  F0      80..8F  80..BF  80..BF   11110000 1000xxxx
     52  *   U+10000..U+3FFFF      F0      90..BF  80..BF  80..BF
     53  *   U+40000..U+FFFFF      F1..F3  80..BF  80..BF  80..BF
     54  *  U+100000..U+10FFFF     F4      80..8F  80..BF  80..BF   11110100 1000xxxx
     55  *
     56  *  Legend:
     57  *    N = Non-shortest form
     58  *    S = Surrogates
     59  */
     60 bool utf8_decode(struct utf8_state *utf8_state, char c)
     61 {
     62     if (utf8_state->used_len == utf8_state->total_len) {
     63         utf8_state->used_len = 1;
     64         /* First character in sequence. */
     65         if (((unsigned char)c & 0x80) == 0) {
     66             /* ASCII, easy. */
     67             if (c == 0)
     68                 goto bad_encoding;
     69             utf8_state->total_len = 1;
     70             utf8_state->c = c;
     71             goto finished_decoding;
     72         } else if (((unsigned char)c & 0xE0) == 0xC0) {
     73             utf8_state->total_len = 2;
     74             utf8_state->c = ((unsigned char)c & 0x1F);
     75             return false;
     76         } else if (((unsigned char)c & 0xF0) == 0xE0) {
     77             utf8_state->total_len = 3;
     78             utf8_state->c = ((unsigned char)c & 0x0F);
     79             return false;
     80         } else if (((unsigned char)c & 0xF8) == 0xF0) {
     81             utf8_state->total_len = 4;
     82             utf8_state->c = ((unsigned char)c & 0x07);
     83             return false;
     84         }
     85         goto bad_encoding;
     86     }
     87 
     88     if (((unsigned char)c & 0xC0) != 0x80)
     89         goto bad_encoding;
     90 
     91     utf8_state->c <<= 6;
     92     utf8_state->c |= ((unsigned char)c & 0x3F);
     93     
     94     utf8_state->used_len++;
     95     if (utf8_state->used_len == utf8_state->total_len)
     96         goto finished_decoding;
     97     return false;
     98 
     99 finished_decoding:
    100     if (utf8_state->c == 0 || utf8_state->c > 0x10FFFF)
    101         errno = ERANGE;
    102     /* The UTF-16 "surrogate range": illegal in UTF-8 */
    103     else if (utf8_state->total_len == 3
    104          && (utf8_state->c & 0xFFFFF800) == 0x0000D800)
    105         errno = ERANGE;
    106     else {
    107         int min_bits;
    108         switch (utf8_state->total_len) {
    109         case 1:
    110             min_bits = 0;
    111             break;
    112         case 2:
    113             min_bits = 7;
    114             break;
    115         case 3:
    116             min_bits = 11;
    117             break;
    118         case 4:
    119             min_bits = 16;
    120             break;
    121         default:
    122             abort();
    123         }
    124         if ((utf8_state->c >> min_bits) == 0)
    125             errno = EFBIG;
    126         else
    127             errno = 0;
    128     }
    129     return true;
    130 
    131 bad_encoding:
    132     utf8_state->total_len = utf8_state->used_len;
    133     errno = EINVAL;
    134     return true;
    135 }
    136 
    137 size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN])
    138 {
    139     if ((point >> 7) == 0) {
    140         if (point == 0) {
    141             errno = ERANGE;
    142             return 0;
    143         }
    144         /* 0xxxxxxx */
    145         dest[0] = point;
    146         return 1;
    147     }
    148 
    149     if ((point >> 11) == 0) {
    150         /* 110xxxxx 10xxxxxx */
    151         dest[1] = 0x80 | (point & 0x3F);
    152         dest[0] = 0xC0 | (point >> 6);
    153         return 2;
    154     }
    155 
    156     if ((point >> 16) == 0) {
    157         if (point >= 0xD800 && point <= 0xDFFF) {
    158             errno = ERANGE;
    159             return 0;
    160         }
    161         /* 1110xxxx 10xxxxxx 10xxxxxx */
    162         dest[2] = 0x80 | (point & 0x3F);
    163         dest[1] = 0x80 | ((point >> 6) & 0x3F);
    164         dest[0] = 0xE0 | (point >> 12);
    165         return 3;
    166     }
    167 
    168     if (point > 0x10FFFF) {
    169         errno = ERANGE;
    170         return 0;
    171     }
    172 
    173     /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    174     dest[3] = 0x80 | (point & 0x3F);
    175     dest[2] = 0x80 | ((point >> 6) & 0x3F);
    176     dest[1] = 0x80 | ((point >> 12) & 0x3F);
    177     dest[0] = 0xF0 | (point >> 18);
    178     return 4;
    179 }
    180 
    181 /* Check for valid UTF-8 */
    182 bool utf8_check(const void *vbuf, size_t buflen)
    183 {
    184     const unsigned char *buf = vbuf;
    185     struct utf8_state utf8_state = UTF8_STATE_INIT;
    186     bool need_more = false;
    187 
    188     for (size_t i = 0; i < buflen; i++) {
    189         if (!utf8_decode(&utf8_state, buf[i])) {
    190             need_more = true;
    191             continue;
    192         }
    193         need_more = false;
    194         if (errno != 0)
    195             return false;
    196     }
    197     return !need_more;
    198 }
    199