grisu3_parse.h (18815B)
1 /* 2 * Copyright (c) 2016 Mikkel F. Jørgensen, dvide.com 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. http://www.apache.org/licenses/LICENSE-2.0 15 */ 16 17 /* 18 * Port of parts of Google Double Conversion strtod functionality 19 * but with fallback to strtod instead of a bignum implementation. 20 * 21 * Based on grisu3 math from MathGeoLib. 22 * 23 * See also grisu3_math.h comments. 24 */ 25 26 #ifndef GRISU3_PARSE_H 27 #define GRISU3_PARSE_H 28 29 #ifdef __cplusplus 30 extern "C" { 31 #endif 32 33 #ifndef UINT8_MAX 34 #include <stdint.h> 35 #endif 36 37 #include <stdlib.h> 38 #include <limits.h> 39 40 #include "grisu3_math.h" 41 42 43 /* 44 * The maximum number characters a valid number may contain. The parse 45 * fails if the input length is longer but the character after max len 46 * was part of the number. 47 * 48 * The length should not be set too high because it protects against 49 * overflow in the exponent part derived from the input length. 50 */ 51 #define GRISU3_NUM_MAX_LEN 1000 52 53 /* 54 * The lightweight "portable" C library recognizes grisu3 support if 55 * included first. 56 */ 57 #define grisu3_parse_double_is_defined 1 58 59 /* 60 * Disable to compare performance and to test diy_fp algorithm in 61 * broader range. 62 */ 63 #define GRISU3_PARSE_FAST_CASE 64 65 /* May result in a one off error, otherwise when uncertain, fall back to strtod. */ 66 //#define GRISU3_PARSE_ALLOW_ERROR 67 68 69 /* 70 * The dec output exponent jumps in 8, so the result is offset at most 71 * by 7 when the input is within range. 72 */ 73 static int grisu3_diy_fp_cached_dec_pow(int d_exp, grisu3_diy_fp_t *p) 74 { 75 const int cached_offset = -GRISU3_MIN_CACHED_EXP; 76 const int d_exp_dist = GRISU3_CACHED_EXP_STEP; 77 int i, a_exp; 78 79 GRISU3_ASSERT(GRISU3_MIN_CACHED_EXP <= d_exp); 80 GRISU3_ASSERT(d_exp < GRISU3_MAX_CACHED_EXP + d_exp_dist); 81 82 i = (d_exp + cached_offset) / d_exp_dist; 83 a_exp = grisu3_diy_fp_pow_cache[i].d_exp; 84 p->f = grisu3_diy_fp_pow_cache[i].fract; 85 p->e = grisu3_diy_fp_pow_cache[i].b_exp; 86 87 GRISU3_ASSERT(a_exp <= d_exp); 88 GRISU3_ASSERT(d_exp < a_exp + d_exp_dist); 89 90 return a_exp; 91 } 92 93 /* 94 * Ported from google double conversion strtod using 95 * MathGeoLibs diy_fp functions for grisu3 in C. 96 * 97 * ulp_half_error is set if needed to trunacted non-zero trialing 98 * characters. 99 * 100 * The actual value we need to encode is: 101 * 102 * (sign ? -1 : 1) * fraction * 2 ^ (exponent - fraction_exp) 103 * where exponent is the base 10 exponent assuming the decimal point is 104 * after the first digit. fraction_exp is the base 10 magnitude of the 105 * fraction or number of significant digits - 1. 106 * 107 * If the exponent is between 0 and 22 and the fraction is encoded in 108 * the lower 53 bits (the largest bit is implicit in a double, but not 109 * in this fraction), then the value can be trivially converted to 110 * double without loss of precision. If the fraction was in fact 111 * multiplied by trailing zeroes that we didn't convert to exponent, 112 * we there are larger values the 53 bits that can also be encoded 113 * trivially - but then it is better to handle this during parsing 114 * if it is worthwhile. We do not optimize for this here, because it 115 * can be done in a simple check before calling, and because it might 116 * not be worthwile to do at all since it cery likely will fail for 117 * numbers printed to be convertible back to double without loss. 118 * 119 * Returns 0 if conversion was not exact. In that case the vale is 120 * either one smaller than the correct one, or the correct one. 121 * 122 * Exponents must be range protected before calling otherwise cached 123 * powers will blow up. 124 * 125 * Google Double Conversion seems to prefer the following notion: 126 * 127 * x >= 10^309 => +Inf 128 * x <= 10^-324 => 0, 129 * 130 * max double: HUGE_VAL = 1.7976931348623157 * 10^308 131 * min double: 4.9406564584124654 * 10^-324 132 * 133 * Values just below or above min/max representable number 134 * may round towards large/small non-Inf/non-neg values. 135 * 136 * but `strtod` seems to return +/-HUGE_VAL on overflow? 137 */ 138 static int grisu3_diy_fp_encode_double(uint64_t fraction, int exponent, int fraction_exp, int ulp_half_error, double *result) 139 { 140 /* 141 * Error is measures in fractions of integers, so we scale up to get 142 * some resolution to represent error expressions. 143 */ 144 const int log2_error_one = 3; 145 const int error_one = 1 << log2_error_one; 146 const int denorm_exp = GRISU3_D64_DENORM_EXP; 147 const uint64_t hidden_bit = GRISU3_D64_IMPLICIT_ONE; 148 const int diy_size = GRISU3_DIY_FP_FRACT_SIZE; 149 const int max_digits = 19; 150 151 int error = ulp_half_error ? error_one / 2 : 0; 152 int d_exp = (exponent - fraction_exp); 153 int a_exp; 154 int o_exp; 155 grisu3_diy_fp_t v = { fraction, 0 }; 156 grisu3_diy_fp_t cp; 157 grisu3_diy_fp_t rounded; 158 int mag; 159 int prec; 160 int prec_bits; 161 int half_way; 162 163 /* When fractions in a double aren't stored with implicit msb fraction bit. */ 164 165 /* Shift fraction to msb. */ 166 v = grisu3_diy_fp_normalize(v); 167 /* The half point error moves up while the exponent moves down. */ 168 error <<= -v.e; 169 170 a_exp = grisu3_diy_fp_cached_dec_pow(d_exp, &cp); 171 172 /* Interpolate between cached powers at distance 8. */ 173 if (a_exp != d_exp) { 174 int adj_exp = d_exp - a_exp - 1; 175 static grisu3_diy_fp_t cp_10_lut[] = { 176 { 0xa000000000000000ULL, -60 }, 177 { 0xc800000000000000ULL, -57 }, 178 { 0xfa00000000000000ULL, -54 }, 179 { 0x9c40000000000000ULL, -50 }, 180 { 0xc350000000000000ULL, -47 }, 181 { 0xf424000000000000ULL, -44 }, 182 { 0x9896800000000000ULL, -40 }, 183 }; 184 GRISU3_ASSERT(adj_exp >= 0 && adj_exp < 7); 185 v = grisu3_diy_fp_multiply(v, cp_10_lut[adj_exp]); 186 187 /* 20 decimal digits won't always fit in 64 bit. 188 * (`fraction_exp` is one less than significant decimal 189 * digits in fraction, e.g. 1 * 10e0). 190 * If we cannot fit, introduce 1/2 ulp error 191 * (says double conversion reference impl.) */ 192 if (1 + fraction_exp + adj_exp > max_digits) { 193 error += error_one / 2; 194 } 195 } 196 197 v = grisu3_diy_fp_multiply(v, cp); 198 /* 199 * Google double conversion claims that: 200 * 201 * The error introduced by a multiplication of a*b equals 202 * error_a + error_b + error_a*error_b/2^64 + 0.5 203 * Substituting a with 'input' and b with 'cached_power' we have 204 * error_b = 0.5 (all cached powers have an error of less than 0.5 ulp), 205 * error_ab = 0 or 1 / error_oner > error_a*error_b/ 2^64 206 * 207 * which in our encoding becomes: 208 * error_a = error_one/2 209 * error_ab = 1 / error_one (rounds up to 1 if error != 0, or 0 * otherwise) 210 * fixed_error = error_one/2 211 * 212 * error += error_a + fixed_error + (error ? 1 : 0) 213 * 214 * (this isn't entirely clear, but that is as close as we get). 215 */ 216 error += error_one + (error ? 1 : 0); 217 218 o_exp = v.e; 219 v = grisu3_diy_fp_normalize(v); 220 /* Again, if we shift the significant bits, the error moves along. */ 221 error <<= o_exp - v.e; 222 223 /* 224 * The value `v` is bounded by 2^mag which is 64 + v.e. because we 225 * just normalized it by shifting towards msb. 226 */ 227 mag = diy_size + v.e; 228 229 /* The effective magnitude of the IEEE double representation. */ 230 mag = mag >= diy_size + denorm_exp ? diy_size : mag <= denorm_exp ? 0 : mag - denorm_exp; 231 prec = diy_size - mag; 232 if (prec + log2_error_one >= diy_size) { 233 int e_scale = prec + log2_error_one - diy_size - 1; 234 v.f >>= e_scale; 235 v.e += e_scale; 236 error = (error >> e_scale) + 1 + error_one; 237 prec -= e_scale; 238 } 239 rounded.f = v.f >> prec; 240 rounded.e = v.e + prec; 241 prec_bits = (int)(v.f & ((uint64_t)1 << (prec - 1))) * error_one; 242 half_way = (int)((uint64_t)1 << (prec - 1)) * error_one; 243 if (prec >= half_way + error) { 244 rounded.f++; 245 /* Prevent overflow. */ 246 if (rounded.f & (hidden_bit << 1)) { 247 rounded.f >>= 1; 248 rounded.e += 1; 249 } 250 } 251 *result = grisu3_cast_double_from_diy_fp(rounded); 252 return half_way - error >= prec_bits || prec_bits >= half_way + error; 253 } 254 255 /* 256 * `end` is unchanged if number is handled natively, or it is the result 257 * of strtod parsing in case of fallback. 258 */ 259 static const char *grisu3_encode_double(const char *buf, const char *end, int sign, uint64_t fraction, int exponent, int fraction_exp, int ulp_half_error, double *result) 260 { 261 const int max_d_exp = GRISU3_D64_MAX_DEC_EXP; 262 const int min_d_exp = GRISU3_D64_MIN_DEC_EXP; 263 264 char *v_end; 265 266 /* Both for user experience, and to protect internal power table lookups. */ 267 if (fraction == 0 || exponent < min_d_exp) { 268 *result = 0.0; 269 goto done; 270 } 271 if (exponent - 1 > max_d_exp) { 272 *result = grisu3_double_infinity; 273 goto done; 274 } 275 276 /* 277 * `exponent` is the normalized value, fraction_exp is the size of 278 * the representation in the `fraction value`, or one less than 279 * number of significant digits. 280 * 281 * If the final value can be kept in 53 bits and we can avoid 282 * division, then we can convert to double quite fast. 283 * 284 * ulf_half_error only happens when fraction is maxed out, so 285 * fraction_exp > 22 by definition. 286 * 287 * fraction_exp >= 0 always. 288 * 289 * http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/ 290 */ 291 292 293 #ifdef GRISU3_PARSE_FAST_CASE 294 if (fraction < (1ULL << 53) && exponent >= 0 && exponent <= 22) { 295 double v = (double)fraction; 296 /* Multiplying by 1e-k instead of dividing by 1ek results in rounding error. */ 297 switch (exponent - fraction_exp) { 298 case -22: v /= 1e22; break; 299 case -21: v /= 1e21; break; 300 case -20: v /= 1e20; break; 301 case -19: v /= 1e19; break; 302 case -18: v /= 1e18; break; 303 case -17: v /= 1e17; break; 304 case -16: v /= 1e16; break; 305 case -15: v /= 1e15; break; 306 case -14: v /= 1e14; break; 307 case -13: v /= 1e13; break; 308 case -12: v /= 1e12; break; 309 case -11: v /= 1e11; break; 310 case -10: v /= 1e10; break; 311 case -9: v /= 1e9; break; 312 case -8: v /= 1e8; break; 313 case -7: v /= 1e7; break; 314 case -6: v /= 1e6; break; 315 case -5: v /= 1e5; break; 316 case -4: v /= 1e4; break; 317 case -3: v /= 1e3; break; 318 case -2: v /= 1e2; break; 319 case -1: v /= 1e1; break; 320 case 0: break; 321 case 1: v *= 1e1; break; 322 case 2: v *= 1e2; break; 323 case 3: v *= 1e3; break; 324 case 4: v *= 1e4; break; 325 case 5: v *= 1e5; break; 326 case 6: v *= 1e6; break; 327 case 7: v *= 1e7; break; 328 case 8: v *= 1e8; break; 329 case 9: v *= 1e9; break; 330 case 10: v *= 1e10; break; 331 case 11: v *= 1e11; break; 332 case 12: v *= 1e12; break; 333 case 13: v *= 1e13; break; 334 case 14: v *= 1e14; break; 335 case 15: v *= 1e15; break; 336 case 16: v *= 1e16; break; 337 case 17: v *= 1e17; break; 338 case 18: v *= 1e18; break; 339 case 19: v *= 1e19; break; 340 case 20: v *= 1e20; break; 341 case 21: v *= 1e21; break; 342 case 22: v *= 1e22; break; 343 } 344 *result = v; 345 goto done; 346 } 347 #endif 348 349 if (grisu3_diy_fp_encode_double(fraction, exponent, fraction_exp, ulp_half_error, result)) { 350 goto done; 351 } 352 #ifdef GRISU3_PARSE_ALLOW_ERROR 353 goto done; 354 #endif 355 *result = strtod(buf, &v_end); 356 if (v_end < end) { 357 return v_end; 358 } 359 return end; 360 done: 361 if (sign) { 362 *result = -*result; 363 } 364 return end; 365 } 366 367 /* 368 * Returns buf if number wasn't matched, or null if number starts ok 369 * but contains invalid content. 370 */ 371 static const char *grisu3_parse_hex_fp(const char *buf, const char *end, int sign, double *result) 372 { 373 (void)buf; 374 (void)end; 375 (void)sign; 376 *result = 0.0; 377 /* Not currently supported. */ 378 return buf; 379 } 380 381 /* 382 * Returns end pointer on success, or null, or buf if start is not a number. 383 * Sets result to 0.0 on error. 384 * Reads up to len + 1 bytes from buffer where len + 1 must not be a 385 * valid part of a number, but all of buf, buf + len need not be a 386 * number. Leading whitespace is NOT valid. 387 * Very small numbers are truncated to +/-0.0 and numerically very large 388 * numbers are returns as +/-infinity. 389 * 390 * A value must not end or begin with '.' (like JSON), but can have 391 * leading zeroes (unlike JSON). A single leading zero followed by 392 * an encoding symbol may or may not be interpreted as a non-decimal 393 * encoding prefix, e.g. 0x, but a leading zero followed by a digit is 394 * NOT interpreted as octal. 395 * A single leading negative sign may appear before digits, but positive 396 * sign is not allowed and space after the sign is not allowed. 397 * At most the first 1000 characters of the input is considered. 398 */ 399 static const char *grisu3_parse_double(const char *buf, size_t len, double *result) 400 { 401 const char *mark, *k, *end; 402 int sign = 0, esign = 0; 403 uint64_t fraction = 0; 404 int exponent = 0; 405 int ee = 0; 406 int fraction_exp = 0; 407 int ulp_half_error = 0; 408 409 *result = 0.0; 410 411 end = buf + len + 1; 412 413 /* Failsafe for exponent overflow. */ 414 if (len > GRISU3_NUM_MAX_LEN) { 415 end = buf + GRISU3_NUM_MAX_LEN + 1; 416 } 417 418 if (buf == end) { 419 return buf; 420 } 421 mark = buf; 422 if (*buf == '-') { 423 ++buf; 424 sign = 1; 425 if (buf == end) { 426 return 0; 427 } 428 } 429 if (*buf == '0') { 430 ++buf; 431 /* | 0x20 is lower case ASCII. */ 432 if (buf != end && (*buf | 0x20) == 'x') { 433 k = grisu3_parse_hex_fp(buf, end, sign, result); 434 if (k == buf) { 435 return mark; 436 } 437 return k; 438 } 439 /* Not worthwhile, except for getting the scale of integer part. */ 440 while (buf != end && *buf == '0') { 441 ++buf; 442 } 443 } else { 444 if (*buf < '1' || *buf > '9') { 445 /* 446 * If we didn't see a sign, just don't recognize it as 447 * number, otherwise make it an error. 448 */ 449 return sign ? 0 : mark; 450 } 451 fraction = (uint64_t)(*buf++ - '0'); 452 } 453 k = buf; 454 /* 455 * We do not catch trailing zeroes when there is no decimal point. 456 * This misses an opportunity for moving the exponent down into the 457 * fast case. But it is unlikely to be worthwhile as it complicates 458 * parsing. 459 */ 460 while (buf != end && *buf >= '0' && *buf <= '9') { 461 if (fraction >= UINT64_MAX / 10) { 462 fraction += *buf >= '5'; 463 ulp_half_error = 1; 464 break; 465 } 466 fraction = fraction * 10 + (uint64_t)(*buf++ - '0'); 467 } 468 fraction_exp = (int)(buf - k); 469 /* Skip surplus digits. Trailing zero does not introduce error. */ 470 while (buf != end && *buf == '0') { 471 ++exponent; 472 ++buf; 473 } 474 if (buf != end && *buf >= '1' && *buf <= '9') { 475 ulp_half_error = 1; 476 ++exponent; 477 ++buf; 478 while (buf != end && *buf >= '0' && *buf <= '9') { 479 ++exponent; 480 ++buf; 481 } 482 } 483 if (buf != end && *buf == '.') { 484 ++buf; 485 k = buf; 486 if (*buf < '0' || *buf > '9') { 487 /* We don't accept numbers without leading or trailing digit. */ 488 return 0; 489 } 490 while (buf != end && *buf >= '0' && *buf <= '9') { 491 if (fraction >= UINT64_MAX / 10) { 492 if (!ulp_half_error) { 493 fraction += *buf >= '5'; 494 ulp_half_error = 1; 495 } 496 break; 497 } 498 fraction = fraction * 10 + (uint64_t)(*buf++ - '0'); 499 --exponent; 500 } 501 fraction_exp += (int)(buf - k); 502 while (buf != end && *buf == '0') { 503 ++exponent; 504 ++buf; 505 } 506 if (buf != end && *buf >= '1' && *buf <= '9') { 507 ulp_half_error = 1; 508 ++buf; 509 while (buf != end && *buf >= '0' && *buf <= '9') { 510 ++buf; 511 } 512 } 513 } 514 /* 515 * Normalized exponent e.g: 1.23434e3 with fraction = 123434, 516 * fraction_exp = 5, exponent = 3. 517 * So value = fraction * 10^(exponent - fraction_exp) 518 */ 519 exponent += fraction_exp; 520 if (buf != end && (*buf | 0x20) == 'e') { 521 if (end - buf < 2) { 522 return 0; 523 } 524 ++buf; 525 if (*buf == '+') { 526 ++buf; 527 if (buf == end) { 528 return 0; 529 } 530 } else if (*buf == '-') { 531 esign = 1; 532 ++buf; 533 if (buf == end) { 534 return 0; 535 } 536 } 537 if (*buf < '0' || *buf > '9') { 538 return 0; 539 } 540 ee = *buf++ - '0'; 541 while (buf != end && *buf >= '0' && *buf <= '9') { 542 /* 543 * This test impacts performance and we do not need an 544 * exact value just one large enough to dominate the fraction_exp. 545 * Subsequent handling maps large absolute ee to 0 or infinity. 546 */ 547 if (ee <= 0x7fff) { 548 ee = ee * 10 + *buf - '0'; 549 } 550 ++buf; 551 } 552 } 553 exponent = exponent + (esign ? -ee : ee); 554 555 /* 556 * Exponent is now a base 10 normalized exponent so the absolute value 557 * is less the 10^(exponent + 1) for positive exponents. For 558 * denormalized doubles (using 11 bit exponent 0 with a fraction 559 * shiftet down, extra small numbers can be achieved. 560 * 561 * https://en.wikipedia.org/wiki/Double-precision_floating-point_format 562 * 563 * 10^-324 holds the smallest normalized exponent (but not value) and 564 * 10^308 holds the largest exponent. Internally our lookup table is 565 * only safe to use within a range slightly larger than this. 566 * Externally, a slightly larger/smaller value represents NaNs which 567 * are technically also possible to store as a number. 568 * 569 */ 570 571 /* This also protects strod fallback parsing. */ 572 if (buf == end) { 573 return 0; 574 } 575 return grisu3_encode_double(mark, buf, sign, fraction, exponent, fraction_exp, ulp_half_error, result); 576 } 577 578 #ifdef __cplusplus 579 } 580 #endif 581 582 #endif /* GRISU3_PARSE_H */