1 /*************************************************************************************************** 2 * 3 * Text parsing functionality. 4 * 5 * Authors: 6 * $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise) 7 * 8 * Copyright: 9 * © 2017-2023 $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise), $(LINK2 mailto:etienne@cimons.com, Etienne Cimon) 10 * 11 * License: 12 * $(LINK2 https://mit-license.org/, The MIT License (MIT)) 13 * 14 **************************************************************************************************/ 15 module fast.parsing; 16 17 import std.traits; 18 import fast.internal.sysdef; 19 20 //import std.array : staticArray; 21 22 static enum isAMD64 = false; 23 24 /+ 25 ╔══════════════════════════════════════════════════════════════════════════════ 26 ║ ⚑ Hexadecimal 27 ╚══════════════════════════════════════════════════════════════════════════════ 28 +/ 29 30 /******************************************************************************* 31 * 32 * Decodes a single hexadecimal character. 33 * 34 * Params: 35 * c = The hexadecimal digit. 36 * 37 * Returns: 38 * `c` converted to an integer. 39 * 40 **************************************/ 41 @safe @nogc pure nothrow 42 uint hexDecode(char c) 43 { 44 return c + 9 * (c >> 6) & 15; 45 } 46 47 @nogc pure nothrow 48 uint hexDecode4(ref const(char)* hex) 49 { 50 uint x = *cast(uint*)&hex; 51 hex += 4; 52 x = (x & 0x0F0F0F0F) + 9 * (x >> 6 & 0x01010101); 53 version (LittleEndian) 54 { 55 return x >> 24 | x >> 12 & 0xF0 | x & 0xF00 | x << 12 & 0xF000; 56 } 57 else 58 { 59 x = (x | x >> 4) & 0x00FF00FF; 60 return (x | x >> 8) & 0x0000FFFF; 61 } 62 } 63 64 @nogc pure nothrow 65 inout(char)* hexDecode4(ref inout(char)* hex, out uint result) 66 { 67 foreach (i; 0 .. 4) 68 { 69 result *= 16; 70 char ch = cast(char)(hex[i] - '0'); 71 if (ch <= 9) 72 { 73 result += ch; 74 } 75 else 76 { 77 ch = cast(char)((ch | 0x20) - 0x31); 78 if (ch <= 5) 79 result += ch + 10; 80 else 81 return hex + i; 82 } 83 } 84 hex += 4; 85 return null; 86 } 87 88 unittest 89 { 90 string x = "aF09"; 91 const(char)* p = x.ptr; 92 uint result; 93 hexDecode4(p, result); 94 assert(result == 0xAF09); 95 } 96 97 /+ 98 ╔══════════════════════════════════════════════════════════════════════════════ 99 ║ ⚑ Numbers 100 ╚══════════════════════════════════════════════════════════════════════════════ 101 +/ 102 103 /// Options for `parseNumber`. 104 struct NumberOptions 105 { 106 /// Allows the minus sign as the first character and thus negative numbers. 107 bool minus; 108 } 109 110 /******************************************************************************* 111 * 112 * Parse a number from a character read pointer. 113 * 114 * On success, the read pointer is set behind the number. 115 * 116 * Params: 117 * opt = Selects features for the implementation. Less features make the 118 * parser faster. 119 * str = The read pointer. 120 * n = A reference to a number to be overwritten with the result. 121 * 122 * Returns: 123 * An indication of success. Typically the function fails when a number cannot 124 * be stored in an integer of the given size or invalid characters are 125 * encountered. 126 * 127 **************************************/ 128 @nogc nothrow 129 bool parseNumber(NumberOptions opt, N)(ref const(char)* str, ref N n) 130 if (isNumeric!N) 131 { 132 import fast.internal.helpers; 133 import std.range; 134 135 // Integer types larger than the mantissa of N. 136 static if (N.sizeof <= size_t.sizeof) 137 { 138 alias U = size_t; 139 alias I = ptrdiff_t; 140 } 141 else 142 { 143 alias U = ulong; 144 alias I = long; 145 } 146 147 // Largest value of type U that can be multiplied by 10 and have a digit added without overflow. 148 enum canHoldOneMoreDigit = (U.max - 9) / 10; 149 static if (isFloatingPoint!N) 150 { 151 enum significandRightShift = 8 * U.sizeof - N.mant_dig + 1; 152 //enum lastSignificandBit = U(2) << 8 * U.sizeof - N.mant_dig; 153 enum firstFractionBit = U(1) << 8 * U.sizeof - N.mant_dig; 154 //enum remainderBits = U.max - N.mant_dig + 1; 155 enum expShift = N.mant_dig - 1; 156 enum expBias = N.max_exp - 1; 157 } 158 159 static if (isFloatingPoint!N) 160 { 161 162 // Largest power of 10 that fits into a float of type N. The factor 5 here is correct, as the 2s 163 // go in as an increment in the exponent, that is neglectable here. 164 enum pow10MaxF = { 165 U v = 1; 166 uint exp; 167 while (v <= ((U(1) << N.mant_dig) - 1) / 5) 168 { 169 v *= 5; 170 exp++; 171 } 172 return exp; 173 }(); 174 __gshared static bool pow10Fb = false; 175 __gshared static N[pow10MaxF] pow10F; 176 if (!pow10Fb) 177 { 178 int i = 0; 179 foreach (v; N(10).recurrence!((a, n) => 10 * a[n - 1]).take(pow10MaxF)) 180 { 181 pow10F[i++] = v; 182 } 183 pow10Fb = true; 184 } 185 enum pow5Max = { 186 U v = 1; 187 uint exp; 188 while (v <= (U.max / 5)) 189 { 190 v *= 5; 191 exp++; 192 } 193 return exp; 194 }(); 195 __gshared static bool pow5b = false; 196 __gshared static U[pow5Max] pow5; 197 198 if (!pow5b) 199 { 200 int i = 0; 201 foreach (v; U(5).recurrence!((a, n) => 5 * a[n - 1]).take(pow5Max)) 202 { 203 pow5[i++] = v; 204 } 205 pow5b = true; 206 } 207 } 208 else 209 { 210 enum pow10Max = { 211 U v = 1; 212 uint exp; 213 while (v <= (U.max / 10)) 214 { 215 v *= 10; 216 exp++; 217 } 218 return exp; 219 }(); 220 __gshared static bool pow10b = false; 221 222 __gshared static U[pow10Max] pow10; 223 if (!pow10b) 224 { 225 int i = 0; 226 foreach (v; U(10).recurrence!((a, n) => 10 * a[n - 1]).take(pow10Max)) 227 { 228 pow10[i++] = v; 229 } 230 pow10b = true; 231 } 232 } 233 234 const(char)* p = str; 235 const(char)* point = null; 236 U significand = 0; 237 size_t exponent = 0; 238 size_t expAdjust = void; 239 bool expSign = void; 240 static if (isFloatingPoint!N) 241 { 242 U exp2 = void; 243 bool roundUp = false; 244 } 245 246 /////////////////// SIGN BIT HANDLING /////////////////// 247 248 // Check for the sign. 249 static if (opt.minus) 250 { 251 bool sign = (*p == '-'); 252 if (sign) 253 p++; 254 } 255 256 /////////////////// INTEGRAL PART OF SIGNIFICAND /////////////////// 257 258 uint digit = *p - '0'; 259 if (digit == 0) 260 { 261 // We have a single zero. 262 p++; 263 } 264 else if (digit <= 9) 265 { 266 // Regular case of one or more digits. 267 do 268 { 269 if (significand > canHoldOneMoreDigit) 270 goto BigMantissa; 271 BigMantissaNotSoMuch: 272 significand = 10 * significand + digit; 273 digit = *++p - '0'; 274 } 275 while (digit <= 9); 276 } 277 else 278 return false; 279 280 /////////////////// FRACTIONAL PART OF SIGNIFICAND /////////////////// 281 282 if (*p == '.') 283 { 284 point = ++p; 285 digit = *p - '0'; 286 if (digit > 9) 287 digit = 0; 288 else 289 do 290 { 291 if (significand > canHoldOneMoreDigit) 292 goto BigMantissa; 293 significand = 10 * significand + digit; 294 digit = *++p - '0'; 295 } 296 while (digit <= 9); 297 } 298 299 /////////////////// EXPONENT HANDLING /////////////////// 300 301 expAdjust = (point is null) ? 0 : p - point; 302 if ((*p | 0x20) == 'e') 303 { 304 p++; 305 expSign = (*p == '-'); 306 if (expSign || *p == '+') 307 p++; 308 digit = *p - '0'; 309 if (digit > 9) 310 return false; 311 do 312 { 313 if (exponent > canHoldOneMoreDigit) 314 goto BigExponent; 315 exponent = 10 * exponent + digit; 316 digit = *++p - '0'; 317 } 318 while (digit <= 9); 319 } 320 321 if (expAdjust) 322 { 323 if (expSign) 324 { 325 if (exponent > size_t.max - expAdjust) 326 goto BigExponentAdjustForDecimalPoint; 327 exponent += expAdjust; 328 } 329 else if (exponent >= expAdjust) 330 { 331 exponent -= expAdjust; 332 } 333 else 334 { 335 // Amount of fraction digits turns exponent from positive to negative. 336 expAdjust -= exponent; 337 exponent = expAdjust; 338 expSign = true; 339 } 340 } 341 342 /////////////////// RESULT ASSEMBLY /////////////////// 343 344 static if (isFloatingPoint!N) 345 { 346 if (significand == 0 || exponent == 0) 347 { 348 // The significand is the unsigned result. 349 static if (opt.minus) 350 if (sign) 351 n = -N(significand); 352 n = +N(significand); 353 str = p; 354 return true; 355 } 356 357 // Try the floating-point fast path: The significand's bits, as well as the 10^x exponent can be expressed 358 // accurately as a float of type N. We just need to divide or multiply them based on the signedness of the 359 // exponent. 360 exp2 = bsr(significand); 361 if (exp2 - bsf(significand) < N.mant_dig && exponent <= pow10MaxF) 362 { 363 N b = pow10F[exponent - 1]; 364 static if (opt.minus) 365 if (sign) 366 b = -b; 367 n = expSign ? significand / b : significand * b; 368 str = p; 369 return true; 370 } 371 else if (exponent <= pow5Max) 372 { 373 // Special case, mostly to handle the little bit of extra precision that comes from 374 // converting a double to its string representation. The last base-10 digit doesn't quite 375 // fit back into a double, but we don't need to resort to arbitrary precision math just yet. 376 if (expSign) 377 { 378 U divisor = pow5[exponent - 1]; 379 static if (isAMD64 && (isLDC || isGDC)) 380 { 381 // AMD64 can divide 128-bit numbers by 64-bit numbers directly. 382 size_t expDivisor = clz(divisor); 383 divisor <<= expDivisor; 384 exp2 = expDivisor - exponent - bigDiv(significand, divisor); 385 significand <<= 1; 386 } 387 else 388 { 389 // We perform an iterative division. 390 U dividend = significand << 8 * U.sizeof - 1 - exp2; 391 U quotient = dividend / divisor; 392 dividend %= divisor; 393 394 size_t lzs = clz(quotient); 395 exp2 -= exponent + lzs; 396 significand = quotient << ++lzs; 397 size_t accuracy = 8 * U.sizeof - lzs; 398 while (accuracy < N.mant_dig) 399 { 400 lzs = clz(dividend); 401 dividend <<= lzs; 402 quotient = dividend / divisor; 403 dividend %= divisor; 404 significand |= quotient << (8 * U.sizeof - lzs) >> accuracy; 405 accuracy += lzs; 406 } 407 } 408 409 // Assemble floating point value from bits. 410 roundUp = (significand & firstFractionBit) != 0; 411 significand >>= significandRightShift; 412 if (roundUp) 413 { 414 significand++; 415 significand &= ~(U(1) << N.mant_dig - 1); 416 if (significand == 0) 417 ++exp2; 418 } 419 420 U* result = cast(U*)&n; 421 *result = exp2 + expBias << expShift | significand; 422 static if (opt.minus) 423 *result |= U(sign) << U.sizeof * 8 - 1; 424 str = p; 425 return true; 426 } 427 //else assert(0, "Not implemented"); 428 } 429 //else assert(0, "Not implemented"); 430 } 431 else 432 { 433 import fast.intmath; 434 435 if (exponent && significand) 436 { 437 // We need to account for the exponent. 438 U pow = pow10[exponent - 1]; 439 if (expSign) 440 { 441 // Negative exponent, if we get a fractional result, abort. 442 if (significand % pow) 443 return false; 444 significand /= pow; 445 } 446 else static if (U.sizeof < ulong.sizeof) 447 { 448 // Multiply using a bigger result type 449 ulong prod = ulong(significand) * pow; 450 if (prod > U.max) 451 return false; 452 significand = cast(U) prod; 453 } 454 else 455 { 456 // If the multiply will overflow, abort. 457 bool overflowed; 458 significand = mulu(significand, pow, overflowed); 459 if (overflowed) 460 return false; 461 } 462 } 463 464 n = cast(N) significand; 465 static if (isSigned!N && opt.minus) 466 { 467 if (significand > U(N.max) + sign) 468 return false; 469 if (sign) 470 n = -n; 471 } 472 else if (significand > N.max) 473 return false; 474 str = p; 475 return true; 476 } 477 478 BigMantissa: 479 if (significand <= (significand.max - digit) / 10) 480 goto BigMantissaNotSoMuch; 481 // assert(0, "Not implemented"); 482 483 BigExponent: 484 // assert(0, "Not implemented"); 485 486 BigExponentAdjustForDecimalPoint: 487 // assert(0, "Not implemented"); 488 return false; 489 } 490 491 private template PowData(U, U base) 492 { 493 import std.range; 494 495 // Largest power of `base` that fits into an integer of type U. 496 enum powMax = { 497 U v = 1; 498 uint exp; 499 while (v <= U.max / base) 500 { 501 v *= base; 502 exp++; 503 } 504 return exp; 505 }(); 506 507 // Table of powers of `base`. (We skip base^0) 508 static immutable U[powMax] pows = base.recurrence!((a, n) => base * a[n - 1]).take(powMax); 509 } 510 511 static if (isAMD64 && (isLDC || isGDC)) 512 { 513 @nogc pure nothrow 514 private size_t bigDiv(ref size_t a, size_t b) 515 in 516 { 517 assert(b > size_t.max / 2, "High bit of divisor must be set."); 518 } 519 body 520 { 521 // Make sure that the division will yield exactly 32 or 64 significant bits. 522 import fast.internal.helpers; 523 524 size_t lza = clz(a); 525 version (LDC) 526 { 527 import ldc.llvmasm; 528 529 a <<= lza; 530 if (a >= b) 531 { 532 a >>= 1; 533 lza--; 534 } 535 a = __asm!ulong(" 536 xor %rax, %rax 537 divq $2 538 ", "={rax},{rdx},rm", a, b); 539 } 540 else version (GNU) 541 { 542 size_t dividend = a << lza; 543 if (dividend >= b) 544 { 545 dividend >>= 1; 546 lza--; 547 } 548 asm 549 { 550 " 551 xor %%rax, %%rax 552 divq %3 553 " : "=&a" a, "=d" dividend : "d" dividend, "rm" b; 554 } 555 } 556 return ++lza; 557 } 558 559 unittest 560 { 561 size_t a = size_t.max / 11; 562 size_t b = size_t.max / 5; 563 version (X86_64) 564 { 565 import fast.internal.helpers; 566 567 long exp = clz(b); // Positive base-2 exponent 568 b <<= exp; 569 exp -= bigDiv(a, b); 570 assert(a == 0xE8BA2E8BA2E8BA2AUL); 571 assert(exp == -2); 572 } 573 } 574 } 575 576 /+ 577 ╔══════════════════════════════════════════════════════════════════════════════ 578 ║ ⚑ String Scanning and Comparison 579 ╚══════════════════════════════════════════════════════════════════════════════ 580 +/ 581 582 /******************************************************************************* 583 * 584 * Compares a string of unknown length against a statically known key. 585 * 586 * This function also handles escapes and requires one or more terminator chars. 587 * 588 * Params: 589 * C = Character with. 590 * key = The static key string. 591 * terminators = A list of code units that terminate the string. 592 * special = A list of code units that are handled by the user callback. Use 593 * this for escape string handling. Default is `null`. 594 * p_str = Pointer to the string for the comparison. After the function call 595 * it will be behind the last matching character. 596 * callback = User callback to handle special escape characters if `special` 597 * is non-empty. 598 * 599 * Returns: 600 * A code with following meanings: -1 = not equal, terminator character hit, 601 * 0 = not equal, but string not exhausted, 1 = string equals key. 602 * 603 **************************************/ 604 // int fixedTermStrCmp(C, immutable C[] key, immutable C[] terminators, immutable C[] special = null) 605 // (ref const(C)* p_str, scope bool delegate(ref immutable(char)*, ref const(char)*) callback = null) 606 // in 607 // { 608 // assert(special.length == 0 || callback !is null); 609 // } 610 // body 611 // { 612 // import std.algorithm, std.range; 613 614 // static immutable byte[256] classify = 615 // iota(256).map!(c => terminators.canFind(c) ? byte(-1) : special.canFind(c) ? 1 : 0).staticArray; 616 617 // immutable(C)* p_key = key.ptr; 618 // immutable C* e_key = p_key + key.length; 619 620 // while (p_key !is e_key) 621 // { 622 // int clazz = *p_str <= 0xFF ? classify[*p_str] : 0; 623 624 // if (clazz < 0) 625 // { 626 // return clazz; 627 // } 628 // else if (clazz == 0) 629 // { 630 // if (*p_str != *p_key) 631 // return clazz; 632 633 // p_str++; 634 // p_key++; 635 // } 636 // else if (clazz > 0) 637 // { 638 // if (!callback(p_key, p_str)) 639 // return 0; 640 // } 641 // } 642 643 // return classify[*p_str & 0xFF] < 0; 644 // } 645 646 /* 647 @nogc nothrow 648 void fixedStringCompareSSE4() 649 { 650 enum words = key.length / 16; 651 enum remainder = key.length % 16; 652 enum contains0 = key.canFind('\0'); // For SSE4.2 string search. 653 static assert(!contains0, "Not implemented"); 654 655 size_t remaining = e - b; 656 auto p = b; 657 658 foreach (i; staticIota!(0, words)) 659 { 660 auto backup = p; 661 p.vpcmpistri!(char, key[16 * i .. 16 * i + 16], Operation.equalElem, Polarity.negateValid); 662 p = backup; 663 p.vpcmpistri!(char, key[16 * i .. 16 * i + 16], Operation.equalElem, Polarity.negateValid); 664 } 665 } 666 */ 667 668 @forceinline @nogc nothrow pure 669 void seekToAnyOf(string cs)(ref const(char)* p) 670 { 671 bool found = false; 672 while (*p) 673 { 674 foreach (c; cs) 675 { 676 if (c == *p) 677 { 678 found = true; 679 break; 680 } 681 } 682 if (found) 683 break; 684 else 685 p++; 686 } 687 //p.vpcmpistri!(char, sanitizeChars(cs), Operation.equalAnyElem); 688 } 689 690 @forceinline nothrow 691 void seekToRanges(string cs)(ref const(char)* p) 692 { 693 bool found = false; 694 while (*p) 695 { 696 for (int i = 0; i < cs.length; i += 2) 697 { 698 if (cs[i] <= *p && cs[i + 1] >= *p) 699 { 700 found = true; 701 break; 702 } 703 } 704 if (found) 705 break; 706 else 707 p++; 708 } 709 //p.vpcmpistri!(char, sanitizeRanges(cs), Operation.inRanges); 710 } 711 712 /******************************************************************************* 713 * 714 * Searches for a specific character known to appear in the stream and skips the 715 * read pointer over it. 716 * 717 * Params: 718 * c = the character 719 * p = the read pointer 720 * 721 **************************************/ 722 @forceinline @nogc nothrow pure 723 void seekPast(char c)(ref const(char)* p) 724 { 725 while (*p) 726 { 727 if (c == *p) 728 { 729 p++; 730 break; 731 } 732 else 733 p++; 734 } 735 //p.vpcmpistri!(char, c.repeat(16).to!string, Operation.equalElem); 736 737 } 738 739 /******************************************************************************* 740 * 741 * Skips the read pointer over characters that fall into any of up to 8 ranges 742 * of characters. The first character in `cs` is the start of the first range, 743 * the second character is the end. This is repeated for any other character 744 * pair. A character falls into a range from `a` to `b` if `a <= *p <= b`. 745 * 746 * Params: 747 * cs = the character ranges 748 * p = the read pointer 749 * 750 **************************************/ 751 @forceinline @nogc nothrow pure 752 void skipCharRanges(string cs)(ref const(char)* p) 753 { 754 import std.range : chunks; 755 756 while (*p) 757 { 758 bool found = false; 759 for (int i = 0; i < cs.length; i += 2) 760 { 761 if (cs[i] <= *p && cs[i + 1] >= *p) 762 { 763 found = true; 764 break; 765 } 766 } 767 if (found) 768 p++; 769 else 770 break; 771 } 772 //p.vpcmpistri!(char, cs, Operation.inRanges, Polarity.negate); 773 } 774 775 /******************************************************************************* 776 * 777 * Skips the read pointer over all and any of the given characters. 778 * 779 * Params: 780 * cs = the characters to skip over 781 * p = the read pointer 782 * 783 **************************************/ 784 @forceinline @nogc nothrow pure 785 void skipAllOf(string cs)(ref const(char)* p) 786 { 787 while (*p) 788 { 789 bool found = false; 790 foreach (c; cs) 791 { 792 if (c == *p) 793 { 794 found = true; 795 break; 796 } 797 } 798 if (found) 799 p++; 800 else 801 break; 802 } 803 804 //p.vpcmpistri!(char, cs, Operation.equalAnyElem, Polarity.negate); 805 } 806 807 /******************************************************************************* 808 * 809 * Skips the read pointer over ASCII white-space comprising '\t', '\r', '\n' and 810 * ' '. 811 * 812 * Params: 813 * p = the read pointer 814 * 815 **************************************/ 816 @forceinline @nogc nothrow pure 817 void skipAsciiWhitespace(ref const(char)* p) 818 { 819 if (*p == ' ') 820 p++; 821 if (*p > ' ') 822 return; 823 p.skipAllOf!" \t\r\n"; 824 } 825 826 /******************************************************************************* 827 * 828 * Sets the read pointer to the start of the next line. 829 * 830 * Params: 831 * p = the read pointer 832 * 833 **************************************/ 834 @forceinline @nogc nothrow pure 835 void skipToNextLine(ref const(char)* p) 836 { 837 // Stop at next \r, \n or \0. 838 enum cmp_to = "\x09\x0B\x0C\x0E"; 839 while (*p && (*p != cmp_to[0] && *p != cmp_to[1] && *p != cmp_to[2] && *p != cmp_to[3])) 840 p++; 841 842 //p.vpcmpistri!(char, "\x01\x09\x0B\x0C\x0E\xFF", Operation.inRanges, Polarity.negate); 843 if (p[0] == '\r') 844 p++; 845 if (p[0] == '\n') 846 p++; 847 } 848 849 private enum sanitizeChars(string cs) 850 { 851 bool has0 = false; 852 foreach (c; cs) 853 if (!c) 854 { 855 has0 = true; 856 break; 857 } 858 assert(has0, "Parsers are required to also check for \0 when looking for specific chars."); 859 860 return cs; 861 } 862 863 private enum sanitizeRanges(string cs) 864 { 865 bool has0 = false; 866 foreach (i; 0 .. cs.length / 2) 867 if (!cs[2 * i]) 868 { 869 has0 = true; 870 break; 871 } 872 assert(has0, "Parsers are required to also check for \0 when looking for specific chars."); 873 return cs; 874 }