/********************************************
 * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
 *
 * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
 * wchar type.
 * For Posix systems, the C wchar_t type is UTF-32 and corresponds to
 * the D utf.dchar type.
 *
 * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF).
 *
 * Error handling in this variant of the module: the decode() functions do
 * not throw. Malformed input is reported by returning '?' and leaving the
 * index argument unchanged. Every loop in this module that calls decode()
 * checks for that condition so that it always makes forward progress.
 *
 * See_Also:
 *      $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
 *      $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
 *      $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
 *
 * Copyright: Copyright Digital Mars 2003 - 2016.
 * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   Walter Bright, Sean Kelly
 * Source:    $(DRUNTIMESRC core/internal/_utf.d)
 */

module core.internal.utf;
// version (CRuntime_LIBWASM) This was changed to be mostly nothrow

extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__, size_t line = __LINE__ ) @safe pure;

/*******************************
 * Test if c is a valid UTF-32 character.
 *
 * \uFFFE and \uFFFF are considered valid by this function,
 * as they are permitted for internal use by an application,
 * but they are not allowed for interchange by the Unicode standard.
 *
 * Returns: true if it is, false if not.
 */

@safe @nogc pure nothrow
bool isValidDchar(dchar c)
{
    /* Note: FFFE and FFFF are specifically permitted by the
     * Unicode standard for application internal use, but are not
     * allowed for interchange.
     * (thanks to Arcane Jill)
     */

    // Everything up to 0x10FFFF except the surrogate range D800..DFFF.
    return c < 0xD800 ||
        (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/);
}

unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");
    assert(isValidDchar(cast(dchar)'a') == true);
    assert(isValidDchar(cast(dchar)0x1FFFFF) == false);
}



// Sequence length indexed by the first byte of a UTF-8 sequence.
// 0xFF marks bytes that cannot start a sequence (continuation bytes, FE, FF).
static immutable UTF8stride =
[
    cast(ubyte)
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];

/**
 * stride() returns the length of a UTF-8 sequence starting at index i
 * in string s.
 * Returns:
 *      The number of bytes in the UTF-8 sequence or
 *      0xFF meaning s[i] is not the start of a UTF-8 sequence.
 */
@safe @nogc pure nothrow
uint stride(const scope char[] s, size_t i)
{
    return UTF8stride[s[i]];
}

/**
 * stride() returns the length of a UTF-16 sequence starting at index i
 * in string s.
 * Returns: 2 if s[i] is a high surrogate (D800..DBFF), otherwise 1.
 */
@safe @nogc pure nothrow
uint stride(const scope wchar[] s, size_t i)
{   uint u = s[i];
    return 1 + (u >= 0xD800 && u <= 0xDBFF);
}

/**
 * stride() returns the length of a UTF-32 sequence starting at index i
 * in string s.
 * Returns: The return value will always be 1.
 */
@safe @nogc pure nothrow
uint stride(const scope dchar[] s, size_t i)
{
    return 1;
}

/*******************************************
 * Given an index i into an array of characters s[],
 * and assuming that index i is at the start of a UTF character,
 * determine the number of UCS characters up to that index i.
 *
 * onUnicodeError() is called if stepping by stride() overshoots i.
 */
@safe pure
size_t toUCSindex(const scope char[] s, size_t i)
{
    size_t n;
    size_t j;

    for (j = 0; j < i; )
    {
        j += stride(s, j);
        n++;
    }
    if (j > i)
    {
        onUnicodeError("invalid UTF-8 sequence", j);
    }
    return n;
}

/** ditto */
@safe pure
size_t toUCSindex(const scope wchar[] s, size_t i)
{
    size_t n;
    size_t j;

    for (j = 0; j < i; )
    {
        j += stride(s, j);
        n++;
    }
    if (j > i)
    {
        onUnicodeError("invalid UTF-16 sequence", j);
    }
    return n;
}

/** ditto */
@safe @nogc pure nothrow
size_t toUCSindex(const scope dchar[] s, size_t i)
{
    return i;
}

/******************************************
 * Given a UCS index n into an array of characters s[], return the UTF index.
 */
@safe pure
size_t toUTFindex(const scope char[] s, size_t n)
{
    size_t i;

    while (n--)
    {
        uint j = UTF8stride[s[i]];
        if (j == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", i);
        i += j;
    }
    return i;
}

/** ditto */
@safe @nogc pure nothrow
size_t toUTFindex(const scope wchar[] s, size_t n)
{
    size_t i;

    while (n--)
    {   wchar u = s[i];

        i += 1 + (u >= 0xD800 && u <= 0xDBFF);
    }
    return i;
}

/** ditto */
@safe @nogc pure nothrow
size_t toUTFindex(const scope dchar[] s, size_t n)
{
    return n;
}

/* =================== Decode ======================= */

/***************
 * Decodes and returns character starting at s[idx]. idx is advanced past the
 * decoded character.
 *
 * This function does not throw. If the character is not well formed,
 * '?' is returned and idx remains unchanged; callers that loop over a
 * string must therefore check whether idx moved, otherwise they will
 * never terminate on malformed input.
 *
 * Params:
 *      s   = the code units to decode from
 *      idx = index of the first code unit of the character; must be
 *            less than s.length
 * Returns: the decoded character, or '?' on malformed input.
 */
@safe pure nothrow
dchar decode(const scope char[] s, ref size_t idx)
    in
    {
        assert(idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    do
    {
        size_t len = s.length;
        dchar V;
        size_t i = idx;
        char u = s[i];

        if (u & 0x80)
        {   uint n;
            char u2;

            /* The following encodings are valid, except for the 5 and 6 byte
             * combinations:
             *  0xxxxxxx
             *  110xxxxx 10xxxxxx
             *  1110xxxx 10xxxxxx 10xxxxxx
             *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
             *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
             *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
             */
            // Count the leading 1 bits of the first byte; that is the
            // total sequence length n.
            for (n = 1; ; n++)
            {
                if (n > 4)
                    goto Lerr;          // only do the first 4 of 6 encodings
                if (((u << n) & 0x80) == 0)
                {
                    if (n == 1)
                        goto Lerr;      // 10xxxxxx: stray continuation byte
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            V = cast(dchar)(u & ((1 << (7 - n)) - 1));

            if (i + (n - 1) >= len)
                goto Lerr;                      // off end of string

            /* The following combinations are overlong, and illegal:
             *  1100000x (10xxxxxx)
             *  11100000 100xxxxx (10xxxxxx)
             *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
             *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
             *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
             */
            u2 = s[i + 1];
            if ((u & 0xFE) == 0xC0 ||
                (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
                (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
                (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
                (u == 0xFC && (u2 & 0xFC) == 0x80))
                goto Lerr;                      // overlong combination

            for (uint j = 1; j != n; j++)
            {
                u = s[i + j];
                if ((u & 0xC0) != 0x80)
                    goto Lerr;                  // trailing bytes are 10xxxxxx
                V = (V << 6) | (u & 0x3F);
            }
            if (!isValidDchar(V))
                goto Lerr;
            i += n;
        }
        else
        {
            V = cast(dchar) u;
            i++;
        }

        idx = i;
        return V;

      Lerr:
        // Malformed input: idx is deliberately left untouched.
        return '?';
    }

unittest
{   size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    static s1 = "abcd"c;
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    static s2 = "\xC2\xA9"c;
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    static s3 = "\xE2\x89\xA0"c;
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    static s4 =
    [   "\xE2\x89"c[],          // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    // Malformed sequences must yield '?' and must not move the index.
    for (int j = 0; j < s4.length; j++)
    {
        i = 0;
        c = decode(s4[j], i);
        assert(c == cast(dchar)'?');
        assert(i == 0);
    }
}

/** ditto */
@safe pure nothrow
dchar decode(const scope wchar[] s, ref size_t idx)
    in
    {
        assert(idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    do
    {
        size_t i = idx;
        uint u = s[i];

        if (u & ~0x7F)
        {   if (u >= 0xD800 && u <= 0xDBFF)
            {   uint u2;

                if (i + 1 == s.length)
                    goto Lerr;  // surrogate UTF-16 high value past end of string
                u2 = s[i + 1];
                if (u2 < 0xDC00 || u2 > 0xDFFF)
                    goto Lerr;  // surrogate UTF-16 low value out of range
                // Combine the pair; 0xD7C0 == 0xD800 - (0x10000 >> 10).
                u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
                i += 2;
            }
            else if (u >= 0xDC00 && u <= 0xDFFF)
                goto Lerr;      // unpaired surrogate UTF-16 value
            else if (u == 0xFFFE || u == 0xFFFF)
                goto Lerr;      // illegal UTF-16 value
            else
                i++;
        }
        else
        {
            i++;
        }

        idx = i;
        return cast(dchar)u;

      Lerr:
        // Malformed input: idx is deliberately left untouched.
        return '?';
    }

/** ditto */
@safe pure nothrow
dchar decode(const scope dchar[] s, ref size_t idx)
    in
    {
        assert(idx < s.length);
    }
    do
    {
        immutable dchar c = s[idx];

        if (!isValidDchar(c))
            return '?';         // malformed input: idx is left untouched
        idx++;
        return c;
    }

/*
 * Decodes one character like decode(), but always advances idx.
 * When decode() reports malformed input (idx unchanged), exactly one code
 * unit is skipped and '?' is returned, so conversion loops terminate and
 * emit at most one character per input code unit.
 */
private dchar decodeLossy(C)(const scope C[] s, ref size_t idx) @safe pure nothrow
{
    immutable start = idx;
    immutable dchar c = decode(s, idx);
    if (idx == start)
        idx++;
    return c;
}


/* =================== Encode ======================= */

/*******************************
 * Encodes character c and appends it to array s[].
 *
 * Params:
 *      s = array to append to; reassigned to the extended array
 *      c = character to encode; must satisfy isValidDchar
 */
@safe pure nothrow
void encode(ref char[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        char[] r = s;

        if (c <= 0x7F)
        {
            r ~= cast(char) c;
        }
        else
        {
            char[4] buf = void;
            uint L;

            if (c <= 0x7FF)
            {
                buf[0] = cast(char)(0xC0 | (c >> 6));
                buf[1] = cast(char)(0x80 | (c & 0x3F));
                L = 2;
            }
            else if (c <= 0xFFFF)
            {
                buf[0] = cast(char)(0xE0 | (c >> 12));
                buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
                buf[2] = cast(char)(0x80 | (c & 0x3F));
                L = 3;
            }
            else if (c <= 0x10FFFF)
            {
                buf[0] = cast(char)(0xF0 | (c >> 18));
                buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
                buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
                buf[3] = cast(char)(0x80 | (c & 0x3F));
                L = 4;
            }
            else
            {
                assert(0);
            }
            r ~= buf[0 .. L];
        }
        s = r;
    }

unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] s = "abcd".dup;
    encode(s, cast(dchar)'a');
    assert(s.length == 5);
    assert(s == "abcda");

    encode(s, cast(dchar)'\u00A9');
    assert(s.length == 7);
    assert(s == "abcda\xC2\xA9");
    //assert(s == "abcda\u00A9");       // BUG: fix compiler

    encode(s, cast(dchar)'\u2260');
    assert(s.length == 10);
    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
}

/** ditto */
@safe pure nothrow
void encode(ref wchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        wchar[] r = s;

        if (c <= 0xFFFF)
        {
            r ~= cast(wchar) c;
        }
        else
        {
            // Outside the BMP: emit a high/low surrogate pair.
            wchar[2] buf = void;

            buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
            buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
            r ~= buf;
        }
        s = r;
    }

/** ditto */
@safe pure nothrow
void encode(ref dchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        s ~= c;
    }

/**
Returns the code length of $(D c) in the encoding using $(D C) as a
code point. The code is returned in character count, not in bytes.
 */
@safe pure nothrow @nogc
ubyte codeLength(C)(dchar c)
{
    static if (C.sizeof == 1)
    {
        if (c <= 0x7F) return 1;
        if (c <= 0x7FF) return 2;
        if (c <= 0xFFFF) return 3;
        if (c <= 0x10FFFF) return 4;
        assert(false);
    }
    else static if (C.sizeof == 2)
    {
        return c <= 0xFFFF ? 1 : 2;
    }
    else
    {
        static assert(C.sizeof == 4);
        return 1;
    }
}

/* =================== Validation ======================= */

/***********************************
Checks to see if string is well formed or not. $(D S) can be an array
 of $(D char), $(D wchar), or $(D dchar). Returns $(D false) if it is not.
 Use to check all untrusted input for correctness.
 */
@safe pure
bool isValidString(S)(const scope S s) nothrow
{
    auto len = s.length;
    for (size_t i = 0; i < len; )
    {
        immutable start = i;
        decode(s, i);
        // decode() does not throw; it reports malformed input by leaving
        // the index where it was.
        if (i == start)
            return false;
    }

    return true;
}

unittest
{
    assert(isValidString("hello"c));
    assert(isValidString("hel\u1234o"w));
    assert(isValidString("he\U000BAAAAllo"d));
    assert(isValidString(""c));

    assert(!isValidString("\xE2\x89"c));        // truncated sequence
    assert(!isValidString("ab\xC0\x8A"c));      // overlong encoding

    immutable wchar[] badW = [cast(wchar) 'a', cast(wchar) 0xDC00];
    assert(!isValidString(badW));               // unpaired low surrogate

    immutable dchar[] badD = [cast(dchar) 0xD800];
    assert(!isValidString(badD));               // surrogate code point
}

/* =================== Conversion to UTF8 ======================= */

/*******************
 * Encodes character c into buf as UTF-8.
 *
 * Params:
 *      buf = destination; must have room for the encoded character
 *            (up to 4 code units)
 *      c   = character to encode; must satisfy isValidDchar
 * Returns: the slice of buf holding the encoded character.
 */
@safe pure nothrow @nogc
char[] toUTF8(return scope char[] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        if (c <= 0x7F)
        {
            buf[0] = cast(char) c;
            return buf[0 .. 1];
        }
        else if (c <= 0x7FF)
        {
            buf[0] = cast(char)(0xC0 | (c >> 6));
            buf[1] = cast(char)(0x80 | (c & 0x3F));
            return buf[0 .. 2];
        }
        else if (c <= 0xFFFF)
        {
            buf[0] = cast(char)(0xE0 | (c >> 12));
            buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[2] = cast(char)(0x80 | (c & 0x3F));
            return buf[0 .. 3];
        }
        else if (c <= 0x10FFFF)
        {
            buf[0] = cast(char)(0xF0 | (c >> 18));
            buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[3] = cast(char)(0x80 | (c & 0x3F));
            return buf[0 .. 4];
        }
        assert(0);
    }

/*******************
 * Encodes string s into UTF-8 and returns the encoded string.
 *
 * In the wchar[] overload, malformed input code units are replaced by '?'.
 */
@safe pure nothrow
string toUTF8(return scope string s)
    in
    {
        assert(isValidString(s));
    }
    do
    {
        return s;
    }

/** ditto */
@trusted pure
string toUTF8(const scope wchar[] s)
{
    char[] r;
    size_t i;
    size_t slen = s.length;

    r.length = slen;

    for (i = 0; i < slen; i++)
    {   wchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;         // fast path for ascii
        else
        {
            // First non-ASCII unit: keep the ASCII prefix and transcode
            // the rest character by character.
            r.length = i;
            while (i < slen)
            {
                encode(r, decodeLossy(s, i));
            }
            break;
        }
    }
    return cast(string)r;
}

/** ditto */
@trusted pure
string toUTF8(const scope dchar[] s)
{
    char[] r;
    size_t i;
    size_t slen = s.length;

    r.length = slen;

    for (i = 0; i < slen; i++)
    {   dchar c = s[i];

        if (c <= 0x7F)
            r[i] = cast(char)c;         // fast path for ascii
        else
        {
            r.length = i;
            foreach (dchar d; s[i .. slen])
            {
                encode(r, d);
            }
            break;
        }
    }
    return cast(string)r;
}

/* =================== Conversion to UTF16 ======================= */

/****************
 * Encodes character c into buf as UTF-16.
 *
 * Params:
 *      buf = destination; must have room for the encoded character
 *            (up to 2 code units)
 *      c   = character to encode; must satisfy isValidDchar
 * Returns: the slice of buf holding the encoded character.
 */
@safe pure nothrow @nogc
wchar[] toUTF16(return scope wchar[] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        if (c <= 0xFFFF)
        {
            buf[0] = cast(wchar) c;
            return buf[0 .. 1];
        }
        else
        {
            buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
            buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
            return buf[0 .. 2];
        }
    }

/****************
 * Encodes string s into UTF-16 and returns the encoded string.
 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
 * an LPWSTR or LPCWSTR argument.
 *
 * In the char[] overloads, malformed input code units are replaced by '?'.
 */
@trusted pure
wstring toUTF16(const scope char[] s)
{
    wchar[] r;
    size_t slen = s.length;

    if (!__ctfe)
    {
        // Reserve still does a lot if slen is zero.
        // Return early for that case.
        if (0 == slen)
            return ""w;
        r.reserve(slen);
    }
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c <= 0x7F)
        {
            i++;
            r ~= cast(wchar)c;
        }
        else
        {
            c = decodeLossy(s, i);
            encode(r, c);
        }
    }
    return cast(wstring)r;
}

alias const(wchar)* wptr;
/** ditto */
@safe pure
wptr toUTF16z(const scope char[] s)
{
    wchar[] r;
    size_t slen = s.length;

    if (!__ctfe)
    {
        // Reserve still does a lot if slen is zero.
        // Return early for that case.
        if (0 == slen)
            return &"\0"w[0];
        r.reserve(slen + 1);
    }
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c <= 0x7F)
        {
            i++;
            r ~= cast(wchar)c;
        }
        else
        {
            c = decodeLossy(s, i);
            encode(r, c);
        }
    }
    r ~= '\000';
    return &r[0];
}

/** ditto */
@safe pure nothrow
wstring toUTF16(return scope wstring s)
    in
    {
        assert(isValidString(s));
    }
    do
    {
        return s;
    }

/** ditto */
@trusted pure nothrow
wstring toUTF16(const scope dchar[] s)
{
    wchar[] r;
    size_t slen = s.length;

    if (!__ctfe)
    {
        // Reserve still does a lot if slen is zero.
        // Return early for that case.
        if (0 == slen)
            return ""w;
        r.reserve(slen);
    }
    for (size_t i = 0; i < slen; i++)
    {
        encode(r, s[i]);
    }
    return cast(wstring)r;
}

/* =================== Conversion to UTF32 ======================= */

/*****
 * Encodes string s into UTF-32 and returns the encoded string.
 *
 * In the char[] and wchar[] overloads, malformed input code units are
 * replaced by '?'.
 */
@trusted pure
dstring toUTF32(const scope char[] s)
{
    dchar[] r;
    size_t slen = s.length;
    size_t j = 0;

    r.length = slen;        // r[] will never be longer than s[]
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c >= 0x80)
            c = decodeLossy(s, i);  // consumes at least one code unit
        else
            i++;                    // c is ascii, no need for decode
        r[j++] = c;
    }
    return cast(dstring)r[0 .. j];
}

/** ditto */
@trusted pure
dstring toUTF32(const scope wchar[] s)
{
    dchar[] r;
    size_t slen = s.length;
    size_t j = 0;

    r.length = slen;        // r[] will never be longer than s[]
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c >= 0x80)
            c = decodeLossy(s, i);  // consumes at least one code unit
        else
            i++;                    // c is ascii, no need for decode
        r[j++] = c;
    }
    return cast(dstring)r[0 .. j];
}

/** ditto */
@safe pure nothrow
dstring toUTF32(return scope dstring s)
    in
    {
        assert(isValidString(s));
    }
    do
    {
        return s;
    }

/* ================================ tests ================================== */

unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    auto c = "hello"c[];
    auto w = toUTF16(c);
    assert(w == "hello");
    auto d = toUTF32(c);
    assert(d == "hello");

    c = toUTF8(w);
    assert(c == "hello");
    d = toUTF32(w);
    assert(d == "hello");

    c = toUTF8(d);
    assert(c == "hello");
    w = toUTF16(d);
    assert(w == "hello");


    c = "hel\u1234o";
    w = toUTF16(c);
    assert(w == "hel\u1234o");
    d = toUTF32(c);
    assert(d == "hel\u1234o");

    c = toUTF8(w);
    assert(c == "hel\u1234o");
    d = toUTF32(w);
    assert(d == "hel\u1234o");

    c = toUTF8(d);
    assert(c == "hel\u1234o");
    w = toUTF16(d);
    assert(w == "hel\u1234o");


    c = "he\U000BAAAAllo";
    w = toUTF16(c);
    //foreach (wchar c; w) printf("c = x%x\n", c);
    //foreach (wchar c; cast(wstring)"he\U000BAAAAllo") printf("c = x%x\n", c);
    assert(w == "he\U000BAAAAllo");
    d = toUTF32(c);
    assert(d == "he\U000BAAAAllo");

    c = toUTF8(w);
    assert(c == "he\U000BAAAAllo");
    d = toUTF32(w);
    assert(d == "he\U000BAAAAllo");

    c = toUTF8(d);
    assert(c == "he\U000BAAAAllo");
    w = toUTF16(d);
    assert(w == "he\U000BAAAAllo");

    wchar[2] buf;
    auto ret = toUTF16(buf, '\U000BAAAA');
    assert(ret == "\U000BAAAA");
}

// Malformed input must terminate and be replaced by '?', one per bad unit.
unittest
{
    assert(toUTF16("a\xFFb"c) == "a?b"w);
    assert(toUTF32("a\xFFb"c) == "a?b"d);
    assert(toUTF32("\xE2\x89"c) == "??"d);

    auto z = toUTF16z("\xFF"c);
    assert(z[0] == '?' && z[1] == 0);

    immutable wchar[] badW = [cast(wchar) 'a', cast(wchar) 0xDC00, cast(wchar) 'b'];
    assert(toUTF8(badW) == "a?b");
    assert(toUTF32(badW) == "a?b"d);
}