core.internal.utf source code

1 /********************************************
2  * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
3  *
4  * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
5  * wchar type.
6  * For Posix systems, the C wchar_t type is UTF-32 and corresponds to
7  * the D utf.dchar type.
8  *
9  * UTF character support is restricted to (\u0000 &lt;= character &lt;= \U0010FFFF).
10  *
11  * See_Also:
12  *      $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
13  *      $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
14  *      $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
15  *
16  * Copyright: Copyright Digital Mars 2003 - 2016.
17  * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
18  * Authors:   Walter Bright, Sean Kelly
19  * Source:    $(DRUNTIMESRC core/internal/_utf.d)
20  */
21 
22 module core.internal.utf;
// version (CRuntime_LIBWASM): this module was changed so that most of its functions are nothrow.
24 
25 extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__, size_t line = __LINE__ ) @safe pure;
26 
/*******************************
 * Tests whether c is a valid UTF-32 character.
 *
 * A value is accepted when it does not exceed \U0010FFFF and does not lie
 * in the surrogate range 0xD800 .. 0xDFFF.
 *
 * \uFFFE and \uFFFF are considered valid by this function,
 * as they are permitted for internal use by an application,
 * but they are not allowed for interchange by the Unicode standard
 * (thanks to Arcane Jill).
 *
 * Params:
 *      c = the character to test
 * Returns: true if it is valid, false if not.
 */

@safe @nogc pure nothrow
bool isValidDchar(dchar c)
{
    // Anything past the last code point is rejected outright.
    if (c > 0x10FFFF)
        return false;

    // Inside the code space only the surrogate block is excluded;
    // 0xFFFE and 0xFFFF are deliberately let through.
    immutable bool isSurrogate = c >= 0xD800 && c <= 0xDFFF;
    return !isSurrogate;
}
49 
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");

    // An ordinary ASCII letter is a valid character.
    immutable dchar asciiLetter = cast(dchar)'a';
    assert(isValidDchar(asciiLetter));

    // 0x1FFFFF is larger than 0x10FFFF and must be rejected.
    immutable dchar beyondRange = cast(dchar)0x1FFFFF;
    assert(!isValidDchar(beyondRange));
}
56 
57 
58 
/* Lookup table with one entry per possible value of a UTF-8 code unit
 * (256 entries, indexed by the code unit itself).
 * An entry is the length in bytes of the sequence that the code unit would
 * introduce, or 0xFF when the code unit is not the start of a sequence.
 */
static immutable UTF8stride =
[
    cast(ubyte)
    // 0x00 .. 0x7F: one-byte sequences
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    // 0x80 .. 0xBF: not the start of a sequence
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    // 0xC0 .. 0xDF: two-byte sequences
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    // 0xE0 .. 0xEF: three-byte sequences
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    // 0xF0 .. 0xF7: four; 0xF8 .. 0xFB: five; 0xFC .. 0xFD: six; 0xFE, 0xFF: not a start
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];
79 
/**
 * stride() returns the length of a UTF-8 sequence starting at index i
 * in string s.
 * Params:
 *      s = the UTF-8 code units to inspect
 *      i = index of the code unit at which the sequence starts
 * Returns:
 *      The number of bytes in the UTF-8 sequence or
 *      0xFF meaning s[i] is not the start of a UTF-8 sequence.
 */
@safe @nogc pure nothrow
uint stride(const scope char[] s, size_t i)
{
    // The table is keyed by the raw value of the leading code unit.
    immutable char lead = s[i];
    return UTF8stride[lead];
}
92 
/**
 * stride() returns the length of a UTF-16 sequence starting at index i
 * in string s.
 * Params:
 *      s = the UTF-16 code units to inspect
 *      i = index of the code unit at which the sequence starts
 * Returns:
 *      2 if s[i] lies in the range 0xD800 .. 0xDBFF (the first half of a
 *      surrogate pair), otherwise 1.
 */
@safe @nogc pure nothrow
uint stride(const scope wchar[] s, size_t i)
{
    immutable uint unit = s[i];
    immutable bool startsPair = unit >= 0xD800 && unit <= 0xDBFF;
    return startsPair ? 2 : 1;
}
102 
/**
 * stride() returns the length of a UTF-32 sequence starting at index i
 * in string s.
 * Params:
 *      s = the UTF-32 code units (not examined)
 *      i = index of the code unit at which the sequence starts (not examined)
 * Returns: The return value will always be 1.
 */
@safe @nogc pure nothrow
uint stride(const scope dchar[] s, size_t i)
{
    // The answer does not depend on the contents of s or on i.
    enum uint unitsPerSequence = 1;
    return unitsPerSequence;
}
113 
/*******************************************
 * Given an index i into an array of characters s[],
 * and assuming that index i is at the start of a UTF character,
 * determine the number of UCS characters up to that index i.
 *
 * onUnicodeError is called when the data in front of index i cannot be
 * walked sequence by sequence so as to end exactly at i.
 *
 * Params:
 *      s = the encoded characters
 *      i = code unit index into s
 * Returns: the number of characters that precede index i.
 */
@safe pure
size_t toUCSindex(const scope char[] s, size_t i)
{
    size_t n;
    size_t j;

    while (j < i)
    {
        immutable uint len = stride(s, j);

        // 0xFF is stride()'s marker for "s[j] cannot start a sequence".
        // It must be reported here, never added to j as if it were a
        // length: doing so would skip 255 code units and could either hide
        // the error or run past the end of s.
        if (len == 0xFF)
        {
            onUnicodeError("invalid UTF-8 sequence", j);
            return n;
        }

        j += len;
        n++;
    }

    // Walking whole sequences stepped over i: i is inside a sequence.
    if (j > i)
    {
        onUnicodeError("invalid UTF-8 sequence", j);
    }
    return n;
}
136 
/** ditto */
// UTF-16 overload: walks s one sequence at a time (one or two code units,
// as reported by stride) and counts the sequences that start before i.
@safe pure
size_t toUCSindex(const scope wchar[] s, size_t i)
{
    size_t count = 0;
    size_t pos = 0;

    while (pos < i)
    {
        pos += stride(s, pos);
        ++count;
    }

    // Stepping past i means i did not fall on the start of a sequence.
    if (pos > i)
        onUnicodeError("invalid UTF-16 sequence", pos);

    return count;
}
155 
/** ditto */
// UTF-32 overload: the index is returned as is; s is not examined.
@safe @nogc pure nothrow
size_t toUCSindex(const scope dchar[] s, size_t i)
{
    immutable size_t ucsIndex = i;
    return ucsIndex;
}
162 
/******************************************
 * Given a UCS index n into an array of characters s[], return the UTF index.
 *
 * Params:
 *      s = the encoded characters
 *      n = number of characters to step over, counted from the start of s
 * Returns: the code unit index in s reached after stepping over n characters.
 */
@safe pure
size_t toUTFindex(const scope char[] s, size_t n)
{
    size_t i = 0;

    for (size_t remaining = n; remaining != 0; --remaining)
    {
        immutable uint len = UTF8stride[s[i]];

        // 0xFF marks a code unit that cannot start a sequence.
        if (len == 0xFF)
            onUnicodeError("invalid UTF-8 sequence", i);

        i += len;
    }
    return i;
}
180 
/** ditto */
// UTF-16 overload: each character takes two code units when its first unit
// lies in 0xD800 .. 0xDBFF, and one code unit otherwise.
@safe @nogc pure nothrow
size_t toUTFindex(const scope wchar[] s, size_t n)
{
    size_t i = 0;

    for (size_t remaining = n; remaining != 0; --remaining)
    {
        immutable wchar unit = s[i];
        immutable bool startsPair = unit >= 0xD800 && unit <= 0xDBFF;

        i += startsPair ? 2 : 1;
    }
    return i;
}
194 
/** ditto */
// UTF-32 overload: the index is returned as is; s is not examined.
@safe @nogc pure nothrow
size_t toUTFindex(const scope dchar[] s, size_t n)
{
    immutable size_t utfIndex = n;
    return utfIndex;
}
201 
202 /* =================== Decode ======================= */
203 
/***************
 * Decodes and returns the character starting at s[idx]. On success idx is
 * advanced past the decoded character.
 *
 * This function is nothrow: a character that is not well formed is not
 * reported with an exception. Instead '?' is returned and idx remains
 * unchanged, so a caller can recognise the failure by checking whether idx
 * moved (a successfully decoded '?' always advances idx).
 *
 * Params:
 *      s   = the encoded characters
 *      idx = index in s at which the character starts; must be less than
 *            s.length
 * Returns: the decoded character, or '?' if it is not well formed.
 */
@safe pure nothrow
dchar decode(const scope char[] s, ref size_t idx)
    in
    {
        assert(idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    do
    {
        immutable size_t len = s.length;
        immutable size_t i = idx;
        immutable char lead = s[i];

        // Single-byte (ASCII) character.
        if ((lead & 0x80) == 0)
        {
            idx = i + 1;
            return cast(dchar) lead;
        }

        /* The following encodings are valid, except for the 5 and 6 byte
         * combinations:
         *  0xxxxxxx
         *  110xxxxx 10xxxxxx
         *  1110xxxx 10xxxxxx 10xxxxxx
         *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         */
        // n becomes the number of leading 1 bits of the first byte, which is
        // the total length of the sequence.
        uint n;
        for (n = 1; ; n++)
        {
            if (n > 4)
                return '?';             // only do the first 4 of 6 encodings
            if (((lead << n) & 0x80) == 0)
            {
                if (n == 1)
                    return '?';         // 10xxxxxx cannot start a sequence
                break;
            }
        }

        // Pick off (7 - n) significant bits of B from first byte of octet
        dchar V = cast(dchar)(lead & ((1 << (7 - n)) - 1));

        if (i + (n - 1) >= len)
            return '?';                 // off end of string

        /* The following combinations are overlong, and illegal:
         *  1100000x (10xxxxxx)
         *  11100000 100xxxxx (10xxxxxx)
         *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
         * (The 5 and 6 byte forms need no check here: they were already
         * rejected by the n > 4 test above.)
         */
        immutable char second = s[i + 1];
        if ((lead & 0xFE) == 0xC0 ||
            (lead == 0xE0 && (second & 0xE0) == 0x80) ||
            (lead == 0xF0 && (second & 0xF0) == 0x80))
            return '?';                 // overlong combination

        for (uint j = 1; j != n; j++)
        {
            immutable char trail = s[i + j];
            if ((trail & 0xC0) != 0x80)
                return '?';             // trailing bytes are 10xxxxxx
            V = (V << 6) | (trail & 0x3F);
        }

        // Rejects surrogates and values above 0x10FFFF.
        if (!isValidDchar(V))
            return '?';

        idx = i + n;
        return V;
    }
296 
unittest
{   size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    // Well-formed input: the character is returned and i moves past it.
    static s1 = "abcd"c;
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    static s2 = "\xC2\xA9"c;
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    static s3 = "\xE2\x89\xA0"c;
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    // Malformed input. decode is nothrow, so a try/catch around it cannot
    // observe anything (the old form of this test passed vacuously because
    // its own assert(0) was swallowed by catch (Throwable)). The error
    // signal is checked directly instead: '?' is returned and the index is
    // left where it was.
    static s4 =
    [   "\xE2\x89"c[],          // too short
        "\xC0\x8A",             // overlong forms and 5/6 byte encodings
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    foreach (malformed; s4)
    {
        i = 0;
        c = decode(malformed, i);
        assert(c == cast(dchar)'?');
        assert(i == 0);
    }
}
348 
/** ditto */
// UTF-16 overload. The following code units are treated as not well formed:
//  - a value in 0xD800 .. 0xDBFF that is the last code unit of s, or that
//    is not followed by a value in 0xDC00 .. 0xDFFF
//  - a value in 0xDC00 .. 0xDFFF that does not follow such a value
//  - 0xFFFE and 0xFFFF
@safe pure nothrow
dchar decode(const scope wchar[] s, ref size_t idx)
    in
    {
        assert(idx < s.length);
    }
    out (result)
    {
        assert(isValidDchar(result));
    }
    do
    {
        size_t i = idx;
        uint u = s[i];

        if (u >= 0xD800 && u <= 0xDBFF)
        {
            // First half of a surrogate pair: the second half must follow.
            if (i + 1 == s.length)
                return '?';     // surrogate UTF-16 high value past end of string

            immutable uint u2 = s[i + 1];
            if (u2 < 0xDC00 || u2 > 0xDFFF)
                return '?';     // surrogate UTF-16 low value out of range

            // Combine both halves into one code point.
            u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
            i += 2;
        }
        else if (u >= 0xDC00 && u <= 0xDFFF)
        {
            return '?';         // unpaired surrogate UTF-16 value
        }
        else if (u == 0xFFFE || u == 0xFFFF)
        {
            return '?';         // illegal UTF-16 value
        }
        else
        {
            // Any other code unit stands for itself.
            i++;
        }

        idx = i;
        return cast(dchar)u;
    }
406 
/** ditto */
// UTF-32 overload: s[idx] is returned as is when isValidDchar accepts it;
// otherwise '?' is returned and idx is not advanced.
@safe pure nothrow
dchar decode(const scope dchar[] s, ref size_t idx)
    in
    {
        assert(idx < s.length);
    }
    do
    {
        immutable size_t i = idx;
        immutable dchar c = s[i];

        if (!isValidDchar(c))
            return '?';

        idx = i + 1;
        return c;
    }
428 
429 
430 /* =================== Encode ======================= */
431 
/*******************************
 * Encodes character c and appends it to array s[].
 *
 * Params:
 *      s = the array that receives the encoded character
 *      c = the character to encode; must satisfy isValidDchar
 */
@safe pure nothrow
void encode(ref char[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        // One code unit is enough for ASCII.
        if (c <= 0x7F)
        {
            s ~= cast(char) c;
            return;
        }

        // Everything else is assembled in a scratch buffer and appended
        // in a single step.
        char[4] units = void;
        size_t used;

        if (c <= 0x7FF)
        {
            units[0] = cast(char)(0xC0 | (c >> 6));
            units[1] = cast(char)(0x80 | (c & 0x3F));
            used = 2;
        }
        else if (c <= 0xFFFF)
        {
            units[0] = cast(char)(0xE0 | (c >> 12));
            units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            units[2] = cast(char)(0x80 | (c & 0x3F));
            used = 3;
        }
        else if (c <= 0x10FFFF)
        {
            units[0] = cast(char)(0xF0 | (c >> 18));
            units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            units[3] = cast(char)(0x80 | (c & 0x3F));
            used = 4;
        }
        else
        {
            assert(0);
        }

        s ~= units[0 .. used];
    }
483 
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] text = "abcd".dup;

    // A character that needs one code unit.
    encode(text, cast(dchar)'a');
    assert(text == "abcda");
    assert(text.length == 5);

    // A character that needs two code units.
    encode(text, cast(dchar)'\u00A9');
    assert(text == "abcda\xC2\xA9");
    assert(text.length == 7);

    // A character that needs three code units.
    encode(text, cast(dchar)'\u2260');
    assert(text == "abcda\xC2\xA9\xE2\x89\xA0");
    assert(text.length == 10);
}
502 
/** ditto */
// UTF-16 overload: characters up to 0xFFFF are appended as one code unit,
// larger ones as a pair of code units.
@safe pure nothrow
void encode(ref wchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        if (c <= 0xFFFF)
        {
            s ~= cast(wchar) c;
            return;
        }

        // Split the offset above 0x10000 into two 10-bit halves.
        immutable uint offset = c - 0x10000;
        wchar[2] pair = void;

        pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
        pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
        s ~= pair;
    }
528 
/** ditto */
// UTF-32 overload: the character is appended unchanged as one code unit.
@safe pure nothrow
void encode(ref dchar[] s, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        s ~= c;
    }
540 
/**
Returns the code length of $(D c) in the encoding using $(D C) as a
code point. The code is returned in character count, not in bytes.

Params:
    C = the code unit type; its size (1, 2 or 4 bytes) selects the encoding
    c = the character whose encoded length is wanted
Returns: the number of code units of type $(D C) needed for $(D c).
 */
@safe pure nothrow @nogc
ubyte codeLength(C)(dchar c)
{
    static if (C.sizeof == 1)
    {
        // Nothing above 0x10FFFF can be encoded.
        if (c > 0x10FFFF)
            assert(false);

        // Test from the longest form down to the shortest.
        if (c > 0xFFFF)
            return 4;
        if (c > 0x7FF)
            return 3;
        if (c > 0x7F)
            return 2;
        return 1;
    }
    else static if (C.sizeof == 2)
    {
        if (c <= 0xFFFF)
            return 1;
        return 2;
    }
    else
    {
        static assert(C.sizeof == 4);
        return 1;
    }
}
566 
567 /* =================== Validation ======================= */
568 
/***********************************
Checks to see if string is well formed or not. $(D S) can be an array
 of $(D char), $(D wchar), or $(D dchar). Returns $(D false) if it is not.
 Use to check all untrusted input for correctness.

 Params:
    s = the string to check
 Returns: $(D true) if every character in $(D s) decodes, $(D false) otherwise.
 */
@safe pure
bool isValidString(S)(const scope S s) nothrow
{
    immutable len = s.length;
    for (size_t i = 0; i < len; )
    {
        // decode does not throw. It reports a character that is not well
        // formed by leaving the index where it was, so that is what has to
        // be tested here. Ignoring it would both hide the error and never
        // terminate, because i would stop advancing.
        immutable size_t start = i;
        decode(s, i);
        if (i == start)
            return false;
    }

    return true;
}
588 
589 /* =================== Conversion to UTF8 ======================= */
590 
/*******************
 * Encodes character c as UTF-8 into the start of buf.
 *
 * Params:
 *      buf = receives the code units; between 1 and 4 of them are written,
 *            starting at buf[0]
 *      c   = the character to encode; must satisfy isValidDchar
 * Returns: the slice of buf that holds the encoded character.
 */
@safe pure nothrow @nogc
char[] toUTF8(return scope char[] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        // One code unit.
        if (c <= 0x7F)
        {
            buf[0] = cast(char) c;
            return buf[0 .. 1];
        }

        // Two code units.
        if (c <= 0x7FF)
        {
            buf[0] = cast(char)(0xC0 | (c >> 6));
            buf[1] = cast(char)(0x80 | (c & 0x3F));
            return buf[0 .. 2];
        }

        // Three code units.
        if (c <= 0xFFFF)
        {
            buf[0] = cast(char)(0xE0 | (c >> 12));
            buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[2] = cast(char)(0x80 | (c & 0x3F));
            return buf[0 .. 3];
        }

        // Four code units.
        if (c <= 0x10FFFF)
        {
            buf[0] = cast(char)(0xF0 | (c >> 18));
            buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            buf[3] = cast(char)(0x80 | (c & 0x3F));
            return buf[0 .. 4];
        }

        // Larger values cannot be encoded.
        assert(0);
    }
627 
/*******************
 * Encodes string s into UTF-8 and returns the encoded string.
 *
 * Params:
 *      s = the string to convert
 * Returns: the UTF-8 encoded string.
 */
@safe pure nothrow
string toUTF8(return scope string s)
    in
    {
        assert(isValidString(s));
    }
    do
    {
        // s already is UTF-8: it is handed back without copying.
        return s;
    }
641 
/** ditto */
// UTF-16 overload: the leading run of ASCII code units is copied directly;
// from the first non-ASCII code unit onwards the rest of s is iterated as
// dchar and encoded one character at a time.
@trusted pure
string toUTF8(const scope wchar[] s)
{
    immutable size_t total = s.length;
    char[] r;
    r.length = total;

    // fast path for ascii
    size_t pos = 0;
    while (pos < total && s[pos] <= 0x7F)
    {
        r[pos] = cast(char)s[pos];
        pos++;
    }

    if (pos < total)
    {
        // Keep only what was copied so far, then append the remainder.
        r.length = pos;
        foreach (dchar ch; s[pos .. total])
        {
            encode(r, ch);
        }
    }
    return cast(string)r;
}
669 
/** ditto */
// UTF-32 overload: the leading run of ASCII code units is copied directly;
// from the first non-ASCII code unit onwards every character is encoded.
@trusted pure
string toUTF8(const scope dchar[] s)
{
    immutable size_t total = s.length;
    char[] r;
    r.length = total;

    // fast path for ascii
    size_t pos = 0;
    while (pos < total && s[pos] <= 0x7F)
    {
        r[pos] = cast(char)s[pos];
        pos++;
    }

    if (pos < total)
    {
        // Keep only what was copied so far, then append the remainder.
        r.length = pos;
        foreach (dchar d; s[pos .. total])
        {
            encode(r, d);
        }
    }
    return cast(string)r;
}
697 
698 /* =================== Conversion to UTF16 ======================= */
699 
/****************
 * Encodes character c as UTF-16 into the start of buf.
 *
 * Params:
 *      buf = receives the code units; 1 or 2 of them are written,
 *            starting at buf[0]
 *      c   = the character to encode; must satisfy isValidDchar
 * Returns: the slice of buf that holds the encoded character.
 */
@safe pure nothrow @nogc
wchar[] toUTF16(return scope wchar[] buf, dchar c)
    in
    {
        assert(isValidDchar(c));
    }
    do
    {
        // One code unit.
        if (c <= 0xFFFF)
        {
            buf[0] = cast(wchar) c;
            return buf[0 .. 1];
        }

        // Two code units: split the offset above 0x10000 into 10-bit halves.
        immutable uint offset = c - 0x10000;
        buf[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
        return buf[0 .. 2];
    }
720 
/****************
 * Encodes string s into UTF-16 and returns the encoded string.
 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
 * an LPWSTR or LPCWSTR argument.
 *
 * Params:
 *      s = the string to convert
 * Returns: the UTF-16 encoded string.
 */
@trusted pure
wstring toUTF16(const scope char[] s)
{
    wchar[] r;
    immutable size_t slen = s.length;

    if (!__ctfe)
    {
        // Reserve still does a lot if slen is zero.
        // Return early for that case.
        if (0 == slen)
            return ""w;
        r.reserve(slen);
    }
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c <= 0x7F)
        {
            i++;
            r ~= cast(wchar)c;
        }
        else
        {
            // decode reports a malformed sequence by returning '?' WITHOUT
            // advancing i. Step over the offending code unit in that case;
            // otherwise the loop would never end and would append '?'
            // until memory runs out.
            immutable size_t start = i;
            c = decode(s, i);
            if (i == start)
                i++;
            encode(r, c);
        }
    }
    return cast(wstring)r;
}
756 
// Pointer to read-only UTF-16 code units, as returned by toUTF16z.
alias wptr = const(wchar)*;
/** ditto */
// Converts UTF-8 to UTF-16 and returns a pointer to the first code unit of
// the result, which always ends with a zero code unit.
@safe pure
wptr toUTF16z(const scope char[] s)
{
    wchar[] r;
    immutable size_t slen = s.length;

    if (!__ctfe)
    {
        // Reserve still does a lot if slen is zero.
        // Return early for that case.
        if (0 == slen)
            return &"\0"w[0];
        r.reserve(slen + 1);
    }
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c <= 0x7F)
        {
            i++;
            r ~= cast(wchar)c;
        }
        else
        {
            // decode reports a malformed sequence by returning '?' WITHOUT
            // advancing i. Step over the offending code unit in that case;
            // otherwise the loop would never end and would append '?'
            // until memory runs out.
            immutable size_t start = i;
            c = decode(s, i);
            if (i == start)
                i++;
            encode(r, c);
        }
    }
    r ~= '\000';
    return &r[0];
}
790 
/** ditto */
// UTF-16 overload: s already has the requested encoding.
@safe pure nothrow
wstring toUTF16(return scope wstring s)
    in
    {
        assert(isValidString(s));
    }
    do
    {
        // Handed back without copying.
        return s;
    }
802 
/** ditto */
// UTF-32 overload: every element of s is encoded in turn.
@trusted pure nothrow
wstring toUTF16(const scope dchar[] s)
{
    wchar[] r;
    immutable size_t total = s.length;

    if (!__ctfe)
    {
        // Reserve still does a lot if the input is empty.
        // Return early for that case.
        if (total == 0)
            return ""w;
        r.reserve(total);
    }

    foreach (dchar c; s)
        encode(r, c);

    return cast(wstring)r;
}
824 
825 /* =================== Conversion to UTF32 ======================= */
826 
/*****
 * Encodes string s into UTF-32 and returns the encoded string.
 *
 * Params:
 *      s = the string to convert
 * Returns: the UTF-32 encoded string.
 */
@trusted pure
dstring toUTF32(const scope char[] s)
{
    dchar[] r;
    immutable size_t slen = s.length;
    size_t j = 0;

    r.length = slen;            // r[] will never be longer than s[]
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c >= 0x80)
        {
            // decode reports a malformed sequence by returning '?' WITHOUT
            // advancing i. Step over the offending code unit in that case;
            // otherwise i would stall while j keeps growing past the end
            // of r[].
            immutable size_t start = i;
            c = decode(s, i);
            if (i == start)
                i++;
        }
        else
            i++;                // c is ascii, no need for decode
        r[j++] = c;
    }
    return cast(dstring)r[0 .. j];
}
849 
/** ditto */
// UTF-16 overload.
@trusted pure
dstring toUTF32(const scope wchar[] s)
{
    dchar[] r;
    immutable size_t slen = s.length;
    size_t j = 0;

    r.length = slen;            // r[] will never be longer than s[]
    for (size_t i = 0; i < slen; )
    {
        dchar c = s[i];
        if (c >= 0x80)
        {
            // decode reports a code unit that is not well formed by
            // returning '?' WITHOUT advancing i. Step over it in that case;
            // otherwise i would stall while j keeps growing past the end
            // of r[].
            immutable size_t start = i;
            c = decode(s, i);
            if (i == start)
                i++;
        }
        else
            i++;                // c is ascii, no need for decode
        r[j++] = c;
    }
    return cast(dstring)r[0 .. j];
}
870 
/** ditto */
// UTF-32 overload: s already has the requested encoding.
@safe pure nothrow
dstring toUTF32(return scope dstring s)
    in
    {
        assert(isValidString(s));
    }
    do
    {
        // Handed back without copying.
        return s;
    }
882 
883 /* ================================ tests ================================== */
884 
unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    // Converts the same text between all three encodings, in every
    // direction, and compares each result with the expected form.
    static void checkConversions(string s8, wstring s16, dstring s32)
    {
        // from UTF-8
        assert(toUTF16(s8) == s16);
        assert(toUTF32(s8) == s32);

        // from UTF-16
        assert(toUTF8(s16) == s8);
        assert(toUTF32(s16) == s32);

        // from UTF-32
        assert(toUTF8(s32) == s8);
        assert(toUTF16(s32) == s16);
    }

    // ASCII only
    checkConversions("hello", "hello"w, "hello"d);

    // contains a character that takes one UTF-16 code unit
    checkConversions("hel\u1234o", "hel\u1234o"w, "hel\u1234o"d);

    // contains a character that takes two UTF-16 code units
    checkConversions("he\U000BAAAAllo", "he\U000BAAAAllo"w, "he\U000BAAAAllo"d);

    // single character into a caller supplied buffer
    wchar[2] pairBuf;
    auto encoded = toUTF16(pairBuf, '\U000BAAAA');
    assert(encoded == "\U000BAAAA");
}