1 /*******************************************************************************
2  * 
3  * Converts between UTF-8 and UTF-16.
4  * 
5  * Authors:
6  *   $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise)
7  * 
8  * Copyright:
9  *   © 2013-2023 $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise), $(LINK2 mailto:etienne@cimons.com, Etienne Cimon)
10  * 
11  * License:
12  *   $(LINK2 https://mit-license.org/, The MIT License (MIT))
13  * 
14  **************************************/
15 module fast.cstring;
16 @nogc nothrow:
17 
18 //import core.stdc.stdlib;
19 //import core.stdc.string;
20 //import std.traits;
21 import fast.buffer;
22 
/**
 * Converts a UTF-8 string to UTF-16 using a buffer provided by the user.
 * To get the buffer requirements call $(D string2wstringSize) on your source buffer.
 *
 * Code points above U+FFFF are written as a surrogate pair. The input is expected
 * to be well-formed UTF-8; malformed input is not reported, but decoding never
 * reads beyond the end of $(D src).
 *
 * Params:
 *   src = The UTF-8 string to convert.
 *   dst = The destination buffer for the conversion. It must have room for
 *         $(D src.length + 1) wchars.
 *
 * Returns:
 *   The part of the destination buffer used for the conversion as a $(D wchar[]).
 *   A terminating zero is appended, so the result.ptr can be passed into Windows APIs.
 */
pure
wchar[] string2wstring(in char[] src, wchar* dst)
{
	const char* srcEnd = src.ptr + src.length;
	const(char)* srcIt = src.ptr;
	wchar* dstIt = dst;

	// '<' instead of '!is', so the loop also ends if an iteration stops exactly at the end.
	while (srcIt < srcEnd)
	{
		// The number of leading 1-bits in the first byte tells how long the
		// byte sequence is (0 for plain ASCII).
		int len = 0;
		uint mask = 0b1000_0000;
		while (*srcIt & mask)
		{
			mask >>= 1;
			len++;
		}

		// get payload of first byte
		dchar ch = *srcIt++ & (mask - 1);

		// Append the continuation bytes, but never run past the end of the
		// source when the last sequence is truncated.
		while (--len > 0 && srcIt < srcEnd)
		{
			// make space for 6 more bits
			ch <<= 6;
			ch |= *srcIt++ & 0b0011_1111;
		}

		// do we need to store a surrogate pair ?
		if (ch > wchar.max)
		{
			// UTF-16 encodes the offset from 0x10000, split into two 10-bit halves.
			ch -= 0x1_0000;
			*dstIt++ = cast(wchar)(0xD800 | (ch >> 10));
			*dstIt++ = cast(wchar)(0xDC00 | (ch & 0b11_1111_1111));
		}
		else
		{
			*dstIt++ = cast(wchar) ch;
		}
	}
	*dstIt = 0;

	return dst[0 .. dstIt - dst];
}
82 
/**
 * Computes how many bytes a destination buffer needs so that $(D string2wstring)
 * can convert the given string into it. The terminating '\0' is accounted for.
 *
 * Params:
 *   src = The source string.
 *
 * Returns:
 *   One $(D wchar) per source byte plus one for the terminator, expressed in bytes.
 *   If that product would not fit into a $(D size_t), $(D size_t.max) is returned.
 *
 * See_Also:
 *   string2wstring
 */
@safe pure
size_t string2wstringSize(in char[] src)
{
	// Largest source length for which (length + 1) * wchar.sizeof does not overflow.
	enum maxLength = size_t.max / wchar.sizeof - 1;

	if (src.length > maxLength)
		return size_t.max;
	return wchar.sizeof * (src.length + 1);
}
103 
/**
 * Converts a UTF-16 string to UTF-8 using a buffer provided by the user.
 * To get the buffer requirements call $(D wstring2stringSize) on your source buffer.
 *
 * A high surrogate that is directly followed by a low surrogate is decoded as one
 * code point and written as a 4 byte sequence. Any other surrogate (unpaired, or a
 * high surrogate at the very end of the input) is written as a 3 byte sequence of
 * its own, so the function never reads beyond the end of $(D src).
 *
 * Params:
 *   src = The UTF-16 string to convert.
 *   dst = The destination buffer for the conversion. It must have room for
 *         $(D 3 * src.length + 1) chars.
 *
 * Returns:
 *   The part of the destination buffer used for the conversion as a $(D char[]).
 *   A terminating zero is appended, so the result.ptr can be passed into C APIs.
 */
pure
char[] wstring2string(in wchar[] src, char* dst)
{
	const wchar* srcEnd = src.ptr + src.length;
	const(wchar)* srcIt = src.ptr;
	char* dstIt = dst;

	while (srcIt < srcEnd)
	{
		const wchar unit = *srcIt++;

		if (unit < 0x80)
		{
			// plain ASCII
			*dstIt++ = cast(char) unit;
		}
		else if (unit < 0x800)
		{
			*dstIt++ = cast(char)(0b_11000000 | unit >> 6);
			*dstIt++ = cast(char)(0b_10000000 | 0b_00111111 & unit);
		}
		else if (unit >= 0xD800 && unit <= 0xDBFF
			&& srcIt < srcEnd && *srcIt >= 0xDC00 && *srcIt <= 0xDFFF)
		{
			// Surrogate pair: both halves carry 10 bits of the offset from 0x10000.
			const uint high = unit & 0b11_1111_1111;
			const uint low = *srcIt++ & 0b11_1111_1111;
			const uint decoded = 0x1_0000 + ((high << 10) | low);
			*dstIt++ = cast(char)(0b_11110000 | decoded >> 18);
			*dstIt++ = cast(char)(0b_10000000 | 0b_00111111 & decoded >> 12);
			*dstIt++ = cast(char)(0b_10000000 | 0b_00111111 & decoded >> 6);
			*dstIt++ = cast(char)(0b_10000000 | 0b_00111111 & decoded);
		}
		else
		{
			// anything else within the BMP (<= 0xFFFF), including unpaired surrogates
			*dstIt++ = cast(char)(0b_11100000 | unit >> 12);
			*dstIt++ = cast(char)(0b_10000000 | 0b_00111111 & unit >> 6);
			*dstIt++ = cast(char)(0b_10000000 | 0b_00111111 & unit);
		}
	}
	*dstIt = 0;

	return dst[0 .. dstIt - dst];
}
157 
/**
 * Computes how many bytes a destination buffer needs so that $(D wstring2string)
 * can convert the given wide string into it. The terminating '\0' is accounted for.
 *
 * Params:
 *   src = The source string.
 *
 * Returns:
 *   Three $(D char)s per source $(D wchar) plus one for the terminator, expressed in
 *   bytes. If that value would not fit into a $(D size_t), $(D size_t.max) is returned.
 *
 * See_Also:
 *   wstring2string
 */
@safe pure
size_t wstring2stringSize(in wchar[] src)
{
	// Largest source length for which (3 * length + 1) * char.sizeof does not overflow.
	enum maxLength = (size_t.max / char.sizeof - 1) / 3;

	if (src.length > maxLength)
		return size_t.max;
	return char.sizeof * (3 * src.length + 1);
}
178 
/**
 * Replacement for $(D std.utf.toUTFz) that converts a string to a zero-terminated
 * UTF-16 string. When the caller does not pass a buffer, the default argument
 * takes one from $(D alloca) if the size reported by $(D string2wstringSize) is
 * at most $(D allocaLimit); otherwise the conversion buffer comes from $(D malloc).
 *
 * Params:
 *   str = The source string to convert.
 *   buffer = Memory to convert into, or $(D null) to have the function $(D malloc) it.
 *
 * Returns:
 *   A $(D TempBuffer!wchar) wrapping the converted text. Its second constructor
 *   argument tells it whether the memory was obtained from $(D malloc).
 *
 * See_Also:
 *   string2wstring, string2wstringSize
 *
 * Example:
 * ---
 * string text = "Hello, world!";
 * WinApiW(wcharPtr!text);
 * ---
 */
auto wcharPtr(alias str)(void* buffer = string2wstringSize(str) <= allocaLimit ? alloca(
		string2wstringSize(str)) : null)
{
	// No buffer handed in means the conversion target has to come from the heap.
	const bool fromHeap = buffer is null;
	wchar* target = cast(wchar*)(fromHeap ? malloc(string2wstringSize(str)) : buffer);

	// A TempBuffer is returned in both cases; it is told whether the memory is heap memory.
	return TempBuffer!wchar(string2wstring(str, target), fromHeap);
}
205 
/// ditto
immutable(wchar)* wcharPtr(alias wstr)()
		if (is(typeof(wstr) == wstring) && __traits(compiles, { enum wstring e = wstr; }))
{
	// This overload only matches wstrings that are known at compile time. Such
	// D string literals are always \0-terminated, so no copy is needed.
	immutable(wchar)* zeroTerminated = wstr.ptr;
	return zeroTerminated;
}
215 
/**
 * $(D char*) version of $(D wcharPtr). Basically it appends a \0 to the input.
 * When the caller does not pass a buffer, the default argument takes one from
 * $(D alloca) as long as the string plus its terminator fits into
 * $(D allocaLimit) bytes. Longer strings are copied into a $(D malloc)ed buffer,
 * so large inputs cannot exhaust the stack.
 *
 * Params:
 *   str = The source string to convert to a C UTF-8 string
 *   buffer = Memory to copy into, or $(D null) to have the function $(D malloc) it.
 *
 * Returns:
 *   A $(D TempBuffer!char) wrapping the copy (without the terminator). Its second
 *   constructor argument tells it whether the memory was obtained from $(D malloc).
 *
 * Note:
 *   Do not use this to call Windows ANSI functions! Always use wide-char
 *   functions on this operating system unless you want to deal with codepages.
 *
 * Example:
 * ---
 * string text = "Hello, world!";
 * linuxApi(charPtr!text);
 * ---
 */
auto charPtr(alias str)(void* buffer = str.length < allocaLimit ? alloca(str.length + 1) : null)
		if (is(typeof(str) : const(char)[]) || is(typeof(str) : const(ubyte)[]))
{
	// No buffer handed in means the copy has to live on the heap.
	const bool fromHeap = buffer is null;
	char* dst = cast(char*) memcpy(fromHeap ? malloc(str.length + 1) : buffer, str.ptr, str.length);
	dst[str.length] = '\0';
	return TempBuffer!char(dst[0 .. str.length], fromHeap);
}
240 
/// ditto
immutable(char)* charPtr(alias str)()
		if (__traits(compiles, { enum string e = str; }))
{
	// This overload only matches strings that are known at compile time. Such
	// D string literals are always \0-terminated, so no copy is needed.
	immutable(char)* zeroTerminated = str.ptr;
	return zeroTerminated;
}
247 
/**
 * Copies a string into memory allocated from an existing stack buffer and
 * appends a '\0' to it.
 *
 * Params:
 *   str = The source string to convert to a C UTF-8 string
 *   sb = The stack buffer to allocate from
 *
 * Returns:
 *   The entry allocated from $(D sb); it holds the bytes of $(D str) followed by '\0'.
 *
 * Note:
 *   Always assign the result to an auto variable first for RAII to work correctly.
 */
StackBufferEntry!char charPtr(SB)(const(char)[] str, ref SB sb)
		if (is(SB == StackBuffer!bytes, bytes...))
{
	import llvm.intrinsics;

	// One extra char is requested for the terminator.
	auto entry = sb.alloc!char(str.length + 1);
	llvm_memcpy(entry.ptr, str.ptr, str.length);
	entry[str.length] = '\0';
	return entry;
}
268 
/**
 * Tells whether $(D c) lies in the range from ' ' (space) to '~', both inclusive.
 */
bool isPrintable(T)(T c) @safe pure nothrow @nogc
{
	// Reject everything that is not at or above the space character first.
	if (!(c >= ' '))
		return false;
	return c <= '~';
}
273 
/**
 * Counts the chars in front of the first '\0' of a zero-terminated string.
 *
 * Params:
 *   str = Pointer to the first char of a zero-terminated string. Must not be null.
 *
 * Returns:
 *   The number of chars before the terminator.
 */
size_t strlen(inout(char*) str) pure
{
	size_t count;
	// NOTE(review): the counter is updated through a pointer to the local rather
	// than directly. Presumably this is deliberate - confirm before simplifying.
	size_t* cursor = &count;
	*cursor = 0;
	while (str[*cursor])
		(*cursor)++;
	return count;
}
282 
/**
 * Compares the first $(D count) chars of two buffers.
 *
 * Params:
 *   buf1 = The first buffer.
 *   buf2 = The second buffer.
 *   count = The number of chars to compare.
 *
 * Returns:
 *   0 if $(D count) is 0 or all compared chars are equal. Otherwise the difference
 *   between the first pair of chars that differ ($(D buf1)'s minus $(D buf2)'s).
 */
int memcmp(const(char)* buf1, immutable(char)* buf2, size_t count) pure
{
	if (count == 0)
		return 0;

	// Step over at most count - 1 equal chars, so that both pointers end up on
	// the first mismatch or on the last char to compare.
	for (size_t remaining = count - 1; remaining != 0 && *buf1 == *buf2; --remaining)
	{
		++buf1;
		++buf2;
	}

	return *buf1 - *buf2;
}
/**
 * Views a zero-terminated string as a $(D char[]) slice. The slice covers
 * everything up to, but not including, the \0.
 *
 * Params:
 *   ptr = Pointer to a zero-terminated string, or $(D null).
 *
 * Returns:
 *   A slice over the memory at $(D ptr), or $(D null) if $(D ptr) is $(D null).
 */
inout(char)[] asString(inout(char*) ptr) @trusted pure
{
	if (ptr !is null)
		return ptr[0 .. strlen(ptr)];
	return null;
}