/*******************************************************************************
 *
 * Converts between UTF-8 and UTF-16.
 *
 * Authors:
 *   $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise)
 *
 * Copyright:
 *   © 2013-2023 $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise), $(LINK2 mailto:etienne@cimons.com, Etienne Cimon)
 *
 * License:
 *   $(LINK2 https://mit-license.org/, The MIT License (MIT))
 *
 **************************************/
module fast.cstring;
@nogc nothrow:

//import core.stdc.stdlib;
//import core.stdc.string;
//import std.traits;
import fast.buffer;

/**
 * Converts a string to a wstring using a buffer provided by the user.
 * To get the buffer requirements call $(D string2wstringSize) on your source buffer.
 *
 * The input is assumed to be well-formed UTF-8. Malformed input is not rejected; the only
 * safeguard is that continuation bytes are never read beyond the end of $(D src).
 *
 * Params:
 *   src = The UTF-8 string to convert.
 *   dst = The destination buffer for the conversion.
 *
 * Returns:
 *   The part of the destination buffer used for the conversion as a $(D wchar[]).
 *   A terminating zero is appended, so the result.ptr can be passed into Windows APIs.
 */
pure
wchar[] string2wstring(in char[] src, wchar* dst)
{
    const char* srcEnd = src.ptr + src.length;
    const(char)* srcIt = src.ptr;
    wchar* dstIt = dst;

    while (srcIt !is srcEnd)
    {
        // The number of leading 1-bits in the first byte is the sequence length.
        int len = 0;
        uint mask = 0b1000_0000;
        while (*srcIt & mask)
        {
            mask >>= 1;
            len++;
        }

        // Payload of the first byte: everything below the first 0-bit.
        dchar ch = *srcIt++ & (mask - 1);

        // Never run past the end of the source on a truncated sequence.
        while (--len > 0 && srcIt !is srcEnd)
        {
            // make space for 6 more bits
            ch <<= 6;
            ch |= *srcIt++ & 0b0011_1111;
        }

        // do we need to store a surrogate pair ?
        static if (is(wchar == dchar))
        {
            *dstIt++ = ch;
        }
        else if (ch > wchar.max)
        {
            // UTF-16 encodes (code point - 0x10000) as two 10-bit halves.
            const uint offset = ch - 0x1_0000;
            *dstIt++ = cast(wchar)(0xD800 | (offset >> 10));
            *dstIt++ = cast(wchar)(0xDC00 | (offset & 0b11_1111_1111));
        }
        else
        {
            *dstIt++ = cast(wchar) ch;
        }
    }
    *dstIt = 0;

    return dst[0 .. dstIt - dst];
}

/**
 * Calculates the required buffer size in bytes for a string to wchar[] conversion.
 * Room for a terminating '\0' is included.
 *
 * Params:
 *   src = The source string.
 *
 * Returns:
 *   The maximum byte count the source string could require, including the terminating '\0'.
 *   If that count would not fit into a $(D size_t), $(D size_t.max) is returned.
 *
 * See_Also:
 *   string2wstring
 *
 */
@safe pure
size_t string2wstringSize(in char[] src)
{
    // One UTF-8 code unit never expands to more than one UTF-16 code unit.
    enum limit = size_t.max / wchar.sizeof - 1;
    return src.length <= limit ? wchar.sizeof * (src.length + 1) : size_t.max;
}

/**
 * Converts a wstring to a string using a buffer provided by the user.
 * To get the buffer requirements call $(D wstring2stringSize) on your source buffer.
 *
 * Surrogate pairs are assumed to be correctly formed: a high surrogate that is followed by
 * another code unit is combined with that unit without checking it. A high surrogate at the
 * very end of $(D src) and any unpaired low surrogate are written as a three byte sequence.
 *
 * Params:
 *   src = The UTF-16 string to convert.
 *   dst = The destination buffer for the conversion.
 *
 * Returns:
 *   The part of the destination buffer used for the conversion as a $(D char[]).
 *   A terminating zero is appended, so the result.ptr can be passed into C APIs.
 */
pure
char[] wstring2string(in wchar[] src, char* dst)
{
    const wchar* srcEnd = src.ptr + src.length;
    const(wchar)* srcIt = src.ptr;
    char* dstIt = dst;

    while (srcIt !is srcEnd)
    {
        const wchar unit = *srcIt++;

        if (unit < 0x80)
        {
            // ASCII, one byte
            *dstIt++ = cast(char) unit;
        }
        else if (unit < 0x800)
        {
            // two bytes
            *dstIt++ = cast(char)(0b1100_0000 | (unit >> 6));
            *dstIt++ = cast(char)(0b1000_0000 | (unit & 0b0011_1111));
        }
        else if (unit < 0xD800 || unit > 0xDBFF || srcIt is srcEnd)
        {
            // anything else within the BMP (<= 0xFFFF), but not a high surrogate that has a
            // following code unit to pair with
            *dstIt++ = cast(char)(0b1110_0000 | (unit >> 12));
            *dstIt++ = cast(char)(0b1000_0000 | ((unit >> 6) & 0b0011_1111));
            *dstIt++ = cast(char)(0b1000_0000 | (unit & 0b0011_1111));
        }
        else
        {
            // high surrogate, assume correct encoding and that the next wchar is the low surrogate
            const uint low = *srcIt++;
            // The two 10-bit halves encode (code point - 0x10000).
            const uint decoded = 0x1_0000
                + (((unit & 0b11_1111_1111) << 10) | (low & 0b11_1111_1111));
            *dstIt++ = cast(char)(0b1111_0000 | (decoded >> 18));
            *dstIt++ = cast(char)(0b1000_0000 | ((decoded >> 12) & 0b0011_1111));
            *dstIt++ = cast(char)(0b1000_0000 | ((decoded >> 6) & 0b0011_1111));
            *dstIt++ = cast(char)(0b1000_0000 | (decoded & 0b0011_1111));
        }
    }
    *dstIt = 0;

    return dst[0 .. dstIt - dst];
}

/**
 * Calculates the required buffer size in bytes for a wstring to char[] conversion.
 * Room for a terminating '\0' is included.
 *
 * Params:
 *   src = The source string.
 *
 * Returns:
 *   The maximum byte count the source string could require, including the terminating '\0'.
 *   If that count would not fit into a $(D size_t), $(D size_t.max) is returned.
 *
 * See_Also:
 *   wstring2string
 *
 */
@safe pure
size_t wstring2stringSize(in wchar[] src)
{
    // A single UTF-16 code unit needs at most 3 bytes; a surrogate pair (2 units) needs 4.
    enum limit = (size_t.max / char.sizeof - 1) / 3;
    return src.length <= limit ? char.sizeof * (3 * src.length + 1) : size_t.max;
}

/**
 * Replaces $(D std.utf.toUTFz) with a version that uses the stack as long as the required bytes for the output are
 * <= $(D allocaLimit). Longer strings use $(D malloc) to create a buffer for the conversion. It is freed at least
 * at the end of the scope.
 *
 * The $(D buffer) parameter is not meant to be passed explicitly: its default argument performs the
 * $(D alloca) call.
 *
 * Params:
 *   str = The source string to convert.
 *
 * See_Also:
 *   string2wstring, string2wstringSize
 *
 * Example:
 * ---
 * string text = "Hello, world!";
 * WinApiW(wcharPtr!text);
 * ---
 */
auto wcharPtr(alias str)(void* buffer = string2wstringSize(str) <= allocaLimit ? alloca(
        string2wstringSize(str)) : null)
{
    // In any case we have to return a proper TempBuffer, so that free() is called in the dtor at some point.
    // A null `buffer` means the size exceeded allocaLimit, so the memory comes from malloc instead.
    return TempBuffer!wchar(
        string2wstring(str, cast(wchar*)(buffer ? buffer
            : malloc(string2wstringSize(str)))),
        buffer is null);
}

/// ditto
immutable(wchar)* wcharPtr(alias wstr)()
        if (is(typeof(wstr) == wstring) && __traits(compiles, {
                enum wstring e = wstr;
            }))
{
    // D string literals (known at compile time) are always \0-terminated.
    return wstr.ptr;
}

/**
 * $(D char*) version of $(D wcharPtr). Basically it appends a \0 to the input.
 * The function uses $(D malloc) when the string plus its terminator needs more than
 * $(D allocaLimit) bytes and the stack otherwise.
 *
 * Params:
 *   str = The source string to convert to a C UTF-8 string
 *
 * Note:
 *   Do not use this to call Windows ANSI functions! Always use wide-char
 *   functions on this operating system unless you want to deal with codepages.
 *
 * Example:
 * ---
 * string text = "Hello, world!";
 * linuxApi(charPtr!text);
 * ---
 */
auto charPtr(alias str)(void* buffer = str.length + 1 <= allocaLimit ? alloca(str.length + 1) : null)
        if (is(typeof(str) : const(char)[]) || is(typeof(str) : const(ubyte)[]))
{
    // A null `buffer` means the string was too long for the stack; fall back to malloc.
    char* dst = cast(char*) memcpy(buffer ? buffer : malloc(str.length + 1), str.ptr, str.length);
    dst[str.length] = '\0';
    return TempBuffer!char(dst[0 .. str.length], buffer is null);
}

/// ditto
immutable(char)* charPtr(alias str)() if (__traits(compiles, { enum string e = str; }))
{
    // D string literals (known at compile time) are always \0-terminated.
    return str.ptr;
}

/**
 * This overload allocates the required memory from an existing stack buffer.
 *
 * Params:
 *   str = The source string to convert to a C UTF-8 string
 *   sb = The stack buffer to allocate from
 *
 * Returns:
 *   The entry allocated from $(D sb), holding a copy of $(D str) followed by a '\0'.
 *
 * Note:
 *   Always assign the result to an auto variable first for RAII to work correctly.
 */
StackBufferEntry!char charPtr(SB)(const(char)[] str, ref SB sb)
        if (is(SB == StackBuffer!bytes, bytes...))
{
    import llvm.intrinsics;

    auto buffer = sb.alloc!char(str.length + 1);
    llvm_memcpy(buffer.ptr, str.ptr, str.length);
    buffer[str.length] = '\0';
    return buffer;
}

/**
 * Tells whether $(D c) lies in the range from ' ' (space) to '~', both inclusive.
 */
bool isPrintable(T)(T c) @safe pure nothrow @nogc
{
    return c >= ' ' && c <= '~';
}

/**
 * Counts the $(D char)s in front of the first '\0' in $(D str).
 *
 * Params:
 *   str = Pointer to a zero-terminated string. It is dereferenced without a null check.
 *
 * Returns:
 *   The number of $(D char)s before the terminator.
 */
size_t strlen(inout(char*) str) pure
{
    // NOTE(review): the counter is deliberately accessed through a pointer; presumably this keeps
    // the optimizer from turning the loop back into a call to strlen - confirm before simplifying.
    size_t len_;
    size_t* len = &len_;
    for (*len = 0; str[*len]; (*len)++)
        continue;
    return len_;
}

/**
 * Compares up to $(D count) $(D char)s of two buffers.
 *
 * Params:
 *   buf1 = The first buffer.
 *   buf2 = The second buffer.
 *   count = The maximum number of $(D char)s to compare.
 *
 * Returns:
 *   0 if $(D count) is 0; otherwise the difference $(D *buf1 - *buf2) at the first position
 *   where the buffers differ, or at the last compared position if they do not.
 */
int memcmp(const(char)* buf1, immutable(char)* buf2, size_t count) pure
{
    if (!count)
        return (0);

    // Stops on the first mismatch or on the last element, whichever comes first.
    while (--count && *buf1 == *buf2)
    {
        buf1++;
        buf2++;
    }

    return *buf1 - *buf2;
}
/**
 * Returns the given $(D ptr) up to but not including the \0 as a $(D char[]).
 * A $(D null) pointer yields $(D null).
 */
inout(char)[] asString(inout(char*) ptr) @trusted pure
{
    if (ptr is null)
        return null;
    return ptr[0 .. strlen(ptr)];
}