fast.unicode source code

1 /***************************************************************************************************
2  * 
3  * Functions to work with the Unicode Transformation Format.
4  * 
5  * Grapheme clusters:
6  *   A grapheme cluster is roughly speaking what the user would perceive as the smallest unit in a
7  *   writing system. Their count can be thought of as a caret position in a text editor. In
8  *   particular at grapheme cluster level, different normalization forms (NFC, NFD) become
9  *   transparent. The default definition used here is independent of the user's locale.
10  * 
11  * Authors:
12  *   $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise)
13  * 
14  * Copyright:
15  *   © 2017-2023 $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise), $(LINK2 mailto:etienne@cimons.com, Etienne Cimon)
16  * 
17  * License:
18  *   $(LINK2 https://mit-license.org/, The MIT License (MIT))
19  * 
20  **************************************************************************************************/
21 module fast.unicode;
22 
23 import fast.internal.unicode_tables;
24 import fast.internal.sysdef;
25 
26 //import std.simd;
27 
/*******************************************************************************
 * 
 * The Unicode "General Category" enumeration, which gives a coarse
 * classification of code points (letters, marks, punctuation and so on).
 *
 * This is a public name for the enumeration nested in the
 * `DerivedGeneralCategory` lookup table.
 *
 **************************************/
alias GeneralCategory = DerivedGeneralCategory.Enum;
35 
/*******************************************************************************
 * 
 * Result of a property lookup on a single code point.
 *
 * The structure is parameterized on the `enum` type of the Unicode property
 * that was looked up (e.g. `GeneralCategory`). Besides the `property` value it
 * records the `length` of the code point, measured in UTF-8 code units (bytes).
 *
 * Thanks to `alias this` an instance can be used wherever the plain property
 * value is expected, for example in a comparison against an enum member.
 *
 **************************************/
struct CodePointInfo(Enum)
{
	/// Number of bytes the code point occupies in UTF-8.
	size_t length;
	/// The looked up Unicode property of the code point.
	Enum property;

	// Let the whole struct implicitly stand in for its property value.
	alias property this;
}
49 
/*******************************************************************************
 * 
 * Counts the number of grapheme clusters (character count) in a UTF string.
 * 
 * This function uses "extended grapheme clusters" as defined in Unicode:
 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
 * 
 * When invalid byte sequences are encountered, each byte that does not make up
 * a code point will be counted as one grapheme as visual representations of
 * such broken strings will often show a square with the hexadecimal byte value
 * in them.
 *
 * Pairs of regional indicators (flags) are kept together: within a run of
 * regional indicators a boundary is only placed after every second one.
 *
 * Params:
 *   str = the UTF-8 string
 *
 * Returns:
 *   the number of grapheme clusters
 *
 **************************************/
@nogc @trusted pure nothrow size_t countGraphemes(scope const(char)[] str)
{
	enum numValues = GraphemeBreakProperty.Enum.max + 1;
	// Lookup table indexed as [previous property][next property]:
	//   true  (1) = there is a boundary between the two code points
	//   false (0) = there is no boundary
	//   -1        = depends on more context, decided by complexRules()
	// The rules are written from lowest to highest priority, so that later assignments override
	// earlier ones.
	static immutable graphemeBreakRules =
	{
		// GB999
		byte[numValues][numValues] graphemeBreaks = true;
		with (GraphemeBreakProperty.Enum)
		{
			// GB11
			graphemeBreaks[ZWJ][Glue_After_Zwj] = false;
			graphemeBreaks[ZWJ][E_Base_GAZ] = false;
			// GB10 (special handling)
			graphemeBreaks[E_Base][E_Modifier] = false;
			graphemeBreaks[E_Base_GAZ][E_Modifier] = false;
			graphemeBreaks[Extend][E_Modifier] = -1;
			// GB9b
			foreach (i; 0 .. numValues)
				graphemeBreaks[Prepend][i] = false;
			// GB9a
			foreach (i; 0 .. numValues)
				graphemeBreaks[i][SpacingMark] = false;
			// GB9
			foreach (i; 0 .. numValues)
			{
				graphemeBreaks[i][Extend] = false;
				graphemeBreaks[i][ZWJ] = false;
			}
			graphemeBreaks[E_Base][Extend] = -1;
			graphemeBreaks[E_Base_GAZ][Extend] = -1;
			// GB8
			graphemeBreaks[LVT][T] = false;
			graphemeBreaks[T][T] = false;
			// GB7
			graphemeBreaks[LV][V] = false;
			graphemeBreaks[LV][T] = false;
			graphemeBreaks[V][V] = false;
			graphemeBreaks[V][T] = false;
			// GB6
			graphemeBreaks[L][L] = false;
			graphemeBreaks[L][V] = false;
			graphemeBreaks[L][LV] = false;
			graphemeBreaks[L][LVT] = false;
			// GB5
			foreach (i; 0 .. numValues)
			{
				graphemeBreaks[i][Control] = true;
				graphemeBreaks[i][CR] = true;
				graphemeBreaks[i][LF] = true;
			}
			// GB4
			foreach (i; 0 .. numValues)
			{
				graphemeBreaks[Control][i] = true;
				graphemeBreaks[CR][i] = true;
				graphemeBreaks[LF][i] = true;
			}
			// GB3
			graphemeBreaks[CR][LF] = false;
			// Additional homebrew top level rule to break before and after invalid characters
			foreach (i; 0 .. numValues)
			{
				graphemeBreaks[i][__] = true;
				graphemeBreaks[__][i] = true;
			}
			// GB12 + GB13 (special handling)
			// Assigned last on purpose: *every* regional indicator has to pass through
			// complexRules(), no matter what precedes it, so that the pairing state is always up to
			// date. complexRules() reproduces the table result the rules above would have given.
			foreach (i; 0 .. numValues)
				graphemeBreaks[i][Regional_Indicator] = -1;
		}
		return graphemeBreaks;
	}();

	size_t graphemeCount = 0;
	auto p = str.ptr;
	// Properties of the previous and the current code point. Both are shared with the nested
	// functions below, which is why they must not be redeclared there.
	GraphemeBreakProperty.Enum last, next;
	// `riPairOpen`: the previous code point was a regional indicator that still waits for its partner.
	// `inEmojiBaseExtension`: an emoji base followed by `Extend` was seen in the current cluster.
	bool riPairOpen, inEmojiBaseExtension;

	// Decides the table entries marked with -1.
	// Returns `true` if there is a boundary between `last` and `next`.
	@noinline @safe @nogc pure nothrow bool complexRules()
	{
		pragma(inline, false);
		with (GraphemeBreakProperty.Enum)
		{
			if (next == Regional_Indicator)
			{
				// For GB12 + GB13 we need break only after a complete country code (2 indicators).
				if (last == Regional_Indicator && riPairOpen)
				{
					// Second half of a pair: stays attached to the first half.
					riPairOpen = false;
					return false;
				}
				// First indicator of a (potential) pair. It starts a new cluster (GB999) unless it
				// follows a `Prepend` code point (GB9b).
				riPairOpen = true;
				return last != Prepend;
			}
			else if (next == Extend)
			{
				// GB9 after an emoji base; remember it for a following modifier (GB10).
				inEmojiBaseExtension = true;
				return false;
			}
			else if (inEmojiBaseExtension)
			{
				// GB10: the modifier attaches to the emoji base across the `Extend` code points.
				inEmojiBaseExtension = false;
				return false;
			}
			return true;
		}
	}

	// Classifies the code point at the start of `str`, counts it if it begins a new grapheme
	// cluster and advances `str` (a pointer or a slice) past it.
	@forceinline void graphemeCountImpl(S)(ref S str)
	{
		version (LDC) pragma(inline, true);
		auto cpi = getProperty!GraphemeBreakProperty(str);
		// Assign the enclosing `next` instead of declaring a local of the same name, otherwise
		// complexRules() would never get to see the current code point.
		next = cpi.property;
		byte isBoundary = graphemeBreakRules[last][next];
		if (isBoundary < 0 ? complexRules() : isBoundary != 0)
		{
			graphemeCount++;
			inEmojiBaseExtension = false;
		}
		static if (is(S == const(char)*))
			str += cpi.length;
		else
			str = str[cpi.length .. $];
		last = next;
	}

	// Fast loop on a raw pointer. It only runs while at least 4 bytes remain, which is the most
	// that getProperty() reads for one code point, so no length checks are required.
	if (str.length >= 4)
	{
		const e = str.ptr + str.length - 4;
		do
			graphemeCountImpl(p);
		while (p <= e);
		str = str[p - str.ptr .. $];
	}
	// Length checked loop for the remaining (up to 3) bytes.
	while (str.length)
		graphemeCountImpl(str);
	return graphemeCount;
}
207 
/*******************************************************************************
 * 
 * Looks up the "General Category" of the code point at the start of a UTF-8
 * string. If the string does not start with valid UTF-8, the property is
 * reported as `GeneralCategory.__` (`0`).
 *
 * Params:
 *   str = the UTF-8 encoded text, which must not be empty
 *
 * Returns:
 *   a code point information struct with two fields: `property` holds the
 *   `GeneralCategory` enumeration value and `length` holds the size of the
 *   code point in bytes.
 * 
 **************************************/
@property @safe @nogc pure nothrow CodePointInfo!GeneralCategory generalCategory(
	scope const(char)[] str)
{
	// All the work is done by the generic table lookup.
	auto info = getProperty!DerivedGeneralCategory(str);
	return info;
}
227 
unittest
{
	// A Devanagari letter.
	auto letter = "क".generalCategory;
	assert(letter == GeneralCategory.Other_Letter);

	// A lone combining overlay character.
	auto mark = "̸".generalCategory;
	assert(mark == GeneralCategory.Nonspacing_Mark);

	// A byte that does not form a code point yields the "invalid" property.
	auto broken = "\xFF".generalCategory;
	assert(broken == GeneralCategory.__);
}
234 
235 private:
236 
/*
 * Looks up the Unicode property `Property` for the code point at the start of `str`.
 *
 * `str` is either a slice, in which case its length is respected, or a bare pointer, in which case
 * up to 4 code units are read without any bounds check.
 *
 * Returns a `CodePointInfo` with the property and the length of the code point in bytes. Anything
 * that the lookup tables do not recognize is reported as property `0` with a length of 1.
 */
@forceinline pure @nogc nothrow auto
getProperty(Property, S)(scope S str)
		if (is(S == const(char)*) || is(S == const(char)[]))
in
{
	// A bare pointer carries no length, so only slices can be checked here.
	static if (is(S == const(char)[]))
		assert(str.length != 0, "No code units passed in.");
}
out
{
	assert(__result <= Property.Enum.max);
}
body
{
	version (LDC) pragma(inline, true);
	import fast.internal.helpers;

	alias Enum = Property.Enum;
	alias CPI = CodePointInfo!Enum;

	const lead = str[0];
	size_t entry = Property.level0[0][lead];
	// ASCII takes the short cut: one byte, property straight from the first table.
	if (lead < 0x80)
		return CPI(1, cast(Enum) entry);

	// Sequence length as announced by the number of leading 1-bits in the first byte.
	size_t seqLength = clz(lead ^ 0xFFu) - 24;
	// A slice that ends before the announced length is treated as a 1 byte invalid code point.
	static if (is(S == const(char)[]))
	{
		if (seqLength > str.length)
			return CPI(1, cast(Enum) 0);
	}

	// Table entries above `Enum.max` are not properties, but select a row in the next table level,
	// which is then indexed with the following code unit. Once `entry` is a property (or 0) the
	// remaining steps are skipped, because it is no longer above `Enum.max`.
	if (entry > Enum.max)
		entry = Property.level1[entry - Enum.max - 1][str[1]];
	if (entry > Enum.max)
		entry = Property.level2[entry - Enum.max - 1][str[2]];
	if (entry > Enum.max)
		entry = Property.level3[entry - Enum.max - 1][str[3]];

	// An entry of 0 marks an invalid sequence, which is consumed one byte at a time.
	return entry ? CPI(seqLength, cast(Enum) entry) : CPI(1, cast(Enum) 0);
}