1 /*************************************************************************************************** 2 * 3 * Functions to work with the Unicode Transformation Format. 4 * 5 * Grapheme clusters: 6 * A grapheme cluster is roughly speaking what the user would perceive as the smallest unit in a 7 * writing system. Their count can be thought of as a caret position in a text editor. In 8 * particular at grapheme cluster level, different normalization forms (NFC, NFD) become 9 * transparent. The default definition used here is independent of the user's locale. 10 * 11 * Authors: 12 * $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise) 13 * 14 * Copyright: 15 * © 2017-2023 $(LINK2 mailto:Marco.Leise@gmx.de, Marco Leise), $(LINK2 mailto:etienne@cimons.com, Etienne Cimon) 16 * 17 * License: 18 * $(LINK2 https://mit-license.org/, The MIT License (MIT)) 19 * 20 **************************************************************************************************/ 21 module fast.unicode; 22 23 import fast.internal.unicode_tables; 24 import fast.internal.sysdef; 25 26 //import std.simd; 27 28 /******************************************************************************* 29 * 30 * Enumeration for the Unicode "General Category" used to roughly classify 31 * codepoints into letters, punctuation etc. 32 * 33 **************************************/ 34 alias GeneralCategory = DerivedGeneralCategory.Enum; 35 36 /******************************************************************************* 37 * 38 * A customizable structure providing information on a code point. It consists 39 * of a Unicode `property` in the form of an `enum` (e.g. `GeneralCategory`) and 40 * a `length` in bytes of the code point in UTF-8. 41 * 42 **************************************/ 43 struct CodePointInfo(Enum) 44 { 45 alias property this; 46 size_t length; 47 Enum property; 48 } 49 50 /******************************************************************************* 51 * 52 * Counts the number of grapheme clusters (character count) in a UTF string. 53 * 54 * This function uses "extended grapheme clusters" as defined in Unicode: 55 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries 56 * 57 * When invalid byte sequences are encountered, each byte that does not make up 58 * a code point will be counted as one grapheme as visual representations of 59 * such broken strings will often show a square with the hexadecimal byte value 60 * in them. 61 * 62 * Params: 63 * str = the UTF-8 string 64 * 65 * Returns: 66 * the number of grapheme clusters 67 * 68 **************************************/ 69 @nogc @trusted pure nothrow size_t countGraphemes(scope const(char)[] str) 70 { 71 enum numValues = GraphemeBreakProperty.Enum.max + 1; 72 static immutable graphemeBreakRules = 73 { 74 // GB999 75 byte[numValues][numValues] graphemeBreaks = true; 76 with (GraphemeBreakProperty.Enum) 77 { 78 // GB12 + GB13 (special handling) 79 foreach (i; 0 .. numValues) 80 graphemeBreaks[i][Regional_Indicator] = -1; 81 // GB11 82 graphemeBreaks[ZWJ][Glue_After_Zwj] = false; 83 graphemeBreaks[ZWJ][E_Base_GAZ] = false; 84 // GB10 (special handling) 85 graphemeBreaks[E_Base][E_Modifier] = false; 86 graphemeBreaks[E_Base_GAZ][E_Modifier] = false; 87 graphemeBreaks[Extend][E_Modifier] = -1; 88 // GB9b 89 foreach (i; 0 .. numValues) 90 graphemeBreaks[Prepend][i] = false; 91 // GB9a 92 foreach (i; 0 .. numValues) 93 graphemeBreaks[i][SpacingMark] = false; 94 // GB9 95 foreach (i; 0 .. numValues) 96 { 97 graphemeBreaks[i][Extend] = false; 98 graphemeBreaks[i][ZWJ] = false; 99 } 100 graphemeBreaks[E_Base][Extend] = -1; 101 graphemeBreaks[E_Base_GAZ][Extend] = -1; 102 // GB8 103 graphemeBreaks[LVT][T] = false; 104 graphemeBreaks[T][T] = false; 105 // GB7 106 graphemeBreaks[LV][V] = false; 107 graphemeBreaks[LV][T] = false; 108 graphemeBreaks[V][V] = false; 109 graphemeBreaks[V][T] = false; 110 // GB6 111 graphemeBreaks[L][L] = false; 112 graphemeBreaks[L][V] = false; 113 graphemeBreaks[L][LV] = false; 114 graphemeBreaks[L][LVT] = false; 115 // GB5 116 foreach (i; 0 .. numValues) 117 { 118 graphemeBreaks[i][Control] = true; 119 graphemeBreaks[i][CR] = true; 120 graphemeBreaks[i][LF] = true; 121 } 122 // GB4 123 foreach (i; 0 .. numValues) 124 { 125 graphemeBreaks[Control][i] = true; 126 graphemeBreaks[CR][i] = true; 127 graphemeBreaks[LF][i] = true; 128 } 129 // GB3 130 graphemeBreaks[CR][LF] = false; 131 // Additional homebrew top level rule to break before and after invalid characters 132 foreach (i; 0 .. numValues) 133 { 134 graphemeBreaks[i][__] = true; 135 graphemeBreaks[__][i] = true; 136 } 137 } 138 return graphemeBreaks; 139 }(); 140 141 size_t graphemeCount = 0; 142 auto p = str.ptr; 143 auto graphemeStart = p; 144 GraphemeBreakProperty.Enum last, next; 145 bool riEven, inEmojiBaseExtension; 146 147 @noinline @safe @nogc pure nothrow bool complexRules() 148 { 149 pragma(inline, false); 150 with (GraphemeBreakProperty.Enum) 151 { 152 if (next == Regional_Indicator) 153 { 154 // For GB12 + GB13 we need break only after a complete country code (2 indicators). 155 if (last == Regional_Indicator) 156 return riEven = !riEven; 157 riEven = true; 158 return false; 159 } 160 else if (next == Extend) 161 { 162 inEmojiBaseExtension = true; 163 return false; 164 } 165 else if (inEmojiBaseExtension) 166 { 167 return inEmojiBaseExtension = false; 168 } 169 return true; 170 } 171 } 172 173 @forceinline void graphemeCountImpl(S)(ref S str) 174 { 175 version (LDC) pragma(inline, true); 176 auto cpi = getProperty!GraphemeBreakProperty(str); 177 auto next = cpi.property; 178 byte isBoundary = graphemeBreakRules[last][next]; 179 if (isBoundary < 0 ? complexRules() : isBoundary) 180 { 181 graphemeCount++; 182 static if (is(S == const(char)*)) 183 graphemeStart = str; 184 else 185 graphemeStart = str.ptr; 186 inEmojiBaseExtension = false; 187 } 188 static if (is(S == const(char)*)) 189 str += cpi.length; 190 else 191 str = str[cpi.length .. $]; 192 last = next; 193 } 194 195 if (str.length >= 4) 196 { 197 const e = str.ptr + str.length - 4; 198 do 199 graphemeCountImpl(p); 200 while (p <= e); 201 str = str[p - str.ptr .. $]; 202 } 203 while (str.length) 204 graphemeCountImpl(str); 205 return graphemeCount; 206 } 207 208 /******************************************************************************* 209 * 210 * Retrieves the "General Category" of the first code point in some UTF-8 211 * string. For broken UTF-8, the property is set to `GeneralCategory.__` (`0`). 212 * 213 * Params: 214 * str = the UTF-8 encoded text, which must not be empty 215 * 216 * Returns: 217 * a code point information struct consisting of a the fields `property`, 218 * containing the `GeneralCategory` enumeration and the `length` of the code 219 * point in bytes. 220 * 221 **************************************/ 222 @property @safe @nogc pure nothrow CodePointInfo!GeneralCategory generalCategory( 223 scope const(char)[] str) 224 { 225 return getProperty!DerivedGeneralCategory(str); 226 } 227 228 unittest 229 { 230 assert("क".generalCategory == GeneralCategory.Other_Letter); 231 assert("̸".generalCategory == GeneralCategory.Nonspacing_Mark); 232 assert("\xFF".generalCategory == GeneralCategory.__); 233 } 234 235 private: 236 237 @forceinline pure @nogc nothrow auto 238 getProperty(Property, S)(scope S str) 239 if (is(S == const(char)*) || is(S == const(char)[])) 240 in 241 { 242 static if (is(S == const(char)[])) 243 assert(str.length != 0, "No code units passed in."); 244 } 245 out 246 { 247 assert(__result <= Property.Enum.max); 248 } 249 body 250 { 251 version (LDC) pragma(inline, true); 252 import fast.internal.helpers; 253 254 alias Enum = Property.Enum; 255 alias CPI = CodePointInfo!Enum; 256 // Fast path for ASCII. 257 size_t idx = Property.level0[0][str[0]]; 258 if (byte(str[0]) >= 0) 259 return CPI(1, cast(Enum) idx); 260 // On multi-byte sequences, set the length to 1 for invalid sequences (idx == 0). 261 size_t length = clz(str[0] ^ 0xFFu) - 24; 262 // Safely return invalid code point of 1 byte length if string exhausted. 263 static if (is(S == const(char)[])) 264 if (length > str.length) 265 return CPI(1, cast(Enum) 0); 266 // Otherwise use lookup table hierarchy to determine if code units form a valid code point 267 if (idx > Enum.max) 268 { 269 idx = Property.level1[idx - Enum.max - 1][str[1]]; 270 if (idx > Enum.max) 271 { 272 idx = Property.level2[idx - Enum.max - 1][str[2]]; 273 if (idx > Enum.max) 274 idx = Property.level3[idx - Enum.max - 1][str[3]]; 275 } 276 } 277 if (idx) 278 return CPI(length, cast(Enum) idx); 279 else 280 return CPI(1, cast(Enum) 0); 281 }