Ada 2.7.8
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
unicode.cpp
Go to the documentation of this file.
1#include "ada.h"
3#include "ada/common_defs.h"
4#include "ada/unicode.h"
5
7#include "ada_idna.cpp"
9
10#include <algorithm>
11#if ADA_NEON
12#include <arm_neon.h>
13#elif ADA_SSE2
14#include <emmintrin.h>
15#endif
16
17namespace ada::unicode {
18
/**
 * Replicates the byte `v` into each of the eight bytes of a 64-bit word.
 *
 * @param v byte value to replicate
 * @return a word whose eight bytes all equal `v`
 */
constexpr uint64_t broadcast(uint8_t v) noexcept {
  return 0x0101010101010101ull * v;
}
22
23constexpr bool to_lower_ascii(char* input, size_t length) noexcept {
25 uint64_t broadcast_Ap = broadcast(128 - 'A');
26 uint64_t broadcast_Zp = broadcast(128 - 'Z' - 1);
28 size_t i = 0;
29
30 for (; i + 7 < length; i += 8) {
31 uint64_t word{};
32 memcpy(&word, input + i, sizeof(word));
34 word ^=
35 (((word + broadcast_Ap) ^ (word + broadcast_Zp)) & broadcast_80) >> 2;
36 memcpy(input + i, &word, sizeof(word));
37 }
38 if (i < length) {
39 uint64_t word{};
40 memcpy(&word, input + i, length - i);
42 word ^=
43 (((word + broadcast_Ap) ^ (word + broadcast_Zp)) & broadcast_80) >> 2;
44 memcpy(input + i, &word, length - i);
45 }
46 return non_ascii == 0;
47}
48#if ADA_NEON
49ada_really_inline bool has_tabs_or_newline(
50 std::string_view user_input) noexcept {
51 // first check for short strings in which case we do it naively.
52 if (user_input.size() < 16) { // slow path
53 for (size_t i = 0; i < user_input.size(); i++) {
54 if (user_input[i] == '\r' || user_input[i] == '\n' ||
55 user_input[i] == '\t') {
56 return true;
57 }
58 }
59 return false;
60 }
61 // fast path for long strings (expected to be common)
62 size_t i = 0;
75 static uint8_t rnt_array[16] = {1, 0, 0, 0, 0, 0, 0, 0,
76 0, 9, 10, 0, 0, 13, 0, 0};
78 // m['0xd', '0xa', '0x9']
80 for (; i + 15 < user_input.size(); i += 16) {
81 uint8x16_t word = vld1q_u8((const uint8_t*)user_input.data() + i);
82
84 }
85 if (i < user_input.size()) {
87 vld1q_u8((const uint8_t*)user_input.data() + user_input.length() - 16);
89 }
91}
92#elif ADA_SSE2
93ada_really_inline bool has_tabs_or_newline(
94 std::string_view user_input) noexcept {
95 // first check for short strings in which case we do it naively.
96 if (user_input.size() < 16) { // slow path
97 for (size_t i = 0; i < user_input.size(); i++) {
98 if (user_input[i] == '\r' || user_input[i] == '\n' ||
99 user_input[i] == '\t') {
100 return true;
101 }
102 }
103 return false;
104 }
105 // fast path for long strings (expected to be common)
106 size_t i = 0;
107 const __m128i mask1 = _mm_set1_epi8('\r');
108 const __m128i mask2 = _mm_set1_epi8('\n');
109 const __m128i mask3 = _mm_set1_epi8('\t');
110 // If we supported SSSE3, we could use the algorithm that we use for NEON.
111 __m128i running{0};
112 for (; i + 15 < user_input.size(); i += 16) {
113 __m128i word = _mm_loadu_si128((const __m128i*)(user_input.data() + i));
118 }
119 if (i < user_input.size()) {
121 (const __m128i*)(user_input.data() + user_input.length() - 16));
126 }
127 return _mm_movemask_epi8(running) != 0;
128}
129#else
130ada_really_inline bool has_tabs_or_newline(
131 std::string_view user_input) noexcept {
132 auto has_zero_byte = [](uint64_t v) {
133 return ((v - 0x0101010101010101) & ~(v) & 0x8080808080808080);
134 };
135 size_t i = 0;
136 uint64_t mask1 = broadcast('\r');
137 uint64_t mask2 = broadcast('\n');
138 uint64_t mask3 = broadcast('\t');
139 uint64_t running{0};
140 for (; i + 7 < user_input.size(); i += 8) {
141 uint64_t word{};
142 memcpy(&word, user_input.data() + i, sizeof(word));
147 }
148 if (i < user_input.size()) {
149 uint64_t word{};
150 memcpy(&word, user_input.data() + i, user_input.size() - i);
155 }
156 return running;
157}
158#endif
159
// A forbidden host code point is U+0000 NULL, U+0009 TAB, U+000A LF, U+000D CR,
// U+0020 SPACE, U+0023 (#), U+002F (/), U+003A (:), U+003C (<), U+003E (>),
// U+003F (?), U+0040 (@), U+005B ([), U+005C (\), U+005D (]), U+005E (^), or
// U+007C (|).
//
// Lookup table indexed by byte value: 1 for the code points listed above,
// 0 for every other byte.
constexpr static std::array<uint8_t, 256> is_forbidden_host_code_point_table =
    []() constexpr {
      std::array<uint8_t, 256> result{};
      for (uint8_t c : {'\0', '\x09', '\x0a', '\x0d', ' ', '#', '/', ':', '<',
                        '>', '?', '@', '[', '\\', ']', '^', '|'}) {
        result[c] = true;
      }
      return result;
    }();
173
174ada_really_inline constexpr bool is_forbidden_host_code_point(
175 const char c) noexcept {
177}
178
// Lookup table indexed by byte value: 1 for the forbidden host code points,
// '%', every byte in 0..32 and every byte in 127..255; 0 otherwise.
constexpr static std::array<uint8_t, 256> is_forbidden_domain_code_point_table =
    []() constexpr {
      std::array<uint8_t, 256> result{};
      for (uint8_t c : {'\0', '\x09', '\x0a', '\x0d', ' ', '#', '/', ':', '<',
                        '>', '?', '@', '[', '\\', ']', '^', '|', '%'}) {
        result[c] = true;
      }
      for (uint8_t c = 0; c <= 32; c++) {
        result[c] = true;
      }
      // The upper bound is 256 so that byte 0xFF is marked as well; stopping
      // at 255 would leave the last entry at 0.
      for (size_t c = 127; c < 256; c++) {
        result[c] = true;
      }
      return result;
    }();

static_assert(sizeof(is_forbidden_domain_code_point_table) == 256);
196
197ada_really_inline constexpr bool is_forbidden_domain_code_point(
198 const char c) noexcept {
200}
201
202ada_really_inline constexpr bool contains_forbidden_domain_code_point(
203 const char* input, size_t length) noexcept {
204 size_t i = 0;
206 for (; i + 4 <= length; i += 4) {
211 }
212 for (; i < length; i++) {
214 }
215 return accumulator;
216}
217
// Lookup table indexed by byte value:
//   1 for the forbidden host code points, '%', bytes 0..32 and bytes 127..255,
//   2 for the ASCII upper-case letters 'A'..'Z',
//   0 for every other byte.
constexpr static std::array<uint8_t, 256>
    is_forbidden_domain_code_point_table_or_upper = []() constexpr {
      std::array<uint8_t, 256> result{};
      for (uint8_t c : {'\0', '\x09', '\x0a', '\x0d', ' ', '#', '/', ':', '<',
                        '>', '?', '@', '[', '\\', ']', '^', '|', '%'}) {
        result[c] = 1;
      }
      for (uint8_t c = 'A'; c <= 'Z'; c++) {
        result[c] = 2;
      }
      for (uint8_t c = 0; c <= 32; c++) {
        result[c] = 1;
      }
      // The upper bound is 256 so that byte 0xFF is marked as well; stopping
      // at 255 would leave the last entry at 0.
      for (size_t c = 127; c < 256; c++) {
        result[c] = 1;
      }
      return result;
    }();
236
238contains_forbidden_domain_code_point_or_upper(const char* input,
239 size_t length) noexcept {
240 size_t i = 0;
242 for (; i + 4 <= length; i += 4) {
243 accumulator |=
245 accumulator |=
247 accumulator |=
249 accumulator |=
251 }
252 for (; i < length; i++) {
253 accumulator |=
255 }
256 return accumulator;
257}
258
// Lookup table indexed by byte value: true for the ASCII digits, the ASCII
// letters (both cases), '+', '-' and '.'; false for every other byte.
constexpr static std::array<bool, 256> is_alnum_plus_table = []() constexpr {
  std::array<bool, 256> result{};
  for (size_t c = 0; c < 256; c++) {
    const bool is_digit = c >= '0' && c <= '9';
    const bool is_lower = c >= 'a' && c <= 'z';
    const bool is_upper = c >= 'A' && c <= 'Z';
    const bool is_extra = c == '+' || c == '-' || c == '.';
    result[c] = is_digit || is_lower || is_upper || is_extra;
  }
  return result;
}();
275
276ada_really_inline constexpr bool is_alnum_plus(const char c) noexcept {
278 // A table is almost surely much faster than the
279 // following under most compilers: return
280 // return (std::isalnum(c) || c == '+' || c == '-' || c == '.');
281}
282
283ada_really_inline constexpr bool is_ascii_hex_digit(const char c) noexcept {
284 return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') ||
285 (c >= 'a' && c <= 'f');
286}
287
288ada_really_inline constexpr bool is_c0_control_or_space(const char c) noexcept {
289 return (unsigned char)c <= ' ';
290}
291
292ada_really_inline constexpr bool is_ascii_tab_or_newline(
293 const char c) noexcept {
294 return c == '\t' || c == '\n' || c == '\r';
295}
296
// Lower-case spellings of a double-dot path segment. The slot order matters:
// is_double_dot_path_segment selects an entry with
// (input.size() + input[0]) & 3.
constexpr std::string_view table_is_double_dot_path_segment[] = {
    "..", "%2e.", ".%2e", "%2e%2e"};
299
300ada_really_inline ada_constexpr bool is_double_dot_path_segment(
301 std::string_view input) noexcept {
302 // This will catch most cases:
303 // The length must be 2,4 or 6.
304 // We divide by two and require
305 // that the result be between 1 and 3 inclusively.
306 uint64_t half_length = uint64_t(input.size()) / 2;
307 if (half_length - 1 > 2) {
308 return false;
309 }
310 // We have a string of length 2, 4 or 6.
311 // We now check the first character:
312 if ((input[0] != '.') && (input[0] != '%')) {
313 return false;
314 }
315 // We are unlikely the get beyond this point.
316 int hash_value = (input.size() + (unsigned)(input[0])) & 3;
317 const std::string_view target = table_is_double_dot_path_segment[hash_value];
318 if (target.size() != input.size()) {
319 return false;
320 }
321 // We almost never get here.
322 // Optimizing the rest is relatively unimportant.
323 auto prefix_equal_unsafe = [](std::string_view a, std::string_view b) {
324 uint16_t A, B;
325 memcpy(&A, a.data(), sizeof(A));
326 memcpy(&B, b.data(), sizeof(B));
327 return A == B;
328 };
330 return false;
331 }
332 for (size_t i = 2; i < input.size(); i++) {
333 char c = input[i];
334 if ((uint8_t((c | 0x20) - 0x61) <= 25 ? (c | 0x20) : c) != target[i]) {
335 return false;
336 }
337 }
338 return true;
339 // The above code might be a bit better than the code below. Compilers
340 // are not stupid and may use the fact that these strings have length 2,4 and
341 // 6 and other tricks.
342 // return input == ".." ||
343 // input == ".%2e" || input == ".%2E" ||
344 // input == "%2e." || input == "%2E." ||
345 // input == "%2e%2e" || input == "%2E%2E" || input == "%2E%2e" || input ==
346 // "%2e%2E";
347}
348
349ada_really_inline constexpr bool is_single_dot_path_segment(
350 std::string_view input) noexcept {
351 return input == "." || input == "%2e" || input == "%2E";
352}
353
354ada_really_inline constexpr bool is_lowercase_hex(const char c) noexcept {
355 return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f');
356}
357
// Value of each hexadecimal digit, indexed by (character - '0'). The table
// spans '0'..'f'; entries for characters that are not hex digits are 0.
constexpr static char hex_to_binary_table[] = {
    0, 1,  2,  3,  4, 5, 6, 7, 8, 9, 0, 0,  0,  0,  0,  0,  0, 10, 11,
    12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0, 0,  0,
    0, 0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 10, 11, 12, 13, 14, 15};

/**
 * Converts a hexadecimal digit to its numeric value.
 *
 * @param c character to convert
 * @return 0..15 for '0'..'9', 'A'..'F' and 'a'..'f'; 0 for any other
 *         character
 */
unsigned constexpr convert_hex_to_binary(const char c) noexcept {
  // Reduce the offset to an unsigned byte so that characters below '0' wrap
  // to a large index and are caught by the same bounds check as characters
  // above 'f'.
  const auto index = static_cast<unsigned char>(c - '0');
  return index < sizeof(hex_to_binary_table) ? hex_to_binary_table[index] : 0;
}
365
366std::string percent_decode(const std::string_view input, size_t first_percent) {
367 // next line is for safety only, we expect users to avoid calling
368 // percent_decode when first_percent is outside the range.
369 if (first_percent == std::string_view::npos) {
370 return std::string(input);
371 }
372 std::string dest;
373 dest.reserve(input.length());
374 dest.append(input.substr(0, first_percent));
375 const char* pointer = input.data() + first_percent;
376 const char* end = input.data() + input.size();
377 // Optimization opportunity: if the following code gets
378 // called often, it can be optimized quite a bit.
379 while (pointer < end) {
380 const char ch = pointer[0];
381 size_t remaining = end - pointer - 1;
382 if (ch != '%' || remaining < 2 ||
383 ( // ch == '%' && // It is unnecessary to check that ch == '%'.
384 (!is_ascii_hex_digit(pointer[1]) ||
385 !is_ascii_hex_digit(pointer[2])))) {
386 dest += ch;
387 pointer++;
388 continue;
389 } else {
390 unsigned a = convert_hex_to_binary(pointer[1]);
391 unsigned b = convert_hex_to_binary(pointer[2]);
392 char c = static_cast<char>(a * 16 + b);
393 dest += c;
394 pointer += 3;
395 }
396 }
397 return dest;
398}
399
400std::string percent_encode(const std::string_view input,
401 const uint8_t character_set[]) {
402 auto pointer =
403 std::find_if(input.begin(), input.end(), [character_set](const char c) {
404 return character_sets::bit_at(character_set, c);
405 });
406 // Optimization: Don't iterate if percent encode is not required
407 if (pointer == input.end()) {
408 return std::string(input);
409 }
410
411 std::string result;
412 result.reserve(input.length()); // in the worst case, percent encoding might
413 // produce 3 characters.
414 result.append(input.substr(0, std::distance(input.begin(), pointer)));
415
416 for (; pointer != input.end(); pointer++) {
418 result.append(character_sets::hex + uint8_t(*pointer) * 4, 3);
419 } else {
420 result += *pointer;
421 }
422 }
423
424 return result;
425}
426
427template <bool append>
428bool percent_encode(const std::string_view input, const uint8_t character_set[],
429 std::string& out) {
430 ada_log("percent_encode ", input, " to output string while ",
431 append ? "appending" : "overwriting");
432 auto pointer =
433 std::find_if(input.begin(), input.end(), [character_set](const char c) {
434 return character_sets::bit_at(character_set, c);
435 });
436 ada_log("percent_encode done checking, moved to ",
437 std::distance(input.begin(), pointer));
438
439 // Optimization: Don't iterate if percent encode is not required
440 if (pointer == input.end()) {
441 ada_log("percent_encode encoding not needed.");
442 return false;
443 }
444 if (!append) {
445 out.clear();
446 }
447 ada_log("percent_encode appending ", std::distance(input.begin(), pointer),
448 " bytes");
449 out.append(input.data(), std::distance(input.begin(), pointer));
450 ada_log("percent_encode processing ", std::distance(pointer, input.end()),
451 " bytes");
452 for (; pointer != input.end(); pointer++) {
454 out.append(character_sets::hex + uint8_t(*pointer) * 4, 3);
455 } else {
456 out += *pointer;
457 }
458 }
459 return true;
460}
461
462bool to_ascii(std::optional<std::string>& out, const std::string_view plain,
463 size_t first_percent) {
464 std::string percent_decoded_buffer;
465 std::string_view input = plain;
466 if (first_percent != std::string_view::npos) {
467 percent_decoded_buffer = unicode::percent_decode(plain, first_percent);
469 }
470 // input is a non-empty UTF-8 string, must be percent decoded
471 std::string idna_ascii = ada::idna::to_ascii(input);
473 idna_ascii.data(), idna_ascii.size())) {
474 return false;
475 }
476 out = std::move(idna_ascii);
477 return true;
478}
479
480std::string percent_encode(const std::string_view input,
481 const uint8_t character_set[], size_t index) {
482 std::string out;
483 out.append(input.data(), index);
484 auto pointer = input.begin() + index;
485 for (; pointer != input.end(); pointer++) {
487 out.append(character_sets::hex + uint8_t(*pointer) * 4, 3);
488 } else {
489 out += *pointer;
490 }
491 }
492 return out;
493}
494
495std::string to_unicode(std::string_view input) {
497}
498
499} // namespace ada::unicode
Includes all definitions for Ada.
Definitions of the character sets used by unicode functions.
Common definitions for cross-platform compiler support.
#define ada_constexpr
#define ADA_PUSH_DISABLE_ALL_WARNINGS
#define ADA_POP_DISABLE_WARNINGS
#define ada_really_inline
Definition common_defs.h:84
ada_really_inline bool bit_at(const uint8_t a[], const uint8_t i)
constexpr char hex[1024]
std::string to_ascii(std::string_view ut8_string)
std::string to_unicode(std::string_view input)
bool contains_forbidden_domain_code_point(std::string_view ascii_string)
Includes the declarations for unicode operations.
static constexpr std::array< uint8_t, 256 > is_forbidden_domain_code_point_table
Definition unicode.cpp:179
static constexpr std::array< uint8_t, 256 > is_forbidden_domain_code_point_table_or_upper
Definition unicode.cpp:219
static constexpr char hex_to_binary_table[]
Definition unicode.cpp:358
constexpr uint64_t broadcast(uint8_t v) noexcept
Definition unicode.cpp:19
constexpr std::string_view table_is_double_dot_path_segment[]
Definition unicode.cpp:297
static constexpr std::array< uint8_t, 256 > is_forbidden_host_code_point_table
Definition unicode.cpp:164
static constexpr std::array< bool, 256 > is_alnum_plus_table
Definition unicode.cpp:260
tl::expected< result_type, ada::errors > result
ada_warn_unused ada::result< result_type > parse(std::string_view input, const result_type *base_url=nullptr)
Definitions for all unicode specific functions.