Ada 2.9.2
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
helpers.cpp
Go to the documentation of this file.
1#include "ada.h"
2#include "ada/checkers-inl.h"
3#include "ada/common_defs.h"
4#include "ada/scheme.h"
5
6#include <cstring>
7#include <sstream>
8
9namespace ada::helpers {
10
/**
 * Writes `view` to `out` as the contents of a JSON string (without the
 * surrounding quotes). A backslash and a double quote are each preceded by a
 * backslash, bytes in the range 0x00-0x1f are emitted as \u00XX with lowercase
 * hexadecimal digits, and every other byte is copied unchanged.
 */
template <typename out_iter>
void encode_json(std::string_view view, out_iter out) {
  // Simple byte-by-byte implementation; it could be made faster.
  constexpr char hex_digits[] = "0123456789abcdef";
  for (const char raw : view) {
    const auto byte = static_cast<uint8_t>(raw);
    switch (byte) {
      case '\\':
        *out++ = '\\';
        *out++ = '\\';
        break;
      case '"':
        *out++ = '\\';
        *out++ = '"';
        break;
      default:
        if (byte <= 0x1f) {
          // Control character: \u00 followed by the two hex digits.
          *out++ = '\\';
          *out++ = 'u';
          *out++ = '0';
          *out++ = '0';
          *out++ = hex_digits[byte >> 4];
          *out++ = hex_digits[byte & 0xf];
        } else {
          *out++ = byte;
        }
        break;
    }
  }
}
35
37 switch (s) {
39 return "Authority";
41 return "Scheme Start";
43 return "Scheme";
45 return "Host";
47 return "No Scheme";
49 return "Fragment";
51 return "Relative Scheme";
53 return "Relative Slash";
55 return "File";
57 return "File Host";
59 return "File Slash";
61 return "Path or Authority";
63 return "Special Authority Ignore Slashes";
65 return "Special Authority Slashes";
67 return "Special Relative or Authority";
69 return "Query";
71 return "Path";
73 return "Path Start";
75 return "Opaque Path";
77 return "Port";
78 default:
79 return "unknown state";
80 }
81}
82
83ada_really_inline std::optional<std::string_view> prune_hash(
84 std::string_view& input) noexcept {
85 // compiles down to 20--30 instructions including a class to memchr (C
86 // function). this function should be quite fast.
87 size_t location_of_first = input.find('#');
88 if (location_of_first == std::string_view::npos) {
89 return std::nullopt;
90 }
91 std::string_view hash = input;
92 hash.remove_prefix(location_of_first + 1);
93 input.remove_suffix(input.size() - location_of_first);
94 return hash;
95}
96
97ada_really_inline bool shorten_path(std::string& path,
98 ada::scheme::type type) noexcept {
99 // Let path be url's path.
100 // If url's scheme is "file", path's size is 1, and path[0] is a normalized
101 // Windows drive letter, then return.
102 if (type == ada::scheme::type::FILE &&
103 path.find('/', 1) == std::string_view::npos && !path.empty()) {
105 helpers::substring(path, 1))) {
106 return false;
107 }
108 }
109
110 // Remove path's last item, if any.
111 size_t last_delimiter = path.rfind('/');
112 if (last_delimiter != std::string::npos) {
113 path.erase(last_delimiter);
114 return true;
115 }
116
117 return false;
118}
119
120ada_really_inline bool shorten_path(std::string_view& path,
121 ada::scheme::type type) noexcept {
122 // Let path be url's path.
123 // If url's scheme is "file", path's size is 1, and path[0] is a normalized
124 // Windows drive letter, then return.
125 if (type == ada::scheme::type::FILE &&
126 path.find('/', 1) == std::string_view::npos && !path.empty()) {
128 helpers::substring(path, 1))) {
129 return false;
130 }
131 }
132
133 // Remove path's last item, if any.
134 if (!path.empty()) {
135 size_t slash_loc = path.rfind('/');
136 if (slash_loc != std::string_view::npos) {
137 path.remove_suffix(path.size() - slash_loc);
138 return true;
139 }
140 }
141
142 return false;
143}
144
145ada_really_inline void remove_ascii_tab_or_newline(
146 std::string& input) noexcept {
147 // if this ever becomes a performance issue, we could use an approach similar
148 // to has_tabs_or_newline
149 std::erase_if(input, ada::unicode::is_ascii_tab_or_newline);
150}
151
152ada_really_inline constexpr std::string_view substring(std::string_view input,
153 size_t pos) noexcept {
154 ADA_ASSERT_TRUE(pos <= input.size());
155 // The following is safer but unneeded if we have the above line:
156 // return pos > input.size() ? std::string_view() : input.substr(pos);
157 return input.substr(pos);
158}
159
160ada_really_inline void resize(std::string_view& input, size_t pos) noexcept {
161 ADA_ASSERT_TRUE(pos <= input.size());
162 input.remove_suffix(input.size() - pos);
163}
164
165// computes the number of trailing zeroes
166// this is a private inline function only defined in this source file.
167ada_really_inline int trailing_zeroes(uint32_t input_num) noexcept {
168#ifdef ADA_REGULAR_VISUAL_STUDIO
169 unsigned long ret;
170 // Search the mask data from least significant bit (LSB)
171 // to the most significant bit (MSB) for a set bit (1).
172 _BitScanForward(&ret, input_num);
173 return (int)ret;
174#else // ADA_REGULAR_VISUAL_STUDIO
175 return __builtin_ctzl(input_num);
176#endif // ADA_REGULAR_VISUAL_STUDIO
177}
178
179// starting at index location, this finds the next location of a character
180// :, /, \\, ? or [. If none is found, view.size() is returned.
181// For use within get_host_delimiter_location.
182#if ADA_NEON
183// The ada_make_uint8x16_t macro is necessary because Visual Studio does not
184// support direct initialization of uint8x16_t. See
185// https://developercommunity.visualstudio.com/t/error-C2078:-too-many-initializers-whe/402911?q=backend+neon
// The macro expands to an immediately-invoked lambda that stores its sixteen
// arguments in a static array and loads that array with vld1q_u8.
186#ifndef ada_make_uint8x16_t
187#define ada_make_uint8x16_t(x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, \
188 x13, x14, x15, x16) \
189 ([=]() { \
190 static uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8, \
191 x9, x10, x11, x12, x13, x14, x15, x16}; \
192 return vld1q_u8(array); \
193 }())
194#endif
195
197 std::string_view view, size_t location) noexcept {
198 // first check for short strings in which case we do it naively.
199 if (view.size() - location < 16) { // slow path
200 for (size_t i = location; i < view.size(); i++) {
201 if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
202 view[i] == '?' || view[i] == '[') {
203 return i;
204 }
205 }
206 return size_t(view.size());
207 }
208 auto to_bitmask = [](uint8x16_t input) -> uint16_t {
209 uint8x16_t bit_mask =
210 ada_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x01,
211 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
212 uint8x16_t minput = vandq_u8(input, bit_mask);
213 uint8x16_t tmp = vpaddq_u8(minput, minput);
214 tmp = vpaddq_u8(tmp, tmp);
215 tmp = vpaddq_u8(tmp, tmp);
216 return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
217 };
218
219 // fast path for long strings (expected to be common)
220 size_t i = location;
221 uint8x16_t low_mask =
222 ada_make_uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
223 0x00, 0x01, 0x04, 0x04, 0x00, 0x00, 0x03);
224 uint8x16_t high_mask =
225 ada_make_uint8x16_t(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00,
226 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
227 uint8x16_t fmask = vmovq_n_u8(0xf);
228 uint8x16_t zero{0};
229 for (; i + 15 < view.size(); i += 16) {
230 uint8x16_t word = vld1q_u8((const uint8_t*)view.data() + i);
231 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
232 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
233 uint8x16_t classify = vandq_u8(lowpart, highpart);
234 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
235 uint8x16_t is_zero = vceqq_u8(classify, zero);
236 uint16_t is_non_zero = ~to_bitmask(is_zero);
237 return i + trailing_zeroes(is_non_zero);
238 }
239 }
240
241 if (i < view.size()) {
242 uint8x16_t word =
243 vld1q_u8((const uint8_t*)view.data() + view.length() - 16);
244 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
245 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
246 uint8x16_t classify = vandq_u8(lowpart, highpart);
247 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
248 uint8x16_t is_zero = vceqq_u8(classify, zero);
249 uint16_t is_non_zero = ~to_bitmask(is_zero);
250 return view.length() - 16 + trailing_zeroes(is_non_zero);
251 }
252 }
253 return size_t(view.size());
254}
255#elif ADA_SSE2
257 std::string_view view, size_t location) noexcept {
258 // first check for short strings in which case we do it naively.
259 if (view.size() - location < 16) { // slow path
260 for (size_t i = location; i < view.size(); i++) {
261 if (view[i] == ':' || view[i] == '/' || view[i] == '\\' ||
262 view[i] == '?' || view[i] == '[') {
263 return i;
264 }
265 }
266 return size_t(view.size());
267 }
268 // fast path for long strings (expected to be common)
269 size_t i = location;
270 const __m128i mask1 = _mm_set1_epi8(':');
271 const __m128i mask2 = _mm_set1_epi8('/');
272 const __m128i mask3 = _mm_set1_epi8('\\');
273 const __m128i mask4 = _mm_set1_epi8('?');
274 const __m128i mask5 = _mm_set1_epi8('[');
275
276 for (; i + 15 < view.size(); i += 16) {
277 __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i));
278 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
279 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
280 __m128i m3 = _mm_cmpeq_epi8(word, mask3);
281 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
282 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
283 __m128i m = _mm_or_si128(
284 _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m3, m4)), m5);
285 int mask = _mm_movemask_epi8(m);
286 if (mask != 0) {
287 return i + trailing_zeroes(mask);
288 }
289 }
290 if (i < view.size()) {
291 __m128i word =
292 _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16));
293 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
294 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
295 __m128i m3 = _mm_cmpeq_epi8(word, mask3);
296 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
297 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
298 __m128i m = _mm_or_si128(
299 _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m3, m4)), m5);
300 int mask = _mm_movemask_epi8(m);
301 if (mask != 0) {
302 return view.length() - 16 + trailing_zeroes(mask);
303 }
304 }
305 return size_t(view.length());
306}
307#else
308// : / [ \\ ?
// Byte-indexed lookup table: an entry is 1 for the five characters
// ':', '/', '[', '\\' and '?', and 0 for every other byte value.
static constexpr std::array<uint8_t, 256> special_host_delimiters =
    []() constexpr {
      std::array<uint8_t, 256> table{};
      table[':'] = 1;
      table['/'] = 1;
      table['['] = 1;
      table['\\'] = 1;
      table['?'] = 1;
      return table;
    }();
317// credit: @the-moisrex recommended a table-based approach
319 std::string_view view, size_t location) noexcept {
320 auto const str = view.substr(location);
321 for (auto pos = str.begin(); pos != str.end(); ++pos) {
322 if (special_host_delimiters[(uint8_t)*pos]) {
323 return pos - str.begin() + location;
324 }
325 }
326 return size_t(view.size());
327}
328#endif
329
330// starting at index location, this finds the next location of a character
331// :, /, ? or [. If none is found, view.size() is returned.
332// For use within get_host_delimiter_location.
333#if ADA_NEON
334ada_really_inline size_t find_next_host_delimiter(std::string_view view,
335 size_t location) noexcept {
336 // first check for short strings in which case we do it naively.
337 if (view.size() - location < 16) { // slow path
338 for (size_t i = location; i < view.size(); i++) {
339 if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
340 view[i] == '[') {
341 return i;
342 }
343 }
344 return size_t(view.size());
345 }
346 auto to_bitmask = [](uint8x16_t input) -> uint16_t {
347 uint8x16_t bit_mask =
348 ada_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x01,
349 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
350 uint8x16_t minput = vandq_u8(input, bit_mask);
351 uint8x16_t tmp = vpaddq_u8(minput, minput);
352 tmp = vpaddq_u8(tmp, tmp);
353 tmp = vpaddq_u8(tmp, tmp);
354 return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
355 };
356
357 // fast path for long strings (expected to be common)
358 size_t i = location;
359 uint8x16_t low_mask =
360 ada_make_uint8x16_t(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
361 0x00, 0x01, 0x04, 0x00, 0x00, 0x00, 0x03);
362 uint8x16_t high_mask =
363 ada_make_uint8x16_t(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00,
364 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
365 uint8x16_t fmask = vmovq_n_u8(0xf);
366 uint8x16_t zero{0};
367 for (; i + 15 < view.size(); i += 16) {
368 uint8x16_t word = vld1q_u8((const uint8_t*)view.data() + i);
369 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
370 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
371 uint8x16_t classify = vandq_u8(lowpart, highpart);
372 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
373 uint8x16_t is_zero = vceqq_u8(classify, zero);
374 uint16_t is_non_zero = ~to_bitmask(is_zero);
375 return i + trailing_zeroes(is_non_zero);
376 }
377 }
378
379 if (i < view.size()) {
380 uint8x16_t word =
381 vld1q_u8((const uint8_t*)view.data() + view.length() - 16);
382 uint8x16_t lowpart = vqtbl1q_u8(low_mask, vandq_u8(word, fmask));
383 uint8x16_t highpart = vqtbl1q_u8(high_mask, vshrq_n_u8(word, 4));
384 uint8x16_t classify = vandq_u8(lowpart, highpart);
385 if (vmaxvq_u32(vreinterpretq_u32_u8(classify)) != 0) {
386 uint8x16_t is_zero = vceqq_u8(classify, zero);
387 uint16_t is_non_zero = ~to_bitmask(is_zero);
388 return view.length() - 16 + trailing_zeroes(is_non_zero);
389 }
390 }
391 return size_t(view.size());
392}
393#elif ADA_SSE2
394ada_really_inline size_t find_next_host_delimiter(std::string_view view,
395 size_t location) noexcept {
396 // first check for short strings in which case we do it naively.
397 if (view.size() - location < 16) { // slow path
398 for (size_t i = location; i < view.size(); i++) {
399 if (view[i] == ':' || view[i] == '/' || view[i] == '?' ||
400 view[i] == '[') {
401 return i;
402 }
403 }
404 return size_t(view.size());
405 }
406 // fast path for long strings (expected to be common)
407 size_t i = location;
408 const __m128i mask1 = _mm_set1_epi8(':');
409 const __m128i mask2 = _mm_set1_epi8('/');
410 const __m128i mask4 = _mm_set1_epi8('?');
411 const __m128i mask5 = _mm_set1_epi8('[');
412
413 for (; i + 15 < view.size(); i += 16) {
414 __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i));
415 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
416 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
417 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
418 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
419 __m128i m = _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m4, m5));
420 int mask = _mm_movemask_epi8(m);
421 if (mask != 0) {
422 return i + trailing_zeroes(mask);
423 }
424 }
425 if (i < view.size()) {
426 __m128i word =
427 _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16));
428 __m128i m1 = _mm_cmpeq_epi8(word, mask1);
429 __m128i m2 = _mm_cmpeq_epi8(word, mask2);
430 __m128i m4 = _mm_cmpeq_epi8(word, mask4);
431 __m128i m5 = _mm_cmpeq_epi8(word, mask5);
432 __m128i m = _mm_or_si128(_mm_or_si128(m1, m2), _mm_or_si128(m4, m5));
433 int mask = _mm_movemask_epi8(m);
434 if (mask != 0) {
435 return view.length() - 16 + trailing_zeroes(mask);
436 }
437 }
438 return size_t(view.length());
439}
440#else
441// : / [ ?
// Byte-indexed lookup table: an entry is 1 for the four characters
// ':', '/', '?' and '[', and 0 for every other byte value.
static constexpr std::array<uint8_t, 256> host_delimiters = []() constexpr {
  std::array<uint8_t, 256> table{};
  table[':'] = 1;
  table['/'] = 1;
  table['?'] = 1;
  table['['] = 1;
  return table;
}();
449// credit: @the-moisrex recommended a table-based approach
450ada_really_inline size_t find_next_host_delimiter(std::string_view view,
451 size_t location) noexcept {
452 auto const str = view.substr(location);
453 for (auto pos = str.begin(); pos != str.end(); ++pos) {
454 if (host_delimiters[(uint8_t)*pos]) {
455 return pos - str.begin() + location;
456 }
457 }
458 return size_t(view.size());
459}
460#endif
461
462ada_really_inline std::pair<size_t, bool> get_host_delimiter_location(
463 const bool is_special, std::string_view& view) noexcept {
472 const size_t view_size = view.size();
473 size_t location = 0;
474 bool found_colon = false;
494 if (is_special) {
495 // We move to the next delimiter.
496 location = find_next_host_delimiter_special(view, location);
497 // Unless we find '[' then we are going only going to have to call
498 // find_next_host_delimiter_special once.
499 for (; location < view_size;
500 location = find_next_host_delimiter_special(view, location)) {
501 if (view[location] == '[') {
502 location = view.find(']', location);
503 if (location == std::string_view::npos) {
504 // performance: view.find might get translated to a memchr, which
505 // has no notion of std::string_view::npos, so the code does not
506 // reflect the assembly.
507 location = view_size;
508 break;
509 }
510 } else {
511 found_colon = view[location] == ':';
512 break;
513 }
514 }
515 } else {
516 // We move to the next delimiter.
517 location = find_next_host_delimiter(view, location);
518 // Unless we find '[' then we are going only going to have to call
519 // find_next_host_delimiter_special once.
520 for (; location < view_size;
521 location = find_next_host_delimiter(view, location)) {
522 if (view[location] == '[') {
523 location = view.find(']', location);
524 if (location == std::string_view::npos) {
525 // performance: view.find might get translated to a memchr, which
526 // has no notion of std::string_view::npos, so the code does not
527 // reflect the assembly.
528 location = view_size;
529 break;
530 }
531 } else {
532 found_colon = view[location] == ':';
533 break;
534 }
535 }
536 }
537 // performance: remove_suffix may translate into a single instruction.
538 view.remove_suffix(view_size - location);
539 return {location, found_colon};
540}
541
542ada_really_inline void trim_c0_whitespace(std::string_view& input) noexcept {
543 while (!input.empty() &&
544 ada::unicode::is_c0_control_or_space(input.front())) {
545 input.remove_prefix(1);
546 }
547 while (!input.empty() && ada::unicode::is_c0_control_or_space(input.back())) {
548 input.remove_suffix(1);
549 }
550}
551
552ada_really_inline void parse_prepared_path(std::string_view input,
554 std::string& path) {
555 ada_log("parse_prepared_path ", input);
556 uint8_t accumulator = checkers::path_signature(input);
557 // Let us first detect a trivial case.
558 // If it is special, we check that we have no dot, no %, no \ and no
559 // character needing percent encoding. Otherwise, we check that we have no %,
560 // no dot, and no character needing percent encoding.
561 constexpr uint8_t need_encoding = 1;
562 constexpr uint8_t backslash_char = 2;
563 constexpr uint8_t dot_char = 4;
564 constexpr uint8_t percent_char = 8;
565 bool special = type != ada::scheme::NOT_SPECIAL;
566 bool may_need_slow_file_handling = (type == ada::scheme::type::FILE &&
568 bool trivial_path =
569 (special ? (accumulator == 0)
570 : ((accumulator & (need_encoding | dot_char | percent_char)) ==
571 0)) &&
572 (!may_need_slow_file_handling);
573 if (accumulator == dot_char && !may_need_slow_file_handling) {
574 // '4' means that we have at least one dot, but nothing that requires
575 // percent encoding or decoding. The only part that is not trivial is
576 // that we may have single dots and double dots path segments.
577 // If we have such segments, then we either have a path that begins
578 // with '.' (easy to check), or we have the sequence './'.
579 // Note: input cannot be empty, it must at least contain one character ('.')
580 // Note: we know that '\' is not present.
581 if (input[0] != '.') {
582 size_t slashdot = input.find("/.");
583 if (slashdot == std::string_view::npos) { // common case
584 trivial_path = true;
585 } else { // uncommon
586 // only three cases matter: /./, /.. or a final /
587 trivial_path =
588 !(slashdot + 2 == input.size() || input[slashdot + 2] == '.' ||
589 input[slashdot + 2] == '/');
590 }
591 }
592 }
593 if (trivial_path) {
594 ada_log("parse_path trivial");
595 path += '/';
596 path += input;
597 return;
598 }
599 // We are going to need to look a bit at the path, but let us see if we can
600 // ignore percent encoding *and* backslashes *and* percent characters.
601 // Except for the trivial case, this is likely to capture 99% of paths out
602 // there.
603 bool fast_path =
604 (special &&
605 (accumulator & (need_encoding | backslash_char | percent_char)) == 0) &&
606 (type != ada::scheme::type::FILE);
607 if (fast_path) {
608 ada_log("parse_prepared_path fast");
609 // Here we don't need to worry about \ or percent encoding.
610 // We also do not have a file protocol. We might have dots, however,
611 // but dots must as appear as '.', and they cannot be encoded because
612 // the symbol '%' is not present.
613 size_t previous_location = 0; // We start at 0.
614 do {
615 size_t new_location = input.find('/', previous_location);
616 // std::string_view path_view = input;
617 // We process the last segment separately:
618 if (new_location == std::string_view::npos) {
619 std::string_view path_view = input.substr(previous_location);
620 if (path_view == "..") { // The path ends with ..
621 // e.g., if you receive ".." with an empty path, you go to "/".
622 if (path.empty()) {
623 path = '/';
624 return;
625 }
626 // Fast case where we have nothing to do:
627 if (path.back() == '/') {
628 return;
629 }
630 // If you have the path "/joe/myfriend",
631 // then you delete 'myfriend'.
632 path.resize(path.rfind('/') + 1);
633 return;
634 }
635 path += '/';
636 if (path_view != ".") {
637 path.append(path_view);
638 }
639 return;
640 } else {
641 // This is a non-final segment.
642 std::string_view path_view =
643 input.substr(previous_location, new_location - previous_location);
644 previous_location = new_location + 1;
645 if (path_view == "..") {
646 size_t last_delimiter = path.rfind('/');
647 if (last_delimiter != std::string::npos) {
648 path.erase(last_delimiter);
649 }
650 } else if (path_view != ".") {
651 path += '/';
652 path.append(path_view);
653 }
654 }
655 } while (true);
656 } else {
657 ada_log("parse_path slow");
658 // we have reached the general case
659 bool needs_percent_encoding = (accumulator & 1);
660 std::string path_buffer_tmp;
661 do {
662 size_t location = (special && (accumulator & 2))
663 ? input.find_first_of("/\\")
664 : input.find('/');
665 std::string_view path_view = input;
666 if (location != std::string_view::npos) {
667 path_view.remove_suffix(path_view.size() - location);
668 input.remove_prefix(location + 1);
669 }
670 // path_buffer is either path_view or it might point at a percent encoded
671 // temporary file.
672 std::string_view path_buffer =
673 (needs_percent_encoding &&
674 ada::unicode::percent_encode<false>(
675 path_view, character_sets::PATH_PERCENT_ENCODE, path_buffer_tmp))
676 ? path_buffer_tmp
677 : path_view;
678 if (unicode::is_double_dot_path_segment(path_buffer)) {
679 if ((helpers::shorten_path(path, type) || special) &&
680 location == std::string_view::npos) {
681 path += '/';
682 }
683 } else if (unicode::is_single_dot_path_segment(path_buffer) &&
684 (location == std::string_view::npos)) {
685 path += '/';
686 }
687 // Otherwise, if path_buffer is not a single-dot path segment, then:
688 else if (!unicode::is_single_dot_path_segment(path_buffer)) {
689 // If url's scheme is "file", url's path is empty, and path_buffer is a
690 // Windows drive letter, then replace the second code point in
691 // path_buffer with U+003A (:).
692 if (type == ada::scheme::type::FILE && path.empty() &&
694 path += '/';
695 path += path_buffer[0];
696 path += ':';
697 path_buffer.remove_prefix(2);
698 path.append(path_buffer);
699 } else {
700 // Append path_buffer to url's path.
701 path += '/';
702 path.append(path_buffer);
703 }
704 }
705 if (location == std::string_view::npos) {
706 return;
707 }
708 } while (true);
709 }
710}
711
712bool overlaps(std::string_view input1, const std::string& input2) noexcept {
713 ada_log("helpers::overlaps check if string_view '", input1, "' [",
714 input1.size(), " bytes] is part of string '", input2, "' [",
715 input2.size(), " bytes]");
716 return !input1.empty() && !input2.empty() && input1.data() >= input2.data() &&
717 input1.data() < input2.data() + input2.size();
718}
719
720template <class url_type>
721ada_really_inline void strip_trailing_spaces_from_opaque_path(
722 url_type& url) noexcept {
723 ada_log("helpers::strip_trailing_spaces_from_opaque_path");
724 if (!url.has_opaque_path) return;
725 if (url.has_hash()) return;
726 if (url.has_search()) return;
727
728 auto path = std::string(url.get_pathname());
729 while (!path.empty() && path.back() == ' ') {
730 path.resize(path.size() - 1);
731 }
732 url.update_base_pathname(path);
733}
734
735// @ / \\ ?
// Byte-indexed lookup table: an entry is 1 for the four characters
// '@', '/', '\\' and '?', and 0 for every other byte value.
static constexpr std::array<uint8_t, 256> authority_delimiter_special =
    []() constexpr {
      std::array<uint8_t, 256> table{};
      table['@'] = 1;
      table['/'] = 1;
      table['\\'] = 1;
      table['?'] = 1;
      return table;
    }();
744// credit: @the-moisrex recommended a table-based approach
746find_authority_delimiter_special(std::string_view view) noexcept {
747 // performance note: we might be able to gain further performance
748 // with SIMD intrinsics.
749 for (auto pos = view.begin(); pos != view.end(); ++pos) {
750 if (authority_delimiter_special[(uint8_t)*pos]) {
751 return pos - view.begin();
752 }
753 }
754 return size_t(view.size());
755}
756
757// @ / ?
// Byte-indexed lookup table: an entry is 1 for the three characters
// '@', '/' and '?', and 0 for every other byte value.
static constexpr std::array<uint8_t, 256> authority_delimiter = []() constexpr {
  std::array<uint8_t, 256> table{};
  table['@'] = 1;
  table['/'] = 1;
  table['?'] = 1;
  return table;
}();
765// credit: @the-moisrex recommended a table-based approach
767find_authority_delimiter(std::string_view view) noexcept {
768 // performance note: we might be able to gain further performance
769 // with SIMD intrinsics.
770 for (auto pos = view.begin(); pos != view.end(); ++pos) {
771 if (authority_delimiter[(uint8_t)*pos]) {
772 return pos - view.begin();
773 }
774 }
775 return size_t(view.size());
776}
777
778} // namespace ada::helpers
779
780namespace ada {
784#undef ada_make_uint8x16_t
785} // namespace ada
Includes all definitions for Ada.
Definitions for URL specific checkers used within Ada.
Common definitions for cross-platform compiler support.
#define ADA_ASSERT_TRUE(COND)
#define ada_unused
Definition common_defs.h:80
#define ada_warn_unused
Definition common_defs.h:81
#define ada_really_inline
Definition common_defs.h:77
constexpr uint8_t PATH_PERCENT_ENCODE[32]
constexpr bool is_normalized_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_windows_drive_letter(std::string_view input) noexcept
Includes the definitions for helper functions.
ada_really_inline size_t find_next_host_delimiter(std::string_view view, size_t location) noexcept
Definition helpers.cpp:450
static constexpr std::array< uint8_t, 256 > authority_delimiter_special
Definition helpers.cpp:736
static constexpr std::array< uint8_t, 256 > host_delimiters
Definition helpers.cpp:442
ada_really_inline size_t find_next_host_delimiter_special(std::string_view view, size_t location) noexcept
Definition helpers.cpp:318
ada_unused std::string get_state(ada::state s)
Definition helpers.cpp:36
static constexpr std::array< uint8_t, 256 > authority_delimiter
Definition helpers.cpp:758
static constexpr std::array< uint8_t, 256 > special_host_delimiters
Definition helpers.cpp:309
ada_really_inline int trailing_zeroes(uint32_t input_num) noexcept
Definition helpers.cpp:167
@ NOT_SPECIAL
Definition scheme.h:31
Definition ada_idna.h:13
ada_warn_unused std::string to_string(encoding_type type)
state
Definition state.h:17
@ SPECIAL_RELATIVE_OR_AUTHORITY
@ SPECIAL_AUTHORITY_SLASHES
@ SPECIAL_AUTHORITY_IGNORE_SLASHES
tl::expected< result_type, ada::errors > result
Declarations for the URL scheme.