// Ada 2.7.8 - Fast spec-compliant URL parser
// File: url_aggregator.cpp
1#include "ada.h"
2#include "ada/checkers-inl.h"
3#include "ada/checkers.h"
4#include "ada/helpers.h"
6#include "ada/scheme.h"
7#include "ada/unicode-inl.h"
11#include "ada/parser.h"
12
13#include <string>
14#include <string_view>
15
16namespace ada {
// Parses a scheme that is passed together with its trailing ':' and installs
// it as this URL's scheme.
//
// Template parameter `has_state_override`: named after the URL specification's
// "state override"; the quoted spec comments below describe the early-return
// checks that go with it.
// Parameter `input_with_colon`: the scheme text including the final ':'. It
// must not overlap `buffer` (asserted) and must be non-empty, since
// remove_suffix(1) is applied unconditionally.
// Returns: every return statement visible in this listing yields true.
//
// Visible structure: a fast path that writes the scheme with
// set_scheme_from_view_with_colon(input_with_colon), and a slow path that
// copies the colon-less input, lower-cases it with unicode::to_lower_ascii and
// writes it with set_scheme(_buffer). Both paths call clear_port() when
// components.port equals urls_scheme_port.
//
// NOTE(review): this listing has lost several source lines (the embedded line
// numbers jump, e.g. 24 -> 31, 41 -> 43, 58 -> 60). The definitions of
// `is_input_special` and `urls_scheme_port`, and the conditions guarding some
// of the braces below, are among the missing lines. Restore them from the
// upstream file before building.
17template <bool has_state_override>
18[[nodiscard]] ada_really_inline bool url_aggregator::parse_scheme_with_colon(
19 const std::string_view input_with_colon) {
20 ada_log("url_aggregator::parse_scheme_with_colon ", input_with_colon);
22 ADA_ASSERT_TRUE(!helpers::overlaps(input_with_colon, buffer));
23 std::string_view input{input_with_colon};
// Drop the trailing ':' so that `input` holds the bare scheme.
24 input.remove_suffix(1);
31 if (is_input_special) { // fast path!!!
33 // If url's scheme is not a special scheme and buffer is a special scheme,
34 // then return.
36 return true;
37 }
38
39 // If url includes credentials or has a non-null port, and buffer is
40 // "file", then return.
41 if ((has_credentials() || components.port != url_components::omitted) &&
43 return true;
44 }
45
46 // If url's scheme is "file" and its host is an empty host, then return.
47 // An empty host is the empty string.
48 if (type == ada::scheme::type::FILE &&
49 components.host_start == components.host_end) {
50 return true;
51 }
52 }
53
55 set_scheme_from_view_with_colon(input_with_colon);
56
58 // This is uncommon.
60
61 // If url's port is url's scheme's default port, then set url's port to
62 // null.
63 if (components.port == urls_scheme_port) {
64 clear_port();
65 }
66 }
67 } else { // slow path
// The slow path works on an owned, lower-cased copy of the scheme.
68 std::string _buffer(input);
69 // Next function is only valid if the input is ASCII and returns false
70 // otherwise, but it seems that we always have ascii content so we do not
71 // need to check the return value.
72 unicode::to_lower_ascii(_buffer.data(), _buffer.size());
73
75 // If url's scheme is a special scheme and buffer is not a special scheme,
76 // then return. If url's scheme is not a special scheme and buffer is a
77 // special scheme, then return.
78 if (is_special() != ada::scheme::is_special(_buffer)) {
79 return true;
80 }
81
82 // If url includes credentials or has a non-null port, and buffer is
83 // "file", then return.
84 if ((has_credentials() || components.port != url_components::omitted) &&
85 _buffer == "file") {
86 return true;
87 }
88
89 // If url's scheme is "file" and its host is an empty host, then return.
90 // An empty host is the empty string.
91 if (type == ada::scheme::type::FILE &&
92 components.host_start == components.host_end) {
93 return true;
94 }
95 }
96
97 set_scheme(_buffer);
98
100 // This is uncommon.
102
103 // If url's port is url's scheme's default port, then set url's port to
104 // null.
105 if (components.port == urls_scheme_port) {
106 clear_port();
107 }
108 }
109 }
111 return true;
112}
113
114inline void url_aggregator::copy_scheme(const url_aggregator& u) noexcept {
115 ada_log("url_aggregator::copy_scheme ", u.buffer);
116 ADA_ASSERT_TRUE(validate());
117 // next line could overflow but unsigned arithmetic has well-defined
118 // overflows.
119 uint32_t new_difference = u.components.protocol_end - components.protocol_end;
120 type = u.type;
121 buffer.erase(0, components.protocol_end);
122 buffer.insert(0, u.get_protocol());
123 components.protocol_end = u.components.protocol_end;
124
125 // No need to update the components
126 if (new_difference == 0) {
127 return;
128 }
129
130 // Update the rest of the components.
131 components.username_end += new_difference;
132 components.host_start += new_difference;
133 components.host_end += new_difference;
134 components.pathname_start += new_difference;
135 if (components.search_start != url_components::omitted) {
136 components.search_start += new_difference;
137 }
138 if (components.hash_start != url_components::omitted) {
139 components.hash_start += new_difference;
140 }
141 ADA_ASSERT_TRUE(validate());
142}
143
144inline void url_aggregator::set_scheme_from_view_with_colon(
145 std::string_view new_scheme_with_colon) noexcept {
146 ada_log("url_aggregator::set_scheme_from_view_with_colon ",
148 ADA_ASSERT_TRUE(validate());
150 new_scheme_with_colon.back() == ':');
151 // next line could overflow but unsigned arithmetic has well-defined
152 // overflows.
154 uint32_t(new_scheme_with_colon.size()) - components.protocol_end;
155
156 if (buffer.empty()) {
157 buffer.append(new_scheme_with_colon);
158 } else {
159 buffer.erase(0, components.protocol_end);
160 buffer.insert(0, new_scheme_with_colon);
161 }
162 components.protocol_end += new_difference;
163
164 // Update the rest of the components.
165 components.username_end += new_difference;
166 components.host_start += new_difference;
167 components.host_end += new_difference;
168 components.pathname_start += new_difference;
169 if (components.search_start != url_components::omitted) {
170 components.search_start += new_difference;
171 }
172 if (components.hash_start != url_components::omitted) {
173 components.hash_start += new_difference;
174 }
175 ADA_ASSERT_TRUE(validate());
176}
177
178inline void url_aggregator::set_scheme(std::string_view new_scheme) noexcept {
179 ada_log("url_aggregator::set_scheme ", new_scheme);
180 ADA_ASSERT_TRUE(validate());
181 ADA_ASSERT_TRUE(new_scheme.empty() || new_scheme.back() != ':');
182 // next line could overflow but unsigned arithmetic has well-defined
183 // overflows.
185 uint32_t(new_scheme.size()) - components.protocol_end + 1;
186
188 if (buffer.empty()) {
189 buffer.append(helpers::concat(new_scheme, ":"));
190 } else {
191 buffer.erase(0, components.protocol_end);
192 buffer.insert(0, helpers::concat(new_scheme, ":"));
193 }
194 components.protocol_end = uint32_t(new_scheme.size() + 1);
195
196 // Update the rest of the components.
197 components.username_end += new_difference;
198 components.host_start += new_difference;
199 components.host_end += new_difference;
200 components.pathname_start += new_difference;
201 if (components.search_start != url_components::omitted) {
202 components.search_start += new_difference;
203 }
204 if (components.hash_start != url_components::omitted) {
205 components.hash_start += new_difference;
206 }
207 ADA_ASSERT_TRUE(validate());
208}
209
210bool url_aggregator::set_protocol(const std::string_view input) {
211 ada_log("url_aggregator::set_protocol ", input);
213 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
214 std::string view(input);
215 helpers::remove_ascii_tab_or_newline(view);
216 if (view.empty()) {
217 return true;
218 }
219
220 // Schemes should start with alpha values.
221 if (!checkers::is_alpha(view[0])) {
222 return false;
223 }
224
225 view.append(":");
226
227 std::string::iterator pointer =
228 std::find_if_not(view.begin(), view.end(), unicode::is_alnum_plus);
229
230 if (pointer != view.end() && *pointer == ':') {
232 std::string_view(view.data(), pointer - view.begin() + 1));
233 }
234 return false;
235}
236
// Username setter.
// Returns false without modifying anything when
// cannot_have_credentials_or_port() is true; otherwise stores the username
// through update_base_username() and returns true. `input` must not overlap
// `buffer` (asserted).
// When idx == input.size() the input is stored as-is; otherwise a
// percent-encoded temporary produced by ada::unicode::percent_encode is
// stored.
// NOTE(review): this listing has lost source lines (embedded numbers jump
// 243 -> 246 and 250 -> 252): the definition of `idx` and the arguments of the
// percent_encode call are missing. Restore them from the upstream file before
// building.
237bool url_aggregator::set_username(const std::string_view input) {
238 ada_log("url_aggregator::set_username '", input, "' ");
240 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
241 if (cannot_have_credentials_or_port()) {
242 return false;
243 }
246 if (idx == input.size()) {
247 update_base_username(input);
248 } else {
249 // We only create a temporary string if we have to!
250 update_base_username(ada::unicode::percent_encode(
252 }
254 return true;
255}
256
// Password setter.
// Returns false without modifying anything when
// cannot_have_credentials_or_port() is true; otherwise stores the password
// through update_base_password() and returns true. `input` must not overlap
// `buffer` (asserted).
// When idx == input.size() the input is stored as-is; otherwise a
// percent-encoded temporary produced by ada::unicode::percent_encode is
// stored.
// NOTE(review): this listing has lost source lines (embedded numbers jump
// 263 -> 266 and 270 -> 272): the definition of `idx` and the arguments of the
// percent_encode call are missing. Restore them from the upstream file before
// building.
257bool url_aggregator::set_password(const std::string_view input) {
258 ada_log("url_aggregator::set_password '", input, "'");
260 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
261 if (cannot_have_credentials_or_port()) {
262 return false;
263 }
266 if (idx == input.size()) {
267 update_base_password(input);
268 } else {
269 // We only create a temporary string if we have to!
270 update_base_password(ada::unicode::percent_encode(
272 }
274 return true;
275}
276
277bool url_aggregator::set_port(const std::string_view input) {
278 ada_log("url_aggregator::set_port ", input);
280 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
281 if (cannot_have_credentials_or_port()) {
282 return false;
283 }
284 std::string trimmed(input);
285 helpers::remove_ascii_tab_or_newline(trimmed);
286 if (trimmed.empty()) {
287 clear_port();
288 return true;
289 }
290 // Input should not start with control characters.
291 if (ada::unicode::is_c0_control_or_space(trimmed.front())) {
292 return false;
293 }
294 // Input should contain at least one ascii digit.
295 if (input.find_first_of("0123456789") == std::string_view::npos) {
296 return false;
297 }
298
299 // Revert changes if parse_port fails.
300 uint32_t previous_port = components.port;
301 parse_port(trimmed);
302 if (is_valid) {
303 return true;
304 }
305 update_base_port(previous_port);
306 is_valid = true;
308 return false;
309}
310
311bool url_aggregator::set_pathname(const std::string_view input) {
312 ada_log("url_aggregator::set_pathname ", input);
314 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
315 if (has_opaque_path) {
316 return false;
317 }
318 clear_pathname();
319 parse_path(input);
320 if (checkers::begins_with(input, "//") && !has_authority() &&
321 !has_dash_dot()) {
322 buffer.insert(components.pathname_start, "/.");
323 components.pathname_start += 2;
324 }
326 return true;
327}
328
329ada_really_inline void url_aggregator::parse_path(std::string_view input) {
330 ada_log("url_aggregator::parse_path ", input);
332 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
333 std::string tmp_buffer;
334 std::string_view internal_input;
335 if (unicode::has_tabs_or_newline(input)) {
337 // Optimization opportunity: Instead of copying and then pruning, we could
338 // just directly build the string from user_input.
339 helpers::remove_ascii_tab_or_newline(tmp_buffer);
341 } else {
343 }
344
345 // If url is special, then:
346 if (is_special()) {
347 if (internal_input.empty()) {
348 update_base_pathname("/");
349 } else if ((internal_input[0] == '/') || (internal_input[0] == '\\')) {
350 consume_prepared_path(internal_input.substr(1));
351 } else {
352 consume_prepared_path(internal_input);
353 }
354 } else if (!internal_input.empty()) {
355 if (internal_input[0] == '/') {
356 consume_prepared_path(internal_input.substr(1));
357 } else {
358 consume_prepared_path(internal_input);
359 }
360 } else {
361 // Non-special URLs with an empty host can have their paths erased
362 // Path-only URLs cannot have their paths erased
363 if (components.host_start == components.host_end && !has_authority()) {
364 update_base_pathname("/");
365 }
366 }
368}
369
// Search (query) setter.
// An empty `input` clears the search component and then calls
// helpers::strip_trailing_spaces_from_opaque_path(*this). Otherwise one
// leading '?' is dropped, ASCII tabs/newlines are removed from the copy, and
// the result is stored with update_base_search() using
// `query_percent_encode_set`. `input` must not overlap `buffer` (asserted).
// NOTE(review): this listing has lost source lines (embedded numbers jump
// 383 -> 387); the definition of `query_percent_encode_set` is among them.
// Restore it from the upstream file before building.
370void url_aggregator::set_search(const std::string_view input) {
371 ada_log("url_aggregator::set_search ", input);
373 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
374 if (input.empty()) {
375 clear_search();
376 helpers::strip_trailing_spaces_from_opaque_path(*this);
377 return;
378 }
379
380 std::string new_value;
// Skip a single leading '?' if the caller supplied one.
381 new_value = input[0] == '?' ? input.substr(1) : input;
382 helpers::remove_ascii_tab_or_newline(new_value);
383
387
388 update_base_search(new_value, query_percent_encode_set);
390}
391
392void url_aggregator::set_hash(const std::string_view input) {
393 ada_log("url_aggregator::set_hash ", input);
395 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
396 if (input.empty()) {
397 if (components.hash_start != url_components::omitted) {
398 buffer.resize(components.hash_start);
400 }
401 helpers::strip_trailing_spaces_from_opaque_path(*this);
402 return;
403 }
404
405 std::string new_value;
406 new_value = input[0] == '#' ? input.substr(1) : input;
407 helpers::remove_ascii_tab_or_newline(new_value);
408 update_unencoded_base_hash(new_value);
410}
411
412bool url_aggregator::set_href(const std::string_view input) {
413 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
414 ada_log("url_aggregator::set_href ", input, " [", input.size(), " bytes]");
416 ada_log("url_aggregator::set_href, success :", out.has_value());
417
418 if (out) {
419 ada_log("url_aggregator::set_href, parsed ", out->to_string());
420 // TODO: Figure out why the following line puts test to never finish.
421 *this = *out;
422 }
423
424 return out.has_value();
425}
426
427ada_really_inline bool url_aggregator::parse_host(std::string_view input) {
428 ada_log("url_aggregator:parse_host \"", input, "\" [", input.size(),
429 " bytes]");
431 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
432 if (input.empty()) {
433 return is_valid = false;
434 } // technically unnecessary.
435 // If input starts with U+005B ([), then:
436 if (input[0] == '[') {
437 // If input does not end with U+005D (]), validation error, return failure.
438 if (input.back() != ']') {
439 return is_valid = false;
440 }
441 ada_log("parse_host ipv6");
442
443 // Return the result of IPv6 parsing input with its leading U+005B ([) and
444 // trailing U+005D (]) removed.
445 input.remove_prefix(1);
446 input.remove_suffix(1);
447 return parse_ipv6(input);
448 }
449
450 // If isNotSpecial is true, then return the result of opaque-host parsing
451 // input.
452 if (!is_special()) {
453 return parse_opaque_host(input);
454 }
455 // Let domain be the result of running UTF-8 decode without BOM on the
456 // percent-decoding of input. Let asciiDomain be the result of running domain
457 // to ASCII with domain and false. The most common case is an ASCII input, in
458 // which case we do not need to call the expensive 'to_ascii' if a few
459 // conditions are met: no '%' and no 'xn-' subsequence.
460
461 // Often, the input does not contain any forbidden code points, and no upper
462 // case ASCII letter, then we can just copy it to the buffer. We want to
463 // optimize for such a common case.
465 unicode::contains_forbidden_domain_code_point_or_upper(input.data(),
466 input.size());
467 // Minor optimization opportunity:
468 // contains_forbidden_domain_code_point_or_upper could be extend to check for
469 // the presence of characters that cannot appear in the ipv4 address and we
470 // could also check whether x and n and - are present, and so we could skip
471 // some of the checks below. However, the gains are likely to be small, and
472 // the code would be more complex.
473 if (is_forbidden_or_upper == 0 &&
474 input.find("xn-") == std::string_view::npos) {
475 // fast path
476 update_base_hostname(input);
477 if (checkers::is_ipv4(get_hostname())) {
478 ada_log("parse_host fast path ipv4");
479 return parse_ipv4(get_hostname(), true);
480 }
481 ada_log("parse_host fast path ", get_hostname());
482 return true;
483 }
484 // We have encountered at least one forbidden code point or the input contains
485 // 'xn-' (case insensitive), so we need to call 'to_ascii' to perform the full
486 // conversion.
487
488 ada_log("parse_host calling to_ascii");
489 std::optional<std::string> host = std::string(get_hostname());
490 is_valid = ada::unicode::to_ascii(host, input, input.find('%'));
491 if (!is_valid) {
492 ada_log("parse_host to_ascii returns false");
493 return is_valid = false;
494 }
495 ada_log("parse_host to_ascii succeeded ", *host, " [", host->size(),
496 " bytes]");
497
498 if (std::any_of(host.value().begin(), host.value().end(),
499 ada::unicode::is_forbidden_domain_code_point)) {
500 return is_valid = false;
501 }
502
503 // If asciiDomain ends in a number, then return the result of IPv4 parsing
504 // asciiDomain.
505 if (checkers::is_ipv4(host.value())) {
506 ada_log("parse_host got ipv4 ", *host);
507 return parse_ipv4(host.value(), false);
508 }
509
510 update_base_hostname(host.value());
512 return true;
513}
514
515template <bool override_hostname>
516bool url_aggregator::set_host_or_hostname(const std::string_view input) {
517 ada_log("url_aggregator::set_host_or_hostname ", input);
519 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
520 if (has_opaque_path) {
521 return false;
522 }
523
524 std::string previous_host(get_hostname());
525 uint32_t previous_port = components.port;
526
527 size_t host_end_pos = input.find('#');
528 std::string _host(input.data(), host_end_pos != std::string_view::npos
530 : input.size());
531 helpers::remove_ascii_tab_or_newline(_host);
532 std::string_view new_host(_host);
533
534 // If url's scheme is "file", then set state to file host state, instead of
535 // host state.
536 if (type != ada::scheme::type::FILE) {
537 std::string_view host_view(_host.data(), _host.length());
538 auto [location, found_colon] =
539 helpers::get_host_delimiter_location(is_special(), host_view);
540
541 // Otherwise, if c is U+003A (:) and insideBrackets is false, then:
542 // Note: the 'found_colon' value is true if and only if a colon was
543 // encountered while not inside brackets.
544 if (found_colon) {
545 if (override_hostname) {
546 return false;
547 }
548 std::string_view sub_buffer = new_host.substr(location + 1);
549 if (!sub_buffer.empty()) {
551 }
552 }
553 // If url is special and host_view is the empty string, validation error,
554 // return failure. Otherwise, if state override is given, host_view is the
555 // empty string, and either url includes credentials or url's port is
556 // non-null, return.
557 else if (host_view.empty() &&
558 (is_special() || has_credentials() || has_port())) {
559 return false;
560 }
561
562 // Let host be the result of host parsing host_view with url is not special.
563 if (host_view.empty() && !is_special()) {
564 if (has_hostname()) {
565 clear_hostname(); // easy!
566 } else if (has_dash_dot()) {
567 add_authority_slashes_if_needed();
568 delete_dash_dot();
569 }
570 return true;
571 }
572
573 bool succeeded = parse_host(host_view);
574 if (!succeeded) {
575 update_base_hostname(previous_host);
576 update_base_port(previous_port);
577 } else if (has_dash_dot()) {
578 // Should remove dash_dot from pathname
579 delete_dash_dot();
580 }
581 return succeeded;
582 }
583
584 size_t location = new_host.find_first_of("/\\?");
585 if (location != std::string_view::npos) {
586 new_host.remove_suffix(new_host.length() - location);
587 }
588
589 if (new_host.empty()) {
590 // Set url's host to the empty string.
591 clear_hostname();
592 } else {
593 // Let host be the result of host parsing buffer with url is not special.
594 if (!parse_host(new_host)) {
595 update_base_hostname(previous_host);
596 update_base_port(previous_port);
597 return false;
598 }
599
600 // If host is "localhost", then set host to the empty string.
601 if (helpers::substring(buffer, components.host_start,
602 components.host_end) == "localhost") {
603 clear_hostname();
604 }
605 }
607 return true;
608}
609
610bool url_aggregator::set_host(const std::string_view input) {
611 ada_log("url_aggregator::set_host '", input, "'");
613 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
615}
616
617bool url_aggregator::set_hostname(const std::string_view input) {
618 ada_log("url_aggregator::set_hostname '", input, "'");
620 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
622}
623
625 ada_log("url_aggregator::get_origin");
626 if (is_special()) {
627 // Return a new opaque origin.
628 if (type == scheme::FILE) {
629 return "null";
630 }
631
632 return helpers::concat(get_protocol(), "//", get_host());
633 }
634
635 if (get_protocol() == "blob:") {
636 std::string_view path = get_pathname();
637 if (!path.empty()) {
639 if (out && (out->type == scheme::HTTP || out->type == scheme::HTTPS)) {
640 // If pathURL's scheme is not "http" and not "https", then return a
641 // new opaque origin.
642 return helpers::concat(out->get_protocol(), "//", out->get_host());
643 }
644 }
645 }
646
647 // Return a new opaque origin.
648 return "null";
649}
650
652 ada_log("url_aggregator::get_username");
654 return helpers::substring(buffer, components.protocol_end + 2,
655 components.username_end);
656 }
657 return "";
658}
659
661 ada_log("url_aggregator::get_password");
663 return helpers::substring(buffer, components.username_end + 1,
664 components.host_start);
665 }
666 return "";
667}
668
670 ada_log("url_aggregator::get_port");
671 if (components.port == url_components::omitted) {
672 return "";
673 }
674 return helpers::substring(buffer, components.host_end + 1,
675 components.pathname_start);
676}
677
679 ada_log("url_aggregator::get_hash");
680 // If this's URL's fragment is either null or the empty string, then return
681 // the empty string. Return U+0023 (#), followed by this's URL's fragment.
682 if (components.hash_start == url_components::omitted) {
683 return "";
684 }
685 if (buffer.size() - components.hash_start <= 1) {
686 return "";
687 }
688 return helpers::substring(buffer, components.hash_start);
689}
690
692 ada_log("url_aggregator::get_host");
693 // Technically, we should check if there is a hostname, but
694 // the code below works even if there isn't.
695 // if(!has_hostname()) { return ""; }
696 size_t start = components.host_start;
697 if (components.host_end > components.host_start &&
698 buffer[components.host_start] == '@') {
699 start++;
700 }
701 // if we have an empty host, then the space between components.host_end and
702 // components.pathname_start may be occupied by /.
703 if (start == components.host_end) {
704 return std::string_view();
705 }
706 return helpers::substring(buffer, start, components.pathname_start);
707}
708
710 ada_log("url_aggregator::get_hostname");
711 // Technically, we should check if there is a hostname, but
712 // the code below works even if there isn't.
713 // if(!has_hostname()) { return ""; }
714 size_t start = components.host_start;
715 // So host_start is not where the host begins.
716 if (components.host_end > components.host_start &&
717 buffer[components.host_start] == '@') {
718 start++;
719 }
720 return helpers::substring(buffer, start, components.host_end);
721}
722
724 ada_log("url_aggregator::get_pathname pathname_start = ",
725 components.pathname_start, " buffer.size() = ", buffer.size(),
726 " components.search_start = ", components.search_start,
727 " components.hash_start = ", components.hash_start);
728 uint32_t ending_index = uint32_t(buffer.size());
729 if (components.search_start != url_components::omitted) {
730 ending_index = components.search_start;
731 } else if (components.hash_start != url_components::omitted) {
732 ending_index = components.hash_start;
733 }
734 return helpers::substring(buffer, components.pathname_start, ending_index);
735}
736
738 ada_log("url_aggregator::get_search");
739 // If this's URL's query is either null or the empty string, then return the
740 // empty string. Return U+003F (?), followed by this's URL's query.
741 if (components.search_start == url_components::omitted) {
742 return "";
743 }
744 uint32_t ending_index = uint32_t(buffer.size());
745 if (components.hash_start != url_components::omitted) {
746 ending_index = components.hash_start;
747 }
748 if (ending_index - components.search_start <= 1) {
749 return "";
750 }
751 return helpers::substring(buffer, components.search_start, ending_index);
752}
753
755 ada_log("url_aggregator::get_protocol");
756 return helpers::substring(buffer, 0, components.protocol_end);
757}
758
760 ada_log("url_aggregator::to_string buffer:", buffer, " [", buffer.size(),
761 " bytes]");
762 if (!is_valid) {
763 return "null";
764 }
765
766 std::string answer;
767 auto back = std::back_insert_iterator(answer);
768 answer.append("{\n");
769
770 answer.append("\t\"buffer\":\"");
771 helpers::encode_json(buffer, back);
772 answer.append("\",\n");
773
774 answer.append("\t\"protocol\":\"");
775 helpers::encode_json(get_protocol(), back);
776 answer.append("\",\n");
777
778 if (has_credentials()) {
779 answer.append("\t\"username\":\"");
780 helpers::encode_json(get_username(), back);
781 answer.append("\",\n");
782 answer.append("\t\"password\":\"");
783 helpers::encode_json(get_password(), back);
784 answer.append("\",\n");
785 }
786
787 answer.append("\t\"host\":\"");
788 helpers::encode_json(get_host(), back);
789 answer.append("\",\n");
790
791 answer.append("\t\"path\":\"");
792 helpers::encode_json(get_pathname(), back);
793 answer.append("\",\n");
794 answer.append("\t\"opaque path\":");
795 answer.append((has_opaque_path ? "true" : "false"));
796 answer.append(",\n");
797
798 if (components.search_start != url_components::omitted) {
799 answer.append("\t\"query\":\"");
800 helpers::encode_json(get_search(), back);
801 answer.append("\",\n");
802 }
803 if (components.hash_start != url_components::omitted) {
804 answer.append("\t\"fragment\":\"");
805 helpers::encode_json(get_hash(), back);
806 answer.append("\",\n");
807 }
808
809 auto convert_offset_to_string = [](uint32_t offset) -> std::string {
811 return "null";
812 } else {
813 return std::to_string(offset);
814 }
815 };
816
817 answer.append("\t\"protocol_end\":");
818 answer.append(convert_offset_to_string(components.protocol_end));
819 answer.append(",\n");
820
821 answer.append("\t\"username_end\":");
822 answer.append(convert_offset_to_string(components.username_end));
823 answer.append(",\n");
824
825 answer.append("\t\"host_start\":");
826 answer.append(convert_offset_to_string(components.host_start));
827 answer.append(",\n");
828
829 answer.append("\t\"host_end\":");
830 answer.append(convert_offset_to_string(components.host_end));
831 answer.append(",\n");
832
833 answer.append("\t\"port\":");
834 answer.append(convert_offset_to_string(components.port));
835 answer.append(",\n");
836
837 answer.append("\t\"pathname_start\":");
838 answer.append(convert_offset_to_string(components.pathname_start));
839 answer.append(",\n");
840
841 answer.append("\t\"search_start\":");
842 answer.append(convert_offset_to_string(components.search_start));
843 answer.append(",\n");
844
845 answer.append("\t\"hash_start\":");
846 answer.append(convert_offset_to_string(components.hash_start));
847 answer.append("\n}");
848
849 return answer;
850}
851
853 if (components.host_start == components.host_end) {
854 return false;
855 }
856 return checkers::verify_dns_length(get_hostname());
857}
858
// Parses `input` as an IPv4 address and records it as this URL's host.
//
// Parameters:
//   input    - candidate address. It must be non-empty, because input.back()
//              is read unconditionally. One trailing '.' is dropped first.
//   in_place - when true, and pure_decimal_count == 4 with no trailing dot,
//              the hostname already in the buffer is kept as-is instead of
//              being re-serialized.
// Returns true on success, with host_type set to IPV4. On any failure
// is_valid is set to false and false is returned.
//
// The loop reads at most four '.'-separated numbers, with separate
// std::from_chars branches for hexadecimal (base 16, skipping two prefix
// characters), octal (base 8, skipping one) and decimal (base 10) text. Every
// number that is followed by more input must be <= 255 and followed by '.';
// the last number may fill all remaining bits of the 32-bit address.
//
// NOTE(review): this listing has lost several source lines (embedded numbers
// jump 862 -> 864, 872 -> 874, 874 -> 876, 887 -> 889, 893 -> 895, 912 -> 914,
// 921 -> 923, 950 -> 952). The declarations of `segment_result` and `is_hex`,
// the tail of the octal condition, and the statements that update
// `pure_decimal_count` and merge `segment_result` into `ipv4` are among them.
// Restore them from the upstream file before building.
859bool url_aggregator::parse_ipv4(std::string_view input, bool in_place) {
860 ada_log("parse_ipv4 ", input, " [", input.size(),
861 " bytes], overlaps with buffer: ",
862 helpers::overlaps(input, buffer) ? "yes" : "no");
864 const bool trailing_dot = (input.back() == '.');
865 if (trailing_dot) {
866 input.remove_suffix(1);
867 }
868 size_t digit_count{0};
869 int pure_decimal_count = 0; // entries that are decimal
870 uint64_t ipv4{0};
871 // we could unroll for better performance?
872 for (; (digit_count < 4) && !(input.empty()); digit_count++) {
874 segment_result{}; // If any number exceeds 32 bits, we have an error.
876 if (is_hex && ((input.length() == 2) ||
877 ((input.length() > 2) && (input[2] == '.')))) {
878 // special case
879 segment_result = 0;
880 input.remove_prefix(2);
881 } else {
882 std::from_chars_result r;
883 if (is_hex) {
884 ada_log("parse_ipv4 trying to parse hex number");
885 r = std::from_chars(input.data() + 2, input.data() + input.size(),
886 segment_result, 16);
887 } else if ((input.length() >= 2) && input[0] == '0' &&
889 ada_log("parse_ipv4 trying to parse octal number");
890 r = std::from_chars(input.data() + 1, input.data() + input.size(),
891 segment_result, 8);
892 } else {
893 ada_log("parse_ipv4 trying to parse decimal number");
895 r = std::from_chars(input.data(), input.data() + input.size(),
896 segment_result, 10);
897 }
898 if (r.ec != std::errc()) {
899 ada_log("parse_ipv4 parsing failed");
900 return is_valid = false;
901 }
902 ada_log("parse_ipv4 parsed ", segment_result);
// Consume exactly the characters std::from_chars accepted.
903 input.remove_prefix(r.ptr - input.data());
904 }
905 if (input.empty()) {
906 // We have the last value.
907 // At this stage, ipv4 contains digit_count*8 bits.
908 // So we have 32-digit_count*8 bits left.
909 if (segment_result >= (uint64_t(1) << (32 - digit_count * 8))) {
910 return is_valid = false;
911 }
912 ipv4 <<= (32 - digit_count * 8);
// The last number was consumed: skip the post-loop count check.
914 goto final;
915 } else {
916 // There is more, so that the value must no be larger than 255
917 // and we must have a '.'.
918 if ((segment_result > 255) || (input[0] != '.')) {
919 return is_valid = false;
920 }
921 ipv4 <<= 8;
923 input.remove_prefix(1); // remove '.'
924 }
925 }
// Reached only when the loop ended without consuming a final number.
926 if ((digit_count != 4) || (!input.empty())) {
927 ada_log("parse_ipv4 found invalid (more than 4 numbers or empty) ");
928 return is_valid = false;
929 }
930final:
931 ada_log("url_aggregator::parse_ipv4 completed ", get_href(),
932 " host: ", get_host());
933
934 // We could also check r.ptr to see where the parsing ended.
935 if (in_place && pure_decimal_count == 4 && !trailing_dot) {
936 ada_log(
937 "url_aggregator::parse_ipv4 completed and was already correct in the "
938 "buffer");
939 // The original input was already all decimal and we validated it. So we
940 // don't need to do anything.
941 } else {
942 ada_log("url_aggregator::parse_ipv4 completed and we need to update it");
943 // Optimization opportunity: Get rid of unnecessary string return in ipv4
944 // serializer.
945 // TODO: This is likely a bug because it goes back update_base_hostname, not
946 // what we want to do.
947 update_base_hostname(
948 ada::serializers::ipv4(ipv4)); // We have to reserialize the address.
949 }
950 host_type = IPV4;
952 return true;
953}
954
955bool url_aggregator::parse_ipv6(std::string_view input) {
956 // TODO: Implement in_place optimization: we know that input points
957 // in the buffer, so we can just check whether the buffer is already
958 // well formatted.
959 // TODO: Find a way to merge parse_ipv6 with url.cpp implementation.
960 ada_log("parse_ipv6 ", input, " [", input.size(), " bytes]");
962 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
963 if (input.empty()) {
964 return is_valid = false;
965 }
966 // Let address be a new IPv6 address whose IPv6 pieces are all 0.
967 std::array<uint16_t, 8> address{};
968
969 // Let pieceIndex be 0.
970 int piece_index = 0;
971
972 // Let compress be null.
973 std::optional<int> compress{};
974
975 // Let pointer be a pointer for input.
976 std::string_view::iterator pointer = input.begin();
977
978 // If c is U+003A (:), then:
979 if (input[0] == ':') {
980 // If remaining does not start with U+003A (:), validation error, return
981 // failure.
982 if (input.size() == 1 || input[1] != ':') {
983 ada_log("parse_ipv6 starts with : but the rest does not start with :");
984 return is_valid = false;
985 }
986
987 // Increase pointer by 2.
988 pointer += 2;
989
990 // Increase pieceIndex by 1 and then set compress to pieceIndex.
992 }
993
994 // While c is not the EOF code point:
995 while (pointer != input.end()) {
996 // If pieceIndex is 8, validation error, return failure.
997 if (piece_index == 8) {
998 ada_log("parse_ipv6 piece_index == 8");
999 return is_valid = false;
1000 }
1001
1002 // If c is U+003A (:), then:
1003 if (*pointer == ':') {
1004 // If compress is non-null, validation error, return failure.
1005 if (compress.has_value()) {
1006 ada_log("parse_ipv6 compress is non-null");
1007 return is_valid = false;
1008 }
1009
1010 // Increase pointer and pieceIndex by 1, set compress to pieceIndex, and
1011 // then continue.
1012 pointer++;
1014 continue;
1015 }
1016
1017 // Let value and length be 0.
1018 uint16_t value = 0, length = 0;
1019
1020 // While length is less than 4 and c is an ASCII hex digit,
1021 // set value to value times 0x10 + c interpreted as hexadecimal number, and
1022 // increase pointer and length by 1.
1023 while (length < 4 && pointer != input.end() &&
1024 unicode::is_ascii_hex_digit(*pointer)) {
1025 // https://stackoverflow.com/questions/39060852/why-does-the-addition-of-two-shorts-return-an-int
1026 value = uint16_t(value * 0x10 + unicode::convert_hex_to_binary(*pointer));
1027 pointer++;
1028 length++;
1029 }
1030
1031 // If c is U+002E (.), then:
1032 if (pointer != input.end() && *pointer == '.') {
1033 // If length is 0, validation error, return failure.
1034 if (length == 0) {
1035 ada_log("parse_ipv6 length is 0");
1036 return is_valid = false;
1037 }
1038
1039 // Decrease pointer by length.
1040 pointer -= length;
1041
1042 // If pieceIndex is greater than 6, validation error, return failure.
1043 if (piece_index > 6) {
1044 ada_log("parse_ipv6 piece_index > 6");
1045 return is_valid = false;
1046 }
1047
1048 // Let numbersSeen be 0.
1049 int numbers_seen = 0;
1050
1051 // While c is not the EOF code point:
1052 while (pointer != input.end()) {
1053 // Let ipv4Piece be null.
1054 std::optional<uint16_t> ipv4_piece{};
1055
1056 // If numbersSeen is greater than 0, then:
1057 if (numbers_seen > 0) {
1058 // If c is a U+002E (.) and numbersSeen is less than 4, then increase
1059 // pointer by 1.
1060 if (*pointer == '.' && numbers_seen < 4) {
1061 pointer++;
1062 } else {
1063 // Otherwise, validation error, return failure.
1064 ada_log("parse_ipv6 Otherwise, validation error, return failure");
1065 return is_valid = false;
1066 }
1067 }
1068
1069 // If c is not an ASCII digit, validation error, return failure.
1070 if (pointer == input.end() || !checkers::is_digit(*pointer)) {
1071 ada_log(
1072 "parse_ipv6 If c is not an ASCII digit, validation error, return "
1073 "failure");
1074 return is_valid = false;
1075 }
1076
1077 // While c is an ASCII digit:
1078 while (pointer != input.end() && checkers::is_digit(*pointer)) {
1079 // Let number be c interpreted as decimal number.
1080 int number = *pointer - '0';
1081
1082 // If ipv4Piece is null, then set ipv4Piece to number.
1083 if (!ipv4_piece.has_value()) {
1085 }
1086 // Otherwise, if ipv4Piece is 0, validation error, return failure.
1087 else if (ipv4_piece == 0) {
1088 ada_log("parse_ipv6 if ipv4Piece is 0, validation error");
1089 return is_valid = false;
1090 }
1091 // Otherwise, set ipv4Piece to ipv4Piece times 10 + number.
1092 else {
1093 ipv4_piece = *ipv4_piece * 10 + number;
1094 }
1095
1096 // If ipv4Piece is greater than 255, validation error, return failure.
1097 if (ipv4_piece > 255) {
1098 ada_log("parse_ipv6 ipv4_piece > 255");
1099 return is_valid = false;
1100 }
1101
1102 // Increase pointer by 1.
1103 pointer++;
1104 }
1105
1106 // Set address[pieceIndex] to address[pieceIndex] times 0x100 +
1107 // ipv4Piece.
1108 // https://stackoverflow.com/questions/39060852/why-does-the-addition-of-two-shorts-return-an-int
1111
1112 // Increase numbersSeen by 1.
1113 numbers_seen++;
1114
1115 // If numbersSeen is 2 or 4, then increase pieceIndex by 1.
1116 if (numbers_seen == 2 || numbers_seen == 4) {
1117 piece_index++;
1118 }
1119 }
1120
1121 // If numbersSeen is not 4, validation error, return failure.
1122 if (numbers_seen != 4) {
1123 return is_valid = false;
1124 }
1125
1126 // Break.
1127 break;
1128 }
1129 // Otherwise, if c is U+003A (:):
1130 else if ((pointer != input.end()) && (*pointer == ':')) {
1131 // Increase pointer by 1.
1132 pointer++;
1133
1134 // If c is the EOF code point, validation error, return failure.
1135 if (pointer == input.end()) {
1136 ada_log(
1137 "parse_ipv6 If c is the EOF code point, validation error, return "
1138 "failure");
1139 return is_valid = false;
1140 }
1141 }
1142 // Otherwise, if c is not the EOF code point, validation error, return
1143 // failure.
1144 else if (pointer != input.end()) {
1145 ada_log(
1146 "parse_ipv6 Otherwise, if c is not the EOF code point, validation "
1147 "error, return failure");
1148 return is_valid = false;
1149 }
1150
1151 // Set address[pieceIndex] to value.
1152 address[piece_index] = value;
1153
1154 // Increase pieceIndex by 1.
1155 piece_index++;
1156 }
1157
1158 // If compress is non-null, then:
1159 if (compress.has_value()) {
1160 // Let swaps be pieceIndex - compress.
1161 int swaps = piece_index - *compress;
1162
1163 // Set pieceIndex to 7.
1164 piece_index = 7;
1165
1166 // While pieceIndex is not 0 and swaps is greater than 0,
1167 // swap address[pieceIndex] with address[compress + swaps - 1], and then
1168 // decrease both pieceIndex and swaps by 1.
1169 while (piece_index != 0 && swaps > 0) {
1170 std::swap(address[piece_index], address[*compress + swaps - 1]);
1171 piece_index--;
1172 swaps--;
1173 }
1174 }
1175 // Otherwise, if compress is null and pieceIndex is not 8, validation error,
1176 // return failure.
1177 else if (piece_index != 8) {
1178 ada_log(
1179 "parse_ipv6 if compress is null and pieceIndex is not 8, validation "
1180 "error, return failure");
1181 return is_valid = false;
1182 }
1183 // TODO: Optimization opportunity: Get rid of unnecessary string creation.
1184 // TODO: This is likely a bug because it goes back update_base_hostname, not
1185 // what we want to do.
1186 update_base_hostname(ada::serializers::ipv6(address));
1187 ada_log("parse_ipv6 ", get_hostname());
1189 host_type = IPV6;
1190 return true;
1191}
1192
1193bool url_aggregator::parse_opaque_host(std::string_view input) {
1194 ada_log("parse_opaque_host ", input, " [", input.size(), " bytes]");
1196 ADA_ASSERT_TRUE(!helpers::overlaps(input, buffer));
1197 if (std::any_of(input.begin(), input.end(),
1198 ada::unicode::is_forbidden_host_code_point)) {
1199 return is_valid = false;
1200 }
1201
1202 // Return the result of running UTF-8 percent-encode on input using the C0
1203 // control percent-encode set.
1206 if (idx == input.size()) {
1207 update_base_hostname(input);
1208 } else {
1209 // We only create a temporary string if we need to.
1210 update_base_hostname(ada::unicode::percent_encode(
1212 }
1214 return true;
1215}
1216
1217[[nodiscard]] std::string url_aggregator::to_diagram() const {
1218 if (!is_valid) {
1219 return "invalid";
1220 }
1221 std::string answer;
1222 answer.append(buffer);
1223 answer.append(" [");
1224 answer.append(std::to_string(buffer.size()));
1225 answer.append(" bytes]");
1226 answer.append("\n");
1227 // first line
1228 std::string line1;
1229 line1.resize(buffer.size(), ' ');
1230 if (components.hash_start != url_components::omitted) {
1231 line1[components.hash_start] = '|';
1232 }
1233 if (components.search_start != url_components::omitted) {
1234 line1[components.search_start] = '|';
1235 }
1236 if (components.pathname_start != buffer.size()) {
1237 line1[components.pathname_start] = '|';
1238 }
1239 if (components.host_end != buffer.size()) {
1240 line1[components.host_end] = '|';
1241 }
1242 if (components.host_start != buffer.size()) {
1243 line1[components.host_start] = '|';
1244 }
1245 if (components.username_end != buffer.size()) {
1246 line1[components.username_end] = '|';
1247 }
1248 if (components.protocol_end != buffer.size()) {
1249 line1[components.protocol_end] = '|';
1250 }
1251 answer.append(line1);
1252 answer.append("\n");
1253
1254 std::string line2 = line1;
1255 if (components.hash_start != url_components::omitted) {
1256 line2[components.hash_start] = '`';
1257 line1[components.hash_start] = ' ';
1258
1259 for (size_t i = components.hash_start + 1; i < line2.size(); i++) {
1260 line2[i] = '-';
1261 }
1262 line2.append(" hash_start");
1263 answer.append(line2);
1264 answer.append("\n");
1265 }
1266
1267 std::string line3 = line1;
1268 if (components.search_start != url_components::omitted) {
1269 line3[components.search_start] = '`';
1270 line1[components.search_start] = ' ';
1271
1272 for (size_t i = components.search_start + 1; i < line3.size(); i++) {
1273 line3[i] = '-';
1274 }
1275 line3.append(" search_start ");
1276 line3.append(std::to_string(components.search_start));
1277 answer.append(line3);
1278 answer.append("\n");
1279 }
1280
1281 std::string line4 = line1;
1282 if (components.pathname_start != buffer.size()) {
1283 line4[components.pathname_start] = '`';
1284 line1[components.pathname_start] = ' ';
1285 for (size_t i = components.pathname_start + 1; i < line4.size(); i++) {
1286 line4[i] = '-';
1287 }
1288 line4.append(" pathname_start ");
1289 line4.append(std::to_string(components.pathname_start));
1290 answer.append(line4);
1291 answer.append("\n");
1292 }
1293
1294 std::string line5 = line1;
1295 if (components.host_end != buffer.size()) {
1296 line5[components.host_end] = '`';
1297 line1[components.host_end] = ' ';
1298
1299 for (size_t i = components.host_end + 1; i < line5.size(); i++) {
1300 line5[i] = '-';
1301 }
1302 line5.append(" host_end ");
1303 line5.append(std::to_string(components.host_end));
1304 answer.append(line5);
1305 answer.append("\n");
1306 }
1307
1308 std::string line6 = line1;
1309 if (components.host_start != buffer.size()) {
1310 line6[components.host_start] = '`';
1311 line1[components.host_start] = ' ';
1312
1313 for (size_t i = components.host_start + 1; i < line6.size(); i++) {
1314 line6[i] = '-';
1315 }
1316 line6.append(" host_start ");
1317 line6.append(std::to_string(components.host_start));
1318 answer.append(line6);
1319 answer.append("\n");
1320 }
1321
1322 std::string line7 = line1;
1323 if (components.username_end != buffer.size()) {
1324 line7[components.username_end] = '`';
1325 line1[components.username_end] = ' ';
1326
1327 for (size_t i = components.username_end + 1; i < line7.size(); i++) {
1328 line7[i] = '-';
1329 }
1330 line7.append(" username_end ");
1331 line7.append(std::to_string(components.username_end));
1332 answer.append(line7);
1333 answer.append("\n");
1334 }
1335
1336 std::string line8 = line1;
1337 if (components.protocol_end != buffer.size()) {
1338 line8[components.protocol_end] = '`';
1339 line1[components.protocol_end] = ' ';
1340
1341 for (size_t i = components.protocol_end + 1; i < line8.size(); i++) {
1342 line8[i] = '-';
1343 }
1344 line8.append(" protocol_end ");
1345 line8.append(std::to_string(components.protocol_end));
1346 answer.append(line8);
1347 answer.append("\n");
1348 }
1349
1350 if (components.hash_start == url_components::omitted) {
1351 answer.append("note: hash omitted\n");
1352 }
1353 if (components.search_start == url_components::omitted) {
1354 answer.append("note: search omitted\n");
1355 }
1356 if (components.protocol_end > buffer.size()) {
1357 answer.append("warning: protocol_end overflows\n");
1358 }
1359 if (components.username_end > buffer.size()) {
1360 answer.append("warning: username_end overflows\n");
1361 }
1362 if (components.host_start > buffer.size()) {
1363 answer.append("warning: host_start overflows\n");
1364 }
1365 if (components.host_end > buffer.size()) {
1366 answer.append("warning: host_end overflows\n");
1367 }
1368 if (components.pathname_start > buffer.size()) {
1369 answer.append("warning: pathname_start overflows\n");
1370 }
1371 return answer;
1372}
1373
1375 if (!is_valid) {
1376 return true;
1377 }
1378 if (!components.check_offset_consistency()) {
1379 ada_log("url_aggregator::validate inconsistent components \n",
1380 to_diagram());
1381 return false;
1382 }
1383 // We have a credible components struct, but let us investivate more
1384 // carefully:
1397 if (components.protocol_end == url_components::omitted) {
1398 ada_log("url_aggregator::validate omitted protocol_end \n", to_diagram());
1399 return false;
1400 }
1401 if (components.username_end == url_components::omitted) {
1402 ada_log("url_aggregator::validate omitted username_end \n", to_diagram());
1403 return false;
1404 }
1405 if (components.host_start == url_components::omitted) {
1406 ada_log("url_aggregator::validate omitted host_start \n", to_diagram());
1407 return false;
1408 }
1409 if (components.host_end == url_components::omitted) {
1410 ada_log("url_aggregator::validate omitted host_end \n", to_diagram());
1411 return false;
1412 }
1413 if (components.pathname_start == url_components::omitted) {
1414 ada_log("url_aggregator::validate omitted pathname_start \n", to_diagram());
1415 return false;
1416 }
1417
1418 if (components.protocol_end > buffer.size()) {
1419 ada_log("url_aggregator::validate protocol_end overflow \n", to_diagram());
1420 return false;
1421 }
1422 if (components.username_end > buffer.size()) {
1423 ada_log("url_aggregator::validate username_end overflow \n", to_diagram());
1424 return false;
1425 }
1426 if (components.host_start > buffer.size()) {
1427 ada_log("url_aggregator::validate host_start overflow \n", to_diagram());
1428 return false;
1429 }
1430 if (components.host_end > buffer.size()) {
1431 ada_log("url_aggregator::validate host_end overflow \n", to_diagram());
1432 return false;
1433 }
1434 if (components.pathname_start > buffer.size()) {
1435 ada_log("url_aggregator::validate pathname_start overflow \n",
1436 to_diagram());
1437 return false;
1438 }
1439
1440 if (components.protocol_end > 0) {
1441 if (buffer[components.protocol_end - 1] != ':') {
1442 ada_log(
1443 "url_aggregator::validate missing : at the end of the protocol \n",
1444 to_diagram());
1445 return false;
1446 }
1447 }
1448
1449 if (components.username_end != buffer.size() &&
1450 components.username_end > components.protocol_end + 2) {
1451 if (buffer[components.username_end] != ':' &&
1452 buffer[components.username_end] != '@') {
1453 ada_log(
1454 "url_aggregator::validate missing : or @ at the end of the username "
1455 "\n",
1456 to_diagram());
1457 return false;
1458 }
1459 }
1460
1461 if (components.host_start != buffer.size()) {
1462 if (components.host_start > components.username_end) {
1463 if (buffer[components.host_start] != '@') {
1464 ada_log(
1465 "url_aggregator::validate missing @ at the end of the password \n",
1466 to_diagram());
1467 return false;
1468 }
1469 } else if (components.host_start == components.username_end &&
1470 components.host_end > components.host_start) {
1471 if (components.host_start == components.protocol_end + 2) {
1472 if (buffer[components.protocol_end] != '/' ||
1473 buffer[components.protocol_end + 1] != '/') {
1474 ada_log(
1475 "url_aggregator::validate missing // between protocol and host "
1476 "\n",
1477 to_diagram());
1478 return false;
1479 }
1480 } else {
1481 if (components.host_start > components.protocol_end &&
1482 buffer[components.host_start] != '@') {
1483 ada_log(
1484 "url_aggregator::validate missing @ at the end of the username "
1485 "\n",
1486 to_diagram());
1487 return false;
1488 }
1489 }
1490 } else {
1491 if (components.host_end != components.host_start) {
1492 ada_log("url_aggregator::validate expected omitted host \n",
1493 to_diagram());
1494 return false;
1495 }
1496 }
1497 }
1498 if (components.host_end != buffer.size() &&
1499 components.pathname_start > components.host_end) {
1500 if (components.pathname_start == components.host_end + 2 &&
1501 buffer[components.host_end] == '/' &&
1502 buffer[components.host_end + 1] == '.') {
1503 if (components.pathname_start + 1 >= buffer.size() ||
1504 buffer[components.pathname_start] != '/' ||
1505 buffer[components.pathname_start + 1] != '/') {
1506 ada_log(
1507 "url_aggregator::validate expected the path to begin with // \n",
1508 to_diagram());
1509 return false;
1510 }
1511 } else if (buffer[components.host_end] != ':') {
1512 ada_log("url_aggregator::validate missing : at the port \n",
1513 to_diagram());
1514 return false;
1515 }
1516 }
1517 if (components.pathname_start != buffer.size() &&
1518 components.pathname_start < components.search_start &&
1519 components.pathname_start < components.hash_start && !has_opaque_path) {
1520 if (buffer[components.pathname_start] != '/') {
1521 ada_log("url_aggregator::validate missing / at the path \n",
1522 to_diagram());
1523 return false;
1524 }
1525 }
1526 if (components.search_start != url_components::omitted) {
1527 if (buffer[components.search_start] != '?') {
1528 ada_log("url_aggregator::validate missing ? at the search \n",
1529 to_diagram());
1530 return false;
1531 }
1532 }
1533 if (components.hash_start != url_components::omitted) {
1534 if (buffer[components.hash_start] != '#') {
1535 ada_log("url_aggregator::validate missing # at the hash \n",
1536 to_diagram());
1537 return false;
1538 }
1539 }
1540
1541 return true;
1542}
1543
1544void url_aggregator::delete_dash_dot() {
1545 ada_log("url_aggregator::delete_dash_dot");
1547 ADA_ASSERT_TRUE(has_dash_dot());
1548 buffer.erase(components.host_end, 2);
1549 components.pathname_start -= 2;
1550 if (components.search_start != url_components::omitted) {
1551 components.search_start -= 2;
1552 }
1553 if (components.hash_start != url_components::omitted) {
1554 components.hash_start -= 2;
1555 }
1557 ADA_ASSERT_TRUE(!has_dash_dot());
1558}
1559
1560inline void url_aggregator::consume_prepared_path(std::string_view input) {
1561 ada_log("url_aggregator::consume_prepared_path ", input);
1562
1571 uint8_t accumulator = checkers::path_signature(input);
1572 // Let us first detect a trivial case.
1573 // If it is special, we check that we have no dot, no %, no \ and no
1574 // character needing percent encoding. Otherwise, we check that we have no %,
1575 // no dot, and no character needing percent encoding.
1576 constexpr uint8_t need_encoding = 1;
1577 constexpr uint8_t backslash_char = 2;
1578 constexpr uint8_t dot_char = 4;
1579 constexpr uint8_t percent_char = 8;
1583 bool trivial_path =
1584 (special ? (accumulator == 0)
1586 0)) &&
1589 // '4' means that we have at least one dot, but nothing that requires
1590 // percent encoding or decoding. The only part that is not trivial is
1591 // that we may have single dots and double dots path segments.
1592 // If we have such segments, then we either have a path that begins
1593 // with '.' (easy to check), or we have the sequence './'.
1594 // Note: input cannot be empty, it must at least contain one character ('.')
1595 // Note: we know that '\' is not present.
1596 if (input[0] != '.') {
1597 size_t slashdot = input.find("/.");
1598 if (slashdot == std::string_view::npos) { // common case
1599 trivial_path = true;
1600 } else { // uncommon
1601 // only three cases matter: /./, /.. or a final /
1602 trivial_path =
1603 !(slashdot + 2 == input.size() || input[slashdot + 2] == '.' ||
1604 input[slashdot + 2] == '/');
1605 }
1606 }
1607 }
1608 if (trivial_path && is_at_path()) {
1609 ada_log("parse_path trivial");
1610 buffer += '/';
1611 buffer += input;
1612 return;
1613 }
1614 std::string path = std::string(get_pathname());
1615 // We are going to need to look a bit at the path, but let us see if we can
1616 // ignore percent encoding *and* backslashes *and* percent characters.
1617 // Except for the trivial case, this is likely to capture 99% of paths out
1618 // there.
1619 bool fast_path =
1620 (special &&
1622 (type != ada::scheme::type::FILE);
1623 if (fast_path) {
1624 ada_log("parse_prepared_path fast");
1625 // Here we don't need to worry about \ or percent encoding.
1626 // We also do not have a file protocol. We might have dots, however,
1627 // but dots must as appear as '.', and they cannot be encoded because
1628 // the symbol '%' is not present.
1629 size_t previous_location = 0; // We start at 0.
1630 do {
1631 size_t new_location = input.find('/', previous_location);
1632 // std::string_view path_view = input;
1633 // We process the last segment separately:
1634 if (new_location == std::string_view::npos) {
1635 std::string_view path_view = input.substr(previous_location);
1636 if (path_view == "..") { // The path ends with ..
1637 // e.g., if you receive ".." with an empty path, you go to "/".
1638 if (path.empty()) {
1639 path = '/';
1640 update_base_pathname(path);
1641 return;
1642 }
1643 // Fast case where we have nothing to do:
1644 if (path.back() == '/') {
1645 update_base_pathname(path);
1646 return;
1647 }
1648 // If you have the path "/joe/myfriend",
1649 // then you delete 'myfriend'.
1650 path.resize(path.rfind('/') + 1);
1651 update_base_pathname(path);
1652 return;
1653 }
1654 path += '/';
1655 if (path_view != ".") {
1656 path.append(path_view);
1657 }
1658 update_base_pathname(path);
1659 return;
1660 } else {
1661 // This is a non-final segment.
1662 std::string_view path_view =
1665 if (path_view == "..") {
1666 size_t last_delimiter = path.rfind('/');
1667 if (last_delimiter != std::string::npos) {
1668 path.erase(last_delimiter);
1669 }
1670 } else if (path_view != ".") {
1671 path += '/';
1672 path.append(path_view);
1673 }
1674 }
1675 } while (true);
1676 } else {
1677 ada_log("parse_path slow");
1678 // we have reached the general case
1680 std::string path_buffer_tmp;
1681 do {
1682 size_t location = (special && (accumulator & 2))
1683 ? input.find_first_of("/\\")
1684 : input.find('/');
1685 std::string_view path_view = input;
1686 if (location != std::string_view::npos) {
1687 path_view.remove_suffix(path_view.size() - location);
1688 input.remove_prefix(location + 1);
1689 }
1690 // path_buffer is either path_view or it might point at a percent encoded
1691 // temporary string.
1692 std::string_view path_buffer =
1694 ada::unicode::percent_encode<false>(
1697 : path_view;
1698 if (unicode::is_double_dot_path_segment(path_buffer)) {
1699 if ((helpers::shorten_path(path, type) || special) &&
1700 location == std::string_view::npos) {
1701 path += '/';
1702 }
1703 } else if (unicode::is_single_dot_path_segment(path_buffer) &&
1704 (location == std::string_view::npos)) {
1705 path += '/';
1706 }
1707 // Otherwise, if path_buffer is not a single-dot path segment, then:
1708 else if (!unicode::is_single_dot_path_segment(path_buffer)) {
1709 // If url's scheme is "file", url's path is empty, and path_buffer is a
1710 // Windows drive letter, then replace the second code point in
1711 // path_buffer with U+003A (:).
1712 if (type == ada::scheme::type::FILE && path.empty() &&
1714 path += '/';
1715 path += path_buffer[0];
1716 path += ':';
1717 path_buffer.remove_prefix(2);
1718 path.append(path_buffer);
1719 } else {
1720 // Append path_buffer to url's path.
1721 path += '/';
1722 path.append(path_buffer);
1723 }
1724 }
1725 if (location == std::string_view::npos) {
1726 update_base_pathname(path);
1727 return;
1728 }
1729 } while (true);
1730 }
1731}
1732} // namespace ada
Includes all definitions for Ada.
Definitions for URL specific checkers used within Ada.
Declarations for URL specific checkers used within Ada.
#define ADA_ASSERT_TRUE(COND)
#define ada_really_inline
Definition common_defs.h:84
Definitions for helper functions used within Ada.
Definitions for user-facing functions for parsing a URL and its components.
constexpr uint8_t QUERY_PERCENT_ENCODE[32]
constexpr uint8_t SPECIAL_QUERY_PERCENT_ENCODE[32]
constexpr uint8_t PATH_PERCENT_ENCODE[32]
constexpr uint8_t C0_CONTROL_PERCENT_ENCODE[32]
constexpr uint8_t USERINFO_PERCENT_ENCODE[32]
constexpr bool is_windows_drive_letter(std::string_view input) noexcept
bool has_hex_prefix(std::string_view input)
constexpr bool is_alpha(char x) noexcept
constexpr bool is_digit(char x) noexcept
ada_really_inline bool begins_with(std::string_view view, std::string_view prefix)
constexpr ada::scheme::type get_scheme_type(std::string_view scheme) noexcept
Definition scheme-inl.h:72
@ NOT_SPECIAL
Definition scheme.h:32
constexpr uint16_t get_special_port(std::string_view scheme) noexcept
Definition scheme-inl.h:57
std::string ipv6(const std::array< uint16_t, 8 > &address) noexcept
std::string ipv4(uint64_t address) noexcept
ada_really_inline size_t percent_encode_index(const std::string_view input, const uint8_t character_set[])
Definition unicode-inl.h:19
Definition ada_idna.h:13
@ IPV6
Definition url_base.h:32
@ IPV4
Definition url_base.h:27
template ada::result< url_aggregator > parse< url_aggregator >(std::string_view input, const url_aggregator *base_url)
tl::expected< result_type, ada::errors > result
ada_warn_unused ada::result< result_type > parse(std::string_view input, const result_type *base_url=nullptr)
Definitions for the parser.
Declarations for the URL scheme.
bool has_non_empty_username() const noexcept
std::string_view get_pathname() const noexcept
void set_hash(std::string_view input)
void clear_search() override
std::string_view get_host() const noexcept
bool has_hostname() const noexcept
bool has_non_empty_password() const noexcept
ada_really_inline bool has_credentials() const noexcept
std::string_view get_search() const noexcept
std::string_view get_username() const noexcept
std::string to_string() const override
std::string to_diagram() const
bool set_protocol(std::string_view input)
std::string get_origin() const noexcept override
bool validate() const noexcept
std::string_view get_protocol() const noexcept
std::string_view get_port() const noexcept
std::string_view get_hostname() const noexcept
bool has_valid_domain() const noexcept override
bool set_hostname(std::string_view input)
std::string_view get_href() const noexcept
bool set_password(std::string_view input)
bool set_pathname(std::string_view input)
std::string_view get_password() const noexcept
bool set_href(std::string_view input)
void set_search(std::string_view input)
bool has_port() const noexcept
std::string_view get_hash() const noexcept
bool set_host(std::string_view input)
bool set_port(std::string_view input)
bool set_username(std::string_view input)
ada_really_inline bool is_special() const noexcept
url_host_type host_type
Definition url_base.h:60
bool is_valid
Definition url_base.h:50
bool has_opaque_path
Definition url_base.h:55
bool check_offset_consistency() const noexcept
static constexpr uint32_t omitted
Definitions for unicode operations.
Inline functions for url aggregator.
Declaration for the basic URL definitions.
Declaration for the URL Components.