Ada 2.7.8
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
parser.cpp
Go to the documentation of this file.
1#include "ada.h"
2#include "ada/common_defs.h"
4#include "ada/unicode.h"
5#include "ada/url-inl.h"
6#include "ada/log.h"
7#include "ada/parser.h"
8
9#include <numeric>
10#include <limits>
11
12namespace ada::parser {
13
14template <class result_type>
16 const result_type* base_url) {
17 // We can specialize the implementation per type.
18 // Important: result_type_is_ada_url is evaluated at *compile time*. This
19 // means that doing if constexpr(result_type_is_ada_url) { something } else {
20 // something else } is free (at runtime). This means that ada::url_aggregator
21 // and ada::url **do not have to support the exact same API**.
22 constexpr bool result_type_is_ada_url =
23 std::is_same<ada::url, result_type>::value;
25 std::is_same<ada::url_aggregator, result_type>::value;
26 static_assert(result_type_is_ada_url ||
27 result_type_is_ada_url_aggregator); // We don't support
28 // anything else for now.
29
30 ada_log("ada::parser::parse_url('", user_input, "' [", user_input.size(),
31 " bytes],", (base_url != nullptr ? base_url->to_string() : "null"),
32 ")");
33
36
37 // We refuse to parse URL strings that exceed 4GB. Such strings are almost
38 // surely the result of a bug or are otherwise a security concern.
39 if (user_input.size() > std::numeric_limits<uint32_t>::max()) {
40 url.is_valid = false;
41 }
42 // Going forward, user_input.size() is in [0,
43 // std::numeric_limits<uint32_t>::max()]. If we are provided with an invalid
44 // base, or the optional_url was invalid, we must return.
45 if (base_url != nullptr) {
46 url.is_valid &= base_url->is_valid;
47 }
48 if (!url.is_valid) {
49 return url;
50 }
52 // Most of the time, we just need user_input.size().
53 // In some instances, we may need a bit more.
55 // This is *very* important. This line should *not* be removed
56 // hastily. There are principled reasons why reserve is important
57 // for performance. If you have a benchmark with small inputs,
58 // it may not matter, but in other instances, it could.
60 // This rounds up to the next power of two.
61 // We know that user_input.size() is in [0,
62 // std::numeric_limits<uint32_t>::max()].
64 (0xFFFFFFFF >>
65 helpers::leading_zeroes(uint32_t(1 | user_input.size()))) +
66 1;
67 url.reserve(reserve_capacity);
68 //
69 //
70 //
71 }
72 std::string tmp_buffer;
73 std::string_view internal_input;
74 if (unicode::has_tabs_or_newline(user_input)) {
76 // Optimization opportunity: Instead of copying and then pruning, we could
77 // just directly build the string from user_input.
78 helpers::remove_ascii_tab_or_newline(tmp_buffer);
80 } else {
82 }
83
84 // Leading and trailing control characters are uncommon and easy to deal with
85 // (no performance concern).
86 std::string_view url_data = internal_input;
87 helpers::trim_c0_whitespace(url_data);
88
89 // Optimization opportunity. Most websites do not have a fragment.
90 std::optional<std::string_view> fragment = helpers::prune_hash(url_data);
91 // We add it last so that an implementation like ada::url_aggregator
92 // can append it last to its internal buffer, thus improving performance.
93
94 // Here url_data no longer has its fragment.
95 // We are going to access the data from url_data (it is immutable).
96 // At any given time, we are pointing at byte 'input_position' in url_data.
97 // The input_position variable should range from 0 to input_size.
98 // It is illegal to access url_data at input_size.
99 size_t input_position = 0;
100 const size_t input_size = url_data.size();
101 // Keep running the following state machine by switching on state.
102 // If after a run pointer points to the EOF code point, go to the next step.
103 // Otherwise, increase pointer by 1 and continue with the state machine.
104 // We never decrement input_position.
105 while (input_position <= input_size) {
106 ada_log("In parsing at ", input_position, " out of ", input_size,
107 " in state ", ada::to_string(state));
108 switch (state) {
110 ada_log("SCHEME_START ", helpers::substring(url_data, input_position));
111 // If c is an ASCII alpha, append c, lowercased, to buffer, and set
112 // state to scheme state.
113 if ((input_position != input_size) &&
117 } else {
118 // Otherwise, if state override is not given, set state to no scheme
119 // state and decrease pointer by 1.
121 }
122 break;
123 }
124 case ada::state::SCHEME: {
125 ada_log("SCHEME ", helpers::substring(url_data, input_position));
126 // If c is an ASCII alphanumeric, U+002B (+), U+002D (-), or U+002E (.),
127 // append c, lowercased, to buffer.
128 while ((input_position != input_size) &&
129 (ada::unicode::is_alnum_plus(url_data[input_position]))) {
131 }
132 // Otherwise, if c is U+003A (:), then:
133 if ((input_position != input_size) &&
134 (url_data[input_position] == ':')) {
135 ada_log("SCHEME the scheme should be ",
136 url_data.substr(0, input_position));
137 if constexpr (result_type_is_ada_url) {
138 if (!url.parse_scheme(url_data.substr(0, input_position))) {
139 return url;
140 }
141 } else {
142 // we pass the colon along instead of painfully adding it back.
143 if (!url.parse_scheme_with_colon(
144 url_data.substr(0, input_position + 1))) {
145 return url;
146 }
147 }
148 ada_log("SCHEME the scheme is ", url.get_protocol());
149
150 // If url's scheme is "file", then:
151 if (url.type == ada::scheme::type::FILE) {
152 // Set state to file state.
154 }
155 // Otherwise, if url is special, base is non-null, and base's scheme
156 // is url's scheme: Note: Doing base_url->scheme is unsafe if base_url
157 // != nullptr is false.
158 else if (url.is_special() && base_url != nullptr &&
159 base_url->type == url.type) {
160 // Set state to special relative or authority state.
162 }
163 // Otherwise, if url is special, set state to special authority
164 // slashes state.
165 else if (url.is_special()) {
167 }
168 // Otherwise, if remaining starts with an U+002F (/), set state to
169 // path or authority state and increase pointer by 1.
170 else if (input_position + 1 < input_size &&
171 url_data[input_position + 1] == '/') {
174 }
175 // Otherwise, set url's path to the empty string and set state to
176 // opaque path state.
177 else {
179 }
180 }
181 // Otherwise, if state override is not given, set buffer to the empty
182 // string, state to no scheme state, and start over (from the first code
183 // point in input).
184 else {
186 input_position = 0;
187 break;
188 }
190 break;
191 }
193 ada_log("NO_SCHEME ", helpers::substring(url_data, input_position));
194 // If base is null, or base has an opaque path and c is not U+0023 (#),
195 // validation error, return failure.
196 if (base_url == nullptr ||
197 (base_url->has_opaque_path && !fragment.has_value())) {
198 ada_log("NO_SCHEME validation error");
199 url.is_valid = false;
200 return url;
201 }
202 // Otherwise, if base has an opaque path and c is U+0023 (#),
203 // set url's scheme to base's scheme, url's path to base's path, url's
204 // query to base's query, and set state to fragment state.
205 else if (base_url->has_opaque_path && fragment.has_value() &&
207 ada_log("NO_SCHEME opaque base with fragment");
208 url.copy_scheme(*base_url);
209 url.has_opaque_path = base_url->has_opaque_path;
210
211 if constexpr (result_type_is_ada_url) {
212 url.path = base_url->path;
213 url.query = base_url->query;
214 } else {
215 url.update_base_pathname(base_url->get_pathname());
216 url.update_base_search(base_url->get_search());
217 }
218 url.update_unencoded_base_hash(*fragment);
219 return url;
220 }
221 // Otherwise, if base's scheme is not "file", set state to relative
222 // state and decrease pointer by 1.
223 else if (base_url->type != ada::scheme::type::FILE) {
224 ada_log("NO_SCHEME non-file relative path");
226 }
227 // Otherwise, set state to file state and decrease pointer by 1.
228 else {
229 ada_log("NO_SCHEME file base type");
231 }
232 break;
233 }
235 ada_log("AUTHORITY ", helpers::substring(url_data, input_position));
236 // most URLs have no @. Having no @ tells us that we don't have to worry
237 // about AUTHORITY. Of course, we could have @ and still not have to
238 // worry about AUTHORITY.
239 // TODO: Instead of just collecting a bool, collect the location of the
240 // '@' and do something useful with it.
241 // TODO: We could do various processing early on, using a single pass
242 // over the string to collect information about it, e.g., telling us
243 // whether there is a @ and if so, where (or how many).
244 const bool contains_ampersand =
245 (url_data.find('@', input_position) != std::string_view::npos);
246
247 if (!contains_ampersand) {
249 break;
250 }
251 bool at_sign_seen{false};
252 bool password_token_seen{false};
258 do {
259 std::string_view view = helpers::substring(url_data, input_position);
260 // The delimiters are @, /, ? \\.
261 size_t location =
262 url.is_special() ? helpers::find_authority_delimiter_special(view)
263 : helpers::find_authority_delimiter(view);
264 std::string_view authority_view(view.data(), location);
266 // If c is U+0040 (@), then:
267 if ((end_of_authority != input_size) &&
268 (url_data[end_of_authority] == '@')) {
269 // If atSignSeen is true, then prepend "%40" to buffer.
270 if (at_sign_seen) {
272 if constexpr (result_type_is_ada_url) {
273 url.password += "%40";
274 } else {
275 url.append_base_password("%40");
276 }
277 } else {
278 if constexpr (result_type_is_ada_url) {
279 url.username += "%40";
280 } else {
281 url.append_base_username("%40");
282 }
283 }
284 }
285
286 at_sign_seen = true;
287
288 if (!password_token_seen) {
289 size_t password_token_location = authority_view.find(':');
291 password_token_location != std::string_view::npos;
292
293 if (!password_token_seen) {
294 if constexpr (result_type_is_ada_url) {
295 url.username += unicode::percent_encode(
297 } else {
298 url.append_base_username(unicode::percent_encode(
300 }
301 } else {
302 if constexpr (result_type_is_ada_url) {
303 url.username += unicode::percent_encode(
306 url.password += unicode::percent_encode(
309 } else {
310 url.append_base_username(unicode::percent_encode(
313 url.append_base_password(unicode::percent_encode(
316 }
317 }
318 } else {
319 if constexpr (result_type_is_ada_url) {
320 url.password += unicode::percent_encode(
322 } else {
323 url.append_base_password(unicode::percent_encode(
325 }
326 }
327 }
328 // Otherwise, if one of the following is true:
329 // - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
330 // - url is special and c is U+005C (\)
331 else if (end_of_authority == input_size ||
332 url_data[end_of_authority] == '/' ||
333 url_data[end_of_authority] == '?' ||
334 (url.is_special() && url_data[end_of_authority] == '\\')) {
335 // If atSignSeen is true and authority_view is the empty string,
336 // validation error, return failure.
337 if (at_sign_seen && authority_view.empty()) {
338 url.is_valid = false;
339 return url;
340 }
342 break;
343 }
345 if (fragment.has_value()) {
346 url.update_unencoded_base_hash(*fragment);
347 }
348 return url;
349 }
351 } while (true);
352
353 break;
354 }
356 ada_log("SPECIAL_RELATIVE_OR_AUTHORITY ",
357 helpers::substring(url_data, input_position));
358
359 // If c is U+002F (/) and remaining starts with U+002F (/),
360 // then set state to special authority ignore slashes state and increase
361 // pointer by 1.
362 std::string_view view = helpers::substring(url_data, input_position);
363 if (ada::checkers::begins_with(view, "//")) {
365 input_position += 2;
366 } else {
367 // Otherwise, validation error, set state to relative state and
368 // decrease pointer by 1.
370 }
371
372 break;
373 }
375 ada_log("PATH_OR_AUTHORITY ",
376 helpers::substring(url_data, input_position));
377
378 // If c is U+002F (/), then set state to authority state.
379 if ((input_position != input_size) &&
380 (url_data[input_position] == '/')) {
383 } else {
384 // Otherwise, set state to path state, and decrease pointer by 1.
386 }
387
388 break;
389 }
391 ada_log("RELATIVE_SCHEME ",
392 helpers::substring(url_data, input_position));
393
394 // Set url's scheme to base's scheme.
395 url.copy_scheme(*base_url);
396
397 // If c is U+002F (/), then set state to relative slash state.
398 if ((input_position != input_size) &&
399 (url_data[input_position] == '/')) {
400 ada_log(
401 "RELATIVE_SCHEME if c is U+002F (/), then set state to relative "
402 "slash state");
404 } else if (url.is_special() && (input_position != input_size) &&
405 (url_data[input_position] == '\\')) {
406 // Otherwise, if url is special and c is U+005C (\), validation error,
407 // set state to relative slash state.
408 ada_log(
409 "RELATIVE_SCHEME if url is special and c is U+005C, validation "
410 "error, set state to relative slash state");
412 } else {
413 ada_log("RELATIVE_SCHEME otherwise");
414 // Set url's username to base's username, url's password to base's
415 // password, url's host to base's host, url's port to base's port,
416 // url's path to a clone of base's path, and url's query to base's
417 // query.
418 if constexpr (result_type_is_ada_url) {
419 url.username = base_url->username;
420 url.password = base_url->password;
421 url.host = base_url->host;
422 url.port = base_url->port;
423 // cloning the base path includes cloning the has_opaque_path flag
424 url.has_opaque_path = base_url->has_opaque_path;
425 url.path = base_url->path;
426 url.query = base_url->query;
427 } else {
428 url.update_base_authority(base_url->get_href(),
429 base_url->get_components());
430 // TODO: Get rid of set_hostname and replace it with
431 // update_base_hostname
432 url.set_hostname(base_url->get_hostname());
433 url.update_base_port(base_url->retrieve_base_port());
434 // cloning the base path includes cloning the has_opaque_path flag
435 url.has_opaque_path = base_url->has_opaque_path;
436 url.update_base_pathname(base_url->get_pathname());
437 url.update_base_search(base_url->get_search());
438 }
439
440 url.has_opaque_path = base_url->has_opaque_path;
441
442 // If c is U+003F (?), then set url's query to the empty string, and
443 // state to query state.
444 if ((input_position != input_size) &&
445 (url_data[input_position] == '?')) {
447 }
448 // Otherwise, if c is not the EOF code point:
449 else if (input_position != input_size) {
450 // Set url's query to null.
451 url.clear_search();
452 if constexpr (result_type_is_ada_url) {
453 // Shorten url's path.
454 helpers::shorten_path(url.path, url.type);
455 } else {
456 std::string_view path = url.get_pathname();
457 if (helpers::shorten_path(path, url.type)) {
458 url.update_base_pathname(std::string(path));
459 }
460 }
461 // Set state to path state and decrease pointer by 1.
463 break;
464 }
465 }
467 break;
468 }
470 ada_log("RELATIVE_SLASH ",
471 helpers::substring(url_data, input_position));
472
473 // If url is special and c is U+002F (/) or U+005C (\), then:
474 if (url.is_special() && (input_position != input_size) &&
475 (url_data[input_position] == '/' ||
476 url_data[input_position] == '\\')) {
477 // Set state to special authority ignore slashes state.
479 }
480 // Otherwise, if c is U+002F (/), then set state to authority state.
481 else if ((input_position != input_size) &&
482 (url_data[input_position] == '/')) {
484 }
485 // Otherwise, set
486 // - url's username to base's username,
487 // - url's password to base's password,
488 // - url's host to base's host,
489 // - url's port to base's port,
490 // - state to path state, and then, decrease pointer by 1.
491 else {
492 if constexpr (result_type_is_ada_url) {
493 url.username = base_url->username;
494 url.password = base_url->password;
495 url.host = base_url->host;
496 url.port = base_url->port;
497 } else {
498 url.update_base_authority(base_url->get_href(),
499 base_url->get_components());
500 // TODO: Get rid of set_hostname and replace it with
501 // update_base_hostname
502 url.set_hostname(base_url->get_hostname());
503 url.update_base_port(base_url->retrieve_base_port());
504 }
506 break;
507 }
508
510 break;
511 }
513 ada_log("SPECIAL_AUTHORITY_SLASHES ",
514 helpers::substring(url_data, input_position));
515
516 // If c is U+002F (/) and remaining starts with U+002F (/),
517 // then set state to special authority ignore slashes state and increase
518 // pointer by 1.
519 std::string_view view = helpers::substring(url_data, input_position);
520 if (ada::checkers::begins_with(view, "//")) {
521 input_position += 2;
522 }
523
524 [[fallthrough]];
525 }
527 ada_log("SPECIAL_AUTHORITY_IGNORE_SLASHES ",
528 helpers::substring(url_data, input_position));
529
530 // If c is neither U+002F (/) nor U+005C (\), then set state to
531 // authority state and decrease pointer by 1.
532 while ((input_position != input_size) &&
533 ((url_data[input_position] == '/') ||
534 (url_data[input_position] == '\\'))) {
536 }
538
539 break;
540 }
541 case ada::state::QUERY: {
542 ada_log("QUERY ", helpers::substring(url_data, input_position));
543 // Let queryPercentEncodeSet be the special-query percent-encode set if
544 // url is special; otherwise the query percent-encode set.
548
549 // Percent-encode after encoding, with encoding, buffer, and
550 // queryPercentEncodeSet, and append the result to url's query.
551 url.update_base_search(helpers::substring(url_data, input_position),
553 ada_log("QUERY update_base_search completed ");
554 if (fragment.has_value()) {
555 url.update_unencoded_base_hash(*fragment);
556 }
557 return url;
558 }
559 case ada::state::HOST: {
560 ada_log("HOST ", helpers::substring(url_data, input_position));
561
562 std::string_view host_view =
563 helpers::substring(url_data, input_position);
564 auto [location, found_colon] =
565 helpers::get_host_delimiter_location(url.is_special(), host_view);
566 input_position = (location != std::string_view::npos)
568 : input_size;
569 // Otherwise, if c is U+003A (:) and insideBrackets is false, then:
570 // Note: the 'found_colon' value is true if and only if a colon was
571 // encountered while not inside brackets.
572 if (found_colon) {
573 // If buffer is the empty string, validation error, return failure.
574 // Let host be the result of host parsing buffer with url is not
575 // special.
576 ada_log("HOST parsing ", host_view);
577 if (!url.parse_host(host_view)) {
578 return url;
579 }
580 ada_log("HOST parsing results in ", url.get_hostname());
581 // Set url's host to host, buffer to the empty string, and state to
582 // port state.
585 }
586 // Otherwise, if one of the following is true:
587 // - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
588 // - url is special and c is U+005C (\)
589 // The get_host_delimiter_location function either brings us to
590 // the colon outside of the bracket, or to one of those characters.
591 else {
592 // If url is special and host_view is the empty string, validation
593 // error, return failure.
594 if (url.is_special() && host_view.empty()) {
595 url.is_valid = false;
596 return url;
597 }
598 ada_log("HOST parsing ", host_view, " href=", url.get_href());
599 // Let host be the result of host parsing host_view with url is not
600 // special.
601 if (host_view.empty()) {
602 url.update_base_hostname("");
603 } else if (!url.parse_host(host_view)) {
604 return url;
605 }
606 ada_log("HOST parsing results in ", url.get_hostname(),
607 " href=", url.get_href());
608
609 // Set url's host to host, and state to path start state.
611 }
612
613 break;
614 }
616 ada_log("OPAQUE_PATH ", helpers::substring(url_data, input_position));
617 std::string_view view = helpers::substring(url_data, input_position);
618 // If c is U+003F (?), then set url's query to the empty string and
619 // state to query state.
620 size_t location = view.find('?');
621 if (location != std::string_view::npos) {
622 view.remove_suffix(view.size() - location);
625 } else {
627 }
628 url.has_opaque_path = true;
629 // This is a really unlikely scenario in the real world. We should not
630 // seek to optimize it.
631 url.update_base_pathname(unicode::percent_encode(
633 break;
634 }
635 case ada::state::PORT: {
636 ada_log("PORT ", helpers::substring(url_data, input_position));
637 std::string_view port_view =
638 helpers::substring(url_data, input_position);
639 size_t consumed_bytes = url.parse_port(port_view, true);
641 if (!url.is_valid) {
642 return url;
643 }
645 [[fallthrough]];
646 }
648 ada_log("PATH_START ", helpers::substring(url_data, input_position));
649
650 // If url is special, then:
651 if (url.is_special()) {
652 // Set state to path state.
654
655 // Optimization: Avoiding going into PATH state improves the
656 // performance of urls ending with /.
657 if (input_position == input_size) {
658 url.update_base_pathname("/");
659 if (fragment.has_value()) {
660 url.update_unencoded_base_hash(*fragment);
661 }
662 return url;
663 }
664 // If c is neither U+002F (/) nor U+005C (\), then decrease pointer
665 // by 1. We know that (input_position == input_size) is impossible
666 // here, because of the previous if-check.
667 if ((url_data[input_position] != '/') &&
668 (url_data[input_position] != '\\')) {
669 break;
670 }
671 }
672 // Otherwise, if state override is not given and c is U+003F (?),
673 // set url's query to the empty string and state to query state.
674 else if ((input_position != input_size) &&
675 (url_data[input_position] == '?')) {
677 }
678 // Otherwise, if c is not the EOF code point:
679 else if (input_position != input_size) {
680 // Set state to path state.
682
683 // If c is not U+002F (/), then decrease pointer by 1.
684 if (url_data[input_position] != '/') {
685 break;
686 }
687 }
688
690 break;
691 }
692 case ada::state::PATH: {
693 std::string_view view = helpers::substring(url_data, input_position);
694 ada_log("PATH ", helpers::substring(url_data, input_position));
695
696 // Most of the time, we do not need percent encoding.
697 // Furthermore, we can immediately locate the '?'.
698 size_t locofquestionmark = view.find('?');
699 if (locofquestionmark != std::string_view::npos) {
701 view.remove_suffix(view.size() - locofquestionmark);
703 } else {
705 }
706 if constexpr (result_type_is_ada_url) {
707 helpers::parse_prepared_path(view, url.type, url.path);
708 } else {
709 url.consume_prepared_path(view);
710 ADA_ASSERT_TRUE(url.validate());
711 }
712 break;
713 }
715 ada_log("FILE_SLASH ", helpers::substring(url_data, input_position));
716
717 // If c is U+002F (/) or U+005C (\), then:
718 if ((input_position != input_size) &&
719 (url_data[input_position] == '/' ||
720 url_data[input_position] == '\\')) {
721 ada_log("FILE_SLASH c is U+002F or U+005C");
722 // Set state to file host state.
725 } else {
726 ada_log("FILE_SLASH otherwise");
727 // If base is non-null and base's scheme is "file", then:
728 // Note: it is unsafe to do base_url->scheme unless you know that
729 // base_url_has_value() is true.
730 if (base_url != nullptr &&
732 // Set url's host to base's host.
733 if constexpr (result_type_is_ada_url) {
734 url.host = base_url->host;
735 } else {
736 // TODO: Optimization opportunity.
737 url.set_host(base_url->get_host());
738 }
739 // If the code point substring from pointer to the end of input does
740 // not start with a Windows drive letter and base's path[0] is a
741 // normalized Windows drive letter, then append base's path[0] to
742 // url's path.
743 if (!base_url->get_pathname().empty()) {
745 helpers::substring(url_data, input_position))) {
746 std::string_view first_base_url_path =
747 base_url->get_pathname().substr(1);
748 size_t loc = first_base_url_path.find('/');
749 if (loc != std::string_view::npos) {
750 helpers::resize(first_base_url_path, loc);
751 }
754 if constexpr (result_type_is_ada_url) {
755 url.path += '/';
756 url.path += first_base_url_path;
757 } else {
758 url.append_base_pathname(
759 helpers::concat("/", first_base_url_path));
760 }
761 }
762 }
763 }
764 }
765
766 // Set state to path state, and decrease pointer by 1.
768 }
769
770 break;
771 }
773 std::string_view view = helpers::substring(url_data, input_position);
774 ada_log("FILE_HOST ", helpers::substring(url_data, input_position));
775
776 size_t location = view.find_first_of("/\\?");
777 std::string_view file_host_buffer(
778 view.data(),
779 (location != std::string_view::npos) ? location : view.size());
780
783 } else if (file_host_buffer.empty()) {
784 // Set url's host to the empty string.
785 if constexpr (result_type_is_ada_url) {
786 url.host = "";
787 } else {
788 url.update_base_hostname("");
789 }
790 // Set state to path start state.
792 } else {
793 size_t consumed_bytes = file_host_buffer.size();
795 // Let host be the result of host parsing buffer with url is not
796 // special.
797 if (!url.parse_host(file_host_buffer)) {
798 return url;
799 }
800
801 if constexpr (result_type_is_ada_url) {
802 // If host is "localhost", then set host to the empty string.
803 if (url.host.has_value() && url.host.value() == "localhost") {
804 url.host = "";
805 }
806 } else {
807 if (url.get_hostname() == "localhost") {
808 url.update_base_hostname("");
809 }
810 }
811
812 // Set buffer to the empty string and state to path start state.
814 }
815
816 break;
817 }
818 case ada::state::FILE: {
819 ada_log("FILE ", helpers::substring(url_data, input_position));
820 std::string_view file_view =
821 helpers::substring(url_data, input_position);
822
823 url.set_protocol_as_file();
824 if constexpr (result_type_is_ada_url) {
825 // Set url's host to the empty string.
826 url.host = "";
827 } else {
828 url.update_base_hostname("");
829 }
830 // If c is U+002F (/) or U+005C (\), then:
831 if (input_position != input_size &&
832 (url_data[input_position] == '/' ||
833 url_data[input_position] == '\\')) {
834 ada_log("FILE c is U+002F or U+005C");
835 // Set state to file slash state.
837 }
838 // Otherwise, if base is non-null and base's scheme is "file":
839 else if (base_url != nullptr &&
841 // Set url's host to base's host, url's path to a clone of base's
842 // path, and url's query to base's query.
843 ada_log("FILE base non-null");
844 if constexpr (result_type_is_ada_url) {
845 url.host = base_url->host;
846 url.path = base_url->path;
847 url.query = base_url->query;
848 } else {
849 // TODO: Get rid of set_hostname and replace it with
850 // update_base_hostname
851 url.set_hostname(base_url->get_hostname());
852 url.update_base_pathname(base_url->get_pathname());
853 url.update_base_search(base_url->get_search());
854 }
855 url.has_opaque_path = base_url->has_opaque_path;
856
857 // If c is U+003F (?), then set url's query to the empty string and
858 // state to query state.
861 }
862 // Otherwise, if c is not the EOF code point:
863 else if (input_position != input_size) {
864 // Set url's query to null.
865 url.clear_search();
866 // If the code point substring from pointer to the end of input does
867 // not start with a Windows drive letter, then shorten url's path.
869 if constexpr (result_type_is_ada_url) {
870 helpers::shorten_path(url.path, url.type);
871 } else {
872 std::string_view path = url.get_pathname();
873 if (helpers::shorten_path(path, url.type)) {
874 url.update_base_pathname(std::string(path));
875 }
876 }
877 }
878 // Otherwise:
879 else {
880 // Set url's path to an empty list.
881 url.clear_pathname();
882 url.has_opaque_path = true;
883 }
884
885 // Set state to path state and decrease pointer by 1.
887 break;
888 }
889 }
890 // Otherwise, set state to path state, and decrease pointer by 1.
891 else {
892 ada_log("FILE go to path");
894 break;
895 }
896
898 break;
899 }
900 default:
902 }
903 }
904 if (fragment.has_value()) {
905 url.update_unencoded_base_hash(*fragment);
906 }
907 return url;
908}
909
910template url parse_url<url>(std::string_view user_input,
911 const url* base_url = nullptr);
913 std::string_view user_input, const url_aggregator* base_url = nullptr);
914
915} // namespace ada::parser
Includes all definitions for Ada.
Definitions of the character sets used by unicode functions.
Common definitions for cross-platform compiler support.
#define ADA_ASSERT_TRUE(COND)
constexpr uint8_t QUERY_PERCENT_ENCODE[32]
constexpr uint8_t SPECIAL_QUERY_PERCENT_ENCODE[32]
constexpr uint8_t C0_CONTROL_PERCENT_ENCODE[32]
constexpr uint8_t USERINFO_PERCENT_ENCODE[32]
constexpr bool is_normalized_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_alpha(char x) noexcept
ada_really_inline bool begins_with(std::string_view view, std::string_view prefix)
Includes the definitions for supported parsers.
template url parse_url< url >(std::string_view user_input, const url *base_url)
result_type parse_url(std::string_view user_input, const result_type *base_url=nullptr)
Definition parser.cpp:15
template url_aggregator parse_url< url_aggregator >(std::string_view user_input, const url_aggregator *base_url)
ada_warn_unused std::string to_string(encoding_type type)
state
Definition state.h:17
@ SPECIAL_RELATIVE_OR_AUTHORITY
@ SPECIAL_AUTHORITY_SLASHES
@ SPECIAL_AUTHORITY_IGNORE_SLASHES
void unreachable()
ada_warn_unused ada::result< result_type > parse(std::string_view input, const result_type *base_url=nullptr)
Definitions for the parser.
Lightweight URL struct.
ada_really_inline bool is_special() const noexcept
bool is_valid
Definition url_base.h:50
bool has_opaque_path
Definition url_base.h:55
Generic URL struct reliant on std::string instantiation.
Definition url.h:38
bool set_hostname(std::string_view input)
bool set_host(std::string_view input)
std::string_view get_pathname() const noexcept
ada_really_inline std::string get_href() const noexcept
Definition url-inl.h:183
std::string get_hostname() const noexcept
std::string get_protocol() const noexcept
Definitions for all unicode specific functions.
Definitions for the URL.