Ada 3.4.0
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
parser.cpp
Go to the documentation of this file.
1#include "ada/parser-inl.h"
2
3#include <limits>
4#include <ranges>
5
7#include "ada/common_defs.h"
8#include "ada/log.h"
9#include "ada/unicode.h"
10
11namespace ada::parser {
12
13template <class result_type, bool store_values>
14result_type parse_url_impl(std::string_view user_input,
15 const result_type* base_url) {
16 // We can specialize the implementation per type.
17 // Important: result_type_is_ada_url is evaluated at *compile time*. This
18 // means that doing if constexpr(result_type_is_ada_url) { something } else {
19 // something else } is free (at runtime). This means that ada::url_aggregator
20 // and ada::url **do not have to support the exact same API**.
21 constexpr bool result_type_is_ada_url = std::is_same_v<url, result_type>;
22 constexpr bool result_type_is_ada_url_aggregator =
23 std::is_same_v<url_aggregator, result_type>;
24 static_assert(result_type_is_ada_url ||
25 result_type_is_ada_url_aggregator); // We don't support
26 // anything else for now.
27
28 ada_log("ada::parser::parse_url('", user_input, "' [", user_input.size(),
29 " bytes],", (base_url != nullptr ? base_url->to_string() : "null"),
30 ")");
31
33 result_type url{};
34
35 // We refuse to parse URL strings that exceed 4GB. Such strings are almost
36 // surely the result of a bug or are otherwise a security concern.
37 if (user_input.size() > std::numeric_limits<uint32_t>::max()) [[unlikely]] {
38 url.is_valid = false;
39 }
40 // Going forward, user_input.size() is in [0,
41 // std::numeric_limits<uint32_t>::max()]. If we are provided with an invalid
42 // base, or the input was rejected above, we must return.
43 if (base_url != nullptr) {
44 url.is_valid &= base_url->is_valid;
45 }
46 if (!url.is_valid) {
47 return url;
48 }
49 if constexpr (result_type_is_ada_url_aggregator && store_values) {
50 // Most of the time, we just need user_input.size().
51 // In some instances, we may need a bit more.
53 // This is *very* important. This line should *not* be removed
54 // hastily. There are principled reasons why reserve is important
55 // for performance. If you have a benchmark with small inputs,
56 // it may not matter, but in other instances, it could.
58 // This rounds up to the next power of two.
59 // We know that user_input.size() is in [0,
60 // std::numeric_limits<uint32_t>::max()].
61 uint32_t reserve_capacity =
62 (0xFFFFFFFF >>
63 helpers::leading_zeroes(uint32_t(1 | user_input.size()))) +
64 1;
65 url.reserve(reserve_capacity);
66 }
67 std::string tmp_buffer;
68 std::string_view url_data;
69 if (unicode::has_tabs_or_newline(user_input)) [[unlikely]] {
70 tmp_buffer = user_input;
71 // Optimization opportunity: Instead of copying and then pruning, we could
72 // just directly build the string from user_input.
73 helpers::remove_ascii_tab_or_newline(tmp_buffer);
74 url_data = tmp_buffer;
75 } else [[likely]] {
76 url_data = user_input;
77 }
78
79 // Leading and trailing control characters are uncommon and easy to deal with
80 // (no performance concern).
81 helpers::trim_c0_whitespace(url_data);
82
83 // Optimization opportunity. Most URLs do not have a fragment.
84 std::optional<std::string_view> fragment = helpers::prune_hash(url_data);
85 // We add it last so that an implementation like ada::url_aggregator
86 // can append it last to its internal buffer, thus improving performance.
87
88 // Here url_data no longer has its fragment.
89 // We are going to access the data from url_data (it is immutable).
90 // At any given time, we are pointing at byte 'input_position' in url_data.
91 // The input_position variable should range from 0 to input_size.
92 // It is illegal to access url_data at input_size.
93 size_t input_position = 0;
94 const size_t input_size = url_data.size();
95 // Keep running the following state machine by switching on state.
96 // If after a run pointer points to the EOF code point, go to the next step.
97 // Otherwise, increase pointer by 1 and continue with the state machine.
98 // We never decrement input_position.
99 while (input_position <= input_size) {
100 ada_log("In parsing at ", input_position, " out of ", input_size,
101 " in state ", ada::to_string(state));
102 switch (state) {
103 case state::SCHEME_START: {
104 ada_log("SCHEME_START ", helpers::substring(url_data, input_position));
105 // If c is an ASCII alpha, append c, lowercased, to buffer, and set
106 // state to scheme state.
107 if ((input_position != input_size) &&
108 checkers::is_alpha(url_data[input_position])) {
110 input_position++;
111 } else {
112 // Otherwise, if state override is not given, set state to no scheme
113 // state and decrease pointer by 1.
115 }
116 break;
117 }
118 case state::SCHEME: {
119 ada_log("SCHEME ", helpers::substring(url_data, input_position));
120 // If c is an ASCII alphanumeric, U+002B (+), U+002D (-), or U+002E (.),
121 // append c, lowercased, to buffer.
122 while ((input_position != input_size) &&
123 (unicode::is_alnum_plus(url_data[input_position]))) {
124 input_position++;
125 }
126 // Otherwise, if c is U+003A (:), then:
127 if ((input_position != input_size) &&
128 (url_data[input_position] == ':')) {
129 ada_log("SCHEME the scheme should be ",
130 url_data.substr(0, input_position));
131 if constexpr (result_type_is_ada_url) {
132 if (!url.parse_scheme(url_data.substr(0, input_position))) {
133 return url;
134 }
135 } else {
136 // we pass the colon along instead of painfully adding it back.
137 if (!url.parse_scheme_with_colon(
138 url_data.substr(0, input_position + 1))) {
139 return url;
140 }
141 }
142 ada_log("SCHEME the scheme is ", url.get_protocol());
143
144 // If url's scheme is "file", then:
145 if (url.type == scheme::type::FILE) {
146 // Set state to file state.
148 }
149 // Otherwise, if url is special, base is non-null, and base's scheme
150 // is url's scheme. Note: dereferencing base_url is unsafe when it is
151 // nullptr, so the null check must come first.
152 else if (url.is_special() && base_url != nullptr &&
153 base_url->type == url.type) {
154 // Set state to special relative or authority state.
156 }
157 // Otherwise, if url is special, set state to special authority
158 // slashes state.
159 else if (url.is_special()) {
161 }
162 // Otherwise, if remaining starts with an U+002F (/), set state to
163 // path or authority state and increase pointer by 1.
164 else if (input_position + 1 < input_size &&
165 url_data[input_position + 1] == '/') {
167 input_position++;
168 }
169 // Otherwise, set url's path to the empty string and set state to
170 // opaque path state.
171 else {
173 }
174 }
175 // Otherwise, if state override is not given, set buffer to the empty
176 // string, state to no scheme state, and start over (from the first code
177 // point in input).
178 else {
180 input_position = 0;
181 break;
182 }
183 input_position++;
184 break;
185 }
186 case state::NO_SCHEME: {
187 ada_log("NO_SCHEME ", helpers::substring(url_data, input_position));
188 // If base is null, or base has an opaque path and c is not U+0023 (#),
189 // validation error, return failure.
190 if (base_url == nullptr ||
191 (base_url->has_opaque_path && !fragment.has_value())) {
192 ada_log("NO_SCHEME validation error");
193 url.is_valid = false;
194 return url;
195 }
196 // Otherwise, if base has an opaque path and c is U+0023 (#),
197 // set url's scheme to base's scheme, url's path to base's path, url's
198 // query to base's query, and set state to fragment state.
199 else if (base_url->has_opaque_path && fragment.has_value() &&
200 input_position == input_size) {
201 ada_log("NO_SCHEME opaque base with fragment");
202 url.copy_scheme(*base_url);
203 url.has_opaque_path = base_url->has_opaque_path;
204
205 if constexpr (result_type_is_ada_url) {
206 url.path = base_url->path;
207 url.query = base_url->query;
208 } else {
209 url.update_base_pathname(base_url->get_pathname());
210 url.update_base_search(base_url->get_search());
211 }
212 url.update_unencoded_base_hash(*fragment);
213 return url;
214 }
215 // Otherwise, if base's scheme is not "file", set state to relative
216 // state and decrease pointer by 1.
217 else if (base_url->type != scheme::type::FILE) {
218 ada_log("NO_SCHEME non-file relative path");
220 }
221 // Otherwise, set state to file state and decrease pointer by 1.
222 else {
223 ada_log("NO_SCHEME file base type");
225 }
226 break;
227 }
228 case state::AUTHORITY: {
229 ada_log("AUTHORITY ", helpers::substring(url_data, input_position));
230 // most URLs have no @. Having no @ tells us that we don't have to worry
231 // about AUTHORITY. Of course, we could have @ and still not have to
232 // worry about AUTHORITY.
233 // TODO: Instead of just collecting a bool, collect the location of the
234 // '@' and do something useful with it.
235 // TODO: We could do various processing early on, using a single pass
236 // over the string to collect information about it, e.g., telling us
237 // whether there is a @ and if so, where (or how many).
238
239 // Check if url data contains an @.
240 if (url_data.find('@', input_position) == std::string_view::npos) {
242 break;
243 }
244 bool at_sign_seen{false};
245 bool password_token_seen{false};
251 do {
252 std::string_view view = url_data.substr(input_position);
253 // The delimiters are @, /, ?, and (for special URLs) \\.
254 size_t location =
255 url.is_special() ? helpers::find_authority_delimiter_special(view)
256 : helpers::find_authority_delimiter(view);
257 std::string_view authority_view = view.substr(0, location);
258 size_t end_of_authority = input_position + authority_view.size();
259 // If c is U+0040 (@), then:
260 if ((end_of_authority != input_size) &&
261 (url_data[end_of_authority] == '@')) {
262 // If atSignSeen is true, then prepend "%40" to buffer.
263 if (at_sign_seen) {
264 if (password_token_seen) {
265 if constexpr (result_type_is_ada_url) {
266 url.password += "%40";
267 } else {
268 url.append_base_password("%40");
269 }
270 } else {
271 if constexpr (result_type_is_ada_url) {
272 url.username += "%40";
273 } else {
274 url.append_base_username("%40");
275 }
276 }
277 }
278
279 at_sign_seen = true;
280
281 if (!password_token_seen) {
282 size_t password_token_location = authority_view.find(':');
283 password_token_seen =
284 password_token_location != std::string_view::npos;
285
286 if constexpr (store_values) {
287 if (!password_token_seen) {
288 if constexpr (result_type_is_ada_url) {
289 url.username += unicode::percent_encode(
290 authority_view,
292 } else {
293 url.append_base_username(unicode::percent_encode(
294 authority_view,
296 }
297 } else {
298 if constexpr (result_type_is_ada_url) {
299 url.username += unicode::percent_encode(
300 authority_view.substr(0, password_token_location),
302 url.password += unicode::percent_encode(
303 authority_view.substr(password_token_location + 1),
305 } else {
306 url.append_base_username(unicode::percent_encode(
307 authority_view.substr(0, password_token_location),
309 url.append_base_password(unicode::percent_encode(
310 authority_view.substr(password_token_location + 1),
312 }
313 }
314 }
315 } else if constexpr (store_values) {
316 if constexpr (result_type_is_ada_url) {
317 url.password += unicode::percent_encode(
319 } else {
320 url.append_base_password(unicode::percent_encode(
322 }
323 }
324 }
325 // Otherwise, if one of the following is true:
326 // - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
327 // - url is special and c is U+005C (\)
328 else if (end_of_authority == input_size ||
329 url_data[end_of_authority] == '/' ||
330 url_data[end_of_authority] == '?' ||
331 (url.is_special() && url_data[end_of_authority] == '\\')) {
332 // If atSignSeen is true and authority_view is the empty string,
333 // validation error, return failure.
334 if (at_sign_seen && authority_view.empty()) {
335 url.is_valid = false;
336 return url;
337 }
339 break;
340 }
341 if (end_of_authority == input_size) {
342 if constexpr (store_values) {
343 if (fragment.has_value()) {
344 url.update_unencoded_base_hash(*fragment);
345 }
346 }
347 return url;
348 }
349 input_position = end_of_authority + 1;
350 } while (true);
351
352 break;
353 }
355 ada_log("SPECIAL_RELATIVE_OR_AUTHORITY ",
356 helpers::substring(url_data, input_position));
357
358 // If c is U+002F (/) and remaining starts with U+002F (/),
359 // then set state to special authority ignore slashes state and increase
360 // pointer by 1.
361 if (url_data.substr(input_position, 2) == "//") {
363 input_position += 2;
364 } else {
365 // Otherwise, validation error, set state to relative state and
366 // decrease pointer by 1.
368 }
369
370 break;
371 }
373 ada_log("PATH_OR_AUTHORITY ",
374 helpers::substring(url_data, input_position));
375
376 // If c is U+002F (/), then set state to authority state.
377 if ((input_position != input_size) &&
378 (url_data[input_position] == '/')) {
380 input_position++;
381 } else {
382 // Otherwise, set state to path state, and decrease pointer by 1.
384 }
385
386 break;
387 }
389 ada_log("RELATIVE_SCHEME ",
390 helpers::substring(url_data, input_position));
391
392 // Set url's scheme to base's scheme.
393 url.copy_scheme(*base_url);
394
395 // If c is U+002F (/), then set state to relative slash state.
396 if ((input_position != input_size) &&
397 // NOLINTNEXTLINE(bugprone-branch-clone)
398 (url_data[input_position] == '/')) {
399 ada_log(
400 "RELATIVE_SCHEME if c is U+002F (/), then set state to relative "
401 "slash state");
403 } else if (url.is_special() && (input_position != input_size) &&
404 (url_data[input_position] == '\\')) {
405 // Otherwise, if url is special and c is U+005C (\), validation error,
406 // set state to relative slash state.
407 ada_log(
408 "RELATIVE_SCHEME if url is special and c is U+005C, validation "
409 "error, set state to relative slash state");
411 } else {
412 ada_log("RELATIVE_SCHEME otherwise");
413 // Set url's username to base's username, url's password to base's
414 // password, url's host to base's host, url's port to base's port,
415 // url's path to a clone of base's path, and url's query to base's
416 // query.
417 if constexpr (result_type_is_ada_url) {
418 url.username = base_url->username;
419 url.password = base_url->password;
420 url.host = base_url->host;
421 url.port = base_url->port;
422 // cloning the base path includes cloning the has_opaque_path flag
423 url.has_opaque_path = base_url->has_opaque_path;
424 url.path = base_url->path;
425 url.query = base_url->query;
426 } else {
427 url.update_base_authority(base_url->get_href(),
428 base_url->get_components());
429 url.update_host_to_base_host(base_url->get_hostname());
430 url.update_base_port(base_url->retrieve_base_port());
431 // cloning the base path includes cloning the has_opaque_path flag
432 url.has_opaque_path = base_url->has_opaque_path;
433 url.update_base_pathname(base_url->get_pathname());
434 url.update_base_search(base_url->get_search());
435 }
436
437 url.has_opaque_path = base_url->has_opaque_path;
438
439 // If c is U+003F (?), then set url's query to the empty string, and
440 // state to query state.
441 if ((input_position != input_size) &&
442 (url_data[input_position] == '?')) {
444 }
445 // Otherwise, if c is not the EOF code point:
446 else if (input_position != input_size) {
447 // Set url's query to null.
448 url.clear_search();
449 if constexpr (result_type_is_ada_url) {
450 // Shorten url's path.
451 helpers::shorten_path(url.path, url.type);
452 } else {
453 std::string_view path = url.get_pathname();
454 if (helpers::shorten_path(path, url.type)) {
455 url.update_base_pathname(std::move(std::string(path)));
456 }
457 }
458 // Set state to path state and decrease pointer by 1.
460 break;
461 }
462 }
463 input_position++;
464 break;
465 }
467 ada_log("RELATIVE_SLASH ",
468 helpers::substring(url_data, input_position));
469
470 // If url is special and c is U+002F (/) or U+005C (\), then:
471 if (url.is_special() && (input_position != input_size) &&
472 (url_data[input_position] == '/' ||
473 url_data[input_position] == '\\')) {
474 // Set state to special authority ignore slashes state.
476 }
477 // Otherwise, if c is U+002F (/), then set state to authority state.
478 else if ((input_position != input_size) &&
479 (url_data[input_position] == '/')) {
481 }
482 // Otherwise, set
483 // - url's username to base's username,
484 // - url's password to base's password,
485 // - url's host to base's host,
486 // - url's port to base's port,
487 // - state to path state, and then, decrease pointer by 1.
488 else {
489 if constexpr (result_type_is_ada_url) {
490 url.username = base_url->username;
491 url.password = base_url->password;
492 url.host = base_url->host;
493 url.port = base_url->port;
494 } else {
495 url.update_base_authority(base_url->get_href(),
496 base_url->get_components());
497 url.update_host_to_base_host(base_url->get_hostname());
498 url.update_base_port(base_url->retrieve_base_port());
499 }
501 break;
502 }
503
504 input_position++;
505 break;
506 }
508 ada_log("SPECIAL_AUTHORITY_SLASHES ",
509 helpers::substring(url_data, input_position));
510
511 // If c is U+002F (/) and remaining starts with U+002F (/),
512 // then set state to special authority ignore slashes state and increase
513 // pointer by 1.
514 if (url_data.substr(input_position, 2) == "//") {
515 input_position += 2;
516 }
517
518 [[fallthrough]];
519 }
521 ada_log("SPECIAL_AUTHORITY_IGNORE_SLASHES ",
522 helpers::substring(url_data, input_position));
523
524 // If c is neither U+002F (/) nor U+005C (\), then set state to
525 // authority state and decrease pointer by 1.
526 while ((input_position != input_size) &&
527 ((url_data[input_position] == '/') ||
528 (url_data[input_position] == '\\'))) {
529 input_position++;
530 }
532
533 break;
534 }
535 case state::QUERY: {
536 ada_log("QUERY ", helpers::substring(url_data, input_position));
537 if constexpr (store_values) {
538 // Let queryPercentEncodeSet be the special-query percent-encode set
539 // if url is special; otherwise the query percent-encode set.
540 const uint8_t* query_percent_encode_set =
543
544 // Percent-encode after encoding, with encoding, buffer, and
545 // queryPercentEncodeSet, and append the result to url's query.
546 url.update_base_search(url_data.substr(input_position),
547 query_percent_encode_set);
548 ada_log("QUERY update_base_search completed ");
549 if (fragment.has_value()) {
550 url.update_unencoded_base_hash(*fragment);
551 }
552 }
553 return url;
554 }
555 case state::HOST: {
556 ada_log("HOST ", helpers::substring(url_data, input_position));
557
558 std::string_view host_view = url_data.substr(input_position);
559 auto [location, found_colon] =
560 helpers::get_host_delimiter_location(url.is_special(), host_view);
561 input_position = (location != std::string_view::npos)
562 ? input_position + location
563 : input_size;
564 // Otherwise, if c is U+003A (:) and insideBrackets is false, then:
565 // Note: the 'found_colon' value is true if and only if a colon was
566 // encountered while not inside brackets.
567 if (found_colon) {
568 // If buffer is the empty string, validation error, return failure.
569 // Let host be the result of host parsing buffer with url is not
570 // special.
571 ada_log("HOST parsing ", host_view);
572 if (!url.parse_host(host_view)) {
573 return url;
574 }
575 ada_log("HOST parsing results in ", url.get_hostname());
576 // Set url's host to host, buffer to the empty string, and state to
577 // port state.
579 input_position++;
580 }
581 // Otherwise, if one of the following is true:
582 // - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
583 // - url is special and c is U+005C (\)
584 // The get_host_delimiter_location function either brings us to
585 // the colon outside of the bracket, or to one of those characters.
586 else {
587 // If url is special and host_view is the empty string, validation
588 // error, return failure.
589 if (host_view.empty() && url.is_special()) {
590 url.is_valid = false;
591 return url;
592 }
593 ada_log("HOST parsing ", host_view, " href=", url.get_href());
594 // Let host be the result of host parsing host_view with url is not
595 // special.
596 if (host_view.empty()) {
597 url.update_base_hostname("");
598 } else if (!url.parse_host(host_view)) {
599 return url;
600 }
601 ada_log("HOST parsing results in ", url.get_hostname(),
602 " href=", url.get_href());
603
604 // Set url's host to host, and state to path start state.
606 }
607
608 break;
609 }
610 case state::OPAQUE_PATH: {
611 ada_log("OPAQUE_PATH ", helpers::substring(url_data, input_position));
612 std::string_view view = url_data.substr(input_position);
613 // If c is U+003F (?), then set url's query to the empty string and
614 // state to query state.
615 size_t location = view.find('?');
616 if (location != std::string_view::npos) {
617 view.remove_suffix(view.size() - location);
619 input_position += location + 1;
620 } else {
621 input_position = input_size + 1;
622 }
623 url.has_opaque_path = true;
624
625 // This is a really unlikely scenario in the real world. We should not
626 // seek to optimize it.
627 if (view.ends_with(' ')) {
628 std::string modified_view =
629 std::string(view.substr(0, view.size() - 1)) + "%20";
630 url.update_base_pathname(unicode::percent_encode(
632 } else {
633 url.update_base_pathname(unicode::percent_encode(
635 }
636 break;
637 }
638 case state::PORT: {
639 ada_log("PORT ", helpers::substring(url_data, input_position));
640 std::string_view port_view = url_data.substr(input_position);
641 input_position += url.parse_port(port_view, true);
642 if (!url.is_valid) {
643 return url;
644 }
646 [[fallthrough]];
647 }
648 case state::PATH_START: {
649 ada_log("PATH_START ", helpers::substring(url_data, input_position));
650
651 // If url is special, then:
652 if (url.is_special()) {
653 // Set state to path state.
655
656 // Optimization: Avoiding going into PATH state improves the
657 // performance of urls ending with /.
658 if (input_position == input_size) {
659 if constexpr (store_values) {
660 url.update_base_pathname("/");
661 if (fragment.has_value()) {
662 url.update_unencoded_base_hash(*fragment);
663 }
664 }
665 return url;
666 }
667 // If c is neither U+002F (/) nor U+005C (\), then decrease pointer
668 // by 1. We know that (input_position == input_size) is impossible
669 // here, because of the previous if-check.
670 if ((url_data[input_position] != '/') &&
671 (url_data[input_position] != '\\')) {
672 break;
673 }
674 }
675 // Otherwise, if state override is not given and c is U+003F (?),
676 // set url's query to the empty string and state to query state.
677 else if ((input_position != input_size) &&
678 (url_data[input_position] == '?')) {
680 }
681 // Otherwise, if c is not the EOF code point:
682 else if (input_position != input_size) {
683 // Set state to path state.
685
686 // If c is not U+002F (/), then decrease pointer by 1.
687 if (url_data[input_position] != '/') {
688 break;
689 }
690 }
691
692 input_position++;
693 break;
694 }
695 case state::PATH: {
696 ada_log("PATH ", helpers::substring(url_data, input_position));
697 std::string_view view = url_data.substr(input_position);
698
699 // Most of the time, we do not need percent encoding.
700 // Furthermore, we can immediately locate the '?'.
701 size_t locofquestionmark = view.find('?');
702 if (locofquestionmark != std::string_view::npos) {
704 view.remove_suffix(view.size() - locofquestionmark);
705 input_position += locofquestionmark + 1;
706 } else {
707 input_position = input_size + 1;
708 }
709 if constexpr (store_values) {
710 if constexpr (result_type_is_ada_url) {
711 helpers::parse_prepared_path(view, url.type, url.path);
712 } else {
713 url.consume_prepared_path(view);
714 ADA_ASSERT_TRUE(url.validate());
715 }
716 }
717 break;
718 }
719 case state::FILE_SLASH: {
720 ada_log("FILE_SLASH ", helpers::substring(url_data, input_position));
721
722 // If c is U+002F (/) or U+005C (\), then:
723 if ((input_position != input_size) &&
724 (url_data[input_position] == '/' ||
725 url_data[input_position] == '\\')) {
726 ada_log("FILE_SLASH c is U+002F or U+005C");
727 // Set state to file host state.
729 input_position++;
730 } else {
731 ada_log("FILE_SLASH otherwise");
732 // If base is non-null and base's scheme is "file", then:
733 // Note: it is unsafe to dereference base_url unless you know that
734 // base_url != nullptr.
735 if (base_url != nullptr && base_url->type == scheme::type::FILE) {
736 // Set url's host to base's host.
737 if constexpr (result_type_is_ada_url) {
738 url.host = base_url->host;
739 } else {
740 url.update_host_to_base_host(base_url->get_host());
741 }
742 // If the code point substring from pointer to the end of input does
743 // not start with a Windows drive letter and base's path[0] is a
744 // normalized Windows drive letter, then append base's path[0] to
745 // url's path.
746 if (!base_url->get_pathname().empty()) {
748 url_data.substr(input_position))) {
749 std::string_view first_base_url_path =
750 base_url->get_pathname().substr(1);
751 size_t loc = first_base_url_path.find('/');
752 if (loc != std::string_view::npos) {
753 helpers::resize(first_base_url_path, loc);
754 }
756 first_base_url_path)) {
757 if constexpr (result_type_is_ada_url) {
758 url.path += '/';
759 url.path += first_base_url_path;
760 } else {
761 url.append_base_pathname(
762 helpers::concat("/", first_base_url_path));
763 }
764 }
765 }
766 }
767 }
768
769 // Set state to path state, and decrease pointer by 1.
771 }
772
773 break;
774 }
775 case state::FILE_HOST: {
776 ada_log("FILE_HOST ", helpers::substring(url_data, input_position));
777 std::string_view view = url_data.substr(input_position);
778
779 size_t location = view.find_first_of("/\\?");
780 std::string_view file_host_buffer(
781 view.data(),
782 (location != std::string_view::npos) ? location : view.size());
783
784 if (checkers::is_windows_drive_letter(file_host_buffer)) {
786 } else if (file_host_buffer.empty()) {
787 // Set url's host to the empty string.
788 if constexpr (result_type_is_ada_url) {
789 url.host = "";
790 } else {
791 url.update_base_hostname("");
792 }
793 // Set state to path start state.
795 } else {
796 size_t consumed_bytes = file_host_buffer.size();
797 input_position += consumed_bytes;
798 // Let host be the result of host parsing buffer with url is not
799 // special.
800 if (!url.parse_host(file_host_buffer)) {
801 return url;
802 }
803
804 if constexpr (result_type_is_ada_url) {
805 // If host is "localhost", then set host to the empty string.
806 if (url.host.has_value() && url.host.value() == "localhost") {
807 url.host = "";
808 }
809 } else {
810 if (url.get_hostname() == "localhost") {
811 url.update_base_hostname("");
812 }
813 }
814
815 // Set buffer to the empty string and state to path start state.
817 }
818
819 break;
820 }
821 case state::FILE: {
822 ada_log("FILE ", helpers::substring(url_data, input_position));
823 std::string_view file_view = url_data.substr(input_position);
824
825 url.set_protocol_as_file();
826 if constexpr (result_type_is_ada_url) {
827 // Set url's host to the empty string.
828 url.host = "";
829 } else {
830 url.update_base_hostname("");
831 }
832 // If c is U+002F (/) or U+005C (\), then:
833 if (input_position != input_size &&
834 (url_data[input_position] == '/' ||
835 url_data[input_position] == '\\')) {
836 ada_log("FILE c is U+002F or U+005C");
837 // Set state to file slash state.
839 }
840 // Otherwise, if base is non-null and base's scheme is "file":
841 else if (base_url != nullptr && base_url->type == scheme::type::FILE) {
842 // Set url's host to base's host, url's path to a clone of base's
843 // path, and url's query to base's query.
844 ada_log("FILE base non-null");
845 if constexpr (result_type_is_ada_url) {
846 url.host = base_url->host;
847 url.path = base_url->path;
848 url.query = base_url->query;
849 } else {
850 url.update_host_to_base_host(base_url->get_hostname());
851 url.update_base_pathname(base_url->get_pathname());
852 url.update_base_search(base_url->get_search());
853 }
854 url.has_opaque_path = base_url->has_opaque_path;
855
856 // If c is U+003F (?), then set url's query to the empty string and
857 // state to query state.
858 if (input_position != input_size && url_data[input_position] == '?') {
860 }
861 // Otherwise, if c is not the EOF code point:
862 else if (input_position != input_size) {
863 // Set url's query to null.
864 url.clear_search();
865 // If the code point substring from pointer to the end of input does
866 // not start with a Windows drive letter, then shorten url's path.
867 if (!checkers::is_windows_drive_letter(file_view)) {
868 if constexpr (result_type_is_ada_url) {
869 helpers::shorten_path(url.path, url.type);
870 } else {
871 std::string_view path = url.get_pathname();
872 if (helpers::shorten_path(path, url.type)) {
873 url.update_base_pathname(std::move(std::string(path)));
874 }
875 }
876 }
877 // Otherwise:
878 else {
879 // Set url's path to an empty list.
880 url.clear_pathname();
881 url.has_opaque_path = true;
882 }
883
884 // Set state to path state and decrease pointer by 1.
886 break;
887 }
888 }
889 // Otherwise, set state to path state, and decrease pointer by 1.
890 else {
891 ada_log("FILE go to path");
893 break;
894 }
895
896 input_position++;
897 break;
898 }
899 default:
900 unreachable();
901 }
902 }
903 if constexpr (store_values) {
904 if (fragment.has_value()) {
905 url.update_unencoded_base_hash(*fragment);
906 }
907 }
908 return url;
909}
910
911template url parse_url_impl(std::string_view user_input,
912 const url* base_url = nullptr);
914 std::string_view user_input, const url_aggregator* base_url = nullptr);
915
// Entry point for parsing `user_input` into a `result_type`; `base_url` is
// passed through unchanged and may be null (parse_url_impl checks for nullptr).
// This is a thin wrapper: it forwards to parse_url_impl with the
// `store_values` template argument fixed to true.
916template <class result_type>
917result_type parse_url(std::string_view user_input,
918 const result_type* base_url) {
919 return parse_url_impl<result_type, true>(user_input, base_url);
920}
921
922template url parse_url<url>(std::string_view user_input,
923 const url* base_url = nullptr);
925 std::string_view user_input, const url_aggregator* base_url = nullptr);
926} // namespace ada::parser
Definitions of the character sets used by unicode functions.
Cross-platform compiler macros and common definitions.
#define ADA_ASSERT_TRUE(COND)
constexpr uint8_t QUERY_PERCENT_ENCODE[32]
constexpr uint8_t SPECIAL_QUERY_PERCENT_ENCODE[32]
constexpr uint8_t C0_CONTROL_PERCENT_ENCODE[32]
constexpr uint8_t USERINFO_PERCENT_ENCODE[32]
constexpr bool is_normalized_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_alpha(char x) noexcept
Internal URL parsing implementation.
Definition parser-inl.h:16
template url parse_url< url >(std::string_view user_input, const url *base_url)
result_type parse_url(std::string_view user_input, const result_type *base_url=nullptr)
Definition parser.cpp:917
template url_aggregator parse_url< url_aggregator >(std::string_view user_input, const url_aggregator *base_url)
result_type parse_url_impl(std::string_view user_input, const result_type *base_url=nullptr)
Definition parser.cpp:14
state
States in the URL parsing state machine.
Definition state.h:27
@ SPECIAL_RELATIVE_OR_AUTHORITY
Definition state.h:101
@ FILE_SLASH
Definition state.h:81
@ SCHEME
Definition state.h:41
@ SPECIAL_AUTHORITY_SLASHES
Definition state.h:96
@ FILE_HOST
Definition state.h:76
@ OPAQUE_PATH
Definition state.h:121
@ RELATIVE_SLASH
Definition state.h:66
@ NO_SCHEME
Definition state.h:51
@ PATH_START
Definition state.h:116
@ RELATIVE_SCHEME
Definition state.h:61
@ SPECIAL_AUTHORITY_IGNORE_SLASHES
Definition state.h:91
@ SCHEME_START
Definition state.h:36
@ AUTHORITY
Definition state.h:31
@ PATH_OR_AUTHORITY
Definition state.h:86
ada_warn_unused std::string_view to_string(encoding_type type)
void unreachable()
Memory-efficient URL representation using a single buffer.
ada_really_inline constexpr bool is_special() const noexcept
bool is_valid
Definition url_base.h:56
bool has_opaque_path
Definition url_base.h:62
Represents a parsed URL with individual string components.
Definition url.h:62
ada_really_inline std::string get_href() const noexcept
Definition url-inl.h:188
constexpr std::string_view get_pathname() const noexcept
Definition url-inl.h:46
std::string get_hostname() const noexcept
Definition url.cpp:655
std::string get_protocol() const noexcept
Definition url.cpp:633
Definitions for all unicode specific functions.