Ada 3.1.0
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
parser.cpp
Go to the documentation of this file.
1#include "ada/parser-inl.h"
2
3#include <limits>
4
6#include "ada/common_defs.h"
7#include "ada/log.h"
8#include "ada/unicode.h"
9
10namespace ada::parser {
11
12template <class result_type, bool store_values>
13result_type parse_url_impl(std::string_view user_input,
14 const result_type* base_url) {
15 // We can specialize the implementation per type.
16 // Important: result_type_is_ada_url is evaluated at *compile time*. This
17 // means that doing if constexpr(result_type_is_ada_url) { something } else {
18 // something else } is free (at runtime). This means that ada::url_aggregator
19 // and ada::url **do not have to support the exact same API**.
20 constexpr bool result_type_is_ada_url = std::is_same_v<url, result_type>;
21 constexpr bool result_type_is_ada_url_aggregator =
22 std::is_same_v<url_aggregator, result_type>;
23 static_assert(result_type_is_ada_url ||
24 result_type_is_ada_url_aggregator); // We don't support
25 // anything else for now.
26
27 ada_log("ada::parser::parse_url('", user_input, "' [", user_input.size(),
28 " bytes],", (base_url != nullptr ? base_url->to_string() : "null"),
29 ")");
30
32 result_type url{};
33
34 // We refuse to parse URL strings that exceed 4GB. Such strings are almost
35 // surely the result of a bug or are otherwise a security concern.
36 if (user_input.size() > std::numeric_limits<uint32_t>::max()) [[unlikely]] {
37 url.is_valid = false;
38 }
39 // Going forward, user_input.size() is in [0,
40 // std::numeric_limits<uint32_t>::max]. If we are provided with an invalid
41 // base, or the input itself was rejected above, we must return.
42 if (base_url != nullptr) {
43 url.is_valid &= base_url->is_valid;
44 }
45 if (!url.is_valid) {
46 return url;
47 }
48 if constexpr (result_type_is_ada_url_aggregator && store_values) {
49 // Most of the time, we just need user_input.size().
50 // In some instances, we may need a bit more.
52 // This is *very* important. This line should *not* be removed
53 // hastily. There are principled reasons why reserve is important
54 // for performance. If you have a benchmark with small inputs,
55 // it may not matter, but in other instances, it could.
57 // This rounds up to the next power of two.
58 // We know that user_input.size() is in [0,
59 // std::numeric_limits<uint32_t>::max].
60 uint32_t reserve_capacity =
61 (0xFFFFFFFF >>
62 helpers::leading_zeroes(uint32_t(1 | user_input.size()))) +
63 1;
64 url.reserve(reserve_capacity);
65 }
66 std::string tmp_buffer;
67 std::string_view url_data;
68 if (unicode::has_tabs_or_newline(user_input)) [[unlikely]] {
69 tmp_buffer = user_input;
70 // Optimization opportunity: Instead of copying and then pruning, we could
71 // just directly build the string from user_input.
72 helpers::remove_ascii_tab_or_newline(tmp_buffer);
73 url_data = tmp_buffer;
74 } else [[likely]] {
75 url_data = user_input;
76 }
77
78 // Leading and trailing control characters are uncommon and easy to deal with
79 // (no performance concern).
80 helpers::trim_c0_whitespace(url_data);
81
82 // Optimization opportunity. Most websites do not have a fragment.
83 std::optional<std::string_view> fragment = helpers::prune_hash(url_data);
84 // We add it last so that an implementation like ada::url_aggregator
85 // can append it last to its internal buffer, thus improving performance.
86
87 // Here url_data no longer has its fragment.
88 // We are going to access the data from url_data (it is immutable).
89 // At any given time, we are pointing at byte 'input_position' in url_data.
90 // The input_position variable should range from 0 to input_size.
91 // It is illegal to access url_data at input_size.
92 size_t input_position = 0;
93 const size_t input_size = url_data.size();
94 // Keep running the following state machine by switching on state.
95 // If after a run pointer points to the EOF code point, go to the next step.
96 // Otherwise, increase pointer by 1 and continue with the state machine.
97 // We never decrement input_position.
98 while (input_position <= input_size) {
99 ada_log("In parsing at ", input_position, " out of ", input_size,
100 " in state ", ada::to_string(state));
101 switch (state) {
102 case state::SCHEME_START: {
103 ada_log("SCHEME_START ", helpers::substring(url_data, input_position));
104 // If c is an ASCII alpha, append c, lowercased, to buffer, and set
105 // state to scheme state.
106 if ((input_position != input_size) &&
107 checkers::is_alpha(url_data[input_position])) {
109 input_position++;
110 } else {
111 // Otherwise, if state override is not given, set state to no scheme
112 // state and decrease pointer by 1.
114 }
115 break;
116 }
117 case state::SCHEME: {
118 ada_log("SCHEME ", helpers::substring(url_data, input_position));
119 // If c is an ASCII alphanumeric, U+002B (+), U+002D (-), or U+002E (.),
120 // append c, lowercased, to buffer.
121 while ((input_position != input_size) &&
122 (unicode::is_alnum_plus(url_data[input_position]))) {
123 input_position++;
124 }
125 // Otherwise, if c is U+003A (:), then:
126 if ((input_position != input_size) &&
127 (url_data[input_position] == ':')) {
128 ada_log("SCHEME the scheme should be ",
129 url_data.substr(0, input_position));
130 if constexpr (result_type_is_ada_url) {
131 if (!url.parse_scheme(url_data.substr(0, input_position))) {
132 return url;
133 }
134 } else {
135 // we pass the colon along instead of painfully adding it back.
136 if (!url.parse_scheme_with_colon(
137 url_data.substr(0, input_position + 1))) {
138 return url;
139 }
140 }
141 ada_log("SCHEME the scheme is ", url.get_protocol());
142
143 // If url's scheme is "file", then:
144 if (url.type == scheme::type::FILE) {
145 // Set state to file state.
147 }
148 // Otherwise, if url is special, base is non-null, and base's scheme
149 // is url's scheme: Note: Doing base_url->scheme is unsafe if base_url
150 // != nullptr is false.
151 else if (url.is_special() && base_url != nullptr &&
152 base_url->type == url.type) {
153 // Set state to special relative or authority state.
155 }
156 // Otherwise, if url is special, set state to special authority
157 // slashes state.
158 else if (url.is_special()) {
160 }
161 // Otherwise, if remaining starts with an U+002F (/), set state to
162 // path or authority state and increase pointer by 1.
163 else if (input_position + 1 < input_size &&
164 url_data[input_position + 1] == '/') {
166 input_position++;
167 }
168 // Otherwise, set url's path to the empty string and set state to
169 // opaque path state.
170 else {
172 }
173 }
174 // Otherwise, if state override is not given, set buffer to the empty
175 // string, state to no scheme state, and start over (from the first code
176 // point in input).
177 else {
179 input_position = 0;
180 break;
181 }
182 input_position++;
183 break;
184 }
185 case state::NO_SCHEME: {
186 ada_log("NO_SCHEME ", helpers::substring(url_data, input_position));
187 // If base is null, or base has an opaque path and c is not U+0023 (#),
188 // validation error, return failure.
189 if (base_url == nullptr ||
190 (base_url->has_opaque_path && !fragment.has_value())) {
191 ada_log("NO_SCHEME validation error");
192 url.is_valid = false;
193 return url;
194 }
195 // Otherwise, if base has an opaque path and c is U+0023 (#),
196 // set url's scheme to base's scheme, url's path to base's path, url's
197 // query to base's query, and set state to fragment state.
198 else if (base_url->has_opaque_path && fragment.has_value() &&
199 input_position == input_size) {
200 ada_log("NO_SCHEME opaque base with fragment");
201 url.copy_scheme(*base_url);
202 url.has_opaque_path = base_url->has_opaque_path;
203
204 if constexpr (result_type_is_ada_url) {
205 url.path = base_url->path;
206 url.query = base_url->query;
207 } else {
208 url.update_base_pathname(base_url->get_pathname());
209 url.update_base_search(base_url->get_search());
210 }
211 url.update_unencoded_base_hash(*fragment);
212 return url;
213 }
214 // Otherwise, if base's scheme is not "file", set state to relative
215 // state and decrease pointer by 1.
216 else if (base_url->type != scheme::type::FILE) {
217 ada_log("NO_SCHEME non-file relative path");
219 }
220 // Otherwise, set state to file state and decrease pointer by 1.
221 else {
222 ada_log("NO_SCHEME file base type");
224 }
225 break;
226 }
227 case state::AUTHORITY: {
228 ada_log("AUTHORITY ", helpers::substring(url_data, input_position));
229 // most URLs have no @. Having no @ tells us that we don't have to worry
230 // about AUTHORITY. Of course, we could have @ and still not have to
231 // worry about AUTHORITY.
232 // TODO: Instead of just collecting a bool, collect the location of the
233 // '@' and do something useful with it.
234 // TODO: We could do various processing early on, using a single pass
235 // over the string to collect information about it, e.g., telling us
236 // whether there is a @ and if so, where (or how many).
237
238 // Check if url data contains an @.
239 if (url_data.find('@', input_position) == std::string_view::npos) {
241 break;
242 }
243 bool at_sign_seen{false};
244 bool password_token_seen{false};
250 do {
251 std::string_view view = url_data.substr(input_position);
252 // The delimiters are @, /, ? \\.
253 size_t location =
254 url.is_special() ? helpers::find_authority_delimiter_special(view)
255 : helpers::find_authority_delimiter(view);
256 std::string_view authority_view = view.substr(0, location);
257 size_t end_of_authority = input_position + authority_view.size();
258 // If c is U+0040 (@), then:
259 if ((end_of_authority != input_size) &&
260 (url_data[end_of_authority] == '@')) {
261 // If atSignSeen is true, then prepend "%40" to buffer.
262 if (at_sign_seen) {
263 if (password_token_seen) {
264 if constexpr (result_type_is_ada_url) {
265 url.password += "%40";
266 } else {
267 url.append_base_password("%40");
268 }
269 } else {
270 if constexpr (result_type_is_ada_url) {
271 url.username += "%40";
272 } else {
273 url.append_base_username("%40");
274 }
275 }
276 }
277
278 at_sign_seen = true;
279
280 if (!password_token_seen) {
281 size_t password_token_location = authority_view.find(':');
282 password_token_seen =
283 password_token_location != std::string_view::npos;
284
285 if constexpr (store_values) {
286 if (!password_token_seen) {
287 if constexpr (result_type_is_ada_url) {
288 url.username += unicode::percent_encode(
289 authority_view,
291 } else {
292 url.append_base_username(unicode::percent_encode(
293 authority_view,
295 }
296 } else {
297 if constexpr (result_type_is_ada_url) {
298 url.username += unicode::percent_encode(
299 authority_view.substr(0, password_token_location),
301 url.password += unicode::percent_encode(
302 authority_view.substr(password_token_location + 1),
304 } else {
305 url.append_base_username(unicode::percent_encode(
306 authority_view.substr(0, password_token_location),
308 url.append_base_password(unicode::percent_encode(
309 authority_view.substr(password_token_location + 1),
311 }
312 }
313 }
314 } else if constexpr (store_values) {
315 if constexpr (result_type_is_ada_url) {
316 url.password += unicode::percent_encode(
318 } else {
319 url.append_base_password(unicode::percent_encode(
321 }
322 }
323 }
324 // Otherwise, if one of the following is true:
325 // - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
326 // - url is special and c is U+005C (\)
327 else if (end_of_authority == input_size ||
328 url_data[end_of_authority] == '/' ||
329 url_data[end_of_authority] == '?' ||
330 (url.is_special() && url_data[end_of_authority] == '\\')) {
331 // If atSignSeen is true and authority_view is the empty string,
332 // validation error, return failure.
333 if (at_sign_seen && authority_view.empty()) {
334 url.is_valid = false;
335 return url;
336 }
338 break;
339 }
340 if (end_of_authority == input_size) {
341 if constexpr (store_values) {
342 if (fragment.has_value()) {
343 url.update_unencoded_base_hash(*fragment);
344 }
345 }
346 return url;
347 }
348 input_position = end_of_authority + 1;
349 } while (true);
350
351 break;
352 }
354 ada_log("SPECIAL_RELATIVE_OR_AUTHORITY ",
355 helpers::substring(url_data, input_position));
356
357 // If c is U+002F (/) and remaining starts with U+002F (/),
358 // then set state to special authority ignore slashes state and increase
359 // pointer by 1.
360 if (url_data.substr(input_position, 2) == "//") {
362 input_position += 2;
363 } else {
364 // Otherwise, validation error, set state to relative state and
365 // decrease pointer by 1.
367 }
368
369 break;
370 }
372 ada_log("PATH_OR_AUTHORITY ",
373 helpers::substring(url_data, input_position));
374
375 // If c is U+002F (/), then set state to authority state.
376 if ((input_position != input_size) &&
377 (url_data[input_position] == '/')) {
379 input_position++;
380 } else {
381 // Otherwise, set state to path state, and decrease pointer by 1.
383 }
384
385 break;
386 }
388 ada_log("RELATIVE_SCHEME ",
389 helpers::substring(url_data, input_position));
390
391 // Set url's scheme to base's scheme.
392 url.copy_scheme(*base_url);
393
394 // If c is U+002F (/), then set state to relative slash state.
395 if ((input_position != input_size) &&
396 (url_data[input_position] == '/')) {
397 ada_log(
398 "RELATIVE_SCHEME if c is U+002F (/), then set state to relative "
399 "slash state");
401 } else if (url.is_special() && (input_position != input_size) &&
402 (url_data[input_position] == '\\')) {
403 // Otherwise, if url is special and c is U+005C (\), validation error,
404 // set state to relative slash state.
405 ada_log(
406 "RELATIVE_SCHEME if url is special and c is U+005C, validation "
407 "error, set state to relative slash state");
409 } else {
410 ada_log("RELATIVE_SCHEME otherwise");
411 // Set url's username to base's username, url's password to base's
412 // password, url's host to base's host, url's port to base's port,
413 // url's path to a clone of base's path, and url's query to base's
414 // query.
415 if constexpr (result_type_is_ada_url) {
416 url.username = base_url->username;
417 url.password = base_url->password;
418 url.host = base_url->host;
419 url.port = base_url->port;
420 // cloning the base path includes cloning the has_opaque_path flag
421 url.has_opaque_path = base_url->has_opaque_path;
422 url.path = base_url->path;
423 url.query = base_url->query;
424 } else {
425 url.update_base_authority(base_url->get_href(),
426 base_url->get_components());
427 url.update_host_to_base_host(base_url->get_hostname());
428 url.update_base_port(base_url->retrieve_base_port());
429 // cloning the base path includes cloning the has_opaque_path flag
430 url.has_opaque_path = base_url->has_opaque_path;
431 url.update_base_pathname(base_url->get_pathname());
432 url.update_base_search(base_url->get_search());
433 }
434
435 url.has_opaque_path = base_url->has_opaque_path;
436
437 // If c is U+003F (?), then set url's query to the empty string, and
438 // state to query state.
439 if ((input_position != input_size) &&
440 (url_data[input_position] == '?')) {
442 }
443 // Otherwise, if c is not the EOF code point:
444 else if (input_position != input_size) {
445 // Set url's query to null.
446 url.clear_search();
447 if constexpr (result_type_is_ada_url) {
448 // Shorten url's path.
449 helpers::shorten_path(url.path, url.type);
450 } else {
451 std::string_view path = url.get_pathname();
452 if (helpers::shorten_path(path, url.type)) {
453 url.update_base_pathname(std::move(std::string(path)));
454 }
455 }
456 // Set state to path state and decrease pointer by 1.
458 break;
459 }
460 }
461 input_position++;
462 break;
463 }
465 ada_log("RELATIVE_SLASH ",
466 helpers::substring(url_data, input_position));
467
468 // If url is special and c is U+002F (/) or U+005C (\), then:
469 if (url.is_special() && (input_position != input_size) &&
470 (url_data[input_position] == '/' ||
471 url_data[input_position] == '\\')) {
472 // Set state to special authority ignore slashes state.
474 }
475 // Otherwise, if c is U+002F (/), then set state to authority state.
476 else if ((input_position != input_size) &&
477 (url_data[input_position] == '/')) {
479 }
480 // Otherwise, set
481 // - url's username to base's username,
482 // - url's password to base's password,
483 // - url's host to base's host,
484 // - url's port to base's port,
485 // - state to path state, and then, decrease pointer by 1.
486 else {
487 if constexpr (result_type_is_ada_url) {
488 url.username = base_url->username;
489 url.password = base_url->password;
490 url.host = base_url->host;
491 url.port = base_url->port;
492 } else {
493 url.update_base_authority(base_url->get_href(),
494 base_url->get_components());
495 url.update_host_to_base_host(base_url->get_hostname());
496 url.update_base_port(base_url->retrieve_base_port());
497 }
499 break;
500 }
501
502 input_position++;
503 break;
504 }
506 ada_log("SPECIAL_AUTHORITY_SLASHES ",
507 helpers::substring(url_data, input_position));
508
509 // If c is U+002F (/) and remaining starts with U+002F (/),
510 // then set state to special authority ignore slashes state and increase
511 // pointer by 1.
512 if (url_data.substr(input_position, 2) == "//") {
513 input_position += 2;
514 }
515
516 [[fallthrough]];
517 }
519 ada_log("SPECIAL_AUTHORITY_IGNORE_SLASHES ",
520 helpers::substring(url_data, input_position));
521
522 // If c is neither U+002F (/) nor U+005C (\), then set state to
523 // authority state and decrease pointer by 1.
524 while ((input_position != input_size) &&
525 ((url_data[input_position] == '/') ||
526 (url_data[input_position] == '\\'))) {
527 input_position++;
528 }
530
531 break;
532 }
533 case state::QUERY: {
534 ada_log("QUERY ", helpers::substring(url_data, input_position));
535 if constexpr (store_values) {
536 // Let queryPercentEncodeSet be the special-query percent-encode set
537 // if url is special; otherwise the query percent-encode set.
538 const uint8_t* query_percent_encode_set =
541
542 // Percent-encode after encoding, with encoding, buffer, and
543 // queryPercentEncodeSet, and append the result to url's query.
544 url.update_base_search(url_data.substr(input_position),
545 query_percent_encode_set);
546 ada_log("QUERY update_base_search completed ");
547 if (fragment.has_value()) {
548 url.update_unencoded_base_hash(*fragment);
549 }
550 }
551 return url;
552 }
553 case state::HOST: {
554 ada_log("HOST ", helpers::substring(url_data, input_position));
555
556 std::string_view host_view = url_data.substr(input_position);
557 auto [location, found_colon] =
558 helpers::get_host_delimiter_location(url.is_special(), host_view);
559 input_position = (location != std::string_view::npos)
560 ? input_position + location
561 : input_size;
562 // Otherwise, if c is U+003A (:) and insideBrackets is false, then:
563 // Note: the 'found_colon' value is true if and only if a colon was
564 // encountered while not inside brackets.
565 if (found_colon) {
566 // If buffer is the empty string, validation error, return failure.
567 // Let host be the result of host parsing buffer with url is not
568 // special.
569 ada_log("HOST parsing ", host_view);
570 if (!url.parse_host(host_view)) {
571 return url;
572 }
573 ada_log("HOST parsing results in ", url.get_hostname());
574 // Set url's host to host, buffer to the empty string, and state to
575 // port state.
577 input_position++;
578 }
579 // Otherwise, if one of the following is true:
580 // - c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
581 // - url is special and c is U+005C (\)
582 // The get_host_delimiter_location function either brings us to
583 // the colon outside of the bracket, or to one of those characters.
584 else {
585 // If url is special and host_view is the empty string, validation
586 // error, return failure.
587 if (host_view.empty() && url.is_special()) {
588 url.is_valid = false;
589 return url;
590 }
591 ada_log("HOST parsing ", host_view, " href=", url.get_href());
592 // Let host be the result of host parsing host_view with url is not
593 // special.
594 if (host_view.empty()) {
595 url.update_base_hostname("");
596 } else if (!url.parse_host(host_view)) {
597 return url;
598 }
599 ada_log("HOST parsing results in ", url.get_hostname(),
600 " href=", url.get_href());
601
602 // Set url's host to host, and state to path start state.
604 }
605
606 break;
607 }
608 case state::OPAQUE_PATH: {
609 ada_log("OPAQUE_PATH ", helpers::substring(url_data, input_position));
610 std::string_view view = url_data.substr(input_position);
611 // If c is U+003F (?), then set url's query to the empty string and
612 // state to query state.
613 size_t location = view.find('?');
614 if (location != std::string_view::npos) {
615 view.remove_suffix(view.size() - location);
617 input_position += location + 1;
618 } else {
619 input_position = input_size + 1;
620 }
621 url.has_opaque_path = true;
622 // This is a really unlikely scenario in the real world. We should not seek
623 // to optimize it.
624 url.update_base_pathname(unicode::percent_encode(
626 break;
627 }
628 case state::PORT: {
629 ada_log("PORT ", helpers::substring(url_data, input_position));
630 std::string_view port_view = url_data.substr(input_position);
631 input_position += url.parse_port(port_view, true);
632 if (!url.is_valid) {
633 return url;
634 }
636 [[fallthrough]];
637 }
638 case state::PATH_START: {
639 ada_log("PATH_START ", helpers::substring(url_data, input_position));
640
641 // If url is special, then:
642 if (url.is_special()) {
643 // Set state to path state.
645
646 // Optimization: Avoiding going into PATH state improves the
647 // performance of urls ending with /.
648 if (input_position == input_size) {
649 if constexpr (store_values) {
650 url.update_base_pathname("/");
651 if (fragment.has_value()) {
652 url.update_unencoded_base_hash(*fragment);
653 }
654 }
655 return url;
656 }
657 // If c is neither U+002F (/) nor U+005C (\), then decrease pointer
658 // by 1. We know that (input_position == input_size) is impossible
659 // here, because of the previous if-check.
660 if ((url_data[input_position] != '/') &&
661 (url_data[input_position] != '\\')) {
662 break;
663 }
664 }
665 // Otherwise, if state override is not given and c is U+003F (?),
666 // set url's query to the empty string and state to query state.
667 else if ((input_position != input_size) &&
668 (url_data[input_position] == '?')) {
670 }
671 // Otherwise, if c is not the EOF code point:
672 else if (input_position != input_size) {
673 // Set state to path state.
675
676 // If c is not U+002F (/), then decrease pointer by 1.
677 if (url_data[input_position] != '/') {
678 break;
679 }
680 }
681
682 input_position++;
683 break;
684 }
685 case state::PATH: {
686 ada_log("PATH ", helpers::substring(url_data, input_position));
687 std::string_view view = url_data.substr(input_position);
688
689 // Most of the time, we do not need percent encoding.
690 // Furthermore, we can immediately locate the '?'.
691 size_t locofquestionmark = view.find('?');
692 if (locofquestionmark != std::string_view::npos) {
694 view.remove_suffix(view.size() - locofquestionmark);
695 input_position += locofquestionmark + 1;
696 } else {
697 input_position = input_size + 1;
698 }
699 if constexpr (store_values) {
700 if constexpr (result_type_is_ada_url) {
701 helpers::parse_prepared_path(view, url.type, url.path);
702 } else {
703 url.consume_prepared_path(view);
704 ADA_ASSERT_TRUE(url.validate());
705 }
706 }
707 break;
708 }
709 case state::FILE_SLASH: {
710 ada_log("FILE_SLASH ", helpers::substring(url_data, input_position));
711
712 // If c is U+002F (/) or U+005C (\), then:
713 if ((input_position != input_size) &&
714 (url_data[input_position] == '/' ||
715 url_data[input_position] == '\\')) {
716 ada_log("FILE_SLASH c is U+002F or U+005C");
717 // Set state to file host state.
719 input_position++;
720 } else {
721 ada_log("FILE_SLASH otherwise");
722 // If base is non-null and base's scheme is "file", then:
723 // Note: it is unsafe to do base_url->scheme unless you know that
724 // base_url_has_value() is true.
725 if (base_url != nullptr && base_url->type == scheme::type::FILE) {
726 // Set url's host to base's host.
727 if constexpr (result_type_is_ada_url) {
728 url.host = base_url->host;
729 } else {
730 url.update_host_to_base_host(base_url->get_host());
731 }
732 // If the code point substring from pointer to the end of input does
733 // not start with a Windows drive letter and base's path[0] is a
734 // normalized Windows drive letter, then append base's path[0] to
735 // url's path.
736 if (!base_url->get_pathname().empty()) {
738 url_data.substr(input_position))) {
739 std::string_view first_base_url_path =
740 base_url->get_pathname().substr(1);
741 size_t loc = first_base_url_path.find('/');
742 if (loc != std::string_view::npos) {
743 helpers::resize(first_base_url_path, loc);
744 }
746 first_base_url_path)) {
747 if constexpr (result_type_is_ada_url) {
748 url.path += '/';
749 url.path += first_base_url_path;
750 } else {
751 url.append_base_pathname(
752 helpers::concat("/", first_base_url_path));
753 }
754 }
755 }
756 }
757 }
758
759 // Set state to path state, and decrease pointer by 1.
761 }
762
763 break;
764 }
765 case state::FILE_HOST: {
766 ada_log("FILE_HOST ", helpers::substring(url_data, input_position));
767 std::string_view view = url_data.substr(input_position);
768
769 size_t location = view.find_first_of("/\\?");
770 std::string_view file_host_buffer(
771 view.data(),
772 (location != std::string_view::npos) ? location : view.size());
773
774 if (checkers::is_windows_drive_letter(file_host_buffer)) {
776 } else if (file_host_buffer.empty()) {
777 // Set url's host to the empty string.
778 if constexpr (result_type_is_ada_url) {
779 url.host = "";
780 } else {
781 url.update_base_hostname("");
782 }
783 // Set state to path start state.
785 } else {
786 size_t consumed_bytes = file_host_buffer.size();
787 input_position += consumed_bytes;
788 // Let host be the result of host parsing buffer with url is not
789 // special.
790 if (!url.parse_host(file_host_buffer)) {
791 return url;
792 }
793
794 if constexpr (result_type_is_ada_url) {
795 // If host is "localhost", then set host to the empty string.
796 if (url.host.has_value() && url.host.value() == "localhost") {
797 url.host = "";
798 }
799 } else {
800 if (url.get_hostname() == "localhost") {
801 url.update_base_hostname("");
802 }
803 }
804
805 // Set buffer to the empty string and state to path start state.
807 }
808
809 break;
810 }
811 case state::FILE: {
812 ada_log("FILE ", helpers::substring(url_data, input_position));
813 std::string_view file_view = url_data.substr(input_position);
814
815 url.set_protocol_as_file();
816 if constexpr (result_type_is_ada_url) {
817 // Set url's host to the empty string.
818 url.host = "";
819 } else {
820 url.update_base_hostname("");
821 }
822 // If c is U+002F (/) or U+005C (\), then:
823 if (input_position != input_size &&
824 (url_data[input_position] == '/' ||
825 url_data[input_position] == '\\')) {
826 ada_log("FILE c is U+002F or U+005C");
827 // Set state to file slash state.
829 }
830 // Otherwise, if base is non-null and base's scheme is "file":
831 else if (base_url != nullptr && base_url->type == scheme::type::FILE) {
832 // Set url's host to base's host, url's path to a clone of base's
833 // path, and url's query to base's query.
834 ada_log("FILE base non-null");
835 if constexpr (result_type_is_ada_url) {
836 url.host = base_url->host;
837 url.path = base_url->path;
838 url.query = base_url->query;
839 } else {
840 url.update_host_to_base_host(base_url->get_hostname());
841 url.update_base_pathname(base_url->get_pathname());
842 url.update_base_search(base_url->get_search());
843 }
844 url.has_opaque_path = base_url->has_opaque_path;
845
846 // If c is U+003F (?), then set url's query to the empty string and
847 // state to query state.
848 if (input_position != input_size && url_data[input_position] == '?') {
850 }
851 // Otherwise, if c is not the EOF code point:
852 else if (input_position != input_size) {
853 // Set url's query to null.
854 url.clear_search();
855 // If the code point substring from pointer to the end of input does
856 // not start with a Windows drive letter, then shorten url's path.
857 if (!checkers::is_windows_drive_letter(file_view)) {
858 if constexpr (result_type_is_ada_url) {
859 helpers::shorten_path(url.path, url.type);
860 } else {
861 std::string_view path = url.get_pathname();
862 if (helpers::shorten_path(path, url.type)) {
863 url.update_base_pathname(std::move(std::string(path)));
864 }
865 }
866 }
867 // Otherwise:
868 else {
869 // Set url's path to an empty list.
870 url.clear_pathname();
871 url.has_opaque_path = true;
872 }
873
874 // Set state to path state and decrease pointer by 1.
876 break;
877 }
878 }
879 // Otherwise, set state to path state, and decrease pointer by 1.
880 else {
881 ada_log("FILE go to path");
883 break;
884 }
885
886 input_position++;
887 break;
888 }
889 default:
890 unreachable();
891 }
892 }
893 if constexpr (store_values) {
894 if (fragment.has_value()) {
895 url.update_unencoded_base_hash(*fragment);
896 }
897 }
898 return url;
899}
900
901template url parse_url_impl(std::string_view user_input,
902 const url* base_url = nullptr);
904 std::string_view user_input, const url_aggregator* base_url = nullptr);
905
906template <class result_type>
907result_type parse_url(std::string_view user_input,
908 const result_type* base_url) {
909 return parse_url_impl<result_type, true>(user_input, base_url);
910}
911
912template url parse_url<url>(std::string_view user_input,
913 const url* base_url = nullptr);
915 std::string_view user_input, const url_aggregator* base_url = nullptr);
916} // namespace ada::parser
Definitions of the character sets used by unicode functions.
Common definitions for cross-platform compiler support.
#define ADA_ASSERT_TRUE(COND)
constexpr uint8_t QUERY_PERCENT_ENCODE[32]
constexpr uint8_t SPECIAL_QUERY_PERCENT_ENCODE[32]
constexpr uint8_t C0_CONTROL_PERCENT_ENCODE[32]
constexpr uint8_t USERINFO_PERCENT_ENCODE[32]
constexpr bool is_normalized_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_windows_drive_letter(std::string_view input) noexcept
constexpr bool is_alpha(char x) noexcept
Includes the definitions for supported parsers.
Definition parser-inl.h:16
template url parse_url< url >(std::string_view user_input, const url *base_url)
result_type parse_url(std::string_view user_input, const result_type *base_url=nullptr)
Definition parser.cpp:907
template url_aggregator parse_url< url_aggregator >(std::string_view user_input, const url_aggregator *base_url)
result_type parse_url_impl(std::string_view user_input, const result_type *base_url=nullptr)
Definition parser.cpp:13
ada_warn_unused std::string to_string(encoding_type type)
state
Definition state.h:17
@ SPECIAL_RELATIVE_OR_AUTHORITY
Definition state.h:91
@ FILE_SLASH
Definition state.h:71
@ SCHEME
Definition state.h:31
@ QUERY
Definition state.h:96
@ SPECIAL_AUTHORITY_SLASHES
Definition state.h:86
@ FILE_HOST
Definition state.h:66
@ OPAQUE_PATH
Definition state.h:111
@ RELATIVE_SLASH
Definition state.h:56
@ NO_SCHEME
Definition state.h:41
@ PATH_START
Definition state.h:106
@ RELATIVE_SCHEME
Definition state.h:51
@ SPECIAL_AUTHORITY_IGNORE_SLASHES
Definition state.h:81
@ SCHEME_START
Definition state.h:26
@ AUTHORITY
Definition state.h:21
@ PATH_OR_AUTHORITY
Definition state.h:76
void unreachable()
Lightweight URL struct.
ada_really_inline constexpr bool is_special() const noexcept
bool is_valid
Definition url_base.h:50
bool has_opaque_path
Definition url_base.h:55
Generic URL struct reliant on std::string instantiation.
Definition url.h:45
ada_really_inline std::string get_href() const noexcept
Definition url-inl.h:187
constexpr std::string_view get_pathname() const noexcept
Definition url-inl.h:46
std::string get_hostname() const noexcept
Definition url.cpp:637
std::string get_protocol() const noexcept
Definition url.cpp:615
Definitions for all unicode specific functions.