Ada 3.1.0
Fast spec-compliant URL parser
Loading...
Searching...
No Matches
url_pattern_helpers.h
Go to the documentation of this file.
1
5#ifndef ADA_URL_PATTERN_HELPERS_H
6#define ADA_URL_PATTERN_HELPERS_H
7
8#include "ada/expected.h"
9#include "ada/common_defs.h"
10#include "ada/url_pattern.h"
11
12#include <string>
13#include <tuple>
14#include <vector>
15
// Forward declaration of the ada::errors enum (underlying type uint8_t). It is
// used below as the error type of the tl::expected / std::optional results.
16namespace ada {
17enum class errors : uint8_t;
18}
19
21
22// @see https://urlpattern.spec.whatwg.org/#token
// Kind of a token; stored in a single byte (underlying type uint8_t).
// NOTE(review): the trailing numbers start at 1 and jump from 5 to 9, so some
// enumerators are presumably not shown in this listing. Confirm against the
// original header before relying on the numeric values.
23enum class token_type : uint8_t {
25 OPEN, // 1
26 CLOSE, // 2
27 REGEXP, // 3
28 NAME, // 4
29 CHAR, // 5
33 END, // 9
34};
35
// Declared only when ADA_TESTING is defined: returns a std::string for the
// given token_type.
36#ifdef ADA_TESTING
37std::string to_string(token_type type);
38#endif // ADA_TESTING
39
40// @see https://urlpattern.spec.whatwg.org/#tokenize-policy
// Policy value handed to Tokenizer and tokenize().
// NOTE(review): no enumerators are visible in this listing; the Tokenizer
// member comment mentions "strict" as the initial policy. Confirm the full
// enumerator list against the original header.
41enum class token_policy {
44};
45
46// @see https://urlpattern.spec.whatwg.org/#tokens
// A single token: a type, the index where it starts in the pattern string,
// and the text it covers.
47class token {
48 public:
  // Builds a token from its three components. `_value` is taken by rvalue
  // reference and moved into the `value` member.
49 token(token_type _type, size_t _index, std::string&& _value)
50 : type(_type), index(_index), value(std::move(_value)) {}
51
52 // A token has an associated type, a string, initially "invalid-char".
  // NOTE(review): the `type` member declaration is not visible in this
  // listing, although the constructor initializes it.
54
55 // A token has an associated index, a number, initially 0. It is the position
56 // of the first code point in the pattern string represented by the token.
57 size_t index = 0;
58
59 // A token has an associated value, a string, initially the empty string. It
60 // contains the code points from the pattern string represented by the token.
61 std::string value{};
62};
63
64// @see https://urlpattern.spec.whatwg.org/#pattern-parser
// Pattern parser state: a token list, the parts produced so far, a pending
// fixed value and the current position in the token list. F is the type of the
// encoding callback and must satisfy url_pattern_encoding_callback.
65template <url_pattern_encoding_callback F>
67 public:
  // Initializes the `encoding_callback` and `segment_wildcard_regexp` members
  // from the arguments.
  // NOTE(review): the declarations of those two members are not visible in
  // this listing, so whether they own or merely refer to the arguments is
  // unconfirmed.
68 url_pattern_parser(F& encoding_callback_,
69 std::string_view segment_wildcard_regexp_)
70 : encoding_callback(encoding_callback_),
71 segment_wildcard_regexp(segment_wildcard_regexp_) {}
72
  // True while `index` is still a valid position inside `tokens`.
73 bool can_continue() const { return index < tokens.size(); }
74
75 // @see https://urlpattern.spec.whatwg.org/#try-to-consume-a-token
77 // @see https://urlpattern.spec.whatwg.org/#try-to-consume-a-modifier-token
79 // @see
80 // https://urlpattern.spec.whatwg.org/#try-to-consume-a-regexp-or-wildcard-token
82 // @see https://urlpattern.spec.whatwg.org/#consume-text
83 std::string consume_text();
84 // @see https://urlpattern.spec.whatwg.org/#consume-a-required-token
86 // @see
87 // https://urlpattern.spec.whatwg.org/#maybe-add-a-part-from-the-pending-fixed-value
90 // @see https://urlpattern.spec.whatwg.org/#add-a-part
  // Takes the prefix and suffix text plus the name, regexp-or-wildcard and
  // modifier tokens of one part. Returns an error value on failure and an
  // empty optional otherwise; the result is tagged ada_warn_unused.
  // NOTE(review): whether the token pointers may be null is not visible here.
91 std::optional<errors> add_part(std::string_view prefix, token* name_token,
92 token* regexp_or_wildcard_token,
93 std::string_view suffix,
94 token* modifier_token) ada_warn_unused;
95
  // Tokens being consumed by the parser.
96 std::vector<token> tokens{};
  // Parts produced so far.
99 std::vector<url_pattern_part> parts{};
  // Fixed text collected but not yet turned into a part.
100 std::string pending_fixed_value{};
  // Position of the next token to consume; compared against tokens.size() in
  // can_continue().
101 size_t index = 0;
103};
104
105// @see https://urlpattern.spec.whatwg.org/#tokenizer
// Tokenizer state: the input pattern string, the tokenize policy, the tokens
// produced so far, and the current / next positions with the current code
// point.
107 public:
  // Copies `new_input` into the owned `input` string and stores the policy.
108 explicit Tokenizer(std::string_view new_input, token_policy new_policy)
109 : input(new_input), policy(new_policy) {}
110
111 // @see https://urlpattern.spec.whatwg.org/#get-the-next-code-point
112 void get_next_code_point();
113
114 // @see https://urlpattern.spec.whatwg.org/#seek-and-get-the-next-code-point
115 void seek_and_get_next_code_point(size_t index);
116
117 // @see https://urlpattern.spec.whatwg.org/#add-a-token
118
119 void add_token(token_type type, size_t next_position, size_t value_position,
120 size_t value_length);
121
122 // @see https://urlpattern.spec.whatwg.org/#add-a-token-with-default-length
123 void add_token_with_default_length(token_type type, size_t next_position,
124 size_t value_position);
125
126 // @see
127 // https://urlpattern.spec.whatwg.org/#add-a-token-with-default-position-and-length
129
130 // @see https://urlpattern.spec.whatwg.org/#process-a-tokenizing-error
  // Returns an error value on failure and an empty optional otherwise; the
  // result is tagged ada_warn_unused.
131 std::optional<errors> process_tokenizing_error(
132 size_t next_position, size_t value_position) ada_warn_unused;
133
  // tokenize() is a friend so that it can reach the private state below.
134 friend tl::expected<std::vector<token>, errors> tokenize(
135 std::string_view input, token_policy policy);
136
137 private:
138 // A tokenizer has an associated input, a pattern string, initially the
  // empty string.
139 std::string input;
140 // A tokenizer has an associated policy, a tokenize policy, initially
  // "strict".
141 token_policy policy;
142 // A tokenizer has an associated token list, a token list, initially an
  // empty list.
143 std::vector<token> token_list{};
144 // A tokenizer has an associated index, a number, initially 0.
145 size_t index = 0;
146 // A tokenizer has an associated next index, a number, initially 0.
147 size_t next_index = 0;
148 // A tokenizer has an associated code point, a Unicode code point,
  // initially null.
149 char32_t code_point{};
150};
151
152// @see https://urlpattern.spec.whatwg.org/#constructor-string-parser
// Parser state for a URLPattern constructor string: the input text, its token
// list, and the counters and flags that track the current position.
// `regex_provider` must satisfy url_pattern_regex::regex_concept.
153template <url_pattern_regex::regex_concept regex_provider>
  // Copies `new_input` into the owned `input` string and takes ownership of
  // the token list by moving it into `token_list`.
155 explicit constructor_string_parser(std::string_view new_input,
156 std::vector<token>&& new_token_list)
157 : input(new_input), token_list(std::move(new_token_list)) {}
158
159 // @see https://urlpattern.spec.whatwg.org/#rewind
160 void rewind();
161
  // NOTE(review): is_hash_prefix() and is_search_prefix() are non-const while
  // the other is_* predicates below are const. Confirm whether that is
  // intentional.
162 // @see https://urlpattern.spec.whatwg.org/#is-a-hash-prefix
163 bool is_hash_prefix();
164
165 // @see https://urlpattern.spec.whatwg.org/#is-a-search-prefix
166 bool is_search_prefix();
167
168 // @see https://urlpattern.spec.whatwg.org/#parse-a-constructor-string
  // Static entry point: returns a url_pattern_init on success or an errors
  // value on failure.
169 static tl::expected<url_pattern_init, errors> parse(std::string_view input);
170
171 // @see https://urlpattern.spec.whatwg.org/#constructor-string-parser-state
  // NOTE(review): the State type used by change_state() is not visible in
  // this listing.
185
186 // @see https://urlpattern.spec.whatwg.org/#change-state
187 void change_state(State state, size_t skip);
188
189 // @see https://urlpattern.spec.whatwg.org/#is-a-group-open
190 bool is_group_open() const;
191
192 // @see https://urlpattern.spec.whatwg.org/#is-a-group-close
193 bool is_group_close() const;
194
195 // @see https://urlpattern.spec.whatwg.org/#is-a-protocol-suffix
196 bool is_protocol_suffix() const;
197
198 // @see
199 // https://urlpattern.spec.whatwg.org/#compute-protocol-matches-a-special-scheme-flag
200 std::optional<errors> compute_protocol_matches_special_scheme_flag();
201
202 // @see https://urlpattern.spec.whatwg.org/#next-is-authority-slashes
203 bool next_is_authority_slashes() const;
204
205 // @see https://urlpattern.spec.whatwg.org/#is-an-identity-terminator
206 bool is_an_identity_terminator() const;
207
208 // @see https://urlpattern.spec.whatwg.org/#is-a-pathname-start
209 bool is_pathname_start() const;
210
211 // @see https://urlpattern.spec.whatwg.org/#is-a-password-prefix
212 bool is_password_prefix() const;
213
214 // @see https://urlpattern.spec.whatwg.org/#is-an-ipv6-open
215 bool is_an_ipv6_open() const;
216
217 // @see https://urlpattern.spec.whatwg.org/#is-an-ipv6-close
218 bool is_an_ipv6_close() const;
219
220 // @see https://urlpattern.spec.whatwg.org/#is-a-port-prefix
221 bool is_port_prefix() const;
222
223 private:
224 // @see https://urlpattern.spec.whatwg.org/#is-a-non-special-pattern-char
225 bool is_non_special_pattern_char(size_t index, std::string_view value) const;
226
227 // @see https://urlpattern.spec.whatwg.org/#get-a-safe-token
228 const token* get_safe_token(size_t index) const;
229
230 // @see https://urlpattern.spec.whatwg.org/#make-a-component-string
231 std::string make_component_string();
232 // A constructor string parser has an associated input, a string, which
  // must be set upon creation.
233 std::string input;
234 // A constructor string parser has an associated token list, a token list,
235 // which must be set upon creation.
236 std::vector<token> token_list;
237 // A constructor string parser has an associated result, a URLPatternInit,
238 // initially set to a new URLPatternInit.
  // NOTE(review): the `result` member declaration is not visible in this
  // listing.
240 // A constructor string parser has an associated component start, a
  // number, initially set to 0.
241 size_t component_start = 0;
242 // A constructor string parser has an associated token index, a number,
  // initially set to 0.
243 size_t token_index = 0;
244 // A constructor string parser has an associated token increment, a
  // number, initially set to 1.
245 size_t token_increment = 1;
246 // A constructor string parser has an associated group depth, a number,
  // initially set to 0.
247 size_t group_depth = 0;
248 // A constructor string parser has an associated hostname IPv6 bracket
249 // depth, a number, initially set to 0.
250 size_t hostname_ipv6_bracket_depth = 0;
251 // A constructor string parser has an associated protocol matches a special
252 // scheme flag, a boolean, initially set to false.
253 bool protocol_matches_a_special_scheme_flag = false;
254 // A constructor string parser has an associated state, a string, initially
  // set to "init".
  // NOTE(review): the `state` member declaration is not visible in this
  // listing.
256};
257
// Canonicalization helpers, one per URL component. Each takes the component
// text as a string_view and returns either a std::string result or an errors
// value.
258// @see https://urlpattern.spec.whatwg.org/#canonicalize-a-protocol
259tl::expected<std::string, errors> canonicalize_protocol(std::string_view input);
260
261// @see https://urlpattern.spec.whatwg.org/#canonicalize-a-username
262tl::expected<std::string, errors> canonicalize_username(std::string_view input);
263
264// @see https://urlpattern.spec.whatwg.org/#canonicalize-a-password
265tl::expected<std::string, errors> canonicalize_password(std::string_view input);
266
267// @see https://urlpattern.spec.whatwg.org/#canonicalize-a-hostname
268tl::expected<std::string, errors> canonicalize_hostname(std::string_view input);
269
270// @see https://urlpattern.spec.whatwg.org/#canonicalize-an-ipv6-hostname
271tl::expected<std::string, errors> canonicalize_ipv6_hostname(
272 std::string_view input);
273
274// @see https://urlpattern.spec.whatwg.org/#canonicalize-a-port
275tl::expected<std::string, errors> canonicalize_port(std::string_view input);
276
277// @see https://urlpattern.spec.whatwg.org/#canonicalize-a-port
// Same spec algorithm as canonicalize_port, with an additional `protocol`
// argument.
278tl::expected<std::string, errors> canonicalize_port_with_protocol(
279 std::string_view input, std::string_view protocol);
280
281// @see https://urlpattern.spec.whatwg.org/#canonicalize-a-pathname
282tl::expected<std::string, errors> canonicalize_pathname(std::string_view input);
283
284// @see https://urlpattern.spec.whatwg.org/#canonicalize-an-opaque-pathname
285tl::expected<std::string, errors> canonicalize_opaque_pathname(
286 std::string_view input);
287
288// @see https://urlpattern.spec.whatwg.org/#canonicalize-a-search
289tl::expected<std::string, errors> canonicalize_search(std::string_view input);
290
291// @see https://urlpattern.spec.whatwg.org/#canonicalize-a-hash
292tl::expected<std::string, errors> canonicalize_hash(std::string_view input);
293
294// @see https://urlpattern.spec.whatwg.org/#tokenize
// Takes a pattern string and a tokenize policy; returns the token list on
// success or an errors value on failure.
295tl::expected<std::vector<token>, errors> tokenize(std::string_view input,
296 token_policy policy);
297
298// @see https://urlpattern.spec.whatwg.org/#process-a-base-url-string
// NOTE(review): the second line of this declaration is not visible in this
// listing; the declaration as shown is incomplete.
299std::string process_base_url_string(std::string_view input,
301
302// @see https://urlpattern.spec.whatwg.org/#escape-a-pattern-string
// Returns a new std::string built from `input`.
303std::string escape_pattern_string(std::string_view input);
304
305// @see https://urlpattern.spec.whatwg.org/#escape-a-regexp-string
// Returns a new std::string built from `input`.
306std::string escape_regexp_string(std::string_view input);
307
308// @see https://urlpattern.spec.whatwg.org/#is-an-absolute-pathname
// constexpr, noexcept predicate over the pathname text and the process type.
309constexpr bool is_absolute_pathname(
310 std::string_view input, url_pattern_init::process_type type) noexcept;
311
312// @see https://urlpattern.spec.whatwg.org/#parse-a-pattern-string
// Returns the list of parts on success or an errors value on failure. F is the
// type of the encoding callback and must satisfy
// url_pattern_encoding_callback.
313template <url_pattern_encoding_callback F>
314tl::expected<std::vector<url_pattern_part>, errors> parse_pattern_string(
315 std::string_view input, url_pattern_compile_component_options& options,
316 F& encoding_callback);
317
318// @see https://urlpattern.spec.whatwg.org/#generate-a-pattern-string
// NOTE(review): both parameters are non-const references here, whereas the
// regular-expression generator below takes a const reference and a by-value
// options object. Confirm whether this function really mutates its arguments.
319std::string generate_pattern_string(
320 std::vector<url_pattern_part>& part_list,
321 url_pattern_compile_component_options& options);
322
323// @see
324// https://urlpattern.spec.whatwg.org/#generate-a-regular-expression-and-name-list
// Returns a tuple of a string and a list of strings.
// NOTE(review): the line carrying the function name is not visible in this
// listing.
325std::tuple<std::string, std::vector<std::string>>
327 const std::vector<url_pattern_part>& part_list,
328 url_pattern_compile_component_options options);
329
330// @see https://urlpattern.spec.whatwg.org/#hostname-pattern-is-an-ipv6-address
// noexcept predicate over the hostname pattern text.
331bool is_ipv6_address(std::string_view input) noexcept;
332
333// @see
334// https://urlpattern.spec.whatwg.org/#protocol-component-matches-a-special-scheme
// NOTE(review): the declaration following this template header is not visible
// in this listing.
335template <url_pattern_regex::regex_concept regex_provider>
338
339// @see https://urlpattern.spec.whatwg.org/#convert-a-modifier-to-a-string
// NOTE(review): the declaration for this entry is not visible in this listing.
341
342// @see https://urlpattern.spec.whatwg.org/#generate-a-segment-wildcard-regexp
// NOTE(review): the first line of this declaration is not visible in this
// listing; only its parameter line remains.
344 url_pattern_compile_component_options options);
345
346} // namespace ada::url_pattern_helpers
347
348#endif
void add_token_with_default_length(token_type type, size_t next_position, size_t value_position)
void add_token(token_type type, size_t next_position, size_t value_position, size_t value_length)
std::optional< errors > process_tokenizing_error(size_t next_position, size_t value_position) ada_warn_unused
Tokenizer(std::string_view new_input, token_policy new_policy)
friend tl::expected< std::vector< token >, errors > tokenize(std::string_view input, token_policy policy)
token(token_type _type, size_t _index, std::string &&_value)
std::optional< errors > add_part(std::string_view prefix, token *name_token, token *regexp_or_wildcard_token, std::string_view suyffix, token *modifier_token) ada_warn_unused
token * try_consume_regexp_or_wildcard_token(const token *name_token)
url_pattern_parser(F &encoding_callback_, std::string_view segment_wildcard_regexp_)
std::optional< errors > maybe_add_part_from_the_pending_fixed_value() ada_warn_unused
Common definitions for cross-platform compiler support.
#define ada_warn_unused
Definition common_defs.h:85
tl::expected< std::string, errors > canonicalize_opaque_pathname(std::string_view input)
tl::expected< std::string, errors > canonicalize_pathname(std::string_view input)
std::string escape_pattern_string(std::string_view input)
std::string convert_modifier_to_string(url_pattern_part_modifier modifier)
bool protocol_component_matches_special_scheme(url_pattern_component< regex_provider > &component)
tl::expected< std::string, errors > canonicalize_password(std::string_view input)
tl::expected< std::vector< token >, errors > tokenize(std::string_view input, token_policy policy)
std::string generate_segment_wildcard_regexp(url_pattern_compile_component_options options)
tl::expected< std::string, errors > canonicalize_protocol(std::string_view input)
tl::expected< std::vector< url_pattern_part >, errors > parse_pattern_string(std::string_view input, url_pattern_compile_component_options &options, F &encoding_callback)
tl::expected< std::string, errors > canonicalize_hostname(std::string_view input)
std::string generate_pattern_string(std::vector< url_pattern_part > &part_list, url_pattern_compile_component_options &options)
tl::expected< std::string, errors > canonicalize_port_with_protocol(std::string_view input, std::string_view protocol)
std::string escape_regexp_string(std::string_view input)
tl::expected< std::string, errors > canonicalize_hash(std::string_view input)
tl::expected< std::string, errors > canonicalize_port(std::string_view input)
bool is_ipv6_address(std::string_view input) noexcept
tl::expected< std::string, errors > canonicalize_search(std::string_view input)
constexpr bool is_absolute_pathname(std::string_view input, url_pattern_init::process_type type) noexcept
tl::expected< std::string, errors > canonicalize_ipv6_hostname(std::string_view input)
tl::expected< std::string, errors > canonicalize_username(std::string_view input)
std::tuple< std::string, std::vector< std::string > > generate_regular_expression_and_name_list(const std::vector< url_pattern_part > &part_list, url_pattern_compile_component_options options)
std::string process_base_url_string(std::string_view input, url_pattern_init::process_type type)
Definition ada_idna.h:13
url_pattern_part_modifier
Definition url_pattern.h:40
ada_warn_unused std::string to_string(encoding_type type)
errors
Definition errors.h:10
state
Definition state.h:17
tl::expected< result_type, ada::errors > result
constructor_string_parser(std::string_view new_input, std::vector< token > &&new_token_list)
static tl::expected< url_pattern_init, errors > parse(std::string_view input)
Declaration for the URLPattern implementation.