1 | // __ _____ _____ _____ |
2 | // __| | __| | | | JSON for Modern C++ |
3 | // | | |__ | | | | | | version 3.11.3 |
4 | // |_____|_____|_____|_|___| https://github.com/nlohmann/json |
5 | // |
6 | // SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me> |
7 | // SPDX-License-Identifier: MIT |
8 | |
9 | #pragma once |
10 | |
11 | #include <array> // array |
12 | #include <clocale> // localeconv |
13 | #include <cstddef> // size_t |
14 | #include <cstdio> // snprintf |
15 | #include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull |
16 | #include <initializer_list> // initializer_list |
17 | #include <string> // char_traits, string |
18 | #include <utility> // move |
19 | #include <vector> // vector |
20 | |
21 | #include <nlohmann/detail/input/input_adapters.hpp> |
22 | #include <nlohmann/detail/input/position_t.hpp> |
23 | #include <nlohmann/detail/macro_scope.hpp> |
24 | #include <nlohmann/detail/meta/type_traits.hpp> |
25 | |
26 | NLOHMANN_JSON_NAMESPACE_BEGIN |
27 | namespace detail |
28 | { |
29 | |
30 | /////////// |
31 | // lexer // |
32 | /////////// |
33 | |
34 | template<typename BasicJsonType> |
35 | class lexer_base |
36 | { |
37 | public: |
38 | /// token types for the parser |
39 | enum class token_type |
40 | { |
41 | uninitialized, ///< indicating the scanner is uninitialized |
42 | literal_true, ///< the `true` literal |
43 | literal_false, ///< the `false` literal |
44 | literal_null, ///< the `null` literal |
45 | value_string, ///< a string -- use get_string() for actual value |
46 | value_unsigned, ///< an unsigned integer -- use get_number_unsigned() for actual value |
47 | value_integer, ///< a signed integer -- use get_number_integer() for actual value |
48 | value_float, ///< an floating point number -- use get_number_float() for actual value |
49 | begin_array, ///< the character for array begin `[` |
50 | begin_object, ///< the character for object begin `{` |
51 | end_array, ///< the character for array end `]` |
52 | end_object, ///< the character for object end `}` |
53 | name_separator, ///< the name separator `:` |
54 | value_separator, ///< the value separator `,` |
55 | parse_error, ///< indicating a parse error |
56 | end_of_input, ///< indicating the end of the input buffer |
57 | literal_or_value ///< a literal or the begin of a value (only for diagnostics) |
58 | }; |
59 | |
60 | /// return name of values of type token_type (only used for errors) |
61 | JSON_HEDLEY_RETURNS_NON_NULL |
62 | JSON_HEDLEY_CONST |
63 | static const char* token_type_name(const token_type t) noexcept |
64 | { |
65 | switch (t) |
66 | { |
67 | case token_type::uninitialized: |
68 | return "<uninitialized>" ; |
69 | case token_type::literal_true: |
70 | return "true literal" ; |
71 | case token_type::literal_false: |
72 | return "false literal" ; |
73 | case token_type::literal_null: |
74 | return "null literal" ; |
75 | case token_type::value_string: |
76 | return "string literal" ; |
77 | case token_type::value_unsigned: |
78 | case token_type::value_integer: |
79 | case token_type::value_float: |
80 | return "number literal" ; |
81 | case token_type::begin_array: |
82 | return "'['" ; |
83 | case token_type::begin_object: |
84 | return "'{'" ; |
85 | case token_type::end_array: |
86 | return "']'" ; |
87 | case token_type::end_object: |
88 | return "'}'" ; |
89 | case token_type::name_separator: |
90 | return "':'" ; |
91 | case token_type::value_separator: |
92 | return "','" ; |
93 | case token_type::parse_error: |
94 | return "<parse error>" ; |
95 | case token_type::end_of_input: |
96 | return "end of input" ; |
97 | case token_type::literal_or_value: |
98 | return "'[', '{', or a literal" ; |
99 | // LCOV_EXCL_START |
100 | default: // catch non-enum values |
101 | return "unknown token" ; |
102 | // LCOV_EXCL_STOP |
103 | } |
104 | } |
105 | }; |
106 | /*! |
107 | @brief lexical analysis |
108 | |
109 | This class organizes the lexical analysis during JSON deserialization. |
110 | */ |
111 | template<typename BasicJsonType, typename InputAdapterType> |
112 | class lexer : public lexer_base<BasicJsonType> |
113 | { |
114 | using number_integer_t = typename BasicJsonType::number_integer_t; |
115 | using number_unsigned_t = typename BasicJsonType::number_unsigned_t; |
116 | using number_float_t = typename BasicJsonType::number_float_t; |
117 | using string_t = typename BasicJsonType::string_t; |
118 | using char_type = typename InputAdapterType::char_type; |
119 | using char_int_type = typename char_traits<char_type>::int_type; |
120 | |
121 | public: |
122 | using token_type = typename lexer_base<BasicJsonType>::token_type; |
123 | |
124 | explicit lexer(InputAdapterType&& adapter, bool = false) noexcept |
125 | : ia(std::move(adapter)) |
126 | , ignore_comments(ignore_comments_) |
127 | , decimal_point_char(static_cast<char_int_type>(get_decimal_point())) |
128 | {} |
129 | |
130 | // delete because of pointer members |
131 | lexer(const lexer&) = delete; |
132 | lexer(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor) |
133 | lexer& operator=(lexer&) = delete; |
134 | lexer& operator=(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor) |
135 | ~lexer() = default; |
136 | |
137 | private: |
138 | ///////////////////// |
139 | // locales |
140 | ///////////////////// |
141 | |
142 | /// return the locale-dependent decimal point |
143 | JSON_HEDLEY_PURE |
144 | static char get_decimal_point() noexcept |
145 | { |
146 | const auto* loc = localeconv(); |
147 | JSON_ASSERT(loc != nullptr); |
148 | return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point); |
149 | } |
150 | |
151 | ///////////////////// |
152 | // scan functions |
153 | ///////////////////// |
154 | |
155 | /*! |
156 | @brief get codepoint from 4 hex characters following `\u` |
157 | |
158 | For input "\u c1 c2 c3 c4" the codepoint is: |
159 | (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4 |
160 | = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0) |
161 | |
162 | Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f' |
163 | must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The |
164 | conversion is done by subtracting the offset (0x30, 0x37, and 0x57) |
165 | between the ASCII value of the character and the desired integer value. |
166 | |
167 | @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or |
168 | non-hex character) |
169 | */ |
170 | int get_codepoint() |
171 | { |
172 | // this function only makes sense after reading `\u` |
173 | JSON_ASSERT(current == 'u'); |
174 | int codepoint = 0; |
175 | |
176 | const auto factors = { 12u, 8u, 4u, 0u }; |
177 | for (const auto factor : factors) |
178 | { |
179 | get(); |
180 | |
181 | if (current >= '0' && current <= '9') |
182 | { |
183 | codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x30u) << factor); |
184 | } |
185 | else if (current >= 'A' && current <= 'F') |
186 | { |
187 | codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x37u) << factor); |
188 | } |
189 | else if (current >= 'a' && current <= 'f') |
190 | { |
191 | codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x57u) << factor); |
192 | } |
193 | else |
194 | { |
195 | return -1; |
196 | } |
197 | } |
198 | |
199 | JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF); |
200 | return codepoint; |
201 | } |
202 | |
203 | /*! |
204 | @brief check if the next byte(s) are inside a given range |
205 | |
206 | Adds the current byte and, for each passed range, reads a new byte and |
207 | checks if it is inside the range. If a violation was detected, set up an |
208 | error message and return false. Otherwise, return true. |
209 | |
210 | @param[in] ranges list of integers; interpreted as list of pairs of |
211 | inclusive lower and upper bound, respectively |
212 | |
213 | @pre The passed list @a ranges must have 2, 4, or 6 elements; that is, |
214 | 1, 2, or 3 pairs. This precondition is enforced by an assertion. |
215 | |
216 | @return true if and only if no range violation was detected |
217 | */ |
218 | bool next_byte_in_range(std::initializer_list<char_int_type> ranges) |
219 | { |
220 | JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6); |
221 | add(c: current); |
222 | |
223 | for (auto range = ranges.begin(); range != ranges.end(); ++range) |
224 | { |
225 | get(); |
226 | if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) // NOLINT(bugprone-inc-dec-in-conditions) |
227 | { |
228 | add(c: current); |
229 | } |
230 | else |
231 | { |
232 | error_message = "invalid string: ill-formed UTF-8 byte" ; |
233 | return false; |
234 | } |
235 | } |
236 | |
237 | return true; |
238 | } |
239 | |
240 | /*! |
241 | @brief scan a string literal |
242 | |
243 | This function scans a string according to Sect. 7 of RFC 8259. While |
244 | scanning, bytes are unescaped and copied into buffer token_buffer. When the |
245 | function returns successfully, token_buffer is *not* null-terminated (as it |
246 | may contain \0 bytes), and token_buffer.size() is the number of bytes in the |
247 | string. |
248 | |
249 | @return token_type::value_string if string could be successfully scanned, |
250 | token_type::parse_error otherwise |
251 | |
252 | @note In case of errors, variable error_message contains a textual |
253 | description. |
254 | */ |
255 | token_type scan_string() |
256 | { |
257 | // reset token_buffer (ignore opening quote) |
258 | reset(); |
259 | |
260 | // we entered the function by reading an open quote |
261 | JSON_ASSERT(current == '\"'); |
262 | |
263 | while (true) |
264 | { |
265 | // get next character |
266 | switch (get()) |
267 | { |
268 | // end of file while parsing string |
269 | case char_traits<char_type>::eof(): |
270 | { |
271 | error_message = "invalid string: missing closing quote" ; |
272 | return token_type::parse_error; |
273 | } |
274 | |
275 | // closing quote |
276 | case '\"': |
277 | { |
278 | return token_type::value_string; |
279 | } |
280 | |
281 | // escapes |
282 | case '\\': |
283 | { |
284 | switch (get()) |
285 | { |
286 | // quotation mark |
287 | case '\"': |
288 | add(c: '\"'); |
289 | break; |
290 | // reverse solidus |
291 | case '\\': |
292 | add(c: '\\'); |
293 | break; |
294 | // solidus |
295 | case '/': |
296 | add(c: '/'); |
297 | break; |
298 | // backspace |
299 | case 'b': |
300 | add(c: '\b'); |
301 | break; |
302 | // form feed |
303 | case 'f': |
304 | add(c: '\f'); |
305 | break; |
306 | // line feed |
307 | case 'n': |
308 | add(c: '\n'); |
309 | break; |
310 | // carriage return |
311 | case 'r': |
312 | add(c: '\r'); |
313 | break; |
314 | // tab |
315 | case 't': |
316 | add(c: '\t'); |
317 | break; |
318 | |
319 | // unicode escapes |
320 | case 'u': |
321 | { |
322 | const int codepoint1 = get_codepoint(); |
323 | int codepoint = codepoint1; // start with codepoint1 |
324 | |
325 | if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1)) |
326 | { |
327 | error_message = "invalid string: '\\u' must be followed by 4 hex digits" ; |
328 | return token_type::parse_error; |
329 | } |
330 | |
331 | // check if code point is a high surrogate |
332 | if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF) |
333 | { |
334 | // expect next \uxxxx entry |
335 | if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u')) |
336 | { |
337 | const int codepoint2 = get_codepoint(); |
338 | |
339 | if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1)) |
340 | { |
341 | error_message = "invalid string: '\\u' must be followed by 4 hex digits" ; |
342 | return token_type::parse_error; |
343 | } |
344 | |
345 | // check if codepoint2 is a low surrogate |
346 | if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF)) |
347 | { |
348 | // overwrite codepoint |
349 | codepoint = static_cast<int>( |
350 | // high surrogate occupies the most significant 22 bits |
351 | (static_cast<unsigned int>(codepoint1) << 10u) |
352 | // low surrogate occupies the least significant 15 bits |
353 | + static_cast<unsigned int>(codepoint2) |
354 | // there is still the 0xD800, 0xDC00 and 0x10000 noise |
355 | // in the result, so we have to subtract with: |
356 | // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 |
357 | - 0x35FDC00u); |
358 | } |
359 | else |
360 | { |
361 | error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF" ; |
362 | return token_type::parse_error; |
363 | } |
364 | } |
365 | else |
366 | { |
367 | error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF" ; |
368 | return token_type::parse_error; |
369 | } |
370 | } |
371 | else |
372 | { |
373 | if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF)) |
374 | { |
375 | error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF" ; |
376 | return token_type::parse_error; |
377 | } |
378 | } |
379 | |
380 | // result of the above calculation yields a proper codepoint |
381 | JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF); |
382 | |
383 | // translate codepoint into bytes |
384 | if (codepoint < 0x80) |
385 | { |
386 | // 1-byte characters: 0xxxxxxx (ASCII) |
387 | add(c: static_cast<char_int_type>(codepoint)); |
388 | } |
389 | else if (codepoint <= 0x7FF) |
390 | { |
391 | // 2-byte characters: 110xxxxx 10xxxxxx |
392 | add(c: static_cast<char_int_type>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u))); |
393 | add(c: static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu))); |
394 | } |
395 | else if (codepoint <= 0xFFFF) |
396 | { |
397 | // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx |
398 | add(c: static_cast<char_int_type>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u))); |
399 | add(c: static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu))); |
400 | add(c: static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu))); |
401 | } |
402 | else |
403 | { |
404 | // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
405 | add(c: static_cast<char_int_type>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u))); |
406 | add(c: static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu))); |
407 | add(c: static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu))); |
408 | add(c: static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu))); |
409 | } |
410 | |
411 | break; |
412 | } |
413 | |
414 | // other characters after escape |
415 | default: |
416 | error_message = "invalid string: forbidden character after backslash" ; |
417 | return token_type::parse_error; |
418 | } |
419 | |
420 | break; |
421 | } |
422 | |
423 | // invalid control characters |
424 | case 0x00: |
425 | { |
426 | error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000" ; |
427 | return token_type::parse_error; |
428 | } |
429 | |
430 | case 0x01: |
431 | { |
432 | error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001" ; |
433 | return token_type::parse_error; |
434 | } |
435 | |
436 | case 0x02: |
437 | { |
438 | error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002" ; |
439 | return token_type::parse_error; |
440 | } |
441 | |
442 | case 0x03: |
443 | { |
444 | error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003" ; |
445 | return token_type::parse_error; |
446 | } |
447 | |
448 | case 0x04: |
449 | { |
450 | error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004" ; |
451 | return token_type::parse_error; |
452 | } |
453 | |
454 | case 0x05: |
455 | { |
456 | error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005" ; |
457 | return token_type::parse_error; |
458 | } |
459 | |
460 | case 0x06: |
461 | { |
462 | error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006" ; |
463 | return token_type::parse_error; |
464 | } |
465 | |
466 | case 0x07: |
467 | { |
468 | error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007" ; |
469 | return token_type::parse_error; |
470 | } |
471 | |
472 | case 0x08: |
473 | { |
474 | error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b" ; |
475 | return token_type::parse_error; |
476 | } |
477 | |
478 | case 0x09: |
479 | { |
480 | error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t" ; |
481 | return token_type::parse_error; |
482 | } |
483 | |
484 | case 0x0A: |
485 | { |
486 | error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n" ; |
487 | return token_type::parse_error; |
488 | } |
489 | |
490 | case 0x0B: |
491 | { |
492 | error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B" ; |
493 | return token_type::parse_error; |
494 | } |
495 | |
496 | case 0x0C: |
497 | { |
498 | error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f" ; |
499 | return token_type::parse_error; |
500 | } |
501 | |
502 | case 0x0D: |
503 | { |
504 | error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r" ; |
505 | return token_type::parse_error; |
506 | } |
507 | |
508 | case 0x0E: |
509 | { |
510 | error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E" ; |
511 | return token_type::parse_error; |
512 | } |
513 | |
514 | case 0x0F: |
515 | { |
516 | error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F" ; |
517 | return token_type::parse_error; |
518 | } |
519 | |
520 | case 0x10: |
521 | { |
522 | error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010" ; |
523 | return token_type::parse_error; |
524 | } |
525 | |
526 | case 0x11: |
527 | { |
528 | error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011" ; |
529 | return token_type::parse_error; |
530 | } |
531 | |
532 | case 0x12: |
533 | { |
534 | error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012" ; |
535 | return token_type::parse_error; |
536 | } |
537 | |
538 | case 0x13: |
539 | { |
540 | error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013" ; |
541 | return token_type::parse_error; |
542 | } |
543 | |
544 | case 0x14: |
545 | { |
546 | error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014" ; |
547 | return token_type::parse_error; |
548 | } |
549 | |
550 | case 0x15: |
551 | { |
552 | error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015" ; |
553 | return token_type::parse_error; |
554 | } |
555 | |
556 | case 0x16: |
557 | { |
558 | error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016" ; |
559 | return token_type::parse_error; |
560 | } |
561 | |
562 | case 0x17: |
563 | { |
564 | error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017" ; |
565 | return token_type::parse_error; |
566 | } |
567 | |
568 | case 0x18: |
569 | { |
570 | error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018" ; |
571 | return token_type::parse_error; |
572 | } |
573 | |
574 | case 0x19: |
575 | { |
576 | error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019" ; |
577 | return token_type::parse_error; |
578 | } |
579 | |
580 | case 0x1A: |
581 | { |
582 | error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A" ; |
583 | return token_type::parse_error; |
584 | } |
585 | |
586 | case 0x1B: |
587 | { |
588 | error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B" ; |
589 | return token_type::parse_error; |
590 | } |
591 | |
592 | case 0x1C: |
593 | { |
594 | error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C" ; |
595 | return token_type::parse_error; |
596 | } |
597 | |
598 | case 0x1D: |
599 | { |
600 | error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D" ; |
601 | return token_type::parse_error; |
602 | } |
603 | |
604 | case 0x1E: |
605 | { |
606 | error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E" ; |
607 | return token_type::parse_error; |
608 | } |
609 | |
610 | case 0x1F: |
611 | { |
612 | error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F" ; |
613 | return token_type::parse_error; |
614 | } |
615 | |
616 | // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace)) |
617 | case 0x20: |
618 | case 0x21: |
619 | case 0x23: |
620 | case 0x24: |
621 | case 0x25: |
622 | case 0x26: |
623 | case 0x27: |
624 | case 0x28: |
625 | case 0x29: |
626 | case 0x2A: |
627 | case 0x2B: |
628 | case 0x2C: |
629 | case 0x2D: |
630 | case 0x2E: |
631 | case 0x2F: |
632 | case 0x30: |
633 | case 0x31: |
634 | case 0x32: |
635 | case 0x33: |
636 | case 0x34: |
637 | case 0x35: |
638 | case 0x36: |
639 | case 0x37: |
640 | case 0x38: |
641 | case 0x39: |
642 | case 0x3A: |
643 | case 0x3B: |
644 | case 0x3C: |
645 | case 0x3D: |
646 | case 0x3E: |
647 | case 0x3F: |
648 | case 0x40: |
649 | case 0x41: |
650 | case 0x42: |
651 | case 0x43: |
652 | case 0x44: |
653 | case 0x45: |
654 | case 0x46: |
655 | case 0x47: |
656 | case 0x48: |
657 | case 0x49: |
658 | case 0x4A: |
659 | case 0x4B: |
660 | case 0x4C: |
661 | case 0x4D: |
662 | case 0x4E: |
663 | case 0x4F: |
664 | case 0x50: |
665 | case 0x51: |
666 | case 0x52: |
667 | case 0x53: |
668 | case 0x54: |
669 | case 0x55: |
670 | case 0x56: |
671 | case 0x57: |
672 | case 0x58: |
673 | case 0x59: |
674 | case 0x5A: |
675 | case 0x5B: |
676 | case 0x5D: |
677 | case 0x5E: |
678 | case 0x5F: |
679 | case 0x60: |
680 | case 0x61: |
681 | case 0x62: |
682 | case 0x63: |
683 | case 0x64: |
684 | case 0x65: |
685 | case 0x66: |
686 | case 0x67: |
687 | case 0x68: |
688 | case 0x69: |
689 | case 0x6A: |
690 | case 0x6B: |
691 | case 0x6C: |
692 | case 0x6D: |
693 | case 0x6E: |
694 | case 0x6F: |
695 | case 0x70: |
696 | case 0x71: |
697 | case 0x72: |
698 | case 0x73: |
699 | case 0x74: |
700 | case 0x75: |
701 | case 0x76: |
702 | case 0x77: |
703 | case 0x78: |
704 | case 0x79: |
705 | case 0x7A: |
706 | case 0x7B: |
707 | case 0x7C: |
708 | case 0x7D: |
709 | case 0x7E: |
710 | case 0x7F: |
711 | { |
712 | add(c: current); |
713 | break; |
714 | } |
715 | |
716 | // U+0080..U+07FF: bytes C2..DF 80..BF |
717 | case 0xC2: |
718 | case 0xC3: |
719 | case 0xC4: |
720 | case 0xC5: |
721 | case 0xC6: |
722 | case 0xC7: |
723 | case 0xC8: |
724 | case 0xC9: |
725 | case 0xCA: |
726 | case 0xCB: |
727 | case 0xCC: |
728 | case 0xCD: |
729 | case 0xCE: |
730 | case 0xCF: |
731 | case 0xD0: |
732 | case 0xD1: |
733 | case 0xD2: |
734 | case 0xD3: |
735 | case 0xD4: |
736 | case 0xD5: |
737 | case 0xD6: |
738 | case 0xD7: |
739 | case 0xD8: |
740 | case 0xD9: |
741 | case 0xDA: |
742 | case 0xDB: |
743 | case 0xDC: |
744 | case 0xDD: |
745 | case 0xDE: |
746 | case 0xDF: |
747 | { |
748 | if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF}))) |
749 | { |
750 | return token_type::parse_error; |
751 | } |
752 | break; |
753 | } |
754 | |
755 | // U+0800..U+0FFF: bytes E0 A0..BF 80..BF |
756 | case 0xE0: |
757 | { |
758 | if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF})))) |
759 | { |
760 | return token_type::parse_error; |
761 | } |
762 | break; |
763 | } |
764 | |
765 | // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF |
766 | // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF |
767 | case 0xE1: |
768 | case 0xE2: |
769 | case 0xE3: |
770 | case 0xE4: |
771 | case 0xE5: |
772 | case 0xE6: |
773 | case 0xE7: |
774 | case 0xE8: |
775 | case 0xE9: |
776 | case 0xEA: |
777 | case 0xEB: |
778 | case 0xEC: |
779 | case 0xEE: |
780 | case 0xEF: |
781 | { |
782 | if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF})))) |
783 | { |
784 | return token_type::parse_error; |
785 | } |
786 | break; |
787 | } |
788 | |
789 | // U+D000..U+D7FF: bytes ED 80..9F 80..BF |
790 | case 0xED: |
791 | { |
792 | if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF})))) |
793 | { |
794 | return token_type::parse_error; |
795 | } |
796 | break; |
797 | } |
798 | |
799 | // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF |
800 | case 0xF0: |
801 | { |
802 | if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) |
803 | { |
804 | return token_type::parse_error; |
805 | } |
806 | break; |
807 | } |
808 | |
809 | // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF |
810 | case 0xF1: |
811 | case 0xF2: |
812 | case 0xF3: |
813 | { |
814 | if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) |
815 | { |
816 | return token_type::parse_error; |
817 | } |
818 | break; |
819 | } |
820 | |
821 | // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF |
822 | case 0xF4: |
823 | { |
824 | if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF})))) |
825 | { |
826 | return token_type::parse_error; |
827 | } |
828 | break; |
829 | } |
830 | |
831 | // remaining bytes (80..C1 and F5..FF) are ill-formed |
832 | default: |
833 | { |
834 | error_message = "invalid string: ill-formed UTF-8 byte" ; |
835 | return token_type::parse_error; |
836 | } |
837 | } |
838 | } |
839 | } |
840 | |
841 | /*! |
842 | * @brief scan a comment |
843 | * @return whether comment could be scanned successfully |
844 | */ |
845 | bool () |
846 | { |
847 | switch (get()) |
848 | { |
849 | // single-line comments skip input until a newline or EOF is read |
850 | case '/': |
851 | { |
852 | while (true) |
853 | { |
854 | switch (get()) |
855 | { |
856 | case '\n': |
857 | case '\r': |
858 | case char_traits<char_type>::eof(): |
859 | case '\0': |
860 | return true; |
861 | |
862 | default: |
863 | break; |
864 | } |
865 | } |
866 | } |
867 | |
868 | // multi-line comments skip input until */ is read |
869 | case '*': |
870 | { |
871 | while (true) |
872 | { |
873 | switch (get()) |
874 | { |
875 | case char_traits<char_type>::eof(): |
876 | case '\0': |
877 | { |
878 | error_message = "invalid comment; missing closing '*/'" ; |
879 | return false; |
880 | } |
881 | |
882 | case '*': |
883 | { |
884 | switch (get()) |
885 | { |
886 | case '/': |
887 | return true; |
888 | |
889 | default: |
890 | { |
891 | unget(); |
892 | continue; |
893 | } |
894 | } |
895 | } |
896 | |
897 | default: |
898 | continue; |
899 | } |
900 | } |
901 | } |
902 | |
903 | // unexpected character after reading '/' |
904 | default: |
905 | { |
906 | error_message = "invalid comment; expecting '/' or '*' after '/'" ; |
907 | return false; |
908 | } |
909 | } |
910 | } |
911 | |
912 | JSON_HEDLEY_NON_NULL(2) |
913 | static void strtof(float& f, const char* str, char** endptr) noexcept |
914 | { |
915 | f = std::strtof(nptr: str, endptr: endptr); |
916 | } |
917 | |
918 | JSON_HEDLEY_NON_NULL(2) |
919 | static void strtof(double& f, const char* str, char** endptr) noexcept |
920 | { |
921 | f = std::strtod(nptr: str, endptr: endptr); |
922 | } |
923 | |
924 | JSON_HEDLEY_NON_NULL(2) |
925 | static void strtof(long double& f, const char* str, char** endptr) noexcept |
926 | { |
927 | f = std::strtold(nptr: str, endptr: endptr); |
928 | } |
929 | |
930 | /*! |
931 | @brief scan a number literal |
932 | |
933 | This function scans a string according to Sect. 6 of RFC 8259. |
934 | |
935 | The function is realized with a deterministic finite state machine derived |
936 | from the grammar described in RFC 8259. Starting in state "init", the |
937 | input is read and used to determine the next state. Only state "done" |
938 | accepts the number. State "error" is a trap state to model errors. In the |
939 | table below, "anything" means any character but the ones listed before. |
940 | |
941 | state | 0 | 1-9 | e E | + | - | . | anything |
942 | ---------|----------|----------|----------|---------|---------|----------|----------- |
943 | init | zero | any1 | [error] | [error] | minus | [error] | [error] |
944 | minus | zero | any1 | [error] | [error] | [error] | [error] | [error] |
945 | zero | done | done | exponent | done | done | decimal1 | done |
946 | any1 | any1 | any1 | exponent | done | done | decimal1 | done |
947 | decimal1 | decimal2 | decimal2 | [error] | [error] | [error] | [error] | [error] |
948 | decimal2 | decimal2 | decimal2 | exponent | done | done | done | done |
949 | exponent | any2 | any2 | [error] | sign | sign | [error] | [error] |
950 | sign | any2 | any2 | [error] | [error] | [error] | [error] | [error] |
951 | any2 | any2 | any2 | done | done | done | done | done |
952 | |
953 | The state machine is realized with one label per state (prefixed with |
954 | "scan_number_") and `goto` statements between them. The state machine |
955 | contains cycles, but any cycle can be left when EOF is read. Therefore, |
956 | the function is guaranteed to terminate. |
957 | |
958 | During scanning, the read bytes are stored in token_buffer. This string is |
959 | then converted to a signed integer, an unsigned integer, or a |
960 | floating-point number. |
961 | |
962 | @return token_type::value_unsigned, token_type::value_integer, or |
963 | token_type::value_float if number could be successfully scanned, |
964 | token_type::parse_error otherwise |
965 | |
966 | @note The scanner is independent of the current locale. Internally, the |
967 | locale's decimal point is used instead of `.` to work with the |
968 | locale-dependent converters. |
969 | */ |
970 | token_type scan_number() // lgtm [cpp/use-of-goto] |
971 | { |
972 | // reset token_buffer to store the number's bytes |
973 | reset(); |
974 | |
975 | // the type of the parsed number; initially set to unsigned; will be |
976 | // changed if minus sign, decimal point or exponent is read |
977 | token_type number_type = token_type::value_unsigned; |
978 | |
979 | // state (init): we just found out we need to scan a number |
980 | switch (current) |
981 | { |
982 | case '-': |
983 | { |
984 | add(c: current); |
985 | goto scan_number_minus; |
986 | } |
987 | |
988 | case '0': |
989 | { |
990 | add(c: current); |
991 | goto scan_number_zero; |
992 | } |
993 | |
994 | case '1': |
995 | case '2': |
996 | case '3': |
997 | case '4': |
998 | case '5': |
999 | case '6': |
1000 | case '7': |
1001 | case '8': |
1002 | case '9': |
1003 | { |
1004 | add(c: current); |
1005 | goto scan_number_any1; |
1006 | } |
1007 | |
1008 | // all other characters are rejected outside scan_number() |
1009 | default: // LCOV_EXCL_LINE |
1010 | JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE |
1011 | } |
1012 | |
1013 | scan_number_minus: |
1014 | // state: we just parsed a leading minus sign |
1015 | number_type = token_type::value_integer; |
1016 | switch (get()) |
1017 | { |
1018 | case '0': |
1019 | { |
1020 | add(c: current); |
1021 | goto scan_number_zero; |
1022 | } |
1023 | |
1024 | case '1': |
1025 | case '2': |
1026 | case '3': |
1027 | case '4': |
1028 | case '5': |
1029 | case '6': |
1030 | case '7': |
1031 | case '8': |
1032 | case '9': |
1033 | { |
1034 | add(c: current); |
1035 | goto scan_number_any1; |
1036 | } |
1037 | |
1038 | default: |
1039 | { |
1040 | error_message = "invalid number; expected digit after '-'" ; |
1041 | return token_type::parse_error; |
1042 | } |
1043 | } |
1044 | |
1045 | scan_number_zero: |
1046 | // state: we just parse a zero (maybe with a leading minus sign) |
1047 | switch (get()) |
1048 | { |
1049 | case '.': |
1050 | { |
1051 | add(c: decimal_point_char); |
1052 | goto scan_number_decimal1; |
1053 | } |
1054 | |
1055 | case 'e': |
1056 | case 'E': |
1057 | { |
1058 | add(c: current); |
1059 | goto scan_number_exponent; |
1060 | } |
1061 | |
1062 | default: |
1063 | goto scan_number_done; |
1064 | } |
1065 | |
1066 | scan_number_any1: |
1067 | // state: we just parsed a number 0-9 (maybe with a leading minus sign) |
1068 | switch (get()) |
1069 | { |
1070 | case '0': |
1071 | case '1': |
1072 | case '2': |
1073 | case '3': |
1074 | case '4': |
1075 | case '5': |
1076 | case '6': |
1077 | case '7': |
1078 | case '8': |
1079 | case '9': |
1080 | { |
1081 | add(c: current); |
1082 | goto scan_number_any1; |
1083 | } |
1084 | |
1085 | case '.': |
1086 | { |
1087 | add(c: decimal_point_char); |
1088 | goto scan_number_decimal1; |
1089 | } |
1090 | |
1091 | case 'e': |
1092 | case 'E': |
1093 | { |
1094 | add(c: current); |
1095 | goto scan_number_exponent; |
1096 | } |
1097 | |
1098 | default: |
1099 | goto scan_number_done; |
1100 | } |
1101 | |
1102 | scan_number_decimal1: |
1103 | // state: we just parsed a decimal point |
1104 | number_type = token_type::value_float; |
1105 | switch (get()) |
1106 | { |
1107 | case '0': |
1108 | case '1': |
1109 | case '2': |
1110 | case '3': |
1111 | case '4': |
1112 | case '5': |
1113 | case '6': |
1114 | case '7': |
1115 | case '8': |
1116 | case '9': |
1117 | { |
1118 | add(c: current); |
1119 | goto scan_number_decimal2; |
1120 | } |
1121 | |
1122 | default: |
1123 | { |
1124 | error_message = "invalid number; expected digit after '.'" ; |
1125 | return token_type::parse_error; |
1126 | } |
1127 | } |
1128 | |
1129 | scan_number_decimal2: |
1130 | // we just parsed at least one number after a decimal point |
1131 | switch (get()) |
1132 | { |
1133 | case '0': |
1134 | case '1': |
1135 | case '2': |
1136 | case '3': |
1137 | case '4': |
1138 | case '5': |
1139 | case '6': |
1140 | case '7': |
1141 | case '8': |
1142 | case '9': |
1143 | { |
1144 | add(c: current); |
1145 | goto scan_number_decimal2; |
1146 | } |
1147 | |
1148 | case 'e': |
1149 | case 'E': |
1150 | { |
1151 | add(c: current); |
1152 | goto scan_number_exponent; |
1153 | } |
1154 | |
1155 | default: |
1156 | goto scan_number_done; |
1157 | } |
1158 | |
1159 | scan_number_exponent: |
1160 | // we just parsed an exponent |
1161 | number_type = token_type::value_float; |
1162 | switch (get()) |
1163 | { |
1164 | case '+': |
1165 | case '-': |
1166 | { |
1167 | add(c: current); |
1168 | goto scan_number_sign; |
1169 | } |
1170 | |
1171 | case '0': |
1172 | case '1': |
1173 | case '2': |
1174 | case '3': |
1175 | case '4': |
1176 | case '5': |
1177 | case '6': |
1178 | case '7': |
1179 | case '8': |
1180 | case '9': |
1181 | { |
1182 | add(c: current); |
1183 | goto scan_number_any2; |
1184 | } |
1185 | |
1186 | default: |
1187 | { |
1188 | error_message = |
1189 | "invalid number; expected '+', '-', or digit after exponent" ; |
1190 | return token_type::parse_error; |
1191 | } |
1192 | } |
1193 | |
1194 | scan_number_sign: |
1195 | // we just parsed an exponent sign |
1196 | switch (get()) |
1197 | { |
1198 | case '0': |
1199 | case '1': |
1200 | case '2': |
1201 | case '3': |
1202 | case '4': |
1203 | case '5': |
1204 | case '6': |
1205 | case '7': |
1206 | case '8': |
1207 | case '9': |
1208 | { |
1209 | add(c: current); |
1210 | goto scan_number_any2; |
1211 | } |
1212 | |
1213 | default: |
1214 | { |
1215 | error_message = "invalid number; expected digit after exponent sign" ; |
1216 | return token_type::parse_error; |
1217 | } |
1218 | } |
1219 | |
1220 | scan_number_any2: |
1221 | // we just parsed a number after the exponent or exponent sign |
1222 | switch (get()) |
1223 | { |
1224 | case '0': |
1225 | case '1': |
1226 | case '2': |
1227 | case '3': |
1228 | case '4': |
1229 | case '5': |
1230 | case '6': |
1231 | case '7': |
1232 | case '8': |
1233 | case '9': |
1234 | { |
1235 | add(c: current); |
1236 | goto scan_number_any2; |
1237 | } |
1238 | |
1239 | default: |
1240 | goto scan_number_done; |
1241 | } |
1242 | |
1243 | scan_number_done: |
1244 | // unget the character after the number (we only read it to know that |
1245 | // we are done scanning a number) |
1246 | unget(); |
1247 | |
1248 | char* endptr = nullptr; // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg) |
1249 | errno = 0; |
1250 | |
1251 | // try to parse integers first and fall back to floats |
1252 | if (number_type == token_type::value_unsigned) |
1253 | { |
1254 | const auto x = std::strtoull(nptr: token_buffer.data(), endptr: &endptr, base: 10); |
1255 | |
1256 | // we checked the number format before |
1257 | JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); |
1258 | |
1259 | if (errno == 0) |
1260 | { |
1261 | value_unsigned = static_cast<number_unsigned_t>(x); |
1262 | if (value_unsigned == x) |
1263 | { |
1264 | return token_type::value_unsigned; |
1265 | } |
1266 | } |
1267 | } |
1268 | else if (number_type == token_type::value_integer) |
1269 | { |
1270 | const auto x = std::strtoll(nptr: token_buffer.data(), endptr: &endptr, base: 10); |
1271 | |
1272 | // we checked the number format before |
1273 | JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); |
1274 | |
1275 | if (errno == 0) |
1276 | { |
1277 | value_integer = static_cast<number_integer_t>(x); |
1278 | if (value_integer == x) |
1279 | { |
1280 | return token_type::value_integer; |
1281 | } |
1282 | } |
1283 | } |
1284 | |
1285 | // this code is reached if we parse a floating-point number or if an |
1286 | // integer conversion above failed |
1287 | strtof(value_float, token_buffer.data(), &endptr); |
1288 | |
1289 | // we checked the number format before |
1290 | JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); |
1291 | |
1292 | return token_type::value_float; |
1293 | } |
1294 | |
1295 | /*! |
1296 | @param[in] literal_text the literal text to expect |
1297 | @param[in] length the length of the passed literal text |
1298 | @param[in] return_type the token type to return on success |
1299 | */ |
1300 | JSON_HEDLEY_NON_NULL(2) |
1301 | token_type scan_literal(const char_type* literal_text, const std::size_t length, |
1302 | token_type return_type) |
1303 | { |
1304 | JSON_ASSERT(char_traits<char_type>::to_char_type(current) == literal_text[0]); |
1305 | for (std::size_t i = 1; i < length; ++i) |
1306 | { |
1307 | if (JSON_HEDLEY_UNLIKELY(char_traits<char_type>::to_char_type(get()) != literal_text[i])) |
1308 | { |
1309 | error_message = "invalid literal" ; |
1310 | return token_type::parse_error; |
1311 | } |
1312 | } |
1313 | return return_type; |
1314 | } |
1315 | |
1316 | ///////////////////// |
1317 | // input management |
1318 | ///////////////////// |
1319 | |
1320 | /// reset token_buffer; current character is beginning of token |
1321 | void reset() noexcept |
1322 | { |
1323 | token_buffer.clear(); |
1324 | token_string.clear(); |
1325 | token_string.push_back(char_traits<char_type>::to_char_type(current)); |
1326 | } |
1327 | |
1328 | /* |
1329 | @brief get next character from the input |
1330 | |
1331 | This function provides the interface to the used input adapter. It does |
1332 | not throw in case the input reached EOF, but returns a |
1333 | `char_traits<char>::eof()` in that case. Stores the scanned characters |
1334 | for use in error messages. |
1335 | |
1336 | @return character read from the input |
1337 | */ |
1338 | char_int_type get() |
1339 | { |
1340 | ++position.chars_read_total; |
1341 | ++position.chars_read_current_line; |
1342 | |
1343 | if (next_unget) |
1344 | { |
1345 | // just reset the next_unget variable and work with current |
1346 | next_unget = false; |
1347 | } |
1348 | else |
1349 | { |
1350 | current = ia.get_character(); |
1351 | } |
1352 | |
1353 | if (JSON_HEDLEY_LIKELY(current != char_traits<char_type>::eof())) |
1354 | { |
1355 | token_string.push_back(char_traits<char_type>::to_char_type(current)); |
1356 | } |
1357 | |
1358 | if (current == '\n') |
1359 | { |
1360 | ++position.lines_read; |
1361 | position.chars_read_current_line = 0; |
1362 | } |
1363 | |
1364 | return current; |
1365 | } |
1366 | |
1367 | /*! |
1368 | @brief unget current character (read it again on next get) |
1369 | |
1370 | We implement unget by setting variable next_unget to true. The input is not |
1371 | changed - we just simulate ungetting by modifying chars_read_total, |
1372 | chars_read_current_line, and token_string. The next call to get() will |
1373 | behave as if the unget character is read again. |
1374 | */ |
1375 | void unget() |
1376 | { |
1377 | next_unget = true; |
1378 | |
1379 | --position.chars_read_total; |
1380 | |
1381 | // in case we "unget" a newline, we have to also decrement the lines_read |
1382 | if (position.chars_read_current_line == 0) |
1383 | { |
1384 | if (position.lines_read > 0) |
1385 | { |
1386 | --position.lines_read; |
1387 | } |
1388 | } |
1389 | else |
1390 | { |
1391 | --position.chars_read_current_line; |
1392 | } |
1393 | |
1394 | if (JSON_HEDLEY_LIKELY(current != char_traits<char_type>::eof())) |
1395 | { |
1396 | JSON_ASSERT(!token_string.empty()); |
1397 | token_string.pop_back(); |
1398 | } |
1399 | } |
1400 | |
1401 | /// add a character to token_buffer |
1402 | void add(char_int_type c) |
1403 | { |
1404 | token_buffer.push_back(static_cast<typename string_t::value_type>(c)); |
1405 | } |
1406 | |
1407 | public: |
1408 | ///////////////////// |
1409 | // value getters |
1410 | ///////////////////// |
1411 | |
1412 | /// return integer value |
1413 | constexpr number_integer_t get_number_integer() const noexcept |
1414 | { |
1415 | return value_integer; |
1416 | } |
1417 | |
1418 | /// return unsigned integer value |
1419 | constexpr number_unsigned_t get_number_unsigned() const noexcept |
1420 | { |
1421 | return value_unsigned; |
1422 | } |
1423 | |
1424 | /// return floating-point value |
1425 | constexpr number_float_t get_number_float() const noexcept |
1426 | { |
1427 | return value_float; |
1428 | } |
1429 | |
1430 | /// return current string value (implicitly resets the token; useful only once) |
1431 | string_t& get_string() |
1432 | { |
1433 | return token_buffer; |
1434 | } |
1435 | |
1436 | ///////////////////// |
1437 | // diagnostics |
1438 | ///////////////////// |
1439 | |
1440 | /// return position of last read token |
1441 | constexpr position_t get_position() const noexcept |
1442 | { |
1443 | return position; |
1444 | } |
1445 | |
1446 | /// return the last read token (for errors only). Will never contain EOF |
1447 | /// (an arbitrary value that is not a valid char value, often -1), because |
1448 | /// 255 may legitimately occur. May contain NUL, which should be escaped. |
1449 | std::string get_token_string() const |
1450 | { |
1451 | // escape control characters |
1452 | std::string result; |
1453 | for (const auto c : token_string) |
1454 | { |
1455 | if (static_cast<unsigned char>(c) <= '\x1F') |
1456 | { |
1457 | // escape control characters |
1458 | std::array<char, 9> cs{._M_elems: {}}; |
1459 | static_cast<void>((std::snprintf)(s: cs.data(), maxlen: cs.size(), format: "<U+%.4X>" , static_cast<unsigned char>(c))); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg) |
1460 | result += cs.data(); |
1461 | } |
1462 | else |
1463 | { |
1464 | // add character as is |
1465 | result.push_back(c: static_cast<std::string::value_type>(c)); |
1466 | } |
1467 | } |
1468 | |
1469 | return result; |
1470 | } |
1471 | |
1472 | /// return syntax error message |
1473 | JSON_HEDLEY_RETURNS_NON_NULL |
1474 | constexpr const char* get_error_message() const noexcept |
1475 | { |
1476 | return error_message; |
1477 | } |
1478 | |
1479 | ///////////////////// |
1480 | // actual scanner |
1481 | ///////////////////// |
1482 | |
1483 | /*! |
1484 | @brief skip the UTF-8 byte order mark |
1485 | @return true iff there is no BOM or the correct BOM has been skipped |
1486 | */ |
1487 | bool skip_bom() |
1488 | { |
1489 | if (get() == 0xEF) |
1490 | { |
1491 | // check if we completely parse the BOM |
1492 | return get() == 0xBB && get() == 0xBF; |
1493 | } |
1494 | |
1495 | // the first character is not the beginning of the BOM; unget it to |
1496 | // process is later |
1497 | unget(); |
1498 | return true; |
1499 | } |
1500 | |
1501 | void skip_whitespace() |
1502 | { |
1503 | do |
1504 | { |
1505 | get(); |
1506 | } |
1507 | while (current == ' ' || current == '\t' || current == '\n' || current == '\r'); |
1508 | } |
1509 | |
1510 | token_type scan() |
1511 | { |
1512 | // initially, skip the BOM |
1513 | if (position.chars_read_total == 0 && !skip_bom()) |
1514 | { |
1515 | error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given" ; |
1516 | return token_type::parse_error; |
1517 | } |
1518 | |
1519 | // read next character and ignore whitespace |
1520 | skip_whitespace(); |
1521 | |
1522 | // ignore comments |
1523 | while (ignore_comments && current == '/') |
1524 | { |
1525 | if (!scan_comment()) |
1526 | { |
1527 | return token_type::parse_error; |
1528 | } |
1529 | |
1530 | // skip following whitespace |
1531 | skip_whitespace(); |
1532 | } |
1533 | |
1534 | switch (current) |
1535 | { |
1536 | // structural characters |
1537 | case '[': |
1538 | return token_type::begin_array; |
1539 | case ']': |
1540 | return token_type::end_array; |
1541 | case '{': |
1542 | return token_type::begin_object; |
1543 | case '}': |
1544 | return token_type::end_object; |
1545 | case ':': |
1546 | return token_type::name_separator; |
1547 | case ',': |
1548 | return token_type::value_separator; |
1549 | |
1550 | // literals |
1551 | case 't': |
1552 | { |
1553 | std::array<char_type, 4> true_literal = {{static_cast<char_type>('t'), static_cast<char_type>('r'), static_cast<char_type>('u'), static_cast<char_type>('e')}}; |
1554 | return scan_literal(literal_text: true_literal.data(), length: true_literal.size(), return_type: token_type::literal_true); |
1555 | } |
1556 | case 'f': |
1557 | { |
1558 | std::array<char_type, 5> false_literal = {{static_cast<char_type>('f'), static_cast<char_type>('a'), static_cast<char_type>('l'), static_cast<char_type>('s'), static_cast<char_type>('e')}}; |
1559 | return scan_literal(literal_text: false_literal.data(), length: false_literal.size(), return_type: token_type::literal_false); |
1560 | } |
1561 | case 'n': |
1562 | { |
1563 | std::array<char_type, 4> null_literal = {{static_cast<char_type>('n'), static_cast<char_type>('u'), static_cast<char_type>('l'), static_cast<char_type>('l')}}; |
1564 | return scan_literal(literal_text: null_literal.data(), length: null_literal.size(), return_type: token_type::literal_null); |
1565 | } |
1566 | |
1567 | // string |
1568 | case '\"': |
1569 | return scan_string(); |
1570 | |
1571 | // number |
1572 | case '-': |
1573 | case '0': |
1574 | case '1': |
1575 | case '2': |
1576 | case '3': |
1577 | case '4': |
1578 | case '5': |
1579 | case '6': |
1580 | case '7': |
1581 | case '8': |
1582 | case '9': |
1583 | return scan_number(); |
1584 | |
1585 | // end of input (the null byte is needed when parsing from |
1586 | // string literals) |
1587 | case '\0': |
1588 | case char_traits<char_type>::eof(): |
1589 | return token_type::end_of_input; |
1590 | |
1591 | // error |
1592 | default: |
1593 | error_message = "invalid literal" ; |
1594 | return token_type::parse_error; |
1595 | } |
1596 | } |
1597 | |
1598 | private: |
1599 | /// input adapter |
1600 | InputAdapterType ia; |
1601 | |
1602 | /// whether comments should be ignored (true) or signaled as errors (false) |
1603 | const bool = false; |
1604 | |
1605 | /// the current character |
1606 | char_int_type current = char_traits<char_type>::eof(); |
1607 | |
1608 | /// whether the next get() call should just return current |
1609 | bool next_unget = false; |
1610 | |
1611 | /// the start position of the current token |
1612 | position_t position {}; |
1613 | |
1614 | /// raw input token string (for error messages) |
1615 | std::vector<char_type> token_string {}; |
1616 | |
1617 | /// buffer for variable-length tokens (numbers, strings) |
1618 | string_t token_buffer {}; |
1619 | |
1620 | /// a description of occurred lexer errors |
1621 | const char* error_message = "" ; |
1622 | |
1623 | // number values |
1624 | number_integer_t value_integer = 0; |
1625 | number_unsigned_t value_unsigned = 0; |
1626 | number_float_t value_float = 0; |
1627 | |
1628 | /// the decimal point |
1629 | const char_int_type decimal_point_char = '.'; |
1630 | }; |
1631 | |
1632 | } // namespace detail |
1633 | NLOHMANN_JSON_NAMESPACE_END |
1634 | |