1// __ _____ _____ _____
2// __| | __| | | | JSON for Modern C++
3// | | |__ | | | | | | version 3.11.3
4// |_____|_____|_____|_|___| https://github.com/nlohmann/json
5//
6// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
7// SPDX-License-Identifier: MIT
8
9#pragma once
10
11#include <array> // array
12#include <clocale> // localeconv
13#include <cstddef> // size_t
14#include <cstdio> // snprintf
15#include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull
16#include <initializer_list> // initializer_list
17#include <string> // char_traits, string
18#include <utility> // move
19#include <vector> // vector
20
21#include <nlohmann/detail/input/input_adapters.hpp>
22#include <nlohmann/detail/input/position_t.hpp>
23#include <nlohmann/detail/macro_scope.hpp>
24#include <nlohmann/detail/meta/type_traits.hpp>
25
26NLOHMANN_JSON_NAMESPACE_BEGIN
27namespace detail
28{
29
30///////////
31// lexer //
32///////////
33
34template<typename BasicJsonType>
35class lexer_base
36{
37 public:
38 /// token types for the parser
39 enum class token_type
40 {
41 uninitialized, ///< indicating the scanner is uninitialized
42 literal_true, ///< the `true` literal
43 literal_false, ///< the `false` literal
44 literal_null, ///< the `null` literal
45 value_string, ///< a string -- use get_string() for actual value
46 value_unsigned, ///< an unsigned integer -- use get_number_unsigned() for actual value
47 value_integer, ///< a signed integer -- use get_number_integer() for actual value
48 value_float, ///< an floating point number -- use get_number_float() for actual value
49 begin_array, ///< the character for array begin `[`
50 begin_object, ///< the character for object begin `{`
51 end_array, ///< the character for array end `]`
52 end_object, ///< the character for object end `}`
53 name_separator, ///< the name separator `:`
54 value_separator, ///< the value separator `,`
55 parse_error, ///< indicating a parse error
56 end_of_input, ///< indicating the end of the input buffer
57 literal_or_value ///< a literal or the begin of a value (only for diagnostics)
58 };
59
60 /// return name of values of type token_type (only used for errors)
61 JSON_HEDLEY_RETURNS_NON_NULL
62 JSON_HEDLEY_CONST
63 static const char* token_type_name(const token_type t) noexcept
64 {
65 switch (t)
66 {
67 case token_type::uninitialized:
68 return "<uninitialized>";
69 case token_type::literal_true:
70 return "true literal";
71 case token_type::literal_false:
72 return "false literal";
73 case token_type::literal_null:
74 return "null literal";
75 case token_type::value_string:
76 return "string literal";
77 case token_type::value_unsigned:
78 case token_type::value_integer:
79 case token_type::value_float:
80 return "number literal";
81 case token_type::begin_array:
82 return "'['";
83 case token_type::begin_object:
84 return "'{'";
85 case token_type::end_array:
86 return "']'";
87 case token_type::end_object:
88 return "'}'";
89 case token_type::name_separator:
90 return "':'";
91 case token_type::value_separator:
92 return "','";
93 case token_type::parse_error:
94 return "<parse error>";
95 case token_type::end_of_input:
96 return "end of input";
97 case token_type::literal_or_value:
98 return "'[', '{', or a literal";
99 // LCOV_EXCL_START
100 default: // catch non-enum values
101 return "unknown token";
102 // LCOV_EXCL_STOP
103 }
104 }
105};
106/*!
107@brief lexical analysis
108
109This class organizes the lexical analysis during JSON deserialization.
110*/
111template<typename BasicJsonType, typename InputAdapterType>
112class lexer : public lexer_base<BasicJsonType>
113{
114 using number_integer_t = typename BasicJsonType::number_integer_t;
115 using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
116 using number_float_t = typename BasicJsonType::number_float_t;
117 using string_t = typename BasicJsonType::string_t;
118 using char_type = typename InputAdapterType::char_type;
119 using char_int_type = typename char_traits<char_type>::int_type;
120
121 public:
122 using token_type = typename lexer_base<BasicJsonType>::token_type;
123
124 explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) noexcept
125 : ia(std::move(adapter))
126 , ignore_comments(ignore_comments_)
127 , decimal_point_char(static_cast<char_int_type>(get_decimal_point()))
128 {}
129
130 // delete because of pointer members
131 lexer(const lexer&) = delete;
132 lexer(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
133 lexer& operator=(lexer&) = delete;
134 lexer& operator=(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
135 ~lexer() = default;
136
137 private:
138 /////////////////////
139 // locales
140 /////////////////////
141
142 /// return the locale-dependent decimal point
143 JSON_HEDLEY_PURE
144 static char get_decimal_point() noexcept
145 {
146 const auto* loc = localeconv();
147 JSON_ASSERT(loc != nullptr);
148 return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point);
149 }
150
151 /////////////////////
152 // scan functions
153 /////////////////////
154
155 /*!
156 @brief get codepoint from 4 hex characters following `\u`
157
158 For input "\u c1 c2 c3 c4" the codepoint is:
159 (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4
160 = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0)
161
162 Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f'
163 must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The
164 conversion is done by subtracting the offset (0x30, 0x37, and 0x57)
165 between the ASCII value of the character and the desired integer value.
166
167 @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or
168 non-hex character)
169 */
170 int get_codepoint()
171 {
172 // this function only makes sense after reading `\u`
173 JSON_ASSERT(current == 'u');
174 int codepoint = 0;
175
176 const auto factors = { 12u, 8u, 4u, 0u };
177 for (const auto factor : factors)
178 {
179 get();
180
181 if (current >= '0' && current <= '9')
182 {
183 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x30u) << factor);
184 }
185 else if (current >= 'A' && current <= 'F')
186 {
187 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x37u) << factor);
188 }
189 else if (current >= 'a' && current <= 'f')
190 {
191 codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x57u) << factor);
192 }
193 else
194 {
195 return -1;
196 }
197 }
198
199 JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF);
200 return codepoint;
201 }
202
203 /*!
204 @brief check if the next byte(s) are inside a given range
205
206 Adds the current byte and, for each passed range, reads a new byte and
207 checks if it is inside the range. If a violation was detected, set up an
208 error message and return false. Otherwise, return true.
209
210 @param[in] ranges list of integers; interpreted as list of pairs of
211 inclusive lower and upper bound, respectively
212
213 @pre The passed list @a ranges must have 2, 4, or 6 elements; that is,
214 1, 2, or 3 pairs. This precondition is enforced by an assertion.
215
216 @return true if and only if no range violation was detected
217 */
218 bool next_byte_in_range(std::initializer_list<char_int_type> ranges)
219 {
220 JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6);
221 add(c: current);
222
223 for (auto range = ranges.begin(); range != ranges.end(); ++range)
224 {
225 get();
226 if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) // NOLINT(bugprone-inc-dec-in-conditions)
227 {
228 add(c: current);
229 }
230 else
231 {
232 error_message = "invalid string: ill-formed UTF-8 byte";
233 return false;
234 }
235 }
236
237 return true;
238 }
239
240 /*!
241 @brief scan a string literal
242
243 This function scans a string according to Sect. 7 of RFC 8259. While
    scanning, bytes are unescaped and copied into buffer token_buffer. When the
    function returns successfully, token_buffer is *not* null-terminated (as it
246 may contain \0 bytes), and token_buffer.size() is the number of bytes in the
247 string.
248
249 @return token_type::value_string if string could be successfully scanned,
250 token_type::parse_error otherwise
251
252 @note In case of errors, variable error_message contains a textual
253 description.
254 */
255 token_type scan_string()
256 {
257 // reset token_buffer (ignore opening quote)
258 reset();
259
260 // we entered the function by reading an open quote
261 JSON_ASSERT(current == '\"');
262
263 while (true)
264 {
265 // get next character
266 switch (get())
267 {
268 // end of file while parsing string
269 case char_traits<char_type>::eof():
270 {
271 error_message = "invalid string: missing closing quote";
272 return token_type::parse_error;
273 }
274
275 // closing quote
276 case '\"':
277 {
278 return token_type::value_string;
279 }
280
281 // escapes
282 case '\\':
283 {
284 switch (get())
285 {
286 // quotation mark
287 case '\"':
288 add(c: '\"');
289 break;
290 // reverse solidus
291 case '\\':
292 add(c: '\\');
293 break;
294 // solidus
295 case '/':
296 add(c: '/');
297 break;
298 // backspace
299 case 'b':
300 add(c: '\b');
301 break;
302 // form feed
303 case 'f':
304 add(c: '\f');
305 break;
306 // line feed
307 case 'n':
308 add(c: '\n');
309 break;
310 // carriage return
311 case 'r':
312 add(c: '\r');
313 break;
314 // tab
315 case 't':
316 add(c: '\t');
317 break;
318
319 // unicode escapes
320 case 'u':
321 {
322 const int codepoint1 = get_codepoint();
323 int codepoint = codepoint1; // start with codepoint1
324
325 if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1))
326 {
327 error_message = "invalid string: '\\u' must be followed by 4 hex digits";
328 return token_type::parse_error;
329 }
330
331 // check if code point is a high surrogate
332 if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF)
333 {
334 // expect next \uxxxx entry
335 if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u'))
336 {
337 const int codepoint2 = get_codepoint();
338
339 if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1))
340 {
341 error_message = "invalid string: '\\u' must be followed by 4 hex digits";
342 return token_type::parse_error;
343 }
344
345 // check if codepoint2 is a low surrogate
346 if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF))
347 {
348 // overwrite codepoint
349 codepoint = static_cast<int>(
350 // high surrogate occupies the most significant 22 bits
351 (static_cast<unsigned int>(codepoint1) << 10u)
352 // low surrogate occupies the least significant 15 bits
353 + static_cast<unsigned int>(codepoint2)
354 // there is still the 0xD800, 0xDC00 and 0x10000 noise
355 // in the result, so we have to subtract with:
356 // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
357 - 0x35FDC00u);
358 }
359 else
360 {
361 error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
362 return token_type::parse_error;
363 }
364 }
365 else
366 {
367 error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
368 return token_type::parse_error;
369 }
370 }
371 else
372 {
373 if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF))
374 {
375 error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
376 return token_type::parse_error;
377 }
378 }
379
380 // result of the above calculation yields a proper codepoint
381 JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF);
382
383 // translate codepoint into bytes
384 if (codepoint < 0x80)
385 {
386 // 1-byte characters: 0xxxxxxx (ASCII)
387 add(c: static_cast<char_int_type>(codepoint));
388 }
389 else if (codepoint <= 0x7FF)
390 {
391 // 2-byte characters: 110xxxxx 10xxxxxx
392 add(c: static_cast<char_int_type>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u)));
393 add(c: static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
394 }
395 else if (codepoint <= 0xFFFF)
396 {
397 // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
398 add(c: static_cast<char_int_type>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u)));
399 add(c: static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
400 add(c: static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
401 }
402 else
403 {
404 // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
405 add(c: static_cast<char_int_type>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u)));
406 add(c: static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu)));
407 add(c: static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu)));
408 add(c: static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu)));
409 }
410
411 break;
412 }
413
414 // other characters after escape
415 default:
416 error_message = "invalid string: forbidden character after backslash";
417 return token_type::parse_error;
418 }
419
420 break;
421 }
422
423 // invalid control characters
424 case 0x00:
425 {
426 error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
427 return token_type::parse_error;
428 }
429
430 case 0x01:
431 {
432 error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
433 return token_type::parse_error;
434 }
435
436 case 0x02:
437 {
438 error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002";
439 return token_type::parse_error;
440 }
441
442 case 0x03:
443 {
444 error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
445 return token_type::parse_error;
446 }
447
448 case 0x04:
449 {
450 error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
451 return token_type::parse_error;
452 }
453
454 case 0x05:
455 {
456 error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
457 return token_type::parse_error;
458 }
459
460 case 0x06:
461 {
462 error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
463 return token_type::parse_error;
464 }
465
466 case 0x07:
467 {
468 error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
469 return token_type::parse_error;
470 }
471
472 case 0x08:
473 {
474 error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
475 return token_type::parse_error;
476 }
477
478 case 0x09:
479 {
480 error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
481 return token_type::parse_error;
482 }
483
484 case 0x0A:
485 {
486 error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
487 return token_type::parse_error;
488 }
489
490 case 0x0B:
491 {
492 error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B";
493 return token_type::parse_error;
494 }
495
496 case 0x0C:
497 {
498 error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
499 return token_type::parse_error;
500 }
501
502 case 0x0D:
503 {
504 error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
505 return token_type::parse_error;
506 }
507
508 case 0x0E:
509 {
510 error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E";
511 return token_type::parse_error;
512 }
513
514 case 0x0F:
515 {
516 error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F";
517 return token_type::parse_error;
518 }
519
520 case 0x10:
521 {
522 error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
523 return token_type::parse_error;
524 }
525
526 case 0x11:
527 {
528 error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
529 return token_type::parse_error;
530 }
531
532 case 0x12:
533 {
534 error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
535 return token_type::parse_error;
536 }
537
538 case 0x13:
539 {
540 error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
541 return token_type::parse_error;
542 }
543
544 case 0x14:
545 {
546 error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
547 return token_type::parse_error;
548 }
549
550 case 0x15:
551 {
552 error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
553 return token_type::parse_error;
554 }
555
556 case 0x16:
557 {
558 error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
559 return token_type::parse_error;
560 }
561
562 case 0x17:
563 {
564 error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
565 return token_type::parse_error;
566 }
567
568 case 0x18:
569 {
570 error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
571 return token_type::parse_error;
572 }
573
574 case 0x19:
575 {
576 error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019";
577 return token_type::parse_error;
578 }
579
580 case 0x1A:
581 {
582 error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A";
583 return token_type::parse_error;
584 }
585
586 case 0x1B:
587 {
588 error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B";
589 return token_type::parse_error;
590 }
591
592 case 0x1C:
593 {
594 error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C";
595 return token_type::parse_error;
596 }
597
598 case 0x1D:
599 {
600 error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D";
601 return token_type::parse_error;
602 }
603
604 case 0x1E:
605 {
606 error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E";
607 return token_type::parse_error;
608 }
609
610 case 0x1F:
611 {
612 error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F";
613 return token_type::parse_error;
614 }
615
616 // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace))
617 case 0x20:
618 case 0x21:
619 case 0x23:
620 case 0x24:
621 case 0x25:
622 case 0x26:
623 case 0x27:
624 case 0x28:
625 case 0x29:
626 case 0x2A:
627 case 0x2B:
628 case 0x2C:
629 case 0x2D:
630 case 0x2E:
631 case 0x2F:
632 case 0x30:
633 case 0x31:
634 case 0x32:
635 case 0x33:
636 case 0x34:
637 case 0x35:
638 case 0x36:
639 case 0x37:
640 case 0x38:
641 case 0x39:
642 case 0x3A:
643 case 0x3B:
644 case 0x3C:
645 case 0x3D:
646 case 0x3E:
647 case 0x3F:
648 case 0x40:
649 case 0x41:
650 case 0x42:
651 case 0x43:
652 case 0x44:
653 case 0x45:
654 case 0x46:
655 case 0x47:
656 case 0x48:
657 case 0x49:
658 case 0x4A:
659 case 0x4B:
660 case 0x4C:
661 case 0x4D:
662 case 0x4E:
663 case 0x4F:
664 case 0x50:
665 case 0x51:
666 case 0x52:
667 case 0x53:
668 case 0x54:
669 case 0x55:
670 case 0x56:
671 case 0x57:
672 case 0x58:
673 case 0x59:
674 case 0x5A:
675 case 0x5B:
676 case 0x5D:
677 case 0x5E:
678 case 0x5F:
679 case 0x60:
680 case 0x61:
681 case 0x62:
682 case 0x63:
683 case 0x64:
684 case 0x65:
685 case 0x66:
686 case 0x67:
687 case 0x68:
688 case 0x69:
689 case 0x6A:
690 case 0x6B:
691 case 0x6C:
692 case 0x6D:
693 case 0x6E:
694 case 0x6F:
695 case 0x70:
696 case 0x71:
697 case 0x72:
698 case 0x73:
699 case 0x74:
700 case 0x75:
701 case 0x76:
702 case 0x77:
703 case 0x78:
704 case 0x79:
705 case 0x7A:
706 case 0x7B:
707 case 0x7C:
708 case 0x7D:
709 case 0x7E:
710 case 0x7F:
711 {
712 add(c: current);
713 break;
714 }
715
716 // U+0080..U+07FF: bytes C2..DF 80..BF
717 case 0xC2:
718 case 0xC3:
719 case 0xC4:
720 case 0xC5:
721 case 0xC6:
722 case 0xC7:
723 case 0xC8:
724 case 0xC9:
725 case 0xCA:
726 case 0xCB:
727 case 0xCC:
728 case 0xCD:
729 case 0xCE:
730 case 0xCF:
731 case 0xD0:
732 case 0xD1:
733 case 0xD2:
734 case 0xD3:
735 case 0xD4:
736 case 0xD5:
737 case 0xD6:
738 case 0xD7:
739 case 0xD8:
740 case 0xD9:
741 case 0xDA:
742 case 0xDB:
743 case 0xDC:
744 case 0xDD:
745 case 0xDE:
746 case 0xDF:
747 {
748 if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF})))
749 {
750 return token_type::parse_error;
751 }
752 break;
753 }
754
755 // U+0800..U+0FFF: bytes E0 A0..BF 80..BF
756 case 0xE0:
757 {
758 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF}))))
759 {
760 return token_type::parse_error;
761 }
762 break;
763 }
764
765 // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
766 // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
767 case 0xE1:
768 case 0xE2:
769 case 0xE3:
770 case 0xE4:
771 case 0xE5:
772 case 0xE6:
773 case 0xE7:
774 case 0xE8:
775 case 0xE9:
776 case 0xEA:
777 case 0xEB:
778 case 0xEC:
779 case 0xEE:
780 case 0xEF:
781 {
782 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF}))))
783 {
784 return token_type::parse_error;
785 }
786 break;
787 }
788
789 // U+D000..U+D7FF: bytes ED 80..9F 80..BF
790 case 0xED:
791 {
792 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF}))))
793 {
794 return token_type::parse_error;
795 }
796 break;
797 }
798
799 // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
800 case 0xF0:
801 {
802 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
803 {
804 return token_type::parse_error;
805 }
806 break;
807 }
808
809 // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
810 case 0xF1:
811 case 0xF2:
812 case 0xF3:
813 {
814 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}))))
815 {
816 return token_type::parse_error;
817 }
818 break;
819 }
820
821 // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
822 case 0xF4:
823 {
824 if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}))))
825 {
826 return token_type::parse_error;
827 }
828 break;
829 }
830
831 // remaining bytes (80..C1 and F5..FF) are ill-formed
832 default:
833 {
834 error_message = "invalid string: ill-formed UTF-8 byte";
835 return token_type::parse_error;
836 }
837 }
838 }
839 }
840
841 /*!
842 * @brief scan a comment
843 * @return whether comment could be scanned successfully
844 */
845 bool scan_comment()
846 {
847 switch (get())
848 {
849 // single-line comments skip input until a newline or EOF is read
850 case '/':
851 {
852 while (true)
853 {
854 switch (get())
855 {
856 case '\n':
857 case '\r':
858 case char_traits<char_type>::eof():
859 case '\0':
860 return true;
861
862 default:
863 break;
864 }
865 }
866 }
867
868 // multi-line comments skip input until */ is read
869 case '*':
870 {
871 while (true)
872 {
873 switch (get())
874 {
875 case char_traits<char_type>::eof():
876 case '\0':
877 {
878 error_message = "invalid comment; missing closing '*/'";
879 return false;
880 }
881
882 case '*':
883 {
884 switch (get())
885 {
886 case '/':
887 return true;
888
889 default:
890 {
891 unget();
892 continue;
893 }
894 }
895 }
896
897 default:
898 continue;
899 }
900 }
901 }
902
903 // unexpected character after reading '/'
904 default:
905 {
906 error_message = "invalid comment; expecting '/' or '*' after '/'";
907 return false;
908 }
909 }
910 }
911
912 JSON_HEDLEY_NON_NULL(2)
913 static void strtof(float& f, const char* str, char** endptr) noexcept
914 {
915 f = std::strtof(nptr: str, endptr: endptr);
916 }
917
918 JSON_HEDLEY_NON_NULL(2)
919 static void strtof(double& f, const char* str, char** endptr) noexcept
920 {
921 f = std::strtod(nptr: str, endptr: endptr);
922 }
923
924 JSON_HEDLEY_NON_NULL(2)
925 static void strtof(long double& f, const char* str, char** endptr) noexcept
926 {
927 f = std::strtold(nptr: str, endptr: endptr);
928 }
929
930 /*!
931 @brief scan a number literal
932
933 This function scans a string according to Sect. 6 of RFC 8259.
934
935 The function is realized with a deterministic finite state machine derived
936 from the grammar described in RFC 8259. Starting in state "init", the
    input is read and used to determine the next state. Only state "done"
938 accepts the number. State "error" is a trap state to model errors. In the
939 table below, "anything" means any character but the ones listed before.
940
941 state | 0 | 1-9 | e E | + | - | . | anything
942 ---------|----------|----------|----------|---------|---------|----------|-----------
943 init | zero | any1 | [error] | [error] | minus | [error] | [error]
944 minus | zero | any1 | [error] | [error] | [error] | [error] | [error]
945 zero | done | done | exponent | done | done | decimal1 | done
946 any1 | any1 | any1 | exponent | done | done | decimal1 | done
947 decimal1 | decimal2 | decimal2 | [error] | [error] | [error] | [error] | [error]
948 decimal2 | decimal2 | decimal2 | exponent | done | done | done | done
949 exponent | any2 | any2 | [error] | sign | sign | [error] | [error]
950 sign | any2 | any2 | [error] | [error] | [error] | [error] | [error]
951 any2 | any2 | any2 | done | done | done | done | done
952
953 The state machine is realized with one label per state (prefixed with
954 "scan_number_") and `goto` statements between them. The state machine
955 contains cycles, but any cycle can be left when EOF is read. Therefore,
956 the function is guaranteed to terminate.
957
958 During scanning, the read bytes are stored in token_buffer. This string is
959 then converted to a signed integer, an unsigned integer, or a
960 floating-point number.
961
962 @return token_type::value_unsigned, token_type::value_integer, or
963 token_type::value_float if number could be successfully scanned,
964 token_type::parse_error otherwise
965
966 @note The scanner is independent of the current locale. Internally, the
967 locale's decimal point is used instead of `.` to work with the
968 locale-dependent converters.
969 */
970 token_type scan_number() // lgtm [cpp/use-of-goto]
971 {
972 // reset token_buffer to store the number's bytes
973 reset();
974
975 // the type of the parsed number; initially set to unsigned; will be
976 // changed if minus sign, decimal point or exponent is read
977 token_type number_type = token_type::value_unsigned;
978
979 // state (init): we just found out we need to scan a number
980 switch (current)
981 {
982 case '-':
983 {
984 add(c: current);
985 goto scan_number_minus;
986 }
987
988 case '0':
989 {
990 add(c: current);
991 goto scan_number_zero;
992 }
993
994 case '1':
995 case '2':
996 case '3':
997 case '4':
998 case '5':
999 case '6':
1000 case '7':
1001 case '8':
1002 case '9':
1003 {
1004 add(c: current);
1005 goto scan_number_any1;
1006 }
1007
1008 // all other characters are rejected outside scan_number()
1009 default: // LCOV_EXCL_LINE
1010 JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
1011 }
1012
1013scan_number_minus:
1014 // state: we just parsed a leading minus sign
1015 number_type = token_type::value_integer;
1016 switch (get())
1017 {
1018 case '0':
1019 {
1020 add(c: current);
1021 goto scan_number_zero;
1022 }
1023
1024 case '1':
1025 case '2':
1026 case '3':
1027 case '4':
1028 case '5':
1029 case '6':
1030 case '7':
1031 case '8':
1032 case '9':
1033 {
1034 add(c: current);
1035 goto scan_number_any1;
1036 }
1037
1038 default:
1039 {
1040 error_message = "invalid number; expected digit after '-'";
1041 return token_type::parse_error;
1042 }
1043 }
1044
1045scan_number_zero:
1046 // state: we just parse a zero (maybe with a leading minus sign)
1047 switch (get())
1048 {
1049 case '.':
1050 {
1051 add(c: decimal_point_char);
1052 goto scan_number_decimal1;
1053 }
1054
1055 case 'e':
1056 case 'E':
1057 {
1058 add(c: current);
1059 goto scan_number_exponent;
1060 }
1061
1062 default:
1063 goto scan_number_done;
1064 }
1065
1066scan_number_any1:
1067 // state: we just parsed a number 0-9 (maybe with a leading minus sign)
1068 switch (get())
1069 {
1070 case '0':
1071 case '1':
1072 case '2':
1073 case '3':
1074 case '4':
1075 case '5':
1076 case '6':
1077 case '7':
1078 case '8':
1079 case '9':
1080 {
1081 add(c: current);
1082 goto scan_number_any1;
1083 }
1084
1085 case '.':
1086 {
1087 add(c: decimal_point_char);
1088 goto scan_number_decimal1;
1089 }
1090
1091 case 'e':
1092 case 'E':
1093 {
1094 add(c: current);
1095 goto scan_number_exponent;
1096 }
1097
1098 default:
1099 goto scan_number_done;
1100 }
1101
1102scan_number_decimal1:
1103 // state: we just parsed a decimal point
1104 number_type = token_type::value_float;
1105 switch (get())
1106 {
1107 case '0':
1108 case '1':
1109 case '2':
1110 case '3':
1111 case '4':
1112 case '5':
1113 case '6':
1114 case '7':
1115 case '8':
1116 case '9':
1117 {
1118 add(c: current);
1119 goto scan_number_decimal2;
1120 }
1121
1122 default:
1123 {
1124 error_message = "invalid number; expected digit after '.'";
1125 return token_type::parse_error;
1126 }
1127 }
1128
1129scan_number_decimal2:
1130 // we just parsed at least one number after a decimal point
1131 switch (get())
1132 {
1133 case '0':
1134 case '1':
1135 case '2':
1136 case '3':
1137 case '4':
1138 case '5':
1139 case '6':
1140 case '7':
1141 case '8':
1142 case '9':
1143 {
1144 add(c: current);
1145 goto scan_number_decimal2;
1146 }
1147
1148 case 'e':
1149 case 'E':
1150 {
1151 add(c: current);
1152 goto scan_number_exponent;
1153 }
1154
1155 default:
1156 goto scan_number_done;
1157 }
1158
1159scan_number_exponent:
1160 // we just parsed an exponent
1161 number_type = token_type::value_float;
1162 switch (get())
1163 {
1164 case '+':
1165 case '-':
1166 {
1167 add(c: current);
1168 goto scan_number_sign;
1169 }
1170
1171 case '0':
1172 case '1':
1173 case '2':
1174 case '3':
1175 case '4':
1176 case '5':
1177 case '6':
1178 case '7':
1179 case '8':
1180 case '9':
1181 {
1182 add(c: current);
1183 goto scan_number_any2;
1184 }
1185
1186 default:
1187 {
1188 error_message =
1189 "invalid number; expected '+', '-', or digit after exponent";
1190 return token_type::parse_error;
1191 }
1192 }
1193
1194scan_number_sign:
1195 // we just parsed an exponent sign
1196 switch (get())
1197 {
1198 case '0':
1199 case '1':
1200 case '2':
1201 case '3':
1202 case '4':
1203 case '5':
1204 case '6':
1205 case '7':
1206 case '8':
1207 case '9':
1208 {
1209 add(c: current);
1210 goto scan_number_any2;
1211 }
1212
1213 default:
1214 {
1215 error_message = "invalid number; expected digit after exponent sign";
1216 return token_type::parse_error;
1217 }
1218 }
1219
1220scan_number_any2:
1221 // we just parsed a number after the exponent or exponent sign
1222 switch (get())
1223 {
1224 case '0':
1225 case '1':
1226 case '2':
1227 case '3':
1228 case '4':
1229 case '5':
1230 case '6':
1231 case '7':
1232 case '8':
1233 case '9':
1234 {
1235 add(c: current);
1236 goto scan_number_any2;
1237 }
1238
1239 default:
1240 goto scan_number_done;
1241 }
1242
1243scan_number_done:
1244 // unget the character after the number (we only read it to know that
1245 // we are done scanning a number)
1246 unget();
1247
1248 char* endptr = nullptr; // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
1249 errno = 0;
1250
1251 // try to parse integers first and fall back to floats
1252 if (number_type == token_type::value_unsigned)
1253 {
1254 const auto x = std::strtoull(nptr: token_buffer.data(), endptr: &endptr, base: 10);
1255
1256 // we checked the number format before
1257 JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1258
1259 if (errno == 0)
1260 {
1261 value_unsigned = static_cast<number_unsigned_t>(x);
1262 if (value_unsigned == x)
1263 {
1264 return token_type::value_unsigned;
1265 }
1266 }
1267 }
1268 else if (number_type == token_type::value_integer)
1269 {
1270 const auto x = std::strtoll(nptr: token_buffer.data(), endptr: &endptr, base: 10);
1271
1272 // we checked the number format before
1273 JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1274
1275 if (errno == 0)
1276 {
1277 value_integer = static_cast<number_integer_t>(x);
1278 if (value_integer == x)
1279 {
1280 return token_type::value_integer;
1281 }
1282 }
1283 }
1284
1285 // this code is reached if we parse a floating-point number or if an
1286 // integer conversion above failed
1287 strtof(value_float, token_buffer.data(), &endptr);
1288
1289 // we checked the number format before
1290 JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1291
1292 return token_type::value_float;
1293 }
1294
1295 /*!
1296 @param[in] literal_text the literal text to expect
1297 @param[in] length the length of the passed literal text
1298 @param[in] return_type the token type to return on success
1299 */
1300 JSON_HEDLEY_NON_NULL(2)
1301 token_type scan_literal(const char_type* literal_text, const std::size_t length,
1302 token_type return_type)
1303 {
1304 JSON_ASSERT(char_traits<char_type>::to_char_type(current) == literal_text[0]);
1305 for (std::size_t i = 1; i < length; ++i)
1306 {
1307 if (JSON_HEDLEY_UNLIKELY(char_traits<char_type>::to_char_type(get()) != literal_text[i]))
1308 {
1309 error_message = "invalid literal";
1310 return token_type::parse_error;
1311 }
1312 }
1313 return return_type;
1314 }
1315
1316 /////////////////////
1317 // input management
1318 /////////////////////
1319
1320 /// reset token_buffer; current character is beginning of token
1321 void reset() noexcept
1322 {
1323 token_buffer.clear();
1324 token_string.clear();
1325 token_string.push_back(char_traits<char_type>::to_char_type(current));
1326 }
1327
1328 /*
1329 @brief get next character from the input
1330
1331 This function provides the interface to the used input adapter. It does
1332 not throw in case the input reached EOF, but returns a
1333 `char_traits<char>::eof()` in that case. Stores the scanned characters
1334 for use in error messages.
1335
1336 @return character read from the input
1337 */
1338 char_int_type get()
1339 {
1340 ++position.chars_read_total;
1341 ++position.chars_read_current_line;
1342
1343 if (next_unget)
1344 {
1345 // just reset the next_unget variable and work with current
1346 next_unget = false;
1347 }
1348 else
1349 {
1350 current = ia.get_character();
1351 }
1352
1353 if (JSON_HEDLEY_LIKELY(current != char_traits<char_type>::eof()))
1354 {
1355 token_string.push_back(char_traits<char_type>::to_char_type(current));
1356 }
1357
1358 if (current == '\n')
1359 {
1360 ++position.lines_read;
1361 position.chars_read_current_line = 0;
1362 }
1363
1364 return current;
1365 }
1366
1367 /*!
1368 @brief unget current character (read it again on next get)
1369
1370 We implement unget by setting variable next_unget to true. The input is not
1371 changed - we just simulate ungetting by modifying chars_read_total,
1372 chars_read_current_line, and token_string. The next call to get() will
1373 behave as if the unget character is read again.
1374 */
1375 void unget()
1376 {
1377 next_unget = true;
1378
1379 --position.chars_read_total;
1380
1381 // in case we "unget" a newline, we have to also decrement the lines_read
1382 if (position.chars_read_current_line == 0)
1383 {
1384 if (position.lines_read > 0)
1385 {
1386 --position.lines_read;
1387 }
1388 }
1389 else
1390 {
1391 --position.chars_read_current_line;
1392 }
1393
1394 if (JSON_HEDLEY_LIKELY(current != char_traits<char_type>::eof()))
1395 {
1396 JSON_ASSERT(!token_string.empty());
1397 token_string.pop_back();
1398 }
1399 }
1400
1401 /// add a character to token_buffer
1402 void add(char_int_type c)
1403 {
1404 token_buffer.push_back(static_cast<typename string_t::value_type>(c));
1405 }
1406
1407 public:
1408 /////////////////////
1409 // value getters
1410 /////////////////////
1411
1412 /// return integer value
1413 constexpr number_integer_t get_number_integer() const noexcept
1414 {
1415 return value_integer;
1416 }
1417
1418 /// return unsigned integer value
1419 constexpr number_unsigned_t get_number_unsigned() const noexcept
1420 {
1421 return value_unsigned;
1422 }
1423
1424 /// return floating-point value
1425 constexpr number_float_t get_number_float() const noexcept
1426 {
1427 return value_float;
1428 }
1429
1430 /// return current string value (implicitly resets the token; useful only once)
1431 string_t& get_string()
1432 {
1433 return token_buffer;
1434 }
1435
1436 /////////////////////
1437 // diagnostics
1438 /////////////////////
1439
1440 /// return position of last read token
1441 constexpr position_t get_position() const noexcept
1442 {
1443 return position;
1444 }
1445
1446 /// return the last read token (for errors only). Will never contain EOF
1447 /// (an arbitrary value that is not a valid char value, often -1), because
1448 /// 255 may legitimately occur. May contain NUL, which should be escaped.
1449 std::string get_token_string() const
1450 {
1451 // escape control characters
1452 std::string result;
1453 for (const auto c : token_string)
1454 {
1455 if (static_cast<unsigned char>(c) <= '\x1F')
1456 {
1457 // escape control characters
1458 std::array<char, 9> cs{._M_elems: {}};
1459 static_cast<void>((std::snprintf)(s: cs.data(), maxlen: cs.size(), format: "<U+%.4X>", static_cast<unsigned char>(c))); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
1460 result += cs.data();
1461 }
1462 else
1463 {
1464 // add character as is
1465 result.push_back(c: static_cast<std::string::value_type>(c));
1466 }
1467 }
1468
1469 return result;
1470 }
1471
1472 /// return syntax error message
1473 JSON_HEDLEY_RETURNS_NON_NULL
1474 constexpr const char* get_error_message() const noexcept
1475 {
1476 return error_message;
1477 }
1478
1479 /////////////////////
1480 // actual scanner
1481 /////////////////////
1482
1483 /*!
1484 @brief skip the UTF-8 byte order mark
1485 @return true iff there is no BOM or the correct BOM has been skipped
1486 */
1487 bool skip_bom()
1488 {
1489 if (get() == 0xEF)
1490 {
1491 // check if we completely parse the BOM
1492 return get() == 0xBB && get() == 0xBF;
1493 }
1494
1495 // the first character is not the beginning of the BOM; unget it to
1496 // process is later
1497 unget();
1498 return true;
1499 }
1500
1501 void skip_whitespace()
1502 {
1503 do
1504 {
1505 get();
1506 }
1507 while (current == ' ' || current == '\t' || current == '\n' || current == '\r');
1508 }
1509
1510 token_type scan()
1511 {
1512 // initially, skip the BOM
1513 if (position.chars_read_total == 0 && !skip_bom())
1514 {
1515 error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
1516 return token_type::parse_error;
1517 }
1518
1519 // read next character and ignore whitespace
1520 skip_whitespace();
1521
1522 // ignore comments
1523 while (ignore_comments && current == '/')
1524 {
1525 if (!scan_comment())
1526 {
1527 return token_type::parse_error;
1528 }
1529
1530 // skip following whitespace
1531 skip_whitespace();
1532 }
1533
1534 switch (current)
1535 {
1536 // structural characters
1537 case '[':
1538 return token_type::begin_array;
1539 case ']':
1540 return token_type::end_array;
1541 case '{':
1542 return token_type::begin_object;
1543 case '}':
1544 return token_type::end_object;
1545 case ':':
1546 return token_type::name_separator;
1547 case ',':
1548 return token_type::value_separator;
1549
1550 // literals
1551 case 't':
1552 {
1553 std::array<char_type, 4> true_literal = {{static_cast<char_type>('t'), static_cast<char_type>('r'), static_cast<char_type>('u'), static_cast<char_type>('e')}};
1554 return scan_literal(literal_text: true_literal.data(), length: true_literal.size(), return_type: token_type::literal_true);
1555 }
1556 case 'f':
1557 {
1558 std::array<char_type, 5> false_literal = {{static_cast<char_type>('f'), static_cast<char_type>('a'), static_cast<char_type>('l'), static_cast<char_type>('s'), static_cast<char_type>('e')}};
1559 return scan_literal(literal_text: false_literal.data(), length: false_literal.size(), return_type: token_type::literal_false);
1560 }
1561 case 'n':
1562 {
1563 std::array<char_type, 4> null_literal = {{static_cast<char_type>('n'), static_cast<char_type>('u'), static_cast<char_type>('l'), static_cast<char_type>('l')}};
1564 return scan_literal(literal_text: null_literal.data(), length: null_literal.size(), return_type: token_type::literal_null);
1565 }
1566
1567 // string
1568 case '\"':
1569 return scan_string();
1570
1571 // number
1572 case '-':
1573 case '0':
1574 case '1':
1575 case '2':
1576 case '3':
1577 case '4':
1578 case '5':
1579 case '6':
1580 case '7':
1581 case '8':
1582 case '9':
1583 return scan_number();
1584
1585 // end of input (the null byte is needed when parsing from
1586 // string literals)
1587 case '\0':
1588 case char_traits<char_type>::eof():
1589 return token_type::end_of_input;
1590
1591 // error
1592 default:
1593 error_message = "invalid literal";
1594 return token_type::parse_error;
1595 }
1596 }
1597
1598 private:
1599 /// input adapter; get() obtains each new character from it via get_character()
1600 InputAdapterType ia;
1601
1602 /// whether comments should be ignored (true) or signaled as errors (false);
/// scan() only passes a '/' on to scan_comment() when this flag is set
1603 const bool ignore_comments = false;
1604
1605 /// the current character, i.e. the value most recently returned by get();
/// holds the EOF value until the first character has been read
1606 char_int_type current = char_traits<char_type>::eof();
1607
1608 /// whether the next get() call should just return current (set by unget(),
/// cleared again by get())
1609 bool next_unget = false;
1610
1611 /// the read position: total characters read, characters read in the
/// current line, and lines read; maintained by get() and unget()
1612 position_t position {};
1613
1614 /// raw input token string (for error messages); get() appends every
/// non-EOF character, unget() removes the last one again
1615 std::vector<char_type> token_string {};
1616
1617 /// buffer for variable-length tokens (numbers, strings); filled by add()
/// and handed out by get_string()
1618 string_t token_buffer {};
1619
1620 /// a description of occurred lexer errors, returned by get_error_message()
1621 const char* error_message = "";
1622
1623 // number values: results of the last successful number conversion,
// returned by get_number_integer() / get_number_unsigned() / get_number_float()
1624 number_integer_t value_integer = 0;
1625 number_unsigned_t value_unsigned = 0;
1626 number_float_t value_float = 0;
1627
1628 /// the decimal point
/// NOTE(review): not used anywhere in the visible part of the file; presumably
/// consulted by the number scanner -- confirm
1629 const char_int_type decimal_point_char = '.';
1630};
1631
1632} // namespace detail
1633NLOHMANN_JSON_NAMESPACE_END
1634