| 1 | // __ _____ _____ _____ |
| 2 | // __| | __| | | | JSON for Modern C++ |
| 3 | // | | |__ | | | | | | version 3.11.3 |
| 4 | // |_____|_____|_____|_|___| https://github.com/nlohmann/json |
| 5 | // |
| 6 | // SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me> |
| 7 | // SPDX-License-Identifier: MIT |
| 8 | |
| 9 | #pragma once |
| 10 | |
| 11 | #include <array> // array |
| 12 | #include <clocale> // localeconv |
| 13 | #include <cstddef> // size_t |
| 14 | #include <cstdio> // snprintf |
| 15 | #include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull |
| 16 | #include <initializer_list> // initializer_list |
| 17 | #include <string> // char_traits, string |
| 18 | #include <utility> // move |
| 19 | #include <vector> // vector |
| 20 | |
| 21 | #include <nlohmann/detail/input/input_adapters.hpp> |
| 22 | #include <nlohmann/detail/input/position_t.hpp> |
| 23 | #include <nlohmann/detail/macro_scope.hpp> |
| 24 | #include <nlohmann/detail/meta/type_traits.hpp> |
| 25 | |
| 26 | NLOHMANN_JSON_NAMESPACE_BEGIN |
| 27 | namespace detail |
| 28 | { |
| 29 | |
| 30 | /////////// |
| 31 | // lexer // |
| 32 | /////////// |
| 33 | |
| 34 | template<typename BasicJsonType> |
| 35 | class lexer_base |
| 36 | { |
| 37 | public: |
| 38 | /// token types for the parser |
| 39 | enum class token_type |
| 40 | { |
| 41 | uninitialized, ///< indicating the scanner is uninitialized |
| 42 | literal_true, ///< the `true` literal |
| 43 | literal_false, ///< the `false` literal |
| 44 | literal_null, ///< the `null` literal |
| 45 | value_string, ///< a string -- use get_string() for actual value |
| 46 | value_unsigned, ///< an unsigned integer -- use get_number_unsigned() for actual value |
| 47 | value_integer, ///< a signed integer -- use get_number_integer() for actual value |
| 48 | value_float, ///< an floating point number -- use get_number_float() for actual value |
| 49 | begin_array, ///< the character for array begin `[` |
| 50 | begin_object, ///< the character for object begin `{` |
| 51 | end_array, ///< the character for array end `]` |
| 52 | end_object, ///< the character for object end `}` |
| 53 | name_separator, ///< the name separator `:` |
| 54 | value_separator, ///< the value separator `,` |
| 55 | parse_error, ///< indicating a parse error |
| 56 | end_of_input, ///< indicating the end of the input buffer |
| 57 | literal_or_value ///< a literal or the begin of a value (only for diagnostics) |
| 58 | }; |
| 59 | |
| 60 | /// return name of values of type token_type (only used for errors) |
| 61 | JSON_HEDLEY_RETURNS_NON_NULL |
| 62 | JSON_HEDLEY_CONST |
| 63 | static const char* token_type_name(const token_type t) noexcept |
| 64 | { |
| 65 | switch (t) |
| 66 | { |
| 67 | case token_type::uninitialized: |
| 68 | return "<uninitialized>" ; |
| 69 | case token_type::literal_true: |
| 70 | return "true literal" ; |
| 71 | case token_type::literal_false: |
| 72 | return "false literal" ; |
| 73 | case token_type::literal_null: |
| 74 | return "null literal" ; |
| 75 | case token_type::value_string: |
| 76 | return "string literal" ; |
| 77 | case token_type::value_unsigned: |
| 78 | case token_type::value_integer: |
| 79 | case token_type::value_float: |
| 80 | return "number literal" ; |
| 81 | case token_type::begin_array: |
| 82 | return "'['" ; |
| 83 | case token_type::begin_object: |
| 84 | return "'{'" ; |
| 85 | case token_type::end_array: |
| 86 | return "']'" ; |
| 87 | case token_type::end_object: |
| 88 | return "'}'" ; |
| 89 | case token_type::name_separator: |
| 90 | return "':'" ; |
| 91 | case token_type::value_separator: |
| 92 | return "','" ; |
| 93 | case token_type::parse_error: |
| 94 | return "<parse error>" ; |
| 95 | case token_type::end_of_input: |
| 96 | return "end of input" ; |
| 97 | case token_type::literal_or_value: |
| 98 | return "'[', '{', or a literal" ; |
| 99 | // LCOV_EXCL_START |
| 100 | default: // catch non-enum values |
| 101 | return "unknown token" ; |
| 102 | // LCOV_EXCL_STOP |
| 103 | } |
| 104 | } |
| 105 | }; |
| 106 | /*! |
| 107 | @brief lexical analysis |
| 108 | |
| 109 | This class organizes the lexical analysis during JSON deserialization. |
| 110 | */ |
| 111 | template<typename BasicJsonType, typename InputAdapterType> |
| 112 | class lexer : public lexer_base<BasicJsonType> |
| 113 | { |
| 114 | using number_integer_t = typename BasicJsonType::number_integer_t; |
| 115 | using number_unsigned_t = typename BasicJsonType::number_unsigned_t; |
| 116 | using number_float_t = typename BasicJsonType::number_float_t; |
| 117 | using string_t = typename BasicJsonType::string_t; |
| 118 | using char_type = typename InputAdapterType::char_type; |
| 119 | using char_int_type = typename char_traits<char_type>::int_type; |
| 120 | |
| 121 | public: |
| 122 | using token_type = typename lexer_base<BasicJsonType>::token_type; |
| 123 | |
| 124 | explicit lexer(InputAdapterType&& adapter, bool = false) noexcept |
| 125 | : ia(std::move(adapter)) |
| 126 | , ignore_comments(ignore_comments_) |
| 127 | , decimal_point_char(static_cast<char_int_type>(get_decimal_point())) |
| 128 | {} |
| 129 | |
| 130 | // delete because of pointer members |
| 131 | lexer(const lexer&) = delete; |
| 132 | lexer(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor) |
| 133 | lexer& operator=(lexer&) = delete; |
| 134 | lexer& operator=(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor) |
| 135 | ~lexer() = default; |
| 136 | |
| 137 | private: |
| 138 | ///////////////////// |
| 139 | // locales |
| 140 | ///////////////////// |
| 141 | |
| 142 | /// return the locale-dependent decimal point |
| 143 | JSON_HEDLEY_PURE |
| 144 | static char get_decimal_point() noexcept |
| 145 | { |
| 146 | const auto* loc = localeconv(); |
| 147 | JSON_ASSERT(loc != nullptr); |
| 148 | return (loc->decimal_point == nullptr) ? '.' : *(loc->decimal_point); |
| 149 | } |
| 150 | |
| 151 | ///////////////////// |
| 152 | // scan functions |
| 153 | ///////////////////// |
| 154 | |
| 155 | /*! |
| 156 | @brief get codepoint from 4 hex characters following `\u` |
| 157 | |
| 158 | For input "\u c1 c2 c3 c4" the codepoint is: |
| 159 | (c1 * 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4 |
| 160 | = (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0) |
| 161 | |
| 162 | Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f' |
| 163 | must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The |
| 164 | conversion is done by subtracting the offset (0x30, 0x37, and 0x57) |
| 165 | between the ASCII value of the character and the desired integer value. |
| 166 | |
| 167 | @return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or |
| 168 | non-hex character) |
| 169 | */ |
| 170 | int get_codepoint() |
| 171 | { |
| 172 | // this function only makes sense after reading `\u` |
| 173 | JSON_ASSERT(current == 'u'); |
| 174 | int codepoint = 0; |
| 175 | |
| 176 | const auto factors = { 12u, 8u, 4u, 0u }; |
| 177 | for (const auto factor : factors) |
| 178 | { |
| 179 | get(); |
| 180 | |
| 181 | if (current >= '0' && current <= '9') |
| 182 | { |
| 183 | codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x30u) << factor); |
| 184 | } |
| 185 | else if (current >= 'A' && current <= 'F') |
| 186 | { |
| 187 | codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x37u) << factor); |
| 188 | } |
| 189 | else if (current >= 'a' && current <= 'f') |
| 190 | { |
| 191 | codepoint += static_cast<int>((static_cast<unsigned int>(current) - 0x57u) << factor); |
| 192 | } |
| 193 | else |
| 194 | { |
| 195 | return -1; |
| 196 | } |
| 197 | } |
| 198 | |
| 199 | JSON_ASSERT(0x0000 <= codepoint && codepoint <= 0xFFFF); |
| 200 | return codepoint; |
| 201 | } |
| 202 | |
| 203 | /*! |
| 204 | @brief check if the next byte(s) are inside a given range |
| 205 | |
| 206 | Adds the current byte and, for each passed range, reads a new byte and |
| 207 | checks if it is inside the range. If a violation was detected, set up an |
| 208 | error message and return false. Otherwise, return true. |
| 209 | |
| 210 | @param[in] ranges list of integers; interpreted as list of pairs of |
| 211 | inclusive lower and upper bound, respectively |
| 212 | |
| 213 | @pre The passed list @a ranges must have 2, 4, or 6 elements; that is, |
| 214 | 1, 2, or 3 pairs. This precondition is enforced by an assertion. |
| 215 | |
| 216 | @return true if and only if no range violation was detected |
| 217 | */ |
| 218 | bool next_byte_in_range(std::initializer_list<char_int_type> ranges) |
| 219 | { |
| 220 | JSON_ASSERT(ranges.size() == 2 || ranges.size() == 4 || ranges.size() == 6); |
| 221 | add(c: current); |
| 222 | |
| 223 | for (auto range = ranges.begin(); range != ranges.end(); ++range) |
| 224 | { |
| 225 | get(); |
| 226 | if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) // NOLINT(bugprone-inc-dec-in-conditions) |
| 227 | { |
| 228 | add(c: current); |
| 229 | } |
| 230 | else |
| 231 | { |
| 232 | error_message = "invalid string: ill-formed UTF-8 byte" ; |
| 233 | return false; |
| 234 | } |
| 235 | } |
| 236 | |
| 237 | return true; |
| 238 | } |
| 239 | |
| 240 | /*! |
| 241 | @brief scan a string literal |
| 242 | |
| 243 | This function scans a string according to Sect. 7 of RFC 8259. While |
| 244 | scanning, bytes are escaped and copied into buffer token_buffer. Then the |
| 245 | function returns successfully, token_buffer is *not* null-terminated (as it |
| 246 | may contain \0 bytes), and token_buffer.size() is the number of bytes in the |
| 247 | string. |
| 248 | |
| 249 | @return token_type::value_string if string could be successfully scanned, |
| 250 | token_type::parse_error otherwise |
| 251 | |
| 252 | @note In case of errors, variable error_message contains a textual |
| 253 | description. |
| 254 | */ |
| 255 | token_type scan_string() |
| 256 | { |
| 257 | // reset token_buffer (ignore opening quote) |
| 258 | reset(); |
| 259 | |
| 260 | // we entered the function by reading an open quote |
| 261 | JSON_ASSERT(current == '\"'); |
| 262 | |
| 263 | while (true) |
| 264 | { |
| 265 | // get next character |
| 266 | switch (get()) |
| 267 | { |
| 268 | // end of file while parsing string |
| 269 | case char_traits<char_type>::eof(): |
| 270 | { |
| 271 | error_message = "invalid string: missing closing quote" ; |
| 272 | return token_type::parse_error; |
| 273 | } |
| 274 | |
| 275 | // closing quote |
| 276 | case '\"': |
| 277 | { |
| 278 | return token_type::value_string; |
| 279 | } |
| 280 | |
| 281 | // escapes |
| 282 | case '\\': |
| 283 | { |
| 284 | switch (get()) |
| 285 | { |
| 286 | // quotation mark |
| 287 | case '\"': |
| 288 | add(c: '\"'); |
| 289 | break; |
| 290 | // reverse solidus |
| 291 | case '\\': |
| 292 | add(c: '\\'); |
| 293 | break; |
| 294 | // solidus |
| 295 | case '/': |
| 296 | add(c: '/'); |
| 297 | break; |
| 298 | // backspace |
| 299 | case 'b': |
| 300 | add(c: '\b'); |
| 301 | break; |
| 302 | // form feed |
| 303 | case 'f': |
| 304 | add(c: '\f'); |
| 305 | break; |
| 306 | // line feed |
| 307 | case 'n': |
| 308 | add(c: '\n'); |
| 309 | break; |
| 310 | // carriage return |
| 311 | case 'r': |
| 312 | add(c: '\r'); |
| 313 | break; |
| 314 | // tab |
| 315 | case 't': |
| 316 | add(c: '\t'); |
| 317 | break; |
| 318 | |
| 319 | // unicode escapes |
| 320 | case 'u': |
| 321 | { |
| 322 | const int codepoint1 = get_codepoint(); |
| 323 | int codepoint = codepoint1; // start with codepoint1 |
| 324 | |
| 325 | if (JSON_HEDLEY_UNLIKELY(codepoint1 == -1)) |
| 326 | { |
| 327 | error_message = "invalid string: '\\u' must be followed by 4 hex digits" ; |
| 328 | return token_type::parse_error; |
| 329 | } |
| 330 | |
| 331 | // check if code point is a high surrogate |
| 332 | if (0xD800 <= codepoint1 && codepoint1 <= 0xDBFF) |
| 333 | { |
| 334 | // expect next \uxxxx entry |
| 335 | if (JSON_HEDLEY_LIKELY(get() == '\\' && get() == 'u')) |
| 336 | { |
| 337 | const int codepoint2 = get_codepoint(); |
| 338 | |
| 339 | if (JSON_HEDLEY_UNLIKELY(codepoint2 == -1)) |
| 340 | { |
| 341 | error_message = "invalid string: '\\u' must be followed by 4 hex digits" ; |
| 342 | return token_type::parse_error; |
| 343 | } |
| 344 | |
| 345 | // check if codepoint2 is a low surrogate |
| 346 | if (JSON_HEDLEY_LIKELY(0xDC00 <= codepoint2 && codepoint2 <= 0xDFFF)) |
| 347 | { |
| 348 | // overwrite codepoint |
| 349 | codepoint = static_cast<int>( |
| 350 | // high surrogate occupies the most significant 22 bits |
| 351 | (static_cast<unsigned int>(codepoint1) << 10u) |
| 352 | // low surrogate occupies the least significant 15 bits |
| 353 | + static_cast<unsigned int>(codepoint2) |
| 354 | // there is still the 0xD800, 0xDC00 and 0x10000 noise |
| 355 | // in the result, so we have to subtract with: |
| 356 | // (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00 |
| 357 | - 0x35FDC00u); |
| 358 | } |
| 359 | else |
| 360 | { |
| 361 | error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF" ; |
| 362 | return token_type::parse_error; |
| 363 | } |
| 364 | } |
| 365 | else |
| 366 | { |
| 367 | error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF" ; |
| 368 | return token_type::parse_error; |
| 369 | } |
| 370 | } |
| 371 | else |
| 372 | { |
| 373 | if (JSON_HEDLEY_UNLIKELY(0xDC00 <= codepoint1 && codepoint1 <= 0xDFFF)) |
| 374 | { |
| 375 | error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF" ; |
| 376 | return token_type::parse_error; |
| 377 | } |
| 378 | } |
| 379 | |
| 380 | // result of the above calculation yields a proper codepoint |
| 381 | JSON_ASSERT(0x00 <= codepoint && codepoint <= 0x10FFFF); |
| 382 | |
| 383 | // translate codepoint into bytes |
| 384 | if (codepoint < 0x80) |
| 385 | { |
| 386 | // 1-byte characters: 0xxxxxxx (ASCII) |
| 387 | add(c: static_cast<char_int_type>(codepoint)); |
| 388 | } |
| 389 | else if (codepoint <= 0x7FF) |
| 390 | { |
| 391 | // 2-byte characters: 110xxxxx 10xxxxxx |
| 392 | add(c: static_cast<char_int_type>(0xC0u | (static_cast<unsigned int>(codepoint) >> 6u))); |
| 393 | add(c: static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu))); |
| 394 | } |
| 395 | else if (codepoint <= 0xFFFF) |
| 396 | { |
| 397 | // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx |
| 398 | add(c: static_cast<char_int_type>(0xE0u | (static_cast<unsigned int>(codepoint) >> 12u))); |
| 399 | add(c: static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu))); |
| 400 | add(c: static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu))); |
| 401 | } |
| 402 | else |
| 403 | { |
| 404 | // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| 405 | add(c: static_cast<char_int_type>(0xF0u | (static_cast<unsigned int>(codepoint) >> 18u))); |
| 406 | add(c: static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 12u) & 0x3Fu))); |
| 407 | add(c: static_cast<char_int_type>(0x80u | ((static_cast<unsigned int>(codepoint) >> 6u) & 0x3Fu))); |
| 408 | add(c: static_cast<char_int_type>(0x80u | (static_cast<unsigned int>(codepoint) & 0x3Fu))); |
| 409 | } |
| 410 | |
| 411 | break; |
| 412 | } |
| 413 | |
| 414 | // other characters after escape |
| 415 | default: |
| 416 | error_message = "invalid string: forbidden character after backslash" ; |
| 417 | return token_type::parse_error; |
| 418 | } |
| 419 | |
| 420 | break; |
| 421 | } |
| 422 | |
| 423 | // invalid control characters |
| 424 | case 0x00: |
| 425 | { |
| 426 | error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000" ; |
| 427 | return token_type::parse_error; |
| 428 | } |
| 429 | |
| 430 | case 0x01: |
| 431 | { |
| 432 | error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001" ; |
| 433 | return token_type::parse_error; |
| 434 | } |
| 435 | |
| 436 | case 0x02: |
| 437 | { |
| 438 | error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002" ; |
| 439 | return token_type::parse_error; |
| 440 | } |
| 441 | |
| 442 | case 0x03: |
| 443 | { |
| 444 | error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003" ; |
| 445 | return token_type::parse_error; |
| 446 | } |
| 447 | |
| 448 | case 0x04: |
| 449 | { |
| 450 | error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004" ; |
| 451 | return token_type::parse_error; |
| 452 | } |
| 453 | |
| 454 | case 0x05: |
| 455 | { |
| 456 | error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005" ; |
| 457 | return token_type::parse_error; |
| 458 | } |
| 459 | |
| 460 | case 0x06: |
| 461 | { |
| 462 | error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006" ; |
| 463 | return token_type::parse_error; |
| 464 | } |
| 465 | |
| 466 | case 0x07: |
| 467 | { |
| 468 | error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007" ; |
| 469 | return token_type::parse_error; |
| 470 | } |
| 471 | |
| 472 | case 0x08: |
| 473 | { |
| 474 | error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b" ; |
| 475 | return token_type::parse_error; |
| 476 | } |
| 477 | |
| 478 | case 0x09: |
| 479 | { |
| 480 | error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t" ; |
| 481 | return token_type::parse_error; |
| 482 | } |
| 483 | |
| 484 | case 0x0A: |
| 485 | { |
| 486 | error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n" ; |
| 487 | return token_type::parse_error; |
| 488 | } |
| 489 | |
| 490 | case 0x0B: |
| 491 | { |
| 492 | error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B" ; |
| 493 | return token_type::parse_error; |
| 494 | } |
| 495 | |
| 496 | case 0x0C: |
| 497 | { |
| 498 | error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f" ; |
| 499 | return token_type::parse_error; |
| 500 | } |
| 501 | |
| 502 | case 0x0D: |
| 503 | { |
| 504 | error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r" ; |
| 505 | return token_type::parse_error; |
| 506 | } |
| 507 | |
| 508 | case 0x0E: |
| 509 | { |
| 510 | error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E" ; |
| 511 | return token_type::parse_error; |
| 512 | } |
| 513 | |
| 514 | case 0x0F: |
| 515 | { |
| 516 | error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F" ; |
| 517 | return token_type::parse_error; |
| 518 | } |
| 519 | |
| 520 | case 0x10: |
| 521 | { |
| 522 | error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010" ; |
| 523 | return token_type::parse_error; |
| 524 | } |
| 525 | |
| 526 | case 0x11: |
| 527 | { |
| 528 | error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011" ; |
| 529 | return token_type::parse_error; |
| 530 | } |
| 531 | |
| 532 | case 0x12: |
| 533 | { |
| 534 | error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012" ; |
| 535 | return token_type::parse_error; |
| 536 | } |
| 537 | |
| 538 | case 0x13: |
| 539 | { |
| 540 | error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013" ; |
| 541 | return token_type::parse_error; |
| 542 | } |
| 543 | |
| 544 | case 0x14: |
| 545 | { |
| 546 | error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014" ; |
| 547 | return token_type::parse_error; |
| 548 | } |
| 549 | |
| 550 | case 0x15: |
| 551 | { |
| 552 | error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015" ; |
| 553 | return token_type::parse_error; |
| 554 | } |
| 555 | |
| 556 | case 0x16: |
| 557 | { |
| 558 | error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016" ; |
| 559 | return token_type::parse_error; |
| 560 | } |
| 561 | |
| 562 | case 0x17: |
| 563 | { |
| 564 | error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017" ; |
| 565 | return token_type::parse_error; |
| 566 | } |
| 567 | |
| 568 | case 0x18: |
| 569 | { |
| 570 | error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018" ; |
| 571 | return token_type::parse_error; |
| 572 | } |
| 573 | |
| 574 | case 0x19: |
| 575 | { |
| 576 | error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019" ; |
| 577 | return token_type::parse_error; |
| 578 | } |
| 579 | |
| 580 | case 0x1A: |
| 581 | { |
| 582 | error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A" ; |
| 583 | return token_type::parse_error; |
| 584 | } |
| 585 | |
| 586 | case 0x1B: |
| 587 | { |
| 588 | error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B" ; |
| 589 | return token_type::parse_error; |
| 590 | } |
| 591 | |
| 592 | case 0x1C: |
| 593 | { |
| 594 | error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C" ; |
| 595 | return token_type::parse_error; |
| 596 | } |
| 597 | |
| 598 | case 0x1D: |
| 599 | { |
| 600 | error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D" ; |
| 601 | return token_type::parse_error; |
| 602 | } |
| 603 | |
| 604 | case 0x1E: |
| 605 | { |
| 606 | error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E" ; |
| 607 | return token_type::parse_error; |
| 608 | } |
| 609 | |
| 610 | case 0x1F: |
| 611 | { |
| 612 | error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F" ; |
| 613 | return token_type::parse_error; |
| 614 | } |
| 615 | |
| 616 | // U+0020..U+007F (except U+0022 (quote) and U+005C (backspace)) |
| 617 | case 0x20: |
| 618 | case 0x21: |
| 619 | case 0x23: |
| 620 | case 0x24: |
| 621 | case 0x25: |
| 622 | case 0x26: |
| 623 | case 0x27: |
| 624 | case 0x28: |
| 625 | case 0x29: |
| 626 | case 0x2A: |
| 627 | case 0x2B: |
| 628 | case 0x2C: |
| 629 | case 0x2D: |
| 630 | case 0x2E: |
| 631 | case 0x2F: |
| 632 | case 0x30: |
| 633 | case 0x31: |
| 634 | case 0x32: |
| 635 | case 0x33: |
| 636 | case 0x34: |
| 637 | case 0x35: |
| 638 | case 0x36: |
| 639 | case 0x37: |
| 640 | case 0x38: |
| 641 | case 0x39: |
| 642 | case 0x3A: |
| 643 | case 0x3B: |
| 644 | case 0x3C: |
| 645 | case 0x3D: |
| 646 | case 0x3E: |
| 647 | case 0x3F: |
| 648 | case 0x40: |
| 649 | case 0x41: |
| 650 | case 0x42: |
| 651 | case 0x43: |
| 652 | case 0x44: |
| 653 | case 0x45: |
| 654 | case 0x46: |
| 655 | case 0x47: |
| 656 | case 0x48: |
| 657 | case 0x49: |
| 658 | case 0x4A: |
| 659 | case 0x4B: |
| 660 | case 0x4C: |
| 661 | case 0x4D: |
| 662 | case 0x4E: |
| 663 | case 0x4F: |
| 664 | case 0x50: |
| 665 | case 0x51: |
| 666 | case 0x52: |
| 667 | case 0x53: |
| 668 | case 0x54: |
| 669 | case 0x55: |
| 670 | case 0x56: |
| 671 | case 0x57: |
| 672 | case 0x58: |
| 673 | case 0x59: |
| 674 | case 0x5A: |
| 675 | case 0x5B: |
| 676 | case 0x5D: |
| 677 | case 0x5E: |
| 678 | case 0x5F: |
| 679 | case 0x60: |
| 680 | case 0x61: |
| 681 | case 0x62: |
| 682 | case 0x63: |
| 683 | case 0x64: |
| 684 | case 0x65: |
| 685 | case 0x66: |
| 686 | case 0x67: |
| 687 | case 0x68: |
| 688 | case 0x69: |
| 689 | case 0x6A: |
| 690 | case 0x6B: |
| 691 | case 0x6C: |
| 692 | case 0x6D: |
| 693 | case 0x6E: |
| 694 | case 0x6F: |
| 695 | case 0x70: |
| 696 | case 0x71: |
| 697 | case 0x72: |
| 698 | case 0x73: |
| 699 | case 0x74: |
| 700 | case 0x75: |
| 701 | case 0x76: |
| 702 | case 0x77: |
| 703 | case 0x78: |
| 704 | case 0x79: |
| 705 | case 0x7A: |
| 706 | case 0x7B: |
| 707 | case 0x7C: |
| 708 | case 0x7D: |
| 709 | case 0x7E: |
| 710 | case 0x7F: |
| 711 | { |
| 712 | add(c: current); |
| 713 | break; |
| 714 | } |
| 715 | |
| 716 | // U+0080..U+07FF: bytes C2..DF 80..BF |
| 717 | case 0xC2: |
| 718 | case 0xC3: |
| 719 | case 0xC4: |
| 720 | case 0xC5: |
| 721 | case 0xC6: |
| 722 | case 0xC7: |
| 723 | case 0xC8: |
| 724 | case 0xC9: |
| 725 | case 0xCA: |
| 726 | case 0xCB: |
| 727 | case 0xCC: |
| 728 | case 0xCD: |
| 729 | case 0xCE: |
| 730 | case 0xCF: |
| 731 | case 0xD0: |
| 732 | case 0xD1: |
| 733 | case 0xD2: |
| 734 | case 0xD3: |
| 735 | case 0xD4: |
| 736 | case 0xD5: |
| 737 | case 0xD6: |
| 738 | case 0xD7: |
| 739 | case 0xD8: |
| 740 | case 0xD9: |
| 741 | case 0xDA: |
| 742 | case 0xDB: |
| 743 | case 0xDC: |
| 744 | case 0xDD: |
| 745 | case 0xDE: |
| 746 | case 0xDF: |
| 747 | { |
| 748 | if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({0x80, 0xBF}))) |
| 749 | { |
| 750 | return token_type::parse_error; |
| 751 | } |
| 752 | break; |
| 753 | } |
| 754 | |
| 755 | // U+0800..U+0FFF: bytes E0 A0..BF 80..BF |
| 756 | case 0xE0: |
| 757 | { |
| 758 | if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0xA0, 0xBF, 0x80, 0xBF})))) |
| 759 | { |
| 760 | return token_type::parse_error; |
| 761 | } |
| 762 | break; |
| 763 | } |
| 764 | |
| 765 | // U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF |
| 766 | // U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF |
| 767 | case 0xE1: |
| 768 | case 0xE2: |
| 769 | case 0xE3: |
| 770 | case 0xE4: |
| 771 | case 0xE5: |
| 772 | case 0xE6: |
| 773 | case 0xE7: |
| 774 | case 0xE8: |
| 775 | case 0xE9: |
| 776 | case 0xEA: |
| 777 | case 0xEB: |
| 778 | case 0xEC: |
| 779 | case 0xEE: |
| 780 | case 0xEF: |
| 781 | { |
| 782 | if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF})))) |
| 783 | { |
| 784 | return token_type::parse_error; |
| 785 | } |
| 786 | break; |
| 787 | } |
| 788 | |
| 789 | // U+D000..U+D7FF: bytes ED 80..9F 80..BF |
| 790 | case 0xED: |
| 791 | { |
| 792 | if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x9F, 0x80, 0xBF})))) |
| 793 | { |
| 794 | return token_type::parse_error; |
| 795 | } |
| 796 | break; |
| 797 | } |
| 798 | |
| 799 | // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF |
| 800 | case 0xF0: |
| 801 | { |
| 802 | if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) |
| 803 | { |
| 804 | return token_type::parse_error; |
| 805 | } |
| 806 | break; |
| 807 | } |
| 808 | |
| 809 | // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF |
| 810 | case 0xF1: |
| 811 | case 0xF2: |
| 812 | case 0xF3: |
| 813 | { |
| 814 | if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF})))) |
| 815 | { |
| 816 | return token_type::parse_error; |
| 817 | } |
| 818 | break; |
| 819 | } |
| 820 | |
| 821 | // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF |
| 822 | case 0xF4: |
| 823 | { |
| 824 | if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF})))) |
| 825 | { |
| 826 | return token_type::parse_error; |
| 827 | } |
| 828 | break; |
| 829 | } |
| 830 | |
| 831 | // remaining bytes (80..C1 and F5..FF) are ill-formed |
| 832 | default: |
| 833 | { |
| 834 | error_message = "invalid string: ill-formed UTF-8 byte" ; |
| 835 | return token_type::parse_error; |
| 836 | } |
| 837 | } |
| 838 | } |
| 839 | } |
| 840 | |
| 841 | /*! |
| 842 | * @brief scan a comment |
| 843 | * @return whether comment could be scanned successfully |
| 844 | */ |
| 845 | bool () |
| 846 | { |
| 847 | switch (get()) |
| 848 | { |
| 849 | // single-line comments skip input until a newline or EOF is read |
| 850 | case '/': |
| 851 | { |
| 852 | while (true) |
| 853 | { |
| 854 | switch (get()) |
| 855 | { |
| 856 | case '\n': |
| 857 | case '\r': |
| 858 | case char_traits<char_type>::eof(): |
| 859 | case '\0': |
| 860 | return true; |
| 861 | |
| 862 | default: |
| 863 | break; |
| 864 | } |
| 865 | } |
| 866 | } |
| 867 | |
| 868 | // multi-line comments skip input until */ is read |
| 869 | case '*': |
| 870 | { |
| 871 | while (true) |
| 872 | { |
| 873 | switch (get()) |
| 874 | { |
| 875 | case char_traits<char_type>::eof(): |
| 876 | case '\0': |
| 877 | { |
| 878 | error_message = "invalid comment; missing closing '*/'" ; |
| 879 | return false; |
| 880 | } |
| 881 | |
| 882 | case '*': |
| 883 | { |
| 884 | switch (get()) |
| 885 | { |
| 886 | case '/': |
| 887 | return true; |
| 888 | |
| 889 | default: |
| 890 | { |
| 891 | unget(); |
| 892 | continue; |
| 893 | } |
| 894 | } |
| 895 | } |
| 896 | |
| 897 | default: |
| 898 | continue; |
| 899 | } |
| 900 | } |
| 901 | } |
| 902 | |
| 903 | // unexpected character after reading '/' |
| 904 | default: |
| 905 | { |
| 906 | error_message = "invalid comment; expecting '/' or '*' after '/'" ; |
| 907 | return false; |
| 908 | } |
| 909 | } |
| 910 | } |
| 911 | |
| 912 | JSON_HEDLEY_NON_NULL(2) |
| 913 | static void strtof(float& f, const char* str, char** endptr) noexcept |
| 914 | { |
| 915 | f = std::strtof(nptr: str, endptr: endptr); |
| 916 | } |
| 917 | |
| 918 | JSON_HEDLEY_NON_NULL(2) |
| 919 | static void strtof(double& f, const char* str, char** endptr) noexcept |
| 920 | { |
| 921 | f = std::strtod(nptr: str, endptr: endptr); |
| 922 | } |
| 923 | |
| 924 | JSON_HEDLEY_NON_NULL(2) |
| 925 | static void strtof(long double& f, const char* str, char** endptr) noexcept |
| 926 | { |
| 927 | f = std::strtold(nptr: str, endptr: endptr); |
| 928 | } |
| 929 | |
| 930 | /*! |
| 931 | @brief scan a number literal |
| 932 | |
| 933 | This function scans a string according to Sect. 6 of RFC 8259. |
| 934 | |
| 935 | The function is realized with a deterministic finite state machine derived |
| 936 | from the grammar described in RFC 8259. Starting in state "init", the |
| 937 | input is read and used to determine the next state. Only state "done" |
| 938 | accepts the number. State "error" is a trap state to model errors. In the |
| 939 | table below, "anything" means any character but the ones listed before. |
| 940 | |
| 941 | state | 0 | 1-9 | e E | + | - | . | anything |
| 942 | ---------|----------|----------|----------|---------|---------|----------|----------- |
| 943 | init | zero | any1 | [error] | [error] | minus | [error] | [error] |
| 944 | minus | zero | any1 | [error] | [error] | [error] | [error] | [error] |
| 945 | zero | done | done | exponent | done | done | decimal1 | done |
| 946 | any1 | any1 | any1 | exponent | done | done | decimal1 | done |
| 947 | decimal1 | decimal2 | decimal2 | [error] | [error] | [error] | [error] | [error] |
| 948 | decimal2 | decimal2 | decimal2 | exponent | done | done | done | done |
| 949 | exponent | any2 | any2 | [error] | sign | sign | [error] | [error] |
| 950 | sign | any2 | any2 | [error] | [error] | [error] | [error] | [error] |
| 951 | any2 | any2 | any2 | done | done | done | done | done |
| 952 | |
| 953 | The state machine is realized with one label per state (prefixed with |
| 954 | "scan_number_") and `goto` statements between them. The state machine |
| 955 | contains cycles, but any cycle can be left when EOF is read. Therefore, |
| 956 | the function is guaranteed to terminate. |
| 957 | |
| 958 | During scanning, the read bytes are stored in token_buffer. This string is |
| 959 | then converted to a signed integer, an unsigned integer, or a |
| 960 | floating-point number. |
| 961 | |
| 962 | @return token_type::value_unsigned, token_type::value_integer, or |
| 963 | token_type::value_float if number could be successfully scanned, |
| 964 | token_type::parse_error otherwise |
| 965 | |
| 966 | @note The scanner is independent of the current locale. Internally, the |
| 967 | locale's decimal point is used instead of `.` to work with the |
| 968 | locale-dependent converters. |
| 969 | */ |
| 970 | token_type scan_number() // lgtm [cpp/use-of-goto] |
| 971 | { |
| 972 | // reset token_buffer to store the number's bytes |
| 973 | reset(); |
| 974 | |
| 975 | // the type of the parsed number; initially set to unsigned; will be |
| 976 | // changed if minus sign, decimal point or exponent is read |
| 977 | token_type number_type = token_type::value_unsigned; |
| 978 | |
| 979 | // state (init): we just found out we need to scan a number |
| 980 | switch (current) |
| 981 | { |
| 982 | case '-': |
| 983 | { |
| 984 | add(c: current); |
| 985 | goto scan_number_minus; |
| 986 | } |
| 987 | |
| 988 | case '0': |
| 989 | { |
| 990 | add(c: current); |
| 991 | goto scan_number_zero; |
| 992 | } |
| 993 | |
| 994 | case '1': |
| 995 | case '2': |
| 996 | case '3': |
| 997 | case '4': |
| 998 | case '5': |
| 999 | case '6': |
| 1000 | case '7': |
| 1001 | case '8': |
| 1002 | case '9': |
| 1003 | { |
| 1004 | add(c: current); |
| 1005 | goto scan_number_any1; |
| 1006 | } |
| 1007 | |
| 1008 | // all other characters are rejected outside scan_number() |
| 1009 | default: // LCOV_EXCL_LINE |
| 1010 | JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE |
| 1011 | } |
| 1012 | |
| 1013 | scan_number_minus: |
| 1014 | // state: we just parsed a leading minus sign |
| 1015 | number_type = token_type::value_integer; |
| 1016 | switch (get()) |
| 1017 | { |
| 1018 | case '0': |
| 1019 | { |
| 1020 | add(c: current); |
| 1021 | goto scan_number_zero; |
| 1022 | } |
| 1023 | |
| 1024 | case '1': |
| 1025 | case '2': |
| 1026 | case '3': |
| 1027 | case '4': |
| 1028 | case '5': |
| 1029 | case '6': |
| 1030 | case '7': |
| 1031 | case '8': |
| 1032 | case '9': |
| 1033 | { |
| 1034 | add(c: current); |
| 1035 | goto scan_number_any1; |
| 1036 | } |
| 1037 | |
| 1038 | default: |
| 1039 | { |
| 1040 | error_message = "invalid number; expected digit after '-'" ; |
| 1041 | return token_type::parse_error; |
| 1042 | } |
| 1043 | } |
| 1044 | |
| 1045 | scan_number_zero: |
| 1046 | // state: we just parse a zero (maybe with a leading minus sign) |
| 1047 | switch (get()) |
| 1048 | { |
| 1049 | case '.': |
| 1050 | { |
| 1051 | add(c: decimal_point_char); |
| 1052 | goto scan_number_decimal1; |
| 1053 | } |
| 1054 | |
| 1055 | case 'e': |
| 1056 | case 'E': |
| 1057 | { |
| 1058 | add(c: current); |
| 1059 | goto scan_number_exponent; |
| 1060 | } |
| 1061 | |
| 1062 | default: |
| 1063 | goto scan_number_done; |
| 1064 | } |
| 1065 | |
| 1066 | scan_number_any1: |
| 1067 | // state: we just parsed a number 0-9 (maybe with a leading minus sign) |
| 1068 | switch (get()) |
| 1069 | { |
| 1070 | case '0': |
| 1071 | case '1': |
| 1072 | case '2': |
| 1073 | case '3': |
| 1074 | case '4': |
| 1075 | case '5': |
| 1076 | case '6': |
| 1077 | case '7': |
| 1078 | case '8': |
| 1079 | case '9': |
| 1080 | { |
| 1081 | add(c: current); |
| 1082 | goto scan_number_any1; |
| 1083 | } |
| 1084 | |
| 1085 | case '.': |
| 1086 | { |
| 1087 | add(c: decimal_point_char); |
| 1088 | goto scan_number_decimal1; |
| 1089 | } |
| 1090 | |
| 1091 | case 'e': |
| 1092 | case 'E': |
| 1093 | { |
| 1094 | add(c: current); |
| 1095 | goto scan_number_exponent; |
| 1096 | } |
| 1097 | |
| 1098 | default: |
| 1099 | goto scan_number_done; |
| 1100 | } |
| 1101 | |
| 1102 | scan_number_decimal1: |
| 1103 | // state: we just parsed a decimal point |
| 1104 | number_type = token_type::value_float; |
| 1105 | switch (get()) |
| 1106 | { |
| 1107 | case '0': |
| 1108 | case '1': |
| 1109 | case '2': |
| 1110 | case '3': |
| 1111 | case '4': |
| 1112 | case '5': |
| 1113 | case '6': |
| 1114 | case '7': |
| 1115 | case '8': |
| 1116 | case '9': |
| 1117 | { |
| 1118 | add(c: current); |
| 1119 | goto scan_number_decimal2; |
| 1120 | } |
| 1121 | |
| 1122 | default: |
| 1123 | { |
| 1124 | error_message = "invalid number; expected digit after '.'" ; |
| 1125 | return token_type::parse_error; |
| 1126 | } |
| 1127 | } |
| 1128 | |
| 1129 | scan_number_decimal2: |
| 1130 | // we just parsed at least one number after a decimal point |
| 1131 | switch (get()) |
| 1132 | { |
| 1133 | case '0': |
| 1134 | case '1': |
| 1135 | case '2': |
| 1136 | case '3': |
| 1137 | case '4': |
| 1138 | case '5': |
| 1139 | case '6': |
| 1140 | case '7': |
| 1141 | case '8': |
| 1142 | case '9': |
| 1143 | { |
| 1144 | add(c: current); |
| 1145 | goto scan_number_decimal2; |
| 1146 | } |
| 1147 | |
| 1148 | case 'e': |
| 1149 | case 'E': |
| 1150 | { |
| 1151 | add(c: current); |
| 1152 | goto scan_number_exponent; |
| 1153 | } |
| 1154 | |
| 1155 | default: |
| 1156 | goto scan_number_done; |
| 1157 | } |
| 1158 | |
| 1159 | scan_number_exponent: |
| 1160 | // we just parsed an exponent |
| 1161 | number_type = token_type::value_float; |
| 1162 | switch (get()) |
| 1163 | { |
| 1164 | case '+': |
| 1165 | case '-': |
| 1166 | { |
| 1167 | add(c: current); |
| 1168 | goto scan_number_sign; |
| 1169 | } |
| 1170 | |
| 1171 | case '0': |
| 1172 | case '1': |
| 1173 | case '2': |
| 1174 | case '3': |
| 1175 | case '4': |
| 1176 | case '5': |
| 1177 | case '6': |
| 1178 | case '7': |
| 1179 | case '8': |
| 1180 | case '9': |
| 1181 | { |
| 1182 | add(c: current); |
| 1183 | goto scan_number_any2; |
| 1184 | } |
| 1185 | |
| 1186 | default: |
| 1187 | { |
| 1188 | error_message = |
| 1189 | "invalid number; expected '+', '-', or digit after exponent" ; |
| 1190 | return token_type::parse_error; |
| 1191 | } |
| 1192 | } |
| 1193 | |
| 1194 | scan_number_sign: |
| 1195 | // we just parsed an exponent sign |
| 1196 | switch (get()) |
| 1197 | { |
| 1198 | case '0': |
| 1199 | case '1': |
| 1200 | case '2': |
| 1201 | case '3': |
| 1202 | case '4': |
| 1203 | case '5': |
| 1204 | case '6': |
| 1205 | case '7': |
| 1206 | case '8': |
| 1207 | case '9': |
| 1208 | { |
| 1209 | add(c: current); |
| 1210 | goto scan_number_any2; |
| 1211 | } |
| 1212 | |
| 1213 | default: |
| 1214 | { |
| 1215 | error_message = "invalid number; expected digit after exponent sign" ; |
| 1216 | return token_type::parse_error; |
| 1217 | } |
| 1218 | } |
| 1219 | |
| 1220 | scan_number_any2: |
| 1221 | // we just parsed a number after the exponent or exponent sign |
| 1222 | switch (get()) |
| 1223 | { |
| 1224 | case '0': |
| 1225 | case '1': |
| 1226 | case '2': |
| 1227 | case '3': |
| 1228 | case '4': |
| 1229 | case '5': |
| 1230 | case '6': |
| 1231 | case '7': |
| 1232 | case '8': |
| 1233 | case '9': |
| 1234 | { |
| 1235 | add(c: current); |
| 1236 | goto scan_number_any2; |
| 1237 | } |
| 1238 | |
| 1239 | default: |
| 1240 | goto scan_number_done; |
| 1241 | } |
| 1242 | |
| 1243 | scan_number_done: |
| 1244 | // unget the character after the number (we only read it to know that |
| 1245 | // we are done scanning a number) |
| 1246 | unget(); |
| 1247 | |
| 1248 | char* endptr = nullptr; // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg) |
| 1249 | errno = 0; |
| 1250 | |
| 1251 | // try to parse integers first and fall back to floats |
| 1252 | if (number_type == token_type::value_unsigned) |
| 1253 | { |
| 1254 | const auto x = std::strtoull(nptr: token_buffer.data(), endptr: &endptr, base: 10); |
| 1255 | |
| 1256 | // we checked the number format before |
| 1257 | JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); |
| 1258 | |
| 1259 | if (errno == 0) |
| 1260 | { |
| 1261 | value_unsigned = static_cast<number_unsigned_t>(x); |
| 1262 | if (value_unsigned == x) |
| 1263 | { |
| 1264 | return token_type::value_unsigned; |
| 1265 | } |
| 1266 | } |
| 1267 | } |
| 1268 | else if (number_type == token_type::value_integer) |
| 1269 | { |
| 1270 | const auto x = std::strtoll(nptr: token_buffer.data(), endptr: &endptr, base: 10); |
| 1271 | |
| 1272 | // we checked the number format before |
| 1273 | JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); |
| 1274 | |
| 1275 | if (errno == 0) |
| 1276 | { |
| 1277 | value_integer = static_cast<number_integer_t>(x); |
| 1278 | if (value_integer == x) |
| 1279 | { |
| 1280 | return token_type::value_integer; |
| 1281 | } |
| 1282 | } |
| 1283 | } |
| 1284 | |
| 1285 | // this code is reached if we parse a floating-point number or if an |
| 1286 | // integer conversion above failed |
| 1287 | strtof(value_float, token_buffer.data(), &endptr); |
| 1288 | |
| 1289 | // we checked the number format before |
| 1290 | JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size()); |
| 1291 | |
| 1292 | return token_type::value_float; |
| 1293 | } |
| 1294 | |
| 1295 | /*! |
| 1296 | @param[in] literal_text the literal text to expect |
| 1297 | @param[in] length the length of the passed literal text |
| 1298 | @param[in] return_type the token type to return on success |
| 1299 | */ |
| 1300 | JSON_HEDLEY_NON_NULL(2) |
| 1301 | token_type scan_literal(const char_type* literal_text, const std::size_t length, |
| 1302 | token_type return_type) |
| 1303 | { |
| 1304 | JSON_ASSERT(char_traits<char_type>::to_char_type(current) == literal_text[0]); |
| 1305 | for (std::size_t i = 1; i < length; ++i) |
| 1306 | { |
| 1307 | if (JSON_HEDLEY_UNLIKELY(char_traits<char_type>::to_char_type(get()) != literal_text[i])) |
| 1308 | { |
| 1309 | error_message = "invalid literal" ; |
| 1310 | return token_type::parse_error; |
| 1311 | } |
| 1312 | } |
| 1313 | return return_type; |
| 1314 | } |
| 1315 | |
| 1316 | ///////////////////// |
| 1317 | // input management |
| 1318 | ///////////////////// |
| 1319 | |
| 1320 | /// reset token_buffer; current character is beginning of token |
| 1321 | void reset() noexcept |
| 1322 | { |
| 1323 | token_buffer.clear(); |
| 1324 | token_string.clear(); |
| 1325 | token_string.push_back(char_traits<char_type>::to_char_type(current)); |
| 1326 | } |
| 1327 | |
| 1328 | /* |
| 1329 | @brief get next character from the input |
| 1330 | |
| 1331 | This function provides the interface to the used input adapter. It does |
| 1332 | not throw in case the input reached EOF, but returns a |
| 1333 | `char_traits<char>::eof()` in that case. Stores the scanned characters |
| 1334 | for use in error messages. |
| 1335 | |
| 1336 | @return character read from the input |
| 1337 | */ |
| 1338 | char_int_type get() |
| 1339 | { |
| 1340 | ++position.chars_read_total; |
| 1341 | ++position.chars_read_current_line; |
| 1342 | |
| 1343 | if (next_unget) |
| 1344 | { |
| 1345 | // just reset the next_unget variable and work with current |
| 1346 | next_unget = false; |
| 1347 | } |
| 1348 | else |
| 1349 | { |
| 1350 | current = ia.get_character(); |
| 1351 | } |
| 1352 | |
| 1353 | if (JSON_HEDLEY_LIKELY(current != char_traits<char_type>::eof())) |
| 1354 | { |
| 1355 | token_string.push_back(char_traits<char_type>::to_char_type(current)); |
| 1356 | } |
| 1357 | |
| 1358 | if (current == '\n') |
| 1359 | { |
| 1360 | ++position.lines_read; |
| 1361 | position.chars_read_current_line = 0; |
| 1362 | } |
| 1363 | |
| 1364 | return current; |
| 1365 | } |
| 1366 | |
| 1367 | /*! |
| 1368 | @brief unget current character (read it again on next get) |
| 1369 | |
| 1370 | We implement unget by setting variable next_unget to true. The input is not |
| 1371 | changed - we just simulate ungetting by modifying chars_read_total, |
| 1372 | chars_read_current_line, and token_string. The next call to get() will |
| 1373 | behave as if the unget character is read again. |
| 1374 | */ |
| 1375 | void unget() |
| 1376 | { |
| 1377 | next_unget = true; |
| 1378 | |
| 1379 | --position.chars_read_total; |
| 1380 | |
| 1381 | // in case we "unget" a newline, we have to also decrement the lines_read |
| 1382 | if (position.chars_read_current_line == 0) |
| 1383 | { |
| 1384 | if (position.lines_read > 0) |
| 1385 | { |
| 1386 | --position.lines_read; |
| 1387 | } |
| 1388 | } |
| 1389 | else |
| 1390 | { |
| 1391 | --position.chars_read_current_line; |
| 1392 | } |
| 1393 | |
| 1394 | if (JSON_HEDLEY_LIKELY(current != char_traits<char_type>::eof())) |
| 1395 | { |
| 1396 | JSON_ASSERT(!token_string.empty()); |
| 1397 | token_string.pop_back(); |
| 1398 | } |
| 1399 | } |
| 1400 | |
| 1401 | /// add a character to token_buffer |
| 1402 | void add(char_int_type c) |
| 1403 | { |
| 1404 | token_buffer.push_back(static_cast<typename string_t::value_type>(c)); |
| 1405 | } |
| 1406 | |
| 1407 | public: |
| 1408 | ///////////////////// |
| 1409 | // value getters |
| 1410 | ///////////////////// |
| 1411 | |
/// return integer value
/// @return the stored value_integer; scan_number() writes this member on its
///         signed-integer path, so read it after a token_type::value_integer
constexpr number_integer_t get_number_integer() const noexcept
{
    return value_integer;
}
| 1417 | |
/// return unsigned integer value
/// @return the stored value_unsigned; scan_number() writes this member on its
///         unsigned path, so read it after a token_type::value_unsigned
constexpr number_unsigned_t get_number_unsigned() const noexcept
{
    return value_unsigned;
}
| 1423 | |
/// return floating-point value
/// @return the stored value_float; scan_number() fills this member when it
///         returns token_type::value_float (including the fallback taken
///         when an integer conversion did not succeed)
constexpr number_float_t get_number_float() const noexcept
{
    return value_float;
}
| 1429 | |
/// return current string value (implicitly resets the token; useful only once)
/// @return mutable reference to token_buffer; this function itself neither
///         copies nor clears the buffer
/// @note NOTE(review): "useful only once" presumably refers to callers that
///       move from the returned reference - confirm against the call sites
string_t& get_string()
{
    return token_buffer;
}
| 1435 | |
| 1436 | ///////////////////// |
| 1437 | // diagnostics |
| 1438 | ///////////////////// |
| 1439 | |
/// return position of last read token
/// @return a copy of the position counters (chars_read_total,
///         chars_read_current_line, lines_read) as maintained by get() and
///         unget()
constexpr position_t get_position() const noexcept
{
    return position;
}
| 1445 | |
| 1446 | /// return the last read token (for errors only). Will never contain EOF |
| 1447 | /// (an arbitrary value that is not a valid char value, often -1), because |
| 1448 | /// 255 may legitimately occur. May contain NUL, which should be escaped. |
| 1449 | std::string get_token_string() const |
| 1450 | { |
| 1451 | // escape control characters |
| 1452 | std::string result; |
| 1453 | for (const auto c : token_string) |
| 1454 | { |
| 1455 | if (static_cast<unsigned char>(c) <= '\x1F') |
| 1456 | { |
| 1457 | // escape control characters |
| 1458 | std::array<char, 9> cs{._M_elems: {}}; |
| 1459 | static_cast<void>((std::snprintf)(s: cs.data(), maxlen: cs.size(), format: "<U+%.4X>" , static_cast<unsigned char>(c))); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg) |
| 1460 | result += cs.data(); |
| 1461 | } |
| 1462 | else |
| 1463 | { |
| 1464 | // add character as is |
| 1465 | result.push_back(c: static_cast<std::string::value_type>(c)); |
| 1466 | } |
| 1467 | } |
| 1468 | |
| 1469 | return result; |
| 1470 | } |
| 1471 | |
/// return syntax error message
/// @return the current error_message pointer; the member is initialized to an
///         empty string and, in the code visible here, is only ever assigned
///         string literals, which matches the non-null annotation below
JSON_HEDLEY_RETURNS_NON_NULL
constexpr const char* get_error_message() const noexcept
{
    return error_message;
}
| 1478 | |
| 1479 | ///////////////////// |
| 1480 | // actual scanner |
| 1481 | ///////////////////// |
| 1482 | |
| 1483 | /*! |
| 1484 | @brief skip the UTF-8 byte order mark |
| 1485 | @return true iff there is no BOM or the correct BOM has been skipped |
| 1486 | */ |
| 1487 | bool skip_bom() |
| 1488 | { |
| 1489 | if (get() == 0xEF) |
| 1490 | { |
| 1491 | // check if we completely parse the BOM |
| 1492 | return get() == 0xBB && get() == 0xBF; |
| 1493 | } |
| 1494 | |
| 1495 | // the first character is not the beginning of the BOM; unget it to |
| 1496 | // process is later |
| 1497 | unget(); |
| 1498 | return true; |
| 1499 | } |
| 1500 | |
| 1501 | void skip_whitespace() |
| 1502 | { |
| 1503 | do |
| 1504 | { |
| 1505 | get(); |
| 1506 | } |
| 1507 | while (current == ' ' || current == '\t' || current == '\n' || current == '\r'); |
| 1508 | } |
| 1509 | |
| 1510 | token_type scan() |
| 1511 | { |
| 1512 | // initially, skip the BOM |
| 1513 | if (position.chars_read_total == 0 && !skip_bom()) |
| 1514 | { |
| 1515 | error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given" ; |
| 1516 | return token_type::parse_error; |
| 1517 | } |
| 1518 | |
| 1519 | // read next character and ignore whitespace |
| 1520 | skip_whitespace(); |
| 1521 | |
| 1522 | // ignore comments |
| 1523 | while (ignore_comments && current == '/') |
| 1524 | { |
| 1525 | if (!scan_comment()) |
| 1526 | { |
| 1527 | return token_type::parse_error; |
| 1528 | } |
| 1529 | |
| 1530 | // skip following whitespace |
| 1531 | skip_whitespace(); |
| 1532 | } |
| 1533 | |
| 1534 | switch (current) |
| 1535 | { |
| 1536 | // structural characters |
| 1537 | case '[': |
| 1538 | return token_type::begin_array; |
| 1539 | case ']': |
| 1540 | return token_type::end_array; |
| 1541 | case '{': |
| 1542 | return token_type::begin_object; |
| 1543 | case '}': |
| 1544 | return token_type::end_object; |
| 1545 | case ':': |
| 1546 | return token_type::name_separator; |
| 1547 | case ',': |
| 1548 | return token_type::value_separator; |
| 1549 | |
| 1550 | // literals |
| 1551 | case 't': |
| 1552 | { |
| 1553 | std::array<char_type, 4> true_literal = {{static_cast<char_type>('t'), static_cast<char_type>('r'), static_cast<char_type>('u'), static_cast<char_type>('e')}}; |
| 1554 | return scan_literal(literal_text: true_literal.data(), length: true_literal.size(), return_type: token_type::literal_true); |
| 1555 | } |
| 1556 | case 'f': |
| 1557 | { |
| 1558 | std::array<char_type, 5> false_literal = {{static_cast<char_type>('f'), static_cast<char_type>('a'), static_cast<char_type>('l'), static_cast<char_type>('s'), static_cast<char_type>('e')}}; |
| 1559 | return scan_literal(literal_text: false_literal.data(), length: false_literal.size(), return_type: token_type::literal_false); |
| 1560 | } |
| 1561 | case 'n': |
| 1562 | { |
| 1563 | std::array<char_type, 4> null_literal = {{static_cast<char_type>('n'), static_cast<char_type>('u'), static_cast<char_type>('l'), static_cast<char_type>('l')}}; |
| 1564 | return scan_literal(literal_text: null_literal.data(), length: null_literal.size(), return_type: token_type::literal_null); |
| 1565 | } |
| 1566 | |
| 1567 | // string |
| 1568 | case '\"': |
| 1569 | return scan_string(); |
| 1570 | |
| 1571 | // number |
| 1572 | case '-': |
| 1573 | case '0': |
| 1574 | case '1': |
| 1575 | case '2': |
| 1576 | case '3': |
| 1577 | case '4': |
| 1578 | case '5': |
| 1579 | case '6': |
| 1580 | case '7': |
| 1581 | case '8': |
| 1582 | case '9': |
| 1583 | return scan_number(); |
| 1584 | |
| 1585 | // end of input (the null byte is needed when parsing from |
| 1586 | // string literals) |
| 1587 | case '\0': |
| 1588 | case char_traits<char_type>::eof(): |
| 1589 | return token_type::end_of_input; |
| 1590 | |
| 1591 | // error |
| 1592 | default: |
| 1593 | error_message = "invalid literal" ; |
| 1594 | return token_type::parse_error; |
| 1595 | } |
| 1596 | } |
| 1597 | |
| 1598 | private: |
| 1599 | /// input adapter |
| 1600 | InputAdapterType ia; |
| 1601 | |
| 1602 | /// whether comments should be ignored (true) or signaled as errors (false) |
| 1603 | const bool = false; |
| 1604 | |
| 1605 | /// the current character |
| 1606 | char_int_type current = char_traits<char_type>::eof(); |
| 1607 | |
| 1608 | /// whether the next get() call should just return current |
| 1609 | bool next_unget = false; |
| 1610 | |
| 1611 | /// the start position of the current token |
| 1612 | position_t position {}; |
| 1613 | |
| 1614 | /// raw input token string (for error messages) |
| 1615 | std::vector<char_type> token_string {}; |
| 1616 | |
| 1617 | /// buffer for variable-length tokens (numbers, strings) |
| 1618 | string_t token_buffer {}; |
| 1619 | |
| 1620 | /// a description of occurred lexer errors |
| 1621 | const char* error_message = "" ; |
| 1622 | |
| 1623 | // number values |
| 1624 | number_integer_t value_integer = 0; |
| 1625 | number_unsigned_t value_unsigned = 0; |
| 1626 | number_float_t value_float = 0; |
| 1627 | |
| 1628 | /// the decimal point |
| 1629 | const char_int_type decimal_point_char = '.'; |
| 1630 | }; |
| 1631 | |
| 1632 | } // namespace detail |
| 1633 | NLOHMANN_JSON_NAMESPACE_END |
| 1634 | |