lexer.hpp source code [include/nlohmann/detail/input/lexer.hpp]

//     __ _____ _____ _____
//  __|  |   __|     |   | |  JSON for Modern C++
// |  |  |__   |  |  | | | |  version 3.11.3
// |_____|_____|_____|_|___|  https://github.com/nlohmann/json
//
// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann <https://nlohmann.me>
// SPDX-License-Identifier: MIT
8
9	#pragma once
10
11	#include <array> // array
12	#include <clocale> // localeconv
13	#include <cstddef> // size_t
14	#include <cstdio> // snprintf
15	#include <cstdlib> // strtof, strtod, strtold, strtoll, strtoull
16	#include <initializer_list> // initializer_list
17	#include <string> // char_traits, string
18	#include <utility> // move
19	#include <vector> // vector
20
21	#include <nlohmann/detail/input/input_adapters.hpp>
22	#include <nlohmann/detail/input/position_t.hpp>
23	#include <nlohmann/detail/macro_scope.hpp>
24	#include <nlohmann/detail/meta/type_traits.hpp>
25
26	NLOHMANN_JSON_NAMESPACE_BEGIN
27	namespace detail
28	{
29
30	///////////
31	// lexer //
32	///////////
33
34	template<typename BasicJsonType>
35	class lexer_base
36	{
37	public:
38	/// token types for the parser
39	enum class token_type
40	{
41	uninitialized, ///< indicating the scanner is uninitialized
42	literal_true, ///< the `true` literal
43	literal_false, ///< the `false` literal
44	literal_null, ///< the `null` literal
45	value_string, ///< a string -- use get_string() for actual value
46	value_unsigned, ///< an unsigned integer -- use get_number_unsigned() for actual value
47	value_integer, ///< a signed integer -- use get_number_integer() for actual value
48	value_float, ///< an floating point number -- use get_number_float() for actual value
49	begin_array, ///< the character for array begin `[`
50	begin_object, ///< the character for object begin `{`
51	end_array, ///< the character for array end `]`
52	end_object, ///< the character for object end `}`
53	name_separator, ///< the name separator `:`
54	value_separator, ///< the value separator `,`
55	parse_error, ///< indicating a parse error
56	end_of_input, ///< indicating the end of the input buffer
57	literal_or_value ///< a literal or the begin of a value (only for diagnostics)
58	};
59
60	/// return name of values of type token_type (only used for errors)
61	JSON_HEDLEY_RETURNS_NON_NULL
62	JSON_HEDLEY_CONST
63	static const char* token_type_name(const token_type t) noexcept
64	{
65	switch (t)
66	{
67	case token_type::uninitialized:
68	return "<uninitialized>";
69	case token_type::literal_true:
70	return "true literal";
71	case token_type::literal_false:
72	return "false literal";
73	case token_type::literal_null:
74	return "null literal";
75	case token_type::value_string:
76	return "string literal";
77	case token_type::value_unsigned:
78	case token_type::value_integer:
79	case token_type::value_float:
80	return "number literal";
81	case token_type::begin_array:
82	return "'['";
83	case token_type::begin_object:
84	return "'{'";
85	case token_type::end_array:
86	return "']'";
87	case token_type::end_object:
88	return "'}'";
89	case token_type::name_separator:
90	return "':'";
91	case token_type::value_separator:
92	return "','";
93	case token_type::parse_error:
94	return "<parse error>";
95	case token_type::end_of_input:
96	return "end of input";
97	case token_type::literal_or_value:
98	return "'[', '{', or a literal";
99	// LCOV_EXCL_START
100	default: // catch non-enum values
101	return "unknown token";
102	// LCOV_EXCL_STOP
103	}
104	}
105	};
/*!
@brief lexical analysis

This class organizes the lexical analysis during JSON deserialization.
*/
111	template<typename BasicJsonType, typename InputAdapterType>
112	class lexer : public lexer_base<BasicJsonType>
113	{
114	using number_integer_t = typename BasicJsonType::number_integer_t;
115	using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
116	using number_float_t = typename BasicJsonType::number_float_t;
117	using string_t = typename BasicJsonType::string_t;
118	using char_type = typename InputAdapterType::char_type;
119	using char_int_type = typename char_traits<char_type>::int_type;
120
121	public:
122	using token_type = typename lexer_base<BasicJsonType>::token_type;
123
124	explicit lexer(InputAdapterType&& adapter, bool ignore_comments_ = false) noexcept
125	: ia(std::move(adapter))
126	, ignore_comments(ignore_comments_)
127	, decimal_point_char(static_cast<char_int_type>(get_decimal_point()))
128	{}
129
130	// delete because of pointer members
131	lexer(const lexer&) = delete;
132	lexer(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
133	lexer& operator=(lexer&) = delete;
134	lexer& operator=(lexer&&) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor)
135	~lexer() = default;
136
137	private:
138	/////////////////////
139	// locales
140	/////////////////////
141
142	/// return the locale-dependent decimal point
143	JSON_HEDLEY_PURE
144	static char get_decimal_point() noexcept
145	{
146	const auto* loc = localeconv();
147	JSON_ASSERT(loc != nullptr);
148	return (loc->decimal_point == nullptr) ? `'.'` : *(loc->decimal_point);
149	}
150
151	/////////////////////
152	// scan functions
153	/////////////////////
154
155	/!*
156	@brief get codepoint from 4 hex characters following `\u`
157
158	For input "\u c1 c2 c3 c4" the codepoint is:
159	(c1 0x1000) + (c2 * 0x0100) + (c3 * 0x0010) + c4*
160	= (c1 << 12) + (c2 << 8) + (c3 << 4) + (c4 << 0)
161
162	Furthermore, the possible characters '0'..'9', 'A'..'F', and 'a'..'f'
163	must be converted to the integers 0x0..0x9, 0xA..0xF, 0xA..0xF, resp. The
164	conversion is done by subtracting the offset (0x30, 0x37, and 0x57)
165	between the ASCII value of the character and the desired integer value.
166
167	@return codepoint (0x0000..0xFFFF) or -1 in case of an error (e.g. EOF or
168	non-hex character)
169	*/
170	int get_codepoint()
171	{
172	// this function only makes sense after reading `\u`
173	JSON_ASSERT(current == `'u'`);
174	int codepoint = `0`;
175
176	const auto factors = { `12u`, `8u`, `4u`, `0u` };
177	for (const auto factor : factors)
178	{
179	get();
180
181	if (current >= `'0'` && current <= `'9'`)
182	{
183	codepoint += static_cast<int>((static_cast<unsigned int>(current) - `0x30u`) << factor);
184	}
185	else if (current >= `'A'` && current <= `'F'`)
186	{
187	codepoint += static_cast<int>((static_cast<unsigned int>(current) - `0x37u`) << factor);
188	}
189	else if (current >= `'a'` && current <= `'f'`)
190	{
191	codepoint += static_cast<int>((static_cast<unsigned int>(current) - `0x57u`) << factor);
192	}
193	else
194	{
195	return -`1`;
196	}
197	}
198
199	JSON_ASSERT(`0x0000` <= codepoint && codepoint <= `0xFFFF`);
200	return codepoint;
201	}
202
203	/!*
204	@brief check if the next byte(s) are inside a given range
205
206	Adds the current byte and, for each passed range, reads a new byte and
207	checks if it is inside the range. If a violation was detected, set up an
208	error message and return false. Otherwise, return true.
209
210	@param[in] ranges list of integers; interpreted as list of pairs of
211	inclusive lower and upper bound, respectively
212
213	@pre The passed list @a ranges must have 2, 4, or 6 elements; that is,
214	1, 2, or 3 pairs. This precondition is enforced by an assertion.
215
216	@return true if and only if no range violation was detected
217	*/
218	bool next_byte_in_range(std::initializer_list<char_int_type> ranges)
219	{
220	JSON_ASSERT(ranges.size() == `2` \|\| ranges.size() == `4` \|\| ranges.size() == `6`);
221	add(c: current);
222
223	for (auto range = ranges.begin(); range != ranges.end(); ++range)
224	{
225	get();
226	if (JSON_HEDLEY_LIKELY(range <= current && current <= (++range))) // NOLINT(bugprone-inc-dec-in-conditions)
227	{
228	add(c: current);
229	}
230	else
231	{
232	error_message = "invalid string: ill-formed UTF-8 byte";
233	return false;
234	}
235	}
236
237	return true;
238	}
239
240	/!*
241	@brief scan a string literal
242
243	This function scans a string according to Sect. 7 of RFC 8259. While
244	scanning, bytes are escaped and copied into buffer token_buffer. Then the
245	function returns successfully, token_buffer is not* null-terminated (as it*
246	may contain \0 bytes), and token_buffer.size() is the number of bytes in the
247	string.
248
249	@return token_type::value_string if string could be successfully scanned,
250	token_type::parse_error otherwise
251
252	@note In case of errors, variable error_message contains a textual
253	description.
254	*/
255	token_type scan_string()
256	{
257	// reset token_buffer (ignore opening quote)
258	reset();
259
260	// we entered the function by reading an open quote
261	JSON_ASSERT(current == `'\"'`);
262
263	while (true)
264	{
265	// get next character
266	switch (get())
267	{
268	// end of file while parsing string
269	case char_traits<char_type>::eof():
270	{
271	error_message = "invalid string: missing closing quote";
272	return token_type::parse_error;
273	}
274
275	// closing quote
276	case `'\"'`:
277	{
278	return token_type::value_string;
279	}
280
281	// escapes
282	case `'\\'`:
283	{
284	switch (get())
285	{
286	// quotation mark
287	case `'\"'`:
288	add(c: `'\"'`);
289	break;
290	// reverse solidus
291	case `'\\'`:
292	add(c: `'\\'`);
293	break;
294	// solidus
295	case `'/'`:
296	add(c: `'/'`);
297	break;
298	// backspace
299	case `'b'`:
300	add(c: `'\b'`);
301	break;
302	// form feed
303	case `'f'`:
304	add(c: `'\f'`);
305	break;
306	// line feed
307	case `'n'`:
308	add(c: `'\n'`);
309	break;
310	// carriage return
311	case `'r'`:
312	add(c: `'\r'`);
313	break;
314	// tab
315	case `'t'`:
316	add(c: `'\t'`);
317	break;
318
319	// unicode escapes
320	case `'u'`:
321	{
322	const int codepoint1 = get_codepoint();
323	int codepoint = codepoint1; // start with codepoint1
324
325	if (JSON_HEDLEY_UNLIKELY(codepoint1 == -`1`))
326	{
327	error_message = "invalid string: '\\u' must be followed by 4 hex digits";
328	return token_type::parse_error;
329	}
330
331	// check if code point is a high surrogate
332	if (`0xD800` <= codepoint1 && codepoint1 <= `0xDBFF`)
333	{
334	// expect next \uxxxx entry
335	if (JSON_HEDLEY_LIKELY(get() == `'\\'` && get() == `'u'`))
336	{
337	const int codepoint2 = get_codepoint();
338
339	if (JSON_HEDLEY_UNLIKELY(codepoint2 == -`1`))
340	{
341	error_message = "invalid string: '\\u' must be followed by 4 hex digits";
342	return token_type::parse_error;
343	}
344
345	// check if codepoint2 is a low surrogate
346	if (JSON_HEDLEY_LIKELY(`0xDC00` <= codepoint2 && codepoint2 <= `0xDFFF`))
347	{
348	// overwrite codepoint
349	codepoint = static_cast<int>(
350	// high surrogate occupies the most significant 22 bits
351	(static_cast<unsigned int>(codepoint1) << `10u`)
352	// low surrogate occupies the least significant 15 bits
353	+ static_cast<unsigned int>(codepoint2)
354	// there is still the 0xD800, 0xDC00 and 0x10000 noise
355	// in the result, so we have to subtract with:
356	// (0xD800 << 10) + DC00 - 0x10000 = 0x35FDC00
357	- `0x35FDC00u`);
358	}
359	else
360	{
361	error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
362	return token_type::parse_error;
363	}
364	}
365	else
366	{
367	error_message = "invalid string: surrogate U+D800..U+DBFF must be followed by U+DC00..U+DFFF";
368	return token_type::parse_error;
369	}
370	}
371	else
372	{
373	if (JSON_HEDLEY_UNLIKELY(`0xDC00` <= codepoint1 && codepoint1 <= `0xDFFF`))
374	{
375	error_message = "invalid string: surrogate U+DC00..U+DFFF must follow U+D800..U+DBFF";
376	return token_type::parse_error;
377	}
378	}
379
380	// result of the above calculation yields a proper codepoint
381	JSON_ASSERT(`0x00` <= codepoint && codepoint <= `0x10FFFF`);
382
383	// translate codepoint into bytes
384	if (codepoint < `0x80`)
385	{
386	// 1-byte characters: 0xxxxxxx (ASCII)
387	add(c: static_cast<char_int_type>(codepoint));
388	}
389	else if (codepoint <= `0x7FF`)
390	{
391	// 2-byte characters: 110xxxxx 10xxxxxx
392	add(c: static_cast<char_int_type>(`0xC0u` \| (static_cast<unsigned int>(codepoint) >> `6u`)));
393	add(c: static_cast<char_int_type>(`0x80u` \| (static_cast<unsigned int>(codepoint) & `0x3Fu`)));
394	}
395	else if (codepoint <= `0xFFFF`)
396	{
397	// 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
398	add(c: static_cast<char_int_type>(`0xE0u` \| (static_cast<unsigned int>(codepoint) >> `12u`)));
399	add(c: static_cast<char_int_type>(`0x80u` \| ((static_cast<unsigned int>(codepoint) >> `6u`) & `0x3Fu`)));
400	add(c: static_cast<char_int_type>(`0x80u` \| (static_cast<unsigned int>(codepoint) & `0x3Fu`)));
401	}
402	else
403	{
404	// 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
405	add(c: static_cast<char_int_type>(`0xF0u` \| (static_cast<unsigned int>(codepoint) >> `18u`)));
406	add(c: static_cast<char_int_type>(`0x80u` \| ((static_cast<unsigned int>(codepoint) >> `12u`) & `0x3Fu`)));
407	add(c: static_cast<char_int_type>(`0x80u` \| ((static_cast<unsigned int>(codepoint) >> `6u`) & `0x3Fu`)));
408	add(c: static_cast<char_int_type>(`0x80u` \| (static_cast<unsigned int>(codepoint) & `0x3Fu`)));
409	}
410
411	break;
412	}
413
414	// other characters after escape
415	default:
416	error_message = "invalid string: forbidden character after backslash";
417	return token_type::parse_error;
418	}
419
420	break;
421	}
422
423	// invalid control characters
424	case `0x00`:
425	{
426	error_message = "invalid string: control character U+0000 (NUL) must be escaped to \\u0000";
427	return token_type::parse_error;
428	}
429
430	case `0x01`:
431	{
432	error_message = "invalid string: control character U+0001 (SOH) must be escaped to \\u0001";
433	return token_type::parse_error;
434	}
435
436	case `0x02`:
437	{
438	error_message = "invalid string: control character U+0002 (STX) must be escaped to \\u0002";
439	return token_type::parse_error;
440	}
441
442	case `0x03`:
443	{
444	error_message = "invalid string: control character U+0003 (ETX) must be escaped to \\u0003";
445	return token_type::parse_error;
446	}
447
448	case `0x04`:
449	{
450	error_message = "invalid string: control character U+0004 (EOT) must be escaped to \\u0004";
451	return token_type::parse_error;
452	}
453
454	case `0x05`:
455	{
456	error_message = "invalid string: control character U+0005 (ENQ) must be escaped to \\u0005";
457	return token_type::parse_error;
458	}
459
460	case `0x06`:
461	{
462	error_message = "invalid string: control character U+0006 (ACK) must be escaped to \\u0006";
463	return token_type::parse_error;
464	}
465
466	case `0x07`:
467	{
468	error_message = "invalid string: control character U+0007 (BEL) must be escaped to \\u0007";
469	return token_type::parse_error;
470	}
471
472	case `0x08`:
473	{
474	error_message = "invalid string: control character U+0008 (BS) must be escaped to \\u0008 or \\b";
475	return token_type::parse_error;
476	}
477
478	case `0x09`:
479	{
480	error_message = "invalid string: control character U+0009 (HT) must be escaped to \\u0009 or \\t";
481	return token_type::parse_error;
482	}
483
484	case `0x0A`:
485	{
486	error_message = "invalid string: control character U+000A (LF) must be escaped to \\u000A or \\n";
487	return token_type::parse_error;
488	}
489
490	case `0x0B`:
491	{
492	error_message = "invalid string: control character U+000B (VT) must be escaped to \\u000B";
493	return token_type::parse_error;
494	}
495
496	case `0x0C`:
497	{
498	error_message = "invalid string: control character U+000C (FF) must be escaped to \\u000C or \\f";
499	return token_type::parse_error;
500	}
501
502	case `0x0D`:
503	{
504	error_message = "invalid string: control character U+000D (CR) must be escaped to \\u000D or \\r";
505	return token_type::parse_error;
506	}
507
508	case `0x0E`:
509	{
510	error_message = "invalid string: control character U+000E (SO) must be escaped to \\u000E";
511	return token_type::parse_error;
512	}
513
514	case `0x0F`:
515	{
516	error_message = "invalid string: control character U+000F (SI) must be escaped to \\u000F";
517	return token_type::parse_error;
518	}
519
520	case `0x10`:
521	{
522	error_message = "invalid string: control character U+0010 (DLE) must be escaped to \\u0010";
523	return token_type::parse_error;
524	}
525
526	case `0x11`:
527	{
528	error_message = "invalid string: control character U+0011 (DC1) must be escaped to \\u0011";
529	return token_type::parse_error;
530	}
531
532	case `0x12`:
533	{
534	error_message = "invalid string: control character U+0012 (DC2) must be escaped to \\u0012";
535	return token_type::parse_error;
536	}
537
538	case `0x13`:
539	{
540	error_message = "invalid string: control character U+0013 (DC3) must be escaped to \\u0013";
541	return token_type::parse_error;
542	}
543
544	case `0x14`:
545	{
546	error_message = "invalid string: control character U+0014 (DC4) must be escaped to \\u0014";
547	return token_type::parse_error;
548	}
549
550	case `0x15`:
551	{
552	error_message = "invalid string: control character U+0015 (NAK) must be escaped to \\u0015";
553	return token_type::parse_error;
554	}
555
556	case `0x16`:
557	{
558	error_message = "invalid string: control character U+0016 (SYN) must be escaped to \\u0016";
559	return token_type::parse_error;
560	}
561
562	case `0x17`:
563	{
564	error_message = "invalid string: control character U+0017 (ETB) must be escaped to \\u0017";
565	return token_type::parse_error;
566	}
567
568	case `0x18`:
569	{
570	error_message = "invalid string: control character U+0018 (CAN) must be escaped to \\u0018";
571	return token_type::parse_error;
572	}
573
574	case `0x19`:
575	{
576	error_message = "invalid string: control character U+0019 (EM) must be escaped to \\u0019";
577	return token_type::parse_error;
578	}
579
580	case `0x1A`:
581	{
582	error_message = "invalid string: control character U+001A (SUB) must be escaped to \\u001A";
583	return token_type::parse_error;
584	}
585
586	case `0x1B`:
587	{
588	error_message = "invalid string: control character U+001B (ESC) must be escaped to \\u001B";
589	return token_type::parse_error;
590	}
591
592	case `0x1C`:
593	{
594	error_message = "invalid string: control character U+001C (FS) must be escaped to \\u001C";
595	return token_type::parse_error;
596	}
597
598	case `0x1D`:
599	{
600	error_message = "invalid string: control character U+001D (GS) must be escaped to \\u001D";
601	return token_type::parse_error;
602	}
603
604	case `0x1E`:
605	{
606	error_message = "invalid string: control character U+001E (RS) must be escaped to \\u001E";
607	return token_type::parse_error;
608	}
609
610	case `0x1F`:
611	{
612	error_message = "invalid string: control character U+001F (US) must be escaped to \\u001F";
613	return token_type::parse_error;
614	}
615
616	// U+0020..U+007F (except U+0022 (quote) and U+005C (backspace))
617	case `0x20`:
618	case `0x21`:
619	case `0x23`:
620	case `0x24`:
621	case `0x25`:
622	case `0x26`:
623	case `0x27`:
624	case `0x28`:
625	case `0x29`:
626	case `0x2A`:
627	case `0x2B`:
628	case `0x2C`:
629	case `0x2D`:
630	case `0x2E`:
631	case `0x2F`:
632	case `0x30`:
633	case `0x31`:
634	case `0x32`:
635	case `0x33`:
636	case `0x34`:
637	case `0x35`:
638	case `0x36`:
639	case `0x37`:
640	case `0x38`:
641	case `0x39`:
642	case `0x3A`:
643	case `0x3B`:
644	case `0x3C`:
645	case `0x3D`:
646	case `0x3E`:
647	case `0x3F`:
648	case `0x40`:
649	case `0x41`:
650	case `0x42`:
651	case `0x43`:
652	case `0x44`:
653	case `0x45`:
654	case `0x46`:
655	case `0x47`:
656	case `0x48`:
657	case `0x49`:
658	case `0x4A`:
659	case `0x4B`:
660	case `0x4C`:
661	case `0x4D`:
662	case `0x4E`:
663	case `0x4F`:
664	case `0x50`:
665	case `0x51`:
666	case `0x52`:
667	case `0x53`:
668	case `0x54`:
669	case `0x55`:
670	case `0x56`:
671	case `0x57`:
672	case `0x58`:
673	case `0x59`:
674	case `0x5A`:
675	case `0x5B`:
676	case `0x5D`:
677	case `0x5E`:
678	case `0x5F`:
679	case `0x60`:
680	case `0x61`:
681	case `0x62`:
682	case `0x63`:
683	case `0x64`:
684	case `0x65`:
685	case `0x66`:
686	case `0x67`:
687	case `0x68`:
688	case `0x69`:
689	case `0x6A`:
690	case `0x6B`:
691	case `0x6C`:
692	case `0x6D`:
693	case `0x6E`:
694	case `0x6F`:
695	case `0x70`:
696	case `0x71`:
697	case `0x72`:
698	case `0x73`:
699	case `0x74`:
700	case `0x75`:
701	case `0x76`:
702	case `0x77`:
703	case `0x78`:
704	case `0x79`:
705	case `0x7A`:
706	case `0x7B`:
707	case `0x7C`:
708	case `0x7D`:
709	case `0x7E`:
710	case `0x7F`:
711	{
712	add(c: current);
713	break;
714	}
715
716	// U+0080..U+07FF: bytes C2..DF 80..BF
717	case `0xC2`:
718	case `0xC3`:
719	case `0xC4`:
720	case `0xC5`:
721	case `0xC6`:
722	case `0xC7`:
723	case `0xC8`:
724	case `0xC9`:
725	case `0xCA`:
726	case `0xCB`:
727	case `0xCC`:
728	case `0xCD`:
729	case `0xCE`:
730	case `0xCF`:
731	case `0xD0`:
732	case `0xD1`:
733	case `0xD2`:
734	case `0xD3`:
735	case `0xD4`:
736	case `0xD5`:
737	case `0xD6`:
738	case `0xD7`:
739	case `0xD8`:
740	case `0xD9`:
741	case `0xDA`:
742	case `0xDB`:
743	case `0xDC`:
744	case `0xDD`:
745	case `0xDE`:
746	case `0xDF`:
747	{
748	if (JSON_HEDLEY_UNLIKELY(!next_byte_in_range({`0x80`, `0xBF`})))
749	{
750	return token_type::parse_error;
751	}
752	break;
753	}
754
755	// U+0800..U+0FFF: bytes E0 A0..BF 80..BF
756	case `0xE0`:
757	{
758	if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({`0xA0`, `0xBF`, `0x80`, `0xBF`}))))
759	{
760	return token_type::parse_error;
761	}
762	break;
763	}
764
765	// U+1000..U+CFFF: bytes E1..EC 80..BF 80..BF
766	// U+E000..U+FFFF: bytes EE..EF 80..BF 80..BF
767	case `0xE1`:
768	case `0xE2`:
769	case `0xE3`:
770	case `0xE4`:
771	case `0xE5`:
772	case `0xE6`:
773	case `0xE7`:
774	case `0xE8`:
775	case `0xE9`:
776	case `0xEA`:
777	case `0xEB`:
778	case `0xEC`:
779	case `0xEE`:
780	case `0xEF`:
781	{
782	if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({`0x80`, `0xBF`, `0x80`, `0xBF`}))))
783	{
784	return token_type::parse_error;
785	}
786	break;
787	}
788
789	// U+D000..U+D7FF: bytes ED 80..9F 80..BF
790	case `0xED`:
791	{
792	if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({`0x80`, `0x9F`, `0x80`, `0xBF`}))))
793	{
794	return token_type::parse_error;
795	}
796	break;
797	}
798
799	// U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
800	case `0xF0`:
801	{
802	if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({`0x90`, `0xBF`, `0x80`, `0xBF`, `0x80`, `0xBF`}))))
803	{
804	return token_type::parse_error;
805	}
806	break;
807	}
808
809	// U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
810	case `0xF1`:
811	case `0xF2`:
812	case `0xF3`:
813	{
814	if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({`0x80`, `0xBF`, `0x80`, `0xBF`, `0x80`, `0xBF`}))))
815	{
816	return token_type::parse_error;
817	}
818	break;
819	}
820
821	// U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
822	case `0xF4`:
823	{
824	if (JSON_HEDLEY_UNLIKELY(!(next_byte_in_range({`0x80`, `0x8F`, `0x80`, `0xBF`, `0x80`, `0xBF`}))))
825	{
826	return token_type::parse_error;
827	}
828	break;
829	}
830
831	// remaining bytes (80..C1 and F5..FF) are ill-formed
832	default:
833	{
834	error_message = "invalid string: ill-formed UTF-8 byte";
835	return token_type::parse_error;
836	}
837	}
838	}
839	}
840
841	/!*
842	* @brief scan a comment
843	* @return whether comment could be scanned successfully
844	*/
845	bool scan_comment()
846	{
847	switch (get())
848	{
849	// single-line comments skip input until a newline or EOF is read
850	case `'/'`:
851	{
852	while (true)
853	{
854	switch (get())
855	{
856	case `'\n'`:
857	case `'\r'`:
858	case char_traits<char_type>::eof():
859	case `'\0'`:
860	return true;
861
862	default:
863	break;
864	}
865	}
866	}
867
868	// multi-line comments skip input until / is read*
869	case `'*'`:
870	{
871	while (true)
872	{
873	switch (get())
874	{
875	case char_traits<char_type>::eof():
876	case `'\0'`:
877	{
878	error_message = "invalid comment; missing closing '*/'";
879	return false;
880	}
881
882	case `'*'`:
883	{
884	switch (get())
885	{
886	case `'/'`:
887	return true;
888
889	default:
890	{
891	unget();
892	continue;
893	}
894	}
895	}
896
897	default:
898	continue;
899	}
900	}
901	}
902
903	// unexpected character after reading '/'
904	default:
905	{
906	error_message = "invalid comment; expecting '/' or '*' after '/'";
907	return false;
908	}
909	}
910	}
911
912	JSON_HEDLEY_NON_NULL(`2`)
913	static void strtof(float& f, const char* str, char endptr) noexcept**
914	{
915	f = std::strtof(nptr: str, endptr: endptr);
916	}
917
918	JSON_HEDLEY_NON_NULL(`2`)
919	static void strtof(double& f, const char* str, char endptr) noexcept**
920	{
921	f = std::strtod(nptr: str, endptr: endptr);
922	}
923
924	JSON_HEDLEY_NON_NULL(`2`)
925	static void strtof(long double& f, const char* str, char endptr) noexcept**
926	{
927	f = std::strtold(nptr: str, endptr: endptr);
928	}
929
930	/!*
931	@brief scan a number literal
932
933	This function scans a string according to Sect. 6 of RFC 8259.
934
935	The function is realized with a deterministic finite state machine derived
936	from the grammar described in RFC 8259. Starting in state "init", the
937	input is read and used to determined the next state. Only state "done"
938	accepts the number. State "error" is a trap state to model errors. In the
939	table below, "anything" means any character but the ones listed before.
940
941	state \| 0 \| 1-9 \| e E \| + \| - \| . \| anything
942	---------\|----------\|----------\|----------\|---------\|---------\|----------\|-----------
943	init \| zero \| any1 \| [error] \| [error] \| minus \| [error] \| [error]
944	minus \| zero \| any1 \| [error] \| [error] \| [error] \| [error] \| [error]
945	zero \| done \| done \| exponent \| done \| done \| decimal1 \| done
946	any1 \| any1 \| any1 \| exponent \| done \| done \| decimal1 \| done
947	decimal1 \| decimal2 \| decimal2 \| [error] \| [error] \| [error] \| [error] \| [error]
948	decimal2 \| decimal2 \| decimal2 \| exponent \| done \| done \| done \| done
949	exponent \| any2 \| any2 \| [error] \| sign \| sign \| [error] \| [error]
950	sign \| any2 \| any2 \| [error] \| [error] \| [error] \| [error] \| [error]
951	any2 \| any2 \| any2 \| done \| done \| done \| done \| done
952
953	The state machine is realized with one label per state (prefixed with
954	"scan_number_") and `goto` statements between them. The state machine
955	contains cycles, but any cycle can be left when EOF is read. Therefore,
956	the function is guaranteed to terminate.
957
958	During scanning, the read bytes are stored in token_buffer. This string is
959	then converted to a signed integer, an unsigned integer, or a
960	floating-point number.
961
962	@return token_type::value_unsigned, token_type::value_integer, or
963	token_type::value_float if number could be successfully scanned,
964	token_type::parse_error otherwise
965
966	@note The scanner is independent of the current locale. Internally, the
967	locale's decimal point is used instead of `.` to work with the
968	locale-dependent converters.
969	*/
970	token_type scan_number() // lgtm [cpp/use-of-goto]
971	{
972	// reset token_buffer to store the number's bytes
973	reset();
974
975	// the type of the parsed number; initially set to unsigned; will be
976	// changed if minus sign, decimal point or exponent is read
977	token_type number_type = token_type::value_unsigned;
978
979	// state (init): we just found out we need to scan a number
980	switch (current)
981	{
982	case `'-'`:
983	{
984	add(c: current);
985	goto scan_number_minus;
986	}
987
988	case `'0'`:
989	{
990	add(c: current);
991	goto scan_number_zero;
992	}
993
994	case `'1'`:
995	case `'2'`:
996	case `'3'`:
997	case `'4'`:
998	case `'5'`:
999	case `'6'`:
1000	case `'7'`:
1001	case `'8'`:
1002	case `'9'`:
1003	{
1004	add(c: current);
1005	goto scan_number_any1;
1006	}
1007
1008	// all other characters are rejected outside scan_number()
1009	default: // LCOV_EXCL_LINE
1010	JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
1011	}
1012
1013	scan_number_minus:
1014	// state: we just parsed a leading minus sign
1015	number_type = token_type::value_integer;
1016	switch (get())
1017	{
1018	case `'0'`:
1019	{
1020	add(c: current);
1021	goto scan_number_zero;
1022	}
1023
1024	case `'1'`:
1025	case `'2'`:
1026	case `'3'`:
1027	case `'4'`:
1028	case `'5'`:
1029	case `'6'`:
1030	case `'7'`:
1031	case `'8'`:
1032	case `'9'`:
1033	{
1034	add(c: current);
1035	goto scan_number_any1;
1036	}
1037
1038	default:
1039	{
1040	error_message = "invalid number; expected digit after '-'";
1041	return token_type::parse_error;
1042	}
1043	}
1044
1045	scan_number_zero:
1046	// state: we just parse a zero (maybe with a leading minus sign)
1047	switch (get())
1048	{
1049	case `'.'`:
1050	{
1051	add(c: decimal_point_char);
1052	goto scan_number_decimal1;
1053	}
1054
1055	case `'e'`:
1056	case `'E'`:
1057	{
1058	add(c: current);
1059	goto scan_number_exponent;
1060	}
1061
1062	default:
1063	goto scan_number_done;
1064	}
1065
1066	scan_number_any1:
1067	// state: we just parsed a number 0-9 (maybe with a leading minus sign)
1068	switch (get())
1069	{
1070	case `'0'`:
1071	case `'1'`:
1072	case `'2'`:
1073	case `'3'`:
1074	case `'4'`:
1075	case `'5'`:
1076	case `'6'`:
1077	case `'7'`:
1078	case `'8'`:
1079	case `'9'`:
1080	{
1081	add(c: current);
1082	goto scan_number_any1;
1083	}
1084
1085	case `'.'`:
1086	{
1087	add(c: decimal_point_char);
1088	goto scan_number_decimal1;
1089	}
1090
1091	case `'e'`:
1092	case `'E'`:
1093	{
1094	add(c: current);
1095	goto scan_number_exponent;
1096	}
1097
1098	default:
1099	goto scan_number_done;
1100	}
1101
1102	scan_number_decimal1:
1103	// state: we just parsed a decimal point
1104	number_type = token_type::value_float;
1105	switch (get())
1106	{
1107	case `'0'`:
1108	case `'1'`:
1109	case `'2'`:
1110	case `'3'`:
1111	case `'4'`:
1112	case `'5'`:
1113	case `'6'`:
1114	case `'7'`:
1115	case `'8'`:
1116	case `'9'`:
1117	{
1118	add(c: current);
1119	goto scan_number_decimal2;
1120	}
1121
1122	default:
1123	{
1124	error_message = "invalid number; expected digit after '.'";
1125	return token_type::parse_error;
1126	}
1127	}
1128
1129	scan_number_decimal2:
1130	// we just parsed at least one number after a decimal point
1131	switch (get())
1132	{
1133	case `'0'`:
1134	case `'1'`:
1135	case `'2'`:
1136	case `'3'`:
1137	case `'4'`:
1138	case `'5'`:
1139	case `'6'`:
1140	case `'7'`:
1141	case `'8'`:
1142	case `'9'`:
1143	{
1144	add(c: current);
1145	goto scan_number_decimal2;
1146	}
1147
1148	case `'e'`:
1149	case `'E'`:
1150	{
1151	add(c: current);
1152	goto scan_number_exponent;
1153	}
1154
1155	default:
1156	goto scan_number_done;
1157	}
1158
1159	scan_number_exponent:
1160	// we just parsed an exponent
1161	number_type = token_type::value_float;
1162	switch (get())
1163	{
1164	case `'+'`:
1165	case `'-'`:
1166	{
1167	add(c: current);
1168	goto scan_number_sign;
1169	}
1170
1171	case `'0'`:
1172	case `'1'`:
1173	case `'2'`:
1174	case `'3'`:
1175	case `'4'`:
1176	case `'5'`:
1177	case `'6'`:
1178	case `'7'`:
1179	case `'8'`:
1180	case `'9'`:
1181	{
1182	add(c: current);
1183	goto scan_number_any2;
1184	}
1185
1186	default:
1187	{
1188	error_message =
1189	"invalid number; expected '+', '-', or digit after exponent";
1190	return token_type::parse_error;
1191	}
1192	}
1193
1194	scan_number_sign:
1195	// we just parsed an exponent sign
1196	switch (get())
1197	{
1198	case `'0'`:
1199	case `'1'`:
1200	case `'2'`:
1201	case `'3'`:
1202	case `'4'`:
1203	case `'5'`:
1204	case `'6'`:
1205	case `'7'`:
1206	case `'8'`:
1207	case `'9'`:
1208	{
1209	add(c: current);
1210	goto scan_number_any2;
1211	}
1212
1213	default:
1214	{
1215	error_message = "invalid number; expected digit after exponent sign";
1216	return token_type::parse_error;
1217	}
1218	}
1219
1220	scan_number_any2:
1221	// we just parsed a number after the exponent or exponent sign
1222	switch (get())
1223	{
1224	case `'0'`:
1225	case `'1'`:
1226	case `'2'`:
1227	case `'3'`:
1228	case `'4'`:
1229	case `'5'`:
1230	case `'6'`:
1231	case `'7'`:
1232	case `'8'`:
1233	case `'9'`:
1234	{
1235	add(c: current);
1236	goto scan_number_any2;
1237	}
1238
1239	default:
1240	goto scan_number_done;
1241	}
1242
1243	scan_number_done:
1244	// unget the character after the number (we only read it to know that
1245	// we are done scanning a number)
1246	unget();
1247
1248	char* endptr = nullptr; // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
1249	errno = `0`;
1250
1251	// try to parse integers first and fall back to floats
1252	if (number_type == token_type::value_unsigned)
1253	{
1254	const auto x = std::strtoull(nptr: token_buffer.data(), endptr: &endptr, base: `10`);
1255
1256	// we checked the number format before
1257	JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1258
1259	if (errno == `0`)
1260	{
1261	value_unsigned = static_cast<number_unsigned_t>(x);
1262	if (value_unsigned == x)
1263	{
1264	return token_type::value_unsigned;
1265	}
1266	}
1267	}
1268	else if (number_type == token_type::value_integer)
1269	{
1270	const auto x = std::strtoll(nptr: token_buffer.data(), endptr: &endptr, base: `10`);
1271
1272	// we checked the number format before
1273	JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1274
1275	if (errno == `0`)
1276	{
1277	value_integer = static_cast<number_integer_t>(x);
1278	if (value_integer == x)
1279	{
1280	return token_type::value_integer;
1281	}
1282	}
1283	}
1284
1285	// this code is reached if we parse a floating-point number or if an
1286	// integer conversion above failed
1287	strtof(value_float, token_buffer.data(), &endptr);
1288
1289	// we checked the number format before
1290	JSON_ASSERT(endptr == token_buffer.data() + token_buffer.size());
1291
1292	return token_type::value_float;
1293	}
1294
1295	/!*
1296	@param[in] literal_text the literal text to expect
1297	@param[in] length the length of the passed literal text
1298	@param[in] return_type the token type to return on success
1299	*/
1300	JSON_HEDLEY_NON_NULL(`2`)
1301	token_type scan_literal(const char_type* literal_text, const std::size_t length,
1302	token_type return_type)
1303	{
1304	JSON_ASSERT(char_traits<char_type>::to_char_type(current) == literal_text[`0`]);
1305	for (std::size_t i = `1`; i < length; ++i)
1306	{
1307	if (JSON_HEDLEY_UNLIKELY(char_traits<char_type>::to_char_type(get()) != literal_text[i]))
1308	{
1309	error_message = "invalid literal";
1310	return token_type::parse_error;
1311	}
1312	}
1313	return return_type;
1314	}
1315
1316	/////////////////////
1317	// input management
1318	/////////////////////
1319
1320	/// reset token_buffer; current character is beginning of token
1321	void reset() noexcept
1322	{
1323	token_buffer.clear();
1324	token_string.clear();
1325	token_string.push_back(char_traits<char_type>::to_char_type(current));
1326	}
1327
1328	/*
1329	@brief get next character from the input
1330
1331	This function provides the interface to the used input adapter. It does
1332	not throw in case the input reached EOF, but returns a
1333	`char_traits<char>::eof()` in that case. Stores the scanned characters
1334	for use in error messages.
1335
1336	@return character read from the input
1337	*/
1338	char_int_type get()
1339	{
1340	++position.chars_read_total;
1341	++position.chars_read_current_line;
1342
1343	if (next_unget)
1344	{
1345	// just reset the next_unget variable and work with current
1346	next_unget = false;
1347	}
1348	else
1349	{
1350	current = ia.get_character();
1351	}
1352
1353	if (JSON_HEDLEY_LIKELY(current != char_traits<char_type>::eof()))
1354	{
1355	token_string.push_back(char_traits<char_type>::to_char_type(current));
1356	}
1357
1358	if (current == `'\n'`)
1359	{
1360	++position.lines_read;
1361	position.chars_read_current_line = `0`;
1362	}
1363
1364	return current;
1365	}
1366
1367	/!*
1368	@brief unget current character (read it again on next get)
1369
1370	We implement unget by setting variable next_unget to true. The input is not
1371	changed - we just simulate ungetting by modifying chars_read_total,
1372	chars_read_current_line, and token_string. The next call to get() will
1373	behave as if the unget character is read again.
1374	*/
1375	void unget()
1376	{
1377	next_unget = true;
1378
1379	--position.chars_read_total;
1380
1381	// in case we "unget" a newline, we have to also decrement the lines_read
1382	if (position.chars_read_current_line == `0`)
1383	{
1384	if (position.lines_read > `0`)
1385	{
1386	--position.lines_read;
1387	}
1388	}
1389	else
1390	{
1391	--position.chars_read_current_line;
1392	}
1393
1394	if (JSON_HEDLEY_LIKELY(current != char_traits<char_type>::eof()))
1395	{
1396	JSON_ASSERT(!token_string.empty());
1397	token_string.pop_back();
1398	}
1399	}
1400
1401	/// add a character to token_buffer
1402	void add(char_int_type c)
1403	{
1404	token_buffer.push_back(static_cast<typename string_t::value_type>(c));
1405	}
1406
1407	public:
1408	/////////////////////
1409	// value getters
1410	/////////////////////
1411
1412	/// return integer value
1413	constexpr number_integer_t get_number_integer() const noexcept
1414	{
1415	return value_integer;
1416	}
1417
1418	/// return unsigned integer value
1419	constexpr number_unsigned_t get_number_unsigned() const noexcept
1420	{
1421	return value_unsigned;
1422	}
1423
1424	/// return floating-point value
1425	constexpr number_float_t get_number_float() const noexcept
1426	{
1427	return value_float;
1428	}
1429
1430	/// return current string value (implicitly resets the token; useful only once)
1431	string_t& get_string()
1432	{
1433	return token_buffer;
1434	}
1435
1436	/////////////////////
1437	// diagnostics
1438	/////////////////////
1439
1440	/// return position of last read token
1441	constexpr position_t get_position() const noexcept
1442	{
1443	return position;
1444	}
1445
1446	/// return the last read token (for errors only). Will never contain EOF
1447	/// (an arbitrary value that is not a valid char value, often -1), because
1448	/// 255 may legitimately occur. May contain NUL, which should be escaped.
1449	std::string get_token_string() const
1450	{
1451	// escape control characters
1452	std::string result;
1453	for (const auto c : token_string)
1454	{
1455	if (static_cast<unsigned char>(c) <= `'\x1F'`)
1456	{
1457	// escape control characters
1458	std::array<char, `9`> cs{._M_elems: {}};
1459	static_cast<void>((std::snprintf)(s: cs.data(), maxlen: cs.size(), format: "<U+%.4X>", static_cast<unsigned char>(c))); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg)
1460	result += cs.data();
1461	}
1462	else
1463	{
1464	// add character as is
1465	result.push_back(c: static_cast<std::string::value_type>(c));
1466	}
1467	}
1468
1469	return result;
1470	}
1471
1472	/// return syntax error message
1473	JSON_HEDLEY_RETURNS_NON_NULL
1474	constexpr const char* get_error_message() const noexcept
1475	{
1476	return error_message;
1477	}
1478
1479	/////////////////////
1480	// actual scanner
1481	/////////////////////
1482
1483	/!*
1484	@brief skip the UTF-8 byte order mark
1485	@return true iff there is no BOM or the correct BOM has been skipped
1486	*/
1487	bool skip_bom()
1488	{
1489	if (get() == `0xEF`)
1490	{
1491	// check if we completely parse the BOM
1492	return get() == `0xBB` && get() == `0xBF`;
1493	}
1494
1495	// the first character is not the beginning of the BOM; unget it to
1496	// process is later
1497	unget();
1498	return true;
1499	}
1500
1501	void skip_whitespace()
1502	{
1503	do
1504	{
1505	get();
1506	}
1507	while (current == `' '` \|\| current == `'\t'` \|\| current == `'\n'` \|\| current == `'\r'`);
1508	}
1509
1510	token_type scan()
1511	{
1512	// initially, skip the BOM
1513	if (position.chars_read_total == `0` && !skip_bom())
1514	{
1515	error_message = "invalid BOM; must be 0xEF 0xBB 0xBF if given";
1516	return token_type::parse_error;
1517	}
1518
1519	// read next character and ignore whitespace
1520	skip_whitespace();
1521
1522	// ignore comments
1523	while (ignore_comments && current == `'/'`)
1524	{
1525	if (!scan_comment())
1526	{
1527	return token_type::parse_error;
1528	}
1529
1530	// skip following whitespace
1531	skip_whitespace();
1532	}
1533
1534	switch (current)
1535	{
1536	// structural characters
1537	case `'['`:
1538	return token_type::begin_array;
1539	case `']'`:
1540	return token_type::end_array;
1541	case `'{'`:
1542	return token_type::begin_object;
1543	case `'}'`:
1544	return token_type::end_object;
1545	case `':'`:
1546	return token_type::name_separator;
1547	case `','`:
1548	return token_type::value_separator;
1549
1550	// literals
1551	case `'t'`:
1552	{
1553	std::array<char_type, `4`> true_literal = {{static_cast<char_type>(`'t'`), static_cast<char_type>(`'r'`), static_cast<char_type>(`'u'`), static_cast<char_type>(`'e'`)}};
1554	return scan_literal(literal_text: true_literal.data(), length: true_literal.size(), return_type: token_type::literal_true);
1555	}
1556	case `'f'`:
1557	{
1558	std::array<char_type, `5`> false_literal = {{static_cast<char_type>(`'f'`), static_cast<char_type>(`'a'`), static_cast<char_type>(`'l'`), static_cast<char_type>(`'s'`), static_cast<char_type>(`'e'`)}};
1559	return scan_literal(literal_text: false_literal.data(), length: false_literal.size(), return_type: token_type::literal_false);
1560	}
1561	case `'n'`:
1562	{
1563	std::array<char_type, `4`> null_literal = {{static_cast<char_type>(`'n'`), static_cast<char_type>(`'u'`), static_cast<char_type>(`'l'`), static_cast<char_type>(`'l'`)}};
1564	return scan_literal(literal_text: null_literal.data(), length: null_literal.size(), return_type: token_type::literal_null);
1565	}
1566
1567	// string
1568	case `'\"'`:
1569	return scan_string();
1570
1571	// number
1572	case `'-'`:
1573	case `'0'`:
1574	case `'1'`:
1575	case `'2'`:
1576	case `'3'`:
1577	case `'4'`:
1578	case `'5'`:
1579	case `'6'`:
1580	case `'7'`:
1581	case `'8'`:
1582	case `'9'`:
1583	return scan_number();
1584
1585	// end of input (the null byte is needed when parsing from
1586	// string literals)
1587	case `'\0'`:
1588	case char_traits<char_type>::eof():
1589	return token_type::end_of_input;
1590
1591	// error
1592	default:
1593	error_message = "invalid literal";
1594	return token_type::parse_error;
1595	}
1596	}
1597
1598	private:
1599	/// input adapter
1600	InputAdapterType ia;
1601
1602	/// whether comments should be ignored (true) or signaled as errors (false)
1603	const bool ignore_comments = false;
1604
1605	/// the current character
1606	char_int_type current = char_traits<char_type>::eof();
1607
1608	/// whether the next get() call should just return current
1609	bool next_unget = false;
1610
1611	/// the start position of the current token
1612	position_t position {};
1613
1614	/// raw input token string (for error messages)
1615	std::vector<char_type> token_string {};
1616
1617	/// buffer for variable-length tokens (numbers, strings)
1618	string_t token_buffer {};
1619
1620	/// a description of occurred lexer errors
1621	const char* error_message = "";
1622
1623	// number values
1624	number_integer_t value_integer = `0`;
1625	number_unsigned_t value_unsigned = `0`;
1626	number_float_t value_float = `0`;
1627
1628	/// the decimal point
1629	const char_int_type decimal_point_char = `'.'`;
1630	};
1631
1632	} // namespace detail
1633	NLOHMANN_JSON_NAMESPACE_END
1634

Browse the source code of include/nlohmann/detail/input/lexer.hpp