// Source: henge/apc.git / stb / stb_c_lexer.h (comments updated)
//
// stb_c_lexer.h - v0.08 - public domain Sean Barrett 2013
// lexer for making little C-like languages with recursive-descent parsers
//
// This file provides both the interface and the implementation.
// To instantiate the implementation,
//      #define STB_C_LEXER_IMPLEMENTATION
// in *ONE* source file, before #including this file.
//
// The default configuration is fairly close to a C lexer, although
// suffixes on integer constants are not handled (you can override this).
//
// History:
//     0.08 fix bad pointer comparison
//     0.07 fix mishandling of hexadecimal constants parsed by strtol
//     0.06 fix missing next character after ending quote mark (Andreas Fredriksson)
//     0.05 refixed get_location because github version had lost the fix
//     0.04 fix octal parsing bug
//     0.03 added STB_C_LEX_DISCARD_PREPROCESSOR option
//          refactor API to simplify (only one struct instead of two)
//          change literal enum names to have 'lit' at the end
//     0.02 first public release
//
// Status:
//     - haven't tested compiling as C++
//     - haven't tested the float parsing path
//     - haven't tested the non-default-config paths (e.g. non-stdlib)
//     - only tested default-config paths by eyeballing output of self-parse
//
//     - haven't implemented multiline strings
//     - haven't implemented octal/hex character constants
//     - haven't implemented support for unicode CLEX_char
//     - need to expand error reporting so you don't just get "CLEX_parse_error"
//
// Contributors:
//   Arpad Goretity (bugfix)
//
// LICENSE
//
//   This software is dual-licensed to the public domain and under the following
//   license: you are granted a perpetual, irrevocable license to copy, modify,
//   publish, and distribute this file as you see fit.

#ifndef STB_C_LEXER_DEFINITIONS
// to change the default parsing rules, copy the following lines
// into your C/C++ file *before* including this, and then replace
// the Y's with N's for the ones you don't want.
// --BEGIN--

#define STB_C_LEX_C_DECIMAL_INTS    Y   //  "0|[1-9][0-9]*"                        CLEX_intlit
#define STB_C_LEX_C_HEX_INTS        Y   //  "0x[0-9a-fA-F]+"                       CLEX_intlit
#define STB_C_LEX_C_OCTAL_INTS      Y   //  "[0-7]+"                               CLEX_intlit
#define STB_C_LEX_C_DECIMAL_FLOATS  Y   //  "[0-9]*(.[0-9]*([eE]-?[0-9]+)?)"       CLEX_floatlit
#define STB_C_LEX_C_IDENTIFIERS     Y   //  "[_a-zA-Z][_a-zA-Z0-9]*"               CLEX_id
#define STB_C_LEX_C_DQ_STRINGS      Y   //  double-quote-delimited strings with escapes  CLEX_dqstring
#define STB_C_LEX_C_SQ_STRINGS      N   //  single-quote-delimited strings with escapes  CLEX_sqstring
#define STB_C_LEX_C_CHARS           Y   //  single-quote-delimited character with escape CLEX_charlit
#define STB_C_LEX_C_COMMENTS        Y   //  "/* comment */"
#define STB_C_LEX_CPP_COMMENTS      Y   //  "// comment to end of line\n"
#define STB_C_LEX_C_COMPARISONS     Y   //  "==" CLEX_eq  "!=" CLEX_noteq   "<=" CLEX_lesseq  ">=" CLEX_greatereq
#define STB_C_LEX_C_LOGICAL         Y   //  "&&"  CLEX_andand   "||"  CLEX_oror
#define STB_C_LEX_C_SHIFTS          Y   //  "<<"  CLEX_shl      ">>"  CLEX_shr
#define STB_C_LEX_C_INCREMENTS      Y   //  "++"  CLEX_plusplus "--"  CLEX_minusminus
#define STB_C_LEX_C_ARROW           Y   //  "->"  CLEX_arrow
#define STB_C_LEX_EQUAL_ARROW       N   //  "=>"  CLEX_eqarrow
#define STB_C_LEX_C_BITWISEEQ       Y   //  "&="  CLEX_andeq    "|="  CLEX_oreq     "^="  CLEX_xoreq
#define STB_C_LEX_C_ARITHEQ         Y   //  "+="  CLEX_pluseq   "-="  CLEX_minuseq
                                        //  "*="  CLEX_muleq    "/="  CLEX_diveq    "%="  CLEX_modeq
                                        //  if both STB_C_LEX_C_SHIFTS & STB_C_LEX_C_ARITHEQ:
                                        //                      "<<=" CLEX_shleq    ">>=" CLEX_shreq

#define STB_C_LEX_PARSE_SUFFIXES    N   // letters after numbers are parsed as part of those numbers, and must be in suffix list below
#define STB_C_LEX_DECIMAL_SUFFIXES  ""  // decimal integer suffixes e.g. "uUlL" -- these are returned as-is in string storage
#define STB_C_LEX_HEX_SUFFIXES      ""  // e.g. "uUlL"
#define STB_C_LEX_OCTAL_SUFFIXES    ""  // e.g. "uUlL"
#define STB_C_LEX_FLOAT_SUFFIXES    ""  //

#define STB_C_LEX_0_IS_EOF             N  // if Y, ends parsing at '\0'; if N, returns '\0' as token
#define STB_C_LEX_INTEGERS_AS_DOUBLES  N  // parses integers as doubles so they can be larger than 'int', but only if STB_C_LEX_USE_STDLIB==N
#define STB_C_LEX_MULTILINE_DSTRINGS   N  // allow newlines in double-quoted strings
#define STB_C_LEX_MULTILINE_SSTRINGS   N  // allow newlines in single-quoted strings
#define STB_C_LEX_USE_STDLIB           Y  // use strtod,strtol for parsing #s; otherwise inaccurate hack
#define STB_C_LEX_DOLLAR_IDENTIFIER    Y  // allow $ as an identifier character
#define STB_C_LEX_FLOAT_NO_DECIMAL     Y  // allow floats that have no decimal point if they have an exponent

#define STB_C_LEX_DEFINE_ALL_TOKEN_NAMES  N   // if Y, all CLEX_ token names are defined, even if never returned
                                              // leaving it as N should help you catch config bugs

#define STB_C_LEX_DISCARD_PREPROCESSOR    Y   // discard C-preprocessor directives (e.g. after preprocessing
                                              // you may still have #line, #pragma, etc)

//#define STB_C_LEX_ISWHITE(str)    ... // return length in bytes of whitespace characters if first char is whitespace

#define STB_C_LEXER_DEFINITIONS         // This line prevents the header file from replacing your definitions
// --END--

#endif

#ifndef INCLUDE_STB_C_LEXER_H
#define INCLUDE_STB_C_LEXER_H

// Complete lexer state: the input being scanned plus the most recently
// returned token. Set up with stb_c_lexer_init(), advanced with
// stb_c_lexer_get_token().
typedef struct
{
   // lexer variables
   char *input_stream;        // start of the text being lexed
   char *eof;                 // one past the last input byte, or NULL if you use 0-for-EOF
   char *parse_point;         // where the next token scan begins
   char *string_storage;      // caller-provided buffer for identifier/string text
   int   string_storage_len;  // size of string_storage in bytes

   // lexer parse location for error messages
   char *where_firstchar;     // first character of the most recent token
   char *where_lastchar;      // last character of the most recent token

   // lexer token variables
   long token;                // token ID (see stb_c_lexer_get_token)
   double real_number;
   long   int_number;
   char *string;
   int string_len;
} stb_lexer;

// Line/column position as computed by stb_c_lexer_get_location().
typedef struct
{
   int line_number;
   int line_offset;
} stb_lex_location;

#ifdef __cplusplus
extern "C" {
#endif

extern void stb_c_lexer_init(stb_lexer *lexer, const char *input_stream, const char *input_stream_end, char *string_store, int store_length);
// this function initializes the 'lexer' structure
//   Input:
//   - input_stream points to the file to parse, loaded into memory
//   - input_stream_end points to the end of the file, or NULL if you use 0-for-EOF
//   - string_store is storage the lexer can use for storing parsed strings and identifiers
//   - store_length is the length of that storage

extern int stb_c_lexer_get_token(stb_lexer *lexer);
// this function returns non-zero if a token is parsed (including a
// CLEX_parse_error token), or 0 if at EOF
//   Output:
//   - lexer->token is the token ID: the character value (0..255) for a single-char token,
//                  or a CLEX_ value (>= 256) for a multichar token, eof, or error
//   - lexer->real_number is a double constant value for CLEX_floatlit, or CLEX_intlit if STB_C_LEX_INTEGERS_AS_DOUBLES
//   - lexer->int_number is an integer constant for CLEX_intlit if !STB_C_LEX_INTEGERS_AS_DOUBLES, or character for CLEX_charlit
//   - lexer->string is a 0-terminated string for CLEX_dqstring or CLEX_sqstring or CLEX_id
//   - lexer->string_len is the byte length of lexer->string

extern void stb_c_lexer_get_location(const stb_lexer *lexer, const char *where, stb_lex_location *loc);
// this inefficient function returns the line number and character offset of a
// given location in the file as returned by stb_c_lexer_get_token (e.g.
// lexer->where_firstchar or lexer->where_lastchar). Because it's inefficient,
// you should only call it for errors, not for every token.
// For error messages of invalid tokens, you typically want the location of the start
// of the token (which caused the token to be invalid). For bugs involving legit
// tokens, you can report the first or the range.
//    Output:
//    - loc->line_number is the line number in the file, counting from 1, of the location
//    - loc->line_offset is the char-offset in the line, counting from 0, of the location


#ifdef __cplusplus
}
#endif

#endif // INCLUDE_STB_C_LEXER_H

167 #ifdef STB_C_LEXER_IMPLEMENTATION
168
169 #if defined(Y) || defined(N)
170 #error "Can only use stb_c_lexer in contexts where the preprocessor symbols 'Y' and 'N' are not defined"
171 #endif
172
173
174 // Hacky definitions so we can easily #if on them
175 #define Y(x) 1
176 #define N(x) 0
177
178 #if STB_C_LEX_USE_STDLIB(x)
179 #define STB__CLEX_use_stdlib
180 #include <stdlib.h>
181 #endif
182
183 #if STB_C_LEX_INTEGERS_AS_DOUBLES(x)
184 typedef double stb__clex_int;
185 #define intfield real_number
186 #define STB__clex_int_as_double
187 #else
188 typedef long stb__clex_int;
189 #define intfield int_number
190 #endif
191
192 // Convert these config options to simple conditional #defines so we can more
193 // easily test them once we've change the meaning of Y/N
194
195 #if STB_C_LEX_PARSE_SUFFIXES(x)
196 #define STB__clex_parse_suffixes
197 #endif
198
199 #if STB_C_LEX_C_DECIMAL_INTS(x) || STB_C_LEX_C_HEX_INTS(x) || STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x)
200 #define STB__clex_define_int
201 #endif
202
203 #if (STB_C_LEX_C_ARITHEQ(x) && STB_C_LEX_C_SHIFTS(x)) || STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x)
204 #define STB__clex_define_shifts
205 #endif
206
207 #if STB_C_LEX_C_HEX_INTS(x)
208 #define STB__clex_hex_ints
209 #endif
210
211 #if STB_C_LEX_C_DECIMAL_INTS(x)
212 #define STB__clex_decimal_ints
213 #endif
214
215 #if STB_C_LEX_C_OCTAL_INTS(x)
216 #define STB__clex_octal_ints
217 #endif
218
219 #if STB_C_LEX_C_DECIMAL_FLOATS(x)
220 #define STB__clex_decimal_floats
221 #endif
222
223 #if STB_C_LEX_DISCARD_PREPROCESSOR(x)
224 #define STB__clex_discard_preprocessor
225 #endif
226
227 // Now pick a definition of Y/N that's conducive to
228 // defining the enum of token names.
229 #if STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x) || defined(STB_C_LEXER_SELF_TEST)
230 #undef N
231 #define N(a) Y(a)
232 #else
233 #undef N
234 #define N(a)
235 #endif
236
237 #undef Y
238 #define Y(a) a,
239
240 enum
241 {
242 CLEX_eof = 256,
243 CLEX_parse_error,
244
245 #ifdef STB__clex_define_int
246 CLEX_intlit,
247 #endif
248
249 STB_C_LEX_C_DECIMAL_FLOATS( CLEX_floatlit )
250 STB_C_LEX_C_IDENTIFIERS( CLEX_id )
251 STB_C_LEX_C_DQ_STRINGS( CLEX_dqstring )
252 STB_C_LEX_C_SQ_STRINGS( CLEX_sqstring )
253 STB_C_LEX_C_CHARS( CLEX_charlit )
254 STB_C_LEX_C_COMPARISONS( CLEX_eq )
255 STB_C_LEX_C_COMPARISONS( CLEX_noteq )
256 STB_C_LEX_C_COMPARISONS( CLEX_lesseq )
257 STB_C_LEX_C_COMPARISONS( CLEX_greatereq )
258 STB_C_LEX_C_LOGICAL( CLEX_andand )
259 STB_C_LEX_C_LOGICAL( CLEX_oror )
260 STB_C_LEX_C_SHIFTS( CLEX_shl )
261 STB_C_LEX_C_SHIFTS( CLEX_shr )
262 STB_C_LEX_C_INCREMENTS( CLEX_plusplus )
263 STB_C_LEX_C_INCREMENTS( CLEX_minusminus )
264 STB_C_LEX_C_ARITHEQ( CLEX_pluseq )
265 STB_C_LEX_C_ARITHEQ( CLEX_minuseq )
266 STB_C_LEX_C_ARITHEQ( CLEX_muleq )
267 STB_C_LEX_C_ARITHEQ( CLEX_diveq )
268 STB_C_LEX_C_ARITHEQ( CLEX_modeq )
269 STB_C_LEX_C_BITWISEEQ( CLEX_andeq )
270 STB_C_LEX_C_BITWISEEQ( CLEX_oreq )
271 STB_C_LEX_C_BITWISEEQ( CLEX_xoreq )
272 STB_C_LEX_C_ARROW( CLEX_arrow )
273 STB_C_LEX_EQUAL_ARROW( CLEX_eqarrow )
274
275 #ifdef STB__clex_define_shifts
276 CLEX_shleq, CLEX_shreq,
277 #endif
278
279 CLEX_first_unused_token
280
281 #undef Y
282 #define Y(a) a
283 };
284
285 // Now for the rest of the file we'll use the basic definition where
286 // where Y expands to its contents and N expands to nothing
287 #undef N
288 #define N(a)
289
290 // API function
291 void stb_c_lexer_init(stb_lexer *lexer, const char *input_stream, const char *input_stream_end, char *string_store, int store_length)
292 {
293 lexer->input_stream = (char *) input_stream;
294 lexer->eof = (char *) input_stream_end;
295 lexer->parse_point = (char *) input_stream;
296 lexer->string_storage = string_store;
297 lexer->string_storage_len = store_length;
298 }
299
300 // API function
301 void stb_c_lexer_get_location(const stb_lexer *lexer, const char *where, stb_lex_location *loc)
302 {
303 char *p = lexer->input_stream;
304 int line_number = 1;
305 int char_offset = 0;
306 while (*p && p < where) {
307 if (*p == '\n' || *p == '\r') {
308 p += (p[0]+p[1] == '\r'+'\n' ? 2 : 1); // skip newline
309 line_number += 1;
310 char_offset = 0;
311 } else {
312 ++p;
313 ++char_offset;
314 }
315 }
316 loc->line_number = line_number;
317 loc->line_offset = char_offset;
318 }
319
320 // main helper function for returning a parsed token
321 static int stb__clex_token(stb_lexer *lexer, int token, char *start, char *end)
322 {
323 lexer->token = token;
324 lexer->where_firstchar = start;
325 lexer->where_lastchar = end;
326 lexer->parse_point = end+1;
327 return 1;
328 }
329
330 // helper function for returning eof
331 static int stb__clex_eof(stb_lexer *lexer)
332 {
333 lexer->token = CLEX_eof;
334 return 0;
335 }
336
// Default whitespace test: space, tab, CR, LF, and form feed.
// Returns non-zero if x is one of those characters.
static int stb__clex_iswhite(int x)
{
   return x == ' ' || x == '\t' || x == '\r' || x == '\n' || x == '\f';
}

// Minimal strchr replacement so the lexer does not need <string.h>.
// Returns a pointer to the first occurrence of ch in str, or 0 if absent.
// Unlike the standard strchr, the terminating 0 is never matched.
static const char *stb__strchr(const char *str, int ch)
{
   for (; *str; ++str)
      if (*str == ch)
         return str;
   return 0;
}

350 // parse suffixes at the end of a number
351 static int stb__clex_parse_suffixes(stb_lexer *lexer, long tokenid, char *start, char *cur, const char *suffixes)
352 {
353 #ifdef STB__clex_parse_suffixes
354 lexer->string = lexer->string_storage;
355 lexer->string_len = 0;
356
357 while ((*cur >= 'a' && *cur <= 'z') || (*cur >= 'A' && *cur <= 'Z')) {
358 if (stb__strchr(suffixes, *cur) == 0)
359 return stb__clex_token(lexer, CLEX_parse_error, start, cur);
360 if (lexer->string_len+1 >= lexer->string_storage_len)
361 return stb__clex_token(lexer, CLEX_parse_error, start, cur);
362 lexer->string[lexer->string_len++] = *cur++;
363 }
364 #else
365 suffixes = suffixes; // attempt to suppress warnings
366 #endif
367 return stb__clex_token(lexer, tokenid, start, cur-1);
368 }
369
#ifndef STB__CLEX_use_stdlib
// Fallback float parser used when strtod is not available.
// Accepts "[0-9]*(.[0-9]*)?([eE]-?[0-9]+)?" starting at p, stores the address
// of the first unconsumed character in *q, and returns the value.
// Precision is limited: the result is built with plain double arithmetic.
static double stb__clex_parse_float(char *p, char **q)
{
   double value=0;
   while (*p >= '0' && *p <= '9')
      value = value*10 + (*p++ - '0');
   if (*p == '.') {
      double powten=1, addend = 0;
      ++p;
      // accumulate the fraction digits as an integer, then scale it down once
      while (*p >= '0' && *p <= '9') {
         addend = addend*10 + (*p++ - '0');
         powten *= 10;
      }
      value += addend / powten;
   }
   if (*p == 'e' || *p == 'E') {
      int sign = p[1] == '-';
      int exponent=0;
      double pow10=1;
      p += 1+sign;
      while (*p >= '0' && *p <= '9')
         exponent = exponent*10 + (*p++ - '0');
      // can't use pow() from stdlib, so do it slow way
      while (exponent-- > 0)
         pow10 *= 10;
      if (sign)
         value /= pow10;
      else
         value *= pow10;
   }
   *q = p;
   return value;
}
#endif

// Decodes one (possibly backslash-escaped) character starting at p.
//   - returns the character value (0..255) and sets *q just past it
//   - returns -1 for escape forms that are not implemented yet (\x, \X, \u);
//     *q is then p+2
// A backslash followed by any other unlisted character is NOT treated as an
// escape: the backslash itself is returned and *q is p+1.
static int stb__clex_parse_char(char *p, char **q)
{
   if (*p == '\\') {
      *q = p+2; // tentatively guess we'll parse two characters
      switch(p[1]) {
         case '\\': return '\\';
         case '\'': return '\'';
         case '"': return '"';
         case 't': return '\t';
         case 'f': return '\f';
         case 'n': return '\n';
         case 'r': return '\r';
         case '0': return '\0'; // @TODO octal constants
         case 'x': case 'X': return -1; // @TODO hex constants
         case 'u': return -1; // @TODO unicode constants
      }
   }
   *q = p+1;
   return (unsigned char) *p;
}

426 static int stb__clex_parse_string(stb_lexer *lexer, char *p, int type)
427 {
428 char *start = p;
429 char delim = *p++; // grab the " or ' for later matching
430 char *out = lexer->string_storage;
431 char *outend = lexer->string_storage + lexer->string_storage_len;
432 while (*p != delim) {
433 int n;
434 if (*p == '\\') {
435 char *q;
436 n = stb__clex_parse_char(p, &q);
437 if (n < 0)
438 return stb__clex_token(lexer, CLEX_parse_error, start, q);
439 p = q;
440 } else {
441 // @OPTIMIZE: could speed this up by looping-while-not-backslash
442 n = (unsigned char) *p++;
443 }
444 if (out+1 > outend)
445 return stb__clex_token(lexer, CLEX_parse_error, start, p);
446 // @TODO expand unicode escapes to UTF8
447 *out++ = (char) n;
448 }
449 *out = 0;
450 lexer->string = lexer->string_storage;
451 lexer->string_len = out - lexer->string_storage;
452 return stb__clex_token(lexer, type, start, p);
453 }
454
455 int stb_c_lexer_get_token(stb_lexer *lexer)
456 {
457 char *p = lexer->parse_point;
458
459 // skip whitespace and comments
460 for (;;) {
461 #ifdef STB_C_LEX_ISWHITE
462 while (p != lexer->stream_end) {
463 int n;
464 n = STB_C_LEX_ISWHITE(p);
465 if (n == 0) break;
466 if (lexer->eof && lexer->eof - lexer->parse_point < n)
467 return stb__clex_token(tok, CLEX_parse_error, p,lexer->eof-1);
468 p += n;
469 }
470 #else
471 while (p != lexer->eof && stb__clex_iswhite(*p))
472 ++p;
473 #endif
474
475 STB_C_LEX_CPP_COMMENTS(
476 if (p != lexer->eof && p[0] == '/' && p[1] == '/') {
477 while (p != lexer->eof && *p != '\r' && *p != '\n')
478 ++p;
479 continue;
480 }
481 )
482
483 STB_C_LEX_C_COMMENTS(
484 if (p != lexer->eof && p[0] == '/' && p[1] == '*') {
485 char *start = p;
486 p += 2;
487 while (p != lexer->eof && (p[0] != '*' || p[1] != '/'))
488 ++p;
489 if (p == lexer->eof)
490 return stb__clex_token(lexer, CLEX_parse_error, start, p-1);
491 p += 2;
492 continue;
493 }
494 )
495
496 #ifdef STB__clex_discard_preprocessor
497 // @TODO this discards everything after a '#', regardless
498 // of where in the line the # is, rather than requiring it
499 // be at the start. (because this parser doesn't otherwise
500 // check for line breaks!)
501 if (p != lexer->eof && p[0] == '#') {
502 while (p != lexer->eof && *p != '\r' && *p != '\n')
503 ++p;
504 continue;
505 }
506 #endif
507
508 break;
509 }
510
511 if (p == lexer->eof)
512 return stb__clex_eof(lexer);
513
514 switch (*p) {
515 default:
516 if ( (*p >= 'a' && *p <= 'z')
517 || (*p >= 'A' && *p <= 'Z')
518 || *p == '_' || (unsigned char) *p >= 128 // >= 128 is UTF8 char
519 STB_C_LEX_DOLLAR_IDENTIFIER( || *p == '$' ) )
520 {
521 int n = 0;
522 lexer->string = lexer->string_storage;
523 lexer->string_len = n;
524 do {
525 if (n+1 >= lexer->string_storage_len)
526 return stb__clex_token(lexer, CLEX_parse_error, p, p+n);
527 lexer->string[n] = p[n];
528 ++n;
529 } while (
530 (p[n] >= 'a' && p[n] <= 'z')
531 || (p[n] >= 'A' && p[n] <= 'Z')
532 || (p[n] >= '0' && p[n] <= '9') // allow digits in middle of identifier
533 || p[n] == '_' || (unsigned char) p[n] >= 128
534 STB_C_LEX_DOLLAR_IDENTIFIER( || p[n] == '$' )
535 );
536 lexer->string[n] = 0;
537 return stb__clex_token(lexer, CLEX_id, p, p+n-1);
538 }
539
540 // check for EOF
541 STB_C_LEX_0_IS_EOF(
542 if (*p == 0)
543 return stb__clex_eof(tok);
544 )
545
546 single_char:
547 // not an identifier, return the character as itself
548 return stb__clex_token(lexer, *p, p, p);
549
550 case '+':
551 if (p+1 != lexer->eof) {
552 STB_C_LEX_C_INCREMENTS(if (p[1] == '+') return stb__clex_token(lexer, CLEX_plusplus, p,p+1);)
553 STB_C_LEX_C_ARITHEQ( if (p[1] == '=') return stb__clex_token(lexer, CLEX_pluseq , p,p+1);)
554 }
555 goto single_char;
556 case '-':
557 if (p+1 != lexer->eof) {
558 STB_C_LEX_C_INCREMENTS(if (p[1] == '-') return stb__clex_token(lexer, CLEX_minusminus, p,p+1);)
559 STB_C_LEX_C_ARITHEQ( if (p[1] == '=') return stb__clex_token(lexer, CLEX_minuseq , p,p+1);)
560 STB_C_LEX_C_ARROW( if (p[1] == '>') return stb__clex_token(lexer, CLEX_arrow , p,p+1);)
561 }
562 goto single_char;
563 case '&':
564 if (p+1 != lexer->eof) {
565 STB_C_LEX_C_LOGICAL( if (p[1] == '&') return stb__clex_token(lexer, CLEX_andand, p,p+1);)
566 STB_C_LEX_C_BITWISEEQ(if (p[1] == '=') return stb__clex_token(lexer, CLEX_andeq , p,p+1);)
567 }
568 goto single_char;
569 case '|':
570 if (p+1 != lexer->eof) {
571 STB_C_LEX_C_LOGICAL( if (p[1] == '|') return stb__clex_token(lexer, CLEX_oror, p,p+1);)
572 STB_C_LEX_C_BITWISEEQ(if (p[1] == '=') return stb__clex_token(lexer, CLEX_oreq, p,p+1);)
573 }
574 goto single_char;
575 case '=':
576 if (p+1 != lexer->eof) {
577 STB_C_LEX_C_COMPARISONS(if (p[1] == '=') return stb__clex_token(lexer, CLEX_eq, p,p+1);)
578 STB_C_LEX_EQUAL_ARROW( if (p[1] == '>') return stb__clex_token(lexer, CLEX_eqarrow, p,p+1);)
579 }
580 goto single_char;
581 case '!':
582 STB_C_LEX_C_COMPARISONS(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_noteq, p,p+1);)
583 goto single_char;
584 case '^':
585 STB_C_LEX_C_BITWISEEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_xoreq, p,p+1));
586 goto single_char;
587 case '%':
588 STB_C_LEX_C_ARITHEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_modeq, p,p+1));
589 goto single_char;
590 case '*':
591 STB_C_LEX_C_ARITHEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_muleq, p,p+1));
592 goto single_char;
593 case '/':
594 STB_C_LEX_C_ARITHEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_diveq, p,p+1));
595 goto single_char;
596 case '<':
597 if (p+1 != lexer->eof) {
598 STB_C_LEX_C_COMPARISONS(if (p[1] == '=') return stb__clex_token(lexer, CLEX_lesseq, p,p+1);)
599 STB_C_LEX_C_SHIFTS( if (p[1] == '<') {
600 STB_C_LEX_C_ARITHEQ(if (p+2 != lexer->eof && p[2] == '=')
601 return stb__clex_token(lexer, CLEX_shleq, p,p+2);)
602 return stb__clex_token(lexer, CLEX_shl, p,p+1);
603 }
604 )
605 }
606 goto single_char;
607 case '>':
608 if (p+1 != lexer->eof) {
609 STB_C_LEX_C_COMPARISONS(if (p[1] == '=') return stb__clex_token(lexer, CLEX_greatereq, p,p+1);)
610 STB_C_LEX_C_SHIFTS( if (p[1] == '>') {
611 STB_C_LEX_C_ARITHEQ(if (p+2 != lexer->eof && p[2] == '=')
612 return stb__clex_token(lexer, CLEX_shreq, p,p+2);)
613 return stb__clex_token(lexer, CLEX_shr, p,p+1);
614 }
615 )
616 }
617 goto single_char;
618
619 case '"':
620 STB_C_LEX_C_DQ_STRINGS(return stb__clex_parse_string(lexer, p, CLEX_dqstring);)
621 goto single_char;
622 case '\'':
623 STB_C_LEX_C_SQ_STRINGS(return stb__clex_parse_string(lexer, p, CLEX_sqstring);)
624 STB_C_LEX_C_CHARS(
625 {
626 char *start = p;
627 lexer->int_number = stb__clex_parse_char(p+1, &p);
628 if (lexer->int_number < 0)
629 return stb__clex_token(lexer, CLEX_parse_error, start,start);
630 if (p == lexer->eof || *p != '\'')
631 return stb__clex_token(lexer, CLEX_parse_error, start,p);
632 return stb__clex_token(lexer, CLEX_charlit, start, p+1);
633 })
634 goto single_char;
635
636 case '0':
637 #ifdef STB__clex_hex_ints
638 if (p+1 != lexer->eof) {
639 if (p[1] == 'x' || p[1] == 'X') {
640 char *q = p+2;
641 #ifdef STB__CLEX_use_stdlib
642 lexer->int_number = strtol((char *) p, (char **) &q, 16);
643 #else
644 stb__clex_int n=0;
645 while (q != lexer->eof) {
646 if (*q >= '0' && *q <= '9')
647 n = n*16 + (*q - '0');
648 else if (*q >= 'a' && *q <= 'f')
649 n = n*16 + (*q - 'a') + 10;
650 else if (*q >= 'A' && *q <= 'F')
651 n = n*16 + (*q - 'A') + 10;
652 else
653 break;
654 ++q;
655 }
656 lexer->int_field = n; // int_field is macro that expands to real_number/int_number depending on type of n
657 #endif
658 if (q == p+2)
659 return stb__clex_token(lexer, CLEX_parse_error, p-2,p-1);
660 return stb__clex_parse_suffixes(lexer, CLEX_intlit, p,q, STB_C_LEX_HEX_SUFFIXES);
661 }
662 }
663 #endif // STB__clex_hex_ints
664 // can't test for octal because we might parse '0.0' as float or as '0' '.' '0',
665 // so have to do float first
666
667 /* FALL THROUGH */
668 case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
669
670 #ifdef STB__clex_decimal_floats
671 {
672 char *q = p;
673 while (q != lexer->eof && (*q >= '0' && *q <= '9'))
674 ++q;
675 if (q != lexer->eof) {
676 if (*q == '.' STB_C_LEX_FLOAT_NO_DECIMAL(|| *q == 'e' || *q == 'E')) {
677 #ifdef STB__CLEX_use_stdlib
678 lexer->real_number = strtod((char *) p, (char**) &q);
679 #else
680 lexer->real_number = stb__clex_parse_float(p, &q);
681 #endif
682
683 return stb__clex_parse_suffixes(lexer, CLEX_floatlit, p,q, STB_C_LEX_FLOAT_SUFFIXES);
684
685 }
686 }
687 }
688 #endif // STB__clex_decimal_floats
689
690 #ifdef STB__clex_octal_ints
691 if (p[0] == '0') {
692 char *q = p;
693 #ifdef STB__CLEX_use_stdlib
694 lexer->int_number = strtol((char *) p, (char **) &q, 8);
695 #else
696 stb__clex_int n=0;
697 while (q != lexer->eof) {
698 if (*q >= '0' && *q <= '7')
699 n = n*8 + (q - '0');
700 else
701 break;
702 ++q;
703 }
704 if (q != lexer->eof && (*q == '8' || *q=='9'))
705 return stb__clex_token(tok, CLEX_parse_error, p, q);
706 lexer->int_field = n;
707 #endif
708 return stb__clex_parse_suffixes(lexer, CLEX_intlit, p,q, STB_C_LEX_OCTAL_SUFFIXES);
709 }
710 #endif // STB__clex_octal_ints
711
712 #ifdef STB__clex_decimal_ints
713 {
714 char *q = p;
715 #ifdef STB__CLEX_use_stdlib
716 lexer->int_number = strtol((char *) p, (char **) &q, 10);
717 #else
718 stb__clex_int n=0;
719 while (q != lexer->eof) {
720 if (*q >= '0' && *q <= '9')
721 n = n*10 + (q - '0');
722 else
723 break;
724 ++q;
725 }
726 lexer->int_field = n;
727 #endif
728 return stb__clex_parse_suffixes(lexer, CLEX_intlit, p,q, STB_C_LEX_OCTAL_SUFFIXES);
729 }
730 #endif // STB__clex_decimal_ints
731 goto single_char;
732 }
733 }
#endif // STB_C_LEXER_IMPLEMENTATION

#ifdef STB_C_LEXER_SELF_TEST

#include <stdio.h>
#include <stdlib.h>

740 static void print_token(stb_lexer *lexer)
741 {
742 switch (lexer->token) {
743 case CLEX_id : printf("_%s", lexer->string); break;
744 case CLEX_eq : printf("=="); break;
745 case CLEX_noteq : printf("!="); break;
746 case CLEX_lesseq : printf("<="); break;
747 case CLEX_greatereq : printf(">="); break;
748 case CLEX_andand : printf("&&"); break;
749 case CLEX_oror : printf("||"); break;
750 case CLEX_shl : printf("<<"); break;
751 case CLEX_shr : printf(">>"); break;
752 case CLEX_plusplus : printf("++"); break;
753 case CLEX_minusminus: printf("--"); break;
754 case CLEX_arrow : printf("->"); break;
755 case CLEX_andeq : printf("&="); break;
756 case CLEX_oreq : printf("|="); break;
757 case CLEX_xoreq : printf("^="); break;
758 case CLEX_pluseq : printf("+="); break;
759 case CLEX_minuseq : printf("-="); break;
760 case CLEX_muleq : printf("*="); break;
761 case CLEX_diveq : printf("/="); break;
762 case CLEX_modeq : printf("%%="); break;
763 case CLEX_shleq : printf("<<="); break;
764 case CLEX_shreq : printf(">>="); break;
765 case CLEX_eqarrow : printf("=>"); break;
766 case CLEX_dqstring : printf("\"%s\"", lexer->string); break;
767 case CLEX_sqstring : printf("'\"%s\"'", lexer->string); break;
768 case CLEX_charlit : printf("'%s'", lexer->string); break;
769 #if defined(STB__clex_int_as_double) && !defined(STB__CLEX_use_stdlib)
770 case CLEX_intlit : printf("#%g", lexer->real_number); break;
771 #else
772 case CLEX_intlit : printf("#%ld", lexer->int_number); break;
773 #endif
774 case CLEX_floatlit : printf("%g", lexer->real_number); break;
775 default:
776 if (lexer->token >= 0 && lexer->token < 256)
777 printf("%c", (int) lexer->token);
778 else {
779 printf("<<<UNKNOWN TOKEN %ld >>>\n", lexer->token);
780 }
781 break;
782 }
783 }
784
/* Force a test
   of parsing
   multiline comments */

/*/ comment /*/
/**/ extern /**/

// Never called. The self-test lexes this very file, so this function exists
// only as lexer input: presumably a regression fixture for the linked issue
// (a token directly following a closing quote mark) -- keep the extra
// printf argument as-is.
void dummy(void)
{
   printf("test",1); // https://github.com/nothings/stb/issues/13
}

797 int main(int argc, char **argv)
798 {
799 FILE *f = fopen("stb_c_lexer.h","rb");
800 char *text = (char *) malloc(1 << 20);
801 int len = f ? fread(text, 1, 1<<20, f) : -1;
802 stb_lexer lex;
803 if (len < 0) {
804 fprintf(stderr, "Error opening file\n");
805 return 1;
806 }
807 fclose(f);
808
809 stb_c_lexer_init(&lex, text, text+len, (char *) malloc(1<<16), 1<<16);
810 while (stb_c_lexer_get_token(&lex)) {
811 if (lex.token == CLEX_parse_error) {
812 printf("\n<<<PARSE ERROR>>>\n");
813 break;
814 }
815 print_token(&lex);
816 printf(" ");
817 }
818 return 0;
819 }
820 #endif