stb/stb_c_lexer.h

   1 // stb_c_lexer.h - v0.08 - public domain Sean Barrett 2013
   2 // lexer for making little C-like languages with recursive-descent parsers
   3 //
   4 // This file provides both the interface and the implementation.
   5 // To instantiate the implementation,
   6 //      #define STB_C_LEXER_IMPLEMENTATION
   7 // in *ONE* source file, before #including this file.
   8 //
   9 // The default configuration is fairly close to a C lexer, although
  10 // suffixes on integer constants are not handled (you can override this).
  11 //
  12 // History:
  13 //     0.08 fix bad pointer comparison
  14 //     0.07 fix mishandling of hexadecimal constants parsed by strtol
  15 //     0.06 fix missing next character after ending quote mark (Andreas Fredriksson)
  16 //     0.05 refixed get_location because github version had lost the fix
  17 //     0.04 fix octal parsing bug
  18 //     0.03 added STB_C_LEX_DISCARD_PREPROCESSOR option
  19 //          refactor API to simplify (only one struct instead of two)
  20 //          change literal enum names to have 'lit' at the end
  21 //     0.02 first public release
  22 //
  23 // Status:
  24 //     - haven't tested compiling as C++
  25 //     - haven't tested the float parsing path
  26 //     - haven't tested the non-default-config paths (e.g. non-stdlib)
  27 //     - only tested default-config paths by eyeballing output of self-parse
  28 //
  29 //     - haven't implemented multiline strings
  30 //     - haven't implemented octal/hex character constants
  31 //     - haven't implemented support for unicode CLEX_char
  32 //     - need to expand error reporting so you don't just get "CLEX_parse_error"
  33 //
  34 // Contributors:
  35 //   Arpad Goretity (bugfix)
  36 //
  37 // LICENSE
  38 //
  39 //   This software is dual-licensed to the public domain and under the following
  40 //   license: you are granted a perpetual, irrevocable license to copy, modify,
  41 //   publish, and distribute this file as you see fit.
  42
#ifndef STB_C_LEXER_DEFINITIONS
// Default lexer configuration. Each option is set to Y (enabled) or N (disabled).
// To change the default parsing rules, copy the following lines
// into your C/C++ file *before* including this, and then replace
// the Y's with N's for the ones you don't want.
// --BEGIN--

#define STB_C_LEX_C_DECIMAL_INTS    Y   //  "0|[1-9][0-9]*"                        CLEX_intlit
#define STB_C_LEX_C_HEX_INTS        Y   //  "0x[0-9a-fA-F]+"                       CLEX_intlit
#define STB_C_LEX_C_OCTAL_INTS      Y   //  "[0-7]+"                               CLEX_intlit
#define STB_C_LEX_C_DECIMAL_FLOATS  Y   //  "[0-9]*(.[0-9]*([eE]-?[0-9]+)?)"       CLEX_floatlit
#define STB_C_LEX_C_IDENTIFIERS     Y   //  "[_a-zA-Z][_a-zA-Z0-9]*"               CLEX_id
#define STB_C_LEX_C_DQ_STRINGS      Y   //  double-quote-delimited strings with escapes  CLEX_dqstring
#define STB_C_LEX_C_SQ_STRINGS      N   //  single-quote-delimited strings with escapes  CLEX_sqstring
#define STB_C_LEX_C_CHARS           Y   //  single-quote-delimited character with escape CLEX_charlit
#define STB_C_LEX_C_COMMENTS        Y   //  "/* comment */"
#define STB_C_LEX_CPP_COMMENTS      Y   //  "// comment to end of line\n"
#define STB_C_LEX_C_COMPARISONS     Y   //  "==" CLEX_eq  "!=" CLEX_noteq   "<=" CLEX_lesseq  ">=" CLEX_greatereq
#define STB_C_LEX_C_LOGICAL         Y   //  "&&"  CLEX_andand   "||"  CLEX_oror
#define STB_C_LEX_C_SHIFTS          Y   //  "<<"  CLEX_shl      ">>"  CLEX_shr
#define STB_C_LEX_C_INCREMENTS      Y   //  "++"  CLEX_plusplus "--"  CLEX_minusminus
#define STB_C_LEX_C_ARROW           Y   //  "->"  CLEX_arrow
#define STB_C_LEX_EQUAL_ARROW       N   //  "=>"  CLEX_eqarrow
#define STB_C_LEX_C_BITWISEEQ       Y   //  "&="  CLEX_andeq    "|="  CLEX_oreq     "^="  CLEX_xoreq
#define STB_C_LEX_C_ARITHEQ         Y   //  "+="  CLEX_pluseq   "-="  CLEX_minuseq
                                        //  "*="  CLEX_muleq    "/="  CLEX_diveq    "%=" CLEX_modeq
                                        //  if both STB_C_LEX_C_SHIFTS & STB_C_LEX_C_ARITHEQ:
                                        //                      "<<=" CLEX_shleq    ">>=" CLEX_shreq

#define STB_C_LEX_PARSE_SUFFIXES    N   // letters after numbers are parsed as part of those numbers, and must be in suffix list below
#define STB_C_LEX_DECIMAL_SUFFIXES  ""  // decimal integer suffixes e.g. "uUlL" -- these are returned as-is in string storage
#define STB_C_LEX_HEX_SUFFIXES      ""  // e.g. "uUlL"
#define STB_C_LEX_OCTAL_SUFFIXES    ""  // e.g. "uUlL"
#define STB_C_LEX_FLOAT_SUFFIXES    ""  //

#define STB_C_LEX_0_IS_EOF             N  // if Y, ends parsing at '\0'; if N, returns '\0' as token
#define STB_C_LEX_INTEGERS_AS_DOUBLES  N  // parses integers as doubles so they can be larger than 'int', but only if STB_C_LEX_USE_STDLIB==N
#define STB_C_LEX_MULTILINE_DSTRINGS   N  // allow newlines in double-quoted strings
#define STB_C_LEX_MULTILINE_SSTRINGS   N  // allow newlines in single-quoted strings
#define STB_C_LEX_USE_STDLIB           Y  // use strtod,strtol for parsing #s; otherwise inaccurate hack
#define STB_C_LEX_DOLLAR_IDENTIFIER    Y  // allow $ as an identifier character
#define STB_C_LEX_FLOAT_NO_DECIMAL     Y  // allow floats that have no decimal point if they have an exponent

#define STB_C_LEX_DEFINE_ALL_TOKEN_NAMES  N   // if Y, all CLEX_ token names are defined, even if never returned
                                              // leaving it as N should help you catch config bugs

#define STB_C_LEX_DISCARD_PREPROCESSOR    Y   // discard C-preprocessor directives (e.g. after preprocessing
                                              // you still have #line, #pragma, etc)

//#define STB_C_LEX_ISWHITE(str)    ... // return length in bytes of whitespace characters if first char is whitespace

#define STB_C_LEXER_DEFINITIONS         // This line prevents the header file from replacing your definitions
// --END--

#endif
  97
  98 #ifndef INCLUDE_STB_C_LEXER_H
  99 #define INCLUDE_STB_C_LEXER_H
 100
// Complete lexer state: the input being scanned, the caller-provided string
// storage, and the description of the most recently returned token.
typedef struct
{
   // lexer variables
   char *input_stream;        // start of the text, as passed to stb_c_lexer_init
   char *eof;                 // end of the text as passed to stb_c_lexer_init (may be NULL, see init docs)
   char *parse_point;         // where the next stb_c_lexer_get_token call starts scanning
   char *string_storage;      // caller-provided buffer for parsed strings and identifiers
   int   string_storage_len;  // size of string_storage in bytes

   // lexer parse location for error messages
   char *where_firstchar;     // first character of the most recent token
   char *where_lastchar;      // last character of the most recent token

   // lexer token variables
   long token;                // token ID of the most recent token
   double real_number;        // value for CLEX_floatlit
   long   int_number;         // value for CLEX_intlit, or character for CLEX_charlit
   char *string;              // text for string / identifier tokens (points into string_storage)
   int string_len;            // byte length of 'string'
} stb_lexer;
 121
// Human-readable position in the input, filled in by stb_c_lexer_get_location.
typedef struct
{
   int line_number;   // line in the file, counting from 1
   int line_offset;   // character offset within that line, counting from 0
} stb_lex_location;
 127
 128 #ifdef __cplusplus
 129 extern "C" {
 130 #endif
 131
extern void stb_c_lexer_init(stb_lexer *lexer, const char *input_stream, const char *input_stream_end, char *string_store, int store_length);
// this function initializes the 'lexer' structure
//   Input:
//   - input_stream points to the file to parse, loaded into memory
//   - input_stream_end points to the end of the file, or NULL if you use 0-for-EOF
//   - string_store is storage the lexer can use for storing parsed strings and identifiers
//   - store_length is the length of that storage

 139
extern int stb_c_lexer_get_token(stb_lexer *lexer);
// this function returns non-zero if a token is parsed, or 0 if at EOF
//   Output:
//   - lexer->token is the token ID: the character's own value for a single-char token,
//     or a CLEX_ enum value (these start at 256) for a multichar token, eof, or error
//   - lexer->real_number is a double constant value for CLEX_floatlit, or CLEX_intlit if STB_C_LEX_INTEGERS_AS_DOUBLES
//   - lexer->int_number is an integer constant for CLEX_intlit if !STB_C_LEX_INTEGERS_AS_DOUBLES, or character for CLEX_charlit
//   - lexer->string is a 0-terminated string for CLEX_dqstring or CLEX_sqstring or CLEX_id
//   - lexer->string_len is the byte length of lexer->string

 148
extern void stb_c_lexer_get_location(const stb_lexer *lexer, const char *where, stb_lex_location *loc);
// this inefficient function returns the line number and character offset of a
// given location in the file, such as the lexer->where_firstchar or
// lexer->where_lastchar set by stb_c_lexer_get_token. Because it's inefficient,
// you should only call it for errors, not for every token.
// For error messages of invalid tokens, you typically want the location of the start
// of the token (which caused the token to be invalid). For bugs involving legit
// tokens, you can report the first or the range.
//    Output:
//    - loc->line_number is the line number in the file, counting from 1, of the location
//    - loc->line_offset is the char-offset in the line, counting from 0, of the location

 159
 160
 161 #ifdef __cplusplus
 162 }
 163 #endif
 164
 165 #endif // INCLUDE_STB_C_LEXER_H
 166
 167 #ifdef STB_C_LEXER_IMPLEMENTATION
 168
 169    #if defined(Y) || defined(N)
 170    #error "Can only use stb_c_lexer in contexts where the preprocessor symbols 'Y' and 'N' are not defined"
 171    #endif
 172
 173
 174 // Hacky definitions so we can easily #if on them
 175 #define Y(x) 1
 176 #define N(x) 0
 177
 178 #if STB_C_LEX_USE_STDLIB(x)
 179 #define STB__CLEX_use_stdlib
 180 #include <stdlib.h>
 181 #endif
 182
 183 #if STB_C_LEX_INTEGERS_AS_DOUBLES(x)
 184 typedef double     stb__clex_int;
 185 #define intfield   real_number
 186 #define STB__clex_int_as_double
 187 #else
 188 typedef long       stb__clex_int;
 189 #define intfield   int_number
 190 #endif
 191
 192 // Convert these config options to simple conditional #defines so we can more
 193 // easily test them once we've change the meaning of Y/N
 194
 195 #if STB_C_LEX_PARSE_SUFFIXES(x)
 196 #define STB__clex_parse_suffixes
 197 #endif
 198
 199 #if STB_C_LEX_C_DECIMAL_INTS(x) || STB_C_LEX_C_HEX_INTS(x) || STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x)
 200 #define STB__clex_define_int
 201 #endif
 202
 203 #if (STB_C_LEX_C_ARITHEQ(x) && STB_C_LEX_C_SHIFTS(x)) || STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x)
 204 #define STB__clex_define_shifts
 205 #endif
 206
 207 #if STB_C_LEX_C_HEX_INTS(x)
 208 #define STB__clex_hex_ints
 209 #endif
 210
 211 #if STB_C_LEX_C_DECIMAL_INTS(x)
 212 #define STB__clex_decimal_ints
 213 #endif
 214
 215 #if STB_C_LEX_C_OCTAL_INTS(x)
 216 #define STB__clex_octal_ints
 217 #endif
 218
 219 #if STB_C_LEX_C_DECIMAL_FLOATS(x)
 220 #define STB__clex_decimal_floats
 221 #endif
 222
 223 #if STB_C_LEX_DISCARD_PREPROCESSOR(x)
 224 #define STB__clex_discard_preprocessor
 225 #endif
 226
// Now pick a definition of Y/N that's conducive to
// defining the enum of token names.
// When all names are requested (or for the self test), N behaves like Y so
// that disabled options still contribute their token name to the enum.
#if STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x) || defined(STB_C_LEXER_SELF_TEST)
  #undef  N
  #define N(a) Y(a)
#else
  #undef  N
  #define N(a)
#endif

// Inside the enum, an enabled option expands to "name," (an enumerator).
#undef  Y
#define Y(a) a,

// Token IDs for everything that is not a single character. They start at 256.
enum
{
   CLEX_eof = 256,
   CLEX_parse_error,

#ifdef STB__clex_define_int
   CLEX_intlit,
#endif

   STB_C_LEX_C_DECIMAL_FLOATS( CLEX_floatlit    )
   STB_C_LEX_C_IDENTIFIERS(  CLEX_id            )
   STB_C_LEX_C_DQ_STRINGS(   CLEX_dqstring      )
   STB_C_LEX_C_SQ_STRINGS(   CLEX_sqstring      )
   STB_C_LEX_C_CHARS(        CLEX_charlit       )
   STB_C_LEX_C_COMPARISONS(  CLEX_eq            )
   STB_C_LEX_C_COMPARISONS(  CLEX_noteq         )
   STB_C_LEX_C_COMPARISONS(  CLEX_lesseq        )
   STB_C_LEX_C_COMPARISONS(  CLEX_greatereq     )
   STB_C_LEX_C_LOGICAL(      CLEX_andand        )
   STB_C_LEX_C_LOGICAL(      CLEX_oror          )
   STB_C_LEX_C_SHIFTS(       CLEX_shl           )
   STB_C_LEX_C_SHIFTS(       CLEX_shr           )
   STB_C_LEX_C_INCREMENTS(   CLEX_plusplus      )
   STB_C_LEX_C_INCREMENTS(   CLEX_minusminus    )
   STB_C_LEX_C_ARITHEQ(      CLEX_pluseq        )
   STB_C_LEX_C_ARITHEQ(      CLEX_minuseq       )
   STB_C_LEX_C_ARITHEQ(      CLEX_muleq         )
   STB_C_LEX_C_ARITHEQ(      CLEX_diveq         )
   STB_C_LEX_C_ARITHEQ(      CLEX_modeq         )
   STB_C_LEX_C_BITWISEEQ(    CLEX_andeq         )
   STB_C_LEX_C_BITWISEEQ(    CLEX_oreq          )
   STB_C_LEX_C_BITWISEEQ(    CLEX_xoreq         )
   STB_C_LEX_C_ARROW(        CLEX_arrow         )
   STB_C_LEX_EQUAL_ARROW(    CLEX_eqarrow       )

#ifdef STB__clex_define_shifts
   CLEX_shleq, CLEX_shreq,
#endif

   CLEX_first_unused_token

#undef Y
#define Y(a) a
};

// Now for the rest of the file we'll use the basic definition
// where Y expands to its contents and N expands to nothing
#undef N
#define N(a)
 289
 290 // API function
 291 void stb_c_lexer_init(stb_lexer *lexer, const char *input_stream, const char *input_stream_end, char *string_store, int store_length)
 292 {
 293    lexer->input_stream = (char *) input_stream;
 294    lexer->eof = (char *) input_stream_end;
 295    lexer->parse_point = (char *) input_stream;
 296    lexer->string_storage = string_store;
 297    lexer->string_storage_len = store_length;
 298 }
 299
 300 // API function
 301 void stb_c_lexer_get_location(const stb_lexer *lexer, const char *where, stb_lex_location *loc)
 302 {
 303    char *p = lexer->input_stream;
 304    int line_number = 1;
 305    int char_offset = 0;
 306    while (*p && p < where) {
 307       if (*p == '\n' || *p == '\r') {
 308          p += (p[0]+p[1] == '\r'+'\n' ? 2 : 1); // skip newline
 309          line_number += 1;
 310          char_offset = 0;
 311       } else {
 312          ++p;
 313          ++char_offset;
 314       }
 315    }
 316    loc->line_number = line_number;
 317    loc->line_offset = char_offset;
 318 }
 319
 320 // main helper function for returning a parsed token
 321 static int stb__clex_token(stb_lexer *lexer, int token, char *start, char *end)
 322 {
 323    lexer->token = token;
 324    lexer->where_firstchar = start;
 325    lexer->where_lastchar = end;
 326    lexer->parse_point = end+1;
 327    return 1;
 328 }
 329
 330 // helper function for returning eof
 331 static int stb__clex_eof(stb_lexer *lexer)
 332 {
 333    lexer->token = CLEX_eof;
 334    return 0;
 335 }
 336
 337 static int stb__clex_iswhite(int x)
 338 {
 339    return x == ' ' || x == '\t' || x == '\r' || x == '\n' || x == '\f';
 340 }
 341
 342 static const char *stb__strchr(const char *str, int ch)
 343 {
 344    for (; *str; ++str)
 345       if (*str == ch)
 346          return str;
 347    return 0;
 348 }
 349
 350 // parse suffixes at the end of a number
 351 static int stb__clex_parse_suffixes(stb_lexer *lexer, long tokenid, char *start, char *cur, const char *suffixes)
 352 {
 353    #ifdef STB__clex_parse_suffixes
 354    lexer->string = lexer->string_storage;
 355    lexer->string_len = 0;
 356
 357    while ((*cur >= 'a' && *cur <= 'z') || (*cur >= 'A' && *cur <= 'Z')) {
 358       if (stb__strchr(suffixes, *cur) == 0)
 359          return stb__clex_token(lexer, CLEX_parse_error, start, cur);
 360       if (lexer->string_len+1 >= lexer->string_storage_len)
 361          return stb__clex_token(lexer, CLEX_parse_error, start, cur);
 362       lexer->string[lexer->string_len++] = *cur++;
 363    }
 364    #else
 365    suffixes = suffixes; // attempt to suppress warnings
 366    #endif
 367    return stb__clex_token(lexer, tokenid, start, cur-1);
 368 }
 369
 370 #ifndef STB__CLEX_use_stdlib
 371 static double stb__clex_parse_float(char *p, char **q)
 372 {
 373    double value=0;
 374    while (*p >= '0' && *p <= '9')
 375       value = value*10 + (*p++ - '0');
 376    if (*p == '.') {
 377       double powten=1, addend = 0;
 378       ++p;
 379       while (*p >= '0' && *p <= '9') {
 380          addend = addend + 10*(*p++ - '0');
 381          powten *= 10;
 382       }
 383       value += addend / powten;
 384    }
 385    if (*p == 'e' || *p == 'E') {
 386       int sign = p[1] == '-';
 387       int exponent=0;
 388       double pow10=1;
 389       p += 1+sign;
 390       while (*p >= '0' && *p <= '9')
 391          exponent = exponent*10 + (*p++ - '0');
 392       // can't use pow() from stdlib, so do it slow way
 393       while (exponent-- > 0)
 394          pow10 *= 10;
 395       if (sign)
 396          value /= pow10;
 397       else
 398          value *= pow10;
 399    }
 400    *q = p;
 401    return value;
 402 }
 403 #endif
 404
 405 static int stb__clex_parse_char(char *p, char **q)
 406 {
 407    if (*p == '\\') {
 408       *q = p+2; // tentatively guess we'll parse two characters
 409       switch(p[1]) {
 410          case '\\': return '\\';
 411          case '\'': return '\'';
 412          case '"': return '"';
 413          case 't': return '\t';
 414          case 'f': return '\f';
 415          case 'n': return '\n';
 416          case 'r': return '\r';
 417          case '0': return '\0'; // @TODO ocatal constants
 418          case 'x': case 'X': return -1; // @TODO hex constants
 419          case 'u': return -1; // @TODO unicode constants
 420       }
 421    }
 422    *q = p+1;
 423    return (unsigned char) *p;
 424 }
 425
 426 static int stb__clex_parse_string(stb_lexer *lexer, char *p, int type)
 427 {
 428    char *start = p;
 429    char delim = *p++; // grab the " or ' for later matching
 430    char *out = lexer->string_storage;
 431    char *outend = lexer->string_storage + lexer->string_storage_len;
 432    while (*p != delim) {
 433       int n;
 434       if (*p == '\\') {
 435          char *q;
 436          n = stb__clex_parse_char(p, &q);
 437          if (n < 0)
 438             return stb__clex_token(lexer, CLEX_parse_error, start, q);
 439          p = q;
 440       } else {
 441          // @OPTIMIZE: could speed this up by looping-while-not-backslash
 442          n = (unsigned char) *p++;
 443       }
 444       if (out+1 > outend)
 445          return stb__clex_token(lexer, CLEX_parse_error, start, p);
 446       // @TODO expand unicode escapes to UTF8
 447       *out++ = (char) n;
 448    }
 449    *out = 0;
 450    lexer->string = lexer->string_storage;
 451    lexer->string_len = out - lexer->string_storage;
 452    return stb__clex_token(lexer, type, start, p);
 453 }
 454
 455 int stb_c_lexer_get_token(stb_lexer *lexer)
 456 {
 457    char *p = lexer->parse_point;
 458
 459    // skip whitespace and comments
 460    for (;;) {
 461       #ifdef STB_C_LEX_ISWHITE
 462       while (p != lexer->stream_end) {
 463          int n;
 464          n = STB_C_LEX_ISWHITE(p);
 465          if (n == 0) break;
 466          if (lexer->eof && lexer->eof - lexer->parse_point < n)
 467             return stb__clex_token(tok, CLEX_parse_error, p,lexer->eof-1);
 468          p += n;
 469       }
 470       #else
 471       while (p != lexer->eof && stb__clex_iswhite(*p))
 472          ++p;
 473       #endif
 474
 475       STB_C_LEX_CPP_COMMENTS(
 476          if (p != lexer->eof && p[0] == '/' && p[1] == '/') {
 477             while (p != lexer->eof && *p != '\r' && *p != '\n')
 478                ++p;
 479             continue;
 480          }
 481       )
 482
 483       STB_C_LEX_C_COMMENTS(
 484          if (p != lexer->eof && p[0] == '/' && p[1] == '*') {
 485             char *start = p;
 486             p += 2;
 487             while (p != lexer->eof && (p[0] != '*' || p[1] != '/'))
 488                ++p;
 489             if (p == lexer->eof)
 490                return stb__clex_token(lexer, CLEX_parse_error, start, p-1);
 491             p += 2;
 492             continue;
 493          }
 494       )
 495
 496       #ifdef STB__clex_discard_preprocessor
 497          // @TODO this discards everything after a '#', regardless
 498          // of where in the line the # is, rather than requiring it
 499          // be at the start. (because this parser doesn't otherwise
 500          // check for line breaks!)
 501          if (p != lexer->eof && p[0] == '#') {
 502             while (p != lexer->eof && *p != '\r' && *p != '\n')
 503                ++p;
 504             continue;
 505          }
 506       #endif
 507
 508       break;
 509    }
 510
 511    if (p == lexer->eof)
 512       return stb__clex_eof(lexer);
 513
 514    switch (*p) {
 515       default:
 516          if (   (*p >= 'a' && *p <= 'z')
 517              || (*p >= 'A' && *p <= 'Z')
 518              || *p == '_' || (unsigned char) *p >= 128    // >= 128 is UTF8 char
 519              STB_C_LEX_DOLLAR_IDENTIFIER( || *p == '$' ) )
 520          {
 521             int n = 0;
 522             lexer->string = lexer->string_storage;
 523             lexer->string_len = n;
 524             do {
 525                if (n+1 >= lexer->string_storage_len)
 526                   return stb__clex_token(lexer, CLEX_parse_error, p, p+n);
 527                lexer->string[n] = p[n];
 528                ++n;
 529             } while (
 530                   (p[n] >= 'a' && p[n] <= 'z')
 531                || (p[n] >= 'A' && p[n] <= 'Z')
 532                || (p[n] >= '0' && p[n] <= '9') // allow digits in middle of identifier
 533                || p[n] == '_' || (unsigned char) p[n] >= 128
 534                 STB_C_LEX_DOLLAR_IDENTIFIER( || p[n] == '$' )
 535             );
 536             lexer->string[n] = 0;
 537             return stb__clex_token(lexer, CLEX_id, p, p+n-1);
 538          }
 539
 540          // check for EOF
 541          STB_C_LEX_0_IS_EOF(
 542             if (*p == 0)
 543                return stb__clex_eof(tok);
 544          )
 545
 546       single_char:
 547          // not an identifier, return the character as itself
 548          return stb__clex_token(lexer, *p, p, p);
 549
 550       case '+':
 551          if (p+1 != lexer->eof) {
 552             STB_C_LEX_C_INCREMENTS(if (p[1] == '+') return stb__clex_token(lexer, CLEX_plusplus, p,p+1);)
 553             STB_C_LEX_C_ARITHEQ(   if (p[1] == '=') return stb__clex_token(lexer, CLEX_pluseq  , p,p+1);)
 554          }
 555          goto single_char;
 556       case '-':
 557          if (p+1 != lexer->eof) {
 558             STB_C_LEX_C_INCREMENTS(if (p[1] == '-') return stb__clex_token(lexer, CLEX_minusminus, p,p+1);)
 559             STB_C_LEX_C_ARITHEQ(   if (p[1] == '=') return stb__clex_token(lexer, CLEX_minuseq   , p,p+1);)
 560             STB_C_LEX_C_ARROW(     if (p[1] == '>') return stb__clex_token(lexer, CLEX_arrow     , p,p+1);)
 561          }
 562          goto single_char;
 563       case '&':
 564          if (p+1 != lexer->eof) {
 565             STB_C_LEX_C_LOGICAL(  if (p[1] == '&') return stb__clex_token(lexer, CLEX_andand, p,p+1);)
 566             STB_C_LEX_C_BITWISEEQ(if (p[1] == '=') return stb__clex_token(lexer, CLEX_andeq , p,p+1);)
 567          }
 568          goto single_char;
 569       case '|':
 570          if (p+1 != lexer->eof) {
 571             STB_C_LEX_C_LOGICAL(  if (p[1] == '|') return stb__clex_token(lexer, CLEX_oror, p,p+1);)
 572             STB_C_LEX_C_BITWISEEQ(if (p[1] == '=') return stb__clex_token(lexer, CLEX_oreq, p,p+1);)
 573          }
 574          goto single_char;
 575       case '=':
 576          if (p+1 != lexer->eof) {
 577             STB_C_LEX_C_COMPARISONS(if (p[1] == '=') return stb__clex_token(lexer, CLEX_eq, p,p+1);)
 578             STB_C_LEX_EQUAL_ARROW(  if (p[1] == '>') return stb__clex_token(lexer, CLEX_eqarrow, p,p+1);)
 579          }
 580          goto single_char;
 581       case '!':
 582          STB_C_LEX_C_COMPARISONS(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_noteq, p,p+1);)
 583          goto single_char;
 584       case '^':
 585          STB_C_LEX_C_BITWISEEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_xoreq, p,p+1));
 586          goto single_char;
 587       case '%':
 588          STB_C_LEX_C_ARITHEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_modeq, p,p+1));
 589          goto single_char;
 590       case '*':
 591          STB_C_LEX_C_ARITHEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_muleq, p,p+1));
 592          goto single_char;
 593       case '/':
 594          STB_C_LEX_C_ARITHEQ(if (p+1 != lexer->eof && p[1] == '=') return stb__clex_token(lexer, CLEX_diveq, p,p+1));
 595          goto single_char;
 596       case '<':
 597          if (p+1 != lexer->eof) {
 598             STB_C_LEX_C_COMPARISONS(if (p[1] == '=') return stb__clex_token(lexer, CLEX_lesseq, p,p+1);)
 599             STB_C_LEX_C_SHIFTS(     if (p[1] == '<') {
 600                                        STB_C_LEX_C_ARITHEQ(if (p+2 != lexer->eof && p[2] == '=')
 601                                                               return stb__clex_token(lexer, CLEX_shleq, p,p+2);)
 602                                        return stb__clex_token(lexer, CLEX_shl, p,p+1);
 603                                     }
 604                               )
 605          }
 606          goto single_char;
 607       case '>':
 608          if (p+1 != lexer->eof) {
 609             STB_C_LEX_C_COMPARISONS(if (p[1] == '=') return stb__clex_token(lexer, CLEX_greatereq, p,p+1);)
 610             STB_C_LEX_C_SHIFTS(     if (p[1] == '>') {
 611                                        STB_C_LEX_C_ARITHEQ(if (p+2 != lexer->eof && p[2] == '=')
 612                                                               return stb__clex_token(lexer, CLEX_shreq, p,p+2);)
 613                                        return stb__clex_token(lexer, CLEX_shr, p,p+1);
 614                                     }
 615                               )
 616          }
 617          goto single_char;
 618
 619       case '"':
 620          STB_C_LEX_C_DQ_STRINGS(return stb__clex_parse_string(lexer, p, CLEX_dqstring);)
 621          goto single_char;
 622       case '\'':
 623          STB_C_LEX_C_SQ_STRINGS(return stb__clex_parse_string(lexer, p, CLEX_sqstring);)
 624          STB_C_LEX_C_CHARS(
 625          {
 626             char *start = p;
 627             lexer->int_number = stb__clex_parse_char(p+1, &p);
 628             if (lexer->int_number < 0)
 629                return stb__clex_token(lexer, CLEX_parse_error, start,start);
 630             if (p == lexer->eof || *p != '\'')
 631                return stb__clex_token(lexer, CLEX_parse_error, start,p);
 632             return stb__clex_token(lexer, CLEX_charlit, start, p+1);
 633          })
 634          goto single_char;
 635
 636       case '0':
 637          #ifdef STB__clex_hex_ints
 638             if (p+1 != lexer->eof) {
 639                if (p[1] == 'x' || p[1] == 'X') {
 640                   char *q = p+2;
 641                   #ifdef STB__CLEX_use_stdlib
 642                   lexer->int_number = strtol((char *) p, (char **) &q, 16);
 643                   #else
 644                   stb__clex_int n=0;
 645                   while (q != lexer->eof) {
 646                      if (*q >= '0' && *q <= '9')
 647                         n = n*16 + (*q - '0');
 648                      else if (*q >= 'a' && *q <= 'f')
 649                         n = n*16 + (*q - 'a') + 10;
 650                      else if (*q >= 'A' && *q <= 'F')
 651                         n = n*16 + (*q - 'A') + 10;
 652                      else
 653                         break;
 654                      ++q;
 655                   }
 656                   lexer->int_field = n; // int_field is macro that expands to real_number/int_number depending on type of n
 657                   #endif
 658                   if (q == p+2)
 659                      return stb__clex_token(lexer, CLEX_parse_error, p-2,p-1);
 660                   return stb__clex_parse_suffixes(lexer, CLEX_intlit, p,q, STB_C_LEX_HEX_SUFFIXES);
 661                }
 662             }
 663          #endif // STB__clex_hex_ints
 664          // can't test for octal because we might parse '0.0' as float or as '0' '.' '0',
 665          // so have to do float first
 666
 667          /* FALL THROUGH */
 668       case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
 669
 670          #ifdef STB__clex_decimal_floats
 671          {
 672             char *q = p;
 673             while (q != lexer->eof && (*q >= '0' && *q <= '9'))
 674                ++q;
 675             if (q != lexer->eof) {
 676                if (*q == '.' STB_C_LEX_FLOAT_NO_DECIMAL(|| *q == 'e' || *q == 'E')) {
 677                   #ifdef STB__CLEX_use_stdlib
 678                   lexer->real_number = strtod((char *) p, (char**) &q);
 679                   #else
 680                   lexer->real_number = stb__clex_parse_float(p, &q);
 681                   #endif
 682
 683                   return stb__clex_parse_suffixes(lexer, CLEX_floatlit, p,q, STB_C_LEX_FLOAT_SUFFIXES);
 684
 685                }
 686             }
 687          }
 688          #endif // STB__clex_decimal_floats
 689
 690          #ifdef STB__clex_octal_ints
 691          if (p[0] == '0') {
 692             char *q = p;
 693             #ifdef STB__CLEX_use_stdlib
 694             lexer->int_number = strtol((char *) p, (char **) &q, 8);
 695             #else
 696             stb__clex_int n=0;
 697             while (q != lexer->eof) {
 698                if (*q >= '0' && *q <= '7')
 699                   n = n*8 + (q - '0');
 700                else
 701                   break;
 702                ++q;
 703             }
 704             if (q != lexer->eof && (*q == '8' || *q=='9'))
 705                return stb__clex_token(tok, CLEX_parse_error, p, q);
 706             lexer->int_field = n;
 707             #endif
 708             return stb__clex_parse_suffixes(lexer, CLEX_intlit, p,q, STB_C_LEX_OCTAL_SUFFIXES);
 709          }
 710          #endif // STB__clex_octal_ints
 711
 712          #ifdef STB__clex_decimal_ints
 713          {
 714             char *q = p;
 715             #ifdef STB__CLEX_use_stdlib
 716             lexer->int_number = strtol((char *) p, (char **) &q, 10);
 717             #else
 718             stb__clex_int n=0;
 719             while (q != lexer->eof) {
 720                if (*q >= '0' && *q <= '9')
 721                   n = n*10 + (q - '0');
 722                else
 723                   break;
 724                ++q;
 725             }
 726             lexer->int_field = n;
 727             #endif
 728             return stb__clex_parse_suffixes(lexer, CLEX_intlit, p,q, STB_C_LEX_OCTAL_SUFFIXES);
 729          }
 730          #endif // STB__clex_decimal_ints
 731          goto single_char;
 732    }
 733 }
 734 #endif // STB_C_LEXER_IMPLEMENTATION
 735
 736 #ifdef STB_C_LEXER_SELF_TEST
 737
 738 #include <stdio.h>
 739
 740 static void print_token(stb_lexer *lexer)
 741 {
 742    switch (lexer->token) {
 743       case CLEX_id        : printf("_%s", lexer->string); break;
 744       case CLEX_eq        : printf("=="); break;
 745       case CLEX_noteq     : printf("!="); break;
 746       case CLEX_lesseq    : printf("<="); break;
 747       case CLEX_greatereq : printf(">="); break;
 748       case CLEX_andand    : printf("&&"); break;
 749       case CLEX_oror      : printf("||"); break;
 750       case CLEX_shl       : printf("<<"); break;
 751       case CLEX_shr       : printf(">>"); break;
 752       case CLEX_plusplus  : printf("++"); break;
 753       case CLEX_minusminus: printf("--"); break;
 754       case CLEX_arrow     : printf("->"); break;
 755       case CLEX_andeq     : printf("&="); break;
 756       case CLEX_oreq      : printf("|="); break;
 757       case CLEX_xoreq     : printf("^="); break;
 758       case CLEX_pluseq    : printf("+="); break;
 759       case CLEX_minuseq   : printf("-="); break;
 760       case CLEX_muleq     : printf("*="); break;
 761       case CLEX_diveq     : printf("/="); break;
 762       case CLEX_modeq     : printf("%%="); break;
 763       case CLEX_shleq     : printf("<<="); break;
 764       case CLEX_shreq     : printf(">>="); break;
 765       case CLEX_eqarrow   : printf("=>"); break;
 766       case CLEX_dqstring  : printf("\"%s\"", lexer->string); break;
 767       case CLEX_sqstring  : printf("'\"%s\"'", lexer->string); break;
 768       case CLEX_charlit   : printf("'%s'", lexer->string); break;
 769       #if defined(STB__clex_int_as_double) && !defined(STB__CLEX_use_stdlib)
 770       case CLEX_intlit    : printf("#%g", lexer->real_number); break;
 771       #else
 772       case CLEX_intlit    : printf("#%ld", lexer->int_number); break;
 773       #endif
 774       case CLEX_floatlit  : printf("%g", lexer->real_number); break;
 775       default:
 776          if (lexer->token >= 0 && lexer->token < 256)
 777             printf("%c", (int) lexer->token);
 778          else {
 779             printf("<<<UNKNOWN TOKEN %ld >>>\n", lexer->token);
 780          }
 781          break;
 782    }
 783 }
 784
 785 /* Force a test
 786 of parsing
 787 multiline comments */
 788
 789 /*/ comment /*/
 790 /**/ extern /**/
 791
// Never called. NOTE(review): presumably this exists only as input for the
// self test, which lexes this very file; the unusual printf call (an extra
// argument and no conversion in the format) looks deliberate and relates to
// the issue linked below -- confirm before "fixing" it.
void dummy(void)
{
   printf("test",1); // https://github.com/nothings/stb/issues/13
}
 796
 797 int main(int argc, char **argv)
 798 {
 799    FILE *f = fopen("stb_c_lexer.h","rb");
 800    char *text = (char *) malloc(1 << 20);
 801    int len = f ? fread(text, 1, 1<<20, f) : -1;
 802    stb_lexer lex;
 803    if (len < 0) {
 804       fprintf(stderr, "Error opening file\n");
 805       return 1;
 806    }
 807    fclose(f);
 808
 809    stb_c_lexer_init(&lex, text, text+len, (char *) malloc(1<<16), 1<<16);
 810    while (stb_c_lexer_get_token(&lex)) {
 811       if (lex.token == CLEX_parse_error) {
 812          printf("\n<<<PARSE ERROR>>>\n");
 813          break;
 814       }
 815       print_token(&lex);
 816       printf("  ");
 817    }
 818    return 0;
 819 }
 820 #endif