ragel state machine for lexer/scanner
[henge/webcc.git] / src / apc / lexer_lex.rl
1 /* Ragel State Machine for tokenizing text */
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <apc/parser.tab.h>
5
6 extern void lexer_pushtok(int, YYSTYPE);
7
8 int lexer_lex(const char*);
9 int ipow(int, int);
10 int ttov(const char* str, int);
11 uint64_t ttor(const char* str, int);
12 char* ttos(const char* str, int);
13
14
15 #define MAX_TOK_LEN 64
16 #define MAX_TOKENS 16
17 #define MAX_STR_SIZE (MAX_TOK_LEN * MAX_TOKENS)
18
19
%%{
  machine token_matcher;

  # Token actions.  Each one is embedded below as a leaving action (%), so
  # it runs either on the '_' that follows a token or at end of input.  At
  # that moment `ts` is the first character of the token and `p` is the
  # character just past it, so `p - ts` is the token length.
  #
  # Every action converts the token text, stores the result in yylval,
  # pushes (tok_t, yylval) with lexer_pushtok, and then moves `ts` to
  # `p + 1`: `p` itself is the '_' separator, which must not become part
  # of the next token's text.
  action set_ref {
    tok_t = REF;
    yylval.ref = ttor(ts, p - ts);
    lexer_pushtok(tok_t, yylval);
    ts = p + 1;
  }

  action set_val {
    tok_t = NUM;
    yylval.val = ttov(ts, p - ts);
    lexer_pushtok(tok_t, yylval);
    ts = p + 1;
  }

  action set_name {
    tok_t = NAME;
    yylval.str = ttos(ts, p - ts);
    lexer_pushtok(tok_t, yylval);
    ts = p + 1;
  }

  # One sub-machine per token kind.
  ref  = '0x' xdigit+ %set_ref;
  val  = digit+ %set_val;
  name = alpha+ %set_name;
  tok  = ref | val | name;

  # Input is one or more tokens separated by single underscores.
  main := (tok . '_')* . tok;
}%%


%%write data;
51
/* 0x[0-9a-fA-F]+ => tok_t REF,  yylval.ref = uint64_t
   [0-9]+         => tok_t NUM,  yylval.val = int
   [a-zA-Z]+      => tok_t NAME, yylval.str = char* */
55
56 /* Scan filename and push the its tokens
57 onto the stack */
58 int lexer_lex (const char* str)
59 {
60 const char *p, *pe, *ts, *eof;
61 int cs, tok_t ; //tok_t == token type
62
63 p = ts = str;
64 pe = p + strlen(str) + 1;
65 %%write init;
66 %%write exec;
67
68 lexer_pushtok(tok_t, yylval);
69
70 printf (str);
71 return 1;
72 }
73
/* Integer power: returns BASE raised to EXP using square-and-multiply.

   EXP == 0 yields 1 for every BASE.  A negative EXP has no integer result
   in general; 0 is returned for it.  The caller must ensure the result
   fits in an int. */
int ipow(int base, int exp)
{
  int result = 1;

  if (exp < 0)
    return 0;

  while (exp > 0)
    {
      if (exp & 1)
        result *= base;
      exp >>= 1;
      /* Square only while bits remain: the square after the last bit is
         never used, and computing it can overflow even when the result
         itself fits. */
      if (exp > 0)
        base *= base;
    }

  return result;
}
87
/* Token to Value: convert the first LEN characters of STR, read as an
   unsigned decimal number, to an int.

   Conversion stops at the first character that is not a decimal digit.
   A value too large for an int is clamped to INT_MAX.  An empty token
   (LEN <= 0) yields 0. */
int ttov(const char* str, int len)
{
  int i, digit, val = 0;

  for (i = 0; i < len; i++)
    {
      digit = str[i] - '0';
      if (digit < 0 || digit > 9)
        break;
      /* val * 10 + digit would exceed INT_MAX: saturate instead of
         overflowing a signed int. */
      if (val > (INT_MAX - digit) / 10)
        return INT_MAX;
      val = val * 10 + digit;
    }

  return val;
}
100
/* Value of one hexadecimal digit character, or -1 if C is not one. */
static int ttor_hex_digit(char c)
{
  if (c >= '0' && c <= '9')
    return c - '0';
  if (c >= 'a' && c <= 'f')
    return c - 'a' + 10;
  if (c >= 'A' && c <= 'F')
    return c - 'A' + 10;
  return -1;
}

/* Token to Reference: convert the first LEN characters of STR, read as a
   hexadecimal number with an optional "0x" prefix, to a uint64_t.

   Conversion stops at the first character that is not a hexadecimal
   digit.  If the token has more than 16 significant digits, only the low
   64 bits are kept.  An empty token yields 0. */
uint64_t ttor(const char* str, int len)
{
  int i = 0, digit;
  uint64_t num = 0;

  /* REF tokens are spelled '0x' xdigit+; the prefix carries no value. */
  if (len >= 2 && str[0] == '0' && (str[1] == 'x' || str[1] == 'X'))
    i = 2;

  for (; i < len; i++)
    {
      digit = ttor_hex_digit(str[i]);
      if (digit < 0)
        break;
      num = (num << 4) | (uint64_t) digit;
    }

  return num;
}
113
/* Token to String: return a newly allocated, NUL-terminated copy of the
   first LEN characters of STR.

   The copy is obtained from malloc and the caller must free it.  Returns
   NULL if STR is NULL, LEN is negative, or allocation fails.  Tokens of
   any length are copied in full. */
char* ttos(const char* str, int len)
{
  char *copy;

  if (str == NULL || len < 0)
    return NULL;

  copy = malloc((size_t) len + 1);
  if (copy == NULL)
    return NULL;

  memcpy(copy, str, (size_t) len);
  copy[len] = '\0'; /* terminator goes directly after the last copied char */

  return copy;
}