f11c6a21d1736743b9059aa0e9fa110c1e353654
[henge/webcc.git] / src / apc / lexer.c
1 /*!@file
2 \brief lexical analyzer implementation for APC
3 \details The lexer manages two FIFO stacks. One for maintaining tokens, the
4 other for maintaining a list of files to be scanned. During
5 execution, the lexer will return a token from its token queue if any
6 are present. If not, the lexer will pop an element from its
7 file queue to 'scanner' to be tokenized. If the file queue is empty,
8 the lexer will instead call 'parsedir' to traverse the directory tree
9 and tokenize the results. If 'parsedir' does not generate any new
10 tokens, we are done.
11 \author Jordan Lavatai
12 \date Aug 2016
13 ----------------------------------------------------------------------------*/
14 /* Standard */
15 #include <stdio.h>
16 #include <string.h>
17 #include <stdint.h>
18 #include <errno.h>
19 /* Posix */
20 #include <unistd.h>
21 #include <unitypes.h>
22 #include <unistr.h>
23 #include <uniconv.h>
24 #include <uniname.h>
25 #include <unistdio.h>
26 #include <stdlib.h>
27 #include <limits.h> //realpath, NAME_MAX, PATH_MAX
28 #include <dirent.h>
29 /* Redefinitions of NAME_MAX and PATH_MAX */
30 //#define NAME_MAX NAME_MAX/4
31 //#define PATH_MAX PATH_MAX/4
32
33 /* Local */
34 #include "parser.tab.h"
35 #ifndef DE_STACKSIZE
36 #define DE_STACKSIZE 1024
37 #endif
38 #ifndef TK_STACKSIZE
39 #define TK_STACKSIZE 1024
40 #endif
41 #ifndef MAX_SETNAME_LEN //max setname length
42 #define MAX_SETNAME_LEN 32
43 #endif
44
45 /* Public */
46 int lexer_init(void);
47 int lexer(void);
48 int lexer_lexfile(const uint8_t*);
49 void lexer_pushtok(int, YYSTYPE);
50 uint8_t const* lexer_get_current_filepath(void);
51 int lexer_lexfilename(uint8_t*);
52 struct dirent* lexer_direntpa[DE_STACKSIZE],** lexer_direntpp,** lexer_direntpb;
53 /* Private */
54 extern //lexer_fsm.rl
55 int lexer_lexstring(uint8_t*, int);
56 extern //scanner.c
57 int scanner_init(void);
58 extern //scanner.c
59 int scanner(void);
60 static inline
61 int dredge_current_depth(void);
62 extern //bison
63 YYSTYPE yylval;
64 static
65 uint8_t const* current_filename;
66 static
67 struct tok
68 { YYSTYPE lval; //token val
69 int tok_t; //token type
70 } token_stack[TK_STACKSIZE], *tsp, *tsx;
71
/* Directory Entry Queue
   Thin wrappers around the lexer_direntp* globals, which hold the dirents
   that still have to be lexed.  Entries are consumed front to back (first
   in, first out): DE_POP yields the entry at DE_STACKB and then advances
   DE_STACKB towards DE_STACKP.  Per the original note, the array is filled
   by the scanner; once it is drained and no tokens remain, the lexer is done.
   None of these macros checks bounds.
*/
#define DE_STACK  (lexer_direntpa)                   //backing array
#define DE_STACKP (lexer_direntpp)                   //end of the queued entries
#define DE_STACKB (lexer_direntpb)                   //next entry to pop
#define DE_INIT() (DE_STACKP = DE_STACKB = DE_STACK) //empty the queue
#define DE_LEN()  (DE_STACKP - DE_STACKB)            //entries not yet popped
#define DE_POP()  (*DE_STACKB++)                     //take the front entry
84
/* Token Queue
   Wrappers around token_stack and its two cursors.  TK_PUSH appends a
   struct tok at TK_STACKX; TK_POP takes the token at TK_STACKP, so tokens
   come out in the order they were pushed.  TK_LEN is the number of tokens
   pushed but not yet popped.  None of these macros checks bounds; callers
   must guard against running past either end of the array.
*/
#define TK_STACK     (token_stack)                      //backing array
#define TK_STACKP    (tsp)                              //next token to pop
#define TK_STACKX    (tsx)                              //next free slot
#define TK_INIT()    (TK_STACKP = TK_STACKX = TK_STACK) //empty the queue
#define TK_LEN()     (TK_STACKX - TK_STACKP)            //tokens not yet popped
#define TK_PUSH(T,L) (*TK_STACKX++ = (struct tok){ .lval = L, .tok_t = T })
#define TK_POP()     (*TK_STACKP++)
100
/* Initializer
   Empties the token queue and the dirent queue, then hands off to
   'scanner_init' and passes its result through unchanged.  Per the original
   note, a true (non-zero) result signals an error that may be inspected
   through errno.
*/
int lexer_init
(void)
{ int scanner_status;

  TK_INIT();
  DE_INIT();
  scanner_status = scanner_init();
  return scanner_status;
}
111
112 /* Lexer
113 If the token buffer is empty, 'lexer' will initialize the token buffer and
114 call 'lexer_scandir'. If SCAN_ERROR is returned, an error is printed
115 before sending a null return to bison. If 0 tokens are generated, the error
116 printing is skipped. In all other cases, 'yylval' is set, and the token's
117 integer representation is returned.
118 */
119 int lexer
120 #define $($)#$
121 #define SCAN_ERROR -1
122 #define TK_EMPTY (TK_STACKP == TK_STACKX)
123 #define FAIL(...) \
124 do { \
125 fprintf(stderr,__VA_ARGS__); \
126 goto done; \
127 } while (0)
128 ()
129 { struct tok token;
130 start:
131 while (DE_LEN() > 0)//lex any directory entries in our stack
132 {
133 if (lexer_lexfile(DE_POP()->d_name) == 0)
134 FAIL("Lexer failed to tokenize [%s]\n",(*DE_STACKB)->d_name);
135 }
136 if (TK_EMPTY) //if there are no tokens,
137 { TK_INIT(); //initialize the token stack back to 0
138 switch (scanner())
139 { case SCAN_ERROR: //if an error occurred,
140 FAIL("Scanner error\n");
141 case 0: //if the the scanner finds no dirents,
142 goto done; //then we are done
143 default: //if we found some elements to scan,
144 goto start; //start over and lex them
145 }
146 }
147 token = TK_POP();
148 yylval = token.lval;
149 return token.tok_t;
150 done:
151 yylval.val = 0;
152 return 0;
153 }
154
155
156 /* Token Receiver
157 This receiver takes a struct tok and pushes it to the FIFO stack.
158 */
159 void lexer_pushtok
160 #define $($)#$ //stringifier
161 #define ERR_TK "Fatal: Generated over " $(TK_STACKSIZE) " tokens in one pass."
162 ( int tok, YYSTYPE lval )
163 { if (TK_LEN() >= TK_STACKSIZE)
164 { fprintf(stderr, ERR_TK);
165 exit(EXIT_FAILURE);
166 }
167 TK_PUSH(tok, lval);
168 }
169
170 /* Lexical analysis of a file
171 Strips a filename to its base name, then sends it to lexer_lex
172 */
173 int lexer_lexfile
174 #define HIDDEN_WARNING "%s is hidden and will not be parsed!\n", filename
175 ( const uint8_t *filename
176 )
177 { static uint8_t fname[NAME_MAX];
178 uint8_t *last_period = NULL, *iter;
179
180 if (*filename == '.')
181 { fprintf (stderr, HIDDEN_WARNING);
182 return 0;
183 }
184 /* Copy the filename and remove its suffix */
185 u8_strncpy(fname,filename,NAME_MAX);
186 last_period = NULL;
187 for (iter = fname; *iter; iter++) //find the last '.' char
188 if (*iter == '.')
189 last_period = iter;
190 if (last_period) //if we found one,
191 *last_period = 0; //truncate the string there
192 /* Register the current_filename */
193 current_filename = filename;
194
195 return lexer_lexfilename(fname);
196 }
197
198 uint8_t const* lexer_get_current_filepath
199 ()
200 { static uint8_t current_path[PATH_MAX];
201 static uint8_t const* last_filename;
202 if ((!last_filename || last_filename != current_filename) &&
203 (realpath(current_filename, current_path) != (char*) current_path))
204 { perror("realpath: ");
205 return NULL;
206 }
207 return (const char*)current_path;
208 }
209
210 /* Scan filename and push the its tokens
211 onto the stack */
212 int lexer_lexfilename
213 (uint8_t* str)
214 {
215 int ntok, i, cmp, len, set_len, height, width;
216 char map_key[] = "_m_";
217 static uint8_t set_name[MAX_SETNAME_LEN] = {0};
218 uint8_t *first, *map_begin;
219
220 printf("Starting lexerfilename on %s\n", str);
221
222
223 if(*str == 0)
224 printf("Lexfilename:: str is NULL so fail\n");
225 printf("setname is %s\n", set_name);
226
227 /* If last file was a mapfile, then its 5th to last token should
228 be a MOPEN. If this is the case, then we only pass MOPEN, height,
229 weight and name of the current file. */
230 if( (TK_STACKX - 5)->tok_t == MOPEN )
231 { printf("The last file was a mapfile\n");
232 if( (map_begin = strstr(map_key, str)) ) //if the current file is a mapfile
233 { printf("The current file is a variant of the last mapfile\n");
234 printf("Start lexing mapfile %s\n", str);
235 ntok += lexer_lexstring(map_begin, strlen(map_begin));
236 }
237 printf("Current file is not a variant of the last mapfile\n");
238 }
239 else //last file was not a mapfile
240 { printf("Last file was not a mapfile\n");
241
242 first = (uint8_t*) u8_strchr(str, '_'); //find the first '_' to find where str set_name ends
243
244 if(set_name[0] != 0) //if there is a set_name from last str
245 { printf("There is a set_name (%s) present\n", set_name);
246 set_len = first - str;
247
248 if(u8_strncmp(str, set_name, set_len) == 0) //check if it matches the current set_name
249 { str = str + set_len + 1; //if so, remove it from str
250 printf("str set_name matched last set_name, set str to %s\n", str);
251 }
252 else //update set_name to be str set_name
253 { u8_cpy(set_name, str, set_len);
254 set_name[set_len] = 0;
255
256 }
257 }
258 else //set set_name
259 { u8_cpy(set_name, str, first-str);
260 }
261 /* Call lexer_lexstring to tokenize the string */
262 printf("calling lexstring to tokenize str (%s) of len %d\n", str, u8_strlen(str));
263 ntok += lexer_lexstring(str, u8_strlen(str));
264 }
265
266 /*TODO: if regfile, store full path for later */
267
268 printf("Ending lexer_lex on %s, %d tokens were lexed\n", str, ntok);
269 return ntok;
270 }
271
272 /* int lexer_lexmapfile */
273 /* #define INC_X() */
274 /* (int height, int width) */
275 /* { */
276 /* int x, y; */
277
278 /* /\* Give scanner_scanpixels a buffer and a len. Iterate through */
279 /* buf with buf[n]. If n == 0, do nothing. if n has a value, push x, */
280 /* push y, push (z = n << 24), push (ref_id = n >> 8) *\/ */
281 /* //scanner_scanpixels() */
282
283 /* for(i = 0; i < len; i++) */
284 /* if(buf[i] == 0) */
285 /* if(x == width) */
286 /* x = 0; */
287 /* else */
288
289
290
291
292 /* } */
293 /* fname_bytes = (uint8_t*)(DE_POP()->d_name); */
294 /* printf("d_name is %s\n", fname_bytes); */
295 /* for (fnp = filename, i = 0; i < NAME_MAX; i += unit_size, fnp++) */
296 /* { unit_size = u8_mblen(fname_bytes + i, min(4, NAME_MAX - i)); */
297 /* if (u8_mbtouc(fnp, fname_bytes + i, unit_size) == -1) //add ucs4 char to the filename */
298 /* FAIL("Lexer failed to convert ^%s to unicode\n", (fname_bytes + i)); */
299 /* if (*fnp == 0) //added a terminating char */
300 /* break; */
301 /* } */
302 /* if(u8_mbtouc(filename, DE_POP()->d_name, NAME_MAX) == -1) */
303 /* FAIL("Lexer failed to convert d_name into uint8_t\n"); */
304 /* ulc_fprintf(stdout, "filename is %11U\n c", filename); */