buncha small fixes for ir, parser, etc.
[henge/apc.git] / src / lexer.c
1 /*!@file
2 \brief lexical analyzer implementation for APC
3 \details The lexer manages two FIFO stacks. One for maintaining tokens, the
4 other for maintaining a list of files to be scanned. During
5 execution, the lexer will return a token from its token queue if any
6 are present. If not, the lexer will pop an element from its
7 file queue to 'scanner' to be tokenized. If the file queue is empty,
8 the lexer will instead call 'parsedir' to traverse the directory tree
9 and tokenize the results. If 'parsedir' does not generate any new
10 tokens, we are done.
11 \author Jordan Lavatai
12 \date Aug 2016
13 ----------------------------------------------------------------------------*/
14 /* Standard */
15 #include <stdio.h>
16 #include <string.h>
17 #include <stdint.h>
18 #include <errno.h>
19 /* Posix */
20 #include <unistd.h>
21 #include <unitypes.h>
22 #include <unistr.h>
23 #include <uniconv.h>
24 #include <uniname.h>
25 #include <unistdio.h>
26 #include <stdlib.h>
27 #include <limits.h> //realpath, NAME_MAX, FPATH_MAX
28 #include <dirent.h>
29
30 /* Local */
31 #include "apc.h"
32 #include "parser.tab.h"
33 #ifndef DE_STACKSIZE
34 #define DE_STACKSIZE 1024
35 #endif
36 #ifndef TK_STACKSIZE
37 #define TK_STACKSIZE 1024
38 #endif
39
40
41 /* Public */
42 int lexer_init(void);
43 int lexer(void);
44 int lexer_lexfile(const uint8_t*);
45 void lexer_pushtok(int, YYSTYPE);
46 uint8_t const* lexer_get_current_filepath(void);
47 int lexer_lexfilename(uint8_t*);
48 struct dirent* lexer_direntpa[DE_STACKSIZE],** lexer_direntpp,** lexer_direntpb;
49 /* Private */
50 extern //lexer_fsm.rl
51 int lexer_lexstring(uint8_t*, int);
52 extern //lexer_fsm.rl
53 int lexer_setstr(uint8_t*, int);
54 extern //scanner.c
55 int scanner_init(void);
56 extern //scanner.c
57 int scanner(void);
58 extern //bison
59 YYSTYPE yylval;
60 static
61 uint8_t const* current_filename;
62
63 static
64 struct tok
65 { YYSTYPE lval; //token val
66 int tok_t; //token type
67 } token_stack[TK_STACKSIZE], *tsp, *tsx;
68
/* Directory Entry Queue
   Keeps track of the dirents that have yet to be lexed.  If this queue is
   empty and there are no tokens, the lexer is done.  The array is filled by
   the scanner and consumed here front-to-back (first in, first out): DE_POP
   yields the oldest entry and advances the read cursor.  DE_POP performs no
   bounds checking; test DE_LEN() first.
*/
#define DE_STACK   (lexer_direntpa) //backing array
#define DE_STACKP  (lexer_direntpp) //end cursor: one past the last entry
#define DE_STACKB  (lexer_direntpb) //read cursor: next entry to pop
#define DE_INIT()  (DE_STACKB = DE_STACKP = DE_STACK)
#define DE_LEN()   (DE_STACKP - DE_STACKB)
#define DE_POP()   (*DE_STACKB++)
81
/* Token Queue
   A first-in, first-out queue of 'struct tok' entries kept in token_stack.
   TK_STACKX is the write cursor advanced by TK_PUSH, TK_STACKP is the read
   cursor advanced by TK_POP, and TK_LEN() is the number of tokens pushed but
   not yet popped.  Neither TK_PUSH nor TK_POP checks bounds; callers must
   test for a full array / an empty queue themselves.
   TK_PUSH(T,L) stores token type T with semantic value L.  The fields are
   initialized by name so the macro does not depend on the member order of
   'struct tok'.
*/
#define TK_STACK      (token_stack)
#define TK_STACKP     (tsp)
#define TK_STACKX     (tsx)
#define TK_LEN()      (TK_STACKX - TK_STACKP)
#define TK_INIT()     (TK_STACKP = TK_STACKX = TK_STACK)
#define TK_POP()      (*TK_STACKP++)
#define TK_PUSH(T,L)  (*TK_STACKX++ = (struct tok){ .lval = (L), .tok_t = (T) })
97
/* Initializer
   Empties the directory-entry queue and the token queue, then initializes
   the scanner.  The scanner's result is passed straight back to the caller:
   boolean true means an error occurred, which may be handled with standard
   errno.
*/
int lexer_init(void)
{
  DE_INIT();
  TK_INIT();
  return scanner_init();
}
108
109 /* Lexer
110 If the token buffer is empty, 'lexer' will initialize the token buffer and
111 call 'lexer_scandir'. If SCAN_ERROR is returned, an error is printed
112 before sending a null return to bison. If 0 tokens are generated, the error
113 printing is skipped. In all other cases, 'yylval' is set, and the token's
114 integer representation is returned.
115 */
116 int lexer
117 #define $($)#$
118 #define SCAN_ERROR -1
119 #define TK_EMPTY (TK_STACKP == TK_STACKX)
120 #define FAIL(...) \
121 do { \
122 fprintf(stderr,__VA_ARGS__); \
123 goto done; \
124 } while (0)
125 ()
126 { struct tok token;
127 start:
128 while (DE_LEN() > 0)//lex any directory entries in our stack
129 {
130 if (lexer_lexfile(DE_POP()->d_name) == 0)
131 FAIL("Lexer failed to tokenize [%s]\n",(*DE_STACKB)->d_name);
132 }
133 if (TK_EMPTY) //if there are no tokens,
134 { TK_INIT(); //initialize the token stack back to 0
135 switch (scanner())
136 { case SCAN_ERROR: //if an error occurred,
137 FAIL("Scanner error\n");
138 case 0: //if the the scanner finds no dirents,
139 goto done; //then we are done
140 default: //if we found some elements to scan,
141 goto start; //start over and lex them
142 }
143 }
144 token = TK_POP();
145 yylval = token.lval;
146 return token.tok_t;
147 done:
148 yylval.val = 0;
149 return 0;
150 }
151
152
153 /* Token Receiver
154 This receiver takes a struct tok and pushes it to the FIFO stack.
155 */
156 void lexer_pushtok
157 #define $($)#$ //stringifier
158 #define ERR_TK "Fatal: Generated over " $(TK_STACKSIZE) " tokens in one pass."
159 ( int tok, YYSTYPE lval )
160 { if (TK_LEN() >= TK_STACKSIZE)
161 { fprintf(stderr, ERR_TK);
162 exit(EXIT_FAILURE);
163 }
164 TK_PUSH(tok, lval);
165 }
166
167 /* Lexical analysis of a file
168 Strips a filename to its base name, then sends it to lexer_lex
169 */
170 int lexer_lexfile
171 #define HIDDEN_WARNING "%s is hidden and will not be parsed!\n", filename
172 ( const uint8_t *filename
173 )
174 { static uint8_t fname[FNAME_MAX];
175 uint8_t *last_period = NULL, *iter;
176
177 if (*filename == '.')
178 { fprintf (stderr, HIDDEN_WARNING);
179 return 0;
180 }
181 /* Copy the filename and remove its suffix */
182 u8_strncpy(fname,filename,FNAME_MAX);
183 last_period = NULL;
184 for (iter = fname; *iter; iter++) //find the last '.' char
185 if (*iter == '.')
186 last_period = iter;
187 if (last_period) //if we found one,
188 *last_period = 0; //truncate the string there
189 /* Register the current_filename */
190 current_filename = filename;
191 printf("lexer_lexfilename(%s)\n",fname);
192 return lexer_lexfilename(fname);
193 }
194
195 uint8_t const* lexer_get_current_filepath
196 ()
197 { static uint8_t current_path[FPATH_MAX];
198 static uint8_t const* last_filename;
199 if ((!last_filename || last_filename != current_filename) &&
200 ((uint8_t*) realpath(current_filename, current_path) != (uint8_t*) current_path))
201 { perror("realpath: ");
202 return NULL;
203 }
204 return (const uint8_t*)current_path;
205 }
206
207 /* Scan filename and push the its tokens
208 onto the stack */
209 int lexer_lexfilename
210 (uint8_t* str)
211 { int ntok, len;
212 uint8_t *filepath;
213
214
215 printf("|---- Begin lexerfilename on %s ----|\n", str);
216
217 if(*str == 0)
218 perror("Lexfilename:: str is NULL so fail\n");
219
220 /* Determine the filetype of str */
221 len = u8_strlen(str);
222
223 ntok = lexer_lexstring(str, len);
224
225 /* Pass back filepath as end of statment operator */
226 filepath = u8_strdup(lexer_get_current_filepath());
227 yylval.str = filepath;
228 lexer_pushtok(NAME, yylval);
229 printf("Pushing filepath %s\n", filepath);
230 ntok++;
231
232 printf("|---- Ending lexer_lexfilename on %s, %d tokens were lexed ----|\n", str, ntok);
233 return ntok;
234 }
235
236 /**************************/
237 /****Abandon All Hope******/
238 /**************************/
239 /*** ***/
240 /*** ***/
241 /*** ***/
242 /*** ***/
243
244 #if 0
/* lexer_lexelemap (disabled: inside the enclosing #if 0)
   Returns the offset into 'str' (newstrt - str) reached after optionally
   skipping leading '_'-delimited fields:
   - if PREV_MAPFILE() holds, newstrt is repositioned with SET_MAPSTR;
   - otherwise, if the current set name equals the previous one, the leading
     field is dropped with DEL_FTOK, followed by a field starting with an
     ASCII digit (REF), a matching element-name field, and one more
     digit-leading field.
   The previous element and set names are overwritten before returning.
   NOTE(review): uses prev_setname, prev_elename and map_key, none of which
   are declared in the visible part of this file, and macros that are only
   #defined after this function.
   NOTE(review): the local 'strlen' shadows the library function and is not
   used.
*/
245 int
246 lexer_lexelemap
247 ( uint8_t* str)
248 { int setname_len, elename_len, strlen;
249 uint8_t* setname_end, *elename_end, *newstrt;
250 uint8_t curr_setname[MAX_SETNAME_LEN] = {0};
251 uint8_t curr_elename[MAX_ELENAME_LEN] = {0};
252
253 newstrt = str;
254
/* These macros fill curr_setname/curr_elename and the *_len/*_end locals */
255 SET_CURR_SETNAME(newstrt);
256 SET_CURR_ELENAME(newstrt);
257 if(PREV_MAPFILE())
258 { printf("Lexer_lexelemap:: previous file was mapfile*\n");
259 SET_MAPSTR(newstrt);
260 }
261 else
262 {
263 if(SETNAME_MATCHES())
264 { DEL_FTOK(newstrt);
265 if(REF(newstrt))
266 DEL_FTOK(newstrt);
267 printf("Lexer_lexelemap:: setname matches\n");
268 if(ELENAME_MATCHES())
269 DEL_FTOK(newstrt);
/* NOTE(review): unlike lexer_lexelemodel, this REF check is not nested under
   the element-name match, so it runs whether or not the element name matched */
270 if(REF(newstrt))
271 DEL_FTOK(newstrt);
272 }
273 }
274
275 UPDATE_PREV_ELENAME(newstrt);
276 UPDATE_PREV_SETNAME(newstrt);
277
278 return newstrt - str;
279
280
281 }
282
/* lexer_lexelemodel (disabled: inside the enclosing #if 0)
   Returns the offset into 'str' (newstrt - str) reached after optionally
   skipping leading '_'-delimited fields.  If the current set name equals the
   previous one, the leading field and a following digit-leading field (REF)
   are dropped; if the element name also equals the previous one, that field
   and a further digit-leading field are dropped as well.
   The previous element and set names are overwritten before returning.
   Progress is traced on stdout.
   NOTE(review): uses prev_setname/prev_elename, which are not declared in
   the visible part of this file, and macros #defined after this function.
*/
283 int
284 lexer_lexelemodel
285 (uint8_t* str)
286 { int setname_len, elename_len;
287 uint8_t* setname_end, *elename_end, *newstrt;
288 uint8_t curr_setname[MAX_SETNAME_LEN] = {0};
289 uint8_t curr_elename[MAX_ELENAME_LEN] = {0};
290
291 printf("Lexer_lexelemodel:: Begin str is %s\n", str);
292
293 newstrt = str;
294
/* These macros fill curr_setname/curr_elename and the *_len/*_end locals */
295 SET_CURR_SETNAME(newstrt);
296 SET_CURR_ELENAME(newstrt);
297 if(SETNAME_MATCHES())
298 { printf("Lexer_lexelemodel:: curr_setname(%s) matches prev_setname (%s)\n", curr_setname, prev_setname);
299 DEL_FTOK(newstrt);
300 printf("Lexer_lexelemodel:: Deleted setname, newstrt is now %s\n", newstrt);
301 if(REF(newstrt))
302 DEL_FTOK(newstrt);
303 if(ELENAME_MATCHES())
304 { printf("Lexer_lexelemodel:: elename matches\n");
305 DEL_FTOK(newstrt);
306 if(REF(newstrt))
307 DEL_FTOK(newstrt);
308 }
309 }
310 UPDATE_PREV_ELENAME(newstrt);
311 UPDATE_PREV_SETNAME(newstrt);
312
313 return newstrt - str;
314 }
315
/* lexer_lexsetmap (disabled: inside the enclosing #if 0)
   Returns the offset into 'str' (newstrt - str) reached after optionally
   skipping leading fields.  If PREV_MAPFILE() holds, newstrt is repositioned
   with SET_MAPSTR; otherwise, if the current set name equals the previous
   one, the leading field is dropped with DEL_FTOK.  In either case a
   digit-leading field (REF) at newstrt is then dropped.
   The previous set name is overwritten before returning.
   NOTE(review): elename_len, elename_end and curr_elename are not referenced
   by anything this function invokes.
*/
316 int
317 lexer_lexsetmap
318 (uint8_t* str)
319 { int setname_len, elename_len;
320 uint8_t* setname_end, *elename_end, *newstrt;
321 uint8_t curr_setname[MAX_SETNAME_LEN] = {0};
322 uint8_t curr_elename[MAX_ELENAME_LEN] = {0};
323
324 newstrt = str;
325
326 SET_CURR_SETNAME(newstrt);
327 if(PREV_MAPFILE())
328 SET_MAPSTR(newstrt);
329 else
330 if( SETNAME_MATCHES())
331 DEL_FTOK(newstrt);
/* Not part of the if/else above: this check runs on every path */
332 if(REF(newstrt))
333 DEL_FTOK(newstrt);
334
335 UPDATE_PREV_SETNAME(newstrt);
336
337 return newstrt - str;
338 }
339
/* lexer_lexsetmodel (disabled: inside the enclosing #if 0)
   Returns the offset into 'str' (newstrt - str) reached after dropping the
   leading field when the current set name equals the previous one, and then
   dropping a digit-leading field (REF) if one is at newstrt.
   The previous set name is overwritten before returning.
   NOTE(review): elename_len, elename_end and curr_elename are not referenced
   by anything this function invokes.
*/
340 int
341 lexer_lexsetmodel
342 (uint8_t* str)
343 { int setname_len, elename_len;
344 uint8_t* setname_end, *elename_end, *newstrt;
345 uint8_t curr_setname[MAX_SETNAME_LEN] = {0};
346 uint8_t curr_elename[MAX_ELENAME_LEN] = {0};
347
348 newstrt = str;
349
350 SET_CURR_SETNAME(newstrt);
351 if( SETNAME_MATCHES())
352 DEL_FTOK(newstrt);
/* Runs whether or not the set name matched */
353 if(REF(newstrt))
354 DEL_FTOK(newstrt);
355 UPDATE_PREV_SETNAME(newstrt);
356
357 return newstrt - str;
358
359 }
360
/* lexer_lexsetvlink (disabled: inside the enclosing #if 0)
   Returns the offset into 'str' (newstrt - str) reached after dropping the
   leading field when the current set name equals the previous one, and then
   dropping one more field when the field *after* the one at newstrt starts
   with an ASCII digit (REF applied to NEXT_TOK).
   The previous set name is overwritten before returning.
   NOTE(review): elename_len, elename_end and curr_elename are not referenced
   by anything this function invokes.
*/
361 int
362 lexer_lexsetvlink
363 (uint8_t* str)
364 { int setname_len, elename_len;
365 uint8_t* setname_end, *elename_end, *newstrt;
366 uint8_t curr_setname[MAX_SETNAME_LEN] = {0};
367 uint8_t curr_elename[MAX_ELENAME_LEN] = {0};
368
369 newstrt = str;
370
371 SET_CURR_SETNAME(newstrt);
372 if( SETNAME_MATCHES())
373 DEL_FTOK(newstrt);
/* Looks one field ahead of newstrt; runs whether or not the set name matched */
374 if(REF((NEXT_TOK(newstrt)))) //if NAME REF REF
375 DEL_FTOK(newstrt);
376 UPDATE_PREV_SETNAME(newstrt);
377
378 return newstrt - str;
379
380 }
381
/* lexer_lexelevlink (disabled: inside the enclosing #if 0)
   Returns the offset into 'str' (newstrt - str).  When the current set name
   equals the previous one, the leading field is dropped, and one more field
   is dropped if the field after the one at newstrt starts with an ASCII
   digit (REF applied to NEXT_TOK).
   Unlike the sibling functions that call UPDATE_PREV_*, this one leaves the
   previous set/element names untouched.
   NOTE(review): curr_elename is filled by SET_CURR_ELENAME but never compared.
*/
382 int
383 lexer_lexelevlink
384 (uint8_t* str)
385 { int setname_len, elename_len;
386 uint8_t* setname_end, *elename_end, *newstrt;
387 uint8_t curr_setname[MAX_SETNAME_LEN] = {0};
388 uint8_t curr_elename[MAX_ELENAME_LEN] = {0};
389
390 newstrt = str;
391
392 SET_CURR_SETNAME(newstrt);
393 SET_CURR_ELENAME(newstrt);
394 if(SETNAME_MATCHES())
395 { DEL_FTOK(newstrt);
396 if(REF(NEXT_TOK(newstrt))) //NAME REF REF, where is set_label
397 DEL_FTOK(newstrt);
398 }
399
400 return newstrt - str;
401 }
402
/* lexer_lexsetolink (disabled: inside the enclosing #if 0)
   Does nothing with 'str' and always returns 0.
   NOTE(review): every local declared below is unused.
*/
403 int
404 lexer_lexsetolink
405 (uint8_t* str)
406 { int setname_len, elename_len;
407 uint8_t* setname_end, *elename_end;
408 uint8_t curr_setname[MAX_SETNAME_LEN] = {0};
409 uint8_t curr_elename[MAX_ELENAME_LEN] = {0};
410
411 return 0;
412
413 //do nothing
414 }
415
/* lexer_lexeleolink (disabled: inside the enclosing #if 0)
   Returns the offset into 'str' (newstrt - str).  When the current set name
   equals the previous one, the leading field is dropped, followed by a
   digit-leading field (REF) if one is at newstrt.  Both names are printed to
   stdout first.
   The previous set/element names are left untouched.
   NOTE(review): elename_len, elename_end and curr_elename are not referenced
   by anything this function invokes.
*/
416 int
417 lexer_lexeleolink
418 (uint8_t* str)
419 { int setname_len, elename_len;
420 uint8_t* setname_end, *elename_end, *newstrt;
421 uint8_t curr_setname[MAX_SETNAME_LEN] = {0};
422 uint8_t curr_elename[MAX_ELENAME_LEN] = {0};
423
424 newstrt = str;
425
426 SET_CURR_SETNAME(newstrt);
427 printf("prev_setname %s, curr_setname %s\n", prev_setname, curr_setname);
428 if(SETNAME_MATCHES())
429 { DEL_FTOK(newstrt);
430 if(REF(newstrt))
431 DEL_FTOK(newstrt);
432 }
433
434 return newstrt - str;
435
436
437 }
438
439
440
/* Helper macros for the disabled lexer_lex* functions above.
   NOTE(review): they are #defined after the functions that use them, and
   several read or write the caller's locals (str, setname_end, setname_len,
   elename_end, elename_len, curr_setname, curr_elename) by name.
*/
/* REF: true when the first unit of STR is an ASCII digit (0x30..0x39) */
441 #define REF(STR) (STR[0] <= 0x39 && STR[0] >= 0x30)
/* DEL_FTOK: advance STR to just past its first '_'.
   NOTE(review): u8_strchr returns NULL when there is no '_', which would
   leave STR at NULL + 1. */
442 #define DEL_FTOK(STR) (STR = u8_strchr(STR, '_') + 1)
/* NEXT_TOK: like DEL_FTOK but yields the position without assigning it */
443 #define NEXT_TOK(STR) (u8_strchr(STR, '_') + 1)
/* SET_CURR_SETNAME: copy the units of STR that precede its first '_'
   (searched within FNAME_MAX units) into curr_setname.
   NOTE(review): the length is measured from the caller's 'str', not STR. */
444 #define SET_CURR_SETNAME(STR) \
445 do { \
446 printf("Lexer_lexX:: setting curr_setname of str(%s)\n", STR); \
447 setname_end = u8_chr(STR, FNAME_MAX, '_'); \
448 setname_len = setname_end - str; \
449 u8_move(curr_setname, STR, setname_len); \
450 printf("Lexer_lexX:: curr_setname is now %s\n",curr_setname); \
451 } while (0)
/* SET_CURR_ELENAME: skip the first field of STR and, if the next field
   starts with a digit, that one too; then copy the units up to the following
   '_' into curr_elename.  Overwrites setname_end as a scratch cursor. */
452 #define SET_CURR_ELENAME(STR) \
453 do { \
454 printf("Lexer_lexX:: setting curr_elename of str(%s)\n", STR); \
455 setname_end = u8_chr(STR, FNAME_MAX, '_') + 1; \
456 if(REF(setname_end)) \
457 setname_end = u8_chr(setname_end, FNAME_MAX, '_') + 1; \
458 elename_end = u8_chr(setname_end, FNAME_MAX, '_'); \
459 elename_len = elename_end - setname_end; \
460 u8_move(curr_elename, setname_end, elename_len); \
461 printf("Lexer_lexX:: curr_elename is now %s\n", curr_elename); \
462 } while (0)
463
/* True when the current and previous set / element names compare equal */
464 #define SETNAME_MATCHES() (u8_strcmp(curr_setname, prev_setname) == 0)
465 #define ELENAME_MATCHES() (u8_strcmp(curr_elename, prev_elename) == 0)
/* UPDATE_PREV_SETNAME: zero prev_setname, then copy setname_len units of
   curr_setname into it.  The STR argument is not used. */
466 #define UPDATE_PREV_SETNAME(STR) \
467 do { \
468 printf("Lexer_lexX:: updating prev_setname from (%s)", prev_setname); \
469 u8_set(prev_setname , (ucs4_t) 0, MAX_SETNAME_LEN ); \
470 u8_move(prev_setname, curr_setname, setname_len); \
471 printf(" to %s\n", prev_setname); \
472 } while (0)
/* UPDATE_PREV_ELENAME: zero prev_elename, then copy elename_len units of
   curr_elename into it.  The STR argument is not used. */
473 #define UPDATE_PREV_ELENAME(STR) \
474 do { \
475 u8_set(prev_elename , (ucs4_t) 0, MAX_ELENAME_LEN ); \
476 u8_move(prev_elename, curr_elename, elename_len); \
477 } while (0)
/* PREV_MAPFILE: true when the token 5 or 3 slots before the token write
   cursor has type MOPEN.
   NOTE(review): the expansion is not parenthesized and does not check that
   that many tokens have been pushed. */
478 #define PREV_MAPFILE() (TK_STACKX - 5)->tok_t == MOPEN || (TK_STACKX-3)->tok_t == MOPEN
/* SET_MAPSTR: move STR to the first occurrence of map_key inside it */
479 #define SET_MAPSTR(STR) (STR = u8_strstr(STR, map_key))
480
481
482 #endif
483
484
485 /* int lexer_lexmapfile */
486 /* #define INC_X() */
487 /* (int height, int width) */
488 /* { */
489 /* int x, y; */
490
491 /* /\* Give scanner_scanpixels a buffer and a len. Iterate through */
492 /* buf with buf[n]. If n == 0, do nothing. if n has a value, push x, */
493 /* push y, push (z = n << 24), push (ref_id = n >> 8) *\/ */
494 /* //scanner_scanpixels() */
495
496 /* for(i = 0; i < len; i++) */
497 /* if(buf[i] == 0) */
498 /* if(x == width) */
499 /* x = 0; */
500 /* else */
501
502
503
504
505 /* } */
506 /* fname_bytes = (uint8_t*)(DE_POP()->d_name); */
507 /* printf("d_name is %s\n", fname_bytes); */
508 /* for (fnp = filename, i = 0; i < FNAME_MAX; i += unit_size, fnp++) */
509 /* { unit_size = u8_mblen(fname_bytes + i, min(4, FNAME_MAX - i)); */
510 /* if (u8_mbtouc(fnp, fname_bytes + i, unit_size) == -1) //add ucs4 char to the filename */
511 /* FAIL("Lexer failed to convert ^%s to unicode\n", (fname_bytes + i)); */
512 /* if (*fnp == 0) //added a terminating char */
513 /* break; */
514 /* } */
515 /* if(u8_mbtouc(filename, DE_POP()->d_name, FNAME_MAXy) == -1) */
516 /* FAIL("Lexer failed to convert d_name into uint8_t\n"); */
517 /* ulc_fprintf(stdout, "filename is %11U\n c", filename); */