From 882513ad6569a640cb5c90b83e27e742ace71b63 Mon Sep 17 00:00:00 2001 From: ksg Date: Mon, 29 Aug 2016 16:31:03 -0700 Subject: [PATCH] Lexer 1.0 --- src/apc/lexer.c | 316 ++++++++++++--------------------- src/apc/scanner.c | 194 ++++++++++++++++++++ style/snippets/pointer_array.c | 3 +- 3 files changed, 306 insertions(+), 207 deletions(-) create mode 100644 src/apc/scanner.c diff --git a/src/apc/lexer.c b/src/apc/lexer.c index 47d2c35..4e0a611 100644 --- a/src/apc/lexer.c +++ b/src/apc/lexer.c @@ -1,7 +1,13 @@ /*!@file \brief lexical analyzer implementation for APC - \details this lexer scans a root directory given from the command line - for subdirectories and files structured for the APC grammar. + \details The lexer manages two FIFO stacks. One for maintaining tokens, the + other for maintaining a list of files to be scanned. During + execution, the lexer will return a token from its token queue if any + are present. If not, the lexer will will pop an element from its + file queue to 'scanner' to be tokenized. If the file queue is empty, + the lexer will instead call 'parsedir' to traverse the directory tree + and tokenize the results. If 'parsedir' does not generate any new + tokens, we are done. \author Jordan Lavatai \date Aug 2016 ----------------------------------------------------------------------------*/ @@ -14,109 +20,47 @@ #include #include /* Local */ -#include "parser.tab.h" -#define TOKEN_BUF_SIZE 1024 -#define ENTITY_BUF_SIZE 512 -#define SCANDIR_ERROR -1 - -int lexer_init(void); -int lexer(void); - -extern //main.c -const char* cargs['Z']; - +//#include "parser.tab.h" +#ifndef DP_STACKSIZE +#define DP_STACKSIZE 1024 +#endif +#ifndef TK_STACKSIZE +#define TK_STACKSIZE 1024 +#endif +/* Public */ +struct tok +{ int lval; + int tok; +}; +int lexer_init(void); +int lexer(void); +inline +void lexer_pushtok(int int); +struct dirent* lexer_direntpa[DP_STACKSIZE]; +/* Private */ static inline int scan(void); static inline -int qcomp(const void*, const void*); - -/* Directory Listing Stack - FILO Stack for keeping an open DIR* at each directory depth for treewalk. - This stack is depth-safe, checking its depth during push operations, but not - during pop operations, to ensure the thread doesn't open too many files at - once (512 in c runtime), or traverse too far through symbolic links. - A directory listing includes a DIR* and all DIR-typed entity in the directory - as recognized by dirent, populated externally (and optionally). - This stack behaves abnormally by incrementing its PUSH operation prior to - evaluation, and the POP operations after evaluation. This behavior allows - the 'DL_CURDEPTH' operation to map to the current element in the 'dl_stack' - array, and it is always treated as the "current depth". This also allows us - to init the root directory to 'directory_list_stack'[0] and pop it in a safe - and explicit manner. -*/ -#define S(S)#S -#ifndef DL_DEPTHMAX -#define DL_DEPTHMAX 64 -#endif -#ifndef DL_CHILDMAX -#define DL_CHILDMAX DL_DEPTHMAX //square tree -#endif -#define DL_STACK (directory_list_stack) -#define DL_STACKP (dls) -#define DL_STACKSIZE (DL_DEPTHMAX + 1) //+1 because push increments early -#define DL_LEN (DL_STACKP - DL_STACK) -#define DL_CURDEPTH (DL_LEN) -#define DL_CD_STACK ((*DL_STACKP).child_entity_stack) -#define DL_CD_STACKP ((*DL_STACKP).cds) -#define DL_CD_STACKSIZE (DL_DEPTHMAX) //square tree -#define DL_CD_LEN (DL_CD_STACKP - DL_CD_STACK) -#define ERR_CHILD "Fatal: Maximum of " S(MAX_CHILD) \ - " child directories exceeded for directory %s at depth %i\n" \ - ,DL_STACK[DL_DEPTH()].label,DL_DEPTH() -#define ERR_DEPTH "Fatal: Maximum directory depth of " S(DL_DEPTHMAX) \ - " exceeded during directory scan\n" -#define DL_INIT() (DL_STACKP = DL_STACK) -#define DL_CD_INIT() (DL_CD_STACKP = DL_CD_STACK) -#define DL_POP() ((DIR*)(*DL_STACKP--)) -#define DL_CD_POP() (*--DL_CD_STACKP) -#define DL_PUSH(D) (*(DIR**)(++DL_STACKP) = DIRP) -#define DL_CD_PUSH(E) (*DL_CD_STACKP++ = E) -#define DL_SAFEPUSH(D) \ - do { \ - if (DL_DEPTH() >= DL_DEPTHMAX) \ - { fprintf(stderr, ERR_DEPTH); \ - exit(EXIT_FAILURE); \ - } \ - if ((*(DIR**)(++DL_STACKP) = D) == NULL) \ - { perror("DL_SAFEPUSH"); \ - exit(EXIT_FAILURE); \ - } \ - } while(0) -#define DL_CD_POPULATE() \ - do { \ - DL_CD_INIT(); \ - while ((*DL_CD_STACKP = readdir(*DL_STACKP)) != NULL) \ - { switch((*DL_CD_STACKP)->d_type) \ - { case DT_REG: \ - case DT_DIR: \ - DL_CD_STACKP++; \ - break; \ - default: \ - } \ - } \ - qsort(DL_CD_STACKP, DL_CD_LEN, sizeof *DL_CD_STACKP, qcomp); \ - } while (0) - +int dredge_current_depth(void); +static +struct dirent** dps; static -struct dirlist -{ DIR* dirp; - struct dirent** child_directory_stack[DL_CD_STACKSIZE],*** cds; - const char* label; -} directory_list_stack[DL_STACKSIZE],* dls; +struct tok token_stack[TK_STACKSIZE]; static +union tokp +{ int* i; + struct tok* t; +} tks, tkx; -/* Directory Entity Stack - Simple stack for keeping track of dirents still being processed at the - current depth. +/* Directory Entity Array/Stack + Simple array for keeping track of dirents yet to be processed by the scanner. + If this list is empty and there are no tokens, the lexer is done. */ -#define CE_STACK (child_entity_stack) -#define CE_STACKP (ces) -#define CE_STACKSIZE 1024 -static -struct dirent** child_entity_stack[CE_STACKSIZE],*** ces; -#define CE_INIT() (CE_STACKP = CE_STACK) -#define CE_POP() (*--CE_STACK) -#define CE_PUSH(E) (*CE_STACK++ = E) +#define DP_STACK (lexer_direntpa) +#define DP_STACKP (dps) +#define DP_LEN() (DP_STACKP - DP_STACK) +#define DP_INIT() (DP_STACKP = DP_STACK) +#define DP_POP() (*--DP_STACKP) /* Token Stack This is a FIFO stack whose pointers are a union of either a pointer to an @@ -127,51 +71,33 @@ struct dirent** child_entity_stack[CE_STACKSIZE],*** ces; times in a sequence! */ #define TK_STACK (token_stack) -#define TK_STACKP (tks.t) -#define TK_STACKPI (tks.i) -#define TK_STACKX (tkx.t) -#define TK_STACKXI (tkx.i) -#ifndef TK_STACKSIZE -#define TK_STACKSIZE 1024 -#endif -#define TK_EMPTY (TK_STACKP == TK_STACKX) -#define TK_INIT() (TK_STACKP = TK_STACKX = TK_STACK) -#define TK_POP() (*TK_STACKP++) -#define TK_IPOP() (*TK_STACKPI++); -#define TK_PUSH(TOK,LVAL) (*TKSTACKX++ = (struct tok){(LVAL),(TOK)}) -#define TK_IPUSH(I) (*TKSTACKXI++ = (I)) -static -struct tok -{ int lval; - int tok; -} token_stack[TK_STACKSIZE]; -static -union tokp -{ int* i; - struct tok* t; -} tks, tkx; +#define TK_STACKP (tks.t) +#define TK_STACKPI (tks.i) +#define TK_STACKX (tkx.t) +#define TK_STACKXI (tkx.i) +#define TK_LEN() (TK_STACKP - TK_STACKX) +#define TK_INIT() (TK_STACKP = TK_STACKX = TK_STACK) +#define TK_POP() (*TK_STACKP++) +#define TK_POPI() (*TK_STACKPI++); +#define TK_PUSH(T) (*TKSTACKX++ = T) +#define TK_PUSHI(I) (*TKSTACKXI++ = (I)) +extern //main.c +const char* cargs['Z']; + +extern //scanner.c +int scanner_init(void); +extern //scanner.c +int scanner(struct dirent**); /* Initializer - The initializer returns boolean true if opendir contains an error. + The initializer returns boolean true if an error occurs, which may be handled with standard errno. */ -int lexer_init() -#define ERR_UF "Fatal: Unknown file [%s] encountered in directory %s", \ - DL_CD_STACK[i]->d_name, DL_STACK[DL_DEPTH] -{ int i = 0; - TK_INIT(); - DL_INIT(); - DL_STACK[0].label = cargs['r'] ? cargs['r'] : "./"; - if ((DL_STACK[0].dirp = opendir(DL_STACK[0].label)) == NULL) - return -1; - errno = 0; - while ((DL_CD_STACK[i] = readdir(DL_STACK[0].dirp)) != NULL) - switch(DL_CD_STACK[i]->d_type) - { - default: - fprintf(stderr, "Fatal: Unkown File %s\n", DL_CD_STACK[i]->d_name); - } - return (errno); +int lexer_init +() +{ TK_INIT(); + DP_INIT(); + return scanner_init(); } /* Lexer @@ -181,85 +107,63 @@ int lexer_init() printing is skipped. In all other cases, 'yylval' is set, and the token's integer representation is returned. */ -int lexer() -{ if (TOK_BUF_EMPTY) - { TOK_BUF_INIT(); - switch (lexer_scan) - { case SCANDIR_ERROR: +int lexer +#define SCAN_ERROR -1 +#define TK_EMPTY (TK_STACKP == TK_STACKX) +() +{ if (TK_EMPTY) + { switch (parsedir()) + { case SCAN_ERROR: perror("lexer_scan"); case 0: yylval = 0; return 0; default: + break; } } - yylval = TOK_BUF_IPOP(); - return TOK_BUF_IPOP(); + yylval = TK_IPOP(); + return TK_IPOP(); } - -static inline -int lexer_scan() -{ static DIR* -} - -/* Scanner - Scans a filename from its alphabetically ordered list of file elements - and tokenizes the result. If the file list is empty, then the stack of - directory elements will be popped and processed as they are encountered. - - Returns the number of tokens generated. +/* Token Receiver + This receiver takes a struct tok and pushes it to the FIFO stack. */ -#define MAX_ENTITIES 256 -static -int lexer_scan() -{ static struct dirent* entity; - static struct dirent* files[MAX_ENTITIES] = {0}; - static int num_files = 0; -#define NO_FILES (num_files == 0) - - if (NO_FILES) - - - //sort out files and directories, grow directories from bottom up - while ((entity = readdir(dirp)) != NULL) - { switch (entity->d_type) - { case DT_LNK: - case DT_REG: - files[num_files++] = entity; - break; - case DT_DIR: - *(dirs - num_dirs++) = entity; - break; - case DT_UNKNOWN: - default: - printf("Ignoring unknown file: %s\n", entity->d_name); - break; - } +inline +void lexer_pushtok +#define ERR_TK "Fatal: Generated over " S(TK_STACKSIZE) " tokens in one pass." +( struct tok token ) +{ if (TK_LEN >= TK_STACKSIZE) + { fprintf(stderr, ERR_TK); + exit(EXIT_FAILURE); } - if (errno) - perror("readdir"); - qsort(&files[0], num_files, sizeof struct dirent*, qalpha); - num_ents = scandirat(dirfd, ".", &namelist, scanfilter, scancompar); - if (num_ents < 0) - { perror("scandirat"); - return -1; - } - //process files - - //recurse into directories - return tbx - tbp; + TK_PUSH(token); } -/* Quicksort comparison function - sort each dirent encountered first by its file type (regular files first) - and then by their alphabetical ordering. -*/ -static -int qcompar -( const void* a, - const void* b - ) -{ +/* Lexical Analysis + Ragel state machine for tokenizing text. + */ +void lexer_lex +(const char* str) +{ struct tok token; + token.TOK = 1; + token.LVAL = 2; + lexer_pushtok(token); + printf (str); } +/* init_file: + if (lsp != NULL) + while ((c = *lsp++) == *csp) + { switch (c) + { case DELIM: + delimeters_skipped++; + default: + csp++; //delayed to ensure csp is the start of scannable text + break; + } + } + last_string = string; + scan_text: + return scanner_tokenize(csp); +*/ diff --git a/src/apc/scanner.c b/src/apc/scanner.c new file mode 100644 index 0000000..b039cda --- /dev/null +++ b/src/apc/scanner.c @@ -0,0 +1,194 @@ +/*!@file + \brief APC Directory Scanner + \details This hand-written parser/scanner traverses a directory tree and + tokenizes elements of the structure which correspond to APC grammar. + The parser is implemented as a 2D stack which populates a list of + child directories at each depth, handling only the leaf nodes + (regular files) of the directory open at the current depth to + conserve memory and speed up traversal. + The scanner works with the lexer to lexically analyze text, and + assumes the existence of an external 'lex' function + \author Jordan Lavatai + \date Aug 2016 + ----------------------------------------------------------------------------*/ +/* Standard */ +#include //print +#include //errno +/* Posix */ +#include //warnx +#include //exit +#include //chdir +#include //opendir +/* Public */ +int scanner_init(); +int scanner(void); +/* Private */ +#ifndef DL_STACKSIZE +#define DL_STACKSIZE 64 +#endif +#ifndef DL_CD_STACKSIZE +#define DL_CD_STACKSIZE DL_STACKSIZE //square tree +#endif +extern //lexer.c +int lexer_lex(const char*); +extern //lexer.c +void lexer_pushtok(int, int); +static +int dredge_current_depth(void); +extern //lexer.c +struct dirent* lexer_direntpa[]; +extern //main.c +const char* cargs['Z']; + +struct dirlist +{ DIR* dirp; + struct dirent* child_directory_stack[DL_CD_STACKSIZE],** cds; +} directory_list_stack[DL_STACKSIZE + 1],* dls; //+1 for the root dir + +/* Directory Listing Stack + FILO Stack for keeping an open DIR* at each directory depth for treewalk. + This stack is depth-safe, checking its depth during push operations, but not + during pop operations, to ensure the thread doesn't open too many files at + once (512 in c runtime), or traverse too far through symbolic links. + A directory listing includes a DIR* and all DIR-typed entity in the directory + as recognized by dirent, populated externally (and optionally). + This stack behaves abnormally by incrementing its PUSH operation prior to + evaluation, and the POP operations after evaluation. This behavior allows + the 'DL_CURDEPTH' operation to map to the current element in the 'dl_stack' + array, and it is always treated as the "current depth". This also allows us + to init the root directory to 'directory_list_stack'[0] and pop it in a safe + and explicit manner. +*/ +#define DL_STACK (directory_list_stack) +#define DL_STACKP (dls) +#define DL_CD_STACK ((*DL_STACKP).child_directory_stack) +#define DL_CD_STACKP ((*DL_STACKP).cds) +#define DL_CURDIR() ((*DL_STACKP).dirp) +#define DL_LEN() (DL_STACKP - DL_STACK) +#define DL_CD_LEN() (DL_CD_STACKP - DL_CD_STACK) +#define DL_INIT() (DL_STACKP = DL_STACK) +#define DL_CD_INIT() (DL_CD_STACKP = DL_CD_STACK) +#define DL_POP() ((*DL_STACKP--).dirp) +#define DL_CD_POP() (*--DL_CD_STACKP) +#define DL_PUSH(D) ((*++DL_STACKP).dirp = D) +#define DL_CD_PUSH(E) (*DL_CD_STACKP++ = E) + + +/* Initializer + Initializer expects a function pointer to its lexical analysis function. + Sets up stack pointers and returns boolean true if 'opendir' encounters an + error, or if dredge_current_depth returns boolean true. +*/ +int scanner_init +#define CWDSTR "./" +#define ROOTDIR (cargs['r'] ? cargs['r'] : CWDSTR) +() +{ DL_INIT(); + DL_STACK[0].dirp = opendir(ROOTDIR); + return DL_STACK[0].dirp == NULL || dredge_current_depth() == 0; +} + +/* Scanner + The main driver of the scanner will advance the current treewalk state and + tokenize tree-based push/pop operations. It will call 'lexer_lex' to + tokenize directory names prior to making a push operation. safe checking for + all returns from the filesystem handler will exit on serious system errors. + + after pushing a new directory to the directory list, the scanner will dredge + the directory and alphabetically sort all file entries into the lexer's file + array, while placing all subdirectory entries in the current depth's child + directory stack to the scanned later. + + Returns the number of elements added to the lexer's file array. +*/ +int scanner +#define S(S)#S //stringifier +#define ERR_CHILD "Fatal: Maximum of " S(DL_CD_STACKSIZE) \ + " child directories exceeded for directory at depth %i\n" \ + ,DL_LEN() +#define ERR_DEPTH "Fatal: Maximum directory depth of " S(DL_STACKSIZE) \ + " exceeded during directory scan\n" +#define ERR_DL "Fatal: Directory List Stack Corruption %x\n", DL_LEN() +#define TOK_CLOPEN 0x55, 0 //TODO +#define TOK_CLCLOSE 0x56, 0 //TODO +() +{ struct dirent* direntp; + parse: + if (DL_CD_LEN() >= DL_CD_STACKSIZE)//fail if maxchildren exceeded + { fprintf(stderr, ERR_CHILD); + goto fail; + } + if (DL_CD_LEN() > 0) //There are entities to process at this depth + { if (direntp = DL_CD_POP()) //If the dirent is null, the library + goto libfail; //function in dirent has failed + lexer_lex(direntp->d_name); //lex the directory name + if (DL_LEN() >= DL_STACKSIZE) //fail if maxdepth exceeded + { fprintf(stderr, ERR_DEPTH); + goto fail; + } + if (chdir(direntp->d_name)) //move into the new directory + goto libfail; + if (DL_PUSH(opendir(CWDSTR))) //open the cwd + goto libfail; + lexer_pushtok(TOK_CLOPEN); //Push "Open Directory" token + return dredge_current_depth(); //Filter and sort the current depth + } + else if (DL_LEN() >= 0) //Any dirs left? (Including root) + { if (closedir(DL_POP())) //close the directory we just left + goto libfail; + lexer_pushtok(TOK_CLCLOSE); //Push "Close Directory" token + if (DL_LEN() == -1) //If we just popped root, we're done + return 0; + if (!chdir("..")) //Move up a directory and start over + goto parse; + } + fprintf(stderr, ERR_DL); + libfail: + perror("parsedir"); + fail: + exit(EXIT_FAILURE); +} + +/* Directory Entity Sort and Filter (Dredge) + This filter removes all unhandled file types, and places any 'DT_DIR' type + files in the current Directory List's directory stack. Upon finishing, + the 'CE_STACK' is sorted alphabetically, and the current 'DL_CD_STACK' is + populated. Prints warnings for unhandled files. + + Returns -1 if 'readdir' encounters an error, otherwise returns the number of + directory entries sent to the external 'lexer_direntpa' array. +*/ +typedef +int (*qcomp)(const void*, const void*); +static inline +int dredge_current_depth +#define READDIR_ERROR (-1) +#define READDIR_DONE (0) +#define DPS_LEN() (direntpp - lexer_direntpa) +#define DPS_PUSH(E) (*direntpp++ = E) +() +{ struct dirent** direntpp = lexer_direntpa; + DIR* cwd = DL_CURDIR(); + struct dirent* direntp; + DL_CD_INIT(); + scan_next: + if ((direntp = readdir(cwd)) != NULL) + { switch (direntp->d_type) + { case DT_REG: + DPS_PUSH(direntp); + goto scan_next; + case DT_DIR: + DL_CD_PUSH(direntp); + goto scan_next; + case DT_UNKNOWN: + warnx("unknown file %s: ignoring", direntp->d_name); + default: + goto scan_next; + } + } + if (errno) + return -1; + qsort(lexer_direntpa, DPS_LEN(), sizeof direntp, (qcomp)alphasort); + return DPS_LEN(); +} + diff --git a/style/snippets/pointer_array.c b/style/snippets/pointer_array.c index 913f3b6..8e29942 100644 --- a/style/snippets/pointer_array.c +++ b/style/snippets/pointer_array.c @@ -1,7 +1,8 @@ /* Inverse 2d array population */ #include int main(void); -int main() +int main +() { int** ipp,* ip,* ipa[16], ia[16<<4], i; for (ipp = ipa, i = 0; ((*ipp++ = ip = ia + (i++ << 4)) - ia < 16 << 4);) { while ((*ip++ = ip - ia) < (ipp - ipa) << 4) -- 2.18.0