diff options
Diffstat (limited to 'wrapperhelper/src/prepare.c')
| -rw-r--r-- | wrapperhelper/src/prepare.c | 372 |
1 files changed, 372 insertions, 0 deletions
diff --git a/wrapperhelper/src/prepare.c b/wrapperhelper/src/prepare.c new file mode 100644 index 00000000..090da0b0 --- /dev/null +++ b/wrapperhelper/src/prepare.c @@ -0,0 +1,372 @@ +#include "prepare.h" + +#include <string.h> + +struct prepare_s { + FILE *f; + int buf[4]; + int buf_len; // <= 4 (though 3 *should* be enough) + char *srcn; + enum prepare_state { + PREPST_NONE = 0, + PREPST_NL, + PREPST_HASH, + PREPST_INCL, + PREPST_DEF, + PREPST_DEFID, + } st; +}; + +prepare_t *prepare_new_file(FILE *f, const char *filename) { + prepare_t *ret = malloc(sizeof *ret); + if (!ret) { + fclose(f); + return NULL; + } + *ret = (prepare_t){ + .f = f, + .buf = {0, 0, 0}, + .buf_len = 0, + .srcn = strdup(filename), + .st = PREPST_NL, + }; + return ret; +} + +void prepare_del(prepare_t *prep) { + if (prep->f) fclose(prep->f); + if (prep->srcn) free(prep->srcn); + free(prep); +} + +static int get_char(prepare_t *src) { +start_get_char: + int c = src->buf_len ? src->buf[--src->buf_len] : getc(src->f); + src->buf_len = 0; + if (c == '\\') { + c = src->buf_len ? src->buf[--src->buf_len] : getc(src->f); + if (c == '\n') goto start_get_char; + src->buf[src->buf_len++] = c; + return '\\'; + } + return c; +} +// Do not call this more than twice in a row if the last character retrieved is '\\' +static void unget_char(prepare_t *src, int c) { + src->buf[src->buf_len++] = c; +} + +static void fill_ident(prepare_t *src, string_t *buf) { + while (1) { + int c = get_char(src); + if ((c == '_') || ((c >= '0') && (c <= '9')) || ((c >= 'A') && (c <= 'Z')) || ((c >= 'a') && (c <= 'z'))) { + string_add_char(buf, (char)c); + } else { + unget_char(src, c); + return; + } + } +} + +static void fill_num(prepare_t *src, string_t *buf) { + int started_exp = 0; + while (1) { + int c = get_char(src); + if ((c == '_') || (c == '.') || ((c >= '0') && (c <= '9')) || ((c >= 'A') && (c <= 'Z')) || ((c >= 'a') && (c <= 'z')) + || (started_exp && ((c == '+') || (c == '-')))) { + started_exp = (c == 'e') || (c == 'E') || (c == 'p') || (c == 'P'); + string_add_char(buf, (char)c); + } else { + unget_char(src, c); + return; + } + } +} + +static void fill_str(prepare_t *src, string_t *buf, char end_c, int can_esc) { + int has_esc = 0; + while (1) { + int c = get_char(src); + if (has_esc && (c >= 0) && (c <= 0x7F) && (c != '\n')) { + // Not technically standard compliant (should support \ooo, \x..., \u..., \U...) + // Since we don't really care about parsing the content, only the delimiters, this is good enough + string_add_char(buf, '\\'); + string_add_char(buf, (char)c); + has_esc = 0; + } else if (c == '\\') { + if (can_esc) { + has_esc = 1; + } else { + string_add_char(buf, '\\'); + } + } else if ((c >= 0) && (c <= 0x7F) && (c != end_c)) { + has_esc = 0; + string_add_char(buf, (char)c); + } else { + if (has_esc) { + // c is invalid or a '\n', or can_esc = 0 and c = end_c + string_add_char(buf, '\\'); + } + if (c != end_c) + unget_char(src, c); + return; + } + } +} + +#define BASE_NSYMS 25 +static const struct symbs_s { + char c; + enum token_sym_type_e sym; + int nnext; + const struct symbs_s *next; +} *symbs = (struct symbs_s[BASE_NSYMS]){ +#define TERM(ch, t) { .c = ch, .sym = t, .nnext = 0, .next = NULL } +#define NONTERM(ch, t, n, ...) { .c = ch, .sym = t, .nnext = n, .next = (struct symbs_s[n]){__VA_ARGS__} } + // Only '..' must have a sym > LAST_SYM; change next_token if this is not the case + NONTERM('.', SYM_DOT, 1, NONTERM('.', LAST_SYM + 1, 1, TERM('.', SYM_VARIADIC))), + TERM('{', SYM_LBRACKET), + TERM('}', SYM_RBRACKET), + TERM('[', SYM_LSQBRACKET), + TERM(']', SYM_RSQBRACKET), + TERM('(', SYM_LPAREN), + TERM(')', SYM_RPAREN), + NONTERM('#', SYM_HASH, 1, TERM('#', SYM_HASHHASH)), + TERM(';', SYM_SEMICOLON), + NONTERM(':', SYM_COLON, 1, TERM(':', SYM_COLONCOLON)), + TERM('?', SYM_QUESTION), + TERM('~', SYM_TILDE), + NONTERM('!', SYM_EXCL, 1, TERM('=', SYM_EXCLEQ)), + NONTERM('+', SYM_PLUS, 2, TERM('=', SYM_PLUSEQ), TERM('+', SYM_PLUSPLUS)), + NONTERM('-', SYM_DASH, 3, TERM('=', SYM_DASHEQ), TERM('-', SYM_DASHDASH), TERM('>', SYM_DASHGT)), + NONTERM('*', SYM_STAR, 1, TERM('=', SYM_STAREQ)), + NONTERM('/', SYM_SLASH, 1, TERM('=', SYM_SLASHEQ)), + NONTERM('%', SYM_PERCENT, 1, TERM('=', SYM_PERCENTEQ)), + NONTERM('^', SYM_HAT, 1, TERM('=', SYM_HATEQ)), + NONTERM('&', SYM_AMP, 2, TERM('=', SYM_AMPEQ), TERM('&', SYM_AMPAMP)), + NONTERM('|', SYM_PIPE, 2, TERM('=', SYM_PIPEEQ), TERM('|', SYM_PIPEPIPE)), + NONTERM('=', SYM_EQ, 1, TERM('=', SYM_EQEQ)), + NONTERM('<', SYM_LT, 2, TERM('=', SYM_LTEQ), NONTERM('<', SYM_LTLT, 1, TERM('=', SYM_LTLTEQ))), + NONTERM('>', SYM_GT, 2, TERM('=', SYM_GTEQ), NONTERM('>', SYM_GTGT, 1, TERM('=', SYM_GTGTEQ))), + TERM(',', SYM_COMMA), +#undef NONTERM +#undef TERM +}; + +preproc_token_t pre_next_token(prepare_t *src, int allow_comments) { +start_next_token: + int c = get_char(src); + if (c == EOF) { + if (src->st == PREPST_NL) { + return (preproc_token_t){ + .tokt = PPTOK_EOF, + .tokv.c = (char)c + }; + } else { + // Force newline at EOF + unget_char(src, c); + src->st = PREPST_NL; + return (preproc_token_t){ + .tokt = PPTOK_NEWLINE, + .tokv.c = (char)c + }; + } + } + + if (src->st == PREPST_INCL && (c == '<')) { + src->st = PREPST_NONE; + preproc_token_t ret; + ret.tokt = PPTOK_INCL; + ret.tokv.sisstr = 0; + ret.tokv.sstr = string_new(); + fill_str(src, ret.tokv.sstr, '>', 0); + return ret; + } + if (c == '\'') { + src->st = PREPST_NONE; + preproc_token_t ret; + ret.tokt = PPTOK_STRING; + ret.tokv.sisstr = 0; + ret.tokv.sstr = string_new_cap(1); // Usually only one character is inside a char literal + fill_str(src, ret.tokv.sstr, '\'', 1); + return ret; + } + if (c == '"') { + preproc_token_t ret; + ret.tokt = (src->st == PREPST_INCL) ? PPTOK_INCL : PPTOK_STRING; + src->st = PREPST_NONE; + ret.tokv.sisstr = 1; + ret.tokv.sstr = string_new(); + fill_str(src, ret.tokv.sstr, '"', ret.tokt == PPTOK_STRING); + return ret; + } + if ((c == ' ') || (c == '\f') || (c == '\t') || (c == '\v')) { + if (src->st == PREPST_DEFID) { + src->st = PREPST_NONE; + return (preproc_token_t){ + .tokt = PPTOK_BLANK, + .tokv.c = (char)c + }; + } else goto start_next_token; + } + if (c == '\n') { + src->st = PREPST_NL; + return (preproc_token_t){ + .tokt = PPTOK_NEWLINE, + .tokv.c = (char)c + }; + } + if ((c == '_') || ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z'))) { + preproc_token_t ret; + ret.tokt = PPTOK_IDENT; + ret.tokv.str = string_new_cap(1); + string_add_char(ret.tokv.str, (char)c); + fill_ident(src, ret.tokv.str); + src->st = + ((src->st == PREPST_HASH) && (!strcmp(string_content(ret.tokv.str), "include"))) ? PREPST_INCL : + ((src->st == PREPST_HASH) && (!strcmp(string_content(ret.tokv.str), "include_next"))) ? PREPST_INCL : + ((src->st == PREPST_HASH) && (!strcmp(string_content(ret.tokv.str), "define"))) ? PREPST_DEF : + (src->st == PREPST_DEF) ? PREPST_DEFID : + PREPST_NONE; + return ret; + } + if ((c >= '0') && (c <= '9')) { + src->st = PREPST_NONE; + preproc_token_t ret; + ret.tokt = PPTOK_NUM; + ret.tokv.str = string_new_cap(1); + string_add_char(ret.tokv.str, (char)c); + fill_num(src, ret.tokv.str); + return ret; + } + if (c == '.') { + c = get_char(src); + if ((c >= '0') && (c <= '9')) { + src->st = PREPST_NONE; + preproc_token_t ret; + ret.tokt = PPTOK_NUM; + ret.tokv.str = string_new_cap(2); + string_add_char(ret.tokv.str, '.'); + string_add_char(ret.tokv.str, (char)c); + fill_num(src, ret.tokv.str); + return ret; + } else { + unget_char(src, c); + c = '.'; + } + } + if (c == '/') { + c = get_char(src); + if (c == '/') { + if (allow_comments) { + src->st = PREPST_NONE; + return (preproc_token_t){ + .tokt = PPTOK_START_LINE_COMMENT, + .tokv.c = '/' + }; + } + + do { + c = get_char(src); + } while ((c != EOF) && (c != '\n')); + if (c != EOF) { + if (src->st == PREPST_NL) + goto start_next_token; + else { + src->st = PREPST_NL; + return (preproc_token_t){ + .tokt = PPTOK_NEWLINE, + .tokv.c = (char)c + }; + } + } + + src->st = PREPST_NONE; + printf("Unfinished comment while preparing %s\n", src->srcn); + return (preproc_token_t){ + .tokt = PPTOK_INVALID, + .tokv.c = (char)c + }; + } else if (c == '*') { + c = get_char(src); + int last_star = 0; + while ((c != EOF) && (!last_star || (c != '/'))) { + last_star = c == '*'; + c = get_char(src); + } + if (c != EOF) goto start_next_token; + + src->st = PREPST_NONE; + printf("Unfinished comment while preparing %s\n", src->srcn); + return (preproc_token_t){ + .tokt = PPTOK_INVALID, + .tokv.c = (char)c + }; + } else { + unget_char(src, c); + c = '/'; + } + } + + struct symbs_s const *sym = NULL; + for (int i = 0; i < BASE_NSYMS; ++i) { + if (c == symbs[i].c) { + sym = &symbs[i]; + break; + } + } + if (sym) { + while (sym->nnext) { + c = get_char(src); + int found = 0; + for (int i = 0; i < sym->nnext; ++i) { + if (c == sym->next[i].c) { + found = 1; + sym = &sym->next[i]; + break; + } + } + if (!found) { + unget_char(src, c); + break; + } + } + if (sym->sym == LAST_SYM + 1) { + unget_char(src, sym->c); + sym = &symbs[0]; // This is where no check is made (see comment in the definition of symbs) + } + src->st = ((src->st == PREPST_NL) && (sym->sym == SYM_HASH)) ? PREPST_HASH : PREPST_NONE; + return (preproc_token_t){ + .tokt = PPTOK_SYM, + .tokv.sym = sym->sym + }; + } + + src->st = PREPST_NONE; + printf("Invalid character 0x%X (%c) while preparing %s\n", (unsigned)c, (c >= 0x20) && (c < 127) ? c : '?', src->srcn); + return (preproc_token_t){ + .tokt = PPTOK_INVALID, + .tokv.c = (char)c + }; +} +preproc_token_t pre_next_newline_token(prepare_t *src) { +start_next_token: + int c = get_char(src); + if (c == EOF) { + // Force newline at EOF + unget_char(src, c); + src->st = PREPST_NL; + return (preproc_token_t){ + .tokt = PPTOK_NEWLINE, + .tokv.c = (char)c + }; + } + if (c == '\n') { + src->st = PREPST_NL; + return (preproc_token_t){ + .tokt = PPTOK_NEWLINE, + .tokv.c = (char)c + }; + } + goto start_next_token; +} |