diff --git a/ccan/ccan_tokenizer/ccan_tokenizer.c b/ccan/ccan_tokenizer/ccan_tokenizer.c
new file mode 100644
index 00000000..7d29e025
--- /dev/null
+++ b/ccan/ccan_tokenizer/ccan_tokenizer.c
@@ -0,0 +1,1057 @@
/*
  Copyright (c) 2009 Joseph A. Adams
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
  1. Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.
  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.
  3. The name of the author may not be used to endorse or promote products
     derived from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "ccan_tokenizer.h"

/* The two include targets below were lost in extraction; talloc and assert
   are both used in this file, so these are the likely originals. */
#include <ccan/talloc/talloc.h>

#include <assert.h>

//Shown by operator precedence; based on
// http://tigcc.ticalc.org/doc/opers.html#precedence .

static struct dict_entry c_dictionary[] = {
//1. Highest
	{'(',"("}, {')',")"},
	{'[',"["}, {']',"]"},
	{'{',"{"}, {'}',"}"},
	{'.',"."},
	{PTR_OP,"->"},

//2. Unary
	{'!',"!"}, {'~',"~"}, //prefix
	{INC_OP,"++"}, {DEC_OP,"--"}, //prefix or postfix
	// + - & *

//3. Multiplicative
	// *
	{'/',"/"}, {'%',"%"},

//4. Additive
	// + -

//5. Shift
	{LEFT_OP,"<<"}, {RIGHT_OP,">>"},

//6. Relational
	{'<',"<"}, {'>',">"},
	{LE_OP,"<="}, {GE_OP,">="},

//7. Equality
	{EQ_OP,"=="}, {NE_OP,"!="},

//8. Bitwise AND
	// &
//9. Bitwise XOR
	{'^',"^"},
//10. Bitwise OR
	{'|',"|"},

//11. Logical AND
	{AND_OP,"&&"},
//12. Logical OR
	{OR_OP,"||"},

//13. Conditional
	{'?',"?"}, {':',":"},

//14. Assignment
	{'=',"="},
	{MUL_ASSIGN,"*="}, {DIV_ASSIGN,"/="}, {MOD_ASSIGN,"%="},
	{ADD_ASSIGN,"+="}, {SUB_ASSIGN,"-="},
	{AND_ASSIGN,"&="}, {XOR_ASSIGN,"^="}, {OR_ASSIGN,"|="},
	{LEFT_ASSIGN,"<<="}, {RIGHT_ASSIGN,">>="},

//15. Comma
	{',',","},

//16. Semicolon
	{';',";"},

//Misc
	{ELLIPSIS,"..."},
	{'#',"#"},
	{DOUBLE_POUND,"##"},

//Ambiguous
	//unary or binary
	{'+',"+"}, {'-',"-"},
	{'&',"&"}, {'*',"*"},

//Keywords
	{_BOOL, "_Bool"},
	{_COMPLEX, "_Complex"},
	{_IMAGINARY, "_Imaginary"},
	{BREAK, "break"},
	{CASE, "case"},
	{CHAR, "char"},
	{CONST, "const"},
	{CONTINUE, "continue"},
	{DEFAULT, "default"},
	{DO, "do"},
	{DOUBLE, "double"},
	{ELSE, "else"},
	{ENUM, "enum"},
	{EXTERN, "extern"},
	{FLOAT, "float"},
	{FOR, "for"},
	{GOTO, "goto"},
	{IF, "if"},
	{INLINE, "inline"},
	{INT, "int"},
	{LONG, "long"},
	{REGISTER, "register"},
	{RESTRICT, "restrict"},
	{RETURN, "return"},
	{SHORT, "short"},
	{SIGNED, "signed"},
	{SIZEOF, "sizeof"},
	{STATIC, "static"},
	{STRUCT, "struct"},
	{SWITCH, "switch"},
	{TYPEDEF, "typedef"},
	{UNION, "union"},
	{UNSIGNED, "unsigned"},
	{VOID, "void"},
	{VOLATILE, "volatile"},
	{WHILE, "while"},

//Preprocessor keywords (except those already defined)
	{VA_ARGS, "__VA_ARGS__"},
	{DEFINE, "define"},
	{ELIF, "elif"},
//	{ELSE, "else"},
	{ENDIF, "endif"},
	{ERROR, "error"},
//	{IF, "if"},
	{IFDEF, "ifdef"},
	{IFNDEF, "ifndef"},
	{INCLUDE, "include"},
	{LINE, "line"},
	{PRAGMA, "pragma"},
	{UNDEF, "undef"},
	{WARNING, "warning"},
};
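/*
 * Illustration (not part of the original source): tokenize() below matches
 * operators against this table longest-first, so
 *
 *	x <<= 1;
 *
 * lexes as IDENTIFIER("x"), LEFT_ASSIGN("<<="), INTEGER("1"), and ';' --
 * never as '<' '<' '='.  Keywords work the same way: "dolphin" stays a
 * single TOK_IDENTIFIER even though it begins with the keyword "do",
 * because the dictionary match does not cover the whole identifier.
 */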
#if 0

struct tokenizer *tokenizer_new(void *ctx) {
	struct tokenizer *t = talloc(ctx, struct tokenizer);
	t->ctx = ctx;
	queue_init(t->mq, t);
	t->dict = dict_build(t, c_dictionary, sizeof(c_dictionary)/sizeof(*c_dictionary));

	return t;
}

#endif

#define MESSAGE_PATH "tokenize/"

static void unbreak_backslash_broken_lines(struct token_list *tl, tok_message_queue *mq) {
	const char *s = tl->orig, *e = s+tl->orig_size;
	array_char txt = array_new(tl);
	array(const char*) olines = array_new(tl);
	array(size_t) tline_offsets = array_new(tl);
	array(const char*) tlines = array_new(tl);

	do {
		const char *line_start = s, *line_end;
		const char *lnw; //last non-white
		const char *p;
		size_t start_offset = txt.size;

		//scan to the next line and find the last non-white character in the line
		while (s<e && !creturn(*s)) s++;
		line_end = s;
		lnw = s;
		while (lnw>line_start && cspace(lnw[-1])) lnw--;
		if (s<e) {
			s++;
			//skip the second byte of a two-character newline (\r\n or \n\r)
			if (s<e && creturn(*s) && *s != s[-1])
				s++;
		}

		//NOTE: the bookkeeping below is a reconstruction (the original
		// text was garbled); it follows the invariants that
		// token_list_sanity_check() verifies later in this file
		if (lnw>line_start && lnw[-1]=='\\' && line_end<e) {
			//line ends with a backslash: append it minus the break so
			// the next line is folded onto this one
			for (p=line_start; p<lnw-1; p++)
				array_append(txt, *p);
			if (lnw != line_end)
				tok_msg_warn(spaces_after_backslash_break, lnw,
					"Whitespace between backslash and end of line");
		} else {
			//ordinary line: append it verbatim, newline included
			for (p=line_start; p<s; p++)
				array_append(txt, *p);
		}
		array_append(olines, line_start);
		array_append(tline_offsets, start_offset);
	} while (s<e);

	//keep txt null-terminated so two-character lookahead is safe
	array_append(txt, 0);
	txt.size--;

	//convert the recorded offsets to pointers now that txt can no longer move
	{
		size_t n;
		for (n=0; n<tline_offsets.size; n++)
			array_append(tlines, txt.item + tline_offsets.item[n]);
	}

	tl->olines = olines.item;
	tl->olines_size = olines.size;
	tl->txt = txt.item;
	tl->txt_size = txt.size;
	tl->tlines = tlines.item;
	tl->tlines_size = tlines.size;
}

static void normal_keyword(struct token *tok) {
	if (tok->type==TOK_KEYWORD &&
			(opkw_is_directive_only(tok->opkw) || tok->opkw==VA_ARGS))
		tok->type = TOK_IDENTIFIER;
}

static int define_parmlist_has_ellipsis(struct token *start, struct token *end) {
	while (end>start && token_is_ignored(end-1)) end--;
	return (end-->start && end->type==TOK_OPERATOR && end->opkw==ELLIPSIS);
}

//Used to label __VA_ARGS__ as keywords within applicable macro expansions
//Start should follow the DEFINE directive keyword
static void this_is_a_define(struct token *start, struct token *end) {
	struct token *i = start, *pl_start;

	//skip past the identifier that is defined
	while (i<end && token_is_ignored(i)) i++;
	if (i >= end)
		return;
	//TODO: check i->type to make sure it's an identifier, throw error otherwise
	normal_keyword(i++);

	//see if this is actually a variadic macro
	if (!(i<end && i->type==TOK_OPERATOR && i->opkw=='('))
		goto not_va_args;
	pl_start = ++i;
	while (i<end && !(i->type==TOK_OPERATOR && i->opkw==')'))
		normal_keyword(i++);
	if (!define_parmlist_has_ellipsis(pl_start, i++))
		goto not_va_args;

	//We have arrived at the macro expansion and know there is a ... argument
	//Thus, we'll only change directive-only keywords to identifiers
	for (; i<end; i++) {
		if (i->type==TOK_KEYWORD && opkw_is_directive_only(i->opkw))
			i->type = TOK_IDENTIFIER;
	}

not_va_args:
	while (i < end)
		normal_keyword(i++);
}
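/*
 * Worked example (illustrative, not from the original file): in
 *
 *	#define MAX(a, ...) ((a) > (__VA_ARGS__) ? (a) : (__VA_ARGS__))
 *
 * the parameter list ends with an ellipsis, so the loop above keeps the
 * __VA_ARGS__ tokens in the expansion marked TOK_KEYWORD.  In a
 * non-variadic definition such as
 *
 *	#define MAX(a, b) ((a) > (b) ? (a) : (b))
 *
 * control reaches not_va_args instead, and normal_keyword() demotes any
 * __VA_ARGS__ (and other directive-only keywords) to plain TOK_IDENTIFIER.
 */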
//fill the flags field of each token and untangle keywords and such
static void finalize_line(struct token *start, struct token *end) {
	struct token *i = start, *j;

	assert(start<end && start->type==TOK_STARTLINE);
	i++;

	while (i<end && token_is_ignored(i)) i++;

	if (i<end && i->type==TOK_OPERATOR && i->opkw=='#') {
		//preprocessor line
		i->type = TOK_LEADING_POUND;

		//set pp on all tokens in this line
		for (j=start; j<end; j++)
			j->flags.pp = 1;

		//find the relevant token after the '#'
		for (i++; i<end && token_is_ignored(i); i++) {}
		if (i<end) {
			i->flags.pp_directive = 1;
			if (i->type==TOK_KEYWORD && !opkw_is_directive(i->opkw))
				i->type = TOK_IDENTIFIER;
			//TODO: Handle invalid preprocessor directives (e.g. #+ )

			if (i->type==TOK_KEYWORD && i->opkw==DEFINE) {
				for (j=i+1; j<end && token_is_ignored(j); j++) {}
				this_is_a_define(j, end);
			}
		}
	} else {
		//ordinary line: demote directive-only keywords
		//(reconstructed branch; the original text was garbled here)
		for (; i<end; i++)
			normal_keyword(i);
	}
}

static void finalize(struct token_list *tl, struct token *start, struct token *end) {
	const char * const *lss = tl->tlines;
	const char * const *lse = lss + tl->tlines_size;
	struct token *i;
	struct token *startline = NULL;

	assert(start < end);

	tl->first = start;
	tl->last = end-1;

	for (i=start; ; i++) {
		//perform a second pass on each line
		if (i >= end || i->type == TOK_STARTLINE) {
			if (startline)
				finalize_line(startline, i);
			startline = i;
		}

		if (i >= end) {
			end[-1].orig_size = tl->orig+tl->orig_size - end[-1].orig;
			break;
		}

		//set up the list links
		i->prev = i>start ? i-1 : NULL;
		i->next = i+1<end ? i+1 : NULL;

		//if i->txt starts on a later line, advance to it
		while (lss+1<lse && i->txt >= lss[1] && i->txt > lss[0])
			lss++;

		//set up line, col, orig, and orig_size
		i->line = lss - tl->tlines;
		i->col = i->txt - *lss;
		i->orig = tl->olines[i->line] + i->col;
		if (i > start)
			i[-1].orig_size = i->orig - i[-1].orig;

		assert(i->line < tl->olines_size);

		//clear the flags
		memset(&i->flags, 0, sizeof(i->flags));
	}
}

#define add(...) do { \
		struct token tok = {__VA_ARGS__}; \
		tok.txt = orig; \
		tok.txt_size = s-orig; \
		array_append(array, tok); \
	} while (0)

#define cstray(c) (ccontrol(c) || cextended(c) || (c)=='@' || (c)=='`' || (c)=='\\')
#define cident(c) (cletter(c) || cdigit(c) || c=='_' || c=='$')
	//believe it or not, $ is a valid character in an identifier

struct dict *tokenizer_dict = NULL;

static void free_tokenizer_dict(void) {
	talloc_free(tokenizer_dict);
}

struct token_list *tokenize(const char *orig, size_t orig_size,
			tok_message_queue *mq) {
	struct token_list *tl = talloc(orig, struct token_list);
	const char *s, *e;
	size_t stray_count=0, cr_count=0;
	array(struct token) array = array_new(tl);
	int only_pound_include = 0;

	if (!tokenizer_dict) {
		tokenizer_dict = dict_build(NULL, c_dictionary,
			sizeof(c_dictionary)/sizeof(*c_dictionary));
		atexit(free_tokenizer_dict);
	}

	tl->orig = orig;
	tl->orig_size = orig_size;
	unbreak_backslash_broken_lines(tl, mq);
	tl->filename = NULL;

	s = tl->txt;
	e = s + tl->txt_size;

	array_appends(array, {
		.type = TOK_STARTLINE,
		.txt = s,
		.txt_size = 0
	} );

	while (s<e) {
		const char *orig = s;
		char c = *s;
		int added_something = 1;

		//NOTE: much of this dispatch was garbled; the branches that are
		// not quoted verbatim below are best-effort reconstructions
		if (cspace(c)) {
			//whitespace
			while (s<e && cspace(*s))
				s++;
			add(.type = TOK_WHITE);
			added_something = 0;
		} else if (creturn(c)) {
			//line break (\n, \r, \r\n, or \n\r), then a fresh TOK_STARTLINE
			if (c=='\r')
				cr_count++;
			s++;
			if (s<e && creturn(*s) && *s!=c)
				s++;
			add(.type = TOK_WHITE);
			orig = s;
			add(.type = TOK_STARTLINE);
		} else if (only_pound_include && (c=='"' || c=='<')) {
			//hacky way to lex an #include string properly
			char end_char = c=='"' ? '"' : '>';
			short type = c=='"' ? TOK_STRING_IQUOTE : TOK_STRING_IANGLE;

			s++;
			while (s<e && *s!=end_char && !creturn(*s))
				s++;
			if (s<e && *s==end_char)
				s++;
			else
				tok_msg_error(unterminated_include_string, orig,
					"Unterminated include string");
			add(.type = type);
		} else if (c=='/' && s+1<e && (s[1]=='*' || s[1]=='/')) {
			const char *comment_start = s;
			if (s[1]=='*') { //C-style comment
				for (s+=2 ;; s++) {
					if (s >= e) {
						s = e;
						tok_msg_error(unterminated_comment, comment_start,
							"Unterminated comment");
						break;
					}
					if (s[0]=='*' && s[1]=='/') {
						s += 2;
						break;
					}
				}
				add(.type = TOK_CCOMMENT);
			} else { // C++-style comment
				while (s<e && !creturn(*s))
					s++;
				add(.type = TOK_CPPCOMMENT);
			}
		} else if (cdigit(c) || (c=='.' && s+1<e && cdigit(s[1]))) {
			//numeric constant (reconstructed: the original classified
			// TOK_INTEGER vs. TOK_FLOATING in more detail than this
			// pp-number scan, which only flags floats on '.' or a sign)
			int floating = 0;
			while (s<e && (cident(*s) || *s=='.' ||
					((*s=='+' || *s=='-') &&
					 ((s[-1]|32)=='e' || (s[-1]|32)=='p')))) {
				if (*s=='.' || *s=='+' || *s=='-')
					floating = 1;
				s++;
			}
			add(.type = floating ? TOK_FLOATING : TOK_INTEGER);
		} else if (c=='\'' || c=='"') {
			//character constant or string literal (reconstructed: the
			// original decoded escape sequences as well)
			s++;
			while (s<e && *s!=c && !creturn(*s)) {
				if (*s=='\\' && s+1<e)
					s++;
				s++;
			}
			if (s<e && *s==c)
				s++;
			else
				tok_msg_error(unterminated_string, orig,
					"Unterminated %s constant",
					c=='\'' ? "character" : "string");
			add(.type = c=='\'' ? TOK_CHAR : TOK_STRING);
		} else if (cstray(c)) {
			//stray character; reported in bulk after the loop
			stray_count++;
			s++;
			add(.type = TOK_STRAY);
		} else {
			//identifier, keyword, or operator
			struct dict_entry *ent;
			const char *ident_e = s;
			if (cident(c)) {
				while (ident_e<e && cident(*ident_e))
					ident_e++;
			}
			ent = dict_lookup(tokenizer_dict, &s, e); //(assumed lookup interface)
			if (cident(c)) { //identifier or keyword
				if (ent && s==ident_e) { //keyword
					add(.type = TOK_KEYWORD,
						{.opkw = ent->id});
					if (ent->id == INCLUDE) {
						//hacky way to lex #include string properly
						struct token *ts = array.item;
						struct token *tp = ts+array.size-1;
						while (tp>ts && token_is_ignored(tp-1))
							tp--;
						if (tp>ts && token_is_op(tp-1, '#')) {
							tp--;
							while (tp>ts && token_is_ignored(tp-1))
								tp--;
							if (tp>ts && tp[-1].type==TOK_STARTLINE) {
								only_pound_include = 1;
								continue;
							}
						}
					}
				} else {
					s = ident_e;
					add(.type = TOK_IDENTIFIER);
				}
			} else if (ent) { //operator
				add(.type = TOK_OPERATOR,
					{.opkw = ent->id});
			} else { //invalid symbol (shouldn't happen)
				tok_msg_bug(unrecognized_symbol, s,
					"Unrecognized symbol \'%c\'", c);
				s++;
				add(.type = TOK_STRAY);
			}
		}

		if (added_something)
			only_pound_include = 0;
	}

	/*if (stray_count) {
		tok_msg_error(stray_characters, NULL,
			"%lu stray characters in text", (unsigned long)stray_count);
	}*/
	if (cr_count) {
		tok_msg_warn(nonstandard_newlines, NULL,
			"Text contains non-standard line terminators");
	}

	finalize(tl, array.item, array.item+array.size);

	return tl;
}

size_t token_list_count(const struct token_list *tl) {
	size_t ret = 0;
	const struct token *i;

	for (i=tl->first; i; i=i->next)
		ret++;

	return ret;
}

static size_t find_line(const char *ptr, const char * const *lines, size_t line_count) {
	const char * const *orig = lines;
	const char * const *orig_e = lines+line_count;

	while (line_count > 1) {
		size_t middle = line_count>>1;
		if (ptr < lines[middle])
			line_count = middle;
		else {
			lines += middle;
			line_count -= middle;
		}
	}

	//select the *last* of equivalent lines
	while (lines+1 < orig_e && lines[0]==lines[1])
		lines++;

	// (don't) select the *first* of equivalent lines
	//while (lines>orig && lines[-1]==lines[0])
	//	lines--;

	return lines - orig;
}

int tok_point_lookup(struct tok_point *out, const char *ptr,
			const struct token_list *tl) {
	size_t line_count = tl->olines_size;

	memset(out, 0, sizeof(*out));
	if (!tl)
		return 0;

	if (ptr >= tl->txt && ptr <= tl->txt+tl->txt_size) {
		out->txt = ptr;
		out->line = find_line(ptr, tl->tlines, line_count);
		if (out->line < line_count) {
			out->col = ptr - tl->tlines[out->line];
			out->orig = tl->olines[out->line] + out->col;
		} else {
			out->col = 0;
			out->orig = tl->orig + tl->orig_size;
		}
		return 1;
	} else if (ptr >= tl->orig && ptr <= tl->orig+tl->orig_size) {
		out->orig = ptr;
		out->line = find_line(ptr, tl->olines, line_count);
		if (out->line < line_count) {
			const char *tline_start = tl->tlines[out->line];
			const char *tline_end = out->line+1 < line_count ?
				tl->tlines[out->line+1] :
				tl->txt + tl->txt_size;

			out->col = ptr - tl->olines[out->line];
			out->txt = tline_start + out->col;

			if (out->txt > tline_end)
				out->txt = tline_end;
		} else {
			out->col = 0;
			out->txt = tl->txt + tl->txt_size;
		}
		return 1;
	} else {
		return 0;
	}
}
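/*
 * Usage sketch (not part of the original file): given any pointer into
 * either buffer -- say a token's txt -- recover a human-readable position
 * the same way tok_message_print() does below.  Lines and columns are
 * 0-based internally, hence the +1s:
 *
 *	struct tok_point pt;
 *	if (tok_point_lookup(&pt, tok->txt, tl))
 *		fprintf(stderr, "line %zu, col %zu\n",
 *			pt.line+1, pt.col+1);
 */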
static char *escape_string(array_char *buf, const char *str, size_t size) {
	const char *s = str, *e = s+size;
	array_from_lit(*buf, "");

	for (;s<e;s++) {
		//reconstructed loop body: emit standard C escapes for quotes,
		// backslashes, and non-printable characters
		char c = *s;
		if (c=='"' || c=='\\') {
			array_append(*buf, '\\');
			array_append(*buf, c);
		} else if (c>=32 && c<127) {
			array_append(*buf, c);
		} else {
			char tmp[8];
			int n, len;
			if (c=='\n')
				len = sprintf(tmp, "\\n");
			else if (c=='\t')
				len = sprintf(tmp, "\\t");
			else if (c=='\r')
				len = sprintf(tmp, "\\r");
			else
				len = sprintf(tmp, "\\x%02X", (unsigned char)c);
			for (n=0; n<len; n++)
				array_append(*buf, tmp[n]);
		}
	}
	array_append(*buf, 0);

	return buf->item;
}

//Tells whether a backslash-newline break starts at s; if so, sets *end to
// the first character after it.
//(Reconstructed: the original definition of this helper did not survive.)
static int is_backslash_break(const char **end, const char *s, const char *e) {
	if (s>=e || *s!='\\')
		return 0;
	s++;
	while (s<e && cspace(*s))
		s++;
	if (s<e && creturn(*s)) {
		s++;
		if (s<e && creturn(*s) && *s!=s[-1])
			s++;
		*end = s;
		return 1;
	}
	return 0;
}

static int txt_orig_matches(const char *txt, size_t txt_size, const char *orig, size_t orig_size) {
	const char *ts = txt, *te = ts+txt_size;
	const char *os = orig, *oe = os+orig_size;

	do {
		const char *ob = os; //start of next backslash break
		const char *obe = os; //end of next backslash break
		size_t size; //amount of text to compare for this round

		while (ob<oe && !(*ob=='\\' && is_backslash_break(&obe, ob, oe))) {
			ob++;
			obe = ob;
		}

		size = ob - os;
		if (ts+size > te || memcmp(ts, os, size))
			return 0;
		ts += size;
		os = obe;
	} while (ts<te && os<oe);

	return ts==te && os==oe;
}

//(Reconstructed macro; only its name and use sites survived)
#define failed(...) do { \
		if (err) { \
			fprintf(err, "Sanity check failed: "); \
			fprintf(err, __VA_ARGS__); \
			fprintf(err, "\n"); \
		} \
		return 0; \
	} while(0)

static int token_list_sanity_check_initial(const struct token_list *tl, FILE *err) {
	struct token *first = tl->first;
	struct token *last = tl->last;
	struct token *i;
	const char *txt=tl->txt, *orig=tl->orig;
	const char *txt_e = txt+tl->txt_size, *orig_e = orig+tl->orig_size;

	if ((char*)first > (char*)last ||
			(size_t)((char*)last - (char*)first) % sizeof(struct token))
		failed("Token list pointers don't look right");

	//token list should not end with TOK_STARTLINE unless
	// the document is empty
	if (last!=first && last->type==TOK_STARTLINE)
		return 0;

	for (i=first; i; i=i->next) {
		//Verify list links
		if (i != first && i->prev != i-1)
			failed("list.prev is incorrect");
		if (i != last && i->next != i+1)
			failed("list.next is incorrect");

		//Make sure txt segments fill the entire tl->txt
		if (i->txt != txt)
			failed("txt does not fill the token list");
		txt += i->txt_size;
		if (txt > txt_e)
			failed("txt is out of bounds");

		//Make sure orig segments fill the entire tl->orig
		if (i->orig != orig)
			failed("orig does not fill the token list");
		orig += i->orig_size;
		if (orig > orig_e)
			failed("orig is out of bounds");
	}

	if (txt != txt_e)
		return 0;
	if (orig != orig_e)
		return 0;

	return 1;
}

int token_list_sanity_check(const struct token_list *tl, FILE *err) {
	struct token *first = tl->first;
	struct token *last = tl->last;
	struct token *i;
	int initial = 1;

	if (tl->first == NULL || tl->last == NULL)
		failed("Token list is completely empty");

	if (first->type!=TOK_STARTLINE ||
			first->txt!=tl->txt || first->txt_size!=0 ||
			first->orig!=tl->orig || first->orig_size!=0 ||
			first->line!=0 || first->col!=0)
		failed("Token list does not start with a valid TOK_STARTLINE");

	if (first->prev!=NULL || last->next!=NULL)
		failed("Token edge links are not NULL");

	for (i=first; i; i=i->next) {
		//Verify line,col
		if (tl->tlines[i->line] + i->col != i->txt)
			failed("line,col is wrong against txt");
		if (tl->olines[i->line] + i->col != i->orig)
			failed("line,col is wrong against orig");

		//Make sure tokens have proper sizes
		if (i->type!=TOK_STARTLINE && (i->txt_size==0 || i->orig_size==0 || i->txt_size > i->orig_size) )
			failed("Token is empty");
		if (i->type==TOK_STARTLINE && (i->txt_size!=0 || i->orig_size!=0) )
			failed("TOK_STARTLINE is non-empty");

		//Make sure TOK_WHITE actually contains white tokens
		if (i->type==TOK_WHITE) {
			const char *s = i->txt, *e = s+i->txt_size;
			while (s<e && (cspace(*s) || creturn(*s)))
				s++;
			if (s != e)
				failed("TOK_WHITE token contains non-white characters");
		}

		//Make sure txt and orig match
		if (!txt_orig_matches(i->txt, i->txt_size, i->orig, i->orig_size)) {
			array_char buf = array_new(NULL);
			fprintf(err,
				"txt and orig do not match:\n"
				"\ttxt  = \"%s\"\n",
				escape_string(&buf, i->txt, i->txt_size) );
			fprintf(err, "\torig = \"%s\"\n",
				escape_string(&buf, i->orig, i->orig_size) );

			array_free(buf);
			return 0;
		}
\"%s\"\n", + escape_string(&buf, i->orig, i->orig_size) ); + + array_free(buf); + return 0; + } + + //Make sure tok_point_lookup returns correct point + { + struct tok_point tok_point; + const char *t=i->txt, *o=i->orig, *e=o+i->orig_size, *p; + size_t line=i->line, col=i->col; + + #define check(ptr) do { \ + if (tok_point_lookup(&tok_point, ptr, tl)) { \ + if (tok_point.txt != t || tok_point.orig != o) \ + failed("tok_point_lookup on txt reported incorrect txt/orig (orig is %d, should be %d)", \ + (int)(tok_point.orig-i->orig), (int)(o-i->orig)); \ + if (tok_point.line != line || tok_point.col != col) \ + failed("tok_point_lookup on txt reported incorrect line/col (off by %d, %d)", \ + (int)(tok_point.line-line), (int)(tok_point.col-col)); \ + } else if (initial) {\ + failed("tok_point_lookup failed on initial token list"); \ + } \ + } while(0) + + for (;;) { + while (is_backslash_break(&p, o, e)) { + while (o= e) + break; + do { + if (creturn(*o)) { + p = o+1; + if (porig, *e = s+tl->orig_size; + size_t i, line_count = tl->olines_size; + + //both line arrays should be exactly the same size + if (tl->olines_size != tl->tlines_size) + return 0; + + for (i=0; stlines[i+1] - tl->tlines[i]; + else + tline_size = tl->txt+tl->txt_size - tl->tlines[i]; + + while (solines[i]) + return 0; + + //verify that tlines elements are in range + p = tl->tlines[i]; + if (p < tl->txt || p+tline_size > tl->txt+tl->txt_size) + return 0; + + //verify that original lines have sizes >= the unbroken lines + if (oline_size < tline_size) + return 0; + + //if sizes are inconsistent, make sure it is due to a backslash escape + if (oline_size > tline_size) { + p = line_start+tline_size; + if (*p++ != '\\') + return 0; + while (polines[i], + tl->tlines[i], + tline_size) ) + return 0; + } + } + + if (initial && !token_list_sanity_check_initial(tl, err)) + failed("Initial sanity checks failed. Has the list been modified after it was returned from tokenize() ?"); + + return 1; +} + +#undef failed + +static char *sprint_token_flags(char buf[3], struct token_flags flags) { + buf[0] = flags.pp ? 'p' : '-'; + buf[1] = flags.pp_directive ? 'D' : '-'; + buf[2] = 0; + return buf; +} + +void token_list_dump(const struct token_list *tl, FILE *f) { + struct token *tok; + array_char buf = array_new(NULL); + size_t i = 0; + char buf2[8]; + const char *token_type_str[] = { + "TOK_INTEGER ", + "TOK_FLOATING ", + "TOK_OPERATOR ", + "TOK_KEYWORD ", + "TOK_IDENTIFIER ", + "TOK_CHAR ", + "TOK_STRING ", + "TOK_LEADING_POUND", + "TOK_STRING_IQUOTE", + "TOK_STRING_IANGLE", + "TOK_CCOMMENT ", + "TOK_CPPCOMMENT ", + "TOK_WHITE ", + "TOK_STARTLINE ", + "TOK_STRAY " + }; + + for (tok=tl->first; tok; tok=tok->next) { + fprintf(f, "%lu\t%s\t%s\t\"%s\"", (unsigned long)(i++), + token_type_str[tok->type], + sprint_token_flags(buf2, tok->flags), + escape_string(&buf, tok->txt, tok->txt_size)); + #if 1 //print tok->orig + fprintf(f, "\t\"%s\"\n", escape_string(&buf, tok->orig, tok->orig_size)); + #else + fprintf(f, "\n"); + #endif + } + + array_free(buf); +} + +void tok_message_print(struct tok_message *m, struct token_list *tl) { + struct tok_point pt; + int resolved = tok_point_lookup(&pt, m->location, tl); + + if (tl->filename) { + printf("%s:%s", tl->filename, resolved ? "" : " "); + } + + if (resolved) { + printf("%zu:%zu %s: %s\n", + pt.line+1, pt.col+1, + m->level==TM_DEBUG ? "debug" : + m->level==TM_INFO ? "info" : + m->level==TM_WARN ? "warning" : + m->level==TM_ERROR ? "error" : + m->level==TM_BUG ? 
"BUG" : + "???", + m->message); + } else { + printf("%s: %s\n", + m->level==TM_DEBUG ? "debug" : + m->level==TM_INFO ? "info" : + m->level==TM_WARN ? "warning" : + m->level==TM_ERROR ? "error" : + m->level==TM_BUG ? "BUG" : + "???", + m->message); + } +} + +void tok_message_dump(struct tok_message *m) { + printf("%s: %s: %s\n", + m->level==TM_DEBUG ? "debug" : + m->level==TM_INFO ? "info" : + m->level==TM_WARN ? "warning" : + m->level==TM_ERROR ? "error" : + m->level==TM_BUG ? "BUG" : + "???", m->path, m->message); +} + +void tok_message_add(tok_message_queue *mq, enum tok_message_level level, + const char *path, const char *loc, const char *fmt, ...) { + struct tok_message msg = {.level=level, .path=path, .location=loc}; + va_list ap; + + if (!mq) + return; + + va_start(ap, fmt); + msg.message = talloc_vasprintf(mq->item, fmt, ap); + va_end(ap); + + enqueue(*mq, msg); +} + +void tok_message_queue_dump(const tok_message_queue *mq) { + size_t i; + for (i=0; i