Added module ccan_tokenizer from snapshot at:
[ccan] / ccan / ccan_tokenizer / ccan_tokenizer.c
1 /*
2         Copyright (c) 2009  Joseph A. Adams
3         All rights reserved.
4         
5         Redistribution and use in source and binary forms, with or without
6         modification, are permitted provided that the following conditions
7         are met:
8         1. Redistributions of source code must retain the above copyright
9            notice, this list of conditions and the following disclaimer.
10         2. Redistributions in binary form must reproduce the above copyright
11            notice, this list of conditions and the following disclaimer in the
12            documentation and/or other materials provided with the distribution.
13         3. The name of the author may not be used to endorse or promote products
14            derived from this software without specific prior written permission.
15         
16         THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17         IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18         OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19         IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20         INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21         NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22         DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23         THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24         (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25         THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "ccan_tokenizer.h"
29
30 #include <ccan/talloc/talloc.h>
31
32 #include <assert.h>
33
34 //Shown by operator precedence; based on
35 // http://tigcc.ticalc.org/doc/opers.html#precedence .
36
//Operator, keyword, and preprocessor-keyword table used to build the
//tokenizer's lookup dictionary (see tokenize()).  Operators are grouped
//by C operator precedence per the reference above.
//NOTE(review): dict_build() presumably performs longest-match lookup
//  itself, so the relative order of entries sharing a prefix (e.g. "<",
//  "<<", "<<=") should not matter -- verify against the dict module.
static struct dict_entry c_dictionary[] = {
//1. Highest
	{'(',"("}, {')',")"},
	{'[',"["}, {']',"]"},
	{'{',"{"}, {'}',"}"},
	{'.',"."},
	{PTR_OP,"->"},
	
//2. Unary
	{'!',"!"}, {'~',"~"}, //prefix
	{INC_OP,"++"}, {DEC_OP,"--"}, //prefix or postfix
	// + - & *
	
//3. Multiplicative
	// *
	{'/',"/"}, {'%',"%"},
	
//4. Additive
	// + -
	
//5. Shift
	{LEFT_OP,"<<"}, {RIGHT_OP,">>"},
	
//6. Relational
	{'<',"<"}, {'>',">"},
	{LE_OP,"<="}, {GE_OP,">="},
	
//7. Equality
	{EQ_OP,"=="}, {NE_OP,"!="},
	
//8. Bitwise AND
	// &
//9. Bitwise XOR
	{'^',"^"},
//10. Bitwise OR
	{'|',"|"},

//11. Logical AND
	{AND_OP,"&&"},
//12. Logical OR
	{OR_OP,"||"},

//13. Conditional
	{'?',"?"}, {':',":"},

//14. Assignment
	{'=',"="},
	{MUL_ASSIGN,"*="}, {DIV_ASSIGN,"/="}, {MOD_ASSIGN,"%="},
	{ADD_ASSIGN,"+="}, {SUB_ASSIGN,"-="},
	{AND_ASSIGN,"&="}, {XOR_ASSIGN,"^="}, {OR_ASSIGN,"|="},
	{LEFT_ASSIGN,"<<="}, {RIGHT_ASSIGN,">>="},
	
//15. Comma
	{',',","},

//16. Semicolon
	{';',";"},
	
//Misc
	{ELLIPSIS,"..."},
	{'#',"#"},
	{DOUBLE_POUND,"##"},

//Ambiguous
	//unary or binary
	{'+',"+"}, {'-',"-"},
	{'&',"&"}, {'*',"*"},

//Keywords
	{_BOOL, "_Bool"},
	{_COMPLEX, "_Complex"},
	{_IMAGINARY, "_Imaginary"},
	{BREAK, "break"},
	{CASE, "case"},
	{CHAR, "char"},
	{CONST, "const"},
	{CONTINUE, "continue"},
	{DEFAULT, "default"},
	{DO, "do"},
	{DOUBLE, "double"},
	{ELSE, "else"},
	{ENUM, "enum"},
	{EXTERN, "extern"},
	{FLOAT, "float"},
	{FOR, "for"},
	{GOTO, "goto"},
	{IF, "if"},
	{INLINE, "inline"},
	{INT, "int"},
	{LONG, "long"},
	{REGISTER, "register"},
	{RESTRICT, "restrict"},
	{RETURN, "return"},
	{SHORT, "short"},
	{SIGNED, "signed"},
	{SIZEOF, "sizeof"},
	{STATIC, "static"},
	{STRUCT, "struct"},
	{SWITCH, "switch"},
	{TYPEDEF, "typedef"},
	{UNION, "union"},
	{UNSIGNED, "unsigned"},
	{VOID, "void"},
	{VOLATILE, "volatile"},
	{WHILE, "while"},

//Preprocessor keywords (except those already defined)
	{VA_ARGS, "__VA_ARGS__"},
	{DEFINE, "define"},
	{ELIF, "elif"},
//	{ELSE, "else"},
	{ENDIF, "endif"},
	{ERROR, "error"},
//	{IF, "if"},
	{IFDEF, "ifdef"},
	{IFNDEF, "ifndef"},
	{INCLUDE, "include"},
	{LINE, "line"},
	{PRAGMA, "pragma"},
	{UNDEF, "undef"},
	{WARNING, "warning"},
};
159
#if 0 //dead code: old tokenizer constructor, compiled out -- kept for reference

//Allocate a tokenizer with its own message queue and dictionary.
//NOTE(review): superseded by the lazily-built global tokenizer_dict used
//  by tokenize(); delete once no longer useful as reference.
struct tokenizer *tokenizer_new(void *ctx) {
	struct tokenizer *t = talloc(ctx, struct tokenizer);
	t->ctx = ctx;
	queue_init(t->mq, t);
	t->dict = dict_build(t, c_dictionary, sizeof(c_dictionary)/sizeof(*c_dictionary));
	
	return t;
}

#endif
172
173 #define MESSAGE_PATH "tokenize/"
174
//Build tl->txt: a copy of tl->orig with backslash-broken lines joined.
//Also records line-start pointer arrays for both texts:
//  tl->olines -- start of each line in the original text
//  tl->tlines -- start of each corresponding line in the rewritten text
//Accepts "\n", "\r", "\r\n", and "\n\r" as line terminators, and warns
//(via mq) about whitespace between a continuation backslash and the
//newline.
static void unbreak_backslash_broken_lines(struct token_list *tl, tok_message_queue *mq) {
	const char *s = tl->orig, *e = s+tl->orig_size;
	array_char txt = array_new(tl);
	array(const char*) olines = array_new(tl);
	array(const char*) tlines = array_new(tl);
	
	do {
		const char *line_start = s, *line_end;
		const char *lnw; //last non-white
		size_t start_offset = txt.size;
		
		//scan to the next line and find the last non-white character in the line
		while (s<e && !creturn(*s)) s++;
		line_end = s;
		lnw = s;
		while (lnw>line_start && cspace(lnw[-1])) lnw--;
		if (s<e && creturn(*s)) {
			s++;
			//check for non-standard newlines (i.e. "\r", "\r\n", or "\n\r")
			//('\n'+'\r'-s[-1] is the *other* newline character)
			if (s<e && *s=='\n'+'\r'-s[-1])
				s++;
		}
		
		//add the backslash-break-free version of the text
		//(a line is "broken" when its last non-white char is a backslash
		// and it is not the final line of the document)
		if (lnw>line_start && lnw[-1]=='\\' && line_end<e) {
			array_append_items(txt, line_start, lnw-1-line_start);
			if (lnw<e && cspace(*lnw)) {
				tok_msg_warn(spaces_after_backslash_break, lnw,
					"Trailing spaces after backslash-broken line");
			}
		} else
			array_append_items(txt, line_start, s-line_start);
		
		//add the line starts for this line
		array_append(olines, line_start);
		array_append(tlines, (const char*)start_offset);
			//Since the txt buffer moves when expanded, we're storing offsets
			//  for now.  Once we're done building txt, we can add the base
			//  of it to all the offsets to make them pointers.
	} while (s<e);
	
	//stick a null terminator at the end of the text
	array_realloc(txt, txt.size+1);
	txt.item[txt.size] = 0;
	
	//convert the line start offsets to pointers
	array_for(i, tlines, *i = txt.item + (size_t)*i);
	
	tl->olines = olines.item;
	tl->olines_size = olines.size;
	tl->txt = txt.item;
	tl->txt_size = txt.size;
	tl->tlines = tlines.item;
	tl->tlines_size = tlines.size;
}
230
231 static void normal_keyword(struct token *tok) {
232         if (tok->type==TOK_KEYWORD &&
233                         (opkw_is_directive_only(tok->opkw) || tok->opkw==VA_ARGS))
234                 tok->type = TOK_IDENTIFIER;
235 }
236
237 static int define_parmlist_has_ellipsis(struct token *start, struct token *end) {
238         while (end>start && token_is_ignored(end-1)) end--;
239         return (end-->start && end->type==TOK_OPERATOR && end->opkw==ELLIPSIS);
240 }
241
//Used to label __VA_ARGS__ as keywords within applicable macro expansions
//Start should follow the DEFINE directive keyword
//
//Walks the tokens of a #define line: the defined identifier, then (if
//present) a parenthesized parameter list, then the expansion.  Inside the
//expansion of a variadic macro, __VA_ARGS__ is left as a keyword; in every
//other position, directive-only keywords and __VA_ARGS__ are demoted to
//identifiers via normal_keyword().
static void this_is_a_define(struct token *start, struct token *end) {
	struct token *i = start, *pl_start;
	
	//skip past the identifier that is defined
	while (i<end && token_is_ignored(i)) i++;
	if (i >= end)
		return;
	 //TODO:  check i->type to make sure it's an identifier, throw error otherwise
	normal_keyword(i++);
	
	//see if this is actually a variadic macro
	//NOTE(review): an object-like macro (no '(') also lands in not_va_args
	if (!(i<end && i->type==TOK_OPERATOR && i->opkw=='('))
		goto not_va_args;
	pl_start = ++i;
	while (i<end && !(i->type==TOK_OPERATOR && i->opkw==')'))
		normal_keyword(i++);
	if (!define_parmlist_has_ellipsis(pl_start, i++))
		goto not_va_args;
	
	//We have arrived at the macro expansion and know there is a ... argument
	//Thus, we'll only change directive-only keywords to identifiers
	for(; i<end; i++) {
		if (i->type==TOK_KEYWORD && opkw_is_directive_only(i->opkw))
			i->type = TOK_IDENTIFIER;
	}
	
not_va_args:
	//non-variadic: demote everything remaining, including __VA_ARGS__
	while (i < end)
		normal_keyword(i++);
}
274
275 //fill the flags field of each token and untangle keywords and such
276 static void finalize_line(struct token *start, struct token *end) {
277         struct token *i = start, *j;
278         
279         assert(start<end && start->type==TOK_STARTLINE);
280         i++;
281         
282         while (i<end && token_is_ignored(i)) i++;
283         
284         if (i<end && i->type==TOK_OPERATOR && i->opkw=='#') {
285         //preprocessor line
286                 i->type = TOK_LEADING_POUND;
287                 
288                 //set pp on all tokens in this line
289                 for (j=start; j<end; j++)
290                         j->flags.pp = 1;
291                 
292                 //find the relevant token after the '#'
293                 for (i++; i<end; i++) {
294                         if (!token_is_ignored(i)) {
295                                 i->flags.pp_directive = 1;
296                                 if (i->type==TOK_KEYWORD && !opkw_is_directive(i->opkw))
297                                         i->type = TOK_IDENTIFIER;
298                                 //TODO:  Handle invalid preprocessor directives (e.g. #+ )
299                                 
300                                 if (i->type==TOK_KEYWORD && i->opkw==DEFINE) {
301                                         for (j=i+1; j<end; j++)
302                                         this_is_a_define(i+1, end);
303                                 } else {
304                                         while (++i < end)
305                                                 normal_keyword(i);
306                                 }
307                                 break;
308                         }
309                 }
310         } else {
311         //normal line
312                 while (i < end)
313                         normal_keyword(i++);
314         }
315 }
316
//fill the list, flags, line, col, orig, and orig_size fields of each token
//convert identifiers mistaken for preprocessor keywords (e.g. ifdef) to identifiers
//
//Walks the token array once, advancing a cursor (lss) through the
//rewritten-text line starts (tl->tlines) to compute each token's
//line/column, then derives the token's position in the original text from
//tl->olines.  Each completed line (delimited by TOK_STARTLINE) is passed
//to finalize_line() for a second pass.
static void finalize(struct token_list *tl, struct token *start, struct token *end) {
	const char * const *lss = tl->tlines;
	const char * const *lse = lss + tl->tlines_size;
	struct token *i;
	struct token *startline = NULL;
	
	assert(start < end);
	
	tl->first = start;
	tl->last = end-1;
	
	for (i=start; ; i++) {
		//perform a second pass on each line
		if (i >= end || i->type == TOK_STARTLINE) {
			if (startline)
				finalize_line(startline, i);
			startline = i;
		}
		
		if (i >= end) {
			//the final token's orig extends to the end of the document
			end[-1].orig_size = tl->orig+tl->orig_size - end[-1].orig;
			break;
		}
		
		//set up the list links
		i->prev = i>start ? i-1 : NULL;
		i->next = i+1<end ? i+1 : NULL;
		
		//if i->txt starts on a later line, advance to it
		//(the strict "> lss[0]" guard matters only when consecutive line
		// starts are equal, i.e. empty rewritten lines)
		while (lss+1<lse && i->txt >= lss[1] && i->txt > lss[0])
			lss++;
		
		//set up line, col, orig, and orig_size
		i->line = lss - tl->tlines;
		i->col = i->txt - *lss;
		i->orig = tl->olines[i->line] + i->col;
		if (i > start)
			i[-1].orig_size = i->orig - i[-1].orig;
		
		assert(i->line < tl->olines_size);
		
		//clear the flags
		memset(&i->flags, 0, sizeof(i->flags));
	}
}
364
//Append a token spanning [orig, s) to the token array, with extra fields
//supplied as designated initializers.  NOTE(review): relies on locals
//named `orig`, `s`, and `array` at the expansion site (inside tokenize()).
#define add(...) do { \
		struct token tok = {__VA_ARGS__}; \
		tok.txt = orig; \
		tok.txt_size = s-orig; \
		array_append(array, tok); \
	} while (0)

//Characters that cannot legitimately appear in C source text.
#define cstray(c) (ccontrol(c) || cextended(c) || (c)=='@' || (c)=='`' || (c)=='\\')
//Characters valid within an identifier.
#define cident(c) (cletter(c) || cdigit(c) || c=='_' || c=='$')
	//believe it or not, $ is a valid character in an identifier
375
//Shared operator/keyword dictionary, built lazily on the first call to
//tokenize() and released at program exit via free_tokenizer_dict().
struct dict *tokenizer_dict = NULL;

//atexit() handler that frees the lazily-built global dictionary.
static void free_tokenizer_dict(void) {
	talloc_free(tokenizer_dict);
}
381
//Tokenize the C source text orig[0..orig_size) into a token_list.
//
//The returned list is talloc-allocated as a child of `orig`, so `orig` is
//presumably itself a talloc-allocated buffer and freeing it frees the
//list -- TODO confirm against callers.  Warnings and errors encountered
//while scanning are posted to `mq`.
//
//Scanning operates on the backslash-unbroken text produced by
//unbreak_backslash_broken_lines(); finalize() later maps tokens back to
//positions in the original text.
struct token_list *tokenize(const char *orig, size_t orig_size,
				tok_message_queue *mq) {
	struct token_list *tl = talloc(orig, struct token_list);
	const char *s, *e;
	size_t stray_count=0, cr_count=0;
	array(struct token) array = array_new(tl);
	int only_pound_include = 0;
		//nonzero while the scanner is positioned right after "#include",
		//so that "<...>" or "\"...\"" is lexed as an include string
	
	if (!tokenizer_dict) {
		tokenizer_dict = dict_build(NULL, c_dictionary,
			sizeof(c_dictionary)/sizeof(*c_dictionary));
		atexit(free_tokenizer_dict);
	}
	
	tl->orig = orig;
	tl->orig_size = orig_size;
	unbreak_backslash_broken_lines(tl, mq);
	tl->filename = NULL;
	
	s = tl->txt;
	e = s + tl->txt_size;
	
	array_appends(array, {
		.type = TOK_STARTLINE,
		.txt = s,
		.txt_size = 0
	} );
	
	while (s<e) {
		const char *orig = s; //NOTE(review): shadows the parameter `orig`
		char c = *s++;
		int added_something = 1;
		
		if (cstray(c)) {
			stray_count++;
			while (s<e && cstray(*s)) {
				s++;
				stray_count++;
			}
			add(.type = TOK_STRAY);
			
			/* This has the potential to be very noisy on binary
			   files, but it really is quite useful. */
			tok_msg_error(stray_segment, orig,
				"%zu stray characters", s-orig);
		
		} else if (creturn(c)) {
			//check for non-standard newlines (i.e. "\r", "\r\n", or "\n\r")
			//('\n'+'\r'-c is the *other* newline character)
			if (s<e && *s=='\n'+'\r'-c) {
				s++;
				cr_count++;
			} else if (c=='\r')
				cr_count++;
			
			add(.type = TOK_WHITE);
			orig = s;
			
			//add a TOK_STARTLINE for the next line unless this is the end of the document
			if (s<e)
				add(.type = TOK_STARTLINE);
			
			only_pound_include = 0;
		
		} else if (cspace(c)) {
			//skip over the remaining whitespace
			while (s<e && cspace(*s)) s++;
			add(.type = TOK_WHITE);
			added_something = 0;
		
		} else if (cdigit(c) || (c=='.' && s<e && cdigit(*s))) {
			//number (or a '.' immediately followed by a digit)
			struct token tok;
			s = read_cnumber(&tok, s-1, e, mq);
			tok.txt = orig;
			tok.txt_size = s-orig;
			array_append(array, tok);
			
		} else if (csymbol(c) || cident(c)) {
			if (only_pound_include && (c=='"' || c=='<')) { //include string
				char *include;
				char end = c=='"' ? '"' : '>';
				short type = c=='"' ? TOK_STRING_IQUOTE : TOK_STRING_IANGLE;
				
				while (s<e && !creturn(*s) && *s!=end) s++;
				include = talloc_strndup(tl, orig+1, s-(orig+1));
				
				if (s<e && *s==end) {
					s++;
				} else {
					tok_msg_error(include_missing_terminator, orig,
						"Missing terminating %c character", end);
				}
				
				add(.type = type,
					{.include = include});
			} else if (c=='\'' || c=='\"') { //character or string literal
				array_char string = array_new(tl);
				s = read_cstring(&string, s, e, c, mq);
				if (s<e) s++; //advance past endquote (if available)
				add(.type = c=='\'' ? TOK_CHAR : TOK_STRING,
				    {.string = string});
			} else if (c=='/' && s<e && (*s=='*' || *s=='/')) { //comment
				if (*s++ == '*') { /* C-style comment */
					const char *comment_start = s-2;
					for (;;s++) {
						if (s+1 >= e) {
							s = e;
							tok_msg_error(unterminated_comment, comment_start,
								"Unterminated comment");
							break;
						}
						if (s[0]=='*' && s[1]=='/') {
							s += 2;
							break;
						}
					}
					add(.type = TOK_CCOMMENT);
				} else { // C++-style comment
					while (s<e && !creturn(*s)) s++;
					add(.type = TOK_CPPCOMMENT);
				}
				added_something = 0;
			
			} else { //operator, keyword, or identifier
				struct dict_entry *ent;
				const char *ident_e = --s;
				while (ident_e<e && cident(*ident_e) ) ident_e++;
				
				ent = dict_lookup(tokenizer_dict, &s, e);
				if (cident(c)) { //keyword or identifier
					//only a keyword if the dictionary match covers
					//the whole identifier (e.g. "form" is not FOR)
					if (ent && s==ident_e) {
						add(.type = TOK_KEYWORD,
							{.opkw = ent->id});
						if (ent->id == INCLUDE) {
							//hacky way to lex #include string properly
							//(walk back: ignored tokens, '#', ignored
							// tokens, TOK_STARTLINE)
							struct token *ts = array.item;
							struct token *tp = ts+array.size-1;
							while (tp>ts && token_is_ignored(tp-1))
								tp--;
							if (tp>ts && token_is_op(tp-1, '#')) {
								tp--;
								while (tp>ts && token_is_ignored(tp-1))
									tp--;
								if (tp>ts && tp[-1].type==TOK_STARTLINE) {
									only_pound_include = 1;
									continue; //keep only_pound_include set
								}
							}
						}
					} else {
						s = ident_e;
						add(.type = TOK_IDENTIFIER);
					}
				} else if (ent) { //operator
					add(.type = TOK_OPERATOR,
					    {.opkw = ent->id});
				} else { //invalid symbol (shouldn't happen)
					tok_msg_bug(unrecognized_symbol, s,
						"Unrecognized symbol \'%c\'", c);
					s++;
					add(.type = TOK_STRAY);
				}
			}
		}
		
		if (added_something)
			only_pound_include = 0;
	}
	
	/*if (stray_count) {
		tok_msg_error(stray_characters, NULL,
			"%lu stray characters in text", (unsigned long)stray_count);
	}*/
	if (cr_count) {
		tok_msg_warn(nonstandard_newlines, NULL,
			"Text contains non-standard line terminators");
	}
	
	finalize(tl, array.item, array.item+array.size);
	
	return tl;
}
563
564 size_t token_list_count(const struct token_list *tl) {
565         size_t ret = 0;
566         const struct token *i;
567         
568         for (i=tl->first; i; i=i->next)
569                 ret++;
570         
571         return ret;
572 }
573
//Binary-search `lines` (an ascending array of line-start pointers) for the
//line containing `ptr`, i.e. the last index whose start is <= ptr.  When
//several consecutive entries share the same start (empty lines), the
//*last* of them is returned.
static size_t find_line(const char *ptr, const char * const *lines, size_t line_count) {
	size_t lo = 0;
	size_t span = line_count;
	
	while (span > 1) {
		size_t half = span >> 1;
		if (ptr < lines[lo + half]) {
			span = half;
		} else {
			lo += half;
			span -= half;
		}
	}
	
	//select the *last* of equivalent lines
	while (lo + 1 < line_count && lines[lo] == lines[lo + 1])
		lo++;
	
	// (don't) select the *first* of equivalent lines
	//while (lo > 0 && lo < line_count && lines[lo - 1] == lines[lo])
	//	lo--;
	
	return lo;
}
598
599 int tok_point_lookup(struct tok_point *out, const char *ptr,
600                         const struct token_list *tl) {
601         size_t line_count = tl->olines_size;
602         
603         memset(out, 0, sizeof(*out));
604         if (!tl)
605                 return 0;
606         
607         if (ptr >= tl->txt && ptr <= tl->txt+tl->txt_size) {
608                 out->txt = ptr;
609                 out->line = find_line(ptr, tl->tlines, line_count);
610                 if (out->line < line_count) {
611                         out->col = ptr - tl->tlines[out->line];
612                         out->orig = tl->olines[out->line] + out->col;
613                 } else {
614                         out->col = 0;
615                         out->orig = tl->orig + tl->orig_size;
616                 }
617                 return 1;
618         } else if (ptr >= tl->orig && ptr <= tl->orig+tl->orig_size) {
619                 out->orig = ptr;
620                 out->line = find_line(ptr, tl->olines, line_count);
621                 if (out->line < line_count) {
622                         const char *tline_start = tl->tlines[out->line];
623                         const char *tline_end = out->line+1 < line_count ?
624                                 tl->tlines[out->line+1] :
625                                 tl->txt + tl->txt_size;
626                         
627                         out->col = ptr - tl->olines[out->line];
628                         out->txt = tline_start + out->col;
629                         
630                         if (out->txt > tline_end)
631                                 out->txt = tline_end;
632                 } else {
633                         out->col = 0;
634                         out->txt = tl->txt + tl->txt_size;
635                 }
636                 return 1;
637         } else {
638                 return 0;
639         }
640 }
641
642 static char *escape_string(array_char *buf, const char *str, size_t size) {
643         const char *s = str, *e = s+size;
644         array_from_lit(*buf, "");
645         
646         for (;s<e;s++) {
647                 char buffer[8];
648                 const char *esc = buffer;
649                 unsigned char c = (unsigned char)*s;
650                 if (ccontrol(c))
651                         sprintf(buffer, "\\x%02X", c);
652                 else switch(c) {
653                         case '\t': esc = "\\t"; break;
654                         case '\n': esc = "\\n"; break;
655                         case '\v': esc = "\\v"; break;
656                         case '\f': esc = "\\f"; break;
657                         case '\r': esc = "\\r"; break;
658                         case '"': esc = "\\\""; break;
659                         case '\\': esc = "\\\\"; break;
660                         default:
661                                 buffer[0] = c;
662                                 buffer[1] = 0;
663                 }
664                 array_append_string(*buf, esc);
665         }
666         
667         return buf->item;
668 }
669
//Verify that txt[0..txt_size) equals orig[0..orig_size) once
//backslash-newline splices (backslash, optional spaces, newline) are
//removed from orig.  Returns 1 when the two texts correspond exactly,
//0 otherwise.  Used by the sanity checks below.
static int txt_orig_matches(const char *txt, size_t txt_size, const char *orig, size_t orig_size) {
	const char *ts = txt, *te = ts+txt_size;
	const char *os = orig, *oe = os+orig_size;
	
	do {
		const char *ob = os; //start of next backslash break
		const char *obe = os; //end of next backslash break
		size_t size; //amount of text to compare for this round
		
		while (ob<oe && *ob!='\\') ob++;
		obe = ob;
		if (obe < oe) { //there's a backslash
			obe++;
			while (obe<oe && cspace(*obe)) obe++;
			if (obe<oe && creturn(*obe)) { //there's a backslash-broken line
				obe++;
				//consume the second character of a two-character
				//newline ("\r\n" or "\n\r")
				if (obe<oe && *obe == '\n'+'\r'-obe[-1])
					obe++;
			} else //this is just a plain old backslash
				ob = obe;
		}
		
		size = ob-os;
		
		//compare the chunk before the splice
		if (ts+size > te || memcmp(ts, os, size))
			return 0;
		ts += size;
		os = obe;
	} while (ts<te);
	
	//both cursors must be fully consumed
	if (ts != te || os != oe)
		return 0;
	
	return 1;
}
705
//Return 1 (and set *end past the break) if [s, e) begins with a backslash
//line continuation: '\\', optional spaces, then a newline (any of "\n",
//"\r", "\r\n", "\n\r").  Return 0 otherwise; *end is untouched on failure.
static int is_backslash_break(const char **end, const char *s, const char *e) {
	if (s >= e || *s != '\\')
		return 0;
	s++;
	while (s < e && cspace(*s))
		s++;
	if (s >= e || !creturn(*s))
		return 0;
	s++;
	//consume the second character of a two-character newline
	if (s < e && *s == '\n' + '\r' - s[-1])
		s++;
	*end = s;
	return 1;
}
721
722 #define failed(fmt, ...) do {fprintf(err, fmt "\n", ##__VA_ARGS__); return 0; } while(0)
723
724 //tests that should pass on an untainted token list out of the tokenize() function
725 static int token_list_sanity_check_initial(const struct token_list *tl, FILE *err) {
726         struct token *first = tl->first;
727         struct token *last = tl->last;
728         struct token *i;
729         const char *txt=tl->txt, *orig=tl->orig;
730         const char *txt_e = txt+tl->txt_size, *orig_e = orig+tl->orig_size;
731         
732         if ((char*)first > (char*)last ||
733                 (size_t)((char*)last - (char*)first) % sizeof(struct token))
734                 failed("Token list pointers don't look right");
735         
736         //token list should not end with TOK_STARTLINE unless
737         //  the document is empty
738         if (last!=first && last->type==TOK_STARTLINE)
739                 return 0;
740         
741         for (i=first; i; i=i->next) {
742                 //Verify list links
743                 if (i != first && i->prev != i-1)
744                         failed("list.prev is incorrect");
745                 if (i != last && i->next != i+1)
746                         failed("list.next is incorrect");
747                 
748                 //Make sure txt segments fill the entire tl->txt
749                 if (i->txt != txt)
750                         failed("txt does not fill the token list");
751                 txt += i->txt_size;
752                 if (txt > txt_e)
753                         failed("txt is out of bounds");
754                 
755                 //Make sure orig segments fill the entire tl->orig
756                 if (i->orig != orig)
757                         failed("orig does not fill the token list");
758                 orig += i->orig_size;
759                 if (orig > orig_e)
760                         failed("orig is out of bounds");
761         }
762         
763         if (txt != txt_e)
764                 return 0;
765         if (orig != orig_e)
766                 return 0;
767         
768         return 1;
769 }
770
//Full sanity check of a token list.  Verifies the leading TOK_STARTLINE,
//edge links, per-token line/col bookkeeping, token sizes, that txt and orig
//match except for backslash line breaks, that tok_point_lookup resolves
//every byte of every token, and that the olines/tlines arrays describe the
//two text copies consistently.  Returns 1 on success, 0 on failure;
//diagnostics go to err via the failed() macro.
int token_list_sanity_check(const struct token_list *tl, FILE *err) {
	struct token *first = tl->first;
	struct token *last = tl->last;
	struct token *i;
	int initial = 1; //NOTE(review): always 1 here; presumably intended to
	                 //  distinguish pristine lists — confirm
	
	if (tl->first == NULL || tl->last == NULL)
		failed("Token list is completely empty");
	
	//The list must begin with an empty TOK_STARTLINE at line 0, col 0.
	if (first->type!=TOK_STARTLINE ||
	    first->txt!=tl->txt || first->txt_size!=0 ||
	    first->orig!=tl->orig || first->orig_size!=0 ||
	    first->line!=0 || first->col!=0)
		failed("Token list does not start with a valid TOK_STARTLINE");
	
	if (first->prev!=NULL || last->next!=NULL)
		failed("Token edge links are not NULL");
	
	for (i=first; i; i=i->next) {
		//Verify line,col against both text copies
		if (tl->tlines[i->line] + i->col != i->txt)
			failed("line,col is wrong against txt");
		if (tl->olines[i->line] + i->col != i->orig)
			failed("line,col is wrong against orig");
		
		//Make sure tokens have proper sizes: orig may only be larger
		//than txt (backslash breaks are removed from txt, never added).
		if (i->type!=TOK_STARTLINE && (i->txt_size==0 || i->orig_size==0 || i->txt_size > i->orig_size) )
			failed("Token is empty");
		if (i->type==TOK_STARTLINE && (i->txt_size!=0 || i->orig_size!=0) )
			failed("TOK_STARTLINE is non-empty");
		
		//Make sure TOK_WHITE actually contains white tokens
		if (i->type==TOK_WHITE) {
			const char *s = i->txt, *e = s+i->txt_size;
			while (s<e && cwhite(*s)) s++;
			if (s != e)
				failed("TOK_WHITE does not contain only white characters");
		}
		
		//Make sure txt and orig match exactly except for backslash line breaks
		if (!txt_orig_matches(i->txt, i->txt_size, i->orig, i->orig_size)) {
			array_char buf = array_new(NULL);
			fprintf(err,
				"txt and orig do not match:\n"
				"\ttxt  = \"%s\"\n",
				escape_string(&buf, i->txt, i->txt_size) );
			fprintf(err, "\torig = \"%s\"\n",
				escape_string(&buf, i->orig, i->orig_size) );
			
			array_free(buf);
			return 0;
		}
		
		//Make sure tok_point_lookup returns correct point
		{
			struct tok_point tok_point;
			const char *t=i->txt, *o=i->orig, *e=o+i->orig_size, *p;
			size_t line=i->line, col=i->col;
			
			//check(ptr): look up ptr and verify the reported
			//txt/orig/line/col against the values tracked below.
			//An unresolvable point is only fatal on an initial list.
			#define check(ptr) do { \
				if (tok_point_lookup(&tok_point, ptr, tl)) { \
					if (tok_point.txt != t || tok_point.orig != o) \
						failed("tok_point_lookup on txt reported incorrect txt/orig (orig is %d, should be %d)", \
						(int)(tok_point.orig-i->orig), (int)(o-i->orig)); \
					if (tok_point.line != line || tok_point.col != col) \
						failed("tok_point_lookup on txt reported incorrect line/col (off by %d, %d)", \
						(int)(tok_point.line-line), (int)(tok_point.col-col)); \
				} else if (initial) {\
					failed("tok_point_lookup failed on initial token list"); \
				} \
			} while(0)
			
			//Walk orig byte by byte, keeping t/line/col in step.
			for (;;) {
				//Backslash breaks exist only in orig; t does not advance.
				while (is_backslash_break(&p, o, e)) {
					while (o<p) {
						check(o);
						o++;
						col++;
					}
					col = 0;
					line++;
				}
				if (o >= e)
					break;
				do {
					if (creturn(*o)) {
						//Consume a one- or two-byte newline present
						//in both copies ('\n'+'\r'-c is the CR/LF
						//partner of c).
						p = o+1;
						if (p<e && *p=='\n'+'\r'-p[-1])
							p++;
						while (o<p) {
							check(o);
							check(t);
							t++, o++, col++;
						}
						line++;
						col = 0;
					} else {
						check(o);
						check(t);
						o++, t++, col++;
					}
				} while (o<e && *o!='\\');
			}
			
			#undef check
		}
	};
	
	//Verify olines and tlines
	{
		const char *s = tl->orig, *e = s+tl->orig_size;
		size_t i, line_count = tl->olines_size;
		
		//both line arrays should be exactly the same size
		if (tl->olines_size != tl->tlines_size)
			return 0;
		
		for (i=0; s<e; i++) {
			const char *line_start = s, *line_end;
			size_t tline_size, oline_size;
			const char *p;
			
			//Size of line i in txt: distance to the next tlines entry,
			//or to the end of txt for the last line.
			if (i+1 < line_count)
				tline_size = tl->tlines[i+1] - tl->tlines[i];
			else
				tline_size = tl->txt+tl->txt_size - tl->tlines[i];
			
			//Scan to the end of this orig line, consuming a one- or
			//two-byte newline sequence.
			while (s<e && !creturn(*s)) s++;
			line_end = s;
			if (s<e) {
				s++;
				if (s<e && *s=='\n'+'\r'-s[-1])
					s++;
			}
			
			oline_size = s-line_start;
			
			//verify that olines elements are correct
			if (line_start != tl->olines[i])
				return 0;
			
			//verify that tlines elements are in range
			p = tl->tlines[i];
			if (p < tl->txt || p+tline_size > tl->txt+tl->txt_size)
				return 0;
			
			//verify that original lines have sizes >= the unbroken lines
			if (oline_size < tline_size)
				return 0;
			
			//if sizes are inconsistent, make sure it is due to a backslash escape
			if (oline_size > tline_size) {
				p = line_start+tline_size;
				if (*p++ != '\\')
					return 0;
				while (p<e && cspace(*p)) p++;
				if (p != line_end)
					return 0;
			}
			
			//make sure the text of both copies match
			if ( memcmp(
				tl->olines[i],
				tl->tlines[i],
				tline_size) )
				return 0;
		}
	}
	
	if (initial && !token_list_sanity_check_initial(tl, err))
		failed("Initial sanity checks failed.  Has the list been modified after it was returned from tokenize() ?");
	
	return 1;
}
945
946 #undef failed
947
948 static char *sprint_token_flags(char buf[3], struct token_flags flags) {
949         buf[0] = flags.pp ? 'p' : '-';
950         buf[1] = flags.pp_directive ? 'D' : '-';
951         buf[2] = 0;
952         return buf;
953 }
954
955 void token_list_dump(const struct token_list *tl, FILE *f) {
956         struct token *tok;
957         array_char buf = array_new(NULL);
958         size_t i = 0;
959         char buf2[8];
960         const char *token_type_str[] = {
961                 "TOK_INTEGER      ",
962                 "TOK_FLOATING     ",
963                 "TOK_OPERATOR     ",
964                 "TOK_KEYWORD      ",
965                 "TOK_IDENTIFIER   ",
966                 "TOK_CHAR         ",
967                 "TOK_STRING       ",
968                 "TOK_LEADING_POUND",
969                 "TOK_STRING_IQUOTE",
970                 "TOK_STRING_IANGLE",
971                 "TOK_CCOMMENT     ",
972                 "TOK_CPPCOMMENT   ",
973                 "TOK_WHITE        ",
974                 "TOK_STARTLINE    ",
975                 "TOK_STRAY        "
976         };
977         
978         for (tok=tl->first; tok; tok=tok->next) {
979                 fprintf(f, "%lu\t%s\t%s\t\"%s\"", (unsigned long)(i++),
980                         token_type_str[tok->type],
981                         sprint_token_flags(buf2, tok->flags),
982                         escape_string(&buf, tok->txt, tok->txt_size));
983                 #if 1 //print tok->orig
984                 fprintf(f, "\t\"%s\"\n", escape_string(&buf, tok->orig, tok->orig_size));
985                 #else
986                 fprintf(f, "\n");
987                 #endif
988         }
989         
990         array_free(buf);
991 }
992
993 void tok_message_print(struct tok_message *m, struct token_list *tl) {
994         struct tok_point pt;
995         int resolved = tok_point_lookup(&pt, m->location, tl);
996         
997         if (tl->filename) {
998                 printf("%s:%s", tl->filename, resolved ? "" : " ");
999         }
1000         
1001         if (resolved) {
1002                 printf("%zu:%zu %s: %s\n",
1003                         pt.line+1, pt.col+1,
1004                         m->level==TM_DEBUG ? "debug" :
1005                         m->level==TM_INFO ? "info" :
1006                         m->level==TM_WARN ? "warning" :
1007                         m->level==TM_ERROR ? "error" :
1008                         m->level==TM_BUG ? "BUG" :
1009                         "???",
1010                         m->message);
1011         } else {
1012                 printf("%s: %s\n",
1013                         m->level==TM_DEBUG ? "debug" :
1014                         m->level==TM_INFO ? "info" :
1015                         m->level==TM_WARN ? "warning" :
1016                         m->level==TM_ERROR ? "error" :
1017                         m->level==TM_BUG ? "BUG" :
1018                         "???",
1019                         m->message);
1020         }
1021 }
1022
1023 void tok_message_dump(struct tok_message *m) {
1024         printf("%s: %s: %s\n",
1025                 m->level==TM_DEBUG ? "debug" :
1026                 m->level==TM_INFO ? "info" :
1027                 m->level==TM_WARN ? "warning" :
1028                 m->level==TM_ERROR ? "error" :
1029                 m->level==TM_BUG ? "BUG" :
1030                 "???", m->path, m->message);
1031 }
1032
1033 void tok_message_add(tok_message_queue *mq, enum tok_message_level level,
1034         const char *path, const char *loc, const char *fmt, ...) {
1035         struct tok_message msg = {.level=level, .path=path, .location=loc};
1036         va_list ap;
1037         
1038         if (!mq)
1039                 return;
1040         
1041         va_start(ap, fmt);
1042         msg.message = talloc_vasprintf(mq->item, fmt, ap);
1043         va_end(ap);
1044         
1045         enqueue(*mq, msg);
1046 }
1047
1048 void tok_message_queue_dump(const tok_message_queue *mq) {
1049         size_t i;
1050         for (i=0; i<queue_count(*mq); i++)
1051                 tok_message_dump(&queue_item(*mq, i));
1052 }
1053
1054
1055 #undef add
1056 #undef cstray
1057 #undef cident