]> git.ozlabs.org Git - ccan/blob - ccan/ccan_tokenizer/ccan_tokenizer.c
Fix tests: path change because they're now run under ccanlint (from their dir)
[ccan] / ccan / ccan_tokenizer / ccan_tokenizer.c
1 /*
2         Copyright (c) 2009  Joseph A. Adams
3         All rights reserved.
4         
5         Redistribution and use in source and binary forms, with or without
6         modification, are permitted provided that the following conditions
7         are met:
8         1. Redistributions of source code must retain the above copyright
9            notice, this list of conditions and the following disclaimer.
10         2. Redistributions in binary form must reproduce the above copyright
11            notice, this list of conditions and the following disclaimer in the
12            documentation and/or other materials provided with the distribution.
13         3. The name of the author may not be used to endorse or promote products
14            derived from this software without specific prior written permission.
15         
16         THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17         IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18         OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19         IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20         INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21         NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22         DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23         THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24         (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25         THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "ccan_tokenizer.h"
29
30 #include <ccan/talloc/talloc.h>
31
32 #include <assert.h>
33
//Shown by operator precedence; based on
// http://tigcc.ticalc.org/doc/opers.html#precedence .
//
//Master token dictionary: operators (grouped by precedence), C keywords,
//and preprocessor directive keywords.  dict_build() turns this into the
//longest-match lookup structure used by tokenize().

static struct dict_entry c_dictionary[] = {
//1. Highest
	{'(',"("}, {')',")"},
	{'[',"["}, {']',"]"},
	{'{',"{"}, {'}',"}"},
	{'.',"."},
	{PTR_OP,"->"},
	
//2. Unary
	{'!',"!"}, {'~',"~"}, //prefix
	{INC_OP,"++"}, {DEC_OP,"--"}, //prefix or postfix
	// + - & *
	
//3. Multiplicative
	// *
	{'/',"/"}, {'%',"%"},
	
//4. Additive
	// + -
	
//5. Shift
	{LEFT_OP,"<<"}, {RIGHT_OP,">>"},
	
//6. Relational
	{'<',"<"}, {'>',">"},
	{LE_OP,"<="}, {GE_OP,">="},
	
//7. Equality
	{EQ_OP,"=="}, {NE_OP,"!="},
	
//8. Bitwise AND
	// &
//9. Bitwise XOR
	{'^',"^"},
//10. Bitwise OR
	{'|',"|"},

//11. Logical AND
	{AND_OP,"&&"},
//12. Logical OR
	{OR_OP,"||"},

//13. Conditional
	{'?',"?"}, {':',":"},

//14. Assignment
	{'=',"="},
	{MUL_ASSIGN,"*="}, {DIV_ASSIGN,"/="}, {MOD_ASSIGN,"%="},
	{ADD_ASSIGN,"+="}, {SUB_ASSIGN,"-="},
	{AND_ASSIGN,"&="}, {XOR_ASSIGN,"^="}, {OR_ASSIGN,"|="},
	{LEFT_ASSIGN,"<<="}, {RIGHT_ASSIGN,">>="},
	
//15. Comma
	{',',","},

//16. Semicolon
	{';',";"},
	
//Misc
	{ELLIPSIS,"..."},
	{'#',"#"},
	{DOUBLE_POUND,"##"},

//Ambiguous
	//unary or binary
	{'+',"+"}, {'-',"-"},
	{'&',"&"}, {'*',"*"},

//Keywords
	{_BOOL, "_Bool"},
	{_COMPLEX, "_Complex"},
	{_IMAGINARY, "_Imaginary"},
	{BREAK, "break"},
	{CASE, "case"},
	{CHAR, "char"},
	{CONST, "const"},
	{CONTINUE, "continue"},
	{DEFAULT, "default"},
	{DO, "do"},
	{DOUBLE, "double"},
	{ELSE, "else"},
	{ENUM, "enum"},
	{EXTERN, "extern"},
	{FLOAT, "float"},
	{FOR, "for"},
	{GOTO, "goto"},
	{IF, "if"},
	{INLINE, "inline"},
	{INT, "int"},
	{LONG, "long"},
	{REGISTER, "register"},
	{RESTRICT, "restrict"},
	{RETURN, "return"},
	{SHORT, "short"},
	{SIGNED, "signed"},
	{SIZEOF, "sizeof"},
	{STATIC, "static"},
	{STRUCT, "struct"},
	{SWITCH, "switch"},
	{TYPEDEF, "typedef"},
	{UNION, "union"},
	{UNSIGNED, "unsigned"},
	{VOID, "void"},
	{VOLATILE, "volatile"},
	{WHILE, "while"},

//Preprocessor keywords (except those already defined)
	{VA_ARGS, "__VA_ARGS__"},
	{DEFINE, "define"},
	{ELIF, "elif"},
//	{ELSE, "else"},
	{ENDIF, "endif"},
	{ERROR, "error"},
//	{IF, "if"},
	{IFDEF, "ifdef"},
	{IFNDEF, "ifndef"},
	{INCLUDE, "include"},
	{LINE, "line"},
	{PRAGMA, "pragma"},
	{UNDEF, "undef"},
	{WARNING, "warning"},
};
159
#if 0

//Dead code kept for reference: an earlier per-instance tokenizer object.
//The live code instead uses the lazily-built global tokenizer_dict below.
struct tokenizer *tokenizer_new(void *ctx) {
	struct tokenizer *t = talloc(ctx, struct tokenizer);
	t->ctx = ctx;
	queue_init(t->mq, t);
	t->dict = dict_build(t, c_dictionary, sizeof(c_dictionary)/sizeof(*c_dictionary));
	
	return t;
}

#endif
172
173 #define MESSAGE_PATH "tokenize/"
174
//Build tl->txt, a copy of tl->orig with backslash-newline line breaks
//removed, and record per-line start pointers for both buffers:
//  tl->olines[n] = start of line n in the original text
//  tl->tlines[n] = start of line n in the unbroken text
//Warns through mq when spaces follow a line-continuation backslash.
static void unbreak_backslash_broken_lines(struct token_list *tl, tok_message_queue *mq) {
	const char *s = tl->orig, *e = s+tl->orig_size;
	array_char txt = array_new(tl);
	array(const char*) olines = array_new(tl);
	array(const char*) tlines = array_new(tl);
	
	do {
		const char *line_start = s, *line_end;
		const char *lnw; //last non-white
		size_t start_offset = txt.size;
		
		//scan to the next line and find the last non-white character in the line
		while (s<e && !creturn(*s)) s++;
		line_end = s;
		lnw = s;
		while (lnw>line_start && cspace(lnw[-1])) lnw--;
		if (s<e && creturn(*s)) {
			s++;
			//check for non-standard newlines (i.e. "\r", "\r\n", or "\n\r")
			if (s<e && *s=='\n'+'\r'-s[-1])
				s++;
		}
		
		//add the backslash-break-free version of the text
		if (lnw>line_start && lnw[-1]=='\\' && line_end<e) {
			//drop the backslash and everything after it (spaces + newline)
			array_append_items(txt, line_start, lnw-1-line_start);
			if (lnw<e && cspace(*lnw)) {
				tok_msg_warn(spaces_after_backslash_break, lnw,
					"Trailing spaces after backslash-broken line");
			}
		} else
			array_append_items(txt, line_start, s-line_start);
		
		//add the line starts for this line
		array_append(olines, line_start);
		array_append(tlines, (const char*)start_offset);
			//Since the txt buffer moves when expanded, we're storing offsets
			//  for now.  Once we're done building txt, we can add the base
			//  of it to all the offsets to make them pointers.
	} while (s<e);
	
	//stick a null terminator at the end of the text
	array_realloc(txt, txt.size+1);
	txt.item[txt.size] = 0;
	
	//convert the line start offsets to pointers
	array_for(i, tlines, *i = txt.item + (size_t)*i);
	
	tl->olines = olines.item;
	tl->olines_size = olines.size;
	tl->txt = txt.item;
	tl->txt_size = txt.size;
	tl->tlines = tlines.item;
	tl->tlines_size = tlines.size;
}
230
231 static void normal_keyword(struct token *tok) {
232         if (tok->type==TOK_KEYWORD &&
233                         (opkw_is_directive_only(tok->opkw) || tok->opkw==VA_ARGS))
234                 tok->type = TOK_IDENTIFIER;
235 }
236
237 static int define_parmlist_has_ellipsis(struct token *start, struct token *end) {
238         while (end>start && token_is_ignored(end-1)) end--;
239         return (end-->start && end->type==TOK_OPERATOR && end->opkw==ELLIPSIS);
240 }
241
242 //Used to label __VA_ARGS__ as keywords within applicable macro expansions
243 //Start should follow the DEFINE directive keyword
244 static void this_is_a_define(struct token *start, struct token *end) {
245         struct token *i = start, *pl_start;
246         
247         //skip past the identifier that is defined
248         while (i<end && token_is_ignored(i)) i++;
249         if (i >= end)
250                 return;
251          //TODO:  check i->type to make sure it's an identifier, throw error otherwise
252         normal_keyword(i++);
253         
254         //see if this is actually a variadic macro
255         if (!(i<end && i->type==TOK_OPERATOR && i->opkw=='('))
256                 goto not_va_args;
257         pl_start = ++i;
258         while (i<end && !(i->type==TOK_OPERATOR && i->opkw==')'))
259                 normal_keyword(i++);
260         if (!define_parmlist_has_ellipsis(pl_start, i++))
261                 goto not_va_args;
262         
263         //We have arrived at the macro expansion and know there is a ... argument
264         //Thus, we'll only change directive-only keywords to identifiers
265         for(; i<end; i++) {
266                 if (i->type==TOK_KEYWORD && opkw_is_directive_only(i->opkw))
267                         i->type = TOK_IDENTIFIER;
268         }
269         
270 not_va_args:
271         while (i < end)
272                 normal_keyword(i++);
273 }
274
275 //fill the flags field of each token and untangle keywords and such
276 static void finalize_line(struct token *start, struct token *end) {
277         struct token *i = start, *j;
278         
279         assert(start<end && start->type==TOK_STARTLINE);
280         i++;
281         
282         while (i<end && token_is_ignored(i)) i++;
283         
284         if (i<end && i->type==TOK_OPERATOR && i->opkw=='#') {
285         //preprocessor line
286                 i->type = TOK_LEADING_POUND;
287                 
288                 //set pp on all tokens in this line
289                 for (j=start; j<end; j++)
290                         j->flags.pp = 1;
291                 
292                 //find the relevant token after the '#'
293                 for (i++; i<end; i++) {
294                         if (!token_is_ignored(i)) {
295                                 i->flags.pp_directive = 1;
296                                 if (i->type==TOK_KEYWORD && !opkw_is_directive(i->opkw))
297                                         i->type = TOK_IDENTIFIER;
298                                 //TODO:  Handle invalid preprocessor directives (e.g. #+ )
299                                 
300                                 if (i->type==TOK_KEYWORD && i->opkw==DEFINE) {
301                                         for (j=i+1; j<end; j++)
302                                         this_is_a_define(i+1, end);
303                                 } else {
304                                         while (++i < end)
305                                                 normal_keyword(i);
306                                 }
307                                 break;
308                         }
309                 }
310         } else {
311         //normal line
312                 while (i < end)
313                         normal_keyword(i++);
314         }
315 }
316
//fill the list, flags, line, col, orig, and orig_size fields of each token
//convert identifiers mistaken for preprocessor keywords (e.g. ifdef) to identifiers
//
//Walks the token array once, wiring prev/next links and mapping each
//token's txt pointer back to a (line, col) pair and a position in the
//original (backslash-broken) text, then runs finalize_line() on each
//complete line as it is passed.
static void finalize(struct token_list *tl, struct token *start, struct token *end) {
	const char * const *lss = tl->tlines;
	const char * const *lse = lss + tl->tlines_size;
	struct token *i;
	struct token *startline = NULL;
	
	assert(start < end);
	
	tl->first = start;
	tl->last = end-1;
	
	for (i=start; ; i++) {
		//perform a second pass on each line
		if (i >= end || i->type == TOK_STARTLINE) {
			if (startline)
				finalize_line(startline, i);
			startline = i;
		}
		
		if (i >= end) {
			//the last token's orig extends to the end of the original buffer
			end[-1].orig_size = tl->orig+tl->orig_size - end[-1].orig;
			break;
		}
		
		//set up the list links
		i->prev = i>start ? i-1 : NULL;
		i->next = i+1<end ? i+1 : NULL;
		
		//if i->txt starts on a later line, advance to it
		while (lss+1<lse && i->txt >= lss[1] && i->txt > lss[0])
			lss++;
		
		//set up line, col, orig, and orig_size
		i->line = lss - tl->tlines;
		i->col = i->txt - *lss;
		i->orig = tl->olines[i->line] + i->col;
		if (i > start)
			i[-1].orig_size = i->orig - i[-1].orig;
		
		assert(i->line < tl->olines_size);
		
		//clear the flags (finalize_line sets them afterward)
		memset(&i->flags, 0, sizeof(i->flags));
	}
}
364
//Append a token covering [orig, s) to the local `array` in tokenize();
//extra designated initializers (e.g. .type, union members) may be passed in.
#define add(...) do { \
		struct token tok = {__VA_ARGS__}; \
		tok.txt = orig; \
		tok.txt_size = s-orig; \
		array_append(array, tok); \
	} while (0)

//Characters that never belong in C source outside strings/comments
#define cstray(c) (ccontrol(c) || cextended(c) || (c)=='@' || (c)=='`' || (c)=='\\')
//Characters valid within an identifier
#define cident(c) (cletter(c) || cdigit(c) || c=='_' || c=='$')
	//believe it or not, $ is a valid character in an identifier
375
//Shared operator/keyword dictionary, built lazily on the first call to
//tokenize() and released at process exit via atexit().
struct dict *tokenizer_dict = NULL;

//atexit handler: free the lazily-built dictionary.
static void free_tokenizer_dict(void) {
	talloc_free(tokenizer_dict);
}
381
//Tokenize orig[0..orig_size) into a token_list (allocated as a talloc
//child of orig, so it is freed with the input buffer).  Diagnostics are
//appended to mq.  The returned list always begins with a TOK_STARTLINE.
struct token_list *tokenize(const char *orig, size_t orig_size,
				tok_message_queue *mq) {
	struct token_list *tl = talloc(orig, struct token_list);
	const char *s, *e;
	size_t stray_count=0, cr_count=0;
	array(struct token) array = array_new(tl);
	int only_pound_include = 0;
	
	//build the shared dictionary on first use
	if (!tokenizer_dict) {
		tokenizer_dict = dict_build(NULL, c_dictionary,
			sizeof(c_dictionary)/sizeof(*c_dictionary));
		atexit(free_tokenizer_dict);
	}
	
	tl->orig = orig;
	tl->orig_size = orig_size;
	unbreak_backslash_broken_lines(tl, mq);
	tl->filename = NULL;
	
	//scan the backslash-unbroken text, not the raw original
	s = tl->txt;
	e = s + tl->txt_size;
	
	array_appends(array, {
		.type = TOK_STARTLINE,
		.txt = s,
		.txt_size = 0
	} );
	
	while (s<e) {
		const char *orig = s; //NOTE: intentionally shadows the parameter;
		                      //  the add() macro reads this local `orig`
		char c = *s++;
		int added_something = 1;
		
		if (cstray(c)) {
			stray_count++;
			while (s<e && cstray(*s)) {
				s++;
				stray_count++;
			}
			add(.type = TOK_STRAY);
			
			/* This has the potential to be very noisy on binary
			   files, but it really is quite useful. */
			tok_msg_error(stray_segment, orig,
				"%zu stray characters", s-orig);
		
		} else if (creturn(c)) {
			//check for non-standard newlines (i.e. "\r", "\r\n", or "\n\r")
			if (s<e && *s=='\n'+'\r'-c) {
				s++;
				cr_count++;
			} else if (c=='\r')
				cr_count++;
			
			add(.type = TOK_WHITE);
			orig = s;
			
			//add a TOK_STARTLINE for the next line unless this is the end of the document
			if (s<e)
				add(.type = TOK_STARTLINE);
			
			only_pound_include = 0;
		
		} else if (cspace(c)) {
			//skip over the remaining whitespace
			while (s<e && cspace(*s)) s++;
			add(.type = TOK_WHITE);
			added_something = 0;
		
		} else if (cdigit(c) || (c=='.' && s<e && cdigit(*s))) {
			//numeric constant (including ".5"-style floats)
			struct token tok;
			s = read_cnumber(&tok, s-1, e, mq);
			tok.txt = orig;
			tok.txt_size = s-orig;
			array_append(array, tok);
			
		} else if (csymbol(c) || cident(c)) {
			if (only_pound_include && (c=='"' || c=='<')) { //include string
				char *include;
				char end = c=='"' ? '"' : '>';
				short type = c=='"' ? TOK_STRING_IQUOTE : TOK_STRING_IANGLE;
				
				while (s<e && !creturn(*s) && *s!=end) s++;
				include = talloc_strndup(tl, orig+1, s-(orig+1));
				
				if (s<e && *s==end) {
					s++;
				} else {
					tok_msg_error(include_missing_terminator, orig,
						"Missing terminating %c character", end);
				}
				
				add(.type = type,
					{.include = include});
				
			} else if (c=='\'' || c=='\"') { //character or string literal
				array_char string = array_new(tl);
				s = read_cstring(&string, s, e, c, mq);
				if (s<e) s++; //advance past endquote (if available)
				add(.type = c=='\'' ? TOK_CHAR : TOK_STRING,
				    {.string = string});
				
				if (c=='\'' && string.size==0) {
					tok_msg_error(empty_char_constant, orig,
						"Empty character constant");
				}
				
			} else if (c=='/' && s<e && (*s=='*' || *s=='/')) { //comment
				if (*s++ == '*') { /* C-style comment */
					const char *comment_start = s-2;
					for (;;s++) {
						if (s+1 >= e) {
							s = e;
							tok_msg_error(unterminated_comment, comment_start,
								"Unterminated comment");
							break;
						}
						if (s[0]=='*' && s[1]=='/') {
							s += 2;
							break;
						}
					}
					add(.type = TOK_CCOMMENT);
				} else { // C++-style comment
					while (s<e && !creturn(*s)) s++;
					add(.type = TOK_CPPCOMMENT);
				}
				added_something = 0;
			
			} else { //operator, keyword, or identifier
				struct dict_entry *ent;
				const char *ident_e = --s;
				while (ident_e<e && cident(*ident_e) ) ident_e++;
				
				ent = dict_lookup(tokenizer_dict, &s, e);
				if (cident(c)) { //keyword or identifier
					//only a keyword if the dictionary match spans
					//the whole identifier (e.g. "form" is not FOR)
					if (ent && s==ident_e) {
						add(.type = TOK_KEYWORD,
							{.opkw = ent->id});
						if (ent->id == INCLUDE) {
							//hacky way to lex #include string properly
							//  (walk back to confirm '#' include at line start)
							struct token *ts = array.item;
							struct token *tp = ts+array.size-1;
							while (tp>ts && token_is_ignored(tp-1))
								tp--;
							if (tp>ts && token_is_op(tp-1, '#')) {
								tp--;
								while (tp>ts && token_is_ignored(tp-1))
									tp--;
								if (tp>ts && tp[-1].type==TOK_STARTLINE) {
									only_pound_include = 1;
									continue;
								}
							}
						}
					} else {
						s = ident_e;
						add(.type = TOK_IDENTIFIER);
					}
				} else if (ent) { //operator
					add(.type = TOK_OPERATOR,
					    {.opkw = ent->id});
				} else { //invalid symbol (shouldn't happen)
					tok_msg_bug(unrecognized_symbol, s,
						"Unrecognized symbol \'%c\'", c);
					s++;
					add(.type = TOK_STRAY);
				}
			}
		}
		
		//anything substantive after "#include" cancels include-string mode
		if (added_something)
			only_pound_include = 0;
	}
	
	/*if (stray_count) {
		tok_msg_error(stray_characters, NULL,
			"%lu stray characters in text", (unsigned long)stray_count);
	}*/
	if (cr_count) {
		tok_msg_warn(nonstandard_newlines, NULL,
			"Text contains non-standard line terminators");
	}
	
	finalize(tl, array.item, array.item+array.size);
	
	return tl;
}
570
571 size_t token_list_count(const struct token_list *tl) {
572         size_t ret = 0;
573         const struct token *i;
574         
575         for (i=tl->first; i; i=i->next)
576                 ret++;
577         
578         return ret;
579 }
580
//Binary-search for the line containing ptr: returns the index of the
//last entry in lines[0..line_count) whose start is <= ptr (choosing the
//last of any run of identical line starts).  Returns 0 when empty.
static size_t find_line(const char *ptr, const char * const *lines, size_t line_count) {
	size_t lo = 0;
	size_t span = line_count;
	
	while (span > 1) {
		size_t half = span >> 1;
		if (ptr < lines[lo + half]) {
			span = half;
		} else {
			lo += half;
			span -= half;
		}
	}
	
	//select the *last* of equivalent lines
	while (lo + 1 < line_count && lines[lo] == lines[lo + 1])
		lo++;
	
	// (don't) select the *first* of equivalent lines
	//while (lo > 0 && lo < line_count && lines[lo - 1] == lines[lo])
	//	lo--;
	
	return lo;
}
605
606 int tok_point_lookup(struct tok_point *out, const char *ptr,
607                         const struct token_list *tl) {
608         size_t line_count = tl->olines_size;
609         
610         memset(out, 0, sizeof(*out));
611         if (!tl)
612                 return 0;
613         
614         if (ptr >= tl->txt && ptr <= tl->txt+tl->txt_size) {
615                 out->txt = ptr;
616                 out->line = find_line(ptr, tl->tlines, line_count);
617                 if (out->line < line_count) {
618                         out->col = ptr - tl->tlines[out->line];
619                         out->orig = tl->olines[out->line] + out->col;
620                 } else {
621                         out->col = 0;
622                         out->orig = tl->orig + tl->orig_size;
623                 }
624                 return 1;
625         } else if (ptr >= tl->orig && ptr <= tl->orig+tl->orig_size) {
626                 out->orig = ptr;
627                 out->line = find_line(ptr, tl->olines, line_count);
628                 if (out->line < line_count) {
629                         const char *tline_start = tl->tlines[out->line];
630                         const char *tline_end = out->line+1 < line_count ?
631                                 tl->tlines[out->line+1] :
632                                 tl->txt + tl->txt_size;
633                         
634                         out->col = ptr - tl->olines[out->line];
635                         out->txt = tline_start + out->col;
636                         
637                         if (out->txt > tline_end)
638                                 out->txt = tline_end;
639                 } else {
640                         out->col = 0;
641                         out->txt = tl->txt + tl->txt_size;
642                 }
643                 return 1;
644         } else {
645                 return 0;
646         }
647 }
648
649 static char *escape_string(array_char *buf, const char *str, size_t size) {
650         const char *s = str, *e = s+size;
651         array_from_lit(*buf, "");
652         
653         for (;s<e;s++) {
654                 char buffer[8];
655                 const char *esc = buffer;
656                 unsigned char c = (unsigned char)*s;
657                 if (ccontrol(c))
658                         sprintf(buffer, "\\x%02X", c);
659                 else switch(c) {
660                         case '\t': esc = "\\t"; break;
661                         case '\n': esc = "\\n"; break;
662                         case '\v': esc = "\\v"; break;
663                         case '\f': esc = "\\f"; break;
664                         case '\r': esc = "\\r"; break;
665                         case '"': esc = "\\\""; break;
666                         case '\\': esc = "\\\\"; break;
667                         default:
668                                 buffer[0] = c;
669                                 buffer[1] = 0;
670                 }
671                 array_append_string(*buf, esc);
672         }
673         
674         return buf->item;
675 }
676
//Verify that txt[0..txt_size) equals orig[0..orig_size) with all
//backslash-newline breaks removed -- i.e. that txt really is the
//unbroken form of orig.  Returns 1 on match, 0 otherwise.
static int txt_orig_matches(const char *txt, size_t txt_size, const char *orig, size_t orig_size) {
	const char *ts = txt, *te = ts+txt_size;
	const char *os = orig, *oe = os+orig_size;
	
	do {
		const char *ob = os; //start of next backslash break
		const char *obe = os; //end of next backslash break
		size_t size; //amount of text to compare for this round
		
		while (ob<oe && *ob!='\\') ob++;
		obe = ob;
		if (obe < oe) { //there's a backslash
			obe++;
			while (obe<oe && cspace(*obe)) obe++;
			if (obe<oe && creturn(*obe)) { //there's a backslash-broken line
				obe++;
				//consume the second character of a two-character newline
				if (obe<oe && *obe == '\n'+'\r'-obe[-1])
					obe++;
			} else //this is just a plain old backslash
				ob = obe;
		}
		
		size = ob-os;
		
		//compare the segment before the break, then skip past the break
		if (ts+size > te || memcmp(ts, os, size))
			return 0;
		ts += size;
		os = obe;
	} while (ts<te);
	
	if (ts != te || os != oe)
		return 0;
	
	return 1;
}
712
//Does s point at a backslash line-continuation (a backslash, optional
//trailing spaces, then a newline)?  If so, set *end just past the break
//and return 1; otherwise return 0 and leave *end untouched.
static int is_backslash_break(const char **end, const char *s, const char *e) {
	if (s >= e || *s != '\\')
		return 0;
	s++;
	while (s < e && cspace(*s))
		s++;
	if (s >= e || !creturn(*s))
		return 0;
	s++;
	//consume the second character of a two-character newline ("\r\n"/"\n\r")
	if (s < e && *s == '\n'+'\r'-s[-1])
		s++;
	*end = s;
	return 1;
}
728
729 #define failed(fmt, ...) do {fprintf(err, fmt "\n", ##__VA_ARGS__); return 0; } while(0)
730
731 //tests that should pass on an untainted token list out of the tokenize() function
732 static int token_list_sanity_check_initial(const struct token_list *tl, FILE *err) {
733         struct token *first = tl->first;
734         struct token *last = tl->last;
735         struct token *i;
736         const char *txt=tl->txt, *orig=tl->orig;
737         const char *txt_e = txt+tl->txt_size, *orig_e = orig+tl->orig_size;
738         
739         if ((char*)first > (char*)last ||
740                 (size_t)((char*)last - (char*)first) % sizeof(struct token))
741                 failed("Token list pointers don't look right");
742         
743         //token list should not end with TOK_STARTLINE unless
744         //  the document is empty
745         if (last!=first && last->type==TOK_STARTLINE)
746                 return 0;
747         
748         for (i=first; i; i=i->next) {
749                 //Verify list links
750                 if (i != first && i->prev != i-1)
751                         failed("list.prev is incorrect");
752                 if (i != last && i->next != i+1)
753                         failed("list.next is incorrect");
754                 
755                 //Make sure txt segments fill the entire tl->txt
756                 if (i->txt != txt)
757                         failed("txt does not fill the token list");
758                 txt += i->txt_size;
759                 if (txt > txt_e)
760                         failed("txt is out of bounds");
761                 
762                 //Make sure orig segments fill the entire tl->orig
763                 if (i->orig != orig)
764                         failed("orig does not fill the token list");
765                 orig += i->orig_size;
766                 if (orig > orig_e)
767                         failed("orig is out of bounds");
768         }
769         
770         if (txt != txt_e)
771                 return 0;
772         if (orig != orig_e)
773                 return 0;
774         
775         return 1;
776 }
777
//Exhaustively validates the invariants of a token list.  Checks the leading
//  TOK_STARTLINE sentinel, the list's edge links, per-token line/col
//  coordinates, token sizes, TOK_WHITE content, txt/orig agreement (modulo
//  backslash-broken lines), tok_point_lookup results for every byte of
//  every token, and the consistency of the olines/tlines arrays.
//Returns 1 if everything checks out, 0 otherwise (diagnostics go to err).
//NOTE(review): `initial` is set to 1 and never cleared here, so every list
//  is treated as freshly tokenized — confirm this is intended.
int token_list_sanity_check(const struct token_list *tl, FILE *err) {
	struct token *first = tl->first;
	struct token *last = tl->last;
	struct token *i;
	int initial = 1;
	
	if (tl->first == NULL || tl->last == NULL)
		failed("Token list is completely empty");
	
	//the list must open with an empty TOK_STARTLINE at line 0, col 0
	if (first->type!=TOK_STARTLINE ||
	    first->txt!=tl->txt || first->txt_size!=0 ||
	    first->orig!=tl->orig || first->orig_size!=0 ||
	    first->line!=0 || first->col!=0)
		failed("Token list does not start with a valid TOK_STARTLINE");
	
	if (first->prev!=NULL || last->next!=NULL)
		failed("Token edge links are not NULL");
	
	for (i=first; i; i=i->next) {
		//Verify line,col against both the tlines and olines arrays
		if (tl->tlines[i->line] + i->col != i->txt)
			failed("line,col is wrong against txt");
		if (tl->olines[i->line] + i->col != i->orig)
			failed("line,col is wrong against orig");
		
		//Make sure tokens have proper sizes (orig can only be longer
		//  than txt, never shorter)
		if (i->type!=TOK_STARTLINE && (i->txt_size==0 || i->orig_size==0 || i->txt_size > i->orig_size) )
			failed("Token is empty");
		if (i->type==TOK_STARTLINE && (i->txt_size!=0 || i->orig_size!=0) )
			failed("TOK_STARTLINE is non-empty");
		
		//Make sure TOK_WHITE actually contains white tokens
		if (i->type==TOK_WHITE) {
			const char *s = i->txt, *e = s+i->txt_size;
			while (s<e && cwhite(*s)) s++;
			if (s != e)
				failed("TOK_WHITE does not contain only white characters");
		}
		
		//Make sure txt and orig match exactly except for backslash line breaks
		if (!txt_orig_matches(i->txt, i->txt_size, i->orig, i->orig_size)) {
			array_char buf = array_new(NULL);
			fprintf(err,
				"txt and orig do not match:\n"
				"\ttxt  = \"%s\"\n",
				escape_string(&buf, i->txt, i->txt_size) );
			fprintf(err, "\torig = \"%s\"\n",
				escape_string(&buf, i->orig, i->orig_size) );
			
			array_free(buf);
			return 0;
		}
		
		//Make sure tok_point_lookup returns correct point for every
		//  byte of this token's orig (and txt) range
		{
			struct tok_point tok_point;
			const char *t=i->txt, *o=i->orig, *e=o+i->orig_size, *p;
			size_t line=i->line, col=i->col;
			
			//check(ptr): look up ptr and verify the reported
			//  txt/orig pointers and line/col against our own
			//  walk; a lookup failure is only fatal for an
			//  initial (untainted) list
			#define check(ptr) do { \
				if (tok_point_lookup(&tok_point, ptr, tl)) { \
					if (tok_point.txt != t || tok_point.orig != o) \
						failed("tok_point_lookup on txt reported incorrect txt/orig (orig is %d, should be %d)", \
						(int)(tok_point.orig-i->orig), (int)(o-i->orig)); \
					if (tok_point.line != line || tok_point.col != col) \
						failed("tok_point_lookup on txt reported incorrect line/col (off by %d, %d)", \
						(int)(tok_point.line-line), (int)(tok_point.col-col)); \
				} else if (initial) {\
					failed("tok_point_lookup failed on initial token list"); \
				} \
			} while(0)
			
			for (;;) {
				//backslash-broken lines exist only in orig,
				//  so advance o (not t) across them
				while (is_backslash_break(&p, o, e)) {
					while (o<p) {
						check(o);
						o++;
						col++;
					}
					col = 0;
					line++;
				}
				if (o >= e)
					break;
				do {
					if (creturn(*o)) {
						//'\n'+'\r'-p[-1] is the opposite return
						//  character: consume CR/LF (or LF/CR)
						//  pairs as one line break
						p = o+1;
						if (p<e && *p=='\n'+'\r'-p[-1])
							p++;
						while (o<p) {
							check(o);
							check(t);
							t++, o++, col++;
						}
						line++;
						col = 0;
					} else {
						check(o);
						check(t);
						o++, t++, col++;
					}
				} while (o<e && *o!='\\');
			}
			
			#undef check
		}
	};
	
	//Verify olines and tlines
	{
		const char *s = tl->orig, *e = s+tl->orig_size;
		size_t i, line_count = tl->olines_size;
		
		//both line arrays should be exactly the same size
		if (tl->olines_size != tl->tlines_size)
			return 0;
		
		for (i=0; s<e; i++) {
			const char *line_start = s, *line_end;
			size_t tline_size, oline_size;
			const char *p;
			
			//size of unbroken line i: distance to the next tline,
			//  or to the end of txt for the last line
			if (i+1 < line_count)
				tline_size = tl->tlines[i+1] - tl->tlines[i];
			else
				tline_size = tl->txt+tl->txt_size - tl->tlines[i];
			
			//scan to the end of the original line, consuming a
			//  CR/LF (or LF/CR) pair as a single break
			while (s<e && !creturn(*s)) s++;
			line_end = s;
			if (s<e) {
				s++;
				if (s<e && *s=='\n'+'\r'-s[-1])
					s++;
			}
			
			oline_size = s-line_start;
			
			//verify that olines elements are correct
			if (line_start != tl->olines[i])
				return 0;
			
			//verify that tlines elements are in range
			p = tl->tlines[i];
			if (p < tl->txt || p+tline_size > tl->txt+tl->txt_size)
				return 0;
			
			//verify that original lines have sizes >= the unbroken lines
			if (oline_size < tline_size)
				return 0;
			
			//if sizes are inconsistent, make sure it is due to a backslash escape
			if (oline_size > tline_size) {
				p = line_start+tline_size;
				if (*p++ != '\\')
					return 0;
				while (p<e && cspace(*p)) p++;
				if (p != line_end)
					return 0;
			}
			
			//make sure the text of both copies match
			if ( memcmp(
				tl->olines[i],
				tl->tlines[i],
				tline_size) )
				return 0;
		}
	}
	
	if (initial && !token_list_sanity_check_initial(tl, err))
		failed("Initial sanity checks failed.  Has the list been modified after it was returned from tokenize() ?");
	
	return 1;
}
952
953 #undef failed
954
955 static char *sprint_token_flags(char buf[3], struct token_flags flags) {
956         buf[0] = flags.pp ? 'p' : '-';
957         buf[1] = flags.pp_directive ? 'D' : '-';
958         buf[2] = 0;
959         return buf;
960 }
961
962 void token_list_dump(const struct token_list *tl, FILE *f) {
963         struct token *tok;
964         array_char buf = array_new(NULL);
965         size_t i = 0;
966         char buf2[8];
967         const char *token_type_str[] = {
968                 "TOK_INTEGER      ",
969                 "TOK_FLOATING     ",
970                 "TOK_OPERATOR     ",
971                 "TOK_KEYWORD      ",
972                 "TOK_IDENTIFIER   ",
973                 "TOK_CHAR         ",
974                 "TOK_STRING       ",
975                 "TOK_LEADING_POUND",
976                 "TOK_STRING_IQUOTE",
977                 "TOK_STRING_IANGLE",
978                 "TOK_CCOMMENT     ",
979                 "TOK_CPPCOMMENT   ",
980                 "TOK_WHITE        ",
981                 "TOK_STARTLINE    ",
982                 "TOK_STRAY        "
983         };
984         
985         for (tok=tl->first; tok; tok=tok->next) {
986                 fprintf(f, "%lu\t%s\t%s\t\"%s\"", (unsigned long)(i++),
987                         token_type_str[tok->type],
988                         sprint_token_flags(buf2, tok->flags),
989                         escape_string(&buf, tok->txt, tok->txt_size));
990                 #if 1 //print tok->orig
991                 fprintf(f, "\t\"%s\"\n", escape_string(&buf, tok->orig, tok->orig_size));
992                 #else
993                 fprintf(f, "\n");
994                 #endif
995         }
996         
997         array_free(buf);
998 }
999
1000 void tok_message_print(struct tok_message *m, struct token_list *tl) {
1001         struct tok_point pt;
1002         int resolved = tok_point_lookup(&pt, m->location, tl);
1003         
1004         if (tl->filename) {
1005                 printf("%s:%s", tl->filename, resolved ? "" : " ");
1006         }
1007         
1008         if (resolved) {
1009                 printf("%zu:%zu %s: %s\n",
1010                         pt.line+1, pt.col+1,
1011                         m->level==TM_DEBUG ? "debug" :
1012                         m->level==TM_INFO ? "info" :
1013                         m->level==TM_WARN ? "warning" :
1014                         m->level==TM_ERROR ? "error" :
1015                         m->level==TM_BUG ? "BUG" :
1016                         "???",
1017                         m->message);
1018         } else {
1019                 printf("%s: %s\n",
1020                         m->level==TM_DEBUG ? "debug" :
1021                         m->level==TM_INFO ? "info" :
1022                         m->level==TM_WARN ? "warning" :
1023                         m->level==TM_ERROR ? "error" :
1024                         m->level==TM_BUG ? "BUG" :
1025                         "???",
1026                         m->message);
1027         }
1028 }
1029
1030 void tok_message_dump(struct tok_message *m) {
1031         printf("%s: %s: %s\n",
1032                 m->level==TM_DEBUG ? "debug" :
1033                 m->level==TM_INFO ? "info" :
1034                 m->level==TM_WARN ? "warning" :
1035                 m->level==TM_ERROR ? "error" :
1036                 m->level==TM_BUG ? "BUG" :
1037                 "???", m->path, m->message);
1038 }
1039
1040 void tok_message_add(tok_message_queue *mq, enum tok_message_level level,
1041         const char *path, const char *loc, const char *fmt, ...) {
1042         struct tok_message msg = {.level=level, .path=path, .location=loc};
1043         va_list ap;
1044         
1045         if (!mq)
1046                 return;
1047         
1048         va_start(ap, fmt);
1049         msg.message = talloc_vasprintf(mq->item, fmt, ap);
1050         va_end(ap);
1051         
1052         enqueue(*mq, msg);
1053 }
1054
1055 void tok_message_queue_dump(const tok_message_queue *mq) {
1056         size_t i;
1057         for (i=0; i<queue_count(*mq); i++)
1058                 tok_message_dump(&queue_item(*mq, i));
1059 }
1060
1061
1062 #undef add
1063 #undef cstray
1064 #undef cident