]> git.ozlabs.org Git - ccan/blob - ccan/ccan_tokenizer/ccan_tokenizer.c
failtest: add --trace to replace --tracepath
[ccan] / ccan / ccan_tokenizer / ccan_tokenizer.c
1 /*
2         Copyright (c) 2009  Joseph A. Adams
3         All rights reserved.
4         
5         Redistribution and use in source and binary forms, with or without
6         modification, are permitted provided that the following conditions
7         are met:
8         1. Redistributions of source code must retain the above copyright
9            notice, this list of conditions and the following disclaimer.
10         2. Redistributions in binary form must reproduce the above copyright
11            notice, this list of conditions and the following disclaimer in the
12            documentation and/or other materials provided with the distribution.
13         3. The name of the author may not be used to endorse or promote products
14            derived from this software without specific prior written permission.
15         
16         THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17         IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18         OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19         IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20         INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21         NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22         DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23         THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24         (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25         THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "ccan_tokenizer.h"
29
30 #include <ccan/talloc/talloc.h>
31
32 #include <assert.h>
33
//Shown by operator precedence; based on
// http://tigcc.ticalc.org/doc/opers.html#precedence .

/*
 * Master token dictionary fed to dict_build() (see tokenize()).
 * Each entry maps an opkw id to its source spelling; single-character
 * operators use the character value itself as the id.
 * NOTE(review): dict_build() may be sensitive to entry order (e.g. for
 * longest-match lookup) — do not reorder without checking dict.c.
 */
static struct dict_entry c_dictionary[] = {
//1. Highest
	{'(',"("}, {')',")"},
	{'[',"["}, {']',"]"},
	{'{',"{"}, {'}',"}"},
	{'.',"."},
	{PTR_OP,"->"},
	
//2. Unary
	{'!',"!"}, {'~',"~"}, //prefix
	{INC_OP,"++"}, {DEC_OP,"--"}, //prefix or postfix
	// + - & *
	
//3. Multiplicative
	// *
	{'/',"/"}, {'%',"%"},
	
//4. Additive
	// + -
	
//5. Shift
	{LEFT_OP,"<<"}, {RIGHT_OP,">>"},
	
//6. Relational
	{'<',"<"}, {'>',">"},
	{LE_OP,"<="}, {GE_OP,">="},
	
//7. Equality
	{EQ_OP,"=="}, {NE_OP,"!="},
	
//8. Bitwise AND
	// &
//9. Bitwise XOR
	{'^',"^"},
//10. Bitwise OR
	{'|',"|"},

//11. Logical AND
	{AND_OP,"&&"},
//12. Logical OR
	{OR_OP,"||"},

//13. Conditional
	{'?',"?"}, {':',":"},

//14. Assignment
	{'=',"="},
	{MUL_ASSIGN,"*="}, {DIV_ASSIGN,"/="}, {MOD_ASSIGN,"%="},
	{ADD_ASSIGN,"+="}, {SUB_ASSIGN,"-="},
	{AND_ASSIGN,"&="}, {XOR_ASSIGN,"^="}, {OR_ASSIGN,"|="},
	{LEFT_ASSIGN,"<<="}, {RIGHT_ASSIGN,">>="},
	
//15. Comma
	{',',","},

//16. Semicolon
	{';',";"},
	
//Misc
	{ELLIPSIS,"..."},
	{'#',"#"},
	{DOUBLE_POUND,"##"},

//Ambiguous
	//unary or binary
	{'+',"+"}, {'-',"-"},
	{'&',"&"}, {'*',"*"},

//Keywords
	{_BOOL, "_Bool"},
	{_COMPLEX, "_Complex"},
	{_IMAGINARY, "_Imaginary"},
	{BREAK, "break"},
	{CASE, "case"},
	{CHAR, "char"},
	{CONST, "const"},
	{CONTINUE, "continue"},
	{DEFAULT, "default"},
	{DO, "do"},
	{DOUBLE, "double"},
	{ELSE, "else"},
	{ENUM, "enum"},
	{EXTERN, "extern"},
	{FLOAT, "float"},
	{FOR, "for"},
	{GOTO, "goto"},
	{IF, "if"},
	{INLINE, "inline"},
	{INT, "int"},
	{LONG, "long"},
	{REGISTER, "register"},
	{RESTRICT, "restrict"},
	{RETURN, "return"},
	{SHORT, "short"},
	{SIGNED, "signed"},
	{SIZEOF, "sizeof"},
	{STATIC, "static"},
	{STRUCT, "struct"},
	{SWITCH, "switch"},
	{TYPEDEF, "typedef"},
	{UNION, "union"},
	{UNSIGNED, "unsigned"},
	{VOID, "void"},
	{VOLATILE, "volatile"},
	{WHILE, "while"},

//Preprocessor keywords (except those already defined)
	{VA_ARGS, "__VA_ARGS__"},
	{DEFINE, "define"},
	{ELIF, "elif"},
//	{ELSE, "else"},
	{ENDIF, "endif"},
	{ERROR, "error"},
//	{IF, "if"},
	{IFDEF, "ifdef"},
	{IFNDEF, "ifndef"},
	{INCLUDE, "include"},
	{LINE, "line"},
	{PRAGMA, "pragma"},
	{UNDEF, "undef"},
	{WARNING, "warning"},
};
159
#if 0

/* Disabled legacy constructor: allocated a per-instance tokenizer with
 * its own message queue and dictionary.  The current code instead uses
 * the lazily-built module-level tokenizer_dict inside tokenize(). */
struct tokenizer *tokenizer_new(void *ctx) {
	struct tokenizer *t = talloc(ctx, struct tokenizer);
	t->ctx = ctx;
	queue_init(t->mq, t);
	t->dict = dict_build(t, c_dictionary, sizeof(c_dictionary)/sizeof(*c_dictionary));
	
	return t;
}

#endif
172
173 static int talloc_darray_destructor(void *ptr);
174
175 /*
176  * darray(T) *talloc_darray(const void *context);
177  *
178  * Create a new darray anchored in a talloc buffer.
179  * When this pointer is freed, the darray will be freed as well.
180  */
181 static void *talloc_darray(const void *context)
182 {
183         void *ret = talloc(context, darray(void));
184         darray_init(*(darray(void)*)ret);
185         talloc_set_destructor(ret, talloc_darray_destructor);
186         return ret;
187 }
188
189 static int talloc_darray_destructor(void *ptr)
190 {
191         darray(void) *arr = ptr;
192         free(arr->item);
193         return 0;
194 }
195
196 #define MESSAGE_PATH "tokenize/"
197
/*
 * Build tl->txt: a copy of tl->orig with backslash-broken lines joined
 * back together, plus the parallel line-start tables tl->olines (into
 * the original text) and tl->tlines (into the rebuilt text).
 * Emits a warning through mq when spaces follow a line-ending backslash.
 * All three buffers are talloc'd under tl via talloc_darray().
 */
static void unbreak_backslash_broken_lines(struct token_list *tl, tok_message_queue *mq) {
	const char *s = tl->orig, *e = s+tl->orig_size;
	darray_char         *txt    = talloc_darray(tl);
	darray(const char*) *olines = talloc_darray(tl);
	darray(const char*) *tlines = talloc_darray(tl);
	
	do {
		const char *line_start = s, *line_end;
		const char *lnw; //last non-white
		size_t start_offset = txt->size;
		
		//scan to the next line and find the last non-white character in the line
		while (s<e && !creturn(*s)) s++;
		line_end = s;
		lnw = s;
		while (lnw>line_start && cspace(lnw[-1])) lnw--;
		if (s<e && creturn(*s)) {
			s++;
			//check for non-standard newlines (i.e. "\r", "\r\n", or "\n\r")
			// '\n'+'\r'-s[-1] is the *other* newline character, so a
			// two-character terminator is consumed as one line break
			if (s<e && *s=='\n'+'\r'-s[-1])
				s++;
		}
		
		//add the backslash-break-free version of the text
		if (lnw>line_start && lnw[-1]=='\\' && line_end<e) {
			//last non-white char is a backslash: drop it and the line break
			darray_append_items(*txt, line_start, lnw-1-line_start);
			if (lnw<e && cspace(*lnw)) {
				tok_msg_warn(spaces_after_backslash_break, lnw,
					"Trailing spaces after backslash-broken line");
			}
		} else
			darray_append_items(*txt, line_start, s-line_start);
		
		//add the line starts for this line
		darray_append(*olines, line_start);
		darray_append(*tlines, (const char*)start_offset);
			//Since the txt buffer moves when expanded, we're storing offsets
			//  for now.  Once we're done building txt, we can add the base
			//  of it to all the offsets to make them pointers.
	} while (s<e);
	
	//stick a null terminator at the end of the text
	darray_realloc(*txt, txt->size+1);
	txt->item[txt->size] = 0;
	
	//convert the line start offsets to pointers
	{
		const char **i;
		darray_foreach(i, *tlines)
			*i = txt->item + (size_t)(*i);
	}
	
	//hand the raw buffers to the token list (ownership stays with tl
	//through the talloc_darray destructors)
	tl->olines = olines->item;
	tl->olines_size = olines->size;
	tl->txt = txt->item;
	tl->txt_size = txt->size;
	tl->tlines = tlines->item;
	tl->tlines_size = tlines->size;
}
257
258 static void normal_keyword(struct token *tok) {
259         if (tok->type==TOK_KEYWORD &&
260                         (opkw_is_directive_only(tok->opkw) || tok->opkw==VA_ARGS))
261                 tok->type = TOK_IDENTIFIER;
262 }
263
264 static int define_parmlist_has_ellipsis(struct token *start, struct token *end) {
265         while (end>start && token_is_ignored(end-1)) end--;
266         return (end-->start && end->type==TOK_OPERATOR && end->opkw==ELLIPSIS);
267 }
268
//Used to label __VA_ARGS__ as keywords within applicable macro expansions
//Start should follow the DEFINE directive keyword
/*
 * Walks a "#define" line: demotes directive-only keywords back to
 * identifiers everywhere, but keeps __VA_ARGS__ as a keyword inside the
 * expansion when the parameter list ends with "..." (variadic macro).
 */
static void this_is_a_define(struct token *start, struct token *end) {
	struct token *i = start, *pl_start;
	
	//skip past the identifier that is defined
	while (i<end && token_is_ignored(i)) i++;
	if (i >= end)
		return;
	 //TODO:  check i->type to make sure it's an identifier, throw error otherwise
	normal_keyword(i++);
	
	//see if this is actually a variadic macro
	if (!(i<end && i->type==TOK_OPERATOR && i->opkw=='('))
		goto not_va_args;
	pl_start = ++i;
	//scan the parameter list up to the closing ')'
	while (i<end && !(i->type==TOK_OPERATOR && i->opkw==')'))
		normal_keyword(i++);
	if (!define_parmlist_has_ellipsis(pl_start, i++))
		goto not_va_args;
	
	//We have arrived at the macro expansion and know there is a ... argument
	//Thus, we'll only change directive-only keywords to identifiers
	//(leaving __VA_ARGS__ alone, unlike normal_keyword)
	for(; i<end; i++) {
		if (i->type==TOK_KEYWORD && opkw_is_directive_only(i->opkw))
			i->type = TOK_IDENTIFIER;
	}
	
not_va_args:
	//non-variadic (or malformed) define: demote everything remaining
	while (i < end)
		normal_keyword(i++);
}
301
302 //fill the flags field of each token and untangle keywords and such
303 static void finalize_line(struct token *start, struct token *end) {
304         struct token *i = start, *j;
305         
306         assert(start<end && start->type==TOK_STARTLINE);
307         i++;
308         
309         while (i<end && token_is_ignored(i)) i++;
310         
311         if (i<end && i->type==TOK_OPERATOR && i->opkw=='#') {
312         //preprocessor line
313                 i->type = TOK_LEADING_POUND;
314                 
315                 //set pp on all tokens in this line
316                 for (j=start; j<end; j++)
317                         j->flags.pp = 1;
318                 
319                 //find the relevant token after the '#'
320                 for (i++; i<end; i++) {
321                         if (!token_is_ignored(i)) {
322                                 i->flags.pp_directive = 1;
323                                 if (i->type==TOK_KEYWORD && !opkw_is_directive(i->opkw))
324                                         i->type = TOK_IDENTIFIER;
325                                 //TODO:  Handle invalid preprocessor directives (e.g. #+ )
326                                 
327                                 if (i->type==TOK_KEYWORD && i->opkw==DEFINE) {
328                                         for (j=i+1; j<end; j++)
329                                         this_is_a_define(i+1, end);
330                                 } else {
331                                         while (++i < end)
332                                                 normal_keyword(i);
333                                 }
334                                 break;
335                         }
336                 }
337         } else {
338         //normal line
339                 while (i < end)
340                         normal_keyword(i++);
341         }
342 }
343
//fill the list, flags, line, col, orig, and orig_size fields of each token
//convert identifiers mistaken for preprocessor keywords (e.g. ifdef) to identifiers
static void finalize(struct token_list *tl, struct token *start, struct token *end) {
	const char * const *lss = tl->tlines;   //cursor into the txt line-start table
	const char * const *lse = lss + tl->tlines_size;
	struct token *i;
	struct token *startline = NULL;         //TOK_STARTLINE of the line being scanned
	
	assert(start < end);
	
	tl->first = start;
	tl->last = end-1;
	
	for (i=start; ; i++) {
		//perform a second pass on each line
		//(runs once per completed line, including the final one at i==end)
		if (i >= end || i->type == TOK_STARTLINE) {
			if (startline)
				finalize_line(startline, i);
			startline = i;
		}
		
		if (i >= end) {
			//last token's orig_size extends to the end of the original text
			end[-1].orig_size = tl->orig+tl->orig_size - end[-1].orig;
			break;
		}
		
		//set up the list links
		i->prev = i>start ? i-1 : NULL;
		i->next = i+1<end ? i+1 : NULL;
		
		//if i->txt starts on a later line, advance to it
		while (lss+1<lse && i->txt >= lss[1] && i->txt > lss[0])
			lss++;
		
		//set up line, col, orig, and orig_size
		//(orig_size of token i-1 is the distance between consecutive
		// tokens in the *original* text, so backslash breaks count)
		i->line = lss - tl->tlines;
		i->col = i->txt - *lss;
		i->orig = tl->olines[i->line] + i->col;
		if (i > start)
			i[-1].orig_size = i->orig - i[-1].orig;
		
		assert(i->line < tl->olines_size);
		
		//clear the flags
		memset(&i->flags, 0, sizeof(i->flags));
	}
}
391
/* Append a token covering [orig, s) to *arr; the variadic arguments are
 * designated initializers for the remaining struct token fields.
 * Relies on the locals `orig`, `s`, and `arr` of the tokenize() loop. */
#define add(...) do { \
		struct token tok = {__VA_ARGS__}; \
		tok.txt = orig; \
		tok.txt_size = s-orig; \
		darray_append(*arr, tok); \
	} while (0)

/* Characters that can never legitimately start a C token. */
#define cstray(c) (ccontrol(c) || cextended(c) || (c)=='@' || (c)=='`' || (c)=='\\')
/* Characters that may appear inside an identifier. */
#define cident(c) (cletter(c) || cdigit(c) || c=='_' || c=='$')
	//believe it or not, $ is a valid character in an identifier
402
/* Shared operator/keyword dictionary; built lazily on the first call to
 * tokenize() and released at process exit via atexit(). */
struct dict *tokenizer_dict = NULL;

static void free_tokenizer_dict(void) {
	talloc_free(tokenizer_dict);
}
408
/*
 * Lex the text [orig, orig+orig_size) into a token list allocated under
 * tcontext.  Backslash-broken lines are joined first (see
 * unbreak_backslash_broken_lines); tokens reference the joined text via
 * txt/txt_size and the original text via orig/orig_size (filled in by
 * finalize()).  Diagnostics are pushed onto mq.
 */
struct token_list *tokenize(const void *tcontext, const char *orig, size_t orig_size,
				tok_message_queue *mq) {
	struct token_list *tl = talloc(tcontext, struct token_list);
	const char *s, *e;
	size_t stray_count=0, cr_count=0;
	darray(struct token) *arr = talloc_darray(tl);
	int only_pound_include = 0;  //just saw "# include"; next token may be an include string
	
	//lazily build the shared dictionary on first use
	if (!tokenizer_dict) {
		tokenizer_dict = dict_build(NULL, c_dictionary,
			sizeof(c_dictionary)/sizeof(*c_dictionary));
		atexit(free_tokenizer_dict);
	}
	
	tl->orig = orig;
	tl->orig_size = orig_size;
	unbreak_backslash_broken_lines(tl, mq);
	tl->filename = NULL;
	
	//lex the backslash-joined text, not the raw original
	s = tl->txt;
	e = s + tl->txt_size;
	
	//every line (including the first) begins with a TOK_STARTLINE
	darray_appends_t(*arr, struct token, {
		.type = TOK_STARTLINE,
		.txt = s,
		.txt_size = 0
	} );
	
	while (s<e) {
		const char *orig = s;   //start of the current token (shadows outer orig)
		char c = *s++;
		int added_something = 1;
		
		if (cstray(c)) {
			//run of characters that cannot start any token
			stray_count++;
			while (s<e && cstray(*s)) {
				s++;
				stray_count++;
			}
			add(.type = TOK_STRAY);
			
			/* This has the potential to be very noisy on binary
			   files, but it really is quite useful. */
			tok_msg_error(stray_segment, orig,
				"%zu stray characters", s-orig);
		
		} else if (creturn(c)) {
			//check for non-standard newlines (i.e. "\r", "\r\n", or "\n\r")
			//('\n'+'\r'-c is the other newline character)
			if (s<e && *s=='\n'+'\r'-c) {
				s++;
				cr_count++;
			} else if (c=='\r')
				cr_count++;
			
			add(.type = TOK_WHITE);
			orig = s;
			
			//add a TOK_STARTLINE for the next line unless this is the end of the document
			if (s<e)
				add(.type = TOK_STARTLINE);
			
			only_pound_include = 0;
		
		} else if (cspace(c)) {
			//skip over the remaining whitespace
			while (s<e && cspace(*s)) s++;
			add(.type = TOK_WHITE);
			added_something = 0;
		
		} else if (cdigit(c) || (c=='.' && s<e && cdigit(*s))) {
			//numeric constant (including ".5"-style floats)
			struct token tok;
			s = read_cnumber(&tok, s-1, e, mq);
			tok.txt = orig;
			tok.txt_size = s-orig;
			darray_append(*arr, tok);
			
		} else if (csymbol(c) || cident(c)) {
			if (only_pound_include && (c=='"' || c=='<')) { //include string
				char *include;
				char end = c=='"' ? '"' : '>';
				short type = c=='"' ? TOK_STRING_IQUOTE : TOK_STRING_IANGLE;
				
				while (s<e && !creturn(*s) && *s!=end) s++;
				include = talloc_strndup(tl, orig+1, s-(orig+1));
				
				if (s<e && *s==end) {
					s++;
				} else {
					tok_msg_error(include_missing_terminator, orig,
						"Missing terminating %c character", end);
				}
				
				add(.type = type,
					{.include = include});
				
			} else if (c=='\'' || c=='\"') { //character or string literal
				darray_char *string = talloc_darray(tl);
				s = read_cstring(string, s, e, c, mq);
				if (s<e) s++; //advance past endquote (if available)
				add(.type = c=='\'' ? TOK_CHAR : TOK_STRING,
				    {.string = string});
				
				if (c=='\'' && string->size==0) {
					tok_msg_error(empty_char_constant, orig,
						"Empty character constant");
				}
				
			} else if (c=='/' && s<e && (*s=='*' || *s=='/')) { //comment
				if (*s++ == '*') { /* C-style comment */
					const char *comment_start = s-2;
					for (;;s++) {
						if (s+1 >= e) {
							s = e;
							tok_msg_error(unterminated_comment, comment_start,
								"Unterminated comment");
							break;
						}
						if (s[0]=='*' && s[1]=='/') {
							s += 2;
							break;
						}
					}
					add(.type = TOK_CCOMMENT);
				} else { // C++-style comment
					while (s<e && !creturn(*s)) s++;
					add(.type = TOK_CPPCOMMENT);
				}
				added_something = 0;
			
			} else { //operator, keyword, or identifier
				struct dict_entry *ent;
				const char *ident_e = --s;  //rewind so the dict sees c too
				while (ident_e<e && cident(*ident_e) ) ident_e++;
				
				ent = dict_lookup(tokenizer_dict, &s, e);
				if (cident(c)) { //keyword or identifier
					//only a keyword if the dict match covers the
					//  whole identifier (e.g. "form" is not FOR)
					if (ent && s==ident_e) {
						add(.type = TOK_KEYWORD,
							{.opkw = ent->id});
						if (ent->id == INCLUDE) {
							//hacky way to lex #include string properly
							//  (scan back for '#' at start of line)
							struct token *ts = arr->item;
							struct token *tp = ts+arr->size-1;
							while (tp>ts && token_is_ignored(tp-1))
								tp--;
							if (tp>ts && token_is_op(tp-1, '#')) {
								tp--;
								while (tp>ts && token_is_ignored(tp-1))
									tp--;
								if (tp>ts && tp[-1].type==TOK_STARTLINE) {
									only_pound_include = 1;
									continue;
								}
							}
						}
					} else {
						s = ident_e;
						add(.type = TOK_IDENTIFIER);
					}
				} else if (ent) { //operator
					add(.type = TOK_OPERATOR,
					    {.opkw = ent->id});
				} else { //invalid symbol (shouldn't happen)
					tok_msg_bug(unrecognized_symbol, s,
						"Unrecognized symbol \'%c\'", c);
					s++;
					add(.type = TOK_STRAY);
				}
			}
		}
		
		if (added_something)
			only_pound_include = 0;
	}
	
	/*if (stray_count) {
		tok_msg_error(stray_characters, NULL,
			"%lu stray characters in text", (unsigned long)stray_count);
	}*/
	if (cr_count) {
		tok_msg_warn(nonstandard_newlines, NULL,
			"Text contains non-standard line terminators");
	}
	
	//second pass: list links, line/col, orig spans, per-line fixups
	finalize(tl, arr->item, arr->item+arr->size);
	
	return tl;
}
597
598 size_t token_list_count(const struct token_list *tl) {
599         size_t ret = 0;
600         const struct token *i;
601         
602         for (i=tl->first; i; i=i->next)
603                 ret++;
604         
605         return ret;
606 }
607
608 static size_t find_line(const char *ptr, const char * const *lines, size_t line_count) {
609         const char * const *orig = lines;
610         const char * const *orig_e = lines+line_count;
611         
612         while (line_count > 1) {
613                 size_t middle = line_count>>1;
614                 if (ptr < lines[middle])
615                         line_count = middle;
616                 else {
617                         lines += middle;
618                         line_count -= middle;
619                 }
620         }
621         
622         //select the *last* of equivalent lines
623         while (lines+1 < orig_e && lines[0]==lines[1])
624                 lines++;
625         
626         // (don't) select the *first* of equivalent lines
627         //while (lines>orig && lines<orig_e && lines[-1]==lines[0])
628         //      lines--;
629         
630         return lines - orig;
631 }
632
633 int tok_point_lookup(struct tok_point *out, const char *ptr,
634                         const struct token_list *tl) {
635         size_t line_count = tl->olines_size;
636         
637         memset(out, 0, sizeof(*out));
638         if (!tl)
639                 return 0;
640         
641         if (ptr >= tl->txt && ptr <= tl->txt+tl->txt_size) {
642                 out->txt = ptr;
643                 out->line = find_line(ptr, tl->tlines, line_count);
644                 if (out->line < line_count) {
645                         out->col = ptr - tl->tlines[out->line];
646                         out->orig = tl->olines[out->line] + out->col;
647                 } else {
648                         out->col = 0;
649                         out->orig = tl->orig + tl->orig_size;
650                 }
651                 return 1;
652         } else if (ptr >= tl->orig && ptr <= tl->orig+tl->orig_size) {
653                 out->orig = ptr;
654                 out->line = find_line(ptr, tl->olines, line_count);
655                 if (out->line < line_count) {
656                         const char *tline_start = tl->tlines[out->line];
657                         const char *tline_end = out->line+1 < line_count ?
658                                 tl->tlines[out->line+1] :
659                                 tl->txt + tl->txt_size;
660                         
661                         out->col = ptr - tl->olines[out->line];
662                         out->txt = tline_start + out->col;
663                         
664                         if (out->txt > tline_end)
665                                 out->txt = tline_end;
666                 } else {
667                         out->col = 0;
668                         out->txt = tl->txt + tl->txt_size;
669                 }
670                 return 1;
671         } else {
672                 return 0;
673         }
674 }
675
676 static char *escape_string(darray_char *buf, const char *str, size_t size) {
677         const char *s = str, *e = s+size;
678         darray_from_lit(*buf, "");
679         
680         for (;s<e;s++) {
681                 char buffer[8];
682                 const char *esc = buffer;
683                 unsigned char c = (unsigned char)*s;
684                 if (ccontrol(c))
685                         sprintf(buffer, "\\x%02X", c);
686                 else switch(c) {
687                         case '\t': esc = "\\t"; break;
688                         case '\n': esc = "\\n"; break;
689                         case '\v': esc = "\\v"; break;
690                         case '\f': esc = "\\f"; break;
691                         case '\r': esc = "\\r"; break;
692                         case '"': esc = "\\\""; break;
693                         case '\\': esc = "\\\\"; break;
694                         default:
695                                 buffer[0] = c;
696                                 buffer[1] = 0;
697                 }
698                 darray_append_string(*buf, esc);
699         }
700         
701         return buf->item;
702 }
703
/*
 * Verify that txt[0..txt_size) equals orig[0..orig_size) with all
 * backslash-broken lines removed — i.e. that txt is a faithful product
 * of unbreak_backslash_broken_lines() applied to orig.
 * Returns 1 on match, 0 on any discrepancy.
 */
static int txt_orig_matches(const char *txt, size_t txt_size, const char *orig, size_t orig_size) {
	const char *ts = txt, *te = ts+txt_size;
	const char *os = orig, *oe = os+orig_size;
	
	do {
		const char *ob = os; //start of next backslash break
		const char *obe; //end of next backslash break
		size_t size; //amount of text to compare for this round
		
		while (ob<oe && *ob!='\\') ob++;
		obe = ob;
		if (obe < oe) { //there's a backslash
			obe++;
			while (obe<oe && cspace(*obe)) obe++;
			if (obe<oe && creturn(*obe)) { //there's a backslash-broken line
				obe++;
				//consume the second character of a two-character newline
				if (obe<oe && *obe == '\n'+'\r'-obe[-1])
					obe++;
			} else //this is just a plain old backslash
				ob = obe;
		}
		
		size = ob-os;
		
		//this segment of orig must appear verbatim in txt
		if (ts+size > te || memcmp(ts, os, size))
			return 0;
		ts += size;
		os = obe;
	} while (ts<te);
	
	//both buffers must be fully consumed
	if (ts != te || os != oe)
		return 0;
	
	return 1;
}
739
//If the text at [s, e) begins with a backslash line break ('\\', optional
//  spaces, then one or two line-terminator bytes), set *end to the first
//  byte past the break and return 1.  Otherwise return 0 (*end untouched).
static int is_backslash_break(const char **end, const char *s, const char *e) {
	if (s >= e || *s != '\\')
		return 0;
	s++;
	while (s < e && cspace(*s))
		s++;
	if (s >= e || !creturn(*s))
		return 0; //a lone backslash, not a line break
	s++;
	//consume the second byte of a CRLF/LFCR pair, if present
	if (s < e && *s == '\n'+'\r'-s[-1])
		s++;
	*end = s;
	return 1;
}
755
756 #define failed(fmt, ...) do {fprintf(err, fmt "\n", ##__VA_ARGS__); return 0; } while(0)
757
758 //tests that should pass on an untainted token list out of the tokenize() function
759 static int token_list_sanity_check_initial(const struct token_list *tl, FILE *err) {
760         struct token *first = tl->first;
761         struct token *last = tl->last;
762         struct token *i;
763         const char *txt=tl->txt, *orig=tl->orig;
764         const char *txt_e = txt+tl->txt_size, *orig_e = orig+tl->orig_size;
765         
766         if ((char*)first > (char*)last ||
767                 (size_t)((char*)last - (char*)first) % sizeof(struct token))
768                 failed("Token list pointers don't look right");
769         
770         //token list should not end with TOK_STARTLINE unless
771         //  the document is empty
772         if (last!=first && last->type==TOK_STARTLINE)
773                 return 0;
774         
775         for (i=first; i; i=i->next) {
776                 //Verify list links
777                 if (i != first && i->prev != i-1)
778                         failed("list.prev is incorrect");
779                 if (i != last && i->next != i+1)
780                         failed("list.next is incorrect");
781                 
782                 //Make sure txt segments fill the entire tl->txt
783                 if (i->txt != txt)
784                         failed("txt does not fill the token list");
785                 txt += i->txt_size;
786                 if (txt > txt_e)
787                         failed("txt is out of bounds");
788                 
789                 //Make sure orig segments fill the entire tl->orig
790                 if (i->orig != orig)
791                         failed("orig does not fill the token list");
792                 orig += i->orig_size;
793                 if (orig > orig_e)
794                         failed("orig is out of bounds");
795         }
796         
797         if (txt != txt_e)
798                 return 0;
799         if (orig != orig_e)
800                 return 0;
801         
802         return 1;
803 }
804
//Full consistency check of a token list: start/end sentinels, per-token
//  line/col and size invariants, txt/orig equivalence modulo backslash
//  breaks, tok_point_lookup correctness at every byte, and the olines/tlines
//  arrays.  Returns 1 if the list passes; otherwise prints a diagnostic to
//  err (via failed()) or silently returns 0 from the olines/tlines section.
int token_list_sanity_check(const struct token_list *tl, FILE *err) {
	struct token *first = tl->first;
	struct token *last = tl->last;
	struct token *i;
	//NOTE(review): never cleared, so the "tok_point_lookup failed" branch in
	//  the check() macro below always fires; presumably a hook for checking
	//  modified (non-initial) lists later — confirm intent
	int initial = 1;
	
	if (tl->first == NULL || tl->last == NULL)
		failed("Token list is completely empty");
	
	//the list must begin with an empty TOK_STARTLINE at line 0, col 0
	if (first->type!=TOK_STARTLINE ||
	    first->txt!=tl->txt || first->txt_size!=0 ||
	    first->orig!=tl->orig || first->orig_size!=0 ||
	    first->line!=0 || first->col!=0)
		failed("Token list does not start with a valid TOK_STARTLINE");
	
	if (first->prev!=NULL || last->next!=NULL)
		failed("Token edge links are not NULL");
	
	for (i=first; i; i=i->next) {
		//Verify line,col against both the txt and orig line tables
		if (tl->tlines[i->line] + i->col != i->txt)
			failed("line,col is wrong against txt");
		if (tl->olines[i->line] + i->col != i->orig)
			failed("line,col is wrong against orig");
		
		//Make sure tokens have proper sizes (orig can only be larger than
		//  txt, by the width of any backslash line breaks)
		if (i->type!=TOK_STARTLINE && (i->txt_size==0 || i->orig_size==0 || i->txt_size > i->orig_size) )
			failed("Token is empty");
		if (i->type==TOK_STARTLINE && (i->txt_size!=0 || i->orig_size!=0) )
			failed("TOK_STARTLINE is non-empty");
		
		//Make sure TOK_WHITE actually contains white tokens
		if (i->type==TOK_WHITE) {
			const char *s = i->txt, *e = s+i->txt_size;
			while (s<e && cwhite(*s)) s++;
			if (s != e)
				failed("TOK_WHITE does not contain only white characters");
		}
		
		//Make sure txt and orig match exactly except for backslash line breaks
		if (!txt_orig_matches(i->txt, i->txt_size, i->orig, i->orig_size)) {
			darray_char buf = darray_new();
			//two separate fprintf calls because escape_string writes
			//  into the same buf each time
			fprintf(err,
				"txt and orig do not match:\n"
				"\ttxt  = \"%s\"\n",
				escape_string(&buf, i->txt, i->txt_size) );
			fprintf(err, "\torig = \"%s\"\n",
				escape_string(&buf, i->orig, i->orig_size) );
			
			darray_free(buf);
			return 0;
		}
		
		//Make sure tok_point_lookup returns correct point
		{
			struct tok_point tok_point;
			const char *t=i->txt, *o=i->orig, *e=o+i->orig_size, *p;
			size_t line=i->line, col=i->col;
			
			//check(ptr): tok_point_lookup(ptr) must agree with our
			//  independently-tracked t/o/line/col position
			#define check(ptr) do { \
				if (tok_point_lookup(&tok_point, ptr, tl)) { \
					if (tok_point.txt != t || tok_point.orig != o) \
						failed("tok_point_lookup on txt reported incorrect txt/orig (orig is %d, should be %d)", \
						(int)(tok_point.orig-i->orig), (int)(o-i->orig)); \
					if (tok_point.line != line || tok_point.col != col) \
						failed("tok_point_lookup on txt reported incorrect line/col (off by %d, %d)", \
						(int)(tok_point.line-line), (int)(tok_point.col-col)); \
				} else if (initial) {\
					failed("tok_point_lookup failed on initial token list"); \
				} \
			} while(0)
			
			//walk every byte of orig (and the corresponding bytes of txt),
			//  advancing line/col by hand, and check() each position
			for (;;) {
				//backslash breaks exist only in orig: advance o alone
				while (is_backslash_break(&p, o, e)) {
					while (o<p) {
						check(o);
						o++;
						col++;
					}
					col = 0;
					line++;
				}
				if (o >= e)
					break;
				//ordinary bytes advance t and o in lockstep until the
				//  next backslash (or the end)
				do {
					if (creturn(*o)) {
						p = o+1;
						//'\n'+'\r'-p[-1] is the other line-break byte
						//  (CRLF/LFCR pair handling)
						if (p<e && *p=='\n'+'\r'-p[-1])
							p++;
						while (o<p) {
							check(o);
							check(t);
							t++, o++, col++;
						}
						line++;
						col = 0;
					} else {
						check(o);
						check(t);
						o++, t++, col++;
					}
				} while (o<e && *o!='\\');
			}
			
			#undef check
		}
	}; //NOTE(review): stray ';' after the for loop (harmless empty statement)
	
	//Verify olines and tlines
	{
		const char *s = tl->orig, *e = s+tl->orig_size;
		size_t i, line_count = tl->olines_size;
		
		//both line arrays should be exactly the same size
		if (tl->olines_size != tl->tlines_size)
			return 0;
		
		//walk orig line by line, cross-checking both line tables
		for (i=0; s<e; i++) {
			const char *line_start = s, *line_end;
			size_t tline_size, oline_size;
			const char *p;
			
			//tline size comes from the gap to the next tlines entry
			//  (or to the end of txt for the last line)
			if (i+1 < line_count)
				tline_size = tl->tlines[i+1] - tl->tlines[i];
			else
				tline_size = tl->txt+tl->txt_size - tl->tlines[i];
			
			//find the end of this orig line, including its terminator
			while (s<e && !creturn(*s)) s++;
			line_end = s;
			if (s<e) {
				s++;
				if (s<e && *s=='\n'+'\r'-s[-1])
					s++;
			}
			
			oline_size = s-line_start;
			
			//verify that olines elements are correct
			if (line_start != tl->olines[i])
				return 0;
			
			//verify that tlines elements are in range
			p = tl->tlines[i];
			if (p < tl->txt || p+tline_size > tl->txt+tl->txt_size)
				return 0;
			
			//verify that original lines have sizes >= the unbroken lines
			if (oline_size < tline_size)
				return 0;
			
			//if sizes are inconsistent, make sure it is due to a backslash escape
			if (oline_size > tline_size) {
				p = line_start+tline_size;
				if (*p++ != '\\')
					return 0;
				while (p<e && cspace(*p)) p++;
				if (p != line_end)
					return 0;
			}
			
			//make sure the text of both copies match
			if ( memcmp(
				tl->olines[i],
				tl->tlines[i],
				tline_size) )
				return 0;
		}
	}
	
	if (initial && !token_list_sanity_check_initial(tl, err))
		failed("Initial sanity checks failed.  Has the list been modified after it was returned from tokenize() ?");
	
	return 1;
}
979
980 #undef failed
981
982 static char *sprint_token_flags(char buf[3], struct token_flags flags) {
983         buf[0] = flags.pp ? 'p' : '-';
984         buf[1] = flags.pp_directive ? 'D' : '-';
985         buf[2] = 0;
986         return buf;
987 }
988
989 void token_list_dump(const struct token_list *tl, FILE *f) {
990         struct token *tok;
991         darray_char buf = darray_new();
992         size_t i = 0;
993         char buf2[8];
994         const char *token_type_str[] = {
995                 "TOK_INTEGER      ",
996                 "TOK_FLOATING     ",
997                 "TOK_OPERATOR     ",
998                 "TOK_KEYWORD      ",
999                 "TOK_IDENTIFIER   ",
1000                 "TOK_CHAR         ",
1001                 "TOK_STRING       ",
1002                 "TOK_LEADING_POUND",
1003                 "TOK_STRING_IQUOTE",
1004                 "TOK_STRING_IANGLE",
1005                 "TOK_CCOMMENT     ",
1006                 "TOK_CPPCOMMENT   ",
1007                 "TOK_WHITE        ",
1008                 "TOK_STARTLINE    ",
1009                 "TOK_STRAY        "
1010         };
1011         
1012         for (tok=tl->first; tok; tok=tok->next) {
1013                 fprintf(f, "%lu\t%s\t%s\t\"%s\"", (unsigned long)(i++),
1014                         token_type_str[tok->type],
1015                         sprint_token_flags(buf2, tok->flags),
1016                         escape_string(&buf, tok->txt, tok->txt_size));
1017                 #if 1 //print tok->orig
1018                 fprintf(f, "\t\"%s\"\n", escape_string(&buf, tok->orig, tok->orig_size));
1019                 #else
1020                 fprintf(f, "\n");
1021                 #endif
1022         }
1023         
1024         darray_free(buf);
1025 }
1026
1027 void tok_message_print(struct tok_message *m, struct token_list *tl) {
1028         struct tok_point pt;
1029         int resolved = tok_point_lookup(&pt, m->location, tl);
1030         
1031         if (tl->filename) {
1032                 printf("%s:%s", tl->filename, resolved ? "" : " ");
1033         }
1034         
1035         if (resolved) {
1036                 printf("%zu:%zu %s: %s\n",
1037                         pt.line+1, pt.col+1,
1038                         m->level==TM_DEBUG ? "debug" :
1039                         m->level==TM_INFO ? "info" :
1040                         m->level==TM_WARN ? "warning" :
1041                         m->level==TM_ERROR ? "error" :
1042                         m->level==TM_BUG ? "BUG" :
1043                         "???",
1044                         m->message);
1045         } else {
1046                 printf("%s: %s\n",
1047                         m->level==TM_DEBUG ? "debug" :
1048                         m->level==TM_INFO ? "info" :
1049                         m->level==TM_WARN ? "warning" :
1050                         m->level==TM_ERROR ? "error" :
1051                         m->level==TM_BUG ? "BUG" :
1052                         "???",
1053                         m->message);
1054         }
1055 }
1056
1057 void tok_message_dump(struct tok_message *m) {
1058         printf("%s: %s: %s\n",
1059                 m->level==TM_DEBUG ? "debug" :
1060                 m->level==TM_INFO ? "info" :
1061                 m->level==TM_WARN ? "warning" :
1062                 m->level==TM_ERROR ? "error" :
1063                 m->level==TM_BUG ? "BUG" :
1064                 "???", m->path, m->message);
1065 }
1066
1067 void tok_message_add(tok_message_queue *mq, enum tok_message_level level,
1068         const char *path, const char *loc, const char *fmt, ...) {
1069         struct tok_message msg = {.level=level, .path=path, .location=loc};
1070         va_list ap;
1071         
1072         if (!mq)
1073                 return;
1074         
1075         va_start(ap, fmt);
1076         msg.message = talloc_vasprintf(mq->item, fmt, ap);
1077         va_end(ap);
1078         
1079         enqueue(*mq, msg);
1080 }
1081
1082 void tok_message_queue_dump(const tok_message_queue *mq) {
1083         size_t i;
1084         for (i=0; i<queue_count(*mq); i++)
1085                 tok_message_dump(&queue_item(*mq, i));
1086 }
1087
1088
1089 #undef add
1090 #undef cstray
1091 #undef cident