Added module ccan_tokenizer from snapshot at:
[ccan] / ccan / ccan_tokenizer / ccan_tokenizer.c
1 /*
2         Copyright (c) 2009  Joseph A. Adams
3         All rights reserved.
4         
5         Redistribution and use in source and binary forms, with or without
6         modification, are permitted provided that the following conditions
7         are met:
8         1. Redistributions of source code must retain the above copyright
9            notice, this list of conditions and the following disclaimer.
10         2. Redistributions in binary form must reproduce the above copyright
11            notice, this list of conditions and the following disclaimer in the
12            documentation and/or other materials provided with the distribution.
13         3. The name of the author may not be used to endorse or promote products
14            derived from this software without specific prior written permission.
15         
16         THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17         IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18         OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19         IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20         INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21         NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22         DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23         THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24         (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25         THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "ccan_tokenizer.h"
29
30 #include <ccan/talloc/talloc.h>
31
32 #include <assert.h>
33
34 //Shown by operator precedence; based on
35 // http://tigcc.ticalc.org/doc/opers.html#precedence .
36
//Operator, keyword, and preprocessor-keyword table used to build the
//tokenizer's lookup dictionary (see tokenize()).  Operators are grouped
//by C operator precedence per the reference above.
//NOTE(review): dict_build() presumably performs longest-match lookup
//  itself, so the relative order of entries sharing a prefix (e.g. "<",
//  "<<", "<<=") should not matter -- verify against the dict module.
static struct dict_entry c_dictionary[] = {
//1. Highest
	{'(',"("}, {')',")"},
	{'[',"["}, {']',"]"},
	{'{',"{"}, {'}',"}"},
	{'.',"."},
	{PTR_OP,"->"},
	
//2. Unary
	{'!',"!"}, {'~',"~"}, //prefix
	{INC_OP,"++"}, {DEC_OP,"--"}, //prefix or postfix
	// + - & *
	
//3. Multiplicative
	// *
	{'/',"/"}, {'%',"%"},
	
//4. Additive
	// + -
	
//5. Shift
	{LEFT_OP,"<<"}, {RIGHT_OP,">>"},
	
//6. Relational
	{'<',"<"}, {'>',">"},
	{LE_OP,"<="}, {GE_OP,">="},
	
//7. Equality
	{EQ_OP,"=="}, {NE_OP,"!="},
	
//8. Bitwise AND
	// &
//9. Bitwise XOR
	{'^',"^"},
//10. Bitwise OR
	{'|',"|"},

//11. Logical AND
	{AND_OP,"&&"},
//12. Logical OR
	{OR_OP,"||"},

//13. Conditional
	{'?',"?"}, {':',":"},

//14. Assignment
	{'=',"="},
	{MUL_ASSIGN,"*="}, {DIV_ASSIGN,"/="}, {MOD_ASSIGN,"%="},
	{ADD_ASSIGN,"+="}, {SUB_ASSIGN,"-="},
	{AND_ASSIGN,"&="}, {XOR_ASSIGN,"^="}, {OR_ASSIGN,"|="},
	{LEFT_ASSIGN,"<<="}, {RIGHT_ASSIGN,">>="},
	
//15. Comma
	{',',","},

//16. Semicolon
	{';',";"},
	
//Misc
	{ELLIPSIS,"..."},
	{'#',"#"},
	{DOUBLE_POUND,"##"},

//Ambiguous
	//unary or binary
	{'+',"+"}, {'-',"-"},
	{'&',"&"}, {'*',"*"},

//Keywords
	{_BOOL, "_Bool"},
	{_COMPLEX, "_Complex"},
	{_IMAGINARY, "_Imaginary"},
	{BREAK, "break"},
	{CASE, "case"},
	{CHAR, "char"},
	{CONST, "const"},
	{CONTINUE, "continue"},
	{DEFAULT, "default"},
	{DO, "do"},
	{DOUBLE, "double"},
	{ELSE, "else"},
	{ENUM, "enum"},
	{EXTERN, "extern"},
	{FLOAT, "float"},
	{FOR, "for"},
	{GOTO, "goto"},
	{IF, "if"},
	{INLINE, "inline"},
	{INT, "int"},
	{LONG, "long"},
	{REGISTER, "register"},
	{RESTRICT, "restrict"},
	{RETURN, "return"},
	{SHORT, "short"},
	{SIGNED, "signed"},
	{SIZEOF, "sizeof"},
	{STATIC, "static"},
	{STRUCT, "struct"},
	{SWITCH, "switch"},
	{TYPEDEF, "typedef"},
	{UNION, "union"},
	{UNSIGNED, "unsigned"},
	{VOID, "void"},
	{VOLATILE, "volatile"},
	{WHILE, "while"},

//Preprocessor keywords (except those already defined)
	{VA_ARGS, "__VA_ARGS__"},
	{DEFINE, "define"},
	{ELIF, "elif"},
//	{ELSE, "else"},
	{ENDIF, "endif"},
	{ERROR, "error"},
//	{IF, "if"},
	{IFDEF, "ifdef"},
	{IFNDEF, "ifndef"},
	{INCLUDE, "include"},
	{LINE, "line"},
	{PRAGMA, "pragma"},
	{UNDEF, "undef"},
	{WARNING, "warning"},
};
159
#if 0 //dead code: old tokenizer constructor, compiled out -- kept for reference

//Allocate a tokenizer with its own message queue and dictionary.
//NOTE(review): superseded by the lazily-built global tokenizer_dict used
//  by tokenize(); delete once no longer useful as reference.
struct tokenizer *tokenizer_new(void *ctx) {
	struct tokenizer *t = talloc(ctx, struct tokenizer);
	t->ctx = ctx;
	queue_init(t->mq, t);
	t->dict = dict_build(t, c_dictionary, sizeof(c_dictionary)/sizeof(*c_dictionary));
	
	return t;
}

#endif
172
173 #define MESSAGE_PATH "tokenize/"
174
//Build tl->txt: a copy of tl->orig with backslash-broken lines joined.
//Also records line-start pointer arrays for both texts:
//  tl->olines -- start of each line in the original text
//  tl->tlines -- start of each corresponding line in the rewritten text
//Accepts "\n", "\r", "\r\n", and "\n\r" as line terminators, and warns
//(via mq) about whitespace between a continuation backslash and the
//newline.
static void unbreak_backslash_broken_lines(struct token_list *tl, tok_message_queue *mq) {
	const char *s = tl->orig, *e = s+tl->orig_size;
	array_char txt = array_new(tl);
	array(const char*) olines = array_new(tl);
	array(const char*) tlines = array_new(tl);
	
	do {
		const char *line_start = s, *line_end;
		const char *lnw; //last non-white
		size_t start_offset = txt.size;
		
		//scan to the next line and find the last non-white character in the line
		while (s<e && !creturn(*s)) s++;
		line_end = s;
		lnw = s;
		while (lnw>line_start && cspace(lnw[-1])) lnw--;
		if (s<e && creturn(*s)) {
			s++;
			//check for non-standard newlines (i.e. "\r", "\r\n", or "\n\r")
			//('\n'+'\r'-s[-1] is the *other* newline character)
			if (s<e && *s=='\n'+'\r'-s[-1])
				s++;
		}
		
		//add the backslash-break-free version of the text
		//(a line is "broken" when its last non-white char is a backslash
		// and it is not the final line of the document)
		if (lnw>line_start && lnw[-1]=='\\' && line_end<e) {
			array_append_items(txt, line_start, lnw-1-line_start);
			if (lnw<e && cspace(*lnw)) {
				tok_msg_warn(spaces_after_backslash_break, lnw,
					"Trailing spaces after backslash-broken line");
			}
		} else
			array_append_items(txt, line_start, s-line_start);
		
		//add the line starts for this line
		array_append(olines, line_start);
		array_append(tlines, (const char*)start_offset);
			//Since the txt buffer moves when expanded, we're storing offsets
			//  for now.  Once we're done building txt, we can add the base
			//  of it to all the offsets to make them pointers.
	} while (s<e);
	
	//stick a null terminator at the end of the text
	array_realloc(txt, txt.size+1);
	txt.item[txt.size] = 0;
	
	//convert the line start offsets to pointers
	array_for(i, tlines, *i = txt.item + (size_t)*i);
	
	tl->olines = olines.item;
	tl->olines_size = olines.size;
	tl->txt = txt.item;
	tl->txt_size = txt.size;
	tl->tlines = tlines.item;
	tl->tlines_size = tlines.size;
}
230
231 static void normal_keyword(struct token *tok) {
232         if (tok->type==TOK_KEYWORD &&
233                         (opkw_is_directive_only(tok->opkw) || tok->opkw==VA_ARGS))
234                 tok->type = TOK_IDENTIFIER;
235 }
236
237 static int define_parmlist_has_ellipsis(struct token *start, struct token *end) {
238         while (end>start && token_is_ignored(end-1)) end--;
239         return (end-->start && end->type==TOK_OPERATOR && end->opkw==ELLIPSIS);
240 }
241
//Used to label __VA_ARGS__ as keywords within applicable macro expansions
//Start should follow the DEFINE directive keyword
//
//Walks the tokens of a #define line: the defined identifier, then (if
//present) a parenthesized parameter list, then the expansion.  Inside the
//expansion of a variadic macro, __VA_ARGS__ is left as a keyword; in every
//other position, directive-only keywords and __VA_ARGS__ are demoted to
//identifiers via normal_keyword().
static void this_is_a_define(struct token *start, struct token *end) {
	struct token *i = start, *pl_start;
	
	//skip past the identifier that is defined
	while (i<end && token_is_ignored(i)) i++;
	if (i >= end)
		return;
	 //TODO:  check i->type to make sure it's an identifier, throw error otherwise
	normal_keyword(i++);
	
	//see if this is actually a variadic macro
	//NOTE(review): an object-like macro (no '(') also lands in not_va_args
	if (!(i<end && i->type==TOK_OPERATOR && i->opkw=='('))
		goto not_va_args;
	pl_start = ++i;
	while (i<end && !(i->type==TOK_OPERATOR && i->opkw==')'))
		normal_keyword(i++);
	if (!define_parmlist_has_ellipsis(pl_start, i++))
		goto not_va_args;
	
	//We have arrived at the macro expansion and know there is a ... argument
	//Thus, we'll only change directive-only keywords to identifiers
	for(; i<end; i++) {
		if (i->type==TOK_KEYWORD && opkw_is_directive_only(i->opkw))
			i->type = TOK_IDENTIFIER;
	}
	
not_va_args:
	//non-variadic: demote everything remaining, including __VA_ARGS__
	while (i < end)
		normal_keyword(i++);
}
274
275 //fill the flags field of each token and untangle keywords and such
276 static void finalize_line(struct token *start, struct token *end) {
277         struct token *i = start, *j;
278         
279         assert(start<end && start->type==TOK_STARTLINE);
280         i++;
281         
282         while (i<end && token_is_ignored(i)) i++;
283         
284         if (i<end && i->type==TOK_OPERATOR && i->opkw=='#') {
285         //preprocessor line
286                 i->type = TOK_LEADING_POUND;
287                 
288                 //set pp on all tokens in this line
289                 for (j=start; j<end; j++)
290                         j->flags.pp = 1;
291                 
292                 //find the relevant token after the '#'
293                 for (i++; i<end; i++) {
294                         if (!token_is_ignored(i)) {
295                                 i->flags.pp_directive = 1;
296                                 if (i->type==TOK_KEYWORD && !opkw_is_directive(i->opkw))
297                                         i->type = TOK_IDENTIFIER;
298                                 //TODO:  Handle invalid preprocessor directives (e.g. #+ )
299                                 
300                                 if (i->type==TOK_KEYWORD && i->opkw==DEFINE) {
301                                         for (j=i+1; j<end; j++)
302                                         this_is_a_define(i+1, end);
303                                 } else {
304                                         while (++i < end)
305                                                 normal_keyword(i);
306                                 }
307                                 break;
308                         }
309                 }
310         } else {
311         //normal line
312                 while (i < end)
313                         normal_keyword(i++);
314         }
315 }
316
//fill the list, flags, line, col, orig, and orig_size fields of each token
//convert identifiers mistaken for preprocessor keywords (e.g. ifdef) to identifiers
//
//Walks the token array once, advancing a cursor (lss) through the
//rewritten-text line starts (tl->tlines) to compute each token's
//line/column, then derives the token's position in the original text from
//tl->olines.  Each completed line (delimited by TOK_STARTLINE) is passed
//to finalize_line() for a second pass.
static void finalize(struct token_list *tl, struct token *start, struct token *end) {
	const char * const *lss = tl->tlines;
	const char * const *lse = lss + tl->tlines_size;
	struct token *i;
	struct token *startline = NULL;
	
	assert(start < end);
	
	tl->first = start;
	tl->last = end-1;
	
	for (i=start; ; i++) {
		//perform a second pass on each line
		if (i >= end || i->type == TOK_STARTLINE) {
			if (startline)
				finalize_line(startline, i);
			startline = i;
		}
		
		if (i >= end) {
			//the final token's orig extends to the end of the document
			end[-1].orig_size = tl->orig+tl->orig_size - end[-1].orig;
			break;
		}
		
		//set up the list links
		i->prev = i>start ? i-1 : NULL;
		i->next = i+1<end ? i+1 : NULL;
		
		//if i->txt starts on a later line, advance to it
		//(the strict "> lss[0]" guard matters only when consecutive line
		// starts are equal, i.e. empty rewritten lines)
		while (lss+1<lse && i->txt >= lss[1] && i->txt > lss[0])
			lss++;
		
		//set up line, col, orig, and orig_size
		i->line = lss - tl->tlines;
		i->col = i->txt - *lss;
		i->orig = tl->olines[i->line] + i->col;
		if (i > start)
			i[-1].orig_size = i->orig - i[-1].orig;
		
		assert(i->line < tl->olines_size);
		
		//clear the flags
		memset(&i->flags, 0, sizeof(i->flags));
	}
}
364
//Append a token spanning [orig, s) to the token array, with extra fields
//supplied as designated initializers.  NOTE(review): relies on locals
//named `orig`, `s`, and `array` at the expansion site (inside tokenize()).
#define add(...) do { \
		struct token tok = {__VA_ARGS__}; \
		tok.txt = orig; \
		tok.txt_size = s-orig; \
		array_append(array, tok); \
	} while (0)

//Characters that cannot legitimately appear in C source text.
#define cstray(c) (ccontrol(c) || cextended(c) || (c)=='@' || (c)=='`' || (c)=='\\')
//Characters valid within an identifier.
#define cident(c) (cletter(c) || cdigit(c) || c=='_' || c=='$')
	//believe it or not, $ is a valid character in an identifier
375
//Shared operator/keyword dictionary, built lazily on the first call to
//tokenize() and released at program exit via free_tokenizer_dict().
struct dict *tokenizer_dict = NULL;

//atexit() handler that frees the lazily-built global dictionary.
static void free_tokenizer_dict(void) {
	talloc_free(tokenizer_dict);
}
381
//Tokenize the C source text orig[0..orig_size) into a token_list.
//
//The returned list is talloc-allocated as a child of `orig`, so `orig` is
//presumably itself a talloc-allocated buffer and freeing it frees the
//list -- TODO confirm against callers.  Warnings and errors encountered
//while scanning are posted to `mq`.
//
//Scanning operates on the backslash-unbroken text produced by
//unbreak_backslash_broken_lines(); finalize() later maps tokens back to
//positions in the original text.
struct token_list *tokenize(const char *orig, size_t orig_size,
				tok_message_queue *mq) {
	struct token_list *tl = talloc(orig, struct token_list);
	const char *s, *e;
	size_t stray_count=0, cr_count=0;
	array(struct token) array = array_new(tl);
	int only_pound_include = 0;
		//nonzero while the scanner is positioned right after "#include",
		//so that "<...>" or "\"...\"" is lexed as an include string
	
	if (!tokenizer_dict) {
		tokenizer_dict = dict_build(NULL, c_dictionary,
			sizeof(c_dictionary)/sizeof(*c_dictionary));
		atexit(free_tokenizer_dict);
	}
	
	tl->orig = orig;
	tl->orig_size = orig_size;
	unbreak_backslash_broken_lines(tl, mq);
	tl->filename = NULL;
	
	s = tl->txt;
	e = s + tl->txt_size;
	
	array_appends(array, {
		.type = TOK_STARTLINE,
		.txt = s,
		.txt_size = 0
	} );
	
	while (s<e) {
		const char *orig = s; //NOTE(review): shadows the parameter `orig`
		char c = *s++;
		int added_something = 1;
		
		if (cstray(c)) {
			stray_count++;
			while (s<e && cstray(*s)) {
				s++;
				stray_count++;
			}
			add(.type = TOK_STRAY);
			
			/* This has the potential to be very noisy on binary
			   files, but it really is quite useful. */
			tok_msg_error(stray_segment, orig,
				"%zu stray characters", s-orig);
		
		} else if (creturn(c)) {
			//check for non-standard newlines (i.e. "\r", "\r\n", or "\n\r")
			//('\n'+'\r'-c is the *other* newline character)
			if (s<e && *s=='\n'+'\r'-c) {
				s++;
				cr_count++;
			} else if (c=='\r')
				cr_count++;
			
			add(.type = TOK_WHITE);
			orig = s;
			
			//add a TOK_STARTLINE for the next line unless this is the end of the document
			if (s<e)
				add(.type = TOK_STARTLINE);
			
			only_pound_include = 0;
		
		} else if (cspace(c)) {
			//skip over the remaining whitespace
			while (s<e && cspace(*s)) s++;
			add(.type = TOK_WHITE);
			added_something = 0;
		
		} else if (cdigit(c) || (c=='.' && s<e && cdigit(*s))) {
			//number (or a '.' immediately followed by a digit)
			struct token tok;
			s = read_cnumber(&tok, s-1, e, mq);
			tok.txt = orig;
			tok.txt_size = s-orig;
			array_append(array, tok);
			
		} else if (csymbol(c) || cident(c)) {
			if (only_pound_include && (c=='"' || c=='<')) { //include string
				char *include;
				char end = c=='"' ? '"' : '>';
				short type = c=='"' ? TOK_STRING_IQUOTE : TOK_STRING_IANGLE;
				
				while (s<e && !creturn(*s) && *s!=end) s++;
				include = talloc_strndup(tl, orig+1, s-(orig+1));
				
				if (s<e && *s==end) {
					s++;
				} else {
					tok_msg_error(include_missing_terminator, orig,
						"Missing terminating %c character", end);
				}
				
				add(.type = type,
					{.include = include});
			} else if (c=='\'' || c=='\"') { //character or string literal
				array_char string = array_new(tl);
				s = read_cstring(&string, s, e, c, mq);
				if (s<e) s++; //advance past endquote (if available)
				add(.type = c=='\'' ? TOK_CHAR : TOK_STRING,
				    {.string = string});
			} else if (c=='/' && s<e && (*s=='*' || *s=='/')) { //comment
				if (*s++ == '*') { /* C-style comment */
					const char *comment_start = s-2;
					for (;;s++) {
						if (s+1 >= e) {
							s = e;
							tok_msg_error(unterminated_comment, comment_start,
								"Unterminated comment");
							break;
						}
						if (s[0]=='*' && s[1]=='/') {
							s += 2;
							break;
						}
					}
					add(.type = TOK_CCOMMENT);
				} else { // C++-style comment
					while (s<e && !creturn(*s)) s++;
					add(.type = TOK_CPPCOMMENT);
				}
				added_something = 0;
			
			} else { //operator, keyword, or identifier
				struct dict_entry *ent;
				const char *ident_e = --s;
				while (ident_e<e && cident(*ident_e) ) ident_e++;
				
				ent = dict_lookup(tokenizer_dict, &s, e);
				if (cident(c)) { //keyword or identifier
					//only a keyword if the dictionary match covers
					//the whole identifier (e.g. "form" is not FOR)
					if (ent && s==ident_e) {
						add(.type = TOK_KEYWORD,
							{.opkw = ent->id});
						if (ent->id == INCLUDE) {
							//hacky way to lex #include string properly
							//(walk back: ignored tokens, '#', ignored
							// tokens, TOK_STARTLINE)
							struct token *ts = array.item;
							struct token *tp = ts+array.size-1;
							while (tp>ts && token_is_ignored(tp-1))
								tp--;
							if (tp>ts && token_is_op(tp-1, '#')) {
								tp--;
								while (tp>ts && token_is_ignored(tp-1))
									tp--;
								if (tp>ts && tp[-1].type==TOK_STARTLINE) {
									only_pound_include = 1;
									continue; //keep only_pound_include set
								}
							}
						}
					} else {
						s = ident_e;
						add(.type = TOK_IDENTIFIER);
					}
				} else if (ent) { //operator
					add(.type = TOK_OPERATOR,
					    {.opkw = ent->id});
				} else { //invalid symbol (shouldn't happen)
					tok_msg_bug(unrecognized_symbol, s,
						"Unrecognized symbol \'%c\'", c);
					s++;
					add(.type = TOK_STRAY);
				}
			}
		}
		
		if (added_something)
			only_pound_include = 0;
	}
	
	/*if (stray_count) {
		tok_msg_error(stray_characters, NULL,
			"%lu stray characters in text", (unsigned long)stray_count);
	}*/
	if (cr_count) {
		tok_msg_warn(nonstandard_newlines, NULL,
			"Text contains non-standard line terminators");
	}
	
	finalize(tl, array.item, array.item+array.size);
	
	return tl;
}
563
564 size_t token_list_count(const struct token_list *tl) {
565         size_t ret = 0;
566         const struct token *i;
567         
568         for (i=tl->first; i; i=i->next)
569                 ret++;
570         
571         return ret;
572 }
573
//Binary-search `lines` (an ascending array of line-start pointers) for the
//line containing `ptr`, i.e. the last index whose start is <= ptr.  When
//several consecutive entries share the same start (empty lines), the
//*last* of them is returned.
static size_t find_line(const char *ptr, const char * const *lines, size_t line_count) {
	size_t lo = 0;
	size_t span = line_count;
	
	while (span > 1) {
		size_t half = span >> 1;
		if (ptr < lines[lo + half]) {
			span = half;
		} else {
			lo += half;
			span -= half;
		}
	}
	
	//select the *last* of equivalent lines
	while (lo + 1 < line_count && lines[lo] == lines[lo + 1])
		lo++;
	
	// (don't) select the *first* of equivalent lines
	//while (lo > 0 && lo < line_count && lines[lo - 1] == lines[lo])
	//	lo--;
	
	return lo;
}
598
599 int tok_point_lookup(struct tok_point *out, const char *ptr,
600                         const struct token_list *tl) {
601         size_t line_count = tl->olines_size;
602         
603         memset(out, 0, sizeof(*out));
604         if (!tl)
605                 return 0;
606         
607         if (ptr >= tl->txt && ptr <= tl->txt+tl->txt_size) {
608                 out->txt = ptr;
609                 out->line = find_line(ptr, tl->tlines, line_count);
610                 if (out->line < line_count) {
611                         out->col = ptr - tl->tlines[out->line];
612                         out->orig = tl->olines[out->line] + out->col;
613                 } else {
614                         out->col = 0;
615                         out->orig = tl->orig + tl->orig_size;
616                 }
617                 return 1;
618         } else if (ptr >= tl->orig && ptr <= tl->orig+tl->orig_size) {
619                 out->orig = ptr;
620                 out->line = find_line(ptr, tl->olines, line_count);
621                 if (out->line < line_count) {
622                         const char *tline_start = tl->tlines[out->line];
623                         const char *tline_end = out->line+1 < line_count ?
624                                 tl->tlines[out->line+1] :
625                                 tl->txt + tl->txt_size;
626                         
627                         out->col = ptr - tl->olines[out->line];
628                         out->txt = tline_start + out->col;
629                         
630                         if (out->txt > tline_end)
631                                 out->txt = tline_end;
632                 } else {
633                         out->col = 0;
634                         out->txt = tl->txt + tl->txt_size;
635                 }
636                 return 1;
637         } else {
638                 return 0;
639         }
640 }
641
642 static char *escape_string(array_char *buf, const char *str, size_t size) {
643         const char *s = str, *e = s+size;
644         array_from_lit(*buf, "");
645         
646         for (;s<e;s++) {
647                 char buffer[8];
648                 const char *esc = buffer;
649                 unsigned char c = (unsigned char)*s;
650                 if (ccontrol(c))
651                         sprintf(buffer, "\\x%02X", c);
652                 else switch(c) {
653                         case '\t': esc = "\\t"; break;
654                         case '\n': esc = "\\n"; break;
655                         case '\v': esc = "\\v"; break;
656                         case '\f': esc = "\\f"; break;
657                         case '\r': esc = "\\r"; break;
658                         case '"': esc = "\\\""; break;
659                         case '\\': esc = "\\\\"; break;
660                         default:
661                                 buffer[0] = c;
662                                 buffer[1] = 0;
663                 }
664                 array_append_string(*buf, esc);
665         }
666         
667         return buf->item;
668 }
669
//Verify that txt[0..txt_size) equals orig[0..orig_size) once
//backslash-newline splices (backslash, optional spaces, newline) are
//removed from orig.  Returns 1 when the two texts correspond exactly,
//0 otherwise.  Used by the sanity checks below.
static int txt_orig_matches(const char *txt, size_t txt_size, const char *orig, size_t orig_size) {
	const char *ts = txt, *te = ts+txt_size;
	const char *os = orig, *oe = os+orig_size;
	
	do {
		const char *ob = os; //start of next backslash break
		const char *obe = os; //end of next backslash break
		size_t size; //amount of text to compare for this round
		
		while (ob<oe && *ob!='\\') ob++;
		obe = ob;
		if (obe < oe) { //there's a backslash
			obe++;
			while (obe<oe && cspace(*obe)) obe++;
			if (obe<oe && creturn(*obe)) { //there's a backslash-broken line
				obe++;
				//consume the second character of a two-character
				//newline ("\r\n" or "\n\r")
				if (obe<oe && *obe == '\n'+'\r'-obe[-1])
					obe++;
			} else //this is just a plain old backslash
				ob = obe;
		}
		
		size = ob-os;
		
		//compare the chunk before the splice
		if (ts+size > te || memcmp(ts, os, size))
			return 0;
		ts += size;
		os = obe;
	} while (ts<te);
	
	//both cursors must be fully consumed
	if (ts != te || os != oe)
		return 0;
	
	return 1;
}
705
//Return 1 (and set *end past the break) if [s, e) begins with a backslash
//line continuation: '\\', optional spaces, then a newline (any of "\n",
//"\r", "\r\n", "\n\r").  Return 0 otherwise; *end is untouched on failure.
static int is_backslash_break(const char **end, const char *s, const char *e) {
	if (s >= e || *s != '\\')
		return 0;
	s++;
	while (s < e && cspace(*s))
		s++;
	if (s >= e || !creturn(*s))
		return 0;
	s++;
	//consume the second character of a two-character newline
	if (s < e && *s == '\n' + '\r' - s[-1])
		s++;
	*end = s;
	return 1;
}
721
722 #define failed(fmt, ...) do {fprintf(err, fmt "\n", ##__VA_ARGS__); return 0; } while(0)
723
724 //tests that should pass on an untainted token list out of the tokenize() function
725 static int token_list_sanity_check_initial(const struct token_list *tl, FILE *err) {
726         struct token *first = tl->first;
727         struct token *last = tl->last;
728         struct token *i;
729         const char *txt=tl->txt, *orig=tl->orig;
730         const char *txt_e = txt+tl->txt_size, *orig_e = orig+tl->orig_size;
731         
732         if ((char*)first > (char*)last ||
733                 (size_t)((char*)last - (char*)first) % sizeof(struct token))
734                 failed("Token list pointers don't look right");
735         
736         //token list should not end with TOK_STARTLINE unless
737         //  the document is empty
738         if (last!=first && last->type==TOK_STARTLINE)
739                 return 0;
740         
741         for (i=first; i; i=i->next) {
742                 //Verify list links
743                 if (i != first && i->prev != i-1)
744                         failed("list.prev is incorrect");
745                 if (i != last && i->next != i+1)
746                         failed("list.next is incorrect");
747                 
748                 //Make sure txt segments fill the entire tl->txt
749                 if (i->txt != txt)
750                         failed("txt does not fill the token list");
751                 txt += i->txt_size;
752                 if (txt > txt_e)
753                         failed("txt is out of bounds");
754                 
755                 //Make sure orig segments fill the entire tl->orig
756                 if (i->orig != orig)
757                         failed("orig does not fill the token list");
758                 orig += i->orig_size;
759                 if (orig > orig_e)
760                         failed("orig is out of bounds");
761         }
762         
763         if (txt != txt_e)
764                 return 0;
765         if (orig != orig_e)
766                 return 0;
767         
768         return 1;
769 }
770
//Full sanity check of a token list.  Verifies the leading TOK_STARTLINE,
//edge links, per-token line/col bookkeeping, token sizes, that txt and orig
//match except for backslash line breaks, that tok_point_lookup resolves
//every byte of every token, and that the olines/tlines arrays describe the
//two text copies consistently.  Returns 1 on success, 0 on failure;
//diagnostics go to err via the failed() macro.
int token_list_sanity_check(const struct token_list *tl, FILE *err) {
	struct token *first = tl->first;
	struct token *last = tl->last;
	struct token *i;
	int initial = 1; //NOTE(review): always 1 here; presumably intended to
	                 //  distinguish pristine lists — confirm
	
	if (tl->first == NULL || tl->last == NULL)
		failed("Token list is completely empty");
	
	//The list must begin with an empty TOK_STARTLINE at line 0, col 0.
	if (first->type!=TOK_STARTLINE ||
	    first->txt!=tl->txt || first->txt_size!=0 ||
	    first->orig!=tl->orig || first->orig_size!=0 ||
	    first->line!=0 || first->col!=0)
		failed("Token list does not start with a valid TOK_STARTLINE");
	
	if (first->prev!=NULL || last->next!=NULL)
		failed("Token edge links are not NULL");
	
	for (i=first; i; i=i->next) {
		//Verify line,col against both text copies
		if (tl->tlines[i->line] + i->col != i->txt)
			failed("line,col is wrong against txt");
		if (tl->olines[i->line] + i->col != i->orig)
			failed("line,col is wrong against orig");
		
		//Make sure tokens have proper sizes: orig may only be larger
		//than txt (backslash breaks are removed from txt, never added).
		if (i->type!=TOK_STARTLINE && (i->txt_size==0 || i->orig_size==0 || i->txt_size > i->orig_size) )
			failed("Token is empty");
		if (i->type==TOK_STARTLINE && (i->txt_size!=0 || i->orig_size!=0) )
			failed("TOK_STARTLINE is non-empty");
		
		//Make sure TOK_WHITE actually contains white tokens
		if (i->type==TOK_WHITE) {
			const char *s = i->txt, *e = s+i->txt_size;
			while (s<e && cwhite(*s)) s++;
			if (s != e)
				failed("TOK_WHITE does not contain only white characters");
		}
		
		//Make sure txt and orig match exactly except for backslash line breaks
		if (!txt_orig_matches(i->txt, i->txt_size, i->orig, i->orig_size)) {
			array_char buf = array_new(NULL);
			fprintf(err,
				"txt and orig do not match:\n"
				"\ttxt  = \"%s\"\n",
				escape_string(&buf, i->txt, i->txt_size) );
			fprintf(err, "\torig = \"%s\"\n",
				escape_string(&buf, i->orig, i->orig_size) );
			
			array_free(buf);
			return 0;
		}
		
		//Make sure tok_point_lookup returns correct point
		{
			struct tok_point tok_point;
			const char *t=i->txt, *o=i->orig, *e=o+i->orig_size, *p;
			size_t line=i->line, col=i->col;
			
			//check(ptr): look up ptr and verify the reported
			//txt/orig/line/col against the values tracked below.
			//An unresolvable point is only fatal on an initial list.
			#define check(ptr) do { \
				if (tok_point_lookup(&tok_point, ptr, tl)) { \
					if (tok_point.txt != t || tok_point.orig != o) \
						failed("tok_point_lookup on txt reported incorrect txt/orig (orig is %d, should be %d)", \
						(int)(tok_point.orig-i->orig), (int)(o-i->orig)); \
					if (tok_point.line != line || tok_point.col != col) \
						failed("tok_point_lookup on txt reported incorrect line/col (off by %d, %d)", \
						(int)(tok_point.line-line), (int)(tok_point.col-col)); \
				} else if (initial) {\
					failed("tok_point_lookup failed on initial token list"); \
				} \
			} while(0)
			
			//Walk orig byte by byte, keeping t/line/col in step.
			for (;;) {
				//Backslash breaks exist only in orig; t does not advance.
				while (is_backslash_break(&p, o, e)) {
					while (o<p) {
						check(o);
						o++;
						col++;
					}
					col = 0;
					line++;
				}
				if (o >= e)
					break;
				do {
					if (creturn(*o)) {
						//Consume a one- or two-byte newline present
						//in both copies ('\n'+'\r'-c is the CR/LF
						//partner of c).
						p = o+1;
						if (p<e && *p=='\n'+'\r'-p[-1])
							p++;
						while (o<p) {
							check(o);
							check(t);
							t++, o++, col++;
						}
						line++;
						col = 0;
					} else {
						check(o);
						check(t);
						o++, t++, col++;
					}
				} while (o<e && *o!='\\');
			}
			
			#undef check
		}
	};
	
	//Verify olines and tlines
	{
		const char *s = tl->orig, *e = s+tl->orig_size;
		size_t i, line_count = tl->olines_size;
		
		//both line arrays should be exactly the same size
		if (tl->olines_size != tl->tlines_size)
			return 0;
		
		for (i=0; s<e; i++) {
			const char *line_start = s, *line_end;
			size_t tline_size, oline_size;
			const char *p;
			
			//Size of line i in txt: distance to the next tlines entry,
			//or to the end of txt for the last line.
			if (i+1 < line_count)
				tline_size = tl->tlines[i+1] - tl->tlines[i];
			else
				tline_size = tl->txt+tl->txt_size - tl->tlines[i];
			
			//Scan to the end of this orig line, consuming a one- or
			//two-byte newline sequence.
			while (s<e && !creturn(*s)) s++;
			line_end = s;
			if (s<e) {
				s++;
				if (s<e && *s=='\n'+'\r'-s[-1])
					s++;
			}
			
			oline_size = s-line_start;
			
			//verify that olines elements are correct
			if (line_start != tl->olines[i])
				return 0;
			
			//verify that tlines elements are in range
			p = tl->tlines[i];
			if (p < tl->txt || p+tline_size > tl->txt+tl->txt_size)
				return 0;
			
			//verify that original lines have sizes >= the unbroken lines
			if (oline_size < tline_size)
				return 0;
			
			//if sizes are inconsistent, make sure it is due to a backslash escape
			if (oline_size > tline_size) {
				p = line_start+tline_size;
				if (*p++ != '\\')
					return 0;
				while (p<e && cspace(*p)) p++;
				if (p != line_end)
					return 0;
			}
			
			//make sure the text of both copies match
			if ( memcmp(
				tl->olines[i],
				tl->tlines[i],
				tline_size) )
				return 0;
		}
	}
	
	if (initial && !token_list_sanity_check_initial(tl, err))
		failed("Initial sanity checks failed.  Has the list been modified after it was returned from tokenize() ?");
	
	return 1;
}
945
946 #undef failed
947
948 static char *sprint_token_flags(char buf[3], struct token_flags flags) {
949         buf[0] = flags.pp ? 'p' : '-';
950         buf[1] = flags.pp_directive ? 'D' : '-';
951         buf[2] = 0;
952         return buf;
953 }
954
955 void token_list_dump(const struct token_list *tl, FILE *f) {
956         struct token *tok;
957         array_char buf = array_new(NULL);
958         size_t i = 0;
959         char buf2[8];
960         const char *token_type_str[] = {
961                 "TOK_INTEGER      ",
962                 "TOK_FLOATING     ",
963                 "TOK_OPERATOR     ",
964                 "TOK_KEYWORD      ",
965                 "TOK_IDENTIFIER   ",
966                 "TOK_CHAR         ",
967                 "TOK_STRING       ",
968                 "TOK_LEADING_POUND",
969                 "TOK_STRING_IQUOTE",
970                 "TOK_STRING_IANGLE",
971                 "TOK_CCOMMENT     ",
972                 "TOK_CPPCOMMENT   ",
973                 "TOK_WHITE        ",
974                 "TOK_STARTLINE    ",
975                 "TOK_STRAY        "
976         };
977         
978         for (tok=tl->first; tok; tok=tok->next) {
979                 fprintf(f, "%lu\t%s\t%s\t\"%s\"", (unsigned long)(i++),
980                         token_type_str[tok->type],
981                         sprint_token_flags(buf2, tok->flags),
982                         escape_string(&buf, tok->txt, tok->txt_size));
983                 #if 1 //print tok->orig
984                 fprintf(f, "\t\"%s\"\n", escape_string(&buf, tok->orig, tok->orig_size));
985                 #else
986                 fprintf(f, "\n");
987                 #endif
988         }
989         
990         array_free(buf);
991 }
992
993 void tok_message_print(struct tok_message *m, struct token_list *tl) {
994         struct tok_point pt;
995         int resolved = tok_point_lookup(&pt, m->location, tl);
996         
997         if (tl->filename) {
998                 printf("%s:%s", tl->filename, resolved ? "" : " ");
999         }
1000         
1001         if (resolved) {
1002                 printf("%zu:%zu %s: %s\n",
1003                         pt.line+1, pt.col+1,
1004                         m->level==TM_DEBUG ? "debug" :
1005                         m->level==TM_INFO ? "info" :
1006                         m->level==TM_WARN ? "warning" :
1007                         m->level==TM_ERROR ? "error" :
1008                         m->level==TM_BUG ? "BUG" :
1009                         "???",
1010                         m->message);
1011         } else {
1012                 printf("%s: %s\n",
1013                         m->level==TM_DEBUG ? "debug" :
1014                         m->level==TM_INFO ? "info" :
1015                         m->level==TM_WARN ? "warning" :
1016                         m->level==TM_ERROR ? "error" :
1017                         m->level==TM_BUG ? "BUG" :
1018                         "???",
1019                         m->message);
1020         }
1021 }
1022
1023 void tok_message_dump(struct tok_message *m) {
1024         printf("%s: %s: %s\n",
1025                 m->level==TM_DEBUG ? "debug" :
1026                 m->level==TM_INFO ? "info" :
1027                 m->level==TM_WARN ? "warning" :
1028                 m->level==TM_ERROR ? "error" :
1029                 m->level==TM_BUG ? "BUG" :
1030                 "???", m->path, m->message);
1031 }
1032
1033 void tok_message_add(tok_message_queue *mq, enum tok_message_level level,
1034         const char *path, const char *loc, const char *fmt, ...) {
1035         struct tok_message msg = {.level=level, .path=path, .location=loc};
1036         va_list ap;
1037         
1038         if (!mq)
1039                 return;
1040         
1041         va_start(ap, fmt);
1042         msg.message = talloc_vasprintf(mq->item, fmt, ap);
1043         va_end(ap);
1044         
1045         enqueue(*mq, msg);
1046 }
1047
1048 void tok_message_queue_dump(const tok_message_queue *mq) {
1049         size_t i;
1050         for (i=0; i<queue_count(*mq); i++)
1051                 tok_message_dump(&queue_item(*mq, i));
1052 }
1053
1054
1055 #undef add
1056 #undef cstray
1057 #undef cident