ccan_tokenizer: update to be compatible with darray.
[ccan] / ccan / ccan_tokenizer / ccan_tokenizer.c
1 /*
2         Copyright (c) 2009  Joseph A. Adams
3         All rights reserved.
4         
5         Redistribution and use in source and binary forms, with or without
6         modification, are permitted provided that the following conditions
7         are met:
8         1. Redistributions of source code must retain the above copyright
9            notice, this list of conditions and the following disclaimer.
10         2. Redistributions in binary form must reproduce the above copyright
11            notice, this list of conditions and the following disclaimer in the
12            documentation and/or other materials provided with the distribution.
13         3. The name of the author may not be used to endorse or promote products
14            derived from this software without specific prior written permission.
15         
16         THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17         IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18         OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19         IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20         INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21         NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22         DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23         THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24         (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25         THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "ccan_tokenizer.h"
29
30 #include <ccan/talloc/talloc.h>
31
32 #include <assert.h>
33
34 //Shown by operator precedence; based on
35 // http://tigcc.ticalc.org/doc/opers.html#precedence .
36
37 static struct dict_entry c_dictionary[] = {
38 //1. Highest
39         {'(',"("}, {')',")"},
40         {'[',"["}, {']',"]"},
41         {'{',"{"}, {'}',"}"},
42         {'.',"."},
43         {PTR_OP,"->"},
44         
45 //2. Unary
46         {'!',"!"}, {'~',"~"}, //prefix
47         {INC_OP,"++"}, {DEC_OP,"--"}, //prefix or postfix
48         // + - & *
49         
50 //3. Multiplicative
51         // *
52         {'/',"/"}, {'%',"%"},
53         
54 //4. Additive
55         // + -
56         
57 //5. Shift
58         {LEFT_OP,"<<"}, {RIGHT_OP,">>"},
59         
60 //6. Relational
61         {'<',"<"}, {'>',">"},
62         {LE_OP,"<="}, {GE_OP,">="},
63         
64 //7. Equality
65         {EQ_OP,"=="}, {NE_OP,"!="},
66         
67 //8. Bitwise AND
68         // &
69 //9. Bitwise XOR
70         {'^',"^"},
71 //10. Bitwise OR
72         {'|',"|"},
73
74 //11. Logical AND
75         {AND_OP,"&&"},
76 //12. Logical OR
77         {OR_OP,"||"},
78
79 //13. Conditional
80         {'?',"?"}, {':',":"},
81
82 //14. Assignment
83         {'=',"="},
84         {MUL_ASSIGN,"*="}, {DIV_ASSIGN,"/="}, {MOD_ASSIGN,"%="},
85         {ADD_ASSIGN,"+="}, {SUB_ASSIGN,"-="},
86         {AND_ASSIGN,"&="}, {XOR_ASSIGN,"^="}, {OR_ASSIGN,"|="},
87         {LEFT_ASSIGN,"<<="}, {RIGHT_ASSIGN,">>="},
88         
89 //15. Comma
90         {',',","},
91
92 //16. Semicolon
93         {';',";"},
94         
95 //Misc
96         {ELLIPSIS,"..."},
97         {'#',"#"},
98         {DOUBLE_POUND,"##"},
99
100 //Ambiguous
101         //unary or binary
102         {'+',"+"}, {'-',"-"},
103         {'&',"&"}, {'*',"*"},
104
105 //Keywords
106         {_BOOL, "_Bool"},
107         {_COMPLEX, "_Complex"},
108         {_IMAGINARY, "_Imaginary"},
109         {BREAK, "break"},
110         {CASE, "case"},
111         {CHAR, "char"},
112         {CONST, "const"},
113         {CONTINUE, "continue"},
114         {DEFAULT, "default"},
115         {DO, "do"},
116         {DOUBLE, "double"},
117         {ELSE, "else"},
118         {ENUM, "enum"},
119         {EXTERN, "extern"},
120         {FLOAT, "float"},
121         {FOR, "for"},
122         {GOTO, "goto"},
123         {IF, "if"},
124         {INLINE, "inline"},
125         {INT, "int"},
126         {LONG, "long"},
127         {REGISTER, "register"},
128         {RESTRICT, "restrict"},
129         {RETURN, "return"},
130         {SHORT, "short"},
131         {SIGNED, "signed"},
132         {SIZEOF, "sizeof"},
133         {STATIC, "static"},
134         {STRUCT, "struct"},
135         {SWITCH, "switch"},
136         {TYPEDEF, "typedef"},
137         {UNION, "union"},
138         {UNSIGNED, "unsigned"},
139         {VOID, "void"},
140         {VOLATILE, "volatile"},
141         {WHILE, "while"},
142
143 //Preprocessor keywords (except those already defined)
144         {VA_ARGS, "__VA_ARGS__"},
145         {DEFINE, "define"},
146         {ELIF, "elif"},
147 //      {ELSE, "else"},
148         {ENDIF, "endif"},
149         {ERROR, "error"},
150 //      {IF, "if"},
151         {IFDEF, "ifdef"},
152         {IFNDEF, "ifndef"},
153         {INCLUDE, "include"},
154         {LINE, "line"},
155         {PRAGMA, "pragma"},
156         {UNDEF, "undef"},
157         {WARNING, "warning"},
158 };
159
160 #if 0
161
162 struct tokenizer *tokenizer_new(void *ctx) {
163         struct tokenizer *t = talloc(ctx, struct tokenizer);
164         t->ctx = ctx;
165         queue_init(t->mq, t);
166         t->dict = dict_build(t, c_dictionary, sizeof(c_dictionary)/sizeof(*c_dictionary));
167         
168         return t;
169 }
170
171 #endif
172
173 static int talloc_darray_destructor(void *ptr);
174
175 /*
176  * darray(T) *talloc_darray(const void *context);
177  *
178  * Create a new darray anchored in a talloc buffer.
179  * When this pointer is freed, the darray will be freed as well.
180  */
181 static void *talloc_darray(const void *context)
182 {
183         void *ret = talloc(context, darray(void));
184         darray_init(*(darray(void)*)ret);
185         talloc_set_destructor(ret, talloc_darray_destructor);
186         return ret;
187 }
188
189 static int talloc_darray_destructor(void *ptr)
190 {
191         darray(void) *arr = ptr;
192         free(arr->item);
193         return 0;
194 }
195
196 #define MESSAGE_PATH "tokenize/"
197
198 static void unbreak_backslash_broken_lines(struct token_list *tl, tok_message_queue *mq) {
199         const char *s = tl->orig, *e = s+tl->orig_size;
200         darray_char         *txt    = talloc_darray(tl);
201         darray(const char*) *olines = talloc_darray(tl);
202         darray(const char*) *tlines = talloc_darray(tl);
203         
204         do {
205                 const char *line_start = s, *line_end;
206                 const char *lnw; //last non-white
207                 size_t start_offset = txt->size;
208                 
209                 //scan to the next line and find the last non-white character in the line
210                 while (s<e && !creturn(*s)) s++;
211                 line_end = s;
212                 lnw = s;
213                 while (lnw>line_start && cspace(lnw[-1])) lnw--;
214                 if (s<e && creturn(*s)) {
215                         s++;
216                         //check for non-standard newlines (i.e. "\r", "\r\n", or "\n\r")
217                         if (s<e && *s=='\n'+'\r'-s[-1])
218                                 s++;
219                 }
220                 
221                 //add the backslash-break-free version of the text
222                 if (lnw>line_start && lnw[-1]=='\\' && line_end<e) {
223                         darray_append_items(*txt, line_start, lnw-1-line_start);
224                         if (lnw<e && cspace(*lnw)) {
225                                 tok_msg_warn(spaces_after_backslash_break, lnw,
226                                         "Trailing spaces after backslash-broken line");
227                         }
228                 } else
229                         darray_append_items(*txt, line_start, s-line_start);
230                 
231                 //add the line starts for this line
232                 darray_append(*olines, line_start);
233                 darray_append(*tlines, (const char*)start_offset);
234                         //Since the txt buffer moves when expanded, we're storing offsets
235                         //  for now.  Once we're done building txt, we can add the base
236                         //  of it to all the offsets to make them pointers.
237         } while (s<e);
238         
239         //stick a null terminator at the end of the text
240         darray_realloc(*txt, txt->size+1);
241         txt->item[txt->size] = 0;
242         
243         //convert the line start offsets to pointers
244         {
245                 const char **i;
246                 darray_foreach(i, *tlines)
247                         *i = txt->item + (size_t)(*i);
248         }
249         
250         tl->olines = olines->item;
251         tl->olines_size = olines->size;
252         tl->txt = txt->item;
253         tl->txt_size = txt->size;
254         tl->tlines = tlines->item;
255         tl->tlines_size = tlines->size;
256 }
257
258 static void normal_keyword(struct token *tok) {
259         if (tok->type==TOK_KEYWORD &&
260                         (opkw_is_directive_only(tok->opkw) || tok->opkw==VA_ARGS))
261                 tok->type = TOK_IDENTIFIER;
262 }
263
264 static int define_parmlist_has_ellipsis(struct token *start, struct token *end) {
265         while (end>start && token_is_ignored(end-1)) end--;
266         return (end-->start && end->type==TOK_OPERATOR && end->opkw==ELLIPSIS);
267 }
268
269 //Used to label __VA_ARGS__ as keywords within applicable macro expansions
270 //Start should follow the DEFINE directive keyword
271 static void this_is_a_define(struct token *start, struct token *end) {
272         struct token *i = start, *pl_start;
273         
274         //skip past the identifier that is defined
275         while (i<end && token_is_ignored(i)) i++;
276         if (i >= end)
277                 return;
278          //TODO:  check i->type to make sure it's an identifier, throw error otherwise
279         normal_keyword(i++);
280         
281         //see if this is actually a variadic macro
282         if (!(i<end && i->type==TOK_OPERATOR && i->opkw=='('))
283                 goto not_va_args;
284         pl_start = ++i;
285         while (i<end && !(i->type==TOK_OPERATOR && i->opkw==')'))
286                 normal_keyword(i++);
287         if (!define_parmlist_has_ellipsis(pl_start, i++))
288                 goto not_va_args;
289         
290         //We have arrived at the macro expansion and know there is a ... argument
291         //Thus, we'll only change directive-only keywords to identifiers
292         for(; i<end; i++) {
293                 if (i->type==TOK_KEYWORD && opkw_is_directive_only(i->opkw))
294                         i->type = TOK_IDENTIFIER;
295         }
296         
297 not_va_args:
298         while (i < end)
299                 normal_keyword(i++);
300 }
301
302 //fill the flags field of each token and untangle keywords and such
303 static void finalize_line(struct token *start, struct token *end) {
304         struct token *i = start, *j;
305         
306         assert(start<end && start->type==TOK_STARTLINE);
307         i++;
308         
309         while (i<end && token_is_ignored(i)) i++;
310         
311         if (i<end && i->type==TOK_OPERATOR && i->opkw=='#') {
312         //preprocessor line
313                 i->type = TOK_LEADING_POUND;
314                 
315                 //set pp on all tokens in this line
316                 for (j=start; j<end; j++)
317                         j->flags.pp = 1;
318                 
319                 //find the relevant token after the '#'
320                 for (i++; i<end; i++) {
321                         if (!token_is_ignored(i)) {
322                                 i->flags.pp_directive = 1;
323                                 if (i->type==TOK_KEYWORD && !opkw_is_directive(i->opkw))
324                                         i->type = TOK_IDENTIFIER;
325                                 //TODO:  Handle invalid preprocessor directives (e.g. #+ )
326                                 
327                                 if (i->type==TOK_KEYWORD && i->opkw==DEFINE) {
328                                         for (j=i+1; j<end; j++)
329                                         this_is_a_define(i+1, end);
330                                 } else {
331                                         while (++i < end)
332                                                 normal_keyword(i);
333                                 }
334                                 break;
335                         }
336                 }
337         } else {
338         //normal line
339                 while (i < end)
340                         normal_keyword(i++);
341         }
342 }
343
344 //fill the list, flags, line, col, orig, and orig_size fields of each token
345 //convert identifiers mistaken for preprocessor keywords (e.g. ifdef) to identifiers
346 static void finalize(struct token_list *tl, struct token *start, struct token *end) {
347         const char * const *lss = tl->tlines;
348         const char * const *lse = lss + tl->tlines_size;
349         struct token *i;
350         struct token *startline = NULL;
351         
352         assert(start < end);
353         
354         tl->first = start;
355         tl->last = end-1;
356         
357         for (i=start; ; i++) {
358                 //perform a second pass on each line
359                 if (i >= end || i->type == TOK_STARTLINE) {
360                         if (startline)
361                                 finalize_line(startline, i);
362                         startline = i;
363                 }
364                 
365                 if (i >= end) {
366                         end[-1].orig_size = tl->orig+tl->orig_size - end[-1].orig;
367                         break;
368                 }
369                 
370                 //set up the list links
371                 i->prev = i>start ? i-1 : NULL;
372                 i->next = i+1<end ? i+1 : NULL;
373                 
374                 //if i->txt starts on a later line, advance to it
375                 while (lss+1<lse && i->txt >= lss[1] && i->txt > lss[0])
376                         lss++;
377                 
378                 //set up line, col, orig, and orig_size
379                 i->line = lss - tl->tlines;
380                 i->col = i->txt - *lss;
381                 i->orig = tl->olines[i->line] + i->col;
382                 if (i > start)
383                         i[-1].orig_size = i->orig - i[-1].orig;
384                 
385                 assert(i->line < tl->olines_size);
386                 
387                 //clear the flags
388                 memset(&i->flags, 0, sizeof(i->flags));
389         }
390 }
391
392 #define add(...) do { \
393                 struct token tok = {__VA_ARGS__}; \
394                 tok.txt = orig; \
395                 tok.txt_size = s-orig; \
396                 darray_append(*arr, tok); \
397         } while (0)
398
399 #define cstray(c) (ccontrol(c) || cextended(c) || (c)=='@' || (c)=='`' || (c)=='\\')
400 #define cident(c) (cletter(c) || cdigit(c) || c=='_' || c=='$')
401         //believe it or not, $ is a valid character in an identifier
402
403 struct dict *tokenizer_dict = NULL;
404
405 static void free_tokenizer_dict(void) {
406         talloc_free(tokenizer_dict);
407 }
408
409 struct token_list *tokenize(const void *tcontext, const char *orig, size_t orig_size,
410                                 tok_message_queue *mq) {
411         struct token_list *tl = talloc(tcontext, struct token_list);
412         const char *s, *e;
413         size_t stray_count=0, cr_count=0;
414         darray(struct token) *arr = talloc_darray(tl);
415         int only_pound_include = 0;
416         
417         if (!tokenizer_dict) {
418                 tokenizer_dict = dict_build(NULL, c_dictionary,
419                         sizeof(c_dictionary)/sizeof(*c_dictionary));
420                 atexit(free_tokenizer_dict);
421         }
422         
423         tl->orig = orig;
424         tl->orig_size = orig_size;
425         unbreak_backslash_broken_lines(tl, mq);
426         tl->filename = NULL;
427         
428         s = tl->txt;
429         e = s + tl->txt_size;
430         
431         darray_appends_t(*arr, struct token, {
432                 .type = TOK_STARTLINE,
433                 .txt = s,
434                 .txt_size = 0
435         } );
436         
437         while (s<e) {
438                 const char *orig = s;
439                 char c = *s++;
440                 int added_something = 1;
441                 
442                 if (cstray(c)) {
443                         stray_count++;
444                         while (s<e && cstray(*s)) {
445                                 s++;
446                                 stray_count++;
447                         }
448                         add(.type = TOK_STRAY);
449                         
450                         /* This has the potential to be very noisy on binary
451                            files, but it really is quite useful. */
452                         tok_msg_error(stray_segment, orig,
453                                 "%zu stray characters", s-orig);
454                 
455                 } else if (creturn(c)) {
456                         //check for non-standard newlines (i.e. "\r", "\r\n", or "\n\r")
457                         if (s<e && *s=='\n'+'\r'-c) {
458                                 s++;
459                                 cr_count++;
460                         } else if (c=='\r')
461                                 cr_count++;
462                         
463                         add(.type = TOK_WHITE);
464                         orig = s;
465                         
466                         //add a TOK_STARTLINE for the next line unless this is the end of the document
467                         if (s<e)
468                                 add(.type = TOK_STARTLINE);
469                         
470                         only_pound_include = 0;
471                 
472                 } else if (cspace(c)) {
473                         //skip over the remaining whitespace
474                         while (s<e && cspace(*s)) s++;
475                         add(.type = TOK_WHITE);
476                         added_something = 0;
477                 
478                 } else if (cdigit(c) || (c=='.' && s<e && cdigit(*s))) {
479                         struct token tok;
480                         s = read_cnumber(&tok, s-1, e, mq);
481                         tok.txt = orig;
482                         tok.txt_size = s-orig;
483                         darray_append(*arr, tok);
484                         
485                 } else if (csymbol(c) || cident(c)) {
486                         if (only_pound_include && (c=='"' || c=='<')) { //include string
487                                 char *include;
488                                 char end = c=='"' ? '"' : '>';
489                                 short type = c=='"' ? TOK_STRING_IQUOTE : TOK_STRING_IANGLE;
490                                 
491                                 while (s<e && !creturn(*s) && *s!=end) s++;
492                                 include = talloc_strndup(tl, orig+1, s-(orig+1));
493                                 
494                                 if (s<e && *s==end) {
495                                         s++;
496                                 } else {
497                                         tok_msg_error(include_missing_terminator, orig,
498                                                 "Missing terminating %c character", end);
499                                 }
500                                 
501                                 add(.type = type,
502                                         {.include = include});
503                                 
504                         } else if (c=='\'' || c=='\"') { //character or string literal
505                                 darray_char *string = talloc_darray(tl);
506                                 s = read_cstring(string, s, e, c, mq);
507                                 if (s<e) s++; //advance past endquote (if available)
508                                 add(.type = c=='\'' ? TOK_CHAR : TOK_STRING,
509                                     {.string = string});
510                                 
511                                 if (c=='\'' && string->size==0) {
512                                         tok_msg_error(empty_char_constant, orig,
513                                                 "Empty character constant");
514                                 }
515                                 
516                         } else if (c=='/' && s<e && (*s=='*' || *s=='/')) { //comment
517                                 if (*s++ == '*') { /* C-style comment */
518                                         const char *comment_start = s-2;
519                                         for (;;s++) {
520                                                 if (s+1 >= e) {
521                                                         s = e;
522                                                         tok_msg_error(unterminated_comment, comment_start,
523                                                                 "Unterminated comment");
524                                                         break;
525                                                 }
526                                                 if (s[0]=='*' && s[1]=='/') {
527                                                         s += 2;
528                                                         break;
529                                                 }
530                                         }
531                                         add(.type = TOK_CCOMMENT);
532                                 } else { // C++-style comment
533                                         while (s<e && !creturn(*s)) s++;
534                                         add(.type = TOK_CPPCOMMENT);
535                                 }
536                                 added_something = 0;
537                         
538                         } else { //operator, keyword, or identifier
539                                 struct dict_entry *ent;
540                                 const char *ident_e = --s;
541                                 while (ident_e<e && cident(*ident_e) ) ident_e++;
542                                 
543                                 ent = dict_lookup(tokenizer_dict, &s, e);
544                                 if (cident(c)) { //keyword or identifier
545                                         if (ent && s==ident_e) {
546                                                 add(.type = TOK_KEYWORD,
547                                                         {.opkw = ent->id});
548                                                 if (ent->id == INCLUDE) {
549                                                         //hacky way to lex #include string properly
550                                                         struct token *ts = arr->item;
551                                                         struct token *tp = ts+arr->size-1;
552                                                         while (tp>ts && token_is_ignored(tp-1))
553                                                                 tp--;
554                                                         if (tp>ts && token_is_op(tp-1, '#')) {
555                                                                 tp--;
556                                                                 while (tp>ts && token_is_ignored(tp-1))
557                                                                         tp--;
558                                                                 if (tp>ts && tp[-1].type==TOK_STARTLINE) {
559                                                                         only_pound_include = 1;
560                                                                         continue;
561                                                                 }
562                                                         }
563                                                 }
564                                         } else {
565                                                 s = ident_e;
566                                                 add(.type = TOK_IDENTIFIER);
567                                         }
568                                 } else if (ent) { //operator
569                                         add(.type = TOK_OPERATOR,
570                                             {.opkw = ent->id});
571                                 } else { //invalid symbol (shouldn't happen)
572                                         tok_msg_bug(unrecognized_symbol, s,
573                                                 "Unrecognized symbol \'%c\'", c);
574                                         s++;
575                                         add(.type = TOK_STRAY);
576                                 }
577                         }
578                 }
579                 
580                 if (added_something)
581                         only_pound_include = 0;
582         }
583         
584         /*if (stray_count) {
585                 tok_msg_error(stray_characters, NULL,
586                         "%lu stray characters in text", (unsigned long)stray_count);
587         }*/
588         if (cr_count) {
589                 tok_msg_warn(nonstandard_newlines, NULL,
590                         "Text contains non-standard line terminators");
591         }
592         
593         finalize(tl, arr->item, arr->item+arr->size);
594         
595         return tl;
596 }
597
598 size_t token_list_count(const struct token_list *tl) {
599         size_t ret = 0;
600         const struct token *i;
601         
602         for (i=tl->first; i; i=i->next)
603                 ret++;
604         
605         return ret;
606 }
607
608 static size_t find_line(const char *ptr, const char * const *lines, size_t line_count) {
609         const char * const *orig = lines;
610         const char * const *orig_e = lines+line_count;
611         
612         while (line_count > 1) {
613                 size_t middle = line_count>>1;
614                 if (ptr < lines[middle])
615                         line_count = middle;
616                 else {
617                         lines += middle;
618                         line_count -= middle;
619                 }
620         }
621         
622         //select the *last* of equivalent lines
623         while (lines+1 < orig_e && lines[0]==lines[1])
624                 lines++;
625         
626         // (don't) select the *first* of equivalent lines
627         //while (lines>orig && lines<orig_e && lines[-1]==lines[0])
628         //      lines--;
629         
630         return lines - orig;
631 }
632
633 int tok_point_lookup(struct tok_point *out, const char *ptr,
634                         const struct token_list *tl) {
635         size_t line_count = tl->olines_size;
636         
637         memset(out, 0, sizeof(*out));
638         if (!tl)
639                 return 0;
640         
641         if (ptr >= tl->txt && ptr <= tl->txt+tl->txt_size) {
642                 out->txt = ptr;
643                 out->line = find_line(ptr, tl->tlines, line_count);
644                 if (out->line < line_count) {
645                         out->col = ptr - tl->tlines[out->line];
646                         out->orig = tl->olines[out->line] + out->col;
647                 } else {
648                         out->col = 0;
649                         out->orig = tl->orig + tl->orig_size;
650                 }
651                 return 1;
652         } else if (ptr >= tl->orig && ptr <= tl->orig+tl->orig_size) {
653                 out->orig = ptr;
654                 out->line = find_line(ptr, tl->olines, line_count);
655                 if (out->line < line_count) {
656                         const char *tline_start = tl->tlines[out->line];
657                         const char *tline_end = out->line+1 < line_count ?
658                                 tl->tlines[out->line+1] :
659                                 tl->txt + tl->txt_size;
660                         
661                         out->col = ptr - tl->olines[out->line];
662                         out->txt = tline_start + out->col;
663                         
664                         if (out->txt > tline_end)
665                                 out->txt = tline_end;
666                 } else {
667                         out->col = 0;
668                         out->txt = tl->txt + tl->txt_size;
669                 }
670                 return 1;
671         } else {
672                 return 0;
673         }
674 }
675
676 static char *escape_string(darray_char *buf, const char *str, size_t size) {
677         const char *s = str, *e = s+size;
678         darray_from_lit(*buf, "");
679         
680         for (;s<e;s++) {
681                 char buffer[8];
682                 const char *esc = buffer;
683                 unsigned char c = (unsigned char)*s;
684                 if (ccontrol(c))
685                         sprintf(buffer, "\\x%02X", c);
686                 else switch(c) {
687                         case '\t': esc = "\\t"; break;
688                         case '\n': esc = "\\n"; break;
689                         case '\v': esc = "\\v"; break;
690                         case '\f': esc = "\\f"; break;
691                         case '\r': esc = "\\r"; break;
692                         case '"': esc = "\\\""; break;
693                         case '\\': esc = "\\\\"; break;
694                         default:
695                                 buffer[0] = c;
696                                 buffer[1] = 0;
697                 }
698                 darray_append_string(*buf, esc);
699         }
700         
701         return buf->item;
702 }
703
704 static int txt_orig_matches(const char *txt, size_t txt_size, const char *orig, size_t orig_size) {
705         const char *ts = txt, *te = ts+txt_size;
706         const char *os = orig, *oe = os+orig_size;
707         
708         do {
709                 const char *ob = os; //start of next backslash break
710                 const char *obe = os; //end of next backslash break
711                 size_t size; //amount of text to compare for this round
712                 
713                 while (ob<oe && *ob!='\\') ob++;
714                 obe = ob;
715                 if (obe < oe) { //there's a backslash
716                         obe++;
717                         while (obe<oe && cspace(*obe)) obe++;
718                         if (obe<oe && creturn(*obe)) { //there's a backslash-broken line
719                                 obe++;
720                                 if (obe<oe && *obe == '\n'+'\r'-obe[-1])
721                                         obe++;
722                         } else //this is just a plain old backslash
723                                 ob = obe;
724                 }
725                 
726                 size = ob-os;
727                 
728                 if (ts+size > te || memcmp(ts, os, size))
729                         return 0;
730                 ts += size;
731                 os = obe;
732         } while (ts<te);
733         
734         if (ts != te || os != oe)
735                 return 0;
736         
737         return 1;
738 }
739
740 static int is_backslash_break(const char **end, const char *s, const char *e) {
741         if (s<e && *s == '\\') {
742                 s++;
743                 while (s<e && cspace(*s)) s++;
744                 if (s<e && creturn(*s)) {
745                         s++;
746                         if (s<e && *s=='\n'+'\r'-s[-1])
747                                 s++;
748                         *end = s;
749                         return 1;
750                 }
751                 return 0;
752         }
753         return 0;
754 }
755
756 #define failed(fmt, ...) do {fprintf(err, fmt "\n", ##__VA_ARGS__); return 0; } while(0)
757
758 //tests that should pass on an untainted token list out of the tokenize() function
759 static int token_list_sanity_check_initial(const struct token_list *tl, FILE *err) {
760         struct token *first = tl->first;
761         struct token *last = tl->last;
762         struct token *i;
763         const char *txt=tl->txt, *orig=tl->orig;
764         const char *txt_e = txt+tl->txt_size, *orig_e = orig+tl->orig_size;
765         
766         if ((char*)first > (char*)last ||
767                 (size_t)((char*)last - (char*)first) % sizeof(struct token))
768                 failed("Token list pointers don't look right");
769         
770         //token list should not end with TOK_STARTLINE unless
771         //  the document is empty
772         if (last!=first && last->type==TOK_STARTLINE)
773                 return 0;
774         
775         for (i=first; i; i=i->next) {
776                 //Verify list links
777                 if (i != first && i->prev != i-1)
778                         failed("list.prev is incorrect");
779                 if (i != last && i->next != i+1)
780                         failed("list.next is incorrect");
781                 
782                 //Make sure txt segments fill the entire tl->txt
783                 if (i->txt != txt)
784                         failed("txt does not fill the token list");
785                 txt += i->txt_size;
786                 if (txt > txt_e)
787                         failed("txt is out of bounds");
788                 
789                 //Make sure orig segments fill the entire tl->orig
790                 if (i->orig != orig)
791                         failed("orig does not fill the token list");
792                 orig += i->orig_size;
793                 if (orig > orig_e)
794                         failed("orig is out of bounds");
795         }
796         
797         if (txt != txt_e)
798                 return 0;
799         if (orig != orig_e)
800                 return 0;
801         
802         return 1;
803 }
804
805 int token_list_sanity_check(const struct token_list *tl, FILE *err) {
806         struct token *first = tl->first;
807         struct token *last = tl->last;
808         struct token *i;
809         int initial = 1;
810         
811         if (tl->first == NULL || tl->last == NULL)
812                 failed("Token list is completely empty");
813         
814         if (first->type!=TOK_STARTLINE ||
815             first->txt!=tl->txt || first->txt_size!=0 ||
816             first->orig!=tl->orig || first->orig_size!=0 ||
817             first->line!=0 || first->col!=0)
818                 failed("Token list does not start with a valid TOK_STARTLINE");
819         
820         if (first->prev!=NULL || last->next!=NULL)
821                 failed("Token edge links are not NULL");
822         
823         for (i=first; i; i=i->next) {
824                 //Verify line,col
825                 if (tl->tlines[i->line] + i->col != i->txt)
826                         failed("line,col is wrong against txt");
827                 if (tl->olines[i->line] + i->col != i->orig)
828                         failed("line,col is wrong against orig");
829                 
830                 //Make sure tokens have proper sizes
831                 if (i->type!=TOK_STARTLINE && (i->txt_size==0 || i->orig_size==0 || i->txt_size > i->orig_size) )
832                         failed("Token is empty");
833                 if (i->type==TOK_STARTLINE && (i->txt_size!=0 || i->orig_size!=0) )
834                         failed("TOK_STARTLINE is non-empty");
835                 
836                 //Make sure TOK_WHITE actually contains white tokens
837                 if (i->type==TOK_WHITE) {
838                         const char *s = i->txt, *e = s+i->txt_size;
839                         while (s<e && cwhite(*s)) s++;
840                         if (s != e)
841                                 failed("TOK_WHITE does not contain only white characters");
842                 }
843                 
844                 //Make sure txt and orig match exactly except for backslash line breaks
845                 if (!txt_orig_matches(i->txt, i->txt_size, i->orig, i->orig_size)) {
846                         darray_char buf = darray_new();
847                         fprintf(err,
848                                 "txt and orig do not match:\n"
849                                 "\ttxt  = \"%s\"\n",
850                                 escape_string(&buf, i->txt, i->txt_size) );
851                         fprintf(err, "\torig = \"%s\"\n",
852                                 escape_string(&buf, i->orig, i->orig_size) );
853                         
854                         darray_free(buf);
855                         return 0;
856                 }
857                 
858                 //Make sure tok_point_lookup returns correct point
859                 {
860                         struct tok_point tok_point;
861                         const char *t=i->txt, *o=i->orig, *e=o+i->orig_size, *p;
862                         size_t line=i->line, col=i->col;
863                         
864                         #define check(ptr) do { \
865                                 if (tok_point_lookup(&tok_point, ptr, tl)) { \
866                                         if (tok_point.txt != t || tok_point.orig != o) \
867                                                 failed("tok_point_lookup on txt reported incorrect txt/orig (orig is %d, should be %d)", \
868                                                 (int)(tok_point.orig-i->orig), (int)(o-i->orig)); \
869                                         if (tok_point.line != line || tok_point.col != col) \
870                                                 failed("tok_point_lookup on txt reported incorrect line/col (off by %d, %d)", \
871                                                 (int)(tok_point.line-line), (int)(tok_point.col-col)); \
872                                 } else if (initial) {\
873                                         failed("tok_point_lookup failed on initial token list"); \
874                                 } \
875                         } while(0)
876                         
877                         for (;;) {
878                                 while (is_backslash_break(&p, o, e)) {
879                                         while (o<p) {
880                                                 check(o);
881                                                 o++;
882                                                 col++;
883                                         }
884                                         col = 0;
885                                         line++;
886                                 }
887                                 if (o >= e)
888                                         break;
889                                 do {
890                                         if (creturn(*o)) {
891                                                 p = o+1;
892                                                 if (p<e && *p=='\n'+'\r'-p[-1])
893                                                         p++;
894                                                 while (o<p) {
895                                                         check(o);
896                                                         check(t);
897                                                         t++, o++, col++;
898                                                 }
899                                                 line++;
900                                                 col = 0;
901                                         } else {
902                                                 check(o);
903                                                 check(t);
904                                                 o++, t++, col++;
905                                         }
906                                 } while (o<e && *o!='\\');
907                         }
908                         
909                         #undef check
910                 }
911         };
912         
913         //Verify olines and tlines
914         {
915                 const char *s = tl->orig, *e = s+tl->orig_size;
916                 size_t i, line_count = tl->olines_size;
917                 
918                 //both line arrays should be exactly the same size
919                 if (tl->olines_size != tl->tlines_size)
920                         return 0;
921                 
922                 for (i=0; s<e; i++) {
923                         const char *line_start = s, *line_end;
924                         size_t tline_size, oline_size;
925                         const char *p;
926                         
927                         if (i+1 < line_count)
928                                 tline_size = tl->tlines[i+1] - tl->tlines[i];
929                         else
930                                 tline_size = tl->txt+tl->txt_size - tl->tlines[i];
931                         
932                         while (s<e && !creturn(*s)) s++;
933                         line_end = s;
934                         if (s<e) {
935                                 s++;
936                                 if (s<e && *s=='\n'+'\r'-s[-1])
937                                         s++;
938                         }
939                         
940                         oline_size = s-line_start;
941                         
942                         //verify that olines elements are correct
943                         if (line_start != tl->olines[i])
944                                 return 0;
945                         
946                         //verify that tlines elements are in range
947                         p = tl->tlines[i];
948                         if (p < tl->txt || p+tline_size > tl->txt+tl->txt_size)
949                                 return 0;
950                         
951                         //verify that original lines have sizes >= the unbroken lines
952                         if (oline_size < tline_size)
953                                 return 0;
954                         
955                         //if sizes are inconsistent, make sure it is due to a backslash escape
956                         if (oline_size > tline_size) {
957                                 p = line_start+tline_size;
958                                 if (*p++ != '\\')
959                                         return 0;
960                                 while (p<e && cspace(*p)) p++;
961                                 if (p != line_end)
962                                         return 0;
963                         }
964                         
965                         //make sure the text of both copies match
966                         if ( memcmp(
967                                 tl->olines[i],
968                                 tl->tlines[i],
969                                 tline_size) )
970                                 return 0;
971                 }
972         }
973         
974         if (initial && !token_list_sanity_check_initial(tl, err))
975                 failed("Initial sanity checks failed.  Has the list been modified after it was returned from tokenize() ?");
976         
977         return 1;
978 }
979
980 #undef failed
981
982 static char *sprint_token_flags(char buf[3], struct token_flags flags) {
983         buf[0] = flags.pp ? 'p' : '-';
984         buf[1] = flags.pp_directive ? 'D' : '-';
985         buf[2] = 0;
986         return buf;
987 }
988
989 void token_list_dump(const struct token_list *tl, FILE *f) {
990         struct token *tok;
991         darray_char buf = darray_new();
992         size_t i = 0;
993         char buf2[8];
994         const char *token_type_str[] = {
995                 "TOK_INTEGER      ",
996                 "TOK_FLOATING     ",
997                 "TOK_OPERATOR     ",
998                 "TOK_KEYWORD      ",
999                 "TOK_IDENTIFIER   ",
1000                 "TOK_CHAR         ",
1001                 "TOK_STRING       ",
1002                 "TOK_LEADING_POUND",
1003                 "TOK_STRING_IQUOTE",
1004                 "TOK_STRING_IANGLE",
1005                 "TOK_CCOMMENT     ",
1006                 "TOK_CPPCOMMENT   ",
1007                 "TOK_WHITE        ",
1008                 "TOK_STARTLINE    ",
1009                 "TOK_STRAY        "
1010         };
1011         
1012         for (tok=tl->first; tok; tok=tok->next) {
1013                 fprintf(f, "%lu\t%s\t%s\t\"%s\"", (unsigned long)(i++),
1014                         token_type_str[tok->type],
1015                         sprint_token_flags(buf2, tok->flags),
1016                         escape_string(&buf, tok->txt, tok->txt_size));
1017                 #if 1 //print tok->orig
1018                 fprintf(f, "\t\"%s\"\n", escape_string(&buf, tok->orig, tok->orig_size));
1019                 #else
1020                 fprintf(f, "\n");
1021                 #endif
1022         }
1023         
1024         darray_free(buf);
1025 }
1026
1027 void tok_message_print(struct tok_message *m, struct token_list *tl) {
1028         struct tok_point pt;
1029         int resolved = tok_point_lookup(&pt, m->location, tl);
1030         
1031         if (tl->filename) {
1032                 printf("%s:%s", tl->filename, resolved ? "" : " ");
1033         }
1034         
1035         if (resolved) {
1036                 printf("%zu:%zu %s: %s\n",
1037                         pt.line+1, pt.col+1,
1038                         m->level==TM_DEBUG ? "debug" :
1039                         m->level==TM_INFO ? "info" :
1040                         m->level==TM_WARN ? "warning" :
1041                         m->level==TM_ERROR ? "error" :
1042                         m->level==TM_BUG ? "BUG" :
1043                         "???",
1044                         m->message);
1045         } else {
1046                 printf("%s: %s\n",
1047                         m->level==TM_DEBUG ? "debug" :
1048                         m->level==TM_INFO ? "info" :
1049                         m->level==TM_WARN ? "warning" :
1050                         m->level==TM_ERROR ? "error" :
1051                         m->level==TM_BUG ? "BUG" :
1052                         "???",
1053                         m->message);
1054         }
1055 }
1056
1057 void tok_message_dump(struct tok_message *m) {
1058         printf("%s: %s: %s\n",
1059                 m->level==TM_DEBUG ? "debug" :
1060                 m->level==TM_INFO ? "info" :
1061                 m->level==TM_WARN ? "warning" :
1062                 m->level==TM_ERROR ? "error" :
1063                 m->level==TM_BUG ? "BUG" :
1064                 "???", m->path, m->message);
1065 }
1066
1067 void tok_message_add(tok_message_queue *mq, enum tok_message_level level,
1068         const char *path, const char *loc, const char *fmt, ...) {
1069         struct tok_message msg = {.level=level, .path=path, .location=loc};
1070         va_list ap;
1071         
1072         if (!mq)
1073                 return;
1074         
1075         va_start(ap, fmt);
1076         msg.message = talloc_vasprintf(mq->item, fmt, ap);
1077         va_end(ap);
1078         
1079         enqueue(*mq, msg);
1080 }
1081
1082 void tok_message_queue_dump(const tok_message_queue *mq) {
1083         size_t i;
1084         for (i=0; i<queue_count(*mq); i++)
1085                 tok_message_dump(&queue_item(*mq, i));
1086 }
1087
1088
1089 #undef add
1090 #undef cstray
1091 #undef cident