]> git.ozlabs.org Git - ccan/blob - ccan/ccan_tokenizer/ccan_tokenizer.c
Fix tests: path change because they're now run under ccanlint (from their dir)
[ccan] / ccan / ccan_tokenizer / ccan_tokenizer.c
1 /*
2         Copyright (c) 2009  Joseph A. Adams
3         All rights reserved.
4         
5         Redistribution and use in source and binary forms, with or without
6         modification, are permitted provided that the following conditions
7         are met:
8         1. Redistributions of source code must retain the above copyright
9            notice, this list of conditions and the following disclaimer.
10         2. Redistributions in binary form must reproduce the above copyright
11            notice, this list of conditions and the following disclaimer in the
12            documentation and/or other materials provided with the distribution.
13         3. The name of the author may not be used to endorse or promote products
14            derived from this software without specific prior written permission.
15         
16         THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17         IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18         OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19         IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20         INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21         NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22         DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23         THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24         (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25         THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "ccan_tokenizer.h"
29
30 #include <ccan/talloc/talloc.h>
31
32 #include <assert.h>
33
//Shown by operator precedence; based on
// http://tigcc.ticalc.org/doc/opers.html#precedence .
//
//Master token dictionary: operators (grouped by precedence), C keywords,
//and preprocessor directive keywords.  dict_build() turns this into the
//longest-match lookup structure used by tokenize().

static struct dict_entry c_dictionary[] = {
//1. Highest
	{'(',"("}, {')',")"},
	{'[',"["}, {']',"]"},
	{'{',"{"}, {'}',"}"},
	{'.',"."},
	{PTR_OP,"->"},
	
//2. Unary
	{'!',"!"}, {'~',"~"}, //prefix
	{INC_OP,"++"}, {DEC_OP,"--"}, //prefix or postfix
	// + - & *
	
//3. Multiplicative
	// *
	{'/',"/"}, {'%',"%"},
	
//4. Additive
	// + -
	
//5. Shift
	{LEFT_OP,"<<"}, {RIGHT_OP,">>"},
	
//6. Relational
	{'<',"<"}, {'>',">"},
	{LE_OP,"<="}, {GE_OP,">="},
	
//7. Equality
	{EQ_OP,"=="}, {NE_OP,"!="},
	
//8. Bitwise AND
	// &
//9. Bitwise XOR
	{'^',"^"},
//10. Bitwise OR
	{'|',"|"},

//11. Logical AND
	{AND_OP,"&&"},
//12. Logical OR
	{OR_OP,"||"},

//13. Conditional
	{'?',"?"}, {':',":"},

//14. Assignment
	{'=',"="},
	{MUL_ASSIGN,"*="}, {DIV_ASSIGN,"/="}, {MOD_ASSIGN,"%="},
	{ADD_ASSIGN,"+="}, {SUB_ASSIGN,"-="},
	{AND_ASSIGN,"&="}, {XOR_ASSIGN,"^="}, {OR_ASSIGN,"|="},
	{LEFT_ASSIGN,"<<="}, {RIGHT_ASSIGN,">>="},
	
//15. Comma
	{',',","},

//16. Semicolon
	{';',";"},
	
//Misc
	{ELLIPSIS,"..."},
	{'#',"#"},
	{DOUBLE_POUND,"##"},

//Ambiguous
	//unary or binary
	{'+',"+"}, {'-',"-"},
	{'&',"&"}, {'*',"*"},

//Keywords
	{_BOOL, "_Bool"},
	{_COMPLEX, "_Complex"},
	{_IMAGINARY, "_Imaginary"},
	{BREAK, "break"},
	{CASE, "case"},
	{CHAR, "char"},
	{CONST, "const"},
	{CONTINUE, "continue"},
	{DEFAULT, "default"},
	{DO, "do"},
	{DOUBLE, "double"},
	{ELSE, "else"},
	{ENUM, "enum"},
	{EXTERN, "extern"},
	{FLOAT, "float"},
	{FOR, "for"},
	{GOTO, "goto"},
	{IF, "if"},
	{INLINE, "inline"},
	{INT, "int"},
	{LONG, "long"},
	{REGISTER, "register"},
	{RESTRICT, "restrict"},
	{RETURN, "return"},
	{SHORT, "short"},
	{SIGNED, "signed"},
	{SIZEOF, "sizeof"},
	{STATIC, "static"},
	{STRUCT, "struct"},
	{SWITCH, "switch"},
	{TYPEDEF, "typedef"},
	{UNION, "union"},
	{UNSIGNED, "unsigned"},
	{VOID, "void"},
	{VOLATILE, "volatile"},
	{WHILE, "while"},

//Preprocessor keywords (except those already defined)
	{VA_ARGS, "__VA_ARGS__"},
	{DEFINE, "define"},
	{ELIF, "elif"},
//	{ELSE, "else"},
	{ENDIF, "endif"},
	{ERROR, "error"},
//	{IF, "if"},
	{IFDEF, "ifdef"},
	{IFNDEF, "ifndef"},
	{INCLUDE, "include"},
	{LINE, "line"},
	{PRAGMA, "pragma"},
	{UNDEF, "undef"},
	{WARNING, "warning"},
};
159
#if 0

//Dead code kept for reference: an earlier per-instance tokenizer object.
//The live code instead uses the lazily-built global tokenizer_dict below.
struct tokenizer *tokenizer_new(void *ctx) {
	struct tokenizer *t = talloc(ctx, struct tokenizer);
	t->ctx = ctx;
	queue_init(t->mq, t);
	t->dict = dict_build(t, c_dictionary, sizeof(c_dictionary)/sizeof(*c_dictionary));
	
	return t;
}

#endif
172
173 #define MESSAGE_PATH "tokenize/"
174
//Build tl->txt, a copy of tl->orig with backslash-newline line breaks
//removed, and record per-line start pointers for both buffers:
//  tl->olines[n] = start of line n in the original text
//  tl->tlines[n] = start of line n in the unbroken text
//Warns through mq when spaces follow a line-continuation backslash.
static void unbreak_backslash_broken_lines(struct token_list *tl, tok_message_queue *mq) {
	const char *s = tl->orig, *e = s+tl->orig_size;
	array_char txt = array_new(tl);
	array(const char*) olines = array_new(tl);
	array(const char*) tlines = array_new(tl);
	
	do {
		const char *line_start = s, *line_end;
		const char *lnw; //last non-white
		size_t start_offset = txt.size;
		
		//scan to the next line and find the last non-white character in the line
		while (s<e && !creturn(*s)) s++;
		line_end = s;
		lnw = s;
		while (lnw>line_start && cspace(lnw[-1])) lnw--;
		if (s<e && creturn(*s)) {
			s++;
			//check for non-standard newlines (i.e. "\r", "\r\n", or "\n\r")
			if (s<e && *s=='\n'+'\r'-s[-1])
				s++;
		}
		
		//add the backslash-break-free version of the text
		if (lnw>line_start && lnw[-1]=='\\' && line_end<e) {
			//drop the backslash and everything after it (spaces + newline)
			array_append_items(txt, line_start, lnw-1-line_start);
			if (lnw<e && cspace(*lnw)) {
				tok_msg_warn(spaces_after_backslash_break, lnw,
					"Trailing spaces after backslash-broken line");
			}
		} else
			array_append_items(txt, line_start, s-line_start);
		
		//add the line starts for this line
		array_append(olines, line_start);
		array_append(tlines, (const char*)start_offset);
			//Since the txt buffer moves when expanded, we're storing offsets
			//  for now.  Once we're done building txt, we can add the base
			//  of it to all the offsets to make them pointers.
	} while (s<e);
	
	//stick a null terminator at the end of the text
	array_realloc(txt, txt.size+1);
	txt.item[txt.size] = 0;
	
	//convert the line start offsets to pointers
	array_for(i, tlines, *i = txt.item + (size_t)*i);
	
	tl->olines = olines.item;
	tl->olines_size = olines.size;
	tl->txt = txt.item;
	tl->txt_size = txt.size;
	tl->tlines = tlines.item;
	tl->tlines_size = tlines.size;
}
230
231 static void normal_keyword(struct token *tok) {
232         if (tok->type==TOK_KEYWORD &&
233                         (opkw_is_directive_only(tok->opkw) || tok->opkw==VA_ARGS))
234                 tok->type = TOK_IDENTIFIER;
235 }
236
237 static int define_parmlist_has_ellipsis(struct token *start, struct token *end) {
238         while (end>start && token_is_ignored(end-1)) end--;
239         return (end-->start && end->type==TOK_OPERATOR && end->opkw==ELLIPSIS);
240 }
241
242 //Used to label __VA_ARGS__ as keywords within applicable macro expansions
243 //Start should follow the DEFINE directive keyword
244 static void this_is_a_define(struct token *start, struct token *end) {
245         struct token *i = start, *pl_start;
246         
247         //skip past the identifier that is defined
248         while (i<end && token_is_ignored(i)) i++;
249         if (i >= end)
250                 return;
251          //TODO:  check i->type to make sure it's an identifier, throw error otherwise
252         normal_keyword(i++);
253         
254         //see if this is actually a variadic macro
255         if (!(i<end && i->type==TOK_OPERATOR && i->opkw=='('))
256                 goto not_va_args;
257         pl_start = ++i;
258         while (i<end && !(i->type==TOK_OPERATOR && i->opkw==')'))
259                 normal_keyword(i++);
260         if (!define_parmlist_has_ellipsis(pl_start, i++))
261                 goto not_va_args;
262         
263         //We have arrived at the macro expansion and know there is a ... argument
264         //Thus, we'll only change directive-only keywords to identifiers
265         for(; i<end; i++) {
266                 if (i->type==TOK_KEYWORD && opkw_is_directive_only(i->opkw))
267                         i->type = TOK_IDENTIFIER;
268         }
269         
270 not_va_args:
271         while (i < end)
272                 normal_keyword(i++);
273 }
274
275 //fill the flags field of each token and untangle keywords and such
276 static void finalize_line(struct token *start, struct token *end) {
277         struct token *i = start, *j;
278         
279         assert(start<end && start->type==TOK_STARTLINE);
280         i++;
281         
282         while (i<end && token_is_ignored(i)) i++;
283         
284         if (i<end && i->type==TOK_OPERATOR && i->opkw=='#') {
285         //preprocessor line
286                 i->type = TOK_LEADING_POUND;
287                 
288                 //set pp on all tokens in this line
289                 for (j=start; j<end; j++)
290                         j->flags.pp = 1;
291                 
292                 //find the relevant token after the '#'
293                 for (i++; i<end; i++) {
294                         if (!token_is_ignored(i)) {
295                                 i->flags.pp_directive = 1;
296                                 if (i->type==TOK_KEYWORD && !opkw_is_directive(i->opkw))
297                                         i->type = TOK_IDENTIFIER;
298                                 //TODO:  Handle invalid preprocessor directives (e.g. #+ )
299                                 
300                                 if (i->type==TOK_KEYWORD && i->opkw==DEFINE) {
301                                         for (j=i+1; j<end; j++)
302                                         this_is_a_define(i+1, end);
303                                 } else {
304                                         while (++i < end)
305                                                 normal_keyword(i);
306                                 }
307                                 break;
308                         }
309                 }
310         } else {
311         //normal line
312                 while (i < end)
313                         normal_keyword(i++);
314         }
315 }
316
//fill the list, flags, line, col, orig, and orig_size fields of each token
//convert identifiers mistaken for preprocessor keywords (e.g. ifdef) to identifiers
//
//Walks the token array once, wiring prev/next links and mapping each
//token's txt pointer back to a (line, col) pair and a position in the
//original (backslash-broken) text, then runs finalize_line() on each
//complete line as it is passed.
static void finalize(struct token_list *tl, struct token *start, struct token *end) {
	const char * const *lss = tl->tlines;
	const char * const *lse = lss + tl->tlines_size;
	struct token *i;
	struct token *startline = NULL;
	
	assert(start < end);
	
	tl->first = start;
	tl->last = end-1;
	
	for (i=start; ; i++) {
		//perform a second pass on each line
		if (i >= end || i->type == TOK_STARTLINE) {
			if (startline)
				finalize_line(startline, i);
			startline = i;
		}
		
		if (i >= end) {
			//the last token's orig extends to the end of the original buffer
			end[-1].orig_size = tl->orig+tl->orig_size - end[-1].orig;
			break;
		}
		
		//set up the list links
		i->prev = i>start ? i-1 : NULL;
		i->next = i+1<end ? i+1 : NULL;
		
		//if i->txt starts on a later line, advance to it
		while (lss+1<lse && i->txt >= lss[1] && i->txt > lss[0])
			lss++;
		
		//set up line, col, orig, and orig_size
		i->line = lss - tl->tlines;
		i->col = i->txt - *lss;
		i->orig = tl->olines[i->line] + i->col;
		if (i > start)
			i[-1].orig_size = i->orig - i[-1].orig;
		
		assert(i->line < tl->olines_size);
		
		//clear the flags (finalize_line sets them afterward)
		memset(&i->flags, 0, sizeof(i->flags));
	}
}
364
//Append a token covering [orig, s) to the local `array` in tokenize();
//extra designated initializers (e.g. .type, union members) may be passed in.
#define add(...) do { \
		struct token tok = {__VA_ARGS__}; \
		tok.txt = orig; \
		tok.txt_size = s-orig; \
		array_append(array, tok); \
	} while (0)

//Characters that never belong in C source outside strings/comments
#define cstray(c) (ccontrol(c) || cextended(c) || (c)=='@' || (c)=='`' || (c)=='\\')
//Characters valid within an identifier
#define cident(c) (cletter(c) || cdigit(c) || c=='_' || c=='$')
	//believe it or not, $ is a valid character in an identifier
375
//Shared operator/keyword dictionary, built lazily on the first call to
//tokenize() and released at process exit via atexit().
struct dict *tokenizer_dict = NULL;

//atexit handler: free the lazily-built dictionary.
static void free_tokenizer_dict(void) {
	talloc_free(tokenizer_dict);
}
381
//Tokenize orig[0..orig_size) into a token_list (allocated as a talloc
//child of orig, so it is freed with the input buffer).  Diagnostics are
//appended to mq.  The returned list always begins with a TOK_STARTLINE.
struct token_list *tokenize(const char *orig, size_t orig_size,
				tok_message_queue *mq) {
	struct token_list *tl = talloc(orig, struct token_list);
	const char *s, *e;
	size_t stray_count=0, cr_count=0;
	array(struct token) array = array_new(tl);
	int only_pound_include = 0;
	
	//build the shared dictionary on first use
	if (!tokenizer_dict) {
		tokenizer_dict = dict_build(NULL, c_dictionary,
			sizeof(c_dictionary)/sizeof(*c_dictionary));
		atexit(free_tokenizer_dict);
	}
	
	tl->orig = orig;
	tl->orig_size = orig_size;
	unbreak_backslash_broken_lines(tl, mq);
	tl->filename = NULL;
	
	//scan the backslash-unbroken text, not the raw original
	s = tl->txt;
	e = s + tl->txt_size;
	
	array_appends(array, {
		.type = TOK_STARTLINE,
		.txt = s,
		.txt_size = 0
	} );
	
	while (s<e) {
		const char *orig = s; //NOTE: intentionally shadows the parameter;
		                      //  the add() macro reads this local `orig`
		char c = *s++;
		int added_something = 1;
		
		if (cstray(c)) {
			stray_count++;
			while (s<e && cstray(*s)) {
				s++;
				stray_count++;
			}
			add(.type = TOK_STRAY);
			
			/* This has the potential to be very noisy on binary
			   files, but it really is quite useful. */
			tok_msg_error(stray_segment, orig,
				"%zu stray characters", s-orig);
		
		} else if (creturn(c)) {
			//check for non-standard newlines (i.e. "\r", "\r\n", or "\n\r")
			if (s<e && *s=='\n'+'\r'-c) {
				s++;
				cr_count++;
			} else if (c=='\r')
				cr_count++;
			
			add(.type = TOK_WHITE);
			orig = s;
			
			//add a TOK_STARTLINE for the next line unless this is the end of the document
			if (s<e)
				add(.type = TOK_STARTLINE);
			
			only_pound_include = 0;
		
		} else if (cspace(c)) {
			//skip over the remaining whitespace
			while (s<e && cspace(*s)) s++;
			add(.type = TOK_WHITE);
			added_something = 0;
		
		} else if (cdigit(c) || (c=='.' && s<e && cdigit(*s))) {
			//numeric constant (including ".5"-style floats)
			struct token tok;
			s = read_cnumber(&tok, s-1, e, mq);
			tok.txt = orig;
			tok.txt_size = s-orig;
			array_append(array, tok);
			
		} else if (csymbol(c) || cident(c)) {
			if (only_pound_include && (c=='"' || c=='<')) { //include string
				char *include;
				char end = c=='"' ? '"' : '>';
				short type = c=='"' ? TOK_STRING_IQUOTE : TOK_STRING_IANGLE;
				
				while (s<e && !creturn(*s) && *s!=end) s++;
				include = talloc_strndup(tl, orig+1, s-(orig+1));
				
				if (s<e && *s==end) {
					s++;
				} else {
					tok_msg_error(include_missing_terminator, orig,
						"Missing terminating %c character", end);
				}
				
				add(.type = type,
					{.include = include});
				
			} else if (c=='\'' || c=='\"') { //character or string literal
				array_char string = array_new(tl);
				s = read_cstring(&string, s, e, c, mq);
				if (s<e) s++; //advance past endquote (if available)
				add(.type = c=='\'' ? TOK_CHAR : TOK_STRING,
				    {.string = string});
				
				if (c=='\'' && string.size==0) {
					tok_msg_error(empty_char_constant, orig,
						"Empty character constant");
				}
				
			} else if (c=='/' && s<e && (*s=='*' || *s=='/')) { //comment
				if (*s++ == '*') { /* C-style comment */
					const char *comment_start = s-2;
					for (;;s++) {
						if (s+1 >= e) {
							s = e;
							tok_msg_error(unterminated_comment, comment_start,
								"Unterminated comment");
							break;
						}
						if (s[0]=='*' && s[1]=='/') {
							s += 2;
							break;
						}
					}
					add(.type = TOK_CCOMMENT);
				} else { // C++-style comment
					while (s<e && !creturn(*s)) s++;
					add(.type = TOK_CPPCOMMENT);
				}
				added_something = 0;
			
			} else { //operator, keyword, or identifier
				struct dict_entry *ent;
				const char *ident_e = --s;
				while (ident_e<e && cident(*ident_e) ) ident_e++;
				
				ent = dict_lookup(tokenizer_dict, &s, e);
				if (cident(c)) { //keyword or identifier
					//only a keyword if the dictionary match spans
					//the whole identifier (e.g. "form" is not FOR)
					if (ent && s==ident_e) {
						add(.type = TOK_KEYWORD,
							{.opkw = ent->id});
						if (ent->id == INCLUDE) {
							//hacky way to lex #include string properly
							//  (walk back to confirm '#' include at line start)
							struct token *ts = array.item;
							struct token *tp = ts+array.size-1;
							while (tp>ts && token_is_ignored(tp-1))
								tp--;
							if (tp>ts && token_is_op(tp-1, '#')) {
								tp--;
								while (tp>ts && token_is_ignored(tp-1))
									tp--;
								if (tp>ts && tp[-1].type==TOK_STARTLINE) {
									only_pound_include = 1;
									continue;
								}
							}
						}
					} else {
						s = ident_e;
						add(.type = TOK_IDENTIFIER);
					}
				} else if (ent) { //operator
					add(.type = TOK_OPERATOR,
					    {.opkw = ent->id});
				} else { //invalid symbol (shouldn't happen)
					tok_msg_bug(unrecognized_symbol, s,
						"Unrecognized symbol \'%c\'", c);
					s++;
					add(.type = TOK_STRAY);
				}
			}
		}
		
		//anything substantive after "#include" cancels include-string mode
		if (added_something)
			only_pound_include = 0;
	}
	
	/*if (stray_count) {
		tok_msg_error(stray_characters, NULL,
			"%lu stray characters in text", (unsigned long)stray_count);
	}*/
	if (cr_count) {
		tok_msg_warn(nonstandard_newlines, NULL,
			"Text contains non-standard line terminators");
	}
	
	finalize(tl, array.item, array.item+array.size);
	
	return tl;
}
570
571 size_t token_list_count(const struct token_list *tl) {
572         size_t ret = 0;
573         const struct token *i;
574         
575         for (i=tl->first; i; i=i->next)
576                 ret++;
577         
578         return ret;
579 }
580
//Binary-search for the line containing ptr: returns the index of the
//last entry in lines[0..line_count) whose start is <= ptr (choosing the
//last of any run of identical line starts).  Returns 0 when empty.
static size_t find_line(const char *ptr, const char * const *lines, size_t line_count) {
	size_t lo = 0;
	size_t span = line_count;
	
	while (span > 1) {
		size_t half = span >> 1;
		if (ptr < lines[lo + half]) {
			span = half;
		} else {
			lo += half;
			span -= half;
		}
	}
	
	//select the *last* of equivalent lines
	while (lo + 1 < line_count && lines[lo] == lines[lo + 1])
		lo++;
	
	// (don't) select the *first* of equivalent lines
	//while (lo > 0 && lo < line_count && lines[lo - 1] == lines[lo])
	//	lo--;
	
	return lo;
}
605
606 int tok_point_lookup(struct tok_point *out, const char *ptr,
607                         const struct token_list *tl) {
608         size_t line_count = tl->olines_size;
609         
610         memset(out, 0, sizeof(*out));
611         if (!tl)
612                 return 0;
613         
614         if (ptr >= tl->txt && ptr <= tl->txt+tl->txt_size) {
615                 out->txt = ptr;
616                 out->line = find_line(ptr, tl->tlines, line_count);
617                 if (out->line < line_count) {
618                         out->col = ptr - tl->tlines[out->line];
619                         out->orig = tl->olines[out->line] + out->col;
620                 } else {
621                         out->col = 0;
622                         out->orig = tl->orig + tl->orig_size;
623                 }
624                 return 1;
625         } else if (ptr >= tl->orig && ptr <= tl->orig+tl->orig_size) {
626                 out->orig = ptr;
627                 out->line = find_line(ptr, tl->olines, line_count);
628                 if (out->line < line_count) {
629                         const char *tline_start = tl->tlines[out->line];
630                         const char *tline_end = out->line+1 < line_count ?
631                                 tl->tlines[out->line+1] :
632                                 tl->txt + tl->txt_size;
633                         
634                         out->col = ptr - tl->olines[out->line];
635                         out->txt = tline_start + out->col;
636                         
637                         if (out->txt > tline_end)
638                                 out->txt = tline_end;
639                 } else {
640                         out->col = 0;
641                         out->txt = tl->txt + tl->txt_size;
642                 }
643                 return 1;
644         } else {
645                 return 0;
646         }
647 }
648
649 static char *escape_string(array_char *buf, const char *str, size_t size) {
650         const char *s = str, *e = s+size;
651         array_from_lit(*buf, "");
652         
653         for (;s<e;s++) {
654                 char buffer[8];
655                 const char *esc = buffer;
656                 unsigned char c = (unsigned char)*s;
657                 if (ccontrol(c))
658                         sprintf(buffer, "\\x%02X", c);
659                 else switch(c) {
660                         case '\t': esc = "\\t"; break;
661                         case '\n': esc = "\\n"; break;
662                         case '\v': esc = "\\v"; break;
663                         case '\f': esc = "\\f"; break;
664                         case '\r': esc = "\\r"; break;
665                         case '"': esc = "\\\""; break;
666                         case '\\': esc = "\\\\"; break;
667                         default:
668                                 buffer[0] = c;
669                                 buffer[1] = 0;
670                 }
671                 array_append_string(*buf, esc);
672         }
673         
674         return buf->item;
675 }
676
//Verify that txt[0..txt_size) equals orig[0..orig_size) with all
//backslash-newline breaks removed -- i.e. that txt really is the
//unbroken form of orig.  Returns 1 on match, 0 otherwise.
static int txt_orig_matches(const char *txt, size_t txt_size, const char *orig, size_t orig_size) {
	const char *ts = txt, *te = ts+txt_size;
	const char *os = orig, *oe = os+orig_size;
	
	do {
		const char *ob = os; //start of next backslash break
		const char *obe = os; //end of next backslash break
		size_t size; //amount of text to compare for this round
		
		while (ob<oe && *ob!='\\') ob++;
		obe = ob;
		if (obe < oe) { //there's a backslash
			obe++;
			while (obe<oe && cspace(*obe)) obe++;
			if (obe<oe && creturn(*obe)) { //there's a backslash-broken line
				obe++;
				//consume the second character of a two-character newline
				if (obe<oe && *obe == '\n'+'\r'-obe[-1])
					obe++;
			} else //this is just a plain old backslash
				ob = obe;
		}
		
		size = ob-os;
		
		//compare the segment before the break, then skip past the break
		if (ts+size > te || memcmp(ts, os, size))
			return 0;
		ts += size;
		os = obe;
	} while (ts<te);
	
	if (ts != te || os != oe)
		return 0;
	
	return 1;
}
712
//Does s point at a backslash line-continuation (a backslash, optional
//trailing spaces, then a newline)?  If so, set *end just past the break
//and return 1; otherwise return 0 and leave *end untouched.
static int is_backslash_break(const char **end, const char *s, const char *e) {
	if (s >= e || *s != '\\')
		return 0;
	s++;
	while (s < e && cspace(*s))
		s++;
	if (s >= e || !creturn(*s))
		return 0;
	s++;
	//consume the second character of a two-character newline ("\r\n"/"\n\r")
	if (s < e && *s == '\n'+'\r'-s[-1])
		s++;
	*end = s;
	return 1;
}
728
729 #define failed(fmt, ...) do {fprintf(err, fmt "\n", ##__VA_ARGS__); return 0; } while(0)
730
731 //tests that should pass on an untainted token list out of the tokenize() function
732 static int token_list_sanity_check_initial(const struct token_list *tl, FILE *err) {
733         struct token *first = tl->first;
734         struct token *last = tl->last;
735         struct token *i;
736         const char *txt=tl->txt, *orig=tl->orig;
737         const char *txt_e = txt+tl->txt_size, *orig_e = orig+tl->orig_size;
738         
739         if ((char*)first > (char*)last ||
740                 (size_t)((char*)last - (char*)first) % sizeof(struct token))
741                 failed("Token list pointers don't look right");
742         
743         //token list should not end with TOK_STARTLINE unless
744         //  the document is empty
745         if (last!=first && last->type==TOK_STARTLINE)
746                 return 0;
747         
748         for (i=first; i; i=i->next) {
749                 //Verify list links
750                 if (i != first && i->prev != i-1)
751                         failed("list.prev is incorrect");
752                 if (i != last && i->next != i+1)
753                         failed("list.next is incorrect");
754                 
755                 //Make sure txt segments fill the entire tl->txt
756                 if (i->txt != txt)
757                         failed("txt does not fill the token list");
758                 txt += i->txt_size;
759                 if (txt > txt_e)
760                         failed("txt is out of bounds");
761                 
762                 //Make sure orig segments fill the entire tl->orig
763                 if (i->orig != orig)
764                         failed("orig does not fill the token list");
765                 orig += i->orig_size;
766                 if (orig > orig_e)
767                         failed("orig is out of bounds");
768         }
769         
770         if (txt != txt_e)
771                 return 0;
772         if (orig != orig_e)
773                 return 0;
774         
775         return 1;
776 }
777
//Exhaustively validates the invariants of a token list.  Checks the leading
//  TOK_STARTLINE sentinel, the list's edge links, per-token line/col
//  coordinates, token sizes, TOK_WHITE content, txt/orig agreement (modulo
//  backslash-broken lines), tok_point_lookup results for every byte of
//  every token, and the consistency of the olines/tlines arrays.
//Returns 1 if everything checks out, 0 otherwise (diagnostics go to err).
//NOTE(review): `initial` is set to 1 and never cleared here, so every list
//  is treated as freshly tokenized — confirm this is intended.
int token_list_sanity_check(const struct token_list *tl, FILE *err) {
	struct token *first = tl->first;
	struct token *last = tl->last;
	struct token *i;
	int initial = 1;
	
	if (tl->first == NULL || tl->last == NULL)
		failed("Token list is completely empty");
	
	//the list must open with an empty TOK_STARTLINE at line 0, col 0
	if (first->type!=TOK_STARTLINE ||
	    first->txt!=tl->txt || first->txt_size!=0 ||
	    first->orig!=tl->orig || first->orig_size!=0 ||
	    first->line!=0 || first->col!=0)
		failed("Token list does not start with a valid TOK_STARTLINE");
	
	if (first->prev!=NULL || last->next!=NULL)
		failed("Token edge links are not NULL");
	
	for (i=first; i; i=i->next) {
		//Verify line,col against both the tlines and olines arrays
		if (tl->tlines[i->line] + i->col != i->txt)
			failed("line,col is wrong against txt");
		if (tl->olines[i->line] + i->col != i->orig)
			failed("line,col is wrong against orig");
		
		//Make sure tokens have proper sizes (orig can only be longer
		//  than txt, never shorter)
		if (i->type!=TOK_STARTLINE && (i->txt_size==0 || i->orig_size==0 || i->txt_size > i->orig_size) )
			failed("Token is empty");
		if (i->type==TOK_STARTLINE && (i->txt_size!=0 || i->orig_size!=0) )
			failed("TOK_STARTLINE is non-empty");
		
		//Make sure TOK_WHITE actually contains white tokens
		if (i->type==TOK_WHITE) {
			const char *s = i->txt, *e = s+i->txt_size;
			while (s<e && cwhite(*s)) s++;
			if (s != e)
				failed("TOK_WHITE does not contain only white characters");
		}
		
		//Make sure txt and orig match exactly except for backslash line breaks
		if (!txt_orig_matches(i->txt, i->txt_size, i->orig, i->orig_size)) {
			array_char buf = array_new(NULL);
			fprintf(err,
				"txt and orig do not match:\n"
				"\ttxt  = \"%s\"\n",
				escape_string(&buf, i->txt, i->txt_size) );
			fprintf(err, "\torig = \"%s\"\n",
				escape_string(&buf, i->orig, i->orig_size) );
			
			array_free(buf);
			return 0;
		}
		
		//Make sure tok_point_lookup returns correct point for every
		//  byte of this token's orig (and txt) range
		{
			struct tok_point tok_point;
			const char *t=i->txt, *o=i->orig, *e=o+i->orig_size, *p;
			size_t line=i->line, col=i->col;
			
			//check(ptr): look up ptr and verify the reported
			//  txt/orig pointers and line/col against our own
			//  walk; a lookup failure is only fatal for an
			//  initial (untainted) list
			#define check(ptr) do { \
				if (tok_point_lookup(&tok_point, ptr, tl)) { \
					if (tok_point.txt != t || tok_point.orig != o) \
						failed("tok_point_lookup on txt reported incorrect txt/orig (orig is %d, should be %d)", \
						(int)(tok_point.orig-i->orig), (int)(o-i->orig)); \
					if (tok_point.line != line || tok_point.col != col) \
						failed("tok_point_lookup on txt reported incorrect line/col (off by %d, %d)", \
						(int)(tok_point.line-line), (int)(tok_point.col-col)); \
				} else if (initial) {\
					failed("tok_point_lookup failed on initial token list"); \
				} \
			} while(0)
			
			for (;;) {
				//backslash-broken lines exist only in orig,
				//  so advance o (not t) across them
				while (is_backslash_break(&p, o, e)) {
					while (o<p) {
						check(o);
						o++;
						col++;
					}
					col = 0;
					line++;
				}
				if (o >= e)
					break;
				do {
					if (creturn(*o)) {
						//'\n'+'\r'-p[-1] is the opposite return
						//  character: consume CR/LF (or LF/CR)
						//  pairs as one line break
						p = o+1;
						if (p<e && *p=='\n'+'\r'-p[-1])
							p++;
						while (o<p) {
							check(o);
							check(t);
							t++, o++, col++;
						}
						line++;
						col = 0;
					} else {
						check(o);
						check(t);
						o++, t++, col++;
					}
				} while (o<e && *o!='\\');
			}
			
			#undef check
		}
	};
	
	//Verify olines and tlines
	{
		const char *s = tl->orig, *e = s+tl->orig_size;
		size_t i, line_count = tl->olines_size;
		
		//both line arrays should be exactly the same size
		if (tl->olines_size != tl->tlines_size)
			return 0;
		
		for (i=0; s<e; i++) {
			const char *line_start = s, *line_end;
			size_t tline_size, oline_size;
			const char *p;
			
			//size of unbroken line i: distance to the next tline,
			//  or to the end of txt for the last line
			if (i+1 < line_count)
				tline_size = tl->tlines[i+1] - tl->tlines[i];
			else
				tline_size = tl->txt+tl->txt_size - tl->tlines[i];
			
			//scan to the end of the original line, consuming a
			//  CR/LF (or LF/CR) pair as a single break
			while (s<e && !creturn(*s)) s++;
			line_end = s;
			if (s<e) {
				s++;
				if (s<e && *s=='\n'+'\r'-s[-1])
					s++;
			}
			
			oline_size = s-line_start;
			
			//verify that olines elements are correct
			if (line_start != tl->olines[i])
				return 0;
			
			//verify that tlines elements are in range
			p = tl->tlines[i];
			if (p < tl->txt || p+tline_size > tl->txt+tl->txt_size)
				return 0;
			
			//verify that original lines have sizes >= the unbroken lines
			if (oline_size < tline_size)
				return 0;
			
			//if sizes are inconsistent, make sure it is due to a backslash escape
			if (oline_size > tline_size) {
				p = line_start+tline_size;
				if (*p++ != '\\')
					return 0;
				while (p<e && cspace(*p)) p++;
				if (p != line_end)
					return 0;
			}
			
			//make sure the text of both copies match
			if ( memcmp(
				tl->olines[i],
				tl->tlines[i],
				tline_size) )
				return 0;
		}
	}
	
	if (initial && !token_list_sanity_check_initial(tl, err))
		failed("Initial sanity checks failed.  Has the list been modified after it was returned from tokenize() ?");
	
	return 1;
}
952
953 #undef failed
954
955 static char *sprint_token_flags(char buf[3], struct token_flags flags) {
956         buf[0] = flags.pp ? 'p' : '-';
957         buf[1] = flags.pp_directive ? 'D' : '-';
958         buf[2] = 0;
959         return buf;
960 }
961
962 void token_list_dump(const struct token_list *tl, FILE *f) {
963         struct token *tok;
964         array_char buf = array_new(NULL);
965         size_t i = 0;
966         char buf2[8];
967         const char *token_type_str[] = {
968                 "TOK_INTEGER      ",
969                 "TOK_FLOATING     ",
970                 "TOK_OPERATOR     ",
971                 "TOK_KEYWORD      ",
972                 "TOK_IDENTIFIER   ",
973                 "TOK_CHAR         ",
974                 "TOK_STRING       ",
975                 "TOK_LEADING_POUND",
976                 "TOK_STRING_IQUOTE",
977                 "TOK_STRING_IANGLE",
978                 "TOK_CCOMMENT     ",
979                 "TOK_CPPCOMMENT   ",
980                 "TOK_WHITE        ",
981                 "TOK_STARTLINE    ",
982                 "TOK_STRAY        "
983         };
984         
985         for (tok=tl->first; tok; tok=tok->next) {
986                 fprintf(f, "%lu\t%s\t%s\t\"%s\"", (unsigned long)(i++),
987                         token_type_str[tok->type],
988                         sprint_token_flags(buf2, tok->flags),
989                         escape_string(&buf, tok->txt, tok->txt_size));
990                 #if 1 //print tok->orig
991                 fprintf(f, "\t\"%s\"\n", escape_string(&buf, tok->orig, tok->orig_size));
992                 #else
993                 fprintf(f, "\n");
994                 #endif
995         }
996         
997         array_free(buf);
998 }
999
1000 void tok_message_print(struct tok_message *m, struct token_list *tl) {
1001         struct tok_point pt;
1002         int resolved = tok_point_lookup(&pt, m->location, tl);
1003         
1004         if (tl->filename) {
1005                 printf("%s:%s", tl->filename, resolved ? "" : " ");
1006         }
1007         
1008         if (resolved) {
1009                 printf("%zu:%zu %s: %s\n",
1010                         pt.line+1, pt.col+1,
1011                         m->level==TM_DEBUG ? "debug" :
1012                         m->level==TM_INFO ? "info" :
1013                         m->level==TM_WARN ? "warning" :
1014                         m->level==TM_ERROR ? "error" :
1015                         m->level==TM_BUG ? "BUG" :
1016                         "???",
1017                         m->message);
1018         } else {
1019                 printf("%s: %s\n",
1020                         m->level==TM_DEBUG ? "debug" :
1021                         m->level==TM_INFO ? "info" :
1022                         m->level==TM_WARN ? "warning" :
1023                         m->level==TM_ERROR ? "error" :
1024                         m->level==TM_BUG ? "BUG" :
1025                         "???",
1026                         m->message);
1027         }
1028 }
1029
1030 void tok_message_dump(struct tok_message *m) {
1031         printf("%s: %s: %s\n",
1032                 m->level==TM_DEBUG ? "debug" :
1033                 m->level==TM_INFO ? "info" :
1034                 m->level==TM_WARN ? "warning" :
1035                 m->level==TM_ERROR ? "error" :
1036                 m->level==TM_BUG ? "BUG" :
1037                 "???", m->path, m->message);
1038 }
1039
1040 void tok_message_add(tok_message_queue *mq, enum tok_message_level level,
1041         const char *path, const char *loc, const char *fmt, ...) {
1042         struct tok_message msg = {.level=level, .path=path, .location=loc};
1043         va_list ap;
1044         
1045         if (!mq)
1046                 return;
1047         
1048         va_start(ap, fmt);
1049         msg.message = talloc_vasprintf(mq->item, fmt, ap);
1050         va_end(ap);
1051         
1052         enqueue(*mq, msg);
1053 }
1054
1055 void tok_message_queue_dump(const tok_message_queue *mq) {
1056         size_t i;
1057         for (i=0; i<queue_count(*mq); i++)
1058                 tok_message_dump(&queue_item(*mq, i));
1059 }
1060
1061
1062 #undef add
1063 #undef cstray
1064 #undef cident