Added module ccan_tokenizer from snapshot at:
author Joey Adams <joeyadams3.14159@gmail.com>
Fri, 3 Jul 2009 05:46:46 +0000 (01:46 -0400)
committer Joey Adams <joeyadams3.14159@gmail.com>
Fri, 3 Jul 2009 05:46:46 +0000 (01:46 -0400)
http://www.funsitelots.com/ccan/ccan_tokenizer-20090703.tar.bz2

17 files changed:
ccan/ccan_tokenizer/_info [new file with mode: 0644]
ccan/ccan_tokenizer/ccan_tokenizer.c [new file with mode: 0644]
ccan/ccan_tokenizer/ccan_tokenizer.h [new file with mode: 0644]
ccan/ccan_tokenizer/charflag.c [new file with mode: 0644]
ccan/ccan_tokenizer/charflag.h [new file with mode: 0644]
ccan/ccan_tokenizer/dict.c [new file with mode: 0644]
ccan/ccan_tokenizer/dict.h [new file with mode: 0644]
ccan/ccan_tokenizer/documentation [new file with mode: 0644]
ccan/ccan_tokenizer/number_constant.guppy [new file with mode: 0644]
ccan/ccan_tokenizer/queue.c [new file with mode: 0644]
ccan/ccan_tokenizer/queue.h [new file with mode: 0644]
ccan/ccan_tokenizer/read_cnumber.c [new file with mode: 0644]
ccan/ccan_tokenizer/read_cstring.c [new file with mode: 0644]
ccan/ccan_tokenizer/scripts/message_dump_to_messages.sh [new file with mode: 0755]
ccan/ccan_tokenizer/test/run-simple-token.c [new file with mode: 0644]
ccan/ccan_tokenizer/test/run.c [new file with mode: 0644]
ccan/ccan_tokenizer/todo [new file with mode: 0644]

diff --git a/ccan/ccan_tokenizer/_info b/ccan/ccan_tokenizer/_info
new file mode 100644 (file)
index 0000000..8c3c9df
--- /dev/null
@@ -0,0 +1,97 @@
+#include <string.h>
+#include <stdio.h>
+#include "config.h"
+
+/**
+ * ccan_tokenizer - A full-text lexer for C source files
+ *
+ * ccan_tokenizer generates a list of tokens given the contents of a C source
+ * or header file.
+ *
+ * Example:
+ *
+ * #include <ccan/ccan_tokenizer/ccan_tokenizer.h>
+ * #include <ccan/grab_file/grab_file.h>
+ * #include <err.h>
+ *
+ * void token_list_stats(const struct token_list *tl) {
+ *     size_t comment=0, white=0, stray=0, code=0, total=0;
+ *     size_t count = 0;
+ *     const struct token *i;
+ *
+ *     for (i=tl->first; i; i=i->next) {
+ *             size_t size = i->orig_size;
+ *             total += size;
+ *             count++;
+ *
+ *             if (token_type_is_comment(i->type))
+ *                     comment += size;
+ *             else if (i->type == TOK_WHITE)
+ *                     white += size;
+ *             else if (i->type == TOK_STRAY)
+ *                     stray += size;
+ *             else
+ *                     code += size;
+ *     }
+ *
+ *     printf("Code:        %.02f%%\n"
+ *            "White space: %.02f%%\n"
+ *            "Comments:    %.02f%%\n",
+ *            (double)code    * 100.0 / (double)total,
+ *            (double)white   * 100.0 / (double)total,
+ *            (double)comment * 100.0 / (double)total);
+ *     if (stray)
+ *             printf("Stray:       %.02f%%\n",
+ *                     (double)stray * 100.0 / (double)total);
+ *     printf("Total size:  %zu bytes with %zu tokens\n",
+ *             total, count);
+ * }
+ *
+ * int main(int argc, char *argv[]) {
+ *     size_t len;
+ *     char *file;
+ *     struct token_list *tl;
+ *     tok_message_queue mq;
+ *     queue_init(mq, NULL);
+ *
+ *     //grab the file
+ *     if (argc != 2) {
+ *             fprintf(stderr, "Usage: %s source_file\n", argv[0]);
+ *             return 1;
+ *     }
+ *     file = grab_file(NULL, argv[1], &len);
+ *     if (!file)
+ *             err(1, "Could not read file %s", argv[1]);
+ *
+ *     //tokenize the contents
+ *     tl = tokenize(file, len, &mq);
+ *
+ *     //print warnings, errors, etc.
+ *     while (queue_count(mq)) {
+ *             struct tok_message msg = dequeue(mq);
+ *             tok_message_print(&msg, tl);
+ *     }
+ *
+ *     //do neat stuff with the token list
+ *     token_list_stats(tl);
+ *
+ *     //free stuff
+ *     talloc_free(file); //implicitly frees tl
+ *     queue_free(mq);
+ *
+ *     return 0;
+ * }
+ */
+/*
+ * Answers metadata queries about this module.
+ *
+ * Takes exactly one argument.  For "depends" it prints, one per line, the
+ * CCAN modules this module needs and returns 0.  Any other argument, or a
+ * wrong argument count, returns 1.
+ */
+int main(int argc, char *argv[])
+{
+       /* Expect exactly one argument */
+       if (argc != 2)
+               return 1;
+
+       if (strcmp(argv[1], "depends") == 0) {
+               printf("ccan/array\n");
+               /* ccan_tokenizer.c includes <ccan/talloc/talloc.h> directly */
+               printf("ccan/talloc\n");
+               return 0;
+       }
+
+       return 1;
+}
diff --git a/ccan/ccan_tokenizer/ccan_tokenizer.c b/ccan/ccan_tokenizer/ccan_tokenizer.c
new file mode 100644 (file)
index 0000000..7d29e02
--- /dev/null
@@ -0,0 +1,1057 @@
+/*
+        Copyright (c) 2009  Joseph A. Adams
+        All rights reserved.
+        
+        Redistribution and use in source and binary forms, with or without
+        modification, are permitted provided that the following conditions
+        are met:
+        1. Redistributions of source code must retain the above copyright
+           notice, this list of conditions and the following disclaimer.
+        2. Redistributions in binary form must reproduce the above copyright
+           notice, this list of conditions and the following disclaimer in the
+           documentation and/or other materials provided with the distribution.
+        3. The name of the author may not be used to endorse or promote products
+           derived from this software without specific prior written permission.
+        
+        THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+        IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+        OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+        IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+        INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+        NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+        DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+        THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+        (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+        THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "ccan_tokenizer.h"
+
+#include <ccan/talloc/talloc.h>
+
+#include <assert.h>
+
+//Table of every operator, punctuator, and keyword spelling handed to
+//dict_build().  Each entry pairs a token id with its spelling:
+//single-character operators use the character itself as the id, while
+//multi-character operators and keywords use named ids.
+//
+//Operators are shown by operator precedence; based on
+// http://tigcc.ticalc.org/doc/opers.html#precedence .
+//Operators that appear at more than one precedence level are only named
+//in a comment at each level and are defined once under "Ambiguous".
+
+static struct dict_entry c_dictionary[] = {
+//1. Highest
+       {'(',"("}, {')',")"},
+       {'[',"["}, {']',"]"},
+       {'{',"{"}, {'}',"}"},
+       {'.',"."},
+       {PTR_OP,"->"},
+       
+//2. Unary
+       {'!',"!"}, {'~',"~"}, //prefix
+       {INC_OP,"++"}, {DEC_OP,"--"}, //prefix or postfix
+       // + - & *
+       
+//3. Multiplicative
+       // *
+       {'/',"/"}, {'%',"%"},
+       
+//4. Additive
+       // + -
+       
+//5. Shift
+       {LEFT_OP,"<<"}, {RIGHT_OP,">>"},
+       
+//6. Relational
+       {'<',"<"}, {'>',">"},
+       {LE_OP,"<="}, {GE_OP,">="},
+       
+//7. Equality
+       {EQ_OP,"=="}, {NE_OP,"!="},
+       
+//8. Bitwise AND
+       // &
+//9. Bitwise XOR
+       {'^',"^"},
+//10. Bitwise OR
+       {'|',"|"},
+
+//11. Logical AND
+       {AND_OP,"&&"},
+//12. Logical OR
+       {OR_OP,"||"},
+
+//13. Conditional
+       {'?',"?"}, {':',":"},
+
+//14. Assignment
+       {'=',"="},
+       {MUL_ASSIGN,"*="}, {DIV_ASSIGN,"/="}, {MOD_ASSIGN,"%="},
+       {ADD_ASSIGN,"+="}, {SUB_ASSIGN,"-="},
+       {AND_ASSIGN,"&="}, {XOR_ASSIGN,"^="}, {OR_ASSIGN,"|="},
+       {LEFT_ASSIGN,"<<="}, {RIGHT_ASSIGN,">>="},
+       
+//15. Comma
+       {',',","},
+
+//16. Semicolon
+       {';',";"},
+       
+//Misc
+       {ELLIPSIS,"..."},
+       {'#',"#"},
+       {DOUBLE_POUND,"##"},
+
+//Ambiguous
+       //unary or binary
+       {'+',"+"}, {'-',"-"},
+       {'&',"&"}, {'*',"*"},
+
+//Keywords
+       {_BOOL, "_Bool"},
+       {_COMPLEX, "_Complex"},
+       {_IMAGINARY, "_Imaginary"},
+       {BREAK, "break"},
+       {CASE, "case"},
+       {CHAR, "char"},
+       {CONST, "const"},
+       {CONTINUE, "continue"},
+       {DEFAULT, "default"},
+       {DO, "do"},
+       {DOUBLE, "double"},
+       {ELSE, "else"},
+       {ENUM, "enum"},
+       {EXTERN, "extern"},
+       {FLOAT, "float"},
+       {FOR, "for"},
+       {GOTO, "goto"},
+       {IF, "if"},
+       {INLINE, "inline"},
+       {INT, "int"},
+       {LONG, "long"},
+       {REGISTER, "register"},
+       {RESTRICT, "restrict"},
+       {RETURN, "return"},
+       {SHORT, "short"},
+       {SIGNED, "signed"},
+       {SIZEOF, "sizeof"},
+       {STATIC, "static"},
+       {STRUCT, "struct"},
+       {SWITCH, "switch"},
+       {TYPEDEF, "typedef"},
+       {UNION, "union"},
+       {UNSIGNED, "unsigned"},
+       {VOID, "void"},
+       {VOLATILE, "volatile"},
+       {WHILE, "while"},
+
+//Preprocessor keywords (except those already defined)
+//"else" and "if" are listed above as ordinary keywords, so their
+//entries stay commented out here.
+       {VA_ARGS, "__VA_ARGS__"},
+       {DEFINE, "define"},
+       {ELIF, "elif"},
+//     {ELSE, "else"},
+       {ENDIF, "endif"},
+       {ERROR, "error"},
+//     {IF, "if"},
+       {IFDEF, "ifdef"},
+       {IFNDEF, "ifndef"},
+       {INCLUDE, "include"},
+       {LINE, "line"},
+       {PRAGMA, "pragma"},
+       {UNDEF, "undef"},
+       {WARNING, "warning"},
+};
+
+#if 0
+
+//Compiled out by the surrounding #if 0.
+//Allocates a struct tokenizer under ctx, initializes its message queue,
+//and builds its own dictionary from c_dictionary.
+struct tokenizer *tokenizer_new(void *ctx) {
+       struct tokenizer *t = talloc(ctx, struct tokenizer);
+       t->ctx = ctx;
+       queue_init(t->mq, t);
+       t->dict = dict_build(t, c_dictionary, sizeof(c_dictionary)/sizeof(*c_dictionary));
+       
+       return t;
+}
+
+#endif
+
+//Path prefix string for this file's messages; presumably consumed by the
+//tok_msg_* macros when they build a message path -- TODO confirm against
+//their definitions.
+#define MESSAGE_PATH "tokenize/"
+
+//Builds tl->txt, a copy of tl->orig in which each backslash-broken line is
+//joined with the line that follows it, and records where every line starts
+//in both buffers (tl->olines points into orig, tl->tlines into txt).
+//
+//A line is treated as backslash-broken when its last non-white character is
+//a backslash and the line does not run to the end of the input.  For such a
+//line only the text before the backslash is copied; the backslash, any
+//trailing whitespace, and the line terminator are left out.
+//tok_msg_warn is invoked when whitespace follows the backslash.
+//
+//Reads tl->orig and tl->orig_size; writes the olines, tlines, and txt
+//fields (and their sizes) of tl.
+static void unbreak_backslash_broken_lines(struct token_list *tl, tok_message_queue *mq) {
+       const char *s = tl->orig, *e = s+tl->orig_size;
+       array_char txt = array_new(tl);
+       array(const char*) olines = array_new(tl);
+       array(const char*) tlines = array_new(tl);
+       
+       //do/while: the body runs once even for empty input, so at least one
+       //line start is always recorded
+       do {
+               const char *line_start = s, *line_end;
+               const char *lnw; //last non-white
+               size_t start_offset = txt.size;
+               
+               //scan to the next line and find the last non-white character in the line
+               while (s<e && !creturn(*s)) s++;
+               line_end = s;
+               lnw = s;
+               //lnw ends up one past the last non-white character
+               while (lnw>line_start && cspace(lnw[-1])) lnw--;
+               if (s<e && creturn(*s)) {
+                       s++;
+                       //check for non-standard newlines (i.e. "\r", "\r\n", or "\n\r")
+                       //'\n'+'\r'-s[-1] is the *other* line terminator character,
+                       //so only a mixed pair is consumed as one terminator
+                       if (s<e && *s=='\n'+'\r'-s[-1])
+                               s++;
+               }
+               
+               //add the backslash-break-free version of the text
+               if (lnw>line_start && lnw[-1]=='\\' && line_end<e) {
+                       //copy everything before the backslash; the terminator is dropped
+                       array_append_items(txt, line_start, lnw-1-line_start);
+                       if (lnw<e && cspace(*lnw)) {
+                               tok_msg_warn(spaces_after_backslash_break, lnw,
+                                       "Trailing spaces after backslash-broken line");
+                       }
+               } else
+                       //ordinary line: copy it whole, terminator included
+                       array_append_items(txt, line_start, s-line_start);
+               
+               //add the line starts for this line
+               array_append(olines, line_start);
+               array_append(tlines, (const char*)start_offset);
+                       //Since the txt buffer moves when expanded, we're storing offsets
+                       //  for now.  Once we're done building txt, we can add the base
+                       //  of it to all the offsets to make them pointers.
+       } while (s<e);
+       
+       //stick a null terminator at the end of the text
+       //(txt.size itself is not increased, so the terminator is not counted)
+       array_realloc(txt, txt.size+1);
+       txt.item[txt.size] = 0;
+       
+       //convert the line start offsets to pointers
+       array_for(i, tlines, *i = txt.item + (size_t)*i);
+       
+       tl->olines = olines.item;
+       tl->olines_size = olines.size;
+       tl->txt = txt.item;
+       tl->txt_size = txt.size;
+       tl->tlines = tlines.item;
+       tl->tlines_size = tlines.size;
+}
+
+//Turns a keyword token into a plain identifier when its keyword is one that
+//opkw_is_directive_only() accepts, or is __VA_ARGS__.
+//Tokens of any other type are left untouched.
+static void normal_keyword(struct token *tok) {
+       if (tok->type != TOK_KEYWORD)
+               return;
+       
+       if (opkw_is_directive_only(tok->opkw) || tok->opkw == VA_ARGS)
+               tok->type = TOK_IDENTIFIER;
+}
+
+//Reports whether the last non-ignored token in [start, end) is the "..."
+//operator.  Returns 1 if so, 0 otherwise (including for an empty or
+//all-ignored range).
+static int define_parmlist_has_ellipsis(struct token *start, struct token *end) {
+       struct token *tail = end;
+       
+       //back up over trailing ignored tokens
+       while (tail > start && token_is_ignored(tail-1))
+               tail--;
+       
+       //nothing left to inspect
+       if (tail <= start)
+               return 0;
+       
+       tail--;
+       return tail->type == TOK_OPERATOR && tail->opkw == ELLIPSIS;
+}
+
+//Used to label __VA_ARGS__ as keywords within applicable macro expansions
+//Start should follow the DEFINE directive keyword
+//
+//Walks the tokens of a #define line in [start, end):
+//  1. the first non-ignored token (the macro name) goes through
+//     normal_keyword();
+//  2. if the very next token is '(', every token up to the next ')' goes
+//     through normal_keyword() as well;
+//  3. if that parameter list ends in "...", the rest of the line keeps
+//     __VA_ARGS__ as a keyword and only directive-only keywords become
+//     identifiers; otherwise the rest of the line goes through
+//     normal_keyword(), which also turns __VA_ARGS__ into an identifier.
+static void this_is_a_define(struct token *start, struct token *end) {
+       struct token *i = start, *pl_start;
+       
+       //skip past the identifier that is defined
+       while (i<end && token_is_ignored(i)) i++;
+       if (i >= end)
+               return;
+        //TODO:  check i->type to make sure it's an identifier, throw error otherwise
+       normal_keyword(i++);
+       
+       //see if this is actually a variadic macro
+       //(the '(' must be the token right after the name; nothing is skipped)
+       if (!(i<end && i->type==TOK_OPERATOR && i->opkw=='('))
+               goto not_va_args;
+       pl_start = ++i;
+       while (i<end && !(i->type==TOK_OPERATOR && i->opkw==')'))
+               normal_keyword(i++);
+       //i++ steps past the ')'.
+       //NOTE(review): when no ')' was found i already equals end, so this
+       //leaves i at end+1; the loops below compare with '<' and do not run.
+       if (!define_parmlist_has_ellipsis(pl_start, i++))
+               goto not_va_args;
+       
+       //We have arrived at the macro expansion and know there is a ... argument
+       //Thus, we'll only change directive-only keywords to identifiers
+       for(; i<end; i++) {
+               if (i->type==TOK_KEYWORD && opkw_is_directive_only(i->opkw))
+                       i->type = TOK_IDENTIFIER;
+       }
+       
+       //after the loop above i == end, so the loop below is a no-op on
+       //the variadic path
+not_va_args:
+       while (i < end)
+               normal_keyword(i++);
+}
+
+//fill the flags field of each token and untangle keywords and such
+//
+//Second pass over one line of tokens, [start, end).  start must be the
+//line's TOK_STARTLINE token.
+//
+//If the first non-ignored token is the '#' operator, the line is a
+//preprocessor line: the '#' becomes TOK_LEADING_POUND, flags.pp is set on
+//every token of the line, and the first non-ignored token after the '#'
+//gets flags.pp_directive.  A #define line is handed to this_is_a_define();
+//on every other line the remaining tokens go through normal_keyword().
+static void finalize_line(struct token *start, struct token *end) {
+       struct token *i = start, *j;
+       
+       assert(start<end && start->type==TOK_STARTLINE);
+       i++;
+       
+       while (i<end && token_is_ignored(i)) i++;
+       
+       if (i<end && i->type==TOK_OPERATOR && i->opkw=='#') {
+       //preprocessor line
+               i->type = TOK_LEADING_POUND;
+               
+               //set pp on all tokens in this line
+               for (j=start; j<end; j++)
+                       j->flags.pp = 1;
+               
+               //find the relevant token after the '#'
+               for (i++; i<end; i++) {
+                       if (!token_is_ignored(i)) {
+                               i->flags.pp_directive = 1;
+                               //a keyword that is not a directive name is just an identifier here
+                               if (i->type==TOK_KEYWORD && !opkw_is_directive(i->opkw))
+                                       i->type = TOK_IDENTIFIER;
+                               //TODO:  Handle invalid preprocessor directives (e.g. #+ )
+                               
+                               if (i->type==TOK_KEYWORD && i->opkw==DEFINE) {
+                                       //a single call processes the whole rest of the line
+                                       this_is_a_define(i+1, end);
+                               } else {
+                                       while (++i < end)
+                                               normal_keyword(i);
+                               }
+                               //only the first non-ignored token is the directive
+                               break;
+                       }
+               }
+       } else {
+       //normal line
+               while (i < end)
+                       normal_keyword(i++);
+       }
+}
+
+//fill the list, flags, line, col, orig, and orig_size fields of each token
+//convert identifiers mistaken for preprocessor keywords (e.g. ifdef) to identifiers
+//
+//[start, end) is the contiguous token array and must not be empty.
+//Sets tl->first and tl->last, links the tokens through prev/next, maps each
+//token's txt pointer to a line and column using tl->tlines, and derives the
+//matching position in the original text from tl->olines.  Each completed
+//line is then passed to finalize_line().
+static void finalize(struct token_list *tl, struct token *start, struct token *end) {
+       const char * const *lss = tl->tlines;
+       const char * const *lse = lss + tl->tlines_size;
+       struct token *i;
+       struct token *startline = NULL;
+       
+       assert(start < end);
+       
+       tl->first = start;
+       tl->last = end-1;
+       
+       //the loop has no condition: it exits through the break at i >= end
+       for (i=start; ; i++) {
+               //perform a second pass on each line
+               //(runs on the previous line once the next TOK_STARTLINE or the
+               // end is reached, i.e. after that line's flags were cleared below)
+               if (i >= end || i->type == TOK_STARTLINE) {
+                       if (startline)
+                               finalize_line(startline, i);
+                       startline = i;
+               }
+               
+               if (i >= end) {
+                       //the last token extends to the end of the original text
+                       end[-1].orig_size = tl->orig+tl->orig_size - end[-1].orig;
+                       break;
+               }
+               
+               //set up the list links
+               i->prev = i>start ? i-1 : NULL;
+               i->next = i+1<end ? i+1 : NULL;
+               
+               //if i->txt starts on a later line, advance to it
+               while (lss+1<lse && i->txt >= lss[1] && i->txt > lss[0])
+                       lss++;
+               
+               //set up line, col, orig, and orig_size
+               i->line = lss - tl->tlines;
+               i->col = i->txt - *lss;
+               i->orig = tl->olines[i->line] + i->col;
+               //a token's orig_size is only known once the next token's orig is
+               if (i > start)
+                       i[-1].orig_size = i->orig - i[-1].orig;
+               
+               //NOTE(review): this bound is checked after olines[i->line] was
+               //already read above.
+               assert(i->line < tl->olines_size);
+               
+               //clear the flags
+               memset(&i->flags, 0, sizeof(i->flags));
+       }
+}
+
+//Appends one token to the variable named `array` in the enclosing scope.
+//The macro arguments form the token's initializer list; txt and txt_size
+//are then filled in from the enclosing scope's `orig` and `s`, so the token
+//covers [orig, s).
+#define add(...) do { \
+               struct token tok = {__VA_ARGS__}; \
+               tok.txt = orig; \
+               tok.txt_size = s-orig; \
+               array_append(array, tok); \
+       } while (0)
+
+//True for characters that tokenize() gathers into TOK_STRAY tokens:
+//control and extended characters, plus '@', '`', and a backslash.
+#define cstray(c) (ccontrol(c) || cextended(c) || (c)=='@' || (c)=='`' || (c)=='\\')
+//True for characters that may appear in an identifier.  The argument is
+//parenthesized at every use so an expression argument groups as intended.
+#define cident(c) (cletter(c) || cdigit(c) || (c)=='_' || (c)=='$')
+       //believe it or not, $ is a valid character in an identifier
+
+//Dictionary shared by every tokenize() call; NULL until tokenize() builds
+//it on first use.
+struct dict *tokenizer_dict = NULL;
+
+//Frees the shared dictionary; tokenize() registers this with atexit().
+static void free_tokenizer_dict(void) {
+       talloc_free(tokenizer_dict);
+}
+
+//Tokenizes the orig_size bytes of C source starting at orig and returns the
+//resulting token list.
+//
+//orig is passed to talloc() as the parent of the returned list, and the
+//list's buffers are allocated under the list.
+//mq is passed on to the helpers that report problems (read_cnumber,
+//read_cstring, unbreak_backslash_broken_lines).
+//
+//The shared dictionary is built on the first call and freed by an atexit()
+//handler.  Scanning runs over tl->txt (the text with backslash-broken lines
+//joined), not over orig directly; finalize() maps tokens back to orig.
+struct token_list *tokenize(const char *orig, size_t orig_size,
+                               tok_message_queue *mq) {
+       struct token_list *tl = talloc(orig, struct token_list);
+       const char *s, *e;
+       //NOTE(review): stray_count is only read by the commented-out block
+       //near the end of this function.
+       size_t stray_count=0, cr_count=0;
+       array(struct token) array = array_new(tl);
+       //set to 1 right after a "#include" at the start of a line, so that the
+       //next "..." or <...> is lexed as an include string
+       int only_pound_include = 0;
+       
+       if (!tokenizer_dict) {
+               tokenizer_dict = dict_build(NULL, c_dictionary,
+                       sizeof(c_dictionary)/sizeof(*c_dictionary));
+               atexit(free_tokenizer_dict);
+       }
+       
+       tl->orig = orig;
+       tl->orig_size = orig_size;
+       unbreak_backslash_broken_lines(tl, mq);
+       tl->filename = NULL;
+       
+       s = tl->txt;
+       e = s + tl->txt_size;
+       
+       //the token list always begins with a TOK_STARTLINE for the first line
+       array_appends(array, {
+               .type = TOK_STARTLINE,
+               .txt = s,
+               .txt_size = 0
+       } );
+       
+       while (s<e) {
+               //shadows the orig parameter: start of the token being scanned
+               const char *orig = s;
+               char c = *s++;
+               //cleared by the whitespace and comment branches so they do not
+               //reset only_pound_include at the bottom of the loop
+               int added_something = 1;
+               
+               if (cstray(c)) {
+                       stray_count++;
+                       //gather the whole run of stray characters into one token
+                       while (s<e && cstray(*s)) {
+                               s++;
+                               stray_count++;
+                       }
+                       add(.type = TOK_STRAY);
+                       
+                       /* This has the potential to be very noisy on binary
+                          files, but it really is quite useful. */
+                       tok_msg_error(stray_segment, orig,
+                               "%zu stray characters", s-orig);
+               
+               } else if (creturn(c)) {
+                       //check for non-standard newlines (i.e. "\r", "\r\n", or "\n\r")
+                       //'\n'+'\r'-c is the other line terminator character
+                       if (s<e && *s=='\n'+'\r'-c) {
+                               s++;
+                               cr_count++;
+                       } else if (c=='\r')
+                               cr_count++;
+                       
+                       add(.type = TOK_WHITE);
+                       //move orig up so the TOK_STARTLINE below is zero-length at s
+                       orig = s;
+                       
+                       //add a TOK_STARTLINE for the next line unless this is the end of the document
+                       if (s<e)
+                               add(.type = TOK_STARTLINE);
+                       
+                       only_pound_include = 0;
+               
+               } else if (cspace(c)) {
+                       //skip over the remaining whitespace
+                       while (s<e && cspace(*s)) s++;
+                       add(.type = TOK_WHITE);
+                       added_something = 0;
+               
+               } else if (cdigit(c) || (c=='.' && s<e && cdigit(*s))) {
+                       //numeric constant: read_cnumber fills in tok, starting
+                       //again from the first character (hence s-1)
+                       struct token tok;
+                       s = read_cnumber(&tok, s-1, e, mq);
+                       tok.txt = orig;
+                       tok.txt_size = s-orig;
+                       array_append(array, tok);
+                       
+               } else if (csymbol(c) || cident(c)) {
+                       if (only_pound_include && (c=='"' || c=='<')) { //include string
+                               char *include;
+                               char end = c=='"' ? '"' : '>';
+                               short type = c=='"' ? TOK_STRING_IQUOTE : TOK_STRING_IANGLE;
+                               
+                               //the include string ends at the terminator or the end of the line
+                               while (s<e && !creturn(*s) && *s!=end) s++;
+                               //copy the text between the delimiters
+                               include = talloc_strndup(tl, orig+1, s-(orig+1));
+                               
+                               if (s<e && *s==end) {
+                                       s++;
+                               } else {
+                                       tok_msg_error(include_missing_terminator, orig,
+                                               "Missing terminating %c character", end);
+                               }
+                               
+                               add(.type = type,
+                                       {.include = include});
+                       } else if (c=='\'' || c=='\"') { //character or string literal
+                               array_char string = array_new(tl);
+                               s = read_cstring(&string, s, e, c, mq);
+                               if (s<e) s++; //advance past endquote (if available)
+                               add(.type = c=='\'' ? TOK_CHAR : TOK_STRING,
+                                   {.string = string});
+                       } else if (c=='/' && s<e && (*s=='*' || *s=='/')) { //comment
+                               if (*s++ == '*') { /* C-style comment */
+                                       //s is now past the opening slash-star, so s-2 is the slash
+                                       const char *comment_start = s-2;
+                                       for (;;s++) {
+                                               //fewer than two characters left: no terminator can follow
+                                               if (s+1 >= e) {
+                                                       s = e;
+                                                       tok_msg_error(unterminated_comment, comment_start,
+                                                               "Unterminated comment");
+                                                       break;
+                                               }
+                                               if (s[0]=='*' && s[1]=='/') {
+                                                       s += 2;
+                                                       break;
+                                               }
+                                       }
+                                       add(.type = TOK_CCOMMENT);
+                               } else { // C++-style comment
+                                       //runs up to, but not including, the line terminator
+                                       while (s<e && !creturn(*s)) s++;
+                                       add(.type = TOK_CPPCOMMENT);
+                               }
+                               added_something = 0;
+                       
+                       } else { //operator, keyword, or identifier
+                               struct dict_entry *ent;
+                               //back up to the first character and find where an
+                               //identifier starting here would end
+                               const char *ident_e = --s;
+                               while (ident_e<e && cident(*ident_e) ) ident_e++;
+                               
+                               ent = dict_lookup(tokenizer_dict, &s, e);
+                               if (cident(c)) { //keyword or identifier
+                                       //a keyword only counts when the dictionary match
+                                       //ends exactly where the identifier ends
+                                       if (ent && s==ident_e) {
+                                               add(.type = TOK_KEYWORD,
+                                                       {.opkw = ent->id});
+                                               if (ent->id == INCLUDE) {
+                                                       //hacky way to lex #include string properly
+                                                       //walk back from the token just added: ignored
+                                                       //tokens, then '#', then ignored tokens, then
+                                                       //TOK_STARTLINE
+                                                       struct token *ts = array.item;
+                                                       struct token *tp = ts+array.size-1;
+                                                       while (tp>ts && token_is_ignored(tp-1))
+                                                               tp--;
+                                                       if (tp>ts && token_is_op(tp-1, '#')) {
+                                                               tp--;
+                                                               while (tp>ts && token_is_ignored(tp-1))
+                                                                       tp--;
+                                                               if (tp>ts && tp[-1].type==TOK_STARTLINE) {
+                                                                       only_pound_include = 1;
+                                                                       //continue skips the reset at the bottom of the loop
+                                                                       continue;
+                                                               }
+                                                       }
+                                               }
+                                       } else {
+                                               s = ident_e;
+                                               add(.type = TOK_IDENTIFIER);
+                                       }
+                               } else if (ent) { //operator
+                                       add(.type = TOK_OPERATOR,
+                                           {.opkw = ent->id});
+                               } else { //invalid symbol (shouldn't happen)
+                                       tok_msg_bug(unrecognized_symbol, s,
+                                               "Unrecognized symbol \'%c\'", c);
+                                       s++;
+                                       add(.type = TOK_STRAY);
+                               }
+                       }
+               }
+               
+               if (added_something)
+                       only_pound_include = 0;
+       }
+       
+       /*if (stray_count) {
+               tok_msg_error(stray_characters, NULL,
+                       "%lu stray characters in text", (unsigned long)stray_count);
+       }*/
+       if (cr_count) {
+               tok_msg_warn(nonstandard_newlines, NULL,
+                       "Text contains non-standard line terminators");
+       }
+       
+       //fill in list links, positions, and flags; fix up keywords line by line
+       finalize(tl, array.item, array.item+array.size);
+       
+       return tl;
+}
+
+//Returns the number of tokens in tl, found by following the next links
+//from tl->first until a NULL link is reached.
+size_t token_list_count(const struct token_list *tl) {
+       const struct token *tok = tl->first;
+       size_t count = 0;
+       
+       while (tok) {
+               count++;
+               tok = tok->next;
+       }
+       
+       return count;
+}
+
+//Binary-searches the ascending array of line-start pointers for the line
+//containing ptr and returns that line's index.
+//When several consecutive entries are equal, the index of the last of
+//them is returned.  Returns 0 when line_count is 0.
+static size_t find_line(const char *ptr, const char * const *lines, size_t line_count) {
+       size_t lo = 0;            //first candidate index
+       size_t span = line_count; //number of candidates still in play
+       
+       while (span > 1) {
+               size_t half = span / 2;
+               if (ptr < lines[lo + half]) {
+                       //ptr lies before the midpoint line: keep the lower part
+                       span = half;
+               } else {
+                       //ptr lies at or after the midpoint line: keep the upper part
+                       lo += half;
+                       span -= half;
+               }
+       }
+       
+       //select the *last* of equivalent lines
+       while (lo + 1 < line_count && lines[lo] == lines[lo + 1])
+               lo++;
+       
+       return lo;
+}
+
+//Translates a pointer into either the unbroken text (tl->txt) or the
+//original text (tl->orig) into a full tok_point: line, column, and the
+//matching pointer in both buffers.
+//
+//*out is always zeroed first.  Returns 1 when ptr lies inside (or one past
+//the end of) one of the two buffers and *out was filled in; returns 0 when
+//tl is NULL or ptr belongs to neither buffer.
+int tok_point_lookup(struct tok_point *out, const char *ptr,
+                       const struct token_list *tl) {
+       size_t line_count;
+       
+       memset(out, 0, sizeof(*out));
+       if (!tl)
+               return 0;
+       
+       //only read through tl once it is known to be non-NULL
+       line_count = tl->olines_size;
+       
+       if (ptr >= tl->txt && ptr <= tl->txt+tl->txt_size) {
+               //ptr points into the unbroken text
+               out->txt = ptr;
+               out->line = find_line(ptr, tl->tlines, line_count);
+               if (out->line < line_count) {
+                       out->col = ptr - tl->tlines[out->line];
+                       out->orig = tl->olines[out->line] + out->col;
+               } else {
+                       out->col = 0;
+                       out->orig = tl->orig + tl->orig_size;
+               }
+               return 1;
+       } else if (ptr >= tl->orig && ptr <= tl->orig+tl->orig_size) {
+               //ptr points into the original text
+               out->orig = ptr;
+               out->line = find_line(ptr, tl->olines, line_count);
+               if (out->line < line_count) {
+                       const char *tline_start = tl->tlines[out->line];
+                       const char *tline_end = out->line+1 < line_count ?
+                               tl->tlines[out->line+1] :
+                               tl->txt + tl->txt_size;
+                       
+                       out->col = ptr - tl->olines[out->line];
+                       out->txt = tline_start + out->col;
+                       
+                       //an original line can be longer than its unbroken
+                       //counterpart; clamp to the end of that line
+                       if (out->txt > tline_end)
+                               out->txt = tline_end;
+               } else {
+                       out->col = 0;
+                       out->txt = tl->txt + tl->txt_size;
+               }
+               return 1;
+       } else {
+               return 0;
+       }
+}
+
+//Resets *buf to an empty string, then appends an escaped rendering of the
+//size bytes starting at str.  Bytes that ccontrol() accepts are written as
+//\xNN; tab, newline, vertical tab, form feed, carriage return, double
+//quote, and backslash get their two-character escapes; every other byte is
+//copied as is.  Returns buf->item.
+static char *escape_string(array_char *buf, const char *str, size_t size) {
+       const char *stop = str + size;
+       const char *p;
+       
+       array_from_lit(*buf, "");
+       
+       for (p = str; p < stop; p++) {
+               char scratch[8];
+               const char *piece = scratch;
+               unsigned char ch = (unsigned char)*p;
+               
+               if (ccontrol(ch)) {
+                       sprintf(scratch, "\\x%02X", ch);
+               } else if (ch == '\t') {
+                       piece = "\\t";
+               } else if (ch == '\n') {
+                       piece = "\\n";
+               } else if (ch == '\v') {
+                       piece = "\\v";
+               } else if (ch == '\f') {
+                       piece = "\\f";
+               } else if (ch == '\r') {
+                       piece = "\\r";
+               } else if (ch == '"') {
+                       piece = "\\\"";
+               } else if (ch == '\\') {
+                       piece = "\\\\";
+               } else {
+                       //ordinary byte: a one-character string
+                       scratch[0] = ch;
+                       scratch[1] = 0;
+               }
+               
+               array_append_string(*buf, piece);
+       }
+       
+       return buf->item;
+}
+
+//Returns 1 when the txt span equals the orig span once every
+//backslash-broken line break (a backslash, optional whitespace, then a line
+//terminator) has been removed from orig; returns 0 otherwise.
+static int txt_orig_matches(const char *txt, size_t txt_size, const char *orig, size_t orig_size) {
+       const char *ts = txt, *te = ts+txt_size;
+       const char *os = orig, *oe = os+orig_size;
+       
+       //each round compares the text up to the next backslash break in orig,
+       //then skips over that break
+       do {
+               const char *ob = os; //start of next backslash break
+               const char *obe = os; //end of next backslash break
+               size_t size; //amount of text to compare for this round
+               
+               while (ob<oe && *ob!='\\') ob++;
+               obe = ob;
+               if (obe < oe) { //there's a backslash
+                       obe++;
+                       while (obe<oe && cspace(*obe)) obe++;
+                       if (obe<oe && creturn(*obe)) { //there's a backslash-broken line
+                               obe++;
+                               //also take the second character of a "\r\n" or "\n\r" pair
+                               if (obe<oe && *obe == '\n'+'\r'-obe[-1])
+                                       obe++;
+                       } else //this is just a plain old backslash
+                               //include it (and the whitespace after it) in the compared text
+                               ob = obe;
+               }
+               
+               size = ob-os;
+               
+               if (ts+size > te || memcmp(ts, os, size))
+                       return 0;
+               ts += size;
+               os = obe;
+       } while (ts<te);
+       
+       //both spans must have been consumed completely
+       if (ts != te || os != oe)
+               return 0;
+       
+       return 1;
+}
+
+//Tells whether the text at s (bounded by e) begins with a backslash-broken
+//  line: a backslash, any run of cspace characters, then a line ending.
+//  If so, stores the position just past the line ending in *end and returns 1.
+//  Otherwise returns 0 and leaves *end untouched.
+static int is_backslash_break(const char **end, const char *s, const char *e) {
+       const char *p = s;
+       
+       if (p>=e || *p != '\\')
+               return 0;
+       
+       //skip the backslash and any blanks that follow it
+       p++;
+       while (p<e && cspace(*p))
+               p++;
+       
+       if (p>=e || !creturn(*p))
+               return 0;
+       
+       //consume the line ending; '\n'+'\r'-c is the other line-ending
+       //  character, so a two-character ending is taken as one
+       p++;
+       if (p<e && *p == '\n'+'\r'-p[-1])
+               p++;
+       
+       *end = p;
+       return 1;
+}
+
+//Reports a failed check: prints fmt (printf-style, with optional arguments)
+//  followed by a newline to the FILE* named err in the enclosing function,
+//  then makes that function return 0
+#define failed(fmt, ...) do {fprintf(err, fmt "\n", ##__VA_ARGS__); return 0; } while(0)
+
+//tests that should pass on an untainted token list out of the tokenize() function
+//  Verifies that the tokens sit in one array whose prev/next links follow
+//  array order, and that the tokens' txt and orig segments cover tl->txt and
+//  tl->orig back to back.
+//  Returns 1 if everything holds; otherwise prints the reason to err and
+//  returns 0.
+static int token_list_sanity_check_initial(const struct token_list *tl, FILE *err) {
+       struct token *first = tl->first;
+       struct token *last = tl->last;
+       struct token *i;
+       const char *txt=tl->txt, *orig=tl->orig;
+       const char *txt_e = txt+tl->txt_size, *orig_e = orig+tl->orig_size;
+       
+       if ((char*)first > (char*)last ||
+               (size_t)((char*)last - (char*)first) % sizeof(struct token))
+               failed("Token list pointers don't look right");
+       
+       //token list should not end with TOK_STARTLINE unless
+       //  the document is empty
+       if (last!=first && last->type==TOK_STARTLINE)
+               failed("Token list ends with TOK_STARTLINE");
+       
+       for (i=first; i; i=i->next) {
+               //Verify list links
+               if (i != first && i->prev != i-1)
+                       failed("list.prev is incorrect");
+               if (i != last && i->next != i+1)
+                       failed("list.next is incorrect");
+               
+               //Make sure txt segments fill the entire tl->txt
+               if (i->txt != txt)
+                       failed("txt does not fill the token list");
+               txt += i->txt_size;
+               if (txt > txt_e)
+                       failed("txt is out of bounds");
+               
+               //Make sure orig segments fill the entire tl->orig
+               if (i->orig != orig)
+                       failed("orig does not fill the token list");
+               orig += i->orig_size;
+               if (orig > orig_e)
+                       failed("orig is out of bounds");
+       }
+       
+       //the loop already rejected overshoot, so a mismatch here means the
+       //  segments ended before the end of the buffer
+       if (txt != txt_e)
+               failed("txt segments stop short of the end of txt");
+       if (orig != orig_e)
+               failed("orig segments stop short of the end of orig");
+       
+       return 1;
+}
+
+//Checks the internal consistency of a token list: the leading TOK_STARTLINE,
+//  the edge links, each token's line/col, sizes and txt/orig agreement, the
+//  results of tok_point_lookup, and the olines/tlines tables.
+//  Returns 1 if every check passes; otherwise writes a description of the
+//  first problem found to err and returns 0.
+int token_list_sanity_check(const struct token_list *tl, FILE *err) {
+       struct token *first = tl->first;
+       struct token *last = tl->last;
+       struct token *i;
+       int initial = 1;
+       
+       if (tl->first == NULL || tl->last == NULL)
+               failed("Token list is completely empty");
+       
+       if (first->type!=TOK_STARTLINE ||
+           first->txt!=tl->txt || first->txt_size!=0 ||
+           first->orig!=tl->orig || first->orig_size!=0 ||
+           first->line!=0 || first->col!=0)
+               failed("Token list does not start with a valid TOK_STARTLINE");
+       
+       if (first->prev!=NULL || last->next!=NULL)
+               failed("Token edge links are not NULL");
+       
+       for (i=first; i; i=i->next) {
+               //Make sure the line number can index both line tables
+               if (i->line >= tl->tlines_size || i->line >= tl->olines_size)
+                       failed("Token line number is out of range");
+               
+               //Verify line,col
+               if (tl->tlines[i->line] + i->col != i->txt)
+                       failed("line,col is wrong against txt");
+               if (tl->olines[i->line] + i->col != i->orig)
+                       failed("line,col is wrong against orig");
+               
+               //Make sure tokens have proper sizes
+               if (i->type!=TOK_STARTLINE && (i->txt_size==0 || i->orig_size==0 || i->txt_size > i->orig_size) )
+                       failed("Token is empty");
+               if (i->type==TOK_STARTLINE && (i->txt_size!=0 || i->orig_size!=0) )
+                       failed("TOK_STARTLINE is non-empty");
+               
+               //Make sure TOK_WHITE actually contains white tokens
+               if (i->type==TOK_WHITE) {
+                       const char *s = i->txt, *e = s+i->txt_size;
+                       while (s<e && cwhite(*s)) s++;
+                       if (s != e)
+                               failed("TOK_WHITE does not contain only white characters");
+               }
+               
+               //Make sure txt and orig match exactly except for backslash line breaks
+               if (!txt_orig_matches(i->txt, i->txt_size, i->orig, i->orig_size)) {
+                       array_char buf = array_new(NULL);
+                       fprintf(err,
+                               "txt and orig do not match:\n"
+                               "\ttxt  = \"%s\"\n",
+                               escape_string(&buf, i->txt, i->txt_size) );
+                       fprintf(err, "\torig = \"%s\"\n",
+                               escape_string(&buf, i->orig, i->orig_size) );
+                       
+                       array_free(buf);
+                       return 0;
+               }
+               
+               //Make sure tok_point_lookup returns correct point
+               {
+                       struct tok_point tok_point;
+                       const char *t=i->txt, *o=i->orig, *e=o+i->orig_size, *p;
+                       size_t line=i->line, col=i->col;
+                       
+                       //Looks ptr up and compares the result with the position
+                       //  tracked in t, o, line and col.  A failed lookup is
+                       //  only an error while initial is set.
+                       #define check(ptr) do { \
+                               if (tok_point_lookup(&tok_point, ptr, tl)) { \
+                                       if (tok_point.txt != t || tok_point.orig != o) \
+                                               failed("tok_point_lookup on txt reported incorrect txt/orig (orig is %d, should be %d)", \
+                                               (int)(tok_point.orig-i->orig), (int)(o-i->orig)); \
+                                       if (tok_point.line != line || tok_point.col != col) \
+                                               failed("tok_point_lookup on txt reported incorrect line/col (off by %d, %d)", \
+                                               (int)(tok_point.line-line), (int)(tok_point.col-col)); \
+                               } else if (initial) {\
+                                       failed("tok_point_lookup failed on initial token list"); \
+                               } \
+                       } while(0)
+                       
+                       for (;;) {
+                               //bytes of a backslash break exist only in orig, so
+                               //  only o advances over them
+                               while (is_backslash_break(&p, o, e)) {
+                                       while (o<p) {
+                                               check(o);
+                                               o++;
+                                               col++;
+                                       }
+                                       col = 0;
+                                       line++;
+                               }
+                               if (o >= e)
+                                       break;
+                               do {
+                                       if (creturn(*o)) {
+                                               p = o+1;
+                                               if (p<e && *p=='\n'+'\r'-p[-1])
+                                                       p++;
+                                               while (o<p) {
+                                                       check(o);
+                                                       check(t);
+                                                       t++, o++, col++;
+                                               }
+                                               line++;
+                                               col = 0;
+                                       } else {
+                                               check(o);
+                                               check(t);
+                                               o++, t++, col++;
+                                       }
+                               } while (o<e && *o!='\\');
+                       }
+                       
+                       #undef check
+               }
+       }
+       
+       //Verify olines and tlines
+       {
+               const char *s = tl->orig, *e = s+tl->orig_size;
+               size_t ln, line_count = tl->olines_size;
+               
+               //both line arrays should be exactly the same size
+               if (tl->olines_size != tl->tlines_size)
+                       failed("olines and tlines have different sizes");
+               
+               for (ln=0; s<e; ln++) {
+                       const char *line_start = s, *line_end;
+                       size_t tline_size, oline_size;
+                       const char *p;
+                       
+                       //orig must not hold more lines than the tables record;
+                       //  this also keeps the table reads below in bounds
+                       if (ln >= line_count)
+                               failed("orig has more lines than olines/tlines record");
+                       
+                       if (ln+1 < line_count)
+                               tline_size = tl->tlines[ln+1] - tl->tlines[ln];
+                       else
+                               tline_size = tl->txt+tl->txt_size - tl->tlines[ln];
+                       
+                       while (s<e && !creturn(*s)) s++;
+                       line_end = s;
+                       if (s<e) {
+                               s++;
+                               if (s<e && *s=='\n'+'\r'-s[-1])
+                                       s++;
+                       }
+                       
+                       oline_size = s-line_start;
+                       
+                       //verify that olines elements are correct
+                       if (line_start != tl->olines[ln])
+                               failed("olines entry does not point at the start of its line");
+                       
+                       //verify that tlines elements are in range
+                       p = tl->tlines[ln];
+                       if (p < tl->txt || p+tline_size > tl->txt+tl->txt_size)
+                               failed("tlines entry is out of range");
+                       
+                       //verify that original lines have sizes >= the unbroken lines
+                       if (oline_size < tline_size)
+                               failed("Line in orig is shorter than the same line in txt");
+                       
+                       //if sizes are inconsistent, make sure it is due to a backslash escape
+                       if (oline_size > tline_size) {
+                               p = line_start+tline_size;
+                               if (*p++ != '\\')
+                                       failed("Line sizes differ without a backslash break");
+                               while (p<e && cspace(*p)) p++;
+                               if (p != line_end)
+                                       failed("Backslash break is not at the end of its line");
+                       }
+                       
+                       //make sure the text of both copies match
+                       if ( memcmp(
+                               tl->olines[ln],
+                               tl->tlines[ln],
+                               tline_size) )
+                               failed("Line text differs between orig and txt");
+               }
+       }
+       
+       if (initial && !token_list_sanity_check_initial(tl, err))
+               failed("Initial sanity checks failed.  Has the list been modified after it was returned from tokenize() ?");
+       
+       return 1;
+}
+
+#undef failed
+
+//Writes a two-character summary of flags into buf and returns buf:
+//  'p' when flags.pp is set, 'D' when flags.pp_directive is set, and '-' in
+//  the position of a flag that is clear.  The result is NUL-terminated.
+static char *sprint_token_flags(char buf[3], struct token_flags flags) {
+       char *out = buf;
+       
+       *out++ = flags.pp ? 'p' : '-';
+       *out++ = flags.pp_directive ? 'D' : '-';
+       *out = 0;
+       
+       return buf;
+}
+
+//Writes one line per token of tl to f: the token's index, its type name, its
+//  flags (see sprint_token_flags), and its txt and orig text, both escaped
+//  with escape_string and enclosed in double quotes.  Fields are separated
+//  by tabs.
+void token_list_dump(const struct token_list *tl, FILE *f) {
+       //indexed by enum token_type; names are padded to a common width
+       static const char * const token_type_str[] = {
+               "TOK_INTEGER      ",
+               "TOK_FLOATING     ",
+               "TOK_OPERATOR     ",
+               "TOK_KEYWORD      ",
+               "TOK_IDENTIFIER   ",
+               "TOK_CHAR         ",
+               "TOK_STRING       ",
+               "TOK_LEADING_POUND",
+               "TOK_STRING_IQUOTE",
+               "TOK_STRING_IANGLE",
+               "TOK_CCOMMENT     ",
+               "TOK_CPPCOMMENT   ",
+               "TOK_WHITE        ",
+               "TOK_STARTLINE    ",
+               "TOK_STRAY        "
+       };
+       const size_t type_count = sizeof(token_type_str) / sizeof(token_type_str[0]);
+       struct token *tok;
+       array_char buf = array_new(NULL);
+       size_t i = 0;
+       char buf2[8];
+       
+       for (tok=tl->first; tok; tok=tok->next) {
+               //a corrupt type must not index outside the name table
+               const char *type_str = "TOK_???          ";
+               if (tok->type >= 0 && (size_t)tok->type < type_count)
+                       type_str = token_type_str[tok->type];
+               
+               fprintf(f, "%lu\t%s\t%s\t\"%s\"", (unsigned long)(i++),
+                       type_str,
+                       sprint_token_flags(buf2, tok->flags),
+                       escape_string(&buf, tok->txt, tok->txt_size));
+               #if 1 //print tok->orig
+               fprintf(f, "\t\"%s\"\n", escape_string(&buf, tok->orig, tok->orig_size));
+               #else
+               fprintf(f, "\n");
+               #endif
+       }
+       
+       array_free(buf);
+}
+
+//Returns the label printed for a message level; "???" for any value that is
+//  not one of the enum tok_message_level constants
+static const char *tok_message_level_name(enum tok_message_level level) {
+       switch (level) {
+               case TM_DEBUG: return "debug";
+               case TM_INFO:  return "info";
+               case TM_WARN:  return "warning";
+               case TM_ERROR: return "error";
+               case TM_BUG:   return "BUG";
+               default:       return "???";
+       }
+}
+
+//Prints message m to stdout in the form "filename:line:col level: message".
+//  The filename part is printed only if tl->filename is set.  The line:col
+//  part (one-based) is printed only if tok_point_lookup can resolve
+//  m->location within tl.
+void tok_message_print(struct tok_message *m, struct token_list *tl) {
+       struct tok_point pt;
+       int resolved = tok_point_lookup(&pt, m->location, tl);
+       const char *level = tok_message_level_name(m->level);
+       
+       if (tl->filename) {
+               //without a line:col part, a space separates the name from the level
+               printf("%s:%s", tl->filename, resolved ? "" : " ");
+       }
+       
+       if (resolved) {
+               printf("%zu:%zu %s: %s\n",
+                       pt.line+1, pt.col+1,
+                       level,
+                       m->message);
+       } else {
+               printf("%s: %s\n",
+                       level,
+                       m->message);
+       }
+}
+
+//Prints message m to stdout in the form "level: path: message"
+void tok_message_dump(struct tok_message *m) {
+       printf("%s: %s: %s\n",
+               tok_message_level_name(m->level), m->path, m->message);
+}
+
+//Formats a message printf-style and enqueues it on *mq together with its
+//  level, path and location.  Does nothing if mq is NULL.  The message text
+//  is allocated by talloc_vasprintf with mq->item as its context.
+void tok_message_add(tok_message_queue *mq, enum tok_message_level level,
+       const char *path, const char *loc, const char *fmt, ...) {
+       struct tok_message entry;
+       va_list args;
+       
+       if (mq == NULL)
+               return;
+       
+       entry.level = level;
+       entry.path = path;
+       entry.location = loc;
+       
+       va_start(args, fmt);
+       entry.message = talloc_vasprintf(mq->item, fmt, args);
+       va_end(args);
+       
+       enqueue(*mq, entry);
+}
+
+//Passes every message in *mq, in queue order, to tok_message_dump
+void tok_message_queue_dump(const tok_message_queue *mq) {
+       size_t idx = 0;
+       
+       while (idx < queue_count(*mq)) {
+               tok_message_dump(&queue_item(*mq, idx));
+               idx++;
+       }
+}
+
+
+#undef add
+#undef cstray
+#undef cident
diff --git a/ccan/ccan_tokenizer/ccan_tokenizer.h b/ccan/ccan_tokenizer/ccan_tokenizer.h
new file mode 100644 (file)
index 0000000..7634501
--- /dev/null
@@ -0,0 +1,307 @@
+/*
+        Copyright (c) 2009  Joseph A. Adams
+        All rights reserved.
+        
+        Redistribution and use in source and binary forms, with or without
+        modification, are permitted provided that the following conditions
+        are met:
+        1. Redistributions of source code must retain the above copyright
+           notice, this list of conditions and the following disclaimer.
+        2. Redistributions in binary form must reproduce the above copyright
+           notice, this list of conditions and the following disclaimer in the
+           documentation and/or other materials provided with the distribution.
+        3. The name of the author may not be used to endorse or promote products
+           derived from this software without specific prior written permission.
+        
+        THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+        IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+        OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+        IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+        INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+        NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+        DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+        THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+        (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+        THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CCAN_TOKENIZER_H
+#define CCAN_TOKENIZER_H
+
+#include <ccan/array/array.h>
+#include "charflag.h"
+#include "dict.h"
+#include "queue.h"
+#include <stdint.h>
+#include <errno.h> //for readui
+
+/* Definition of tokens and the token list */
+
+//Kind of a token (stored in struct token's type field).
+//  The order of the constants matters: token_type_is_ignored and
+//  token_type_is_comment below test ranges of values, so TOK_CCOMMENT,
+//  TOK_CPPCOMMENT and TOK_WHITE must stay adjacent and in this order.
+//  token_list_dump in ccan_tokenizer.c also indexes a table of names by
+//  these values.
+enum token_type {
+       TOK_INTEGER,       //integer (e.g. 5, 1000L, 0x5)
+       TOK_FLOATING,      //floating point number (e.g. 5.0, 7.0f, etc.)
+       TOK_OPERATOR,      //operator (e.g. +, -, (, ), ++, etc.)
+       TOK_KEYWORD,       //keyword (e.g. char, _Bool, ifdef)
+       TOK_IDENTIFIER,    //identifier or unprocessed keyword (e.g. int, token, pp_conditions)
+       TOK_CHAR,          //character literal (e.g. 'a' or even '1234')
+       TOK_STRING,        //string literal (e.g. "hello" or "zero\0inside")
+       TOK_LEADING_POUND, //leading # in a preprocessor directive (e.g. # include)
+       TOK_STRING_IQUOTE, //include file name in double quotes (e.g. "config.h")
+       TOK_STRING_IANGLE, //include file name in angle brackets (e.g. <stdio.h>)
+       
+       //true for TOK_CCOMMENT, TOK_CPPCOMMENT and TOK_WHITE
+       #define token_type_is_ignored(type) ((type)>=TOK_CCOMMENT && (type)<=TOK_WHITE)
+       //true for TOK_CCOMMENT and TOK_CPPCOMMENT
+       #define token_type_is_comment(type) ((type)>=TOK_CCOMMENT && (type)<=TOK_CPPCOMMENT)
+       TOK_CCOMMENT, //C comment (e.g. /* comment */)
+       TOK_CPPCOMMENT, //C++ comment (e.g. //comment )
+       TOK_WHITE, //whitespace (span of \t\n\v\f\r and space)
+       TOK_STARTLINE,  //beginning of line (txt/txt_size is always empty)
+       TOK_STRAY, //control characters, weird characters, and extended characters where they shouldn't be
+};
+
+//Suffix of a numeric constant (the suffix field of struct tok_integer and
+//  struct tok_floating).  The basic suffixes are single bits; the remaining
+//  constants name bitwise-OR combinations of them.
+enum tok_suffix {
+       TOK_NOSUFFIX = 0,
+       
+       TOK_U  = 1,  //unsigned
+       TOK_L  = 2,  //long or double-precision float
+       TOK_LL = 4,  //long long (note that TOK_L and TOK_LL are mutually exclusive)
+       TOK_F  = 8,  //float (single-precision)
+       TOK_I  = 16, //imaginary
+       
+       TOK_UL  = TOK_U | TOK_L,  //unsigned long
+       TOK_ULL = TOK_U | TOK_LL, //unsigned long long
+       
+       //Imaginary combo meals
+       TOK_IMAG_U   = TOK_I | TOK_U,
+       TOK_IMAG_L   = TOK_I | TOK_L,
+       TOK_IMAG_LL  = TOK_I | TOK_LL,
+       TOK_IMAG_F   = TOK_I | TOK_F,
+       
+       TOK_IMAG_UL  = TOK_I | TOK_UL,
+       TOK_IMAG_ULL = TOK_I | TOK_ULL,
+};
+
+//Type of the integer member of struct token's union
+struct tok_integer {
+       uint64_t v; //presumably the value of the constant -- TODO confirm against read_cnumber
+       int base; //one of 2, 8, 10, or 16
+       enum tok_suffix suffix;
+};
+
+//Type of the floating member of struct token's union
+struct tok_floating {
+       long double v; //presumably the value of the constant -- TODO confirm against read_cnumber
+       enum tok_suffix suffix;
+};
+
+//Operator/keyword naming conventions taken from Jeff Lee's Yacc grammar:
+//http://www.lysator.liu.se/c/ANSI-C-grammar-y.html
+//
+//IDs for operators and keywords (the opkw member of struct token).  The
+//  constants start at 128; struct token's comment on opkw gives '+' as an
+//  example ID, so single-character operators appear to use their character
+//  code directly.
+//  DEFINE through WARNING must stay contiguous and in this order because
+//  opkw_is_directive_only tests that range.
+enum tok_opkw {
+       /* Permute these regularly */
+       PTR_OP=128, INC_OP, DEC_OP, LEFT_OP, RIGHT_OP, LE_OP, GE_OP, EQ_OP, NE_OP,
+       AND_OP, OR_OP,
+       MUL_ASSIGN, DIV_ASSIGN, MOD_ASSIGN,
+       ADD_ASSIGN, SUB_ASSIGN,
+       AND_ASSIGN, XOR_ASSIGN, OR_ASSIGN,
+       LEFT_ASSIGN, RIGHT_ASSIGN,
+       ELLIPSIS,
+       DOUBLE_POUND,
+       
+       //Keywords
+       _BOOL,
+       _COMPLEX,
+       _IMAGINARY,
+       BREAK,
+       CASE,
+       CHAR,
+       CONST,
+       CONTINUE,
+       DEFAULT,
+       DO,
+       DOUBLE,
+       ELSE,
+       ENUM,
+       EXTERN,
+       FLOAT,
+       FOR,
+       GOTO,
+       IF,
+       INLINE,
+       INT,
+       LONG,
+       REGISTER,
+       RESTRICT,
+       RETURN,
+       SHORT,
+       SIGNED,
+       SIZEOF,
+       STATIC,
+       STRUCT,
+       SWITCH,
+       TYPEDEF,
+       UNION,
+       UNSIGNED,
+       VOID,
+       VOLATILE,
+       WHILE,
+       
+       //Preprocessor keywords (except those already defined)
+       VA_ARGS,
+       //true for IDs that are only preprocessor directives (DEFINE..WARNING)
+       #define opkw_is_directive_only(opkw) ((opkw)>=DEFINE && (opkw)<=WARNING)
+       //also true for ELSE and IF, which are keywords as well as directives
+       #define opkw_is_directive(opkw) (opkw_is_directive_only(opkw) || (opkw)==ELSE || (opkw)==IF)
+       DEFINE,
+       ELIF,
+       //ELSE,
+       ENDIF,
+       ERROR,
+       //IF,
+       IFDEF,
+       IFNDEF,
+       INCLUDE,
+       LINE,
+       PRAGMA,
+       UNDEF,
+       WARNING, /* gcc extension */
+};
+
+//One-bit flags attached to every token (the flags member of struct token)
+struct token_flags {
+       unsigned short
+               pp:1, //is token part of a preprocessor line
+               pp_directive:1; //does token follow a TOK_LEADING_POUND (e.g. # include)
+};
+
+//A single token.  Tokens are linked through prev and next;
+//  token_list_sanity_check requires the first token's prev and the last
+//  token's next to be NULL.
+struct token {
+       struct token *prev, *next;
+       
+       struct token_flags flags;
+       short type; //enum token_type
+       //type-dependent data (anonymous union: members are accessed directly,
+       //  e.g. tok->opkw)
+       union {
+               struct tok_integer integer;
+               struct tok_floating floating;
+               int opkw; //operator or keyword ID (e.g. '+', INC_OP (++), ADD_ASSIGN (+=))
+               array_char string; //applies to TOK_CHAR and TOK_STRING
+               char *include; //applies to TOK_STRING_IQUOTE and TOK_STRING_IANGLE
+       };
+       
+       //text this token represents (with backslash-broken lines merged)
+       const char *txt;
+       size_t txt_size;
+       
+       //text this token represents (untouched)
+       const char *orig;
+       size_t orig_size;
+       
+       //zero-based line and column number of this token
+       size_t line, col;
+};
+
+//Nonzero if tok is a C comment, a C++ comment or whitespace
+//  (see token_type_is_ignored)
+static inline int token_is_ignored(const struct token *tok) {
+       const int type = tok->type;
+       
+       return token_type_is_ignored(type);
+}
+
+//Returns 1 if tok is a TOK_OPERATOR whose ID equals opkw, 0 otherwise
+static inline int token_is_op(const struct token *tok, int opkw) {
+       if (tok->type != TOK_OPERATOR)
+               return 0;
+       
+       return tok->opkw == opkw;
+}
+
+//Returns 1 if tok is a TOK_KEYWORD whose ID equals opkw, 0 otherwise
+static inline int token_is_kw(const struct token *tok, int opkw) {
+       if (tok->type != TOK_KEYWORD)
+               return 0;
+       
+       return tok->opkw == opkw;
+}
+
+//A list of tokens together with the two copies of the text they refer to
+//  (orig and txt) and a table of line starts for each copy
+struct token_list {
+       //first and last token of the list
+       struct token *first, *last;
+       
+       //Points to original input as given
+       const char *orig;
+       size_t orig_size;
+       
+       //position of the start of each real line with respect to orig
+       const char * const *olines;
+       size_t olines_size;
+       
+       //Copy of original input without backslash-broken lines
+       const char *txt;
+       size_t txt_size;
+       
+       //position of the start of each real line with respect to txt
+       //  (token_list_sanity_check requires tlines_size == olines_size, and
+       //  indexes both tables by a token's line number)
+       const char * const *tlines;
+       size_t tlines_size;
+       
+       //Set me so tok_message_print will know what file name to display
+       const char *filename;
+};
+
+//NOTE(review): presumably the dictionary of operators and keywords used by
+//  tokenize(); its definition is not in this header -- confirm
+extern struct dict *tokenizer_dict;
+
+//queue of struct tok_message (see queue.h for the queue() macro)
+typedef queue(struct tok_message) tok_message_queue;
+
+//the token_list is allocated as a child of orig
+//  mq receives the messages produced while tokenizing (tok_message_add
+//  ignores a NULL queue)
+struct token_list *tokenize(const char *orig, size_t orig_size, tok_message_queue *mq);
+
+size_t token_list_count(const struct token_list *tl);
+
+//used for debugging
+//  token_list_sanity_check returns 1 if tl passes every check; otherwise it
+//  writes a description of the problem to err and returns 0.
+//  token_list_dump writes one line per token to f (index, type, flags,
+//  escaped txt and orig).
+int token_list_sanity_check(const struct token_list *tl, FILE *err);
+void token_list_dump(const struct token_list *tl, FILE *f);
+
+/* tok_point_lookup is used to locate a pointer that is within a token list's
+   txt or orig fields */
+
+//Result of tok_point_lookup: the looked-up position expressed as a pointer
+//  into txt, a pointer into orig, and a line/column pair (tok_message_print
+//  adds 1 to both before printing them)
+struct tok_point {
+       const char *txt, *orig;
+       size_t line, col;
+};
+
+//returns nonzero if the pointer could be resolved
+//  (out is filled in on success; ptr may point into either tl->txt or
+//  tl->orig)
+int tok_point_lookup(struct tok_point *out, const char *ptr,
+                       const struct token_list *tl);
+
+
+/* Tokenizer message queue; used to gather and report warnings, errors, etc. */
+
+//Severity of a message; printed as "debug", "info", "warning", "error" and
+//  "BUG" respectively by tok_message_print and tok_message_dump
+enum tok_message_level {TM_DEBUG, TM_INFO, TM_WARN, TM_ERROR, TM_BUG};
+
+//One queued message
+struct tok_message {
+       enum tok_message_level level;
+       const char *path;
+               //Unique slash-delimited name of the message
+               //e.g. tokenize/read_cstring/ambiguous_octal
+       const char *message;
+               //Human-readable description
+               //e.g. `Octal \007 followed by digit`
+               //(tok_message_add allocates this with talloc_vasprintf)
+       const char *location;
+               //Pointer (typically within the token list's txt or orig) of the error
+               //(tok_message_print resolves it with tok_point_lookup)
+};
+
+//Shorthands for tok_message_add, one per message level.  They expect two
+//  things to be available where they are used: a variable named mq (the
+//  message queue) and a string-literal macro MESSAGE_PATH, to which the
+//  stringified name argument is appended to form the message path.
+#define tok_msg_debug(name, loc, fmt, ...) tok_message_add(mq, TM_DEBUG, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
+#define tok_msg_info(name, loc, fmt, ...) tok_message_add(mq, TM_INFO, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
+#define tok_msg_warn(name, loc, fmt, ...) tok_message_add(mq, TM_WARN, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
+#define tok_msg_error(name, loc, fmt, ...) tok_message_add(mq, TM_ERROR, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
+#define tok_msg_bug(name, loc, fmt, ...) tok_message_add(mq, TM_BUG, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
+
+//Formats a message printf-style and enqueues it on *mq; does nothing if mq
+//  is NULL
+void tok_message_add(tok_message_queue *mq, enum tok_message_level level,
+       const char *path, const char *loc, const char *fmt, ...);
+
+//Prints m to stdout as "filename:line:col level: message"; the filename and
+//  line:col parts are left out when they are not available
+void tok_message_print(struct tok_message *m, struct token_list *tl);
+
+//tok_message_dump prints m to stdout as "level: path: message";
+//  tok_message_queue_dump does so for every message in *mq
+void tok_message_dump(struct tok_message *m);
+void tok_message_queue_dump(const tok_message_queue *mq);
+
+
+/* Miscellaneous internal components */
+
+//NOTE(review): these are defined in read_cstring.c and read_cnumber.c, which
+//  are not shown here; presumably each scans from s up to e and returns the
+//  position where it stopped -- confirm before relying on it
+char *read_cstring(array_char *out, const char *s, const char *e, char quoteChar, tok_message_queue *mq);
+char *read_cnumber(struct token *tok, const char *s, const char *e, tok_message_queue *mq);
+
+
+//Base argument of readui: the numeric base, optionally ORed with the
+//  READUI_ALLOW* flags below
+typedef unsigned int readui_base;
+
+//Flag bits; they lie above the values used for the numeric base
+#define READUI_ALLOWHIGHERDIGITS 256
+#define READUI_ALLOWCAPLETTERS 512
+#define READUI_ALLOWLCASELETTERS 1024
+#define READUI_ALLOWLETTERS (READUI_ALLOWCAPLETTERS | READUI_ALLOWLCASELETTERS)
+
+//Ready-made bases (hexadecimal accepts letters of either case)
+#define READUI_DEC      ((readui_base)(10))
+#define READUI_HEX      ((readui_base)(16 | READUI_ALLOWLETTERS))
+#define READUI_OCT      ((readui_base)(8))
+#define READUI_BIN      ((readui_base)(2))
+
+//Reads an unsigned integer from the digits between *sp and e and advances
+//  *sp past them.  Reports success or failure through errno; see the
+//  "documentation" file of this module for details.
+uint64_t readui(const char **sp, const char *e, readui_base base);
+
+#endif
diff --git a/ccan/ccan_tokenizer/charflag.c b/ccan/ccan_tokenizer/charflag.c
new file mode 100644 (file)
index 0000000..c74cc27
--- /dev/null
@@ -0,0 +1,131 @@
+#include "charflag.h"
+
+#define C CF_CONTROL
+#define S CF_SPACE
+#define R CF_RETURN
+#define D CF_DIGIT
+#define L CF_LETTER
+#define H CF_HEX
+#define Y CF_SYMBOL
+
+//Classification flags for every character, indexed by unsigned char value
+//  (C, S, R, D, L, H and Y are the CF_* shorthands defined above).
+//  Entries 0-127 classify the ASCII characters; entries 128-255 are 0.
+//  Note that '_' is flagged as a symbol here, not as a letter.
+unsigned char charflag[256] = {
+   C,C,C,C,C,C,C,C,C,
+   S, // \t
+   R, // \n
+   S, // \v
+   S, // \f
+   R, // \r
+   C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,
+   S, // space
+   Y,   // !
+   Y,   // "
+   Y,   // #
+   Y,   // $
+   Y,   // %
+   Y,   // &
+   Y,   // '
+   Y,   // (
+   Y,   // )
+   Y,   // *
+   Y,   // +
+   Y,   // ,
+   Y,   // -
+   Y,   // .
+   Y,   // /
+   D|H, // 0
+   D|H, // 1
+   D|H, // 2
+   D|H, // 3
+   D|H, // 4
+   D|H, // 5
+   D|H, // 6
+   D|H, // 7
+   D|H, // 8
+   D|H, // 9
+   Y,   // :
+   Y,   // ;
+   Y,   // <
+   Y,   // =
+   Y,   // >
+   Y,   // ?
+   Y,   // @
+   L|H, // A
+   L|H, // B
+   L|H, // C
+   L|H, // D
+   L|H, // E
+   L|H, // F
+   L,   // G
+   L,   // H
+   L,   // I
+   L,   // J
+   L,   // K
+   L,   // L
+   L,   // M
+   L,   // N
+   L,   // O
+   L,   // P
+   L,   // Q
+   L,   // R
+   L,   // S
+   L,   // T
+   L,   // U
+   L,   // V
+   L,   // W
+   L,   // X
+   L,   // Y
+   L,   // Z
+   Y,   // [
+   Y,   // \ (backslash)
+   Y,   // ]
+   Y,   // ^
+   Y,   // _
+   Y,   // `
+   L|H, // a
+   L|H, // b
+   L|H, // c
+   L|H, // d
+   L|H, // e
+   L|H, // f
+   L,   // g
+   L,   // h
+   L,   // i
+   L,   // j
+   L,   // k
+   L,   // l
+   L,   // m
+   L,   // n
+   L,   // o
+   L,   // p
+   L,   // q
+   L,   // r
+   L,   // s
+   L,   // t
+   L,   // u
+   L,   // v
+   L,   // w
+   L,   // x
+   L,   // y
+   L,   // z
+   Y,   // {
+   Y,   // |
+   Y,   // }
+   Y,   // ~
+   C,   // DEL
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+};
+
+#undef C
+#undef S
+#undef R
+#undef D
+#undef L
+#undef H
+#undef Y
diff --git a/ccan/ccan_tokenizer/charflag.h b/ccan/ccan_tokenizer/charflag.h
new file mode 100644 (file)
index 0000000..24d170b
--- /dev/null
@@ -0,0 +1,37 @@
+#ifndef CCAN_CHARFLAG_H
+#define CCAN_CHARFLAG_H
+
+//Character classification through a 256-entry lookup table.
+//  Each macro below looks its argument up in the table and tests flag bits;
+//  the result is zero or nonzero, not necessarily 1.
+
+//All of these macros evaluate the argument exactly once
+
+#define ccontrol(c)  (charflag(c) & CF_CONTROL) //Weird characters that shouldn't be in text
+#define cspace(c)    (charflag(c) & CF_SPACE)   //Space, tab, vertical tab, form feed
+#define creturn(c)   (charflag(c) & CF_RETURN)  //Newline
+#define cwhite(c)    (charflag(c) & CF_WHITE)   //cspace or creturn
+#define cdigit(c)    (charflag(c) & CF_DIGIT)   //0-9
+#define cletter(c)   (charflag(c) & CF_LETTER)  //A-Za-z
+#define chex(c)      (charflag(c) & CF_HEX)     //0-9A-Fa-f
+#define csymbol(c)   (charflag(c) & CF_SYMBOL)
+       // !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
+       //If it's ASCII, prints a non-blank character, and is not a digit or letter, it's a symbol
+#define cextended(c) (charflag(c) == 0)         //Characters >= 128
+
+/* To test:
+
+All charflag macros should evaluate exactly once
+
+*/
+
+//The table itself is defined in charflag.c.  The name charflag is both the
+//  array and a function-like macro: charflag(c) converts c to unsigned char
+//  and indexes the array, so plain (possibly signed) char values are safe to
+//  pass.
+extern unsigned char charflag[256];
+#define charflag(c) (charflag[(unsigned int)(unsigned char)(c)])
+
+//Flag bits stored in the table
+#define CF_CONTROL ((unsigned char)  1)
+#define CF_SPACE   ((unsigned char)  2)
+#define CF_RETURN  ((unsigned char)  4)
+#define CF_DIGIT   ((unsigned char)  8)
+#define CF_LETTER  ((unsigned char) 16)
+#define CF_HEX     ((unsigned char) 32)
+#define CF_SYMBOL  ((unsigned char) 64)
+
+#define CF_WHITE (CF_SPACE|CF_RETURN)
+
+#endif
diff --git a/ccan/ccan_tokenizer/dict.c b/ccan/ccan_tokenizer/dict.c
new file mode 100644 (file)
index 0000000..559ebf6
--- /dev/null
@@ -0,0 +1,92 @@
+#include "dict.h"
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+
+//qsort comparator for struct dict_entry: orders entries by the first
+//  character of str ascending; entries sharing a first character are ordered
+//  by the length of str descending (longest first)
+static int compar_dict_entry(const void *ap, const void *bp) {
+       const struct dict_entry *lhs=ap, *rhs=bp;
+       unsigned int lead_l = (unsigned int)lhs->str[0];
+       unsigned int lead_r = (unsigned int)rhs->str[0];
+       size_t len_l, len_r;
+       
+       if (lead_l != lead_r)
+               return lead_l < lead_r ? -1 : 1;
+       
+       len_l = strlen(lhs->str);
+       len_r = strlen(rhs->str);
+       if (len_l == len_r)
+               return 0;
+       
+       //the longer string sorts first
+       return len_l > len_r ? -1 : 1;
+}
+
+//Builds a lookup dictionary from count entries.
+//  @ctx: talloc context the dictionary is allocated under
+//  @entries: array of count entries; it is copied, not modified (the str
+//            pointers themselves are shared with the copy)
+//  Returns the new dictionary, or NULL if memory could not be allocated.
+//  Exits the program if entries contains more than one empty string.
+//
+//  The copy is sorted with compar_dict_entry and followed by a terminator
+//  entry whose str is "".  dict_lookup scans a group of entries until the
+//  first character changes; the terminator stops that scan at the end of the
+//  array, which nothing else bounds.
+struct dict *dict_build(void *ctx, const struct dict_entry *entries, size_t count) {
+       struct dict *dict = talloc_zero(ctx, struct dict);
+       struct dict_entry *ent;
+       int i;
+       
+       if (!dict || !count)
+               return dict;
+       
+       //count entries plus the terminator
+       ent = talloc_array(dict, struct dict_entry, count+1);
+       if (!ent) {
+               talloc_free(dict);
+               return NULL;
+       }
+       memcpy(ent, entries, count*sizeof(struct dict_entry));
+       qsort(ent, count, sizeof(*ent), compar_dict_entry);
+       ent[count].id = 0;
+       ent[count].str = "";
+       
+       //an empty string sorts first and is kept apart from the letter groups
+       if (ent->str[0]==0) {
+               dict->zero = ent;
+               ent++, count--;
+               
+               if (count && ent->str[0]==0) {
+                       fprintf(stderr, "dict_entry array contains multiple empty strings\n");
+                       exit(EXIT_FAILURE);
+               }
+       }
+       
+       //record where the group of each first character begins
+       //  (first character i is stored at index i-1)
+       for (i=1; i<256; i++) {
+               if (!count)
+                       break;
+               if (ent->str[0] == (char)i)
+                       dict->by_first_letter[i-1] = ent;
+               while (count && ent->str[0] == (char)i)
+                       ent++, count--;
+       }
+       
+       return dict;
+}
+
+//Looks for a dictionary entry whose str is a prefix of the text from *sp up
+//  to e.  Entries with the same first character are tried in stored order
+//  and the first match wins.  On a match, *sp is advanced past the matched
+//  text and the entry is returned; otherwise NULL is returned and *sp is
+//  left alone.  A zero byte at *sp matches the empty-string entry, if the
+//  dictionary has one, and advances *sp by one.
+struct dict_entry *dict_lookup(struct dict *dict, const char **sp, const char *e) {
+       const char *start = *sp;
+       struct dict_entry *cand;
+       unsigned int lead;
+       
+       if (start >= e)
+               return NULL;
+       lead = (unsigned int)*start & 0xFF;
+       
+       if (lead == 0) {
+               if (dict->zero != NULL)
+                       *sp = start+1;
+               return dict->zero;
+       }
+       
+       cand = dict->by_first_letter[lead-1];
+       if (cand == NULL)
+               return NULL;
+       
+       //NOTE(review): this scan ends only when an entry's first character
+       //  differs from lead; confirm that the entry array always ends with
+       //  such an entry
+       while (cand->str[0] == (char)lead) {
+               const char *in = start;
+               const char *key = cand->str;
+               
+               while (*key != 0 && in < e && *in == *key)
+                       in++, key++;
+               
+               if (*key == 0) {
+                       //the whole entry was matched
+                       *sp = in;
+                       return cand;
+               }
+               cand++;
+       }
+       
+       return NULL;
+}
diff --git a/ccan/ccan_tokenizer/dict.h b/ccan/ccan_tokenizer/dict.h
new file mode 100644 (file)
index 0000000..0628898
--- /dev/null
@@ -0,0 +1,21 @@
+#ifndef CCAN_TOKENIZER_DICT_H
+#define CCAN_TOKENIZER_DICT_H
+
+#include <stdint.h>
+#include <ccan/talloc/talloc.h>
+       //needed for freeing the struct dict*
+
+//One dictionary word.
+struct dict_entry {
+       //value carried with the word; dict_build/dict_lookup do not read it
+       int id;
+       //text that dict_lookup() compares against the input
+       const char *str;
+};
+
+//Lookup table filled in by dict_build() and searched by dict_lookup().
+struct dict {
+       //the entry whose string is empty, if any; dict_lookup() returns it when
+       //the next input byte is 0
+       struct dict_entry *zero;
+       //by_first_letter[c-1] points at the first entry (in sorted order) whose
+       //string starts with byte value c; a NULL slot means "no such word"
+       struct dict_entry *by_first_letter[256];
+};
+
+//Builds a dictionary from a private, sorted copy of entries[0..count-1].
+//Exits the program if more than one entry has an empty string.
+//NOTE(review): ctx is presumably the talloc parent of the result -- confirm.
+struct dict *dict_build(void *ctx, const struct dict_entry *entries, size_t count);
+//Matches a dictionary word at the start of [*sp, e); on success returns its
+//entry and advances *sp past the matched text, otherwise returns NULL.
+struct dict_entry *dict_lookup(struct dict *dict, const char **sp, const char *e);
+
+#endif
diff --git a/ccan/ccan_tokenizer/documentation b/ccan/ccan_tokenizer/documentation
new file mode 100644 (file)
index 0000000..4f2fdb1
--- /dev/null
@@ -0,0 +1,51 @@
+readui - Flexible function for reading a 64-bit unsigned integer
+@sp: Pointer to scanning pointer
+@e:  Pointer to end of string
+@base:  Typically one of READUI_DEC, READUI_HEX, READUI_OCT, or READUI_BIN.
+
+readui() skips leading whitespace, then converts the run of digits that starts at *sp (and ends no later than e) to a number.  On return, *sp points to the first character that is not a valid digit, or equals e if every remaining character was a valid digit (or the input was empty).  It does not interpret prefixes or suffixes, only digits.
+
+readui() uses errno to indicate success or failure.  It will set errno to one of the following:
+
+0:  Input is valid and non-empty
+EINVAL:  Input is empty, does not start with any valid digits, or base is 0
+ERANGE:  Number given is greater than ULLONG_MAX
+
+Example (UNTESTED):
+
+uint64_t read_number(const char *str) {
+       const char *s = str, *e = strchr(str, 0);
+       readui_base base = READUI_DEC;
+       uint64_t result;
+       
+       //See if the number has a 0x (for hex) or 0 (for octal) prefix
+       if (s+2<=e && *s=='0') {
+               s++;
+               if (*s=='x' || *s=='X') {
+                       base = READUI_HEX;
+                       s++;
+               } else
+                       base = READUI_OCT;
+       }
+       
+       result = readui(&s, e, base);
+       
+       if (errno)
+               perror("read_number");
+       
+       return result;
+}
+
+Rules for a token list:
+
+A token list is never empty:  its first token is always a TOK_STARTLINE.
+
+
+Misc.:
+
+If the world were intuitive, the tokenizer would never report warnings or bugs on a source file that compiles successfully.  However, one case where it does is when erroneous tokens appear within an #if 0 block.  Example:
+
+#if 0
+0b101.0p0
+#endif
+
diff --git a/ccan/ccan_tokenizer/number_constant.guppy b/ccan/ccan_tokenizer/number_constant.guppy
new file mode 100644 (file)
index 0000000..8adfa3c
--- /dev/null
@@ -0,0 +1,70 @@
+/*
+guppy is a pattern-matching language by Joey Adams that's not implemented or formalized yet.
+See http://www.funsitelots.com/pub/guppy.g for a near self-definition
+
+This is a guppy representation of integer and floating point formatting in C.
+It is based on http://c0x.coding-guidelines.com/6.4.4.1.html and http://c0x.coding-guidelines.com/6.4.4.2.html
+*/
+
+number_constant: [
+       integer_constant()
+       floating_constant()
+]
+
+integer_constant: [
+       ([1-9] [0-9]*)           //decimal
+       (0 [0-7]*)               //octal
+       (0 [X x] [0-9 A-F a-f]*) //hexadecimal
+]
+
+integer_suffix: [
+       ([U u] [L l]*0..2)
+       ([L l]*1..2 [U u]*0..1)
+]
+
+floating_constant: [
+       decimal_floating_constant()
+       hexadecimal_floating_constant()
+]
+
+decimal_floating_constant: [
+       ([0-9]* '.' [0-9]+ exponent_part()*0..1 floating_suffix())
+       ([0-9]+ '.' exponent_part()*0..1 floating_suffix())
+       ([0-9]+ exponent_part() floating_suffix())
+]
+
+exponent_part:
+       ([E e] ['+' '-']*0..1 [0-9]+)
+
+hexadecimal_floating_constant:
+       (0 [X x] [
+               [0-9 A-F a-f]* '.' [0-9 A-F a-f]+
+               [0-9 A-F a-f]+ '.'
+               [0-9 A-F a-f]+
+       ] [P p] ['+' '-']*0..1 [0-9]+ floating_suffix())
+
+floating_suffix: [F L f l]*0..1
+
+scan_number:
+(
+       [
+               (0 [X x] [0-9 A-F a-f '.']*)
+               (0 [B b] [0-1] [0-9 '.']*)
+               ([0-9 '.']*)
+       ]
+       ( [E P e p] ['+' '-']*0..1 [0-9]* )*0..1
+       [0-9 A-Z a-z '.' '_' '$']*
+)
+
+/*
+Notes:
+
+A numeric constant can begin with any of:
+       0-9 '.'
+and can contain any of:
+       0-9 a-f e f l p u x '.' '+' '-'
+along with capital equivalents.
+
+If scanning finds something starting with a '.' but no decimal digit after it, it is the '.' operator and not a number.
+
+*/
diff --git a/ccan/ccan_tokenizer/queue.c b/ccan/ccan_tokenizer/queue.c
new file mode 100644 (file)
index 0000000..18dbbe3
--- /dev/null
@@ -0,0 +1,44 @@
+/*
+        Copyright (c) 2009  Joseph A. Adams
+        All rights reserved.
+        
+        Redistribution and use in source and binary forms, with or without
+        modification, are permitted provided that the following conditions
+        are met:
+        1. Redistributions of source code must retain the above copyright
+           notice, this list of conditions and the following disclaimer.
+        2. Redistributions in binary form must reproduce the above copyright
+           notice, this list of conditions and the following disclaimer in the
+           documentation and/or other materials provided with the distribution.
+        3. The name of the author may not be used to endorse or promote products
+           derived from this software without specific prior written permission.
+        
+        THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+        IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+        OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+        IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+        INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+        NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+        DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+        THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+        (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+        THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "queue.h"
+#include <ccan/talloc/talloc.h>
+#include <string.h>
+
+/*
+ * Doubles the capacity of a full queue.  Called by enqueue() once tail has
+ * caught up with head.
+ *
+ * qp points to any queue(type); itemSize is sizeof one item.  The items from
+ * head to the end of the old buffer are moved up by the old capacity so the
+ * ring stays contiguous from head; the items below tail stay where they are.
+ *
+ * NOTE(review): the result of talloc_realloc_size() is used unchecked.
+ */
+void queue_enqueue_helper(void *qp, size_t itemSize) {
+       queue(char) *q = qp;
+       const size_t old_cap = q->flag + 1;
+       const size_t new_cap = old_cap * 2;
+       const size_t moved = old_cap - q->head;
+       
+       //flag doubles as the index mask, so it is always capacity-1
+       q->flag = new_cap - 1;
+       q->item = talloc_realloc_size(NULL, q->item, new_cap*itemSize);
+       
+       memcpy(q->item + (q->head+old_cap)*itemSize,
+              q->item + q->head*itemSize,
+              moved*itemSize);
+       q->head += old_cap;
+}
+
+//Comparison callback handed to qsort() by the fallback queue_alias() macro.
+//It ignores both arguments and always reports "equal".
+int queue_alias_helper(const void *a, const void *b) {
+       (void)a;
+       (void)b;
+       return 0;
+}
diff --git a/ccan/ccan_tokenizer/queue.h b/ccan/ccan_tokenizer/queue.h
new file mode 100644 (file)
index 0000000..af78af6
--- /dev/null
@@ -0,0 +1,77 @@
+/*
+        Copyright (c) 2009  Joseph A. Adams
+        All rights reserved.
+        
+        Redistribution and use in source and binary forms, with or without
+        modification, are permitted provided that the following conditions
+        are met:
+        1. Redistributions of source code must retain the above copyright
+           notice, this list of conditions and the following disclaimer.
+        2. Redistributions in binary form must reproduce the above copyright
+           notice, this list of conditions and the following disclaimer in the
+           documentation and/or other materials provided with the distribution.
+        3. The name of the author may not be used to endorse or promote products
+           derived from this software without specific prior written permission.
+        
+        THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+        IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+        OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+        IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+        INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+        NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+        DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+        THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+        (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+        THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CCAN_QUEUE_H
+#define CCAN_QUEUE_H
+
+#include <stdint.h>
+#include <ccan/talloc/talloc.h>
+
+//Assume __attribute__((__may_alias__)) is available unless told otherwise.
+#ifndef HAVE_ATTRIBUTE_MAY_ALIAS
+#define HAVE_ATTRIBUTE_MAY_ALIAS 1
+#endif
+
+//queue(type) declares an anonymous ring-buffer struct holding items of `type`:
+//  head - index of the next item to dequeue
+//  tail - index where the next item will be stored
+//  flag - capacity-1, used as the index wrap mask
+//  item - the item buffer
+//queue_alias(ptr) is a no-op when may_alias is available; otherwise it passes
+//the queue to a zero-element qsort() call.
+//NOTE(review): the fallback uses qsort() but this header does not include
+//<stdlib.h> itself -- confirm users of that branch include it.
+#if HAVE_ATTRIBUTE_MAY_ALIAS==1
+#define queue_alias(ptr) /* nothing */
+#define queue(type) struct {size_t head, tail, flag; type *item;} __attribute__((__may_alias__))
+#else
+#define queue_alias(ptr) qsort(ptr, 0, 1, queue_alias_helper) //hack
+#define queue(type) struct {size_t head, tail, flag; type *item;}
+#endif
+
+int queue_alias_helper(const void *a, const void *b);
+
+//queue_init empties the queue and allocates room for 4 items under ctx;
+//queue_free releases the item buffer.
+#define queue_init(queue, ctx) do {(queue).head = (queue).tail = 0; (queue).flag = 3; (queue).item = talloc_size(ctx, sizeof(*(queue).item)*4);} while(0)
+#define queue_free(queue) do {talloc_free((queue).item);} while(0)
+
+//Number of items currently stored.
+#define queue_count(queue) (((queue).tail-(queue).head) & (queue).flag)
+//Stores an item at tail and advances tail with wrap-around.  If tail then
+//meets head the buffer is full, so it is grown by queue_enqueue_helper().
+#define enqueue(queue, ...) \
+       do { \
+               (queue).item[(queue).tail++] = (__VA_ARGS__); \
+               (queue).tail &= (queue).flag; \
+               if ((queue).tail == (queue).head) { \
+                       queue_enqueue_helper(&(queue), sizeof(*(queue).item)); \
+                       queue_alias(&(queue)); \
+               } \
+       } while(0)
+//dequeue removes and yields the item at head without checking for emptiness;
+//dequeue_check yields NULL instead when the queue is empty.
+#define dequeue_check(queue) ((queue).head != (queue).tail ? dequeue(queue) : NULL)
+#define dequeue(queue) ((queue).item[queue_dequeue_helper(&(queue).head, (queue).flag)])
+
+//TODO:  Test us
+//queue_next: the item at head, not removed
+//queue_item: the item `pos` places after head, with wrap-around
+//queue_skip: drops the item at head
+#define queue_next(queue) ((queue).item[(queue).head])
+#define queue_item(queue, pos) ((queue).item[((queue).head+(pos)) & (queue).flag])
+#define queue_skip(queue) do {(queue).head++; (queue).head &= (queue).flag;} while(0)
+
+//Grows a full queue; see enqueue().
+void queue_enqueue_helper(void *qp, size_t itemSize);
+
+//Returns the current head index and advances *head by one, wrapping it with
+//the mask `flag`.
+static inline size_t queue_dequeue_helper(size_t *head, size_t flag) {
+       const size_t taken = *head;
+       *head = (taken + 1) & flag;
+       return taken;
+}
+
+#endif
diff --git a/ccan/ccan_tokenizer/read_cnumber.c b/ccan/ccan_tokenizer/read_cnumber.c
new file mode 100644 (file)
index 0000000..bb8eb30
--- /dev/null
@@ -0,0 +1,408 @@
+
+//for strtold
+#define _ISOC99_SOURCE
+#include <stdlib.h>
+#undef _ISOC99_SOURCE
+
+#include "ccan_tokenizer.h"
+
+#ifndef ULLONG_MAX
+#define ULLONG_MAX 18446744073709551615ULL
+#endif
+
+/*
+ * Returns a pointer to the first character in [s, e) that cannot be part of
+ * a number in the given base, or e if every character qualifies.
+ *
+ * The low 8 bits of base are the radix.  Letters are accepted only when the
+ * matching READUI_ALLOWCAPLETTERS / READUI_ALLOWLCASELETTERS bit is set, and
+ * digits whose value reaches the radix are accepted only when
+ * READUI_ALLOWHIGHERDIGITS is set.
+ */
+static const char *skipnum(const char *s, const char *e, readui_base base) {
+       const unsigned int radix = base & 0xFF;
+       const int allow_high = (base & READUI_ALLOWHIGHERDIGITS) != 0;
+       
+       while (s < e) {
+               unsigned int c = (unsigned char)*s;
+               unsigned int value;
+               
+               if (cdigit(c)) {
+                       value = c - '0';
+               } else if (c>='A' && c<='Z') {
+                       if (!(base & READUI_ALLOWCAPLETTERS))
+                               return s;
+                       value = c - 'A' + 10;
+               } else if (c>='a' && c<='z') {
+                       if (!(base & READUI_ALLOWLCASELETTERS))
+                               return s;
+                       value = c - 'a' + 10;
+               } else {
+                       return s;
+               }
+               
+               if (value >= radix && !allow_high)
+                       return s;
+               s++;
+       }
+       
+       return s;
+}
+
+/*
+ * Converts the digit string [s, e) to a number in base (base & 0xFF).
+ *
+ * The characters are not validated here:  '0'-'9' map to 0-9 and letters of
+ * either case map to 10 and up, whatever the base.  readui() only passes in
+ * the span accepted by skipnum().
+ *
+ * Digits are consumed from the last character backwards while `multiplier`
+ * holds the current power of the base.
+ *
+ * Sets errno to:
+ *     0       on success (the value is returned)
+ *     EINVAL  if the string is empty or (base & 0xFF) is 0 (0 is returned)
+ *     ERANGE  if the value does not fit in 64 bits (ULLONG_MAX is returned)
+ */
+static uint64_t readui_valid(const char *s, const char *e, readui_base base) {
+       uint64_t ret = 0;
+       uint64_t multiplier = 1;
+       uint64_t digit_value;
+       
+       //64-bit multiplication with overflow checking
+       //Both operands are split into 32-bit halves.  The product overflows if
+       //both high halves are non-zero, if the cross terms need more than 32
+       //bits, or if adding the low product wraps around.
+       //Jumps to the `overflowed` label instead of storing a wrapped result.
+       #define multiply(dest, src) do { \
+               uint32_t a0 = (uint64_t)(dest) & 0xFFFFFFFF; \
+               uint32_t a1 = (uint64_t)(dest) >> 32; \
+               uint32_t b0 = (uint64_t)(src) & 0xFFFFFFFF; \
+               uint32_t b1 = (uint64_t)(src) >> 32; \
+               uint64_t a, b; \
+               \
+               if (a1 && b1) \
+                       goto overflowed; \
+               a = (uint64_t)a1*b0 + (uint64_t)a0*b1; \
+               if (a > 0xFFFFFFFF) \
+                       goto overflowed; \
+               a <<= 32; \
+               b = (uint64_t)a0*b0; \
+               \
+               if (a+b < a) \
+                       goto overflowed; \
+               (dest) = a+b; \
+       } while(0)
+       
+       if (s >= e || ((base&0xFF) < 1)) {
+               errno = EINVAL;
+               return 0;
+       }
+       
+       //leading zeros contribute nothing; dropping them also keeps multiplier
+       //from being raised further than the number itself requires
+       while (s<e && *s=='0') s++;
+       
+       if (e > s) {
+               for (;;) {
+                       //take the next digit from the least significant end
+                       char c = *--e;
+                       
+                       //this series of if statements takes advantage of the fact that 'a'>'A'>'0'
+                       if (c >= 'a')
+                               c -= 'a'-10;
+                       else if (c >= 'A')
+                               c -= 'A'-10;
+                       else
+                               c -= '0';
+                       digit_value = c;
+                       
+                       //TODO:  Write/find a testcase where temp *= multiplier does overflow
+                       multiply(digit_value, multiplier);
+                       
+                       //unsigned addition wrapped around, so the sum overflowed
+                       if (ret+digit_value < ret)
+                               goto overflowed;
+                       ret += digit_value;
+                       
+                       if (e <= s)
+                               break;
+                       
+                       //only advance the power of the base while digits remain
+                       multiply(multiplier, base & 0xFF);
+               }
+       }
+       errno = 0;
+       return ret;
+       
+overflowed:
+       errno = ERANGE;
+       return ULLONG_MAX;
+       
+       #undef multiply
+}
+
+/*
+ * Reads an unsigned integer from [*sp, e) in the given base.
+ *
+ * Leading whitespace is skipped, then the longest run of characters accepted
+ * by skipnum() is converted.  *sp is set to the end of that run.  The result
+ * and errno are whatever readui_valid() produces for the run.
+ */
+uint64_t readui(const char **sp, const char *e, readui_base base) {
+       const char *start = *sp;
+       const char *stop;
+       
+       for (; start<e && cwhite(*start); start++)
+               ;
+       
+       stop = skipnum(start, e, base);
+       *sp = stop;
+       
+       return readui_valid(start, stop, base);
+}
+
+
+#define MESSAGE_PATH "tokenize/read_cnumber/"
+
+//Boundaries of the pieces of one number token, as found by scan_number().
+struct scan_number {
+/*
+ * Each of the pointers points to the first character of a given component.
+ * Consider 0x50.1p+1f .  It would be broken down into:
+ */
+       const char *prefix;   // 0x
+       const char *digits;   // 50.1
+       const char *exponent; // p+1
+       const char *suffix;   // f
+       const char *end;      // one past the last character of the token
+       size_t dots_found;    // 1
+};
+
+/*
+ * Scans past all the characters in a number token, fills the struct, and
+ * returns one of TOK_INTEGER or TOK_FLOATING to indicate the type.
+ *
+ * First character must be [0-9 '.']
+ *
+ * The scan is deliberately permissive:  it only finds where each component
+ * starts.  The token is TOK_FLOATING if its digits contain a '.' or if it
+ * has an exponent, and TOK_INTEGER otherwise.
+ */
+static enum token_type scan_number(struct scan_number *sn,
+                                       const char *s, const char *e) {
+       enum token_type type;
+       
+       sn->dots_found = 0;
+       
+       sn->prefix = s;
+       sn->digits = s;
+       //a 0x/0b prefix only counts if at least one more character follows it
+       if (s+3<=e && s[0]=='0') {
+               if (s[1]=='X' || s[1]=='x') {
+               //hexadecimal
+                       s += 2;
+                       sn->digits = s;
+                       for (;s<e;s++) {
+                               if (*s == '.')
+                                       sn->dots_found++;
+                               else if (!chex(*s))
+                                       break;
+                       }
+                       goto done_scanning_digits;
+               } else if (s[1]=='B' || s[1]=='b') {
+               //binary
+                       s += 2;
+                       //without a binary digit after it, 0b is not a prefix;
+                       //back up and scan the leading 0 as an ordinary number
+                       if (*s!='0' && *s!='1')
+                               s -= 2;
+                       sn->digits = s;
+               }
+       }
+       
+       //binary, decimal, or octal
+       for (;s<e;s++) {
+               if (*s == '.')
+                       sn->dots_found++;
+               else if (!cdigit(*s))
+                       break;
+       }
+
+done_scanning_digits:
+       
+       //an exponent starts with E/e when there is no prefix and with P/p when
+       //there is one; an optional sign and decimal digits follow
+       sn->exponent = s;
+       if (s<e && (
+               (sn->prefix==sn->digits && (*s=='E' || *s=='e')) ||
+               (sn->prefix < sn->digits && (*s=='P' || *s=='p'))
+       )) {
+               s++;
+               if (s<e && (*s=='+' || *s=='-'))
+                       s++;
+               while (s<e && cdigit(*s)) s++;
+       }
+       
+       //everything else that could belong to the token is treated as suffix
+       sn->suffix = s;
+       while (s<e && (cdigit(*s) || cletter(*s) ||
+               *s=='.' || *s=='_' || *s=='$')) s++;
+       
+       sn->end = s;
+       
+       //Now we're done scanning, but now we want to know what type this is
+       type = TOK_INTEGER;
+       if (sn->dots_found)
+               type = TOK_FLOATING;
+       if (sn->exponent < sn->suffix)
+               type = TOK_FLOATING;
+       
+       //if this is an octal, make the leading 0 a prefix
+       if (type==TOK_INTEGER && sn->prefix==sn->digits &&
+                       sn->digits < s && sn->digits[0]=='0')
+               sn->digits++;
+       
+       return type;
+}
+
+/*
+ * Parses the suffix [s, e) of a number token into a set of tok_suffix flags.
+ *
+ * Recognized pieces (in either case) are L, LL, U, F and I, in any order.
+ * The suffix is rejected, with an error queued on mq, if it contains any
+ * other character, repeats a piece, or combines L with LL in either order.
+ * An F on an integer, or a U or LL on a floating point number, is also an
+ * error.  TOK_NOSUFFIX is returned for every rejected suffix.
+ *
+ * type is the token type (TOK_INTEGER or TOK_FLOATING) the suffix belongs to.
+ */
+static enum tok_suffix read_number_suffix(const char *s, const char *e,
+                       enum token_type type, tok_message_queue *mq) {
+       const char *orig_s = s;
+       enum tok_suffix sfx = 0;
+       
+       //read the suffix in pieces
+       while (s<e) {
+               enum tok_suffix sfx_prev = sfx;
+               char c = *s++;
+               if (c>='a' && c<='z')
+                       c -= 'a'-'A';
+               
+               if (c=='L') {
+                       if (s<e && (*s=='L' || *s=='l')) {
+                               s++;
+                               sfx |= TOK_LL;
+                       } else {
+                               sfx |= TOK_L;
+                       }
+                       
+                       //TOK_L and TOK_LL are mutually exclusive, whichever
+                       //of the two appeared first
+                       if ((sfx & TOK_L) && (sfx & TOK_LL))
+                               goto invalid;
+               }
+               else if (c=='U')
+                       sfx |= TOK_U;
+               else if (c=='F')
+                       sfx |= TOK_F;
+               else if (c=='I')
+                       sfx |= TOK_I;
+               else
+                       goto invalid;
+               
+               if (sfx == sfx_prev)
+                       goto invalid; //suffix piece was repeated
+       }
+       
+       //make sure the suffix is appropriate for this number type
+       if (type==TOK_INTEGER && (sfx & TOK_F)) {
+               tok_msg_error(suffix_float_only, orig_s,
+               "Suffix only valid for floating point numbers");
+               sfx = TOK_NOSUFFIX;
+       }
+       if (type==TOK_FLOATING && (sfx & (TOK_U | TOK_LL))) {
+               tok_msg_error(suffix_integer_only, orig_s,
+               "Suffix only valid for integers");
+               sfx = TOK_NOSUFFIX;
+       }
+       
+       return sfx;
+       
+invalid:
+       if (type==TOK_INTEGER)
+               tok_msg_error(integer_suffix_invalid, orig_s,
+                               "Integer suffix invalid");
+       else
+               tok_msg_error(floating_suffix_invalid, orig_s,
+                               "Floating point suffix invalid");
+       return TOK_NOSUFFIX;
+}
+
+/*
+ * Fills *out with the value, base and suffix of the integer token described
+ * by sn, queueing an error on mq for invalid digits, an out-of-range value
+ * or an invalid suffix.
+ *
+ * The base comes from the prefix:  0x/0X is hexadecimal, 0b/0B is binary,
+ * a lone 0 is octal, and no prefix is decimal.
+ */
+static void read_integer(struct tok_integer *out, const struct scan_number *sn,
+                       tok_message_queue *mq) {
+       /*
+       Assertions about an integer's struct scan_number:
+               prefix is empty or [0 0B 0b 0X 0x]
+               sn->digits is not empty (i.e. sn->digits < sn->exponent)
+                       *unless* the prefix is "0"
+               has no exponent
+               suffix is [0-9 A-Z a-z '.']*
+               dots_found == 0
+       */
+       readui_base base = READUI_DEC;
+       const char *tokstart = sn->prefix;
+       const char *s = sn->digits, *e = sn->exponent;
+       
+       if (sn->prefix+1 < sn->digits) {
+               //two-character prefix
+               if (sn->prefix[1]=='X' || sn->prefix[1]=='x')
+                       base = READUI_HEX;
+               else if (sn->prefix[1]=='B' || sn->prefix[1]=='b')
+                       base = READUI_BIN;
+               else
+                       base = READUI_OCT;
+       } else if (sn->prefix < sn->digits) {
+               //one-character prefix:  the leading 0 of an octal
+               base = READUI_OCT;
+       }
+       
+       if (s>=e && base==READUI_OCT) {
+               //octal contains no digits
+               out->v = 0;
+               out->base = 8;
+               goto suffix;
+       }
+       
+       out->v = readui(&s, sn->exponent, base);
+       out->base = base & 0xFF;
+       
+       //readui stops at the first digit it rejects, so stopping short of e
+       //means the constant holds a digit that is invalid for its base
+       if (s != e || errno == EINVAL) {
+               tok_msg_error(integer_invalid_digits, tokstart,
+                       "Integer constant contains invalid digits");
+       } else if (errno) {
+               if (errno == ERANGE) {
+                       tok_msg_error(integer_out_of_range, tokstart,
+                               "Integer constant out of range");
+               } else {
+                       tok_msg_bug(readui_unknown, tokstart,
+                               "Unknown error returned by readui");
+               }
+       }
+       
+suffix:
+       out->suffix =
+               read_number_suffix(sn->suffix, sn->end, TOK_INTEGER, mq);
+       
+       return;
+}
+
+/*
+ * Fills *out with the value and suffix of the floating point token described
+ * by sn, queueing errors on mq as problems are found.
+ *
+ * Binary floats and hex floats without an exponent are rejected up front,
+ * leaving out->v at 0.0 and out->suffix at TOK_NOSUFFIX.  Otherwise the text
+ * is converted with strtold(), which must stop exactly where the suffix
+ * starts for the digits to be considered valid.
+ *
+ * The byte at sn->end is temporarily overwritten with 0, so the input buffer
+ * must be writable and sn->end must be addressable.
+ */
+static void read_floating(struct tok_floating *out, const struct scan_number *sn,
+                       tok_message_queue *mq) {
+       /*
+       Assertions about a float's struct scan_number:
+               prefix is empty or [0B 0b 0X 0x] (note: no octal prefix 0)
+               sn->digits not empty, ever
+               exponent may or may not exist
+               If exponent exists, it is valid and formatted as:
+                       ( [E P e p] ['+' '-']*0..1 [0-9]* )
+               An exponent starts with E if this is decimal, P if it is hex/binary
+               suffix is [0-9 A-Z a-z '.']*
+               dots_found can be anything
+       */
+       const char *tokstart = sn->prefix;
+       const char *s = sn->prefix, *e = sn->suffix;
+       //saved so the terminator written below can be undone
+       char borrow = *sn->end;
+       //long double strtold(const char *nptr, char **endptr);
+       
+       out->v = 0.0;
+       out->suffix = TOK_NOSUFFIX;
+       
+       if (sn->prefix < sn->digits) {
+               if (sn->prefix[1]=='B' || sn->prefix[1]=='b') {
+                       tok_msg_error(binary_float, tokstart,
+                               "Binary floating point constants not allowed");
+                       return;
+               }
+               //the only other prefix is 0x, and a hex float needs a P exponent
+               if (sn->exponent >= sn->suffix) {
+                       tok_msg_error(hex_float_no_exponent, tokstart,
+                               "Hex floating point constant missing exponent");
+                       return;
+               }
+       }
+       
+       
+       /* Stick a null terminator at the end of the input so strtold
+        * won't read beyond the given input.
+        *
+        * This is thread-safe because the input is from
+        * token_list.txt, which was generated in the
+        * tokenize function which is still running.
+        */
+       *(char*)sn->end = 0;
+       errno = 0;
+       out->v = strtold(s, (char**)&s);
+       //don't forget to set it back
+       *(char*)sn->end = borrow;
+       
+       if (errno) {
+               //for some reason, strtold may errno to EDOM to indicate underrun
+               //open test/run.c and search "floating_out_of_range" for more details
+               if (errno == ERANGE || errno == EDOM) {
+                       tok_msg_error(floating_out_of_range, tokstart,
+                               "Floating point constant out of range");
+               } else {
+                       tok_msg_bug(strtold_unknown, tokstart,
+                               "Unknown error returned by strtold");
+               }
+       }
+       
+       //s now points where strtold stopped; anything short of (or beyond) the
+       //start of the suffix means the digits/exponent were malformed
+       if (s != e) {
+               tok_msg_error(floating_invalid_digits, tokstart,
+                       "Floating point constant contains invalid digits");
+       }
+       
+       out->suffix =
+               read_number_suffix(sn->suffix, sn->end, TOK_FLOATING, mq);
+}
+
+/*
+ * Reads the number token that starts at s (and ends no later than e) into
+ * tok, setting tok->type to TOK_INTEGER or TOK_FLOATING and filling the
+ * matching member.  Problems are queued on mq.
+ *
+ * Returns a pointer just past the end of the token.
+ */
+char *read_cnumber(struct token *tok, const char *s, const char *e, tok_message_queue *mq) {
+       struct scan_number parts;
+       
+       tok->type = scan_number(&parts, s, e);
+       
+       if (tok->type != TOK_INTEGER) {
+               read_floating(&tok->floating, &parts, mq);
+       } else {
+               read_integer(&tok->integer, &parts, mq);
+       }
+       
+       return (char*)parts.end;
+}
+
+#undef MESSAGE_PATH
diff --git a/ccan/ccan_tokenizer/read_cstring.c b/ccan/ccan_tokenizer/read_cstring.c
new file mode 100644 (file)
index 0000000..2302180
--- /dev/null
@@ -0,0 +1,161 @@
+#include "ccan_tokenizer.h"
+
+//Returns a malloc'd, null-terminated copy of the characters in [s, e), or
+//NULL if the allocation fails.  The caller frees the result with free().
+static char *strdup_rng(const char *s, const char *e) {
+       size_t len = e-s;
+       char *ret = malloc(len+1);
+       
+       //don't write through a failed allocation
+       if (!ret)
+               return NULL;
+       
+       memcpy(ret, s, len);
+       ret[len] = 0;
+       return ret;
+}
+
+#define MESSAGE_PATH "tokenize/read_cstring/"
+
+//Reads a C string starting at s until quoteChar is found or e is reached
+//  Returns the pointer to the terminating quote character or e if none was found
+//
+//The decoded contents (escape sequences replaced by the characters they
+//stand for) are appended to *out, followed by a null terminator that is not
+//counted in out->size.  Problems are queued on mq:
+//  errors   - input ends in a backslash, missing end quote
+//  warnings - octal/hex escape out of range, ambiguous hex escape, unknown
+//             escape sequence, literal containing newline characters
+//  info     - octal escape directly followed by another digit
+//
+//Octal escapes are \ followed by one to three digits in the range 0-7;
+//\8 and \9 are reported as unknown escape sequences.
+char *read_cstring(array_char *out, const char *s, const char *e, char quoteChar, tok_message_queue *mq) {
+       const char * const tokstart = s;
+       const char *p;
+       int has_endquote=0, has_newlines=0;
+       
+       //tok_msg_debug(called, s, "Called read_cstring on `%s`", s);
+       
+       #define append(startptr,endptr) array_append_items(*out, startptr, (endptr)-(startptr))
+       #define append_char(theChar) array_append(*out, theChar)
+       #define append_zero() do {array_append(*out, 0); out->size--;} while(0)
+       
+       //s marks the start of the pending run of ordinary characters that has
+       //not been appended yet; p is the read position
+       p = s;
+       while (p<e) {
+               char c = *p++;
+               if (c == '\\') {
+                       //flush what came before the backslash; from here until
+                       //the escape has been decoded, s-1 is the backslash
+                       append(s, p-1);
+                       s = p;
+                       if (p >= e) {
+                               append_char('\\');
+                               tok_msg_error(ended_in_backslash, p-1,
+                                       "read_cstring input ended in backslash");
+                               break;
+                       }
+                       c = *p++;
+                       if (c>='0' && c<='7') {
+                               //octal escape:  up to three octal digits
+                               unsigned int octal = c-'0';
+                               size_t digit_count = 0;
+                               while (p<e && *p>='0' && *p<='7') {
+                                       octal <<= 3;
+                                       octal += (*p++) - '0';
+                                       if (++digit_count >= 2)
+                                               break;
+                               }
+                               if (p<e && *p>='0' && *p<='9') {
+                                       tok_msg_info(ambiguous_octal, s-1,
+                                               "Octal followed by digit");
+                               }
+                               if (octal > 0xFF) {
+                                       tok_msg_warn(octal_overflow, s-1,
+                                               "Octal out of range");
+                               }
+                               c = octal;
+                       } else {
+                               switch (c) {
+                                       case 'x': {
+                                               size_t digit_count = 0;
+                                               size_t zero_count = 0;
+                                               unsigned int hex = 0;
+                                               //leading zeros are counted apart so they
+                                               //don't trigger the out-of-range warning
+                                               while (p<e && *p=='0') p++, zero_count++;
+                                               for (;p<e;digit_count++) {
+                                                       c = *p++;
+                                                       if (c>='0' && c<='9')
+                                                               c -= '0';
+                                                       else if (c>='A' && c<='F')
+                                                               c -= 'A'-10;
+                                                       else if (c>='a' && c<='f')
+                                                               c -= 'a'-10;
+                                                       else {
+                                                               p--;
+                                                               break;
+                                                       }
+                                                       hex <<= 4;
+                                                       hex += c;
+                                               }
+                                               if (zero_count+digit_count > 2) {
+                                                       //text of the escape, from its backslash
+                                                       char *hex_string = strdup_rng(s-1, p);
+                                                       tok_msg_warn(ambiguous_hex, s-1,
+                                                               "Hex escape '%s' is ambiguous", hex_string);
+                                                       if (digit_count > 2)
+                                                               tok_msg_warn(hex_overflow, s-1,
+                                                                       "Hex escape '%s' out of range", hex_string);
+                                                       free(hex_string);
+                                               }
+                                               c = hex & 0xFF;
+                                       }       break;
+                                       case 'a':
+                                               c=0x7;
+                                               break;
+                                       case 'b':
+                                               c=0x8;
+                                               break;
+                                       case 'e':
+                                               c=0x1B;
+                                               break;
+                                       case 'f':
+                                               c=0xC;
+                                               break;
+                                       case 'n':
+                                               c=0xA;
+                                               break;
+                                       case 'r':
+                                               c=0xD;
+                                               break;
+                                       case 't':
+                                               c=0x9;
+                                               break;
+                                       case 'v':
+                                               c=0xB;
+                                               break;
+                                       case '\\':
+                                               break;
+                                       default:
+                                               if (c == quoteChar)
+                                                       break;
+                                               if (c=='\'' && quoteChar=='"') {
+                                                       /* tok_msg_info(escaped_single_quote, s-1,
+                                                               "Single quote characters need not be escaped within double quotes"); */
+                                                       break;
+                                               }
+                                               if (c=='"' && quoteChar=='\'') {
+                                                       /* tok_msg_info(escaped_double_quote, s-1,
+                                                               "Double quote characters need not be escaped within single quotes"); */
+                                                       break;
+                                               }
+                                               tok_msg_warn(unknown_escape, s-1,
+                                                       "Unknown escape sequence '\\%c'", c);
+                                               break;
+                               }
+                       }
+                       //the escape is fully consumed; start a new pending run
+                       s = p;
+                       append_char(c);
+               } else if (c == quoteChar) {
+                       //leave p on the closing quote
+                       p--;
+                       has_endquote = 1;
+                       break;
+               } else if (creturn(c)) {
+                       has_newlines = 1;
+               }
+       }
+       append(s, p);
+       append_zero();
+       if (!has_endquote) {
+               tok_msg_error(missing_endquote, tokstart,
+                       "Missing endquote on %s literal",
+                       quoteChar=='\'' ? "character" : "string");
+       } else if (has_newlines) {
+               tok_msg_warn(quote_newlines, tokstart,
+                       "%s literal contains newline character(s)",
+                       quoteChar=='\'' ? "Character" : "String");
+       }
+       return (char*)p;
+       
+       #undef append
+       #undef append_char
+       #undef append_zero
+}
+
+#undef MESSAGE_PATH
diff --git a/ccan/ccan_tokenizer/scripts/message_dump_to_messages.sh b/ccan/ccan_tokenizer/scripts/message_dump_to_messages.sh
new file mode 100755 (executable)
index 0000000..ef554b9
--- /dev/null
@@ -0,0 +1,6 @@
+#!/bin/sh
+sed 's/^D: /{.level=TM_DEBUG, .path="/' |
+sed 's/^I: /{.level=TM_INFO, .path="/' |
+sed 's/^W: /{.level=TM_WARN, .path="/' |
+sed 's/^BUG: /{.level=TM_BUG, .path="/' |
+sed 's/:.*/\"},/'
diff --git a/ccan/ccan_tokenizer/test/run-simple-token.c b/ccan/ccan_tokenizer/test/run-simple-token.c
new file mode 100644 (file)
index 0000000..071848f
--- /dev/null
@@ -0,0 +1,272 @@
+#include "ccan_tokenizer/read_cnumber.c"
+#include "ccan_tokenizer/read_cstring.c"
+#include "ccan_tokenizer/dict.c"
+#include "ccan_tokenizer/ccan_tokenizer.c"
+#include "ccan_tokenizer/queue.c"
+#include "ccan_tokenizer/charflag.c"
+#include "tap/tap.h"
+
+#define item(num) (toks->first[num]) /* token number `num` of whichever list is named `toks` in the enclosing scope */
+//sed expression that rewrites toks->array.item[X] as item(X): sed 's/toks->array\.item\[\([^]]*\)\]/item(\1)/g'
+
+tok_message_queue *MQ = NULL; /* queue argument passed to every tokenize() call in this file; never assigned, so always NULL */
+
+static const char *onechar_tokens = "!~#%^&*()=-+{}[]|;:,.<>/?"; /* one-character operators; verified by onechar_checks() */
+static const char *twochar_tokens = "!=##%=^=&=&&*=-=--->+=++==|=||<=<<>=>>/="; /* 20 two-character operators, in the order twochar_checks() expects */
+static const char *threechar_tokens = "<<=>>=..."; /* 3 three-character operators, in the order threechar_checks() expects */
+static const char *char_token = "'x'"; /* a single character literal */
+static const char *string_token = "\"string\""; /* a single string literal */
+static const char *ident_tokens = "doh abc f_o _ba b$f"; /* five 3-character identifiers separated by single spaces */
+
+/*
+ * Return a newly talloc'd copy of @string in which every character is
+ * followed by a backslash and a newline, so the result is exactly three
+ * times as long as the input.  The caller releases it with talloc_free().
+ */
+static char *backslashify(const char *string)
+{
+       /* Measure once: the original re-ran strlen() on every iteration */
+       const size_t len = strlen(string);
+       char *ret = talloc_size(NULL, len*3 + 1);
+       size_t i;
+
+       for (i = 0; i < len; i++) {
+               ret[i*3] = string[i];
+               ret[i*3+1] = '\\';
+               ret[i*3+2] = '\n';
+       }
+       ret[len*3] = '\0';
+       return ret;
+}
+
+/*
+ * Return a newly talloc'd copy of @string with a single space inserted after
+ * every group of @num characters, including after the last group.
+ *
+ * Each step copies a full @num bytes, so the length of @string is expected to
+ * be a multiple of @num (true for every caller in this file).
+ */
+static char *spacify(const char *string, unsigned int num)
+{
+       /* Measure once: the original re-ran strlen() on every iteration */
+       const size_t len = strlen(string);
+       char *ret = talloc_size(NULL, len*2 + 1);
+       size_t i;
+
+       /* Pre-fill with spaces; the groups are then copied over the top */
+       memset(ret, ' ', len*2);
+
+       /* Group number i/num is shifted right by one byte per preceding group */
+       for (i = 0; i < len; i += num)
+               memcpy(&ret[i + i/num], string+i, num);
+       ret[i + i/num] = '\0';
+       return ret;
+}
+static struct token_list *test_tokens(const char *orig, unsigned int size)
+{      /* Tokenize a copy of @orig and check that it splits into back-to-back operators of @size characters each; returns the list */
+       struct token_list *toks;
+       char *string = talloc_strdup(NULL, orig); /* private copy; callers later free it through toks->orig */
+       unsigned int i;
+
+       toks = tokenize(string, strlen(string), MQ);
+       ok1(token_list_sanity_check(toks, stdout));
+       
+       ok1(token_list_count(toks) == strlen(string)/size + 1); /* the "+ 1" is the TOK_STARTLINE token checked next */
+       ok1(item(0).type == TOK_STARTLINE);
+       for (i = 0; i < strlen(string)/size; i++) {
+               ok1(item(i+1).type == TOK_OPERATOR);
+               ok1(item(i+1).txt_size == size);
+               ok1(strncmp(item(i+1).txt, string + i*size, size) == 0);
+               ok1(item(i+1).orig_size == size);
+               ok1(item(i+1).orig == string + i*size); /* orig must point into the buffer that was tokenized */
+       }
+       return toks;
+}
+
+static struct token_list *test_tokens_spaced(const char *orig,
+                                            unsigned int size)
+{      /* Like test_tokens(), but on spacify(orig, size): expects each operator to be followed by a one-space TOK_WHITE token */
+       struct token_list *toks;
+       char *string = spacify(orig, size); /* callers later free it through toks->orig */
+       unsigned int i;
+
+       toks = tokenize(string, strlen(string), MQ);
+       ok1(token_list_sanity_check(toks, stdout));
+       
+       ok1(token_list_count(toks) == strlen(orig)/size*2 + 1); /* one operator plus one white per group, plus TOK_STARTLINE */
+       ok1(item(0).type == TOK_STARTLINE);
+       for (i = 0; i < strlen(orig)/size; i++) {
+               ok1(item(i*2+1).type == TOK_OPERATOR);
+               ok1(item(i*2+1).txt_size == size);
+               ok1(!strncmp(item(i*2+1).txt, string + i*(size+1), size)); /* groups sit size+1 bytes apart in the spaced string */
+               ok1(item(i*2+1).orig_size == size);
+               ok1(item(i*2+1).orig == string + i*(size+1));
+               ok1(item(i*2+2).type == TOK_WHITE);
+               ok1(item(i*2+2).txt_size == 1);
+               ok1(item(i*2+2).txt[0] == ' ');
+               ok1(item(i*2+2).orig_size == 1);
+               ok1(item(i*2+2).orig == string + i*(size+1) + size);
+       }
+       return toks;
+}
+
+static struct token_list *test_tokens_backslashed(const char *orig,
+                                                 unsigned int size)
+{      /* Like test_tokens(), but on backslashify(orig): every character is followed by a backslash-newline */
+       struct token_list *toks;
+       const char *string = backslashify(orig); /* callers later free it through toks->orig */
+       unsigned int i;
+
+       toks = tokenize(string, strlen(string), MQ);
+       ok1(token_list_sanity_check(toks, stdout));
+       
+       ok1(token_list_count(toks) == strlen(orig)/size + 1);
+       ok1(item(0).type == TOK_STARTLINE);
+       for (i = 0; i < strlen(orig)/size; i++) {
+               ok1(item(i+1).type == TOK_OPERATOR);
+               ok1(item(i+1).txt_size == size); /* txt is expected to match the un-backslashed text ... */
+               ok1(strncmp(item(i+1).txt, orig + i*size, size) == 0);
+               ok1(item(i+1).orig_size == size*3); /* ... while orig spans all three bytes per character */
+               ok1(item(i+1).orig == string + i*size*3);
+       }
+       return toks;
+}
+
+static void onechar_checks(const struct token_list *toks, int mul)
+{      /* Check the opkw of every operator token; @mul is the index stride between operators (1 = adjacent, 2 = a white token in between) */
+       unsigned int i;
+       for (i = 0; i < strlen(onechar_tokens); i++)
+               ok1(item(i*mul+1).opkw == onechar_tokens[i]); /* for one-character operators opkw is compared with the character itself */
+}
+
+static void twochar_checks(const struct token_list *toks, int mul)
+{      /* Check opkw of the 20 operators, in the order they appear in twochar_tokens; @mul is the index stride between operators */
+       ok1(item(1).opkw == NE_OP); /* same as 0*mul+1 */
+       ok1(item(1*mul+1).opkw == DOUBLE_POUND);
+       ok1(item(2*mul+1).opkw == MOD_ASSIGN);
+       ok1(item(3*mul+1).opkw == XOR_ASSIGN);
+       ok1(item(4*mul+1).opkw == AND_ASSIGN);
+       ok1(item(5*mul+1).opkw == AND_OP);
+       ok1(item(6*mul+1).opkw == MUL_ASSIGN);
+       ok1(item(7*mul+1).opkw == SUB_ASSIGN);
+       ok1(item(8*mul+1).opkw == DEC_OP);
+       ok1(item(9*mul+1).opkw == PTR_OP);
+       ok1(item(10*mul+1).opkw == ADD_ASSIGN);
+       ok1(item(11*mul+1).opkw == INC_OP);
+       ok1(item(12*mul+1).opkw == EQ_OP);
+       ok1(item(13*mul+1).opkw == OR_ASSIGN);
+       ok1(item(14*mul+1).opkw == OR_OP);
+       ok1(item(15*mul+1).opkw == LE_OP);
+       ok1(item(16*mul+1).opkw == LEFT_OP);
+       ok1(item(17*mul+1).opkw == GE_OP);
+       ok1(item(18*mul+1).opkw == RIGHT_OP);
+       ok1(item(19*mul+1).opkw == DIV_ASSIGN);
+}
+
+static void threechar_checks(const struct token_list *toks, int mul)
+{      /* Check opkw of the 3 operators, in the order they appear in threechar_tokens; @mul is the index stride between operators */
+       ok1(item(1).opkw == LEFT_ASSIGN); /* same as 0*mul+1 */
+       ok1(item(1*mul+1).opkw == RIGHT_ASSIGN);
+       ok1(item(2*mul+1).opkw == ELLIPSIS);
+}
+
+int main(void)
+{      /* Runs the simple-token tests: plain, space-separated and backslash-continued variants of each token string */
+       unsigned int i;
+       struct token_list *toks;
+       char *str;
+       char *backslashed_idents;
+
+       plan_tests(1243); /* total number of ok1() results this program is expected to emit */
+       toks = test_tokens(onechar_tokens, 1);
+       onechar_checks(toks, 1);
+       talloc_free((char*)toks->orig); /* frees the buffer the helper allocated; NOTE(review): presumably toks is released with it - confirm in tokenize() */
+
+       toks = test_tokens(twochar_tokens, 2);
+       twochar_checks(toks, 1);
+       talloc_free((char*)toks->orig);
+
+       toks = test_tokens(threechar_tokens, 3);
+       threechar_checks(toks, 1);
+       talloc_free((char*)toks->orig);
+
+       /* char literal */
+       str = talloc_strdup(NULL, char_token);
+       toks = tokenize(str, strlen(str), MQ);
+       ok1(token_list_sanity_check(toks, stdout));
+       ok1(token_list_count(toks) == 2); /* TOK_STARTLINE plus the literal */
+       ok1(item(0).type == TOK_STARTLINE);
+       ok1(item(1).type == TOK_CHAR);
+       ok1(item(1).txt_size == strlen(str)); /* the token text covers the whole input, quotes included */
+       ok1(strncmp(item(1).txt, str, strlen(str)) == 0);
+       ok1(item(1).orig_size == strlen(str));
+       ok1(item(1).orig == str);
+       /* FIXME: test contents of string. */
+       talloc_free(str);
+
+       /* string literal */
+       str = talloc_strdup(NULL, string_token);
+       toks = tokenize(str, strlen(str), MQ);
+       ok1(token_list_sanity_check(toks, stdout));
+       ok1(token_list_count(toks) == 2); /* TOK_STARTLINE plus the literal */
+       ok1(item(0).type == TOK_STARTLINE);
+       ok1(item(1).type == TOK_STRING);
+       ok1(item(1).txt_size == strlen(str));
+       ok1(strncmp(item(1).txt, str, strlen(str)) == 0);
+       ok1(item(1).orig_size == strlen(str));
+       ok1(item(1).orig == str);
+       /* FIXME: test contents of string. */
+       talloc_free(str);
+
+       /* Identifiers */
+       str = talloc_strdup(NULL, ident_tokens);
+       toks = tokenize(str, strlen(str), MQ);
+       ok1(token_list_sanity_check(toks, stdout));
+       token_list_dump(toks, stdout);
+       ok1(token_list_count(toks) == 10); /* TOK_STARTLINE + 5 identifiers + 4 separating spaces */
+       ok1(item(0).type == TOK_STARTLINE);
+       for (i = 0; i < 5; i++) {
+               ok1(item(i*2+1).type == TOK_IDENTIFIER);
+               ok1(item(i*2+1).txt_size == 3);
+               ok1(strncmp(item(i*2+1).txt, str + i*4, 3) == 0); /* identifiers start every 4 bytes: 3 characters plus a space */
+               ok1(item(i*2+1).orig_size == 3);
+               ok1(item(i*2+1).orig == str + i*4);
+               if (i == 4)
+                       continue; /* no space follows the last identifier */
+               ok1(item(i*2+2).type == TOK_WHITE);
+               ok1(item(i*2+2).txt_size == 1);
+               ok1(item(i*2+2).txt[0] == ' ');
+               ok1(item(i*2+2).orig_size == 1);
+               ok1(item(i*2+2).orig == str + i*4 + 3);
+       }
+       talloc_free(str);
+
+       toks = test_tokens_spaced(onechar_tokens, 1);
+       onechar_checks(toks, 2); /* stride 2: a white token sits between consecutive operators */
+       talloc_free((char*)toks->orig);
+
+       toks = test_tokens_spaced(twochar_tokens, 2);
+       twochar_checks(toks, 2);
+       talloc_free((char*)toks->orig);
+
+       toks = test_tokens_spaced(threechar_tokens, 3);
+       threechar_checks(toks, 2);
+       talloc_free((char*)toks->orig);
+
+       toks = test_tokens_backslashed(onechar_tokens, 1);
+       onechar_checks(toks, 1);
+       talloc_free((char*)toks->orig);
+
+       toks = test_tokens_backslashed(twochar_tokens, 2);
+       twochar_checks(toks, 1);
+       talloc_free((char*)toks->orig);
+
+       toks = test_tokens_backslashed(threechar_tokens, 3);
+       threechar_checks(toks, 1);
+       talloc_free((char*)toks->orig);
+
+       /* Identifiers, with a backslash-newline after every character */
+       backslashed_idents = backslashify(ident_tokens);
+       toks = tokenize(backslashed_idents, strlen(backslashed_idents), MQ);
+       ok1(token_list_sanity_check(toks, stdout));
+       ok1(token_list_count(toks) == 10);
+       ok1(item(0).type == TOK_STARTLINE);
+       for (i = 0; i < 5; i++) {
+               ok1(item(i*2+1).type == TOK_IDENTIFIER);
+               ok1(item(i*2+1).txt_size == 3); /* txt is compared with the un-backslashed source text */
+               ok1(strncmp(item(i*2+1).txt, ident_tokens + i*4, 3) == 0);
+               ok1(item(i*2+1).orig_size == 9); /* 3 characters x 3 bytes each in the backslashed buffer */
+               ok1(item(i*2+1).orig == backslashed_idents + i*12); /* 4 characters x 3 bytes per identifier-plus-space */
+               if (i == 4)
+                       continue; /* no space follows the last identifier */
+               ok1(item(i*2+2).type == TOK_WHITE);
+               ok1(item(i*2+2).txt_size == 1);
+               ok1(item(i*2+2).txt[0] == ' ');
+               ok1(item(i*2+2).orig_size == 3);
+               ok1(item(i*2+2).orig == backslashed_idents + i*12 + 9);
+       }
+       talloc_free(backslashed_idents);
+
+       return exit_status();
+}
diff --git a/ccan/ccan_tokenizer/test/run.c b/ccan/ccan_tokenizer/test/run.c
new file mode 100644 (file)
index 0000000..9a4ae19
--- /dev/null
@@ -0,0 +1,1466 @@
+/*
+        Copyright (c) 2009  Joseph A. Adams
+        All rights reserved.
+        
+        Redistribution and use in source and binary forms, with or without
+        modification, are permitted provided that the following conditions
+        are met:
+        1. Redistributions of source code must retain the above copyright
+           notice, this list of conditions and the following disclaimer.
+        2. Redistributions in binary form must reproduce the above copyright
+           notice, this list of conditions and the following disclaimer in the
+           documentation and/or other materials provided with the distribution.
+        3. The name of the author may not be used to endorse or promote products
+           derived from this software without specific prior written permission.
+        
+        THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+        IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+        OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+        IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+        INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+        NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+        DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+        THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+        (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+        THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "ccan_tokenizer/read_cnumber.c"
+#include "ccan_tokenizer/read_cstring.c"
+#include "ccan_tokenizer/dict.c"
+#include "ccan_tokenizer/ccan_tokenizer.c"
+#include "ccan_tokenizer/queue.c"
+#include "ccan_tokenizer/charflag.c"
+
+#include "ccan_tokenizer/ccan_tokenizer.h"
+
+#include "tap/tap.h"
+
+#include <math.h>
+
+#define array_count_pair(type, ...) (const type []){__VA_ARGS__}, sizeof((const type []){__VA_ARGS__})/sizeof(type) /* expands to TWO comma-separated arguments: a compound-literal array of the given items, then its element count */
+
+/*
+ * Unit test for read_cstring().
+ *
+ * The verify()/verify_quotechar() calls walk the cstrings[] table in order:
+ * each one decodes the current entry, checks the decoded bytes and the
+ * returned continuation pointer, then steps to the next entry.  Afterwards
+ * the non-debug messages left in the queue are compared, in order, against
+ * correct_messages[].
+ */
+static void test_read_cstring(void) {
+       /* Reset the output array and step to the next input string */
+       #define next() do {array_free(str); array_init(str, NULL); csp++;} while(0)
+       /* The input currently under test (NULL for the very first case) */
+       #define cs (*csp)
+       /* Printable form of cs: passing a null pointer to %s is undefined */
+       #define cs_name (cs ? cs : "(null)")
+       /*
+        * Decode cs up to `quotechar`, then emit two results: the decoded
+        * bytes equal `correct` (and are NUL-terminated), and the returned
+        * pointer equals cs + correct_continuation_offset.
+        */
+       #define verify_quotechar(correct, correct_continuation_offset, quotechar) do { \
+               const size_t s = sizeof(correct)-1; \
+               p = read_cstring(&str, cs, cs ? strchr(cs, 0) : NULL, quotechar, &mq); \
+               ok(str.size==s && str.alloc>s && str.item[s]==0 && \
+               !memcmp(str.item, correct, s), \
+               "\"%s: Is output correct?", cs_name); \
+               ok(p == cs+correct_continuation_offset, "\"%s: Is continuation pointer correct?", cs_name); \
+               next(); \
+       } while(0)
+       #define verify(correct, correct_continuation_offset) verify_quotechar(correct, correct_continuation_offset, '"')
+
+       /* Inputs, consumed one per verify call below, in this order */
+       const char * const cstrings[] = {
+               NULL,
+               "",
+               "\"",
+               "Hello world!\"",
+               "Hello world!",
+               "\\\\\\f\\e\\b\\0\\a\\r\\n\\w\\t\\v\\\'\\\"\"",
+               "\\\\\\f\\e\\b\\0\\a\\r\\n\\w\\t\\v\\\'\\\"\'",
+               "الأدب العربي\"",
+               "Ends with \\",
+               "Tab: '\\011' Space: '\\040' Overflow: '\\777' Ambiguous: '\\1013'\"",
+               "\\x50\\x35\\x12\\xEF\\xFE\\x00012\\x345\""
+       };
+       const char * const *csp = cstrings;
+       const char *p;
+       array_char str = array_new(NULL);
+       tok_message_queue mq;
+
+       queue_init(mq, NULL);
+
+       //check null input
+       verify("", 0);
+
+       //Check an empty input
+       verify("", 0);
+
+       //Check an empty quote-terminated string
+       verify("", 0);
+
+       //Check a simple string
+       verify("Hello world!", 12);
+
+       //Check a simple string without an end quote
+       verify("Hello world!", 12);
+
+       //Check a collection of single-character sequences
+       verify("\\\f\e\b\0\a\r\nw\t\v\'\"", 26);
+
+       //Check same collection of single-character sequences, this time using a single quote terminator
+       verify_quotechar("\\\f\e\b\0\a\r\nw\t\v\'\"", 26, '\'');
+
+       //Check a real UTF-8 string
+       verify("\xd8\xa7\xd9\x84\xd8\xa3\xd8\xaf\xd8\xa8\x20\xd8\xa7\xd9\x84\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a", 23);
+
+       //Check string ending in backslash
+       verify("Ends with \\", 11);
+
+       //Check a series of octal escapes
+       verify("Tab: '\t' Space: ' ' Overflow: '\377' Ambiguous: 'A3'", 61);
+
+       //Check a series of hex escapes
+       verify("\x50\x35\x12\xEF\xFE\x12\x45", 32);
+
+       array_free(str);
+
+       //tok_message_queue_dump(&mq);
+
+       //Verify the message queue
+       if (1)
+       {
+               struct tok_message m;
+               /* Expected non-debug messages, in the order they were queued */
+               struct tok_message correct_messages[] = {
+                       {.level=TM_ERROR, .path="tokenize/read_cstring/missing_endquote"},
+                       {.level=TM_ERROR, .path="tokenize/read_cstring/missing_endquote"},
+                       {.level=TM_ERROR, .path="tokenize/read_cstring/missing_endquote"},
+                       {.level=TM_WARN, .path="tokenize/read_cstring/unknown_escape"},
+                       //{.level=TM_INFO, .path="tokenize/read_cstring/escaped_single_quote"},
+                       {.level=TM_WARN, .path="tokenize/read_cstring/unknown_escape"},
+                       //{.level=TM_INFO, .path="tokenize/read_cstring/escaped_double_quote"},
+                       {.level=TM_ERROR, .path="tokenize/read_cstring/ended_in_backslash"},
+                       {.level=TM_ERROR, .path="tokenize/read_cstring/missing_endquote"},
+                       {.level=TM_WARN, .path="tokenize/read_cstring/octal_overflow"},
+                       {.level=TM_INFO, .path="tokenize/read_cstring/ambiguous_octal"},
+                       {.level=TM_WARN, .path="tokenize/read_cstring/ambiguous_hex"},
+                       {.level=TM_WARN, .path="tokenize/read_cstring/ambiguous_hex"},
+                       {.level=TM_WARN, .path="tokenize/read_cstring/hex_overflow"},
+               };
+               size_t i, e=sizeof(correct_messages)/sizeof(*correct_messages);
+
+               /* Debug-level messages are ignored wherever they appear */
+               while(queue_count(mq) && queue_next(mq).level==TM_DEBUG)
+                       queue_skip(mq);
+               for (i=0; i<e; i++) {
+                       if (!queue_count(mq))
+                               break;
+                       m = dequeue(mq);
+                       if (m.level != correct_messages[i].level)
+                               break;
+                       if (strcmp(m.path, correct_messages[i].path))
+                               break;
+                       while(queue_count(mq) && queue_next(mq).level==TM_DEBUG)
+                               queue_skip(mq);
+               }
+               /* i is a size_t, so it must be printed with %zu (was %u) */
+               if (i<e)
+                       printf("Item %zu is incorrect\n", i);
+               ok(i==e, "Is message queue correct?");
+               ok(!queue_count(mq), "Message queue should be empty now.");
+       }
+
+       queue_free(mq);
+       #undef next
+       #undef cs
+       #undef cs_name
+       #undef verify_quotechar
+       #undef verify
+}
+
+#if 0
+/* Debugging aid (compiled out): print @str, or "(null)" when it is NULL. */
+static void p(const char *str) {
+       puts(str ? str : "(null)");
+}
+#endif
+
+/*
+ * Exercise the queue macros on a queue of string pointers: enqueue six items,
+ * dequeue three, enqueue ten more, then drain the queue, checking the item
+ * count and the FIFO order along the way.
+ */
+static void test_queue(void) {
+       #define next() do {queue_free(q); queue_init(q, NULL);} while(0)
+
+       const char * const words[] = {
+               "zero", "one", "two", "three",
+               "four", "five", "six", "seven",
+               "eight", "nine", "ten", "eleven",
+               "twelve", "thirteen", "fourteen", "fifteen"
+       };
+       queue(const char*) q;
+       size_t idx;
+       int in_order;
+
+       queue_init(q, NULL);
+
+       /* words[0..5] go in */
+       for (idx = 0; idx <= 5; idx++) {
+               enqueue(q, words[idx]);
+       }
+       ok(queue_count(q) == 6, "Checking queue count");
+
+       /* words[0..2] must come out first; stop at the first mismatch */
+       in_order = 1;
+       for (idx = 0; idx <= 2; idx++) {
+               if (!(dequeue_check(q)==words[idx])) {
+                       in_order = 0;
+                       break;
+               }
+       }
+       ok(in_order, "Dequeuing/checking 3 items");
+       ok(queue_count(q) == 3, "Checking queue count");
+
+       /* words[6..15] go in behind the three items still queued */
+       for (idx = 6; idx <= 15; idx++) {
+               enqueue(q, words[idx]);
+       }
+       ok(queue_count(q) == 13, "Checking queue count");
+
+       /* Everything left, words[3..15], must come out in order */
+       in_order = 1;
+       for (idx = 3; idx <= 15; idx++) {
+               if (!(dequeue_check(q)==words[idx])) {
+                       in_order = 0;
+                       break;
+               }
+       }
+       ok(in_order, "Are queue items correct?");
+       ok(dequeue_check(q)==NULL && dequeue_check(q)==NULL && queue_count(q)==0, "Does queue run out correctly?");
+
+       queue_free(q);
+
+       #undef next
+}
+
+/* Runs _test_dict_single() on the caller's local `dict`, `str` and `correct` */
+#define test_dict_single() _test_dict_single(dict, str, sizeof(str)-1, correct, sizeof(correct)/sizeof(*correct))
+/*
+ * Repeatedly call dict_lookup() on the @len bytes at @str and compare the id
+ * of each entry found with the next element of @correct.  An element of -100
+ * marks a position where the lookup is expected to find nothing; one byte is
+ * skipped there.  Emits a single test result, which passes only when the whole
+ * input and all @correct_count expectations were consumed.
+ */
+static void _test_dict_single(struct dict *dict, const char *str, size_t len, int *correct, size_t correct_count) {
+       const char *s = str;
+       const char * const e = str + len;
+       struct dict_entry *entry;
+       size_t i = 0;
+       int all_matched;
+
+       while (s < e && i < correct_count) {
+               const char * const start = s;
+
+               entry = dict_lookup(dict, &s, e);
+               if (entry == NULL) {
+                       //dict_lookup should not modify *sp when it returns NULL
+                       if (s != start)
+                               break;
+                       s++;
+                       if (correct[i] != -100)
+                               break;
+               } else {
+                       if (entry->id != correct[i])
+                               break;
+                       if (entry->str[0] == '\0') {
+                               /* The empty entry must consume exactly one NUL byte */
+                               if (s != start+1 || s[-1] != 0)
+                                       break;
+                       } else {
+                               /* Any other entry must consume exactly its own text */
+                               const size_t entry_len = strlen(entry->str);
+                               if (s != start+entry_len ||
+                                   strncmp(entry->str, s-entry_len, entry_len) != 0)
+                                       break;
+                       }
+                       //printf("Correctly read %s\n", entry->str);
+               }
+               i++;
+       }
+
+       all_matched = (s == e && i == correct_count);
+       if (!all_matched) {
+               /* Show the part of the input that was not consumed */
+               printf("Tokenization failed at ");
+               fwrite(s, 1, e-s, stdout);
+               printf("\n");
+       }
+
+       ok(all_matched, "All of the tokens are correct");
+}
+
+static void test_dict(void) { /* Builds a dictionary from dict_orig and checks dict_lookup() against several inputs */
+       struct dict_entry dict_orig[] = { /* {id, text} pairs; the ids are what the `correct` arrays below refer to */
+               {-1, ""}, /* empty text: _test_dict_single() expects this entry where the input holds a NUL byte */
+               {0, " "},
+               {1, "it"},
+               {2, "it's"},
+               {3, "a"},
+               {4, "beautiful"},
+               {5, "be"},
+               {6, "day"},
+               {7, "d"},
+               {8, "in"},
+               {9, "the"},
+               {10, "t"},
+               {11, "neighborhood"},
+               {12, "neighbor"},
+               {13, "won't"},
+               {14, " you"},
+               {15, "my"},
+               {16, "??"},
+               {17, "item"},
+               {18, "ip"},
+               {19, "\xFF\xFA"},
+               {20, "\xFF\xEE"},
+               {21, "\x80\x12\x34"},
+               {22, "\x80\x32"},
+               {23, "\x80\x32\x34"}
+       };
+       struct dict *dict = dict_build(NULL, dict_orig, sizeof(dict_orig)/sizeof(*dict_orig));
+       
+       {
+               const char *s=NULL, *e=NULL;
+               ok(dict_lookup(dict, &s, e)==NULL && s==NULL && e==NULL, "dict_lookup does nothing and returns null on empty input");
+       }
+       
+       { /* a sentence with an embedded NUL; the expected ids pick the longer entry where one is a prefix of another (e.g. 2 "it's", not 1 "it") */
+               const char str[] = "it's a beautiful day in the neighborhood\0won't you be my neighbor?";
+               int correct[] = {2,0, 3,0, 4,0, 6,0, 8,0, 9,0, 11,-1, 13, 14,0, 5,0, 15,0, 12, -100}; /* -100 = no entry expected here (the final '?') */
+               test_dict_single();
+       }
+       
+       //check equal-length tokens
+       {
+               const char str[] = "it'sitem initip";
+               int correct[] = {2,17,0, 8,1,18};
+               test_dict_single();
+       }
+       
+       //check mostly invalid tokens
+       {
+               const char str[] = "&^&beaumx yo youthx";
+               int correct[] = {-100,-100,-100, 5,3,-100,-100,-100, 0,-100,-100, 14,10,-100,-100};
+               test_dict_single();
+       }
+       
+       //check tokens that start with a character greater than 0x7F
+       {
+               const char str[] = "\x80\x12\x34\x80\x32\x80\x32\x34\xFF\xFA\xFF\xEE";
+               int correct[] = {21, 22, 23, 19, 20};
+               test_dict_single();
+       }
+       
+       talloc_free(dict);
+       
+       //make sure dict_build doesn't blow up on an empty dictionary
+       dict = dict_build(NULL, NULL, 0);
+       talloc_free(dict);
+}
+
+static void test_charflag(void) { /* Checks that each character value satisfies exactly the charflag predicates of its class */
+       char i; /* the character under test; every class macro below reads whichever `i` is in scope */
+       int correct = 0; /* number of characters whose predicates all matched */
+       
+       #define CONTROL do { \
+               if (ccontrol(i) && !cspace(i) && !creturn(i) && !cwhite(i) && \
+                       !cdigit(i) && !cletter(i) && !chex(i) && !csymbol(i) && \
+                       !cextended(i) ) \
+                       correct++; \
+               } while(0) /* control character: only ccontrol() may hold */
+       #define SPACE do { \
+               if (!ccontrol(i) && cspace(i) && !creturn(i) && cwhite(i) && \
+                       !cdigit(i) && !cletter(i) && !chex(i) && !csymbol(i) && \
+                       !cextended(i) ) \
+                       correct++; \
+               } while(0) /* cspace() and cwhite() must hold, nothing else */
+       #define RETURN do { \
+               if (!ccontrol(i) && !cspace(i) && creturn(i) && cwhite(i) && \
+                       !cdigit(i) && !cletter(i) && !chex(i) && !csymbol(i) && \
+                       !cextended(i) ) \
+                       correct++; \
+               } while(0) /* creturn() and cwhite() must hold, nothing else */
+       #define SYMBOL do { \
+               if (!ccontrol(i) && !cspace(i) && !creturn(i) && !cwhite(i) && \
+                       !cdigit(i) && !cletter(i) && !chex(i) && csymbol(i) && \
+                       !cextended(i) ) \
+                       correct++; \
+               } while(0) /* only csymbol() may hold */
+       #define DIGIT do { \
+               if (!ccontrol(i) && !cspace(i) && !creturn(i) && !cwhite(i) && \
+                       cdigit(i) && !cletter(i) && chex(i) && !csymbol(i) && \
+                       !cextended(i) ) \
+                       correct++; \
+               } while(0) /* cdigit() and chex() must hold, nothing else */
+       #define LETTER_HEX do { \
+               if (!ccontrol(i) && !cspace(i) && !creturn(i) && !cwhite(i) && \
+                       !cdigit(i) && cletter(i) && chex(i) && !csymbol(i) && \
+                       !cextended(i) ) \
+                       correct++; \
+               } while(0) /* cletter() and chex() must hold, nothing else */
+       #define LETTER do { \
+               if (!ccontrol(i) && !cspace(i) && !creturn(i) && !cwhite(i) && \
+                       !cdigit(i) && cletter(i) && !chex(i) && !csymbol(i) && \
+                       !cextended(i) ) \
+                       correct++; \
+               } while(0) /* only cletter() may hold */
+       #define EXTENDED do { \
+               if (!ccontrol(i) && !cspace(i) && !creturn(i) && !cwhite(i) && \
+                       !cdigit(i) && !cletter(i) && !chex(i) && !csymbol(i) && \
+                       cextended(i) ) \
+                       correct++; \
+               } while(0) /* only cextended() may hold */
+       
+       for (i=0; i<'\t'; i++) CONTROL; /* the statements down to '\x7F' visit each of the 128 ASCII values exactly once */
+       i = '\t'; SPACE;
+       i = '\n'; RETURN;
+       i = '\v'; SPACE;
+       i = '\f'; SPACE;
+       i = '\r'; RETURN;
+       for (i='\r'+1; i<' '; i++) CONTROL;
+       i = ' '; SPACE;
+       for (i='!'; i<='/'; i++) SYMBOL;
+       for (i='0'; i<='9'; i++) DIGIT;
+       for (i=':'; i<='@'; i++) SYMBOL;
+       for (i='A'; i<='F'; i++) LETTER_HEX;
+       for (i='G'; i<='Z'; i++) LETTER;
+       for (i='['; i<='`'; i++) SYMBOL;
+       for (i='a'; i<='f'; i++) LETTER_HEX;
+       for (i='g'; i<='z'; i++) LETTER;
+       for (i='{'; i<='~'; i++) SYMBOL;
+       i = '\x7F'; CONTROL;
+       
+       ok(correct==128, "ASCII characters have correct charflags");
+       correct = 0;
+       
+       //We do some goofy stuff here to make sure sign extension doesn't cause problems with charflags
+       {
+               unsigned int ui;
+               int si;
+               
+               for (ui=128; ui<=255; ui++) {
+                       i = ui; /* value stored through the outer `char i` */
+                       EXTENDED;
+               }
+               for (si=-128; si<0; si++) {
+                       i = si;
+                       EXTENDED;
+               }
+       }
+       {
+               int i; /* deliberately shadows the outer char: the predicates now receive an int */
+               for (i=-128; i<0; i++) EXTENDED;
+       }
+       {
+               unsigned int i; /* deliberately shadows the outer char: the predicates now receive an unsigned int */
+               for (i=128; i<=255; i++) EXTENDED;
+       }
+       
+       ok(correct==512, "Extended characters have correct charflags"); /* 4 passes x 128 values */
+       
+       #undef CONTROL
+       #undef SPACE
+       #undef RETURN
+       #undef SYMBOL
+       #undef DIGIT
+       #undef LETTER_HEX
+       #undef LETTER
+       #undef EXTENDED
+}
+
+struct readui_test { /* one readui() test case: an input plus the results expected from it */
+       const char *txt; /* input text */
+       size_t txt_size; /* number of bytes of txt handed to readui() */
+       readui_base base; /* base argument passed to readui() */
+       
+       uint64_t correct_integer; /* expected return value */
+       int correct_errno; /* expected errno after the call (errno is cleared to 0 beforehand) */
+       size_t correct_advance; /* expected number of bytes the read position moved forward */
+};
+
+#define T(txt, ...) {txt, sizeof(txt)-1, __VA_ARGS__} /* one readui_test initializer: the literal, its length without the NUL, then base, value, errno, advance */
+#define M (18446744073709551615ULL) /* 2^64-1; the value every ERANGE case below expects */
+
+struct readui_test readui_tests[] = { /* run in order by test_readui() */
+       //Basic reads
+       T("0",READUI_DEC, 0,0,1),
+       T(" \t42  ",READUI_DEC, 42,0,4), /* advance 4: leading blanks and "42" consumed, trailing blanks not */
+       
+       //Different bases
+       T("BADBEEFDEADBAT",READUI_HEX, 0xBADBEEFDEADBAULL,0,13), /* expected to stop at the 'T' */
+       T("7559",READUI_OCT, 0755,0,3), /* expected to stop at the '9' */
+       T("01010010110012",READUI_BIN, 2649,0,13), /* expected to stop at the '2' */
+       T("1000000000",0x7F, 8594754748609397887ULL,0,10), /* the base is given as a plain number here */
+       
+       //Errors
+       T("",READUI_DEC, 0,EINVAL,0),
+       T("18446744073709551616",
+               READUI_DEC,M,ERANGE,20), /* 2^64: one more than M, yet all 20 digits are still expected to be consumed */
+       T("1000000000000000000000000",
+               READUI_DEC,M,ERANGE,25),
+       T("10000000000000000",
+               READUI_HEX,M,ERANGE,17),
+       T("10000000000000000000000000000000000000000000000000000000000000000",
+               READUI_BIN,M,ERANGE,65),
+       T("10000000000",
+               0x7D,M,ERANGE,11),
+       T("9000000000",0x7F, M,ERANGE,10),
+       
+       //Misc
+       T("18446744073709551615",READUI_DEC, M,0,20), /* exactly M: the largest value that must not set ERANGE */
+};
+
+/*
+ * Run one readui() test case: clear errno, parse @test->txt in @test->base,
+ * and emit a single result that passes only if the returned value, the
+ * resulting errno and the number of bytes consumed all match the case.
+ */
+static void test_readui_single(struct readui_test *test) {
+       const char *cursor = test->txt;
+       const char * const end = cursor + test->txt_size;
+       uint64_t value;
+       int err;
+       size_t consumed;
+
+       errno = 0;
+       value = readui(&cursor, end, test->base);
+       err = errno;    /* capture before anything else can overwrite it */
+       consumed = cursor - test->txt;
+
+       ok(value == test->correct_integer &&
+          err == test->correct_errno &&
+          consumed == test->correct_advance,
+          "Testing \"%s\"", test->txt);
+}
+
+/* Run test_readui_single() on every entry of readui_tests[], in order. */
+static void test_readui(void) {
+       struct readui_test *t = readui_tests;
+       struct readui_test * const end =
+               readui_tests + sizeof(readui_tests)/sizeof(*readui_tests);
+
+       for (; t < end; t++)
+               test_readui_single(t);
+}
+
+#undef T
+#undef M
+
+//Checks that the boundaries recorded in *sn (prefix, digits, exponent,
+//suffix) describe a well-formed number of the given type.
+//
+//Exactly one test result is reported per call: fail() describing the first
+//rule that is violated, or pass() with str_pipes followed by msg when every
+//check succeeds.  str_pipes is only used to label the result.
+static void scan_number_sanity_check(const struct scan_number *sn,
+               enum token_type type, const char *str_pipes, const char *msg) {
+       //If there is a prefix, it should follow
+       //the pattern (0 [B X b x]*0..1)
+       if (sn->prefix < sn->digits) {
+               int len = sn->digits - sn->prefix;
+               if (len!=1 && len!=2) {
+                       fail("%s : Prefix length is %d; should be 1 or 2",
+                               str_pipes, len);
+                       return;
+               }
+               if (sn->prefix[0] != '0') {
+                       fail("%s : Prefix does not start with 0",
+                               str_pipes);
+                       return;
+               }
+               if (len==2 && !strchr("BXbx", sn->prefix[1])) {
+                       fail("%s : Prefix is 0%c; should be 0, 0b, or 0x",
+                               str_pipes, sn->prefix[1]);
+                       return;
+               }
+               if (len==1 && type==TOK_FLOATING) {
+                       fail("%s : Octal prefix appears on floating point number",
+                               str_pipes);
+                       return;
+               }
+       } else {
+       //if there is no prefix, the first digit should not be 0
+       //  unless this is a floating point number
+               if (sn->digits < sn->exponent && sn->digits[0]=='0' &&
+                               type==TOK_INTEGER) {
+                       fail("%s : First digit of non-prefix integer is 0",
+                               str_pipes);
+                       return;
+               }
+       }
+       
+       //Make sure sn->digits contains valid digits and is not empty
+       //  (unless prefix is "0")
+       {
+               //s walks the digit span [sn->digits, sn->exponent); it must
+               //reach e for the span to be accepted
+               const char *s = sn->digits, *e = sn->exponent;
+               if (sn->prefix+1 < sn->digits) {
+                       //two-character prefix (0x or 0b)
+                       if (s >= e) {
+                               fail("%s : 0%c not followed by any digits",
+                                       str_pipes, sn->prefix[1]);
+                               return;
+                       }
+                       if (sn->prefix[1] == 'X' || sn->prefix[1] == 'x') {
+                               while (s<e && strchr(
+                                       "0123456789ABCDEFabcdef.", *s)) s++;
+                       } else {
+                               //only the first binary digit is required to be
+                               //0 or 1; the rest may be any decimal digit or '.'
+                               if (s[0]!='0' && s[0]!='1') {
+                                       fail("%s: Binary prefix not followed by a 0 or 1",
+                                               str_pipes);
+                                       return;
+                               }
+                               while (s<e && strchr(
+                                       "0123456789.", *s)) s++;
+                       }
+               } else {
+                       //no prefix, or the one-character octal prefix "0"
+                       if (type==TOK_FLOATING && s >= e) {
+                               fail("%s : sn->digits is empty in a floating point number",
+                                       str_pipes);
+                               return;
+                       }
+                       if (sn->prefix >= sn->digits && s >= e) {
+                               fail("%s : both sn->prefix and sn->digits are empty",
+                                       str_pipes);
+                               return;
+                       }
+                       while (s<e && strchr("0123456789.", *s)) s++;
+               }
+               if (s != e) {
+                       fail("%s : sn->digits is not entirely valid", str_pipes);
+                       return;
+               }
+       }
+       
+       //Make sure exponent follows the rules
+       if (sn->exponent < sn->suffix) {
+               char c = sn->exponent[0];
+               if (type==TOK_INTEGER) {
+                       fail("%s : sn->exponent is not empty in an integer", str_pipes);
+                       return;
+               }
+               //a prefixed number must not use e/E, an unprefixed one
+               //must not use p/P
+               if (sn->prefix < sn->digits && (c=='E' || c=='e')) {
+                       fail("%s : Exponent for hex/binary starts with %c", str_pipes, c);
+                       return;
+               }
+               if (sn->prefix >= sn->digits && (c=='P' || c=='p')) {
+                       fail("%s : Exponent for decimal starts with %c", str_pipes, c);
+                       return;
+               }
+       }
+       
+       pass("%s%s", str_pipes, msg);
+       return;
+}
+
+//Runs scan_number() on one test string and checks every field of the result.
+//
+//str_pipes is the number text with '|' marking the expected boundaries, in
+//order: prefix | digits | exponent | suffix [| end].  Spaces and pipes are
+//stripped before the text is handed to scan_number().  When the fourth pipe
+//is omitted, sn.end is expected to be the end of the string.
+//type is the expected return value of scan_number(); dots_found is the
+//expected value of sn.dots_found.
+//
+//Reports exactly one test result: fail() for the first mismatch, otherwise
+//whatever scan_number_sanity_check() reports.  The working copy of the
+//string is released on every path.
+static void test_scan_number_single(const char *str_pipes,
+                               enum token_type type, size_t dots_found) {
+       char *str = malloc(strlen(str_pipes)+1);
+       const char *expected[5];
+       struct scan_number sn;
+       enum token_type given_type;
+       
+       if (!str) {
+               fail("%s : Out of memory", str_pipes);
+               return;
+       }
+       
+       //Copy str_pipes into str without spaces and pipes, remembering
+       //where each pipe was as the expected boundary pointers
+       {
+               const char *s = str_pipes;
+               char *d = str;
+               size_t pipes = 0;
+               
+               expected[0] = d;
+               for (;*s;s++) {
+                       if (*s == ' ')
+                               continue;
+                       if (*s == '|') {
+                               if (++pipes > 4) {
+                                       fail("Too many pipes in the test string \"%s\"; should be 3 or 4", str_pipes);
+                                       goto done;
+                               }
+                               expected[pipes] = d;
+                       } else
+                               *d++ = *s;
+               }
+               *d = 0;
+               
+               if (pipes < 3) {
+                       fail("Not enough pipes in the test string \"%s\"; should be 3 or 4", str_pipes);
+                       goto done;
+               }
+               if (pipes == 3)
+                       expected[4] = d;
+       }
+       
+       given_type = scan_number(&sn, str, strchr(str,0));
+       
+       if (sn.prefix != expected[0]) {
+               fail("%s : sn.prefix is wrong", str_pipes);
+               goto done;
+       }
+       if (sn.digits != expected[1]) {
+               fail("%s : sn.digits is wrong", str_pipes);
+               goto done;
+       }
+       if (sn.exponent != expected[2]) {
+               fail("%s : sn.exponent is wrong", str_pipes);
+               goto done;
+       }
+       if (sn.suffix != expected[3]) {
+               fail("%s : sn.suffix is wrong", str_pipes);
+               goto done;
+       }
+       if (sn.end != expected[4]) {
+               fail("%s : sn.end is wrong", str_pipes);
+               goto done;
+       }
+       if (given_type != type) {
+               fail("%s : Type incorrect", str_pipes);
+               goto done;
+       }
+       if (sn.dots_found != dots_found) {
+               //dots_found is a size_t, so %d would be a mismatched conversion
+               fail("%s : sn.dots_found is %zu; should be %zu", str_pipes,
+                       (size_t)sn.dots_found, dots_found);
+               goto done;
+       }
+       
+       scan_number_sanity_check(&sn, type, str_pipes, "");
+       
+done:
+       free(str);
+}
+
+#define T(str, type, dots_found) test_scan_number_single(str,type,dots_found)
+
+//Feeds a fixed list of numbers through test_scan_number_single().  Each
+//string uses '|' to mark the expected prefix/digits/exponent/suffix
+//boundaries; the other two fields are the expected token type and the
+//expected number of dots.
+static void test_scan_number(void) {
+       static const struct {
+               const char *str_pipes;
+               enum token_type type;
+               size_t dots_found;
+       } cases[] = {
+               {"0x | 50.1 | p+1 | f", TOK_FLOATING, 1},
+               {"| 100 || L", TOK_INTEGER, 0},
+               {"0 ||| b21", TOK_INTEGER, 0},
+               {"0b | 101 || L", TOK_INTEGER, 0},
+               {"0X | 7Af ||| \t2", TOK_INTEGER, 0},
+               {"0|||b", TOK_INTEGER, 0},
+               {"0|||x", TOK_INTEGER, 0},
+       };
+       size_t idx;
+       
+       for (idx = 0; idx < sizeof(cases)/sizeof(cases[0]); idx++)
+               test_scan_number_single(cases[idx].str_pipes,
+                       cases[idx].type, cases[idx].dots_found);
+}
+
+#undef T
+
+//Helper macros for test_read_integer().  They rely on the locals mq, str,
+//type, sn and integer declared in that function.
+//
+//T(string, value, theBase, theSuffix): initializes mq, scans the literal
+//  with scan_number(), and reports three results: the type is TOK_INTEGER,
+//  scan_number_sanity_check() passes, and read_integer() produced the given
+//  value, base and suffix.
+#define T(string, value, theBase, theSuffix) do { \
+       queue_init(mq, NULL); \
+       str = (string); \
+       type = scan_number(&sn, str, str+sizeof(string)-1); \
+       ok(type==TOK_INTEGER, "%s : type==TOK_INTEGER", str); \
+       scan_number_sanity_check(&sn, type, str, \
+               " : scan_number_sanity_check passed"); \
+       read_integer(&integer, &sn, &mq); \
+       ok(integer.v==(value) && integer.base==(theBase) && \
+               integer.suffix==(theSuffix), \
+               "%s : Correct value and suffix", str); \
+       } while(0)
+//Q(name): if a message is queued, dequeues it and checks that its path is
+//  "tokenize/read_cnumber/<name>".  Reports nothing when the queue is empty.
+#define Q(name) do { \
+       if (queue_count(mq)) { \
+               const char *path = dequeue(mq).path; \
+               ok(!strcmp(path, "tokenize/read_cnumber/" #name), \
+                       "%s : Dequeued %s", str, path); \
+       } \
+       } while(0)
+//E(): checks that no messages remain (dumping them if any do), then frees
+//  the queue.
+#define E() do { \
+       ok(queue_count(mq)==0, "%s : Message queue empty", str); \
+       if (queue_count(mq)) \
+               tok_message_queue_dump(&mq); \
+       queue_free(mq); \
+       } while(0)
+
+//Exercises scan_number() followed by read_integer() on a series of integer
+//literals.  Each T() checks the token type, the scan result and the decoded
+//value/base/suffix; each Q() checks the next queued message; each E()
+//checks that no further messages were produced and frees the queue.
+static void test_read_integer(void) {
+       struct scan_number sn;
+       tok_message_queue mq;
+       const char *str;
+       enum token_type type;
+       struct tok_integer integer;
+       
+       //NOTE(review): base 8 is expected for a 0b literal here -- confirm
+       //this matches the intended behavior of read_integer()
+       T("0b0lu", 0, 8, TOK_UL);
+       E();
+       
+       T("1", 1, 10, TOK_NOSUFFIX);
+       E();
+       
+       T("32Q", 32, 10, TOK_NOSUFFIX);
+       Q(integer_suffix_invalid);
+       E();
+       
+       T("32i", 32, 10, TOK_I);
+       E();
+       
+       T("0755f", 493, 8, TOK_NOSUFFIX);
+       Q(suffix_float_only);
+       E();
+       
+       T("0xDeadBeef", 0xDEADBEEF, 16, TOK_NOSUFFIX);
+       E();
+       
+       T("12345678901234567890$1_LONG.SUFFIX", 12345678901234567890ULL, 10, TOK_NOSUFFIX);
+       //the whole trailing garbage must have been consumed as the suffix
+       ok1(sn.end == strchr(str, 0));
+       Q(integer_suffix_invalid);
+       E();
+       
+       T("0xDEADBEEFlull", 0xDEADBEEF, 16, TOK_NOSUFFIX);
+       Q(integer_suffix_invalid);
+       E();
+       
+       T("0xBALLuu", 0xBA, 16, TOK_NOSUFFIX);
+       Q(integer_suffix_invalid);
+       E();
+       
+       //one digit too many: the expected value is the largest 64-bit
+       //unsigned integer
+       T("123456789012345678901", 18446744073709551615ULL, 10, TOK_NOSUFFIX);
+       Q(integer_out_of_range);
+       E();
+       
+       T("09", 0, 8, TOK_NOSUFFIX);
+       Q(integer_invalid_digits);
+       E();
+}
+
+#undef T
+#undef E
+
+//Helper macros for test_read_floating().  They rely on the locals mq, str,
+//type, sn and floating declared in that function.
+//
+//Teq(string, equals, theSuffix): initializes mq, copies the literal
+//  (including its terminating NUL) into a freshly malloc'd str, scans it,
+//  and reports three results: the type is TOK_FLOATING,
+//  scan_number_sanity_check() passes, and after read_floating() the
+//  expression `equals` holds and the suffix matches.
+//  NOTE(review): the malloc() result is used without a NULL check.
+#define Teq(string, equals, theSuffix) do { \
+       queue_init(mq, NULL); \
+       str = malloc(sizeof(string)); \
+       memcpy(str, string, sizeof(string)); \
+       type = scan_number(&sn, str, str+sizeof(string)-1); \
+       ok(type==TOK_FLOATING, "%s : type==TOK_FLOATING", str); \
+       scan_number_sanity_check(&sn, type, str, \
+               " : scan_number_sanity_check passed"); \
+       read_floating(&floating, &sn, &mq); \
+       ok((equals) && \
+               floating.suffix==(theSuffix), \
+               "%s : Correct value and suffix", str); \
+       } while(0)
+//T(string, value, theSuffix): Teq() with a comparison of floating.v
+//  against value using a fixed absolute tolerance.
+#define T(string, value, theSuffix) \
+       Teq(string, fabsl(floating.v - (value)) <= 0.00000000000000001, theSuffix)
+//E(): checks that no messages remain (dumping them if any do), then frees
+//  the queue and the copy of the literal made by Teq().
+#define E() do { \
+       ok(queue_count(mq)==0, "%s : Message queue empty", str); \
+       if (queue_count(mq)) \
+               tok_message_queue_dump(&mq); \
+       queue_free(mq); \
+       free(str); \
+       } while(0)
+
+//Exercises scan_number() followed by read_floating() on a series of
+//floating point literals.  T()/Teq() check the token type, the scan result
+//and the decoded value/suffix; Q() checks the next queued message; E()
+//checks that no further messages were produced and releases the queue and
+//the string copy.
+static void test_read_floating(void) {
+       struct scan_number sn;
+       tok_message_queue mq;
+       char *str; //str is a malloced copy so read_floating can do its null terminator trick
+       enum token_type type;
+       struct tok_floating floating;
+       
+       T("1.0", 1.0, TOK_NOSUFFIX);
+       E();
+       
+       T("0.0", 0.0, TOK_NOSUFFIX);
+       E();
+       
+       //a leading 0 does not make a floating point number octal
+       T("0755e1", 7550.0, TOK_NOSUFFIX);
+       E();
+       
+       T("0xD.Bp0", 0xD.Bp0, TOK_NOSUFFIX);
+       E();
+       
+       //GCC doesn't throw any errors or warnings for this odd case,
+       //but we call it an error to be consistent with strtold
+       T("0x.p0", 0.0, TOK_NOSUFFIX);
+       Q(floating_invalid_digits);
+       E();
+       
+       T("32.0Q", 32.0, TOK_NOSUFFIX);
+       Q(floating_suffix_invalid);
+       E();
+       
+       T("32.0Li", 32.0, TOK_IMAG_L);
+       E();
+       
+       T("32.0LL", 32.0, TOK_NOSUFFIX);
+       Q(suffix_integer_only);
+       E();
+       
+       Teq("0xDEAD.BEEF", floating.v==0.0, TOK_NOSUFFIX);
+       Q(hex_float_no_exponent);
+       E();
+       
+       T("0b101.0p0", 0, TOK_NOSUFFIX);
+       Q(binary_float);
+       E();
+       
+       /* If any of the following three tests fails, consider increasing
+          the e+ and e- values. */
+       
+       Teq("1.e+4933", isinf(floating.v), TOK_NOSUFFIX);
+       Q(floating_out_of_range);
+       E();
+       
+       /* for some reason, strtold sets errno=EDOM on x86, and
+          on my PowerPC G4 on Fedora 10, the same phenomenon occurs
+          but the exponents are e+309, e-324, and e-325 */
+       Teq("1.e-4951", floating.v==0.0, TOK_NOSUFFIX);
+       Q(floating_out_of_range);
+       E();
+       
+       Teq("1.e-4952", floating.v==0.0, TOK_NOSUFFIX);
+       Q(floating_out_of_range);
+       E();
+       
+}
+
+#undef Teq
+#undef T
+#undef Q
+#undef E
+
+//One tokenizer test case: a source text and the tokens expected from it
+struct tokenizer_test {
+       const char *txt;   //text handed to tokenize()
+       size_t txt_size;   //length of txt in bytes
+       
+       const struct token *tokens;   //expected tokens, in order
+       size_t token_count;           //number of entries in tokens
+};
+
+//Shorthand for writing the test tables that follow.
+//
+//T(txt, ...): initializer for a struct tokenizer_test: the text, its length
+//  without the terminating NUL, and the expected tokens.
+//  NOTE(review): array_count_pair is defined elsewhere; presumably it
+//  yields an array of the given type plus its element count -- confirm.
+#define T(txt, ...) {txt, sizeof(txt)-1, array_count_pair(struct token, __VA_ARGS__)}
+//string(txt): sets the token's string member (item and size) from a literal
+#define string(txt) {.string={.item = (txt), .size = sizeof(txt)-1}}
+//opkw(v): sets the token's opkw member
+#define opkw(v) {.opkw = (v)}
+//txt(t): sets the token's txt and txt_size members from a literal
+#define txt(t) .txt = (t), .txt_size = sizeof(t)-1
+//integer(...) / floating(...): set the token's integer / floating member
+#define integer(...) {.integer={__VA_ARGS__}}
+#define floating(...) {.floating={__VA_ARGS__}}
+//space: a complete TOK_WHITE token holding a single space
+#define space {.type = TOK_WHITE, .txt = " ", .txt_size = 1}
+//startline: a complete TOK_STARTLINE token (no text)
+#define startline {.type = TOK_STARTLINE}
+//include(str): sets the token's include member
+#define include(str) {.include = (str)}
+
+//A tokenizer test case that additionally lists expected messages
+struct tokenizer_msg_test {
+       struct tokenizer_test test;   //text and expected tokens
+       
+       const char * const *messages;   //expected message strings
+       size_t message_count;           //number of entries in messages
+};
+
+//M(...): the messages/message_count pair of a struct tokenizer_msg_test.
+//  NOTE(review): relies on array_count_pair, defined elsewhere -- confirm
+//  it yields the array followed by its element count.
+#define M(...) array_count_pair(const char *, __VA_ARGS__)
+
+//Inputs for test_tokenizer_single(): each entry pairs a source text with the
+//tokens tokenize() is expected to produce for it.  The TOK_STARTLINE token
+//that begins every generated list is not listed here;
+//test_tokenizer_single() skips it before comparing.
+struct tokenizer_test tokenizer_tests[] = {
+       T(""),
+       T("\n",
+               {.type = TOK_WHITE, txt("\n")}
+       ),
+       T("\na",
+               {.type = TOK_WHITE, txt("\n")},
+               startline,
+               {.type = TOK_IDENTIFIER, txt("a")}
+       ),
+       T("int n = c++;",
+               {.type = TOK_KEYWORD,
+                       opkw(INT),
+                       txt("int")
+               }, space,
+               {.type = TOK_IDENTIFIER,
+                       txt("n")
+               }, space,
+               {.type = TOK_OPERATOR,
+                       opkw('='),
+                       txt("=")
+               }, space,
+               {.type = TOK_IDENTIFIER,
+                       txt("c")
+               },
+               {.type = TOK_OPERATOR,
+                       opkw(INC_OP),
+                       txt("++")
+               },
+               {.type = TOK_OPERATOR,
+                       opkw(';'),
+                       txt(";")
+               }
+       ),
+       T(".5 42 ",
+               {.type = TOK_FLOATING,
+                       floating(.5, TOK_NOSUFFIX),
+                       txt(".5")
+               }, space,
+               {.type = TOK_INTEGER,
+                       integer(42, 10, TOK_NOSUFFIX),
+                       txt("42")
+               }, space,
+       ),
+       //Make sure TOK_STRAY doesn't take over the universe
+       T("``AS IS'' AND",
+               {.type = TOK_STRAY,
+                       txt("``")
+               },
+               {.type = TOK_IDENTIFIER,
+                       txt("AS")
+               }, space,
+               {.type = TOK_IDENTIFIER,
+                       txt("IS")
+               },
+               {.type = TOK_CHAR,
+                       string(""),
+                       txt("\'\'")
+               }, space,
+               {.type = TOK_IDENTIFIER,
+                       txt("AND")
+               }
+       ),
+       //Make sure starting with 0 doesn't result in skipping whitespace
+       T("0 .05 0 500",
+               {.type = TOK_INTEGER,
+                       integer(0, 8, TOK_NOSUFFIX),
+                       txt("0")
+               }, space,
+               {.type = TOK_FLOATING,
+                       floating(.05, TOK_NOSUFFIX),
+                       txt(".05")
+               }, space,
+               {.type = TOK_INTEGER,
+                       integer(0, 8, TOK_NOSUFFIX),
+                       txt("0")
+               }, space,
+               {.type = TOK_INTEGER,
+                       integer(500, 10, TOK_NOSUFFIX),
+                       txt("500")
+               }
+       ),
+       //Make sure a simple preprocessor directive works
+       T("\t/*comment*/ #include \"include.h\"\n",
+               {.flags={1,0}, .type=TOK_WHITE, txt("\t")},
+               {.flags={1,0}, .type=TOK_CCOMMENT, txt("/*comment*/")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_LEADING_POUND, txt("#")},
+               {.flags={1,1}, .type=TOK_KEYWORD, opkw(INCLUDE), txt("include")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_STRING_IQUOTE, include("include.h"), txt("\"include.h\"")},
+               {.flags={1,0}, .type=TOK_WHITE, txt("\n")}
+       ),
+       //Make sure __VA_ARGS__ is lexed correctly
+       T("if #define __VA_ARGS__=0X5FULL;\n"
+         " #define __VA_ARGS__(__VA_ARGS__, ...\t)__VA_ARGS__ bar int define",
+               {.type=TOK_KEYWORD, opkw(IF), txt("if")},
+               space,
+               {.type=TOK_OPERATOR, opkw('#'), txt("#")},
+               {.type=TOK_IDENTIFIER, txt("define")},
+               space,
+               {.type=TOK_IDENTIFIER, txt("__VA_ARGS__")},
+               {.type=TOK_OPERATOR, opkw('='), txt("=")},
+               {.type=TOK_INTEGER, integer(0x5F,16,TOK_ULL), txt("0X5FULL")},
+               {.type=TOK_OPERATOR, opkw(';'), txt(";")},
+               {.type=TOK_WHITE, txt("\n")},
+               {.flags={1,0}, .type=TOK_STARTLINE},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_LEADING_POUND, txt("#")},
+               {.flags={1,1}, .type=TOK_KEYWORD, opkw(DEFINE), txt("define")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_IDENTIFIER, txt("__VA_ARGS__")},
+               {.flags={1,0}, .type=TOK_OPERATOR, opkw('('), txt("(")},
+               {.flags={1,0}, .type=TOK_IDENTIFIER, txt("__VA_ARGS__")},
+               {.flags={1,0}, .type=TOK_OPERATOR, opkw(','), txt(",")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_OPERATOR, opkw(ELLIPSIS), txt("...")},
+               {.flags={1,0}, .type=TOK_WHITE, txt("\t")},
+               {.flags={1,0}, .type=TOK_OPERATOR, opkw(')'), txt(")")},
+               {.flags={1,0}, .type=TOK_KEYWORD, opkw(VA_ARGS), txt("__VA_ARGS__")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_IDENTIFIER, txt("bar")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_KEYWORD, opkw(INT), txt("int")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_IDENTIFIER, txt("define")},
+       ),
+       //__VA_ARGS__ is an identifier if no ... operator is in the parameter list or if there is no parameter list
+       T("#define foo __VA_ARGS__ bar int define\n#define foo() __VA_ARGS__ bar int define",
+               {.flags={1,0}, .type=TOK_LEADING_POUND, txt("#")},
+               {.flags={1,1}, .type=TOK_KEYWORD, opkw(DEFINE), txt("define")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_IDENTIFIER, txt("foo")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_IDENTIFIER, txt("__VA_ARGS__")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_IDENTIFIER, txt("bar")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_KEYWORD, opkw(INT), txt("int")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_IDENTIFIER, txt("define")},
+               {.flags={1,0}, .type=TOK_WHITE, txt("\n")},
+               
+               {.flags={1,0}, .type=TOK_STARTLINE},
+               {.flags={1,0}, .type=TOK_LEADING_POUND, txt("#")},
+               {.flags={1,1}, .type=TOK_KEYWORD, opkw(DEFINE), txt("define")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_IDENTIFIER, txt("foo")},
+               {.flags={1,0}, .type=TOK_OPERATOR, opkw('('), txt("(")},
+               {.flags={1,0}, .type=TOK_OPERATOR, opkw(')'), txt(")")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_IDENTIFIER, txt("__VA_ARGS__")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_IDENTIFIER, txt("bar")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_KEYWORD, opkw(INT), txt("int")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_IDENTIFIER, txt("define")}
+       ),
+       
+       //Test various integer suffixes, including invalid ones, which are
+       //expected to come back as TOK_NOSUFFIX
+       T("1 1u 1l 1ul 1lu 1ll 1ull 1llu 1U 1L 1UL 1LU 1LL 1ULL 1LLU "
+         "1uq 1lq 1llq 1ulq 1luq 1f 1i",
+               {.type=TOK_INTEGER, integer(1, 10, TOK_NOSUFFIX), txt("1")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_U), txt("1u")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_L), txt("1l")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_UL), txt("1ul")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_UL), txt("1lu")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_LL), txt("1ll")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_ULL), txt("1ull")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_ULL), txt("1llu")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_U), txt("1U")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_L), txt("1L")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_UL), txt("1UL")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_UL), txt("1LU")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_LL), txt("1LL")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_ULL), txt("1ULL")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_ULL), txt("1LLU")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_NOSUFFIX), txt("1uq")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_NOSUFFIX), txt("1lq")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_NOSUFFIX), txt("1llq")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_NOSUFFIX), txt("1ulq")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_NOSUFFIX), txt("1luq")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_NOSUFFIX), txt("1f")}, space,
+               {.type=TOK_INTEGER, integer(1, 10, TOK_I), txt("1i")}
+       ),
+       //Test non-standard newlines
+       T("\n\r\n \r\n\rint",
+               {.type=TOK_WHITE, txt("\n\r")}, startline,
+               {.type=TOK_WHITE, txt("\n")}, startline,
+               space,
+               {.type=TOK_WHITE, txt("\r\n")}, startline,
+               {.type=TOK_WHITE, txt("\r")}, startline,
+               {.type=TOK_KEYWORD, opkw(INT), txt("int")}
+       ),
+       //Test backslash-broken lines
+       T("oner\\ \nous",
+               {.type=TOK_IDENTIFIER, txt("onerous")}
+       ),
+       T("\\\n\\\n\\\n\\",
+               {.type=TOK_STRAY, txt("\\")}
+       ),
+       T("in\\\nt i\\;\nf\\ \r\nor (i=0; i<10; i++) {\\",
+               {.type=TOK_KEYWORD, opkw(INT), txt("int")}, space,
+               {.type=TOK_IDENTIFIER, txt("i")},
+               {.type=TOK_STRAY, txt("\\")},
+               {.type=TOK_OPERATOR, opkw(';'), txt(";")},
+               {.type=TOK_WHITE, txt("\n")},
+               
+               startline,
+               {.type=TOK_KEYWORD, opkw(FOR), txt("for")}, space,
+               {.type=TOK_OPERATOR, opkw('('), txt("(")},
+               {.type=TOK_IDENTIFIER, txt("i")},
+               {.type=TOK_OPERATOR, opkw('='), txt("=")},
+               {.type=TOK_INTEGER, integer(0,8,0), txt("0")},
+               {.type=TOK_OPERATOR, opkw(';'), txt(";")}, space,
+               {.type=TOK_IDENTIFIER, txt("i")},
+               {.type=TOK_OPERATOR, opkw('<'), txt("<")},
+               {.type=TOK_INTEGER, integer(10,10,0), txt("10")},
+               {.type=TOK_OPERATOR, opkw(';'), txt(";")}, space,
+               {.type=TOK_IDENTIFIER, txt("i")},
+               {.type=TOK_OPERATOR, opkw(INC_OP), txt("++")},
+               {.type=TOK_OPERATOR, opkw(')'), txt(")")}, space,
+               {.type=TOK_OPERATOR, opkw('{'), txt("{")},
+               {.type=TOK_STRAY, txt("\\")}
+       ),
+       //More preprocessor directive tests
+       T("#apple\n#pragma\n#const\n#define \t\n#define foo(x",
+               {.flags={1,0}, .type=TOK_LEADING_POUND, txt("#")},
+               {.flags={1,1}, .type=TOK_IDENTIFIER, txt("apple")},
+               {.flags={1,0}, .type=TOK_WHITE, txt("\n")},
+               
+               {.flags={1,0}, .type=TOK_STARTLINE},
+               {.flags={1,0}, .type=TOK_LEADING_POUND, txt("#")},
+               {.flags={1,1}, .type=TOK_KEYWORD, opkw(PRAGMA), txt("pragma")},
+               {.flags={1,0}, .type=TOK_WHITE, txt("\n")},
+               
+               {.flags={1,0}, .type=TOK_STARTLINE},
+               {.flags={1,0}, .type=TOK_LEADING_POUND, txt("#")},
+               {.flags={1,1}, .type=TOK_IDENTIFIER, txt("const")},
+               {.flags={1,0}, .type=TOK_WHITE, txt("\n")},
+               
+               {.flags={1,0}, .type=TOK_STARTLINE},
+               {.flags={1,0}, .type=TOK_LEADING_POUND, txt("#")},
+               {.flags={1,1}, .type=TOK_KEYWORD, opkw(DEFINE), txt("define")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" \t")},
+               {.flags={1,0}, .type=TOK_WHITE, txt("\n")},
+               
+               {.flags={1,0}, .type=TOK_STARTLINE},
+               {.flags={1,0}, .type=TOK_LEADING_POUND, txt("#")},
+               {.flags={1,1}, .type=TOK_KEYWORD, opkw(DEFINE), txt("define")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_IDENTIFIER, txt("foo")},
+               {.flags={1,0}, .type=TOK_OPERATOR, opkw('('), txt("(")},
+               {.flags={1,0}, .type=TOK_IDENTIFIER, txt("x")}
+       ),
+       T("#define",
+               {.flags={1,0}, .type=TOK_LEADING_POUND, txt("#")},
+               {.flags={1,1}, .type=TOK_KEYWORD, opkw(DEFINE), txt("define")}
+       ),
+       T("#define foo",
+               {.flags={1,0}, .type=TOK_LEADING_POUND, txt("#")},
+               {.flags={1,1}, .type=TOK_KEYWORD, opkw(DEFINE), txt("define")},
+               {.flags={1,0}, .type=TOK_WHITE, txt(" ")},
+               {.flags={1,0}, .type=TOK_IDENTIFIER, txt("foo")}
+       ),
+       //A '#' that is not first on its line is an ordinary operator
+       T("`#define foo",
+               {.type=TOK_STRAY, txt("`")},
+               {.type=TOK_OPERATOR, opkw('#'), txt("#")},
+               {.type=TOK_IDENTIFIER, txt("define")},
+               space,
+               {.type=TOK_IDENTIFIER, txt("foo")}
+       )
+};
+
+//Tokenizer test cases that are also expected to produce messages: each
+//entry gives the text, the expected tokens, and the expected message
+//strings in order.
+struct tokenizer_msg_test tokenizer_msg_tests[] = {
+       {T("/* Unterminated C comment",
+               {.type=TOK_CCOMMENT, txt("/* Unterminated C comment")}
+       ), M(
+               "unterminated_comment"
+       )},
+       //a string containing a newline, then a string with no closing quote
+       {T("\"\n\"\"\n",
+               {.type=TOK_STRING, string("\n"), txt("\"\n\"")},
+               {.type=TOK_STRING, string("\n"), txt("\"\n")}
+       ), M(
+               "read_cstring/quote_newlines",
+               "read_cstring/missing_endquote"
+       )},
+};
+
+//Retire the table-building shorthand so that the short names (txt, string,
+//...) can be used as ordinary identifiers in the code below.
+//NOTE(review): space and startline were defined alongside these but are
+//not undefined here -- confirm whether that is intentional.
+#undef T
+#undef string
+#undef opkw
+#undef txt
+#undef integer
+#undef floating
+#undef M
+#undef include
+
+static void test_tokenizer_single(struct tokenizer_test *t, tok_message_queue *mq) {
+       struct token_list *tl;
+       size_t i, count = t->token_count, gen_count;
+       const struct token *tok_gen, *tok_correct;
+       int success = 1;
+       char *txt = talloc_memdup(NULL, t->txt, t->txt_size);
+       size_t txt_size = t->txt_size;
+       #define failed(fmt, ...) do { \
+               printf("Error: " fmt "\n", ##__VA_ARGS__); \
+               success = 0; \
+               goto done; \
+       } while(0)
+       
+       tl = tokenize(txt, txt_size, mq);
+       
+       if (tl->orig != txt || tl->orig_size != txt_size)
+               failed("tokenize() did not replicate orig/orig_size from arguments");
+       if (!token_list_sanity_check(tl, stdout))
+               failed("Sanity check failed");
+       
+       gen_count = token_list_count(tl);
+       if (gen_count != count+1)
+               failed("Incorrect number of tokens (%zu, should be %zu)\n",
+                       gen_count, count+1);
+       
+       tok_gen = tl->first->next; //skip the beginning TOK_STARTLINE
+       tok_correct = t->tokens;
+       for (i=0; i<count; i++, tok_gen=tok_gen->next, tok_correct++) {
+               if (tok_gen->type != tok_correct->type)
+                       failed("Token \"%s\": Incorrect type", tok_correct->txt);
+               {
+                       struct token_flags g=tok_gen->flags, c=tok_correct->flags;
+                       if (g.pp!=c.pp || g.pp_directive!=c.pp_directive)
+                               failed("Token \"%s\": Incorrect flags", tok_correct->txt);
+               }
+               switch (tok_gen->type) {
+                       case TOK_INTEGER:
+                               if (tok_gen->integer.v != tok_correct->integer.v ||
+                                   tok_gen->integer.base != tok_correct->integer.base ||
+                                   tok_gen->integer.suffix != tok_correct->integer.suffix)
+                                       failed("Token \"%s\": Integer value/base/suffix incorrect", tok_correct->txt);;
+                               break;
+                       case TOK_FLOATING:
+                               if (fabsl(tok_gen->floating.v - tok_correct->floating.v) > 0.00000000000000001 ||
+                                   tok_gen->floating.suffix != tok_correct->floating.suffix)
+                                       failed("Token \"%s\": Floating point value/suffix incorrect", tok_correct->txt);
+                               break;
+                       case TOK_OPERATOR:
+                               if (tok_gen->opkw != tok_correct->opkw)
+                                       failed("Token \"%s\": Operator opkw incorrect", tok_correct->txt);
+                               break;
+                       case TOK_KEYWORD:
+                               if (tok_gen->opkw != tok_correct->opkw)
+                                       failed("Token \"%s\": Keyword opkw incorrect", tok_correct->txt);
+                               break;
+                       case TOK_CHAR:
+                       case TOK_STRING:
+                               //anything using string
+                               if (tok_gen->string.size != tok_correct->string.size ||
+                                       memcmp(tok_gen->string.item, tok_correct->string.item,
+                                       tok_gen->string.size) ||
+                                       tok_gen->string.item[tok_gen->string.size] != 0 )
+                                       failed("Token \"%s\": String value incorrect", tok_correct->txt);
+                               break;
+                       case TOK_STRING_IQUOTE:
+                       case TOK_STRING_IANGLE:
+                               if (strcmp(tok_gen->include, tok_correct->include))
+                                       failed("Token \"%s\": #include string incorrect", tok_correct->txt);
+                               break;
+                       case TOK_IDENTIFIER:
+                       case TOK_CCOMMENT:
+                       case TOK_CPPCOMMENT:
+                       case TOK_WHITE:
+                       case TOK_STARTLINE:
+                       case TOK_STRAY:
+                               break;
+               }
+               if (tok_gen->type!=TOK_STARTLINE && (
+                       tok_gen->txt_size != tok_correct->txt_size ||
+                       memcmp(tok_gen->txt, tok_correct->txt, tok_gen->txt_size))
+                       )
+                       failed("Token \"%s\": txt incorrect", tok_correct->txt);
+       }
+       
+       #undef failed
+done:
+       ok(success==1, "Tokenize %s", t->txt);
+       
+       if (!success)
+               token_list_dump(tl, stdout);
+       
+       talloc_free(txt);
+}
+
+//Reads the whole of file_name into memory, tokenizes it and runs
+//token_list_sanity_check() on the result.
+//
+//Reports exactly one test result: fail() if the file cannot be opened or
+//read or if the sanity check fails, otherwise pass() with the token count.
+//Messages generated by tokenize() are left in *mq.
+static void test_tokenizer_file(const char *file_name, tok_message_queue *mq) {
+       FILE *fp = fopen(file_name, "rb");
+       array_char buf = array_new(NULL);
+       const size_t chunk = 1024;
+       struct token_list *tokens;
+       size_t got;
+       
+       if (fp == NULL) {
+               fail("Could not read file '%s': %s", file_name, strerror(errno));
+               goto cleanup;
+       }
+       
+       //Grow the buffer one chunk at a time (plus room for a terminator)
+       //until a short read signals end of file or an error
+       do {
+               array_realloc(buf, buf.size+chunk+1);
+               got = fread(buf.item+buf.size, 1, chunk, fp);
+               buf.size += got;
+               buf.item[buf.size] = 0;
+       } while (got >= chunk);
+       
+       if (ferror(fp)) {
+               fail("Error reading file '%s': %s", file_name, strerror(errno));
+               goto cleanup;
+       }
+       
+       tokens = tokenize(buf.item, buf.size, mq);
+       tokens->filename = file_name;
+       
+       //Debugging aids:
+       //printf("File '%s' has %zu tokens\n", file_name, token_list_count(tokens));
+       //token_list_dump(tokens, stdout);
+       
+       if (!token_list_sanity_check(tokens, stdout)) {
+               fail("Sanity check failed for file '%s'", file_name);
+               goto cleanup;
+       }
+       
+       pass("File '%s' has %zu tokens", file_name, token_list_count(tokens));
+       
+       /*while (queue_count(*mq)) {
+               struct tok_message msg = dequeue(*mq);
+               tok_message_print(&msg, tokens);
+       }*/
+       
+cleanup:
+       array_free(buf);
+       if (fp != NULL)
+               fclose(fp);
+}
+
+/*
+ * Run all tokenizer test cases.
+ *
+ * Three phases share one message queue:
+ *   1. Every entry of tokenizer_tests is run through
+ *      test_tokenizer_single(); any messages it produced are discarded.
+ *   2. Every entry of tokenizer_msg_tests is run the same way, and then
+ *      the queued messages are compared against the expected list: the
+ *      count must equal message_count, each msg.path must start with
+ *      "tokenize/", and the remainder must equal messages[j].  One extra
+ *      fail()/pass() is reported per entry.
+ *   3. run.c itself is tokenized via test_tokenizer_file(), and any
+ *      message with level >= TM_WARN is dumped and makes the final ok()
+ *      fail.
+ *
+ * The queue is drained after every message test, including when a
+ * comparison fails part-way through, so leftover messages cannot skew
+ * the count of the next test case or be attributed to run.c.
+ */
+static void test_tokenizer(void) {
+       tok_message_queue mq;
+       size_t i, count;
+       int has_warn_or_worse = 0;
+       
+       queue_init(mq, NULL);
+       
+       /* Phase 1: token-only tests; messages are not checked. */
+       count = sizeof(tokenizer_tests)/sizeof(*tokenizer_tests);
+       for (i=0; i<count; i++) {
+               test_tokenizer_single(tokenizer_tests+i, &mq);
+               while (queue_count(mq)) {
+                       struct tok_message msg = dequeue(mq);
+                       (void) msg;
+                       //tok_message_dump(&msg);
+               }
+       }
+       
+       /* Phase 2: tests that also verify the emitted messages. */
+       count = sizeof(tokenizer_msg_tests)/sizeof(*tokenizer_msg_tests);
+       for (i=0; i<count; i++) {
+               size_t j;
+               test_tokenizer_single(&tokenizer_msg_tests[i].test, &mq);
+               
+               if (queue_count(mq) != tokenizer_msg_tests[i].message_count) {
+                       fail("Incorrect number of messages from tokenize()");
+                       goto msg_done;
+               }
+               
+               /* The counts match, so j stays below message_count here. */
+               for (j=0; queue_count(mq); j++) {
+                       struct tok_message msg = dequeue(mq);
+                       const char *base = "tokenize/";
+                       size_t baselen = strlen(base);
+                       //tok_message_dump(&msg);
+                       
+                       if (strncmp(msg.path, base, baselen)) {
+                               fail("Message from tokenize() doesn't start with \"%s\"",
+                                       base);
+                               goto msg_done;
+                       }
+                       if (strcmp(msg.path+baselen,
+                                       tokenizer_msg_tests[i].messages[j])) {
+                               fail("Incorrect message %s, should be %s",
+                                       msg.path+baselen, tokenizer_msg_tests[i].messages[j]);
+                               goto msg_done;
+                       }
+               }
+               
+               pass("Messages from tokenize() are correct");
+       msg_done:
+               /* Discard whatever is still queued.  This is a no-op after a
+                * pass, but after a failure it keeps stale messages out of
+                * the next iteration and out of the run.c check below. */
+               while (queue_count(mq))
+                       (void) dequeue(mq);
+       }
+       
+       /* Phase 3: tokenize this test's own source file. */
+       test_tokenizer_file("ccan/ccan_tokenizer/test/run.c", &mq);
+       
+       while (queue_count(mq)) {
+               struct tok_message msg = dequeue(mq);
+               if (msg.level >= TM_WARN) {
+                       has_warn_or_worse = 1;
+                       tok_message_dump(&msg);
+               }
+               //else tok_message_dump(&msg);
+       }
+       
+       ok(has_warn_or_worse==0, "Tokenizing run.c generated%s warnings, errors, or bugs",
+               has_warn_or_worse ? "" : " no");
+       
+       queue_free(mq);
+}
+
+#include <unistd.h>
+/*
+ * Test driver entry point.
+ *
+ * Declares the test plan, then runs each test group in order, printing a
+ * diag() banner before each one.  The process exit code is whatever
+ * exit_status() returns.
+ */
+int main(void)
+{
+       plan_tests(195); /* NOTE(review): presumably must equal the total number of results reported by the groups below; confirm when adding or removing tests */
+       
+       diag("* Checking queue...");
+       test_queue();
+       
+       diag("* Checking read_cstring...");
+       test_read_cstring();
+       
+       diag("* Checking dict...");
+       test_dict();
+       
+       diag("* Checking charflag...");
+       test_charflag();
+       
+       diag("* Checking readui...");
+       test_readui();
+       
+       diag("* Checking scan_number...");
+       test_scan_number();
+       
+       diag("* Checking read_integer...");
+       test_read_integer();
+       
+       diag("* Checking read_floating...");
+       test_read_floating();
+       
+       diag("* Checking tokenizer...");
+       test_tokenizer();
+       
+       /* This exits depending on whether all tests passed */
+       return exit_status();
+}
diff --git a/ccan/ccan_tokenizer/todo b/ccan/ccan_tokenizer/todo
new file mode 100644 (file)
index 0000000..76a85b1
--- /dev/null
@@ -0,0 +1,172 @@
+Update that simple tokenizer compulsory test so things will compile
+
+
+
+(done) Fix #include <stdio.h> to read include directive correctly
+
+txt/orig state of affairs:
+
+The problem is that there are two ways to interpret line,col:
+       With respect to txt
+       With respect to orig
+
+This isn't a problem when txt and orig point to the same character, as in:
+
+int in\
+dex
+int \
+index /*Here, the backslash break should be gobbled up by the space identifier*/
+
+line,col has no ambiguity as to where it should point.  However, when they point to different characters (i.e. at the beginning of a line):
+
+\
+int index
+
+line,col could either point to orig or to the first real character.  Thus, we will do the latter.
+
+Moreover, will a newline followed by backslash breaks generate a token that gobbles up said breaks?  I believe it will, but no need to call this mandatory.
+
+Thus, on a lookup with a txt pointer, the line/col/orig should match the real character and not preceding backslash breaks.
+
+
+I've been assuming that every token starts with its first character, neglecting the case where a line starts with backslash breaks.  The question is, given the txt pointer to the first character, where should the derived orig land?
+
+Currently, the orig lands after the beginning backslash breaks, when instead it should probably land before them.
+
+Here's what the tokenizer's text anchoring needs:
+       Broken/unbroken text pointer -> line/col
+       Unbroken contents per token to identify identifier text
+       Original contents per token to rebuild the document
+       Ability to change "original contents" so the document will be saved with modifications
+       Ability to insert new tokens
+
+Solution:
+       New tokens will typically have identical txt and orig, yea even the same pointer.
+       txt/txt_size for unbroken contents, orig/orig_size for original
+       modify orig to change the document
+       txt identifies identifier text
+       Line lookup tables are used to resolve txt/orig pointers; other pointers can't be resolved in the same fashion and may require traversing backward through the list.
+
+What this means:
+       Token txt/txt_size, orig/orig_size, orig_lines, txt_lines, and tok_point_lookup are all still correct.
+       Token line,col will be removed
+       
+Other improvements to do:
+       Sanity check the point lookups like crazy
+       Remove the array() structures in token_list, as these are supposed to be read-only
+
+Make sure tok_point_lookup returns correct values for every single pointer possible, particularly those in orig that are on backslash-breaks
+
+Convert the tok_message_queue into an array of messages bound to tokens.
+
+Ask Rusty about the trailing newline in this case:
+
+/* Blah
+ * 
+ * blah
+ */
+
+Here, rather than the trailing space being blank, it is "blank" from the comment perspective.
+May require deeper analysis.
+
+Todos from ccan_tokenizer.h
+/*
+Assumption:  Every token fits in one and exactly one line
+Counterexamples:
+       Backslash-broken lines
+       Multiline comments
+
+Checks to implement in the tokenizer:
+
+is the $ character used in an identifier (some configurations of GCC allow this)
+are there potentially ambiguous sequences used in a string literal (e.g. "\0000")
+Are there stray characters?  (e.g. '\0', '@', '\b')
+Are there trailing spaces at the end of lines (unless said spaces consume the entire line)?
+       Are there trailing spaces after a backslash-broken line?
+
+
+Fixes todo:
+
+backslash-newline sequence should register as an empty character, and the tokenizer's line value should be incremented accordingly.
+*/
+
+Lex angle bracket strings in #include
+
+Check the rules in the documentation
+
+Examine the message queue as part of testing the tokenizer:
+       Make sure there are no bug messages
+       Make sure files compile with no warnings
+For the tokenizer sanity check, make sure integers and floats have valid suffixes respectively
+       (e.g. no TOK_F for an integer, no TOK_ULL for a floating)
+
+Update the scan_number sanity checks
+(done) Move scan_number et al. to a separate C file
+
+Test:
+       Overflow and underflow floats
+       0x.p0
+       (done) 0755f //octal 0755 with invalid suffix
+       (done) 0755e1 //floating 7550
+
+Figure out how keywords will be handled.
+       Preprocessor directives are <strike>case-insensitive</strike> actually case-sensitive (except __VA_ARGS__)
+       All C keywords are case sensitive
+       __VA_ARGS__ should be read as an identifier unless it's in the expansion of a macro.  Otherwise, GCC generates a warning.
+               We are in the expansion of a macro after <startline> <space> # <space> 
+       Don't forget about __attribute__
+       Except for __VA_ARGS__, all preprocessor keywords are preceded by <startline> <space> # <space>
+
+Solution:
+       All the words themselves will go into one opkw dictionary, and for both type and opkw, no distinction will be made between preprocessor and normal keywords.
+       Instead, int type will become short type; unsigned short cpp:1;
+
+Merge
+Commit ccan_tokenizer to the ccan repo
+Introduce ccan_tokenizer to ccanlint
+
+Write testcases for scanning all available operators
+Support integer and floating point suffixes (e.g. 500UL, 0.5f)
+Examine the message queue after tokenizing
+Make sure single-character operators have an opkw < 128
+Make sure c_dictionary has no duplicate entries
+Write verifiers for other types than TOK_WHITE
+
+What's been done:
+
+Operator table has been organized
+Merged Rusty's changes
+Fixed if -> while in finalize
+Fixed a couple mistakes in run-simple-token.c testcases themselves
+       Expected orig/orig_size sizes weren't right
+Made token_list_sanity_check a public function and used it throughout run-simple-token.c
+Tests succeed and pass valgrind
+
+Lines/columns of every token are recorded
+
+(done) Fix "0\nstatic"
+(done) Write tests to make sure backslash-broken lines have correct token locations.
+(done) Correctly handle backslash-broken lines
+       One plan:  Separate the scanning code from the reading code.  Scanning sends valid ranges to reading, and reading fills valid tokens for the tokenizer/scanner to properly add
+       Another plan:  Un-break backslash-broken lines into another copy of the input.  Create an array of the positions of each real line break so that a position in the unbroken text can be mapped back to its real line number and offset.
+Annotate message queue messages with current token
+
+Conversion to make:
+       From:
+               Position in unbroken text
+       To:
+               Real line number
+               Real offset from start of line
+
+Thus, we want an array of real line start locations wrt the unbroken text
+
+Here is a bro\
+ken line.  Here is a
+real line.
+
+<LINE>Here is a bro<LINE>ken line.  Here is a
+<LINE>real line.
+
+If we know the position of the token text wrt the unbroken text, we can look up the real line number and offset using only the array of real line start positions within the unbroken text.
+
+Because all we need is the orig and orig_size with respect to the unbroken text to orient 
\ No newline at end of file