X-Git-Url: https://git.ozlabs.org/?p=ccan;a=blobdiff_plain;f=ccan%2Fccan_tokenizer%2Fccan_tokenizer.h;fp=ccan%2Fccan_tokenizer%2Fccan_tokenizer.h;h=7634501f84f455774364dfe9aabb0961c0f087a7;hp=0000000000000000000000000000000000000000;hb=69cc1b45b4921c0be738902fe0d5225f135e2aae;hpb=46b1a03e21303e03b68de213b41c0840767fbc96 diff --git a/ccan/ccan_tokenizer/ccan_tokenizer.h b/ccan/ccan_tokenizer/ccan_tokenizer.h new file mode 100644 index 00000000..7634501f --- /dev/null +++ b/ccan/ccan_tokenizer/ccan_tokenizer.h @@ -0,0 +1,307 @@ +/* + Copyright (c) 2009 Joseph A. Adams + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. The name of the author may not be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef CCAN_TOKENIZER_H +#define CCAN_TOKENIZER_H + +#include +#include "charflag.h" +#include "dict.h" +#include "queue.h" +#include +#include //for readui + +/* Definition of tokens and the token list */ + +enum token_type { + TOK_INTEGER, //integer (e.g. 5, 1000L, 0x5) + TOK_FLOATING, //floating point number (e.g. 5.0, 7.0f, etc.) + TOK_OPERATOR, //operator (e.g. +, -, (, ), ++, etc.) + TOK_KEYWORD, //keyword (e.g. char, _Bool, ifdef) + TOK_IDENTIFIER, //identifier or unprocessed keyword (e.g. int, token, pp_conditions) + TOK_CHAR, //character literal (e.g. 'a' or even '1234') + TOK_STRING, //string literal (e.g. "hello" or "zero\0inside") + TOK_LEADING_POUND, //leading # in a preprocessor directive (e.g. # include) + TOK_STRING_IQUOTE, // "config.h" + TOK_STRING_IANGLE, // + + #define token_type_is_ignored(type) ((type)>=TOK_CCOMMENT && (type)<=TOK_WHITE) + #define token_type_is_comment(type) ((type)>=TOK_CCOMMENT && (type)<=TOK_CPPCOMMENT) + TOK_CCOMMENT, //C comment (e.g. /* comment */) + TOK_CPPCOMMENT, //C++ comment (e.g. //comment ) + TOK_WHITE, //whitespace (span of \t\n\v\f\r and space) + TOK_STARTLINE, //beginning of line (txt/txtsize is always empty) + TOK_STRAY, //control characters, weird characters, and extended characters where they shouldn't be +}; + +enum tok_suffix { + TOK_NOSUFFIX = 0, + + TOK_U = 1, //unsigned + TOK_L = 2, //long or double-precision float + TOK_LL = 4, //long long (note that TOK_L and TOK_LL are mutually exclusive) + TOK_F = 8, //float (single-precision) + TOK_I = 16, //imaginary + + TOK_UL = TOK_U | TOK_L, //unsigned long + TOK_ULL = TOK_U | TOK_LL, //unsigned long long + + //Imaginary combo meals + TOK_IMAG_U = TOK_I | TOK_U, + TOK_IMAG_L = TOK_I | TOK_L, + TOK_IMAG_LL = TOK_I | TOK_LL, + TOK_IMAG_F = TOK_I | TOK_F, + + TOK_IMAG_UL = TOK_I | TOK_UL, + TOK_IMAG_ULL = TOK_I | TOK_ULL, +}; + +struct tok_integer { + uint64_t v; + int base; //one of 2, 8, 10, or 16 + enum tok_suffix suffix; +}; + +struct tok_floating { + long double v; + enum tok_suffix suffix; +}; + +//Operator/keyword naming conventions taken from Jeff Lee's Yacc grammar: +//http://www.lysator.liu.se/c/ANSI-C-grammar-y.html +enum tok_opkw { + /* Permute these regularly */ + PTR_OP=128, INC_OP, DEC_OP, LEFT_OP, RIGHT_OP, LE_OP, GE_OP, EQ_OP, NE_OP, + AND_OP, OR_OP, + MUL_ASSIGN, DIV_ASSIGN, MOD_ASSIGN, + ADD_ASSIGN, SUB_ASSIGN, + AND_ASSIGN, XOR_ASSIGN, OR_ASSIGN, + LEFT_ASSIGN, RIGHT_ASSIGN, + ELLIPSIS, + DOUBLE_POUND, + + //Keywords + _BOOL, + _COMPLEX, + _IMAGINARY, + BREAK, + CASE, + CHAR, + CONST, + CONTINUE, + DEFAULT, + DO, + DOUBLE, + ELSE, + ENUM, + EXTERN, + FLOAT, + FOR, + GOTO, + IF, + INLINE, + INT, + LONG, + REGISTER, + RESTRICT, + RETURN, + SHORT, + SIGNED, + SIZEOF, + STATIC, + STRUCT, + SWITCH, + TYPEDEF, + UNION, + UNSIGNED, + VOID, + VOLATILE, + WHILE, + + //Preprocessor keywords (except those already defined) + VA_ARGS, + #define opkw_is_directive_only(opkw) ((opkw)>=DEFINE && (opkw)<=WARNING) + #define opkw_is_directive(opkw) (opkw_is_directive_only(opkw) || (opkw)==ELSE || (opkw)==IF) + DEFINE, + ELIF, + //ELSE, + ENDIF, + ERROR, + //IF, + IFDEF, + IFNDEF, + INCLUDE, + LINE, + PRAGMA, + UNDEF, + WARNING, /* gcc extension */ +}; + +struct token_flags { + unsigned short + pp:1, //is token part of a preprocessor line + pp_directive:1; //does token follow a TOK_LEADING_POUND (e.g. # include) +}; + +struct token { + struct token *prev, *next; + + struct token_flags flags; + short type; //enum token_type + union { + struct tok_integer integer; + struct tok_floating floating; + int opkw; //operator or keyword ID (e.g. '+', INC_OP (++), ADD_ASSIGN (+=)) + array_char string; //applies to TOK_CHAR and TOK_STRING + char *include; //applies to TOK_STRING_IQUOTE and TOK_STRING_IANGLE + }; + + //text this token represents (with backslash-broken lines merged) + const char *txt; + size_t txt_size; + + //text this token represents (untouched) + const char *orig; + size_t orig_size; + + //zero-based line and column number of this token + size_t line, col; +}; + +static inline int token_is_ignored(const struct token *tok) { + return token_type_is_ignored(tok->type); +} + +static inline int token_is_op(const struct token *tok, int opkw) { + return tok->type==TOK_OPERATOR && tok->opkw==opkw; +} + +static inline int token_is_kw(const struct token *tok, int opkw) { + return tok->type==TOK_KEYWORD && tok->opkw==opkw; +} + +struct token_list { + struct token *first, *last; + + //Points to original input as given + const char *orig; + size_t orig_size; + + //position of the start of each real line with respect to orig + const char * const *olines; + size_t olines_size; + + //Copy of original input without backslash-broken lines + const char *txt; + size_t txt_size; + + //position of the start of each real line with respect to txt + const char * const *tlines; + size_t tlines_size; + + //Set me so tok_message_print will know what file name to display + const char *filename; +}; + +extern struct dict *tokenizer_dict; + +typedef queue(struct tok_message) tok_message_queue; + +//the token_list is allocated as a child of orig +struct token_list *tokenize(const char *orig, size_t orig_size, tok_message_queue *mq); + +size_t token_list_count(const struct token_list *tl); + +//used for debugging +int token_list_sanity_check(const struct token_list *tl, FILE *err); +void token_list_dump(const struct token_list *tl, FILE *f); + +/* tok_point_lookup is used to locate a pointer that is within a token list's + txt or orig fields */ + +struct tok_point { + const char *txt, *orig; + size_t line, col; +}; + +//returns nonzero if the pointer could be resolved +int tok_point_lookup(struct tok_point *out, const char *ptr, + const struct token_list *tl); + + +/* Tokenizer message queue; used to gather and report warnings, errors, etc. */ + +enum tok_message_level {TM_DEBUG, TM_INFO, TM_WARN, TM_ERROR, TM_BUG}; + +struct tok_message { + enum tok_message_level level; + const char *path; + //Unique slash-delimited name of the message + //e.g. tokenize/read_cstring/ambiguous_octal + const char *message; + //Human-readable description + //e.g. `Octal \007 followed by digit` + const char *location; + //Pointer (typically within the token list's txt or orig) of the error +}; + +#define tok_msg_debug(name, loc, fmt, ...) tok_message_add(mq, TM_DEBUG, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__) +#define tok_msg_info(name, loc, fmt, ...) tok_message_add(mq, TM_INFO, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__) +#define tok_msg_warn(name, loc, fmt, ...) tok_message_add(mq, TM_WARN, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__) +#define tok_msg_error(name, loc, fmt, ...) tok_message_add(mq, TM_ERROR, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__) +#define tok_msg_bug(name, loc, fmt, ...) tok_message_add(mq, TM_BUG, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__) + +void tok_message_add(tok_message_queue *mq, enum tok_message_level level, + const char *path, const char *loc, const char *fmt, ...); + +void tok_message_print(struct tok_message *m, struct token_list *tl); + +void tok_message_dump(struct tok_message *m); +void tok_message_queue_dump(const tok_message_queue *mq); + + +/* Miscellaneous internal components */ + +char *read_cstring(array_char *out, const char *s, const char *e, char quoteChar, tok_message_queue *mq); +char *read_cnumber(struct token *tok, const char *s, const char *e, tok_message_queue *mq); + + +typedef unsigned int readui_base; + +#define READUI_ALLOWHIGHERDIGITS 256 +#define READUI_ALLOWCAPLETTERS 512 +#define READUI_ALLOWLCASELETTERS 1024 +#define READUI_ALLOWLETTERS (READUI_ALLOWCAPLETTERS | READUI_ALLOWLCASELETTERS) + +#define READUI_DEC ((readui_base)(10)) +#define READUI_HEX ((readui_base)(16 | READUI_ALLOWLETTERS)) +#define READUI_OCT ((readui_base)(8)) +#define READUI_BIN ((readui_base)(2)) + +uint64_t readui(const char **sp, const char *e, readui_base base); + +#endif