2 Copyright (c) 2009 Joseph A. Adams
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
8 1. Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10 2. Redistributions in binary form must reproduce the above copyright
11 notice, this list of conditions and the following disclaimer in the
12 documentation and/or other materials provided with the distribution.
13 3. The name of the author may not be used to endorse or promote products
14 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #ifndef CCAN_TOKENIZER_H
29 #define CCAN_TOKENIZER_H
31 #include <ccan/array/array.h>
36 #include <errno.h> //for readui
38 /* Definition of tokens and the token list */
41 TOK_INTEGER, //integer (e.g. 5, 1000L, 0x5)
42 TOK_FLOATING, //floating point number (e.g. 5.0, 7.0f, etc.)
43 TOK_OPERATOR, //operator (e.g. +, -, (, ), ++, etc.)
44 TOK_KEYWORD, //keyword (e.g. char, _Bool, ifdef)
45 TOK_IDENTIFIER, //identifier or unprocessed keyword (e.g. int, token, pp_conditions)
46 TOK_CHAR, //character literal (e.g. 'a' or even '1234')
47 TOK_STRING, //string literal (e.g. "hello" or "zero\0inside")
48 TOK_LEADING_POUND, //leading # in a preprocessor directive (e.g. # include)
49 TOK_STRING_IQUOTE, // "config.h"
50 TOK_STRING_IANGLE, // <stdio.h>
//Range tests over token type values.
//Both depend on TOK_CCOMMENT, TOK_CPPCOMMENT and TOK_WHITE being declared
//consecutively, in that order, in the enumerator list that follows:
//  ignored = TOK_CCOMMENT..TOK_WHITE, comment = TOK_CCOMMENT..TOK_CPPCOMMENT.
//The argument appears twice in each expansion, so avoid side effects in it.
52 #define token_type_is_ignored(type) ((type)>=TOK_CCOMMENT && (type)<=TOK_WHITE)
53 #define token_type_is_comment(type) ((type)>=TOK_CCOMMENT && (type)<=TOK_CPPCOMMENT)
54 TOK_CCOMMENT, //C comment (e.g. /* comment */)
55 TOK_CPPCOMMENT, //C++ comment (e.g. //comment )
56 TOK_WHITE, //whitespace (span of \t\n\v\f\r and space)
57 TOK_STARTLINE, //beginning of line (txt/txtsize is always empty)
58 TOK_STRAY, //control characters, weird characters, and extended characters where they shouldn't be
65 TOK_L = 2, //long or double-precision float
66 TOK_LL = 4, //long long (note that TOK_L and TOK_LL are mutually exclusive)
67 TOK_F = 8, //float (single-precision)
68 TOK_I = 16, //imaginary
70 TOK_UL = TOK_U | TOK_L, //unsigned long
71 TOK_ULL = TOK_U | TOK_LL, //unsigned long long
73 //Imaginary combo meals
74 TOK_IMAG_U = TOK_I | TOK_U,
75 TOK_IMAG_L = TOK_I | TOK_L,
76 TOK_IMAG_LL = TOK_I | TOK_LL,
77 TOK_IMAG_F = TOK_I | TOK_F,
79 TOK_IMAG_UL = TOK_I | TOK_UL,
80 TOK_IMAG_ULL = TOK_I | TOK_ULL,
85 int base; //one of 2, 8, 10, or 16
86 enum tok_suffix suffix;
91 enum tok_suffix suffix;
94 //Operator/keyword naming conventions taken from Jeff Lee's Yacc grammar:
95 //http://www.lysator.liu.se/c/ANSI-C-grammar-y.html
97 /* Permute these regularly */
98 PTR_OP=128, INC_OP, DEC_OP, LEFT_OP, RIGHT_OP, LE_OP, GE_OP, EQ_OP, NE_OP,
100 MUL_ASSIGN, DIV_ASSIGN, MOD_ASSIGN,
101 ADD_ASSIGN, SUB_ASSIGN,
102 AND_ASSIGN, XOR_ASSIGN, OR_ASSIGN,
103 LEFT_ASSIGN, RIGHT_ASSIGN,
145 //Preprocessor keywords (except those already defined)
//opkw_is_directive_only: nonzero when opkw lies in the enumerator range
//DEFINE..WARNING (inclusive), so those enumerators must stay contiguous.
//opkw_is_directive: same test, but also accepts ELSE and IF, which are
//checked individually (presumably because they are declared outside that
//range -- confirm against the full enumerator list).
//The argument appears several times in the expansions; avoid side effects.
147 #define opkw_is_directive_only(opkw) ((opkw)>=DEFINE && (opkw)<=WARNING)
148 #define opkw_is_directive(opkw) (opkw_is_directive_only(opkw) || (opkw)==ELSE || (opkw)==IF)
161 WARNING, /* gcc extension */
166 pp:1, //is token part of a preprocessor line
167 pp_directive:1; //does token follow a TOK_LEADING_POUND (e.g. # include)
171 struct token *prev, *next;
173 struct token_flags flags;
174 short type; //enum token_type
176 struct tok_integer integer;
177 struct tok_floating floating;
178 int opkw; //operator or keyword ID (e.g. '+', INC_OP (++), ADD_ASSIGN (+=))
179 array_char string; //applies to TOK_CHAR and TOK_STRING
180 char *include; //applies to TOK_STRING_IQUOTE and TOK_STRING_IANGLE
183 //text this token represents (with backslash-broken lines merged)
187 //text this token represents (untouched)
191 //zero-based line and column number of this token
//Returns nonzero if tok's type is in the ignored range tested by
//token_type_is_ignored (TOK_CCOMMENT through TOK_WHITE).
//tok is dereferenced unconditionally, so it must not be NULL.
195 static inline int token_is_ignored(const struct token *tok) {
196 return token_type_is_ignored(tok->type);
//Returns nonzero if tok is a TOK_OPERATOR token whose opkw field equals
//the given operator ID (e.g. '+' or INC_OP).
//tok is dereferenced unconditionally, so it must not be NULL.
199 static inline int token_is_op(const struct token *tok, int opkw) {
200 return tok->type==TOK_OPERATOR && tok->opkw==opkw;
//Returns nonzero if tok is a TOK_KEYWORD token whose opkw field equals
//the given keyword ID.
//tok is dereferenced unconditionally, so it must not be NULL.
203 static inline int token_is_kw(const struct token *tok, int opkw) {
204 return tok->type==TOK_KEYWORD && tok->opkw==opkw;
208 struct token *first, *last;
210 //Points to original input as given
214 //position of the start of each real line with respect to orig
215 const char * const *olines;
218 //Copy of original input without backslash-broken lines
222 //position of the start of each real line with respect to txt
223 const char * const *tlines;
226 //Set me so tok_message_print will know what file name to display
227 const char *filename;
230 extern struct dict *tokenizer_dict;
//Queue of struct tok_message entries; this is the type of the `mq` argument
//taken by tokenize(), tok_message_add() and the read_* helpers.
//NOTE(review): queue() is not provided by any include visible in this
//excerpt -- confirm which header supplies it.
232 typedef queue(struct tok_message) tok_message_queue;
//Tokenizes the buffer orig of orig_size bytes and returns the resulting
//token list.
//the token_list is allocated as a child of orig
//mq is the tokenizer message queue (presumably where warnings and errors
//found while tokenizing are collected -- confirm in the implementation).
235 struct token_list *tokenize(const char *orig, size_t orig_size, tok_message_queue *mq);
//Returns a count for the list tl (presumably the number of tokens it
//holds -- confirm in the implementation).
237 size_t token_list_count(const struct token_list *tl);
//Diagnostic helpers; each takes the list to inspect and a stdio stream.
//NOTE(review): the meaning of token_list_sanity_check's int result is not
//visible here, and FILE needs <stdio.h>, whose include is not among the
//lines visible in this excerpt -- confirm both.
240 int token_list_sanity_check(const struct token_list *tl, FILE *err);
241 void token_list_dump(const struct token_list *tl, FILE *f);
243 /* tok_point_lookup is used to locate a pointer that is within a token list's
244 txt or orig fields */
247 const char *txt, *orig;
//Resolves ptr, which is expected to point into tl's txt or orig text, and
//stores the result in *out.
//returns nonzero if the pointer could be resolved
252 int tok_point_lookup(struct tok_point *out, const char *ptr,
253 const struct token_list *tl);
256 /* Tokenizer message queue; used to gather and report warnings, errors, etc. */
//Message levels. No explicit values are given, so the enumerators run from
//0 (TM_DEBUG) to 4 (TM_BUG) in declaration order.
258 enum tok_message_level {TM_DEBUG, TM_INFO, TM_WARN, TM_ERROR, TM_BUG};
261 enum tok_message_level level;
263 //Unique slash-delimited name of the message
264 //e.g. tokenize/read_cstring/ambiguous_octal
266 //Human-readable description
267 //e.g. `Octal \007 followed by digit`
268 const char *location;
269 //Pointer (typically within the token list's txt or orig) of the error
//Convenience wrappers around tok_message_add(), one per message level.
//Requirements at the point of use:
// - a variable named `mq` must be in scope; it is passed as the queue.
// - MESSAGE_PATH must be defined; `MESSAGE_PATH #name` forms the path
//   argument, where #name stringizes the name parameter (so MESSAGE_PATH is
//   presumably a string-literal prefix such as "tokenize/read_cstring/" --
//   confirm at the definition sites).
//`, ##__VA_ARGS__` is the GNU extension that removes the preceding comma
//when no variadic arguments are supplied.
272 #define tok_msg_debug(name, loc, fmt, ...) tok_message_add(mq, TM_DEBUG, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
273 #define tok_msg_info(name, loc, fmt, ...) tok_message_add(mq, TM_INFO, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
274 #define tok_msg_warn(name, loc, fmt, ...) tok_message_add(mq, TM_WARN, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
275 #define tok_msg_error(name, loc, fmt, ...) tok_message_add(mq, TM_ERROR, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
276 #define tok_msg_bug(name, loc, fmt, ...) tok_message_add(mq, TM_BUG, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
//Function the tok_msg_* macros expand to.
//  mq    - message queue to operate on
//  level - one of the tok_message_level values
//  path  - slash-delimited message name
//  loc   - location pointer associated with the message
//  fmt   - format string for the description, followed by its arguments
//          (presumably printf-style -- confirm in the implementation)
278 void tok_message_add(tok_message_queue *mq, enum tok_message_level level,
279 const char *path, const char *loc, const char *fmt, ...);
//Prints message m in the context of token list tl; the file name shown is
//taken from tl's filename field, which the caller is expected to set.
281 void tok_message_print(struct tok_message *m, struct token_list *tl);
//Dump helpers for a single message and for a whole queue.
//NOTE(review): output destination and format are not visible here.
283 void tok_message_dump(struct tok_message *m);
284 void tok_message_queue_dump(const tok_message_queue *mq);
287 /* Miscellaneous internal components */
//Literal readers. Both take a span s..e and the message queue mq.
//read_cstring also takes the output array and the quote character;
//read_cnumber takes the token to fill in.
//NOTE(review): presumably s is the start and e the end of the input, and
//the returned pointer is the position after the text consumed -- confirm
//in the implementation.
289 char *read_cstring(array_char *out, const char *s, const char *e, char quoteChar, tok_message_queue *mq);
290 char *read_cnumber(struct token *tok, const char *s, const char *e, tok_message_queue *mq);
//Base selector passed to readui(): a numeric base optionally ORed with the
//READUI_ALLOW* flag bits below.
293 typedef unsigned int readui_base;
//Flag bits (256, 512, 1024); they lie above the base values 2..16 used by
//the presets, so a flag never collides with a base.
295 #define READUI_ALLOWHIGHERDIGITS 256
296 #define READUI_ALLOWCAPLETTERS 512
297 #define READUI_ALLOWLCASELETTERS 1024
298 #define READUI_ALLOWLETTERS (READUI_ALLOWCAPLETTERS | READUI_ALLOWLCASELETTERS)
//Preset bases. Only READUI_HEX adds flags (both letter cases);
//READUI_ALLOWHIGHERDIGITS is not used by any preset defined here.
300 #define READUI_DEC ((readui_base)(10))
301 #define READUI_HEX ((readui_base)(16 | READUI_ALLOWLETTERS))
302 #define READUI_OCT ((readui_base)(8))
303 #define READUI_BIN ((readui_base)(2))
//Reads an unsigned integer in the given base and returns it as a uint64_t.
//  sp   - address of the caller's read pointer (presumably advanced past
//         the digits consumed -- confirm)
//  e    - second bound of the input (presumably its end -- confirm)
//  base - a READUI_* preset, or a base ORed with READUI_ALLOW* flags
//NOTE(review): <errno.h> is included "for readui", so errors are presumably
//reported through errno -- confirm in the implementation.
305 uint64_t readui(const char **sp, const char *e, readui_base base);