2 Copyright (c) 2009 Joseph A. Adams
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
8 1. Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10 2. Redistributions in binary form must reproduce the above copyright
11 notice, this list of conditions and the following disclaimer in the
12 documentation and/or other materials provided with the distribution.
13 3. The name of the author may not be used to endorse or promote products
14 derived from this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #ifndef CCAN_TOKENIZER_H
29 #define CCAN_TOKENIZER_H
31 #include <ccan/darray/darray.h>
36 #include <errno.h> //for readui
38 /* Definition of tokens and the token list */
41 TOK_INTEGER, //integer literal (e.g. 5, 1000L, 0x5)
42 TOK_FLOATING, //floating point literal (e.g. 5.0, 7.0f, etc.)
43 TOK_OPERATOR, //operator (e.g. +, -, (, ), ++, etc.)
//Range test over the two entries that follow, so it relies on TOK_KEYWORD coming directly before TOK_IDENTIFIER.
//Note that the argument is evaluated twice.
45 #define token_type_is_identifier(type) ((type)>=TOK_KEYWORD && (type)<=TOK_IDENTIFIER)
46 TOK_KEYWORD, //keyword (e.g. char, _Bool, ifdef)
47 TOK_IDENTIFIER, //identifier or unprocessed keyword (e.g. int, token, pp_conditions)
49 TOK_CHAR, //character literal (e.g. 'a' or even '1234')
50 TOK_STRING, //string literal (e.g. "hello" or "zero\0inside")
51 TOK_LEADING_POUND, //leading # in a preprocessor directive (e.g. # include)
52 TOK_STRING_IQUOTE, //name in double quotes, e.g. "config.h" (uses the include member of struct token)
53 TOK_STRING_IANGLE, //name in angle brackets, e.g. <stdio.h> (uses the include member of struct token)
//Range tests over the three entries that follow: "ignored" covers TOK_CCOMMENT..TOK_WHITE,
//"comment" covers only TOK_CCOMMENT..TOK_CPPCOMMENT. Both rely on this declaration order
//and evaluate their argument twice.
55 #define token_type_is_ignored(type) ((type)>=TOK_CCOMMENT && (type)<=TOK_WHITE)
56 #define token_type_is_comment(type) ((type)>=TOK_CCOMMENT && (type)<=TOK_CPPCOMMENT)
57 TOK_CCOMMENT, //C comment (e.g. /* comment */)
58 TOK_CPPCOMMENT, //C++ comment (e.g. //comment )
59 TOK_WHITE, //whitespace (span of \t\n\v\f\r and space)
61 TOK_STARTLINE, //beginning of line (txt/txtsize is always empty)
62 TOK_STRAY, //control characters, weird characters, and extended characters where they shouldn't be
69 TOK_L = 2, //long or double-precision float
70 TOK_LL = 4, //long long (note that TOK_L and TOK_LL are mutually exclusive)
71 TOK_F = 8, //float (single-precision)
72 TOK_I = 16, //imaginary
//The values above are distinct bits, so the combined suffixes below are formed with bitwise OR.
//NOTE(review): TOK_U is used below but its definition is not visible in this excerpt.
74 TOK_UL = TOK_U | TOK_L, //unsigned long
75 TOK_ULL = TOK_U | TOK_LL, //unsigned long long
77 //Imaginary combinations: TOK_I OR'd with each suffix (or suffix pair) defined above
78 TOK_IMAG_U = TOK_I | TOK_U, //imaginary + unsigned
79 TOK_IMAG_L = TOK_I | TOK_L, //imaginary + long
80 TOK_IMAG_LL = TOK_I | TOK_LL, //imaginary + long long
81 TOK_IMAG_F = TOK_I | TOK_F, //imaginary + float
83 TOK_IMAG_UL = TOK_I | TOK_UL, //imaginary + unsigned long
84 TOK_IMAG_ULL = TOK_I | TOK_ULL, //imaginary + unsigned long long
//NOTE(review): the struct headers for the members below are not visible in this excerpt;
//struct token later declares members of type struct tok_integer and struct tok_floating.
89 int base; //numeric base: one of 2, 8, 10, or 16
90 enum tok_suffix suffix; //tok_suffix flag combination (e.g. TOK_UL)
95 enum tok_suffix suffix; //tok_suffix flag combination; belongs to a separate declaration from the one above -- presumably struct tok_floating, TODO confirm
98 //Operator/keyword naming conventions taken from Jeff Lee's Yacc grammar:
99 //http://www.lysator.liu.se/c/ANSI-C-grammar-y.html
101 /* Permute these regularly */
//NOTE(review): presumably the order is meant to be shuffled now and then to expose code that depends on the numeric values -- confirm.
//Multi-character operators are numbered from 128 upward; the opkw comment in struct token shows
//single-character operators being identified by the character itself (e.g. '+').
102 PTR_OP=128, INC_OP, DEC_OP, LEFT_OP, RIGHT_OP, LE_OP, GE_OP, EQ_OP, NE_OP, // -> ++ -- << >> <= >= == !=
104 MUL_ASSIGN, DIV_ASSIGN, MOD_ASSIGN, // *= /= %=
105 ADD_ASSIGN, SUB_ASSIGN, // += -=
106 AND_ASSIGN, XOR_ASSIGN, OR_ASSIGN, // &= ^= |=
107 LEFT_ASSIGN, RIGHT_ASSIGN, // <<= >>=
149 //Preprocessor keywords (except those already defined)
//opkw_is_directive_only is a range test from DEFINE to WARNING, so it depends on the declaration order
//of the entries in between (DEFINE itself is not visible in this excerpt). Evaluates opkw twice.
151 #define opkw_is_directive_only(opkw) ((opkw)>=DEFINE && (opkw)<=WARNING)
//opkw_is_directive additionally accepts ELSE and IF, which lie outside the DEFINE..WARNING range.
152 #define opkw_is_directive(opkw) (opkw_is_directive_only(opkw) || (opkw)==ELSE || (opkw)==IF)
165 WARNING, /* gcc extension */
//One-bit flags. NOTE(review): the declaration these belong to is not visible in this excerpt; struct token has a member of type struct token_flags.
170 pp:1, //set when the token is part of a preprocessor line
171 pp_directive:1; //set when the token follows a TOK_LEADING_POUND (e.g. the include in # include)
175 struct token *prev, *next; //links to the neighbouring tokens
177 struct token_flags flags;
178 short type; //enum token_type
//Type-specific data follows; each comment names the token types the member applies to where the source states it.
180 struct tok_integer integer; //presumably for TOK_INTEGER -- TODO confirm
181 struct tok_floating floating; //presumably for TOK_FLOATING -- TODO confirm
182 int opkw; //operator or keyword ID (e.g. '+', INC_OP (++), ADD_ASSIGN (+=))
183 darray_char *string; //applies to TOK_CHAR and TOK_STRING
184 char *include; //applies to TOK_STRING_IQUOTE and TOK_STRING_IANGLE
//NOTE(review): the member declarations that the three comments below introduce are not visible in this excerpt
//(token_txt_is reads tok->txt and tok->txt_size).
187 //text this token represents (with backslash-broken lines merged)
191 //text this token represents (untouched)
195 //zero-based line and column number of this token
199 //Keywords such as int, long, etc. may be redefined, making them identifiers in a sense,
//so this returns nonzero for TOK_KEYWORD as well as TOK_IDENTIFIER.
200 static inline int token_is_identifier(const struct token *tok) {
201 return token_type_is_identifier(tok->type);
//Returns nonzero when tok->type lies in TOK_CCOMMENT..TOK_WHITE, i.e. tok is a C comment, C++ comment, or whitespace.
204 static inline int token_is_ignored(const struct token *tok) {
205 return token_type_is_ignored(tok->type);
//Returns nonzero when tok is a TOK_OPERATOR whose opkw equals the given operator ID (e.g. '+' or INC_OP).
208 static inline int token_is_op(const struct token *tok, int opkw) {
209 return tok->type==TOK_OPERATOR && tok->opkw==opkw; //type is checked first, so opkw is only compared for operator tokens
//Returns nonzero when tok is a TOK_KEYWORD whose opkw equals the given keyword ID.
//A token typed TOK_IDENTIFIER never matches, even though that type's comment says it also covers unprocessed keywords.
212 static inline int token_is_kw(const struct token *tok, int opkw) {
213 return tok->type==TOK_KEYWORD && tok->opkw==opkw;
//Returns nonzero when the token's text (tok->txt, tok->txt_size bytes) is exactly the NUL-terminated string str.
//NOTE(review): strlen/memcmp need <string.h>; that #include is not visible in this excerpt -- confirm it is present.
216 static inline int token_txt_is(const struct token *tok, const char *str) {
217 size_t len = strlen(str);
218 return tok->txt_size==len && !memcmp(tok->txt, str, len); //lengths are compared first, so memcmp only runs over exactly txt_size bytes
//NOTE(review): several member declarations of this structure are not visible in this excerpt; the comments below
//that are not followed by a declaration describe members that are missing from view.
222 struct token *first, *last; //first and last token; tokens link to each other through their prev/next members
224 //Points to original input as given
228 //position of the start of each real line with respect to orig
229 const char * const *olines;
232 //Copy of original input without backslash-broken lines
236 //position of the start of each real line with respect to txt
237 const char * const *tlines;
240 //Set me so tok_message_print will know what file name to display
241 const char *filename;
244 extern struct dict *tokenizer_dict; //declaration only; struct dict is not declared in this excerpt
246 typedef queue(struct tok_message) tok_message_queue; //queue() is not defined in this excerpt -- presumably a container macro from another header, TODO confirm
248 //the token_list is allocated as a child of tcontext
//orig/orig_size: presumably the input text and its length in bytes; mq: message queue, the same type tok_message_add takes -- TODO confirm
249 struct token_list *tokenize(const void *tcontext, const char *orig, size_t orig_size, tok_message_queue *mq);
251 size_t token_list_count(const struct token_list *tl); //presumably the number of tokens in tl -- TODO confirm
254 int token_list_sanity_check(const struct token_list *tl, FILE *err); //NOTE(review): return convention is not visible here; err is presumably the stream problems are written to
255 void token_list_dump(const struct token_list *tl, FILE *f); //presumably writes a description of tl to f -- TODO confirm
257 /* tok_point_lookup is used to locate a pointer that is within a token list's
258 txt or orig fields */
261 const char *txt, *orig; //presumably the matching positions within the token list's txt and orig buffers -- TODO confirm
265 //returns nonzero if the pointer could be resolved; the result is presumably stored in *out
266 int tok_point_lookup(struct tok_point *out, const char *ptr,
267 const struct token_list *tl);
270 /* Tokenizer message queue; used to gather and report warnings, errors, etc. */
272 enum tok_message_level {TM_DEBUG, TM_INFO, TM_WARN, TM_ERROR, TM_BUG}; //values ascend from TM_DEBUG (0) to TM_BUG (4); one tok_msg_* macro exists per level
275 enum tok_message_level level;
//NOTE(review): the member declarations that the next two comment pairs describe are not visible in this excerpt;
//they do not appear to describe `location`, which matches the comment placed after it.
277 //Unique slash-delimited name of the message
278 //e.g. tokenize/read_cstring/ambiguous_octal
280 //Human-readable description
281 //e.g. `Octal \007 followed by digit`
282 const char *location;
283 //Pointer (typically within the token list's txt or orig) of the error
//Convenience wrappers around tok_message_add, one per tok_message_level.
//Each expansion refers to a variable named mq that must exist in the calling scope, and builds the path argument
//by placing MESSAGE_PATH in front of the stringized name; MESSAGE_PATH must therefore expand to a string literal
//(it is not defined in this excerpt). `##__VA_ARGS__` is the GNU extension that removes the preceding comma
//when no variadic arguments are supplied.
286 #define tok_msg_debug(name, loc, fmt, ...) tok_message_add(mq, TM_DEBUG, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
287 #define tok_msg_info(name, loc, fmt, ...) tok_message_add(mq, TM_INFO, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
288 #define tok_msg_warn(name, loc, fmt, ...) tok_message_add(mq, TM_WARN, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
289 #define tok_msg_error(name, loc, fmt, ...) tok_message_add(mq, TM_ERROR, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
290 #define tok_msg_bug(name, loc, fmt, ...) tok_message_add(mq, TM_BUG, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
//Target of the tok_msg_* macros. path is the message name, loc the pointer the message refers to;
//fmt and the variadic arguments are presumably printf-style -- TODO confirm against the implementation.
292 void tok_message_add(tok_message_queue *mq, enum tok_message_level level,
293 const char *path, const char *loc, const char *fmt, ...);
295 void tok_message_print(struct tok_message *m, struct token_list *tl); //per the filename comment in the token list, tl->filename supplies the file name to display
297 void tok_message_dump(struct tok_message *m); //presumably prints a single message -- TODO confirm output destination
298 void tok_message_queue_dump(const tok_message_queue *mq); //presumably prints every message in mq -- TODO confirm
301 /* Miscellaneous internal components */
//NOTE(review): for both functions s and e presumably delimit the input text (start and end) and the return value
//is presumably the position where reading stopped -- confirm against the implementation.
303 char *read_cstring(darray_char *out, const char *s, const char *e, char quoteChar, tok_message_queue *mq); //out is a darray_char, the same type struct token uses for TOK_CHAR/TOK_STRING contents
304 char *read_cnumber(struct token *tok, const char *s, const char *e, tok_message_queue *mq); //takes the token as tok; presumably fills in its numeric fields -- TODO confirm
//A readui_base packs the numeric base together with optional READUI_ALLOW* flag bits, as READUI_HEX below shows.
307 typedef unsigned int readui_base;
//Flag bits (256 and up), OR'd onto the numeric base.
//NOTE(review): the exact effect of each flag is defined by readui's implementation, which is not visible here.
309 #define READUI_ALLOWHIGHERDIGITS 256
310 #define READUI_ALLOWCAPLETTERS 512
311 #define READUI_ALLOWLCASELETTERS 1024
312 #define READUI_ALLOWLETTERS (READUI_ALLOWCAPLETTERS | READUI_ALLOWLCASELETTERS) //both letter cases
//Ready-made bases; only READUI_HEX enables letters.
314 #define READUI_DEC ((readui_base)(10))
315 #define READUI_HEX ((readui_base)(16 | READUI_ALLOWLETTERS))
316 #define READUI_OCT ((readui_base)(8))
317 #define READUI_BIN ((readui_base)(2))
//sp is passed by address, so readui can presumably advance *sp past what it consumed; e presumably marks the end of input.
//NOTE(review): the errno.h include is annotated "for readui", so failures are presumably reported through errno -- confirm.
319 uint64_t readui(const char **sp, const char *e, readui_base base);