X-Git-Url: https://git.ozlabs.org/?p=ccan;a=blobdiff_plain;f=ccan%2Fccan_tokenizer%2Fccan_tokenizer.h;fp=ccan%2Fccan_tokenizer%2Fccan_tokenizer.h;h=7634501f84f455774364dfe9aabb0961c0f087a7;hp=0000000000000000000000000000000000000000;hb=69cc1b45b4921c0be738902fe0d5225f135e2aae;hpb=46b1a03e21303e03b68de213b41c0840767fbc96

diff --git a/ccan/ccan_tokenizer/ccan_tokenizer.h b/ccan/ccan_tokenizer/ccan_tokenizer.h
new file mode 100644
index 00000000..7634501f
--- /dev/null
+++ b/ccan/ccan_tokenizer/ccan_tokenizer.h
@@ -0,0 +1,307 @@
+/*
+        Copyright (c) 2009  Joseph A. Adams
+        All rights reserved.
+        
+        Redistribution and use in source and binary forms, with or without
+        modification, are permitted provided that the following conditions
+        are met:
+        1. Redistributions of source code must retain the above copyright
+           notice, this list of conditions and the following disclaimer.
+        2. Redistributions in binary form must reproduce the above copyright
+           notice, this list of conditions and the following disclaimer in the
+           documentation and/or other materials provided with the distribution.
+        3. The name of the author may not be used to endorse or promote products
+           derived from this software without specific prior written permission.
+        
+        THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+        IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+        OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+        IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+        INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+        NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+        DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+        THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+        (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+        THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CCAN_TOKENIZER_H
+#define CCAN_TOKENIZER_H
+
+#include <ccan/array/array.h>
+#include "charflag.h"
+#include "dict.h"
+#include "queue.h"
+#include <stdint.h>
+#include <errno.h> //for readui
+
+/* Definition of tokens and the token list */
+
+enum token_type {
+	TOK_INTEGER,	   //integer (e.g. 5, 1000L, 0x5)
+	TOK_FLOATING,	   //floating point number (e.g. 5.0, 7.0f, etc.)
+	TOK_OPERATOR,	   //operator (e.g. +, -, (, ), ++, etc.)
+	TOK_KEYWORD,	   //keyword (e.g. char, _Bool, ifdef)
+	TOK_IDENTIFIER,	   //identifier or unprocessed keyword (e.g. int, token, pp_conditions)
+	TOK_CHAR,	   //character literal (e.g. 'a' or even '1234')
+	TOK_STRING,	   //string literal (e.g. "hello" or "zero\0inside")
+	TOK_LEADING_POUND, //leading # in a preprocessor directive (e.g. # include)
+	TOK_STRING_IQUOTE, //quoted include operand (e.g. "config.h"); see the include member of struct token
+	TOK_STRING_IANGLE, //angle-bracketed include operand (e.g. <stdio.h>); see the include member of struct token
+	//The two range macros below require TOK_CCOMMENT, TOK_CPPCOMMENT, TOK_WHITE to stay adjacent and in this order
+	#define token_type_is_ignored(type) ((type)>=TOK_CCOMMENT && (type)<=TOK_WHITE)
+	#define token_type_is_comment(type) ((type)>=TOK_CCOMMENT && (type)<=TOK_CPPCOMMENT)
+	TOK_CCOMMENT, //C comment (e.g. /* comment */)
+	TOK_CPPCOMMENT, //C++ comment (e.g. //comment )
+	TOK_WHITE, //whitespace (span of \t\n\v\f\r and space)
+	TOK_STARTLINE,	//beginning of line (txt/txt_size is always empty); lies outside the token_type_is_ignored range
+	TOK_STRAY, //control characters, weird characters, and extended characters where they shouldn't be
+};
+
+enum tok_suffix {
+	TOK_NOSUFFIX = 0,
+	//single-bit flags; the combined names further down are bitwise ORs of these
+	TOK_U  = 1,  //unsigned
+	TOK_L  = 2,  //long; on a floating constant the C suffix L means long double
+	TOK_LL = 4,  //long long (note that TOK_L and TOK_LL are mutually exclusive)
+	TOK_F  = 8,  //float (single-precision)
+	TOK_I  = 16, //imaginary
+	
+	TOK_UL  = TOK_U | TOK_L,  //unsigned long
+	TOK_ULL = TOK_U | TOK_LL, //unsigned long long
+	
+	//Imaginary combo meals (TOK_I OR-ed with each of the suffixes above)
+	TOK_IMAG_U   = TOK_I | TOK_U,
+	TOK_IMAG_L   = TOK_I | TOK_L,
+	TOK_IMAG_LL  = TOK_I | TOK_LL,
+	TOK_IMAG_F   = TOK_I | TOK_F,
+	
+	TOK_IMAG_UL  = TOK_I | TOK_UL,
+	TOK_IMAG_ULL = TOK_I | TOK_ULL,
+};
+
+struct tok_integer {
+	uint64_t v; //presumably the value of the literal - confirm; it is filled in outside this header
+	int base; //one of 2, 8, 10, or 16
+	enum tok_suffix suffix; //suffix flags, see enum tok_suffix
+};
+
+struct tok_floating {
+	long double v; //presumably the value of the literal - confirm; it is filled in outside this header
+	enum tok_suffix suffix; //suffix flags, see enum tok_suffix
+};
+
+//Operator/keyword naming conventions taken from Jeff Lee's Yacc grammar:
+//http://www.lysator.liu.se/c/ANSI-C-grammar-y.html
+enum tok_opkw {
+	/* Permute these regularly. Numbering starts at 128; per the opkw comment in struct token, single-character operators are identified by their own character value instead */
+	PTR_OP=128, INC_OP, DEC_OP, LEFT_OP, RIGHT_OP, LE_OP, GE_OP, EQ_OP, NE_OP,
+	AND_OP, OR_OP,
+	MUL_ASSIGN, DIV_ASSIGN, MOD_ASSIGN,
+	ADD_ASSIGN, SUB_ASSIGN,
+	AND_ASSIGN, XOR_ASSIGN, OR_ASSIGN,
+	LEFT_ASSIGN, RIGHT_ASSIGN,
+	ELLIPSIS,
+	DOUBLE_POUND,
+	
+	//Keywords (NOTE(review): _BOOL, _COMPLEX and _IMAGINARY are underscore+uppercase names, which C reserves for the implementation)
+	_BOOL,
+	_COMPLEX,
+	_IMAGINARY,
+	BREAK,
+	CASE,
+	CHAR,
+	CONST,
+	CONTINUE,
+	DEFAULT,
+	DO,
+	DOUBLE,
+	ELSE,
+	ENUM,
+	EXTERN,
+	FLOAT,
+	FOR,
+	GOTO,
+	IF,
+	INLINE,
+	INT,
+	LONG,
+	REGISTER,
+	RESTRICT,
+	RETURN,
+	SHORT,
+	SIGNED,
+	SIZEOF,
+	STATIC,
+	STRUCT,
+	SWITCH,
+	TYPEDEF,
+	UNION,
+	UNSIGNED,
+	VOID,
+	VOLATILE,
+	WHILE,
+	
+	//Preprocessor keywords (except those already defined); opkw_is_directive_only relies on DEFINE..WARNING staying contiguous with DEFINE first and WARNING last
+	VA_ARGS,
+	#define opkw_is_directive_only(opkw) ((opkw)>=DEFINE && (opkw)<=WARNING)
+	#define opkw_is_directive(opkw) (opkw_is_directive_only(opkw) || (opkw)==ELSE || (opkw)==IF)
+	DEFINE,
+	ELIF,
+	//ELSE, (not repeated here: already an enumerator in the keyword list above; opkw_is_directive tests it explicitly)
+	ENDIF,
+	ERROR,
+	//IF, (not repeated here: already an enumerator in the keyword list above; opkw_is_directive tests it explicitly)
+	IFDEF,
+	IFNDEF,
+	INCLUDE,
+	LINE,
+	PRAGMA,
+	UNDEF,
+	WARNING, /* gcc extension */
+};
+
+struct token_flags {
+	unsigned short //NOTE(review): C only guarantees bit-fields of _Bool, signed int and unsigned int; unsigned short is implementation-defined
+		pp:1, //is token part of a preprocessor line
+		pp_directive:1; //does token follow a TOK_LEADING_POUND (e.g. # include)
+};
+
+struct token {
+	struct token *prev, *next; //neighbouring tokens; struct token_list holds the first and last pointers
+	
+	struct token_flags flags;
+	short type; //enum token_type
+	union { //unnamed union (C11 or GNU extension): members are reached directly, e.g. tok->opkw
+		struct tok_integer integer; //presumably for TOK_INTEGER - confirm
+		struct tok_floating floating; //presumably for TOK_FLOATING - confirm
+		int opkw; //operator or keyword ID (e.g. '+', INC_OP (++), ADD_ASSIGN (+=))
+		array_char string; //applies to TOK_CHAR and TOK_STRING
+		char *include; //applies to TOK_STRING_IQUOTE and TOK_STRING_IANGLE
+	};
+	
+	//text this token represents (with backslash-broken lines merged)
+	const char *txt;
+	size_t txt_size;
+	
+	//text this token represents (untouched)
+	const char *orig;
+	size_t orig_size;
+	
+	//zero-based line and column number of this token
+	size_t line, col;
+};
+
+static inline int token_is_ignored(const struct token *tok) {
+	return token_type_is_ignored(tok->type) ? 1 : 0; //1 for the comment and whitespace token types, else 0
+}
+
+static inline int token_is_op(const struct token *tok, int opkw) {
+	return tok->type!=TOK_OPERATOR ? 0 : tok->opkw==opkw; //opkw member is only inspected for operator tokens
+}
+
+static inline int token_is_kw(const struct token *tok, int opkw) {
+	return tok->type!=TOK_KEYWORD ? 0 : tok->opkw==opkw; //opkw member is only inspected for keyword tokens
+}
+
+struct token_list {
+	struct token *first, *last; //presumably the two ends of the prev/next-linked token chain - confirm
+	
+	//Points to original input as given
+	const char *orig;
+	size_t orig_size;
+	
+	//position of the start of each real line with respect to orig
+	const char * const *olines;
+	size_t olines_size; //presumably the number of entries in olines - confirm
+	
+	//Copy of original input without backslash-broken lines
+	const char *txt;
+	size_t txt_size;
+	
+	//position of the start of each real line with respect to txt
+	const char * const *tlines;
+	size_t tlines_size; //presumably the number of entries in tlines - confirm
+	
+	//Set me so tok_message_print will know what file name to display
+	const char *filename;
+};
+
+extern struct dict *tokenizer_dict; //declaration only; the object is defined outside this header
+
+typedef queue(struct tok_message) tok_message_queue; //struct tok_message itself is defined further down in this header
+
+//the token_list is allocated as a child of orig (NOTE(review): presumably in the talloc parent/child sense - confirm)
+struct token_list *tokenize(const char *orig, size_t orig_size, tok_message_queue *mq);
+
+size_t token_list_count(const struct token_list *tl); //presumably the number of tokens in tl - confirm
+
+//used for debugging (NOTE(review): FILE comes from <stdio.h>, which this header does not include directly)
+int token_list_sanity_check(const struct token_list *tl, FILE *err);
+void token_list_dump(const struct token_list *tl, FILE *f);
+
+/* tok_point_lookup is used to locate a pointer that is within a token list's
+   txt or orig fields */
+
+struct tok_point {
+	const char *txt, *orig; //presumably the matching positions inside the token list txt and orig buffers - confirm
+	size_t line, col; //presumably zero-based, like line/col in struct token - confirm
+};
+
+//returns nonzero if the pointer could be resolved; out is the only non-const argument, so presumably it receives the result - confirm
+int tok_point_lookup(struct tok_point *out, const char *ptr,
+			const struct token_list *tl);
+
+
+/* Tokenizer message queue; used to gather and report warnings, errors, etc. */
+
+enum tok_message_level {TM_DEBUG, TM_INFO, TM_WARN, TM_ERROR, TM_BUG}; //no explicit values, so TM_DEBUG is 0 and TM_BUG is 4
+
+struct tok_message {
+	enum tok_message_level level; //one of the TM_* values
+	const char *path;
+		//Unique slash-delimited name of the message
+		//e.g. tokenize/read_cstring/ambiguous_octal
+	const char *message;
+		//Human-readable description
+		//e.g. `Octal \007 followed by digit`
+	const char *location;
+		//Pointer (typically within the token list's txt or orig) to where the problem was found
+};
+//tok_msg_* helpers: the call site must have a variable named mq and a macro MESSAGE_PATH expanding to a string literal (name is stringified and pasted after it); ##__VA_ARGS__ is a GNU extension
+#define tok_msg_debug(name, loc, fmt, ...) tok_message_add(mq, TM_DEBUG, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
+#define tok_msg_info(name, loc, fmt, ...) tok_message_add(mq, TM_INFO, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
+#define tok_msg_warn(name, loc, fmt, ...) tok_message_add(mq, TM_WARN, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
+#define tok_msg_error(name, loc, fmt, ...) tok_message_add(mq, TM_ERROR, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
+#define tok_msg_bug(name, loc, fmt, ...) tok_message_add(mq, TM_BUG, MESSAGE_PATH #name, loc, fmt, ##__VA_ARGS__)
+
+void tok_message_add(tok_message_queue *mq, enum tok_message_level level,
+	const char *path, const char *loc, const char *fmt, ...); //variadic; fmt is presumably a printf-style format - confirm
+
+void tok_message_print(struct tok_message *m, struct token_list *tl); //per the filename comment in struct token_list, tl->filename supplies the displayed file name
+
+void tok_message_dump(struct tok_message *m); //takes no token_list, unlike tok_message_print
+void tok_message_queue_dump(const tok_message_queue *mq); //queue-level counterpart; presumably dumps every queued message - confirm
+
+
+/* Miscellaneous internal components */
+
+char *read_cstring(array_char *out, const char *s, const char *e, char quoteChar, tok_message_queue *mq); //s and e presumably bound the input and the return presumably points past what was read - confirm
+char *read_cnumber(struct token *tok, const char *s, const char *e, tok_message_queue *mq); //same s/e/mq parameter shape as read_cstring; result presumably stored into tok - confirm
+
+
+typedef unsigned int readui_base; //a radix, optionally OR-ed with the READUI_ALLOW* flags below (as READUI_HEX does)
+
+#define READUI_ALLOWHIGHERDIGITS 256
+#define READUI_ALLOWCAPLETTERS 512
+#define READUI_ALLOWLCASELETTERS 1024
+#define READUI_ALLOWLETTERS (READUI_ALLOWCAPLETTERS | READUI_ALLOWLCASELETTERS)
+
+#define READUI_DEC      ((readui_base)(10))
+#define READUI_HEX      ((readui_base)(16 | READUI_ALLOWLETTERS))
+#define READUI_OCT      ((readui_base)(8))
+#define READUI_BIN      ((readui_base)(2))
+
+uint64_t readui(const char **sp, const char *e, readui_base base); //sp is a pointer to the read position (presumably advanced past what is consumed); the errno.h include above is marked as being for readui, so failures are presumably reported through errno - confirm
+
+#endif