--- /dev/null
+/* MIT (BSD) license - see LICENSE file for details */
+#include "cdump.h"
+#include <ccan/tal/str/str.h>
+#include <assert.h>
+
+struct token {
+ const char *p; /* Start of the token inside the source text (the token
+                 * is not NUL-terminated); NULL in the terminating entry
+                 * that tokenize() appends. */
+ size_t len; /* Length of the token in bytes. */
+};
+
+/* Append one token -- the @len bytes starting at @p -- to the tal array
+ * *@toks, growing the array by exactly one element. */
+static void add_token(struct token **toks, const char *p, size_t len)
+{
+	const size_t last = tal_count(*toks);
+	struct token *slot;
+
+	tal_resize(toks, last + 1);
+
+	/* Take the address only after the resize: the array may have moved. */
+	slot = *toks + last;
+	slot->p = p;
+	slot->len = len;
+}
+
+/* If an identifier is in progress (*@tok_start != -1U), emit it as a token
+ * ending just before offset @end of @code, and mark that none is pending. */
+static void flush_ident(struct token **toks, const char *code,
+			unsigned int *tok_start, unsigned int end)
+{
+	if (*tok_start == -1U)
+		return;
+
+	add_token(toks, code + *tok_start, end - *tok_start);
+	*tok_start = -1U;
+}
+
+/* Simplified tokenizer: comments and preproc directives removed,
+ * identifiers are a token, others are single char tokens.
+ *
+ * @ctx: tal context the returned array is allocated from.
+ * @code: NUL-terminated source text.
+ *
+ * Returns a tal array of tokens followed by one zeroed entry (->p == NULL)
+ * which marks the end of the stream.  Tokens point into @code rather than
+ * copying it, so @code must stay valid for as long as the tokens are used.
+ */
+static struct token *tokenize(const void *ctx, const char *code)
+{
+	unsigned int i, len, tok_start = -1U;
+	bool start_of_line = true;
+	struct token *toks = tal_arr(ctx, struct token, 0);
+
+	for (i = 0; code[i]; i += len) {
+		if (code[i] == '#' && start_of_line) {
+			/* Preprocessor line. */
+			len = strcspn(code+i, "\n");
+		} else if (code[i] == '/' && code[i+1] == '/') {
+			/* One line comment. */
+			flush_ident(&toks, code, &tok_start, i);
+			len = strcspn(code+i, "\n");
+		} else if (code[i] == '/' && code[i+1] == '*') {
+			/* Multi-line comment. */
+			const char *end = strstr(code+i+2, "*/");
+
+			flush_ident(&toks, code, &tok_start, i);
+			/* Only do arithmetic on @end once we know it is
+			 * not NULL; an unterminated comment swallows the
+			 * rest of the input. */
+			if (end)
+				len = (end + 2) - (code + i);
+			else
+				len = strlen(code + i);
+		} else if (cisalnum(code[i]) || code[i] == '_') {
+			/* Identifier or part thereof */
+			if (tok_start == -1U)
+				tok_start = i;
+			len = 1;
+		} else {
+			/* Whitespace or punctuation: either one ends any
+			 * identifier in progress. */
+			flush_ident(&toks, code, &tok_start, i);
+
+			/* Punctuation: treat as single char token. */
+			if (!cisspace(code[i]))
+				add_token(&toks, code+i, 1);
+			len = 1;
+		}
+
+		/* A '#' only introduces a directive when nothing but
+		 * whitespace precedes it on the line. */
+		if (code[i] == '\n')
+			start_of_line = true;
+		else if (!cisspace(code[i]))
+			start_of_line = false;
+	}
+
+	/* The input may end in the middle of an identifier (no trailing
+	 * newline or punctuation); don't lose that last token. */
+	flush_ident(&toks, code, &tok_start, i);
+
+	/* Add terminating NULL. */
+	tal_resizez(&toks, tal_count(toks) + 1);
+	return toks;
+}
+
+struct parse_state {
+ const char *code; /* The source text; complain() scans it to work out
+                    * a line number. */
+ const struct token *toks; /* Current position in the token stream;
+                            * advanced as tokens are consumed. */
+ struct cdump_definitions *defs; /* Definitions being built; also the tal
+                                  * parent of the types allocated by
+                                  * get_type(). */
+ char *complaints; /* Text accumulated by complain(), one line per
+                    * complaint. */
+};
+
+/* Does the current token spell exactly @target?  The end-of-stream entry
+ * (->p == NULL) never matches.  Nothing is consumed. */
+static bool tok_is(const struct token **toks, const char *target)
+{
+	const struct token *cur = *toks;
+
+	if (!cur->p)
+		return false;
+	if (cur->len != strlen(target))
+		return false;
+	return memcmp(cur->p, target, cur->len) == 0;
+}
+
+/* Return the current token without consuming it, or NULL at end of stream. */
+static const struct token *tok_peek(const struct token **toks)
+{
+	const struct token *cur = *toks;
+
+	return cur->p ? cur : NULL;
+}
+
+/* Consume and return the current token.  At end of stream nothing is
+ * consumed and NULL is returned. */
+static const struct token *tok_take(const struct token **toks)
+{
+	const struct token *cur = *toks;
+
+	if (cur->p == NULL)
+		return NULL;
+
+	*toks = cur + 1;
+	return cur;
+}
+
+/* Consume the current token only if it spells @target; returns the token
+ * taken, or NULL (leaving the stream untouched) if it did not match. */
+static const struct token *tok_take_if(const struct token **toks,
+				       const char *target)
+{
+	return tok_is(toks, target) ? tok_take(toks) : NULL;
+}
+
+/* If the current token consists only of identifier characters (letters,
+ * digits, underscore), consume it and return a NUL-terminated copy
+ * allocated off @ctx.  Otherwise, or at end of stream, consume nothing and
+ * return NULL. */
+static const char *tok_take_ident(const tal_t *ctx, const struct token **toks)
+{
+	static const char ident_chars[] = "_0123456789"
+					  "abcdefghijklmnopqrstuvwxyz"
+					  "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+	const struct token *cur = tok_peek(toks);
+
+	if (cur == NULL)
+		return NULL;
+
+	/* The run of identifier characters must cover the whole token. */
+	if (strspn(cur->p, ident_chars) < cur->len)
+		return NULL;
+
+	tok_take(toks);
+	return tal_strndup(ctx, cur->p, cur->len);
+}
+
+/* Return a tal string (allocated off @ctx) holding the source text that
+ * runs from the start of token @first to the end of the token just before
+ * @until, i.e. the half-open token range [@first, @until).  The text is
+ * copied verbatim from the source, so whatever lay between those tokens is
+ * included.  An empty range yields an empty string. */
+static char *string_of_toks(const tal_t *ctx,
+			    const struct token *first,
+			    const struct token *until)
+{
+	const struct token *end;
+
+	/* With no tokens in the range there is no "last token": until[-1]
+	 * would be the token before @first (or lie before the array
+	 * altogether), giving a bogus length or an out-of-bounds read. */
+	if (until == first)
+		return tal_strdup(ctx, "");
+
+	end = until - 1;
+	return tal_strndup(ctx, first->p, end->p - first->p + end->len);
+}
+
+/* Consume tokens until one containing any character of @delims is found.
+ * That delimiter token is left in the stream; the text of the tokens
+ * consumed before it is returned as a tal string allocated off @ctx.
+ * Returns NULL if the stream runs out first (everything is consumed). */
+static char *tok_take_until(const tal_t *ctx,
+			    const struct token **toks,
+			    const char *delims)
+{
+	const struct token *const first = tok_peek(toks);
+	const struct token *cur;
+
+	for (cur = first; cur != NULL; cur = tok_peek(toks)) {
+		/* If this contains a delimiter, copy up to prev token. */
+		if (strcspn(cur->p, delims) < cur->len)
+			return string_of_toks(ctx, first, cur);
+		tok_take(toks);
+	}
+
+	/* EOF without finding delimiter */
+	return NULL;
+}
+
+/* Has a body been recorded for this struct, union or enum?  Aborts for any
+ * other kind of type. */
+static bool type_defined(const struct cdump_type *t)
+{
+	if (t->kind == CDUMP_STRUCT || t->kind == CDUMP_UNION)
+		return t->u.members != NULL;
+
+	if (t->kind == CDUMP_ENUM)
+		return t->u.enum_vals != NULL;
+
+	/* CDUMP_UNKNOWN, CDUMP_ARRAY and CDUMP_POINTER shouldn't happen;
+	 * we don't try to define them. */
+	abort();
+}
+
+/* Look up the struct, union or enum called @name, allocating a new,
+ * not-yet-defined type (and registering it in the matching map) if it is
+ * not already known.  Types of any other kind are not tracked in a map, so
+ * a fresh one is allocated on every call.  A newly allocated type steals
+ * @name, which may be NULL. */
+static struct cdump_type *get_type(struct cdump_definitions *defs,
+				   enum cdump_type_kind kind,
+				   const char *name)
+{
+	struct cdump_map *m = NULL;
+	struct cdump_type *t;
+
+	switch (kind) {
+	case CDUMP_STRUCT:
+		m = &defs->structs;
+		break;
+	case CDUMP_UNION:
+		m = &defs->unions;
+		break;
+	case CDUMP_ENUM:
+		m = &defs->enums;
+		break;
+	case CDUMP_UNKNOWN:
+	case CDUMP_ARRAY:
+	case CDUMP_POINTER:
+		/* No map for these. */
+		break;
+	}
+
+	/* Do we already have it? */
+	if (m != NULL) {
+		struct cdump_type *found = strmap_get(m, name);
+
+		if (found)
+			return found;
+	}
+
+	t = tal(defs, struct cdump_type);
+	t->kind = kind;
+	if (name)
+		t->name = tal_steal(t, name);
+	else
+		t->name = NULL;
+	/* These are actually the same, but be thorough */
+	t->u.members = NULL;
+	t->u.enum_vals = NULL;
+
+	if (m != NULL)
+		strmap_add(m, t->name, t);
+
+	return t;
+}
+
+/* Append a line of the form "Line N: 'token': complaint" to
+ * ps->complaints, where the token is the one at the current parse position
+ * and N is the 1-based line of the source text it sits on.  At end of
+ * input the token text is empty and N is the last line of the text. */
+static void complain(struct parse_state *ps, const char *complaint)
+{
+	const struct token *tok = &ps->toks[0];
+	unsigned int linenum = 1;
+	const char *p;
+
+	/* The line number is one more than the number of newlines in front
+	 * of the token.  The end-of-stream entry has no position (p is
+	 * NULL), so in that case count through to the end of the text. */
+	for (p = ps->code; *p && (!tok->p || p < tok->p); p++) {
+		if (*p == '\n')
+			linenum++;
+	}
+
+	/* Never hand a NULL pointer to %s, even with a zero precision. */
+	tal_append_fmt(&ps->complaints,
+		       "Line %u: '%.*s': %s\n",
+		       linenum, (int)tok->len,
+		       tok->p ? tok->p : "", complaint);
+}
+
+/* Record a complaint about a statement we don't understand, then skip it:
+ * everything up to the next ';' is discarded, along with the ';' itself
+ * if there is one. */
+static void tok_take_unknown_statement(struct parse_state *ps)
+{
+	char *skipped;
+
+	complain(ps, "Ignoring unknown statement until next semicolon");
+
+	/* We only want the side effect of advancing; drop the text. */
+	skipped = tok_take_until(NULL, &ps->toks, ";");
+	tal_free(skipped);
+
+	tok_take_if(&ps->toks, ";");
+}
+
+/* [ ...
+ *
+ * Called once the '[' has been consumed.  Takes the size expression up to
+ * the matching ']' (kept as text), swallows the ']', and replaces *@type
+ * with a new array type whose element type is the old *@type.  Complains
+ * and returns false if no ']' is found. */
+static bool tok_take_array(struct parse_state *ps, struct cdump_type **type)
+{
+	struct cdump_type *arr;
+
+	/* This will be some arbitrary expression! */
+	arr = get_type(ps->defs, CDUMP_ARRAY, NULL);
+	arr->u.arr.size = tok_take_until(arr, &ps->toks, "]");
+	if (arr->u.arr.size == NULL) {
+		complain(ps, "Could not find closing array size ]");
+		return false;
+	}
+
+	/* Swallow ] */
+	tok_take(&ps->toks);
+
+	/* Wrap the caller's type in the array. */
+	arr->u.arr.type = *type;
+	*type = arr;
+	return true;
+}
+
+/* Allocate a new pointer type which points to @ptr_to. */
+static struct cdump_type *ptr_of(struct parse_state *ps,
+				 const struct cdump_type *ptr_to)
+{
+	struct cdump_type *wrapper;
+
+	wrapper = get_type(ps->defs, CDUMP_POINTER, NULL);
+	wrapper->u.ptr = ptr_to;
+
+	return wrapper;
+}
+
+/* Parse a type name at the current position and store the resulting type
+ * in *@type.
+ *
+ * Two forms are understood: a run of the builtin keywords (int, long,
+ * short, double, float, char, signed, unsigned), whose combined source
+ * text becomes the name of a CDUMP_UNKNOWN type; or an optional
+ * struct/union/enum keyword followed by one identifier.
+ *
+ * Returns false, after recording a complaint, if no type name is found. */
+static bool tok_take_type(struct parse_state *ps, struct cdump_type **type)
+{
+	const char *name;
+	const struct token *types;
+	enum cdump_type_kind kind;
+
+	/* Ignoring weird typedefs, only these can be combined. */
+	types = ps->toks;
+	while (tok_take_if(&ps->toks, "int")
+	       || tok_take_if(&ps->toks, "long")
+	       || tok_take_if(&ps->toks, "short")
+	       || tok_take_if(&ps->toks, "double")
+	       || tok_take_if(&ps->toks, "float")
+	       || tok_take_if(&ps->toks, "char")
+	       || tok_take_if(&ps->toks, "signed")
+	       || tok_take_if(&ps->toks, "unsigned"))
+		continue;
+
+	/* Did we get some? */
+	if (ps->toks != types) {
+		/* Use the cursor itself as the end of the range, not
+		 * tok_peek(): at end of input tok_peek() is NULL and
+		 * string_of_toks() would dereference NULL - 1. */
+		name = string_of_toks(NULL, types, ps->toks);
+		kind = CDUMP_UNKNOWN;
+	} else {
+		/* Try normal types (or simple typedefs, etc). */
+		if (tok_take_if(&ps->toks, "struct")) {
+			kind = CDUMP_STRUCT;
+		} else if (tok_take_if(&ps->toks, "union")) {
+			kind = CDUMP_UNION;
+		} else if (tok_take_if(&ps->toks, "enum")) {
+			kind = CDUMP_ENUM;
+		} else
+			kind = CDUMP_UNKNOWN;
+
+		name = tok_take_ident(ps->defs, &ps->toks);
+		if (!name) {
+			complain(ps, "Invalid typename");
+			return false;
+		}
+	}
+
+	*type = get_type(ps->defs, kind, name);
+	return true;
+}
+
+/* struct|union ...
+ *
+ * Parses the rest of a struct or union definition; the caller has already
+ * consumed the "struct"/"union" keyword.  Expects a name, '{', zero or
+ * more member declarations, then "};".  Each declarator becomes one entry
+ * in the type's members array, recording its name, its type (wrapped in
+ * pointer and array types as declared) and the text of any leading
+ * const/volatile qualifiers.
+ *
+ * Returns true on success.  On any error a complaint is recorded and false
+ * is returned; this includes a name whose type is already defined.
+ */
+static bool tok_take_conglom(struct parse_state *ps,
+ enum cdump_type_kind conglom_kind)
+{
+ struct cdump_type *e;
+ const char *name;
+ size_t n;
+
+ assert(conglom_kind == CDUMP_STRUCT || conglom_kind == CDUMP_UNION);
+
+ name = tok_take_ident(ps->defs, &ps->toks);
+ if (!name) {
+ complain(ps, "Invalid struct/union name");
+ return false;
+ }
+
+ e = get_type(ps->defs, conglom_kind, name);
+ if (type_defined(e)) {
+ complain(ps, "Type already defined");
+ return false;
+ }
+
+ if (!tok_take_if(&ps->toks, "{")) {
+ complain(ps, "Expected { for struct/union");
+ return false;
+ }
+
+ e->u.members = tal_arr(e, struct cdump_member, n = 0); /* n counts the members stored so far */
+ while (!tok_is(&ps->toks, "}")) {
+ struct cdump_type *basetype;
+ const struct token *quals;
+ unsigned int num_quals = 0;
+
+ /* Anything can have these prepended.  Remember where they start
+  * and how many there are, so their text can be copied below. */
+ quals = ps->toks;
+ while (tok_take_if(&ps->toks, "const")
+ || tok_take_if(&ps->toks, "volatile"))
+ num_quals++;
+
+ /* eg. "struct foo" or "varint_t" */
+ if (!tok_take_type(ps, &basetype)) {
+ complain(ps, "Expected typename inside struct/union");
+ return false;
+ }
+
+ do { /* one iteration per comma-separated declarator */
+ struct cdump_member *m;
+
+ tal_resize(&e->u.members, n+1);
+ m = &e->u.members[n++];
+ m->type = basetype; /* every declarator starts again from the base type */
+ if (num_quals) {
+ m->qualifiers
+ = string_of_toks(e, quals,
+ quals + num_quals);
+ } else
+ m->qualifiers = NULL;
+
+ /* May have multiple asterisks: each one wraps the type so far
+  * in another pointer type. */
+ while (tok_take_if(&ps->toks, "*"))
+ m->type = ptr_of(ps, m->type);
+
+ m->name = tok_take_ident(e, &ps->toks);
+ if (!m->name) {
+ complain(ps, "Expected name for member");
+ return false;
+ }
+
+ /* May be an array; each [...] wraps the type so far in another
+  * array type. */
+ while (tok_take_if(&ps->toks, "[")) {
+ if (!tok_take_array(ps, &m->type))
+ return false;
+ }
+ } while (tok_take_if(&ps->toks, ","));
+
+ if (!tok_take_if(&ps->toks, ";")) {
+ complain(ps, "Expected ; at end of member");
+ return false;
+ }
+ }
+
+ if (tok_take_if(&ps->toks, "}") && tok_take_if(&ps->toks, ";"))
+ return true;
+ complain(ps, "Expected }; at end of struct/union");
+ return false;
+}
+
+/* enum ...
+ *
+ * Parses the rest of an enum definition; the caller has already consumed
+ * the "enum" keyword.  Expects a name, '{', a comma-separated list of
+ * enumerators (each optionally followed by "= value", where the value is
+ * kept as text), an optional trailing comma, then "};".
+ *
+ * Returns true on success.  On any error a complaint is recorded and false
+ * is returned; this includes an enum name which is already defined.
+ */
+static bool tok_take_enum(struct parse_state *ps)
+{
+	size_t n = 0;
+	struct cdump_type *e;
+	const char *name;
+
+	name = tok_take_ident(ps->defs, &ps->toks);
+	if (!name) {
+		complain(ps, "Expected enum name");
+		return false;
+	}
+
+	e = get_type(ps->defs, CDUMP_ENUM, name);
+
+	/* Duplicate name? */
+	if (type_defined(e)) {
+		complain(ps, "enum already defined");
+		return false;
+	}
+
+	if (!tok_take_if(&ps->toks, "{")) {
+		complain(ps, "Expected { after enum name");
+		return false;
+	}
+
+	e->u.enum_vals = tal_arr(e, struct cdump_enum_val, n);
+	do {
+		struct cdump_enum_val *v;
+
+		/* C99 allows a comma after the last enumerator: if we got
+		 * here via a comma and the list is closing, we're done. */
+		if (n > 0 && tok_is(&ps->toks, "}"))
+			break;
+
+		tal_resize(&e->u.enum_vals, n+1);
+		v = &e->u.enum_vals[n++];
+
+		v->name = tok_take_ident(e, &ps->toks);
+		if (!v->name) {
+			complain(ps, "Expected enum value name");
+			return false;
+		}
+		if (tok_take_if(&ps->toks, "=")) {
+			/* The value is an arbitrary expression: keep its
+			 * text up to the next ',' or '}'. */
+			v->value = tok_take_until(e, &ps->toks, ",}");
+			if (!v->value) {
+				complain(ps, "Expected , or } to end value");
+				return false;
+			}
+		} else
+			v->value = NULL;
+	} while (tok_take_if(&ps->toks, ","));
+
+	if (tok_take_if(&ps->toks, "}") && tok_take_if(&ps->toks, ";"))
+		return true;
+
+	complain(ps, "Expected }; at end of enum");
+	return false;
+}
+
+/* strmap_iterate() callback: copy every type which has no definition into
+ * @undefs.  Always returns true. */
+static bool gather_undefines(const char *name,
+			     struct cdump_type *t,
+			     struct cdump_map *undefs)
+{
+	const bool defined = type_defined(t);
+
+	if (!defined)
+		strmap_add(undefs, name, t);
+
+	return true;
+}
+
+/* strmap_iterate() callback: delete the entry called @name from @map.
+ * The type itself is not needed here.  Always returns true. */
+static bool remove_from_map(const char *name,
+			    struct cdump_type *t,
+			    struct cdump_map *map)
+{
+	(void)t;
+
+	strmap_del(map, name, NULL);
+	return true;
+}
+
+/* Drop from @map every type which was referred to but never defined. */
+static void remove_undefined(struct cdump_map *map)
+{
+	struct cdump_map pending;
+
+	strmap_init(&pending);
+
+	/* We can't delete inside the iterator, so work in two passes:
+	 * first collect the undefined entries... */
+	strmap_iterate(map, gather_undefines, &pending);
+
+	/* ...then walk that collection, deleting each one from @map. */
+	strmap_iterate(&pending, remove_from_map, map);
+
+	strmap_clear(&pending);
+}
+
+/* tal destructor for the definitions: empty the three name maps. */
+static void destroy_definitions(struct cdump_definitions *defs)
+{
+	strmap_clear(&defs->structs);
+	strmap_clear(&defs->unions);
+	strmap_clear(&defs->enums);
+}
+
+/* Simple LL(1) parser, inspired by Tridge's genstruct.pl.
+ *
+ * Tokenizes @code and collects its struct, union and enum definitions.
+ * Statements starting with anything else are skipped up to the next
+ * semicolon, with a complaint.  Types which were referred to but never
+ * defined are removed from the maps before returning.
+ *
+ * Returns the definitions (allocated off @ctx), or NULL if a definition
+ * could not be parsed.  If @complaints is non-NULL, *@complaints is set to
+ * the accumulated complaint text (allocated off @ctx), or to NULL if there
+ * were no complaints.
+ */
+struct cdump_definitions *cdump_extract(const tal_t *ctx, const char *code,
+					char **complaints)
+{
+	struct parse_state ps;
+	const struct token *all_toks;
+	bool ok = true;
+
+	ps.defs = tal(ctx, struct cdump_definitions);
+	ps.complaints = tal_strdup(ctx, "");
+	ps.code = code;
+
+	strmap_init(&ps.defs->enums);
+	strmap_init(&ps.defs->structs);
+	strmap_init(&ps.defs->unions);
+	tal_add_destructor(ps.defs, destroy_definitions);
+
+	/* The tokens hang off the definitions, so they go away with them
+	 * if parsing fails. */
+	all_toks = tokenize(ps.defs, code);
+	ps.toks = all_toks;
+
+	while (ok && tok_peek(&ps.toks)) {
+		if (tok_take_if(&ps.toks, "struct"))
+			ok = tok_take_conglom(&ps, CDUMP_STRUCT);
+		else if (tok_take_if(&ps.toks, "union"))
+			ok = tok_take_conglom(&ps, CDUMP_UNION);
+		else if (tok_take_if(&ps.toks, "enum"))
+			ok = tok_take_enum(&ps);
+		else
+			tok_take_unknown_statement(&ps);
+	}
+
+	if (ok) {
+		/* Now, remove any undefined types! */
+		remove_undefined(&ps.defs->enums);
+		remove_undefined(&ps.defs->structs);
+		remove_undefined(&ps.defs->unions);
+		tal_free(all_toks);
+	} else {
+		ps.defs = tal_free(ps.defs);
+	}
+
+	/* No complaints at all is reported as NULL, not an empty string. */
+	if (streq(ps.complaints, ""))
+		ps.complaints = tal_free(ps.complaints);
+
+	if (complaints)
+		*complaints = ps.complaints;
+	else
+		tal_free(ps.complaints);
+
+	return ps.defs;
+}