cdump: new module.
[ccan] / ccan / cdump / cdump.c
1 /* MIT (BSD) license - see LICENSE file for details */
2 #include "cdump.h"
3 #include <ccan/tal/str/str.h>
4 #include <assert.h>
5
/* One lexical token: a slice of the code being parsed, described by
 * its start pointer and length rather than by a NUL-terminated copy. */
struct token {
        /* First character of the token inside the source text. */
        const char *p;
        /* Number of characters belonging to the token. */
        size_t len;
};
10
11 static void add_token(struct token **toks, const char *p, size_t len)
12 {
13         size_t n = tal_count(*toks);
14         tal_resize(toks, n+1);
15         (*toks)[n].p = p;
16         (*toks)[n].len = len;
17 }
18
19 /* Simplified tokenizer: comments and preproc directives removed,
20    identifiers are a token, others are single char tokens. */
21 static struct token *tokenize(const void *ctx, const char *code)
22 {
23         unsigned int i, len, tok_start = -1;
24         bool start_of_line = true;
25         struct token *toks = tal_arr(ctx, struct token, 0);
26
27         for (i = 0; code[i]; i += len) {
28                 if (code[i] == '#' && start_of_line) {
29                         /* Preprocessor line. */
30                         len = strcspn(code+i, "\n");
31                 } else if (code[i] == '/' && code[i+1] == '/') {
32                         /* One line comment. */
33                         len = strcspn(code+i, "\n");
34                         if (tok_start != -1U) {
35                                 add_token(&toks, code+tok_start, i - tok_start);
36                                 tok_start = -1U;
37                         }
38                 } else if (code[i] == '/' && code[i+1] == '*') {
39                         /* Multi-line comment. */
40                         const char *end = strstr(code+i+2, "*/");
41                         len = (end + 2) - (code + i);
42                         if (!end)
43                                 len = strlen(code + i);
44                         if (tok_start != -1U) {
45                                 add_token(&toks, code+tok_start, i - tok_start);
46                                 tok_start = -1U;
47                         }
48                 } else if (cisalnum(code[i]) || code[i] == '_') {
49                         /* Identifier or part thereof */
50                         if (tok_start == -1U)
51                                 tok_start = i;
52                         len = 1;
53                 } else if (!cisspace(code[i])) {
54                         /* Punctuation: treat as single char token. */
55                         if (tok_start != -1U) {
56                                 add_token(&toks, code+tok_start, i - tok_start);
57                                 tok_start = -1U;
58                         }
59                         add_token(&toks, code+i, 1);
60                         len = 1;
61                 } else {
62                         /* Whitespace. */
63                         if (tok_start != -1U) {
64                                 add_token(&toks, code+tok_start, i - tok_start);
65                                 tok_start = -1U;
66                         }
67                         len = 1;
68                 }
69                 if (code[i] == '\n')
70                         start_of_line = true;
71                 else if (!cisspace(code[i]))
72                         start_of_line = false;
73         }
74
75         /* Add terminating NULL. */
76         tal_resizez(&toks, tal_count(toks) + 1);
77         return toks;
78 }
79
/* Everything the parsing helpers share while working through one input. */
struct parse_state {
        /* The complete source text (used to compute line numbers). */
        const char *code;
        /* Current position in the token array. */
        const struct token *toks;
        /* Definitions gathered so far. */
        struct cdump_definitions *defs;
        /* Accumulated complaint messages, one per line. */
        char *complaints;
};
86
87 static bool tok_is(const struct token **toks, const char *target)
88 {
89         return (*toks)->p && (*toks)->len == strlen(target)
90                 && memcmp((*toks)->p, target, (*toks)->len) == 0;
91 }
92
93 static const struct token *tok_peek(const struct token **toks)
94 {
95         if (toks[0]->p)
96                 return toks[0];
97         return NULL;
98 }
99
100 static const struct token *tok_take(const struct token **toks)
101 {
102         if (!toks[0]->p)
103                 return NULL;
104
105         return (*toks)++;
106 }
107
/* Consume and return the current token only if its text equals
 * @target; otherwise leave the position untouched and return NULL. */
static const struct token *tok_take_if(const struct token **toks,
                                       const char *target)
{
        return tok_is(toks, target) ? tok_take(toks) : NULL;
}
115
116 static const char *tok_take_ident(const tal_t *ctx, const struct token **toks)
117 {
118         const struct token *t = tok_peek(toks);
119
120         if (!t)
121                 return NULL;
122
123         if (strspn(t->p, "_0123456789"
124                    "abcdefghijklmnopqrstuvwxyz"
125                    "ABCDEFGHIJKLMNOPQRSTUVWXYZ") < t->len)
126                 return NULL;
127
128         t = tok_take(toks);
129         return tal_strndup(ctx, t->p, t->len);
130 }
131
132 static char *string_of_toks(const tal_t *ctx,
133                             const struct token *first,
134                             const struct token *until)
135 {
136         const struct token *end = until - 1;
137         return tal_strndup(ctx, first->p, end->p - first->p + end->len);
138 }
139
140 static char *tok_take_until(const tal_t *ctx,
141                             const struct token **toks,
142                             const char *delims)
143 {
144         const struct token *t, *start;
145
146         start = tok_peek(toks);
147         while ((t = tok_peek(toks)) != NULL) {
148                 /* If this contains a delimiter, copy up to prev token. */
149                 if (strcspn(t->p, delims) < t->len)
150                         return string_of_toks(ctx, start, t);
151                 tok_take(toks);
152         };
153
154         /* EOF without finding delimiter */
155         return NULL;
156 }
157
158 static bool type_defined(const struct cdump_type *t)
159 {
160         switch (t->kind) {
161         case CDUMP_STRUCT:
162         case CDUMP_UNION:
163                 return (t->u.members != NULL);
164         case CDUMP_ENUM:
165                 return (t->u.enum_vals != NULL);
166
167         /* These shouldn't happen; we don't try to define them. */
168         case CDUMP_UNKNOWN:
169         case CDUMP_ARRAY:
170         case CDUMP_POINTER:
171                 break;
172         }
173         abort();
174 }
175
176 /* May allocate a new type if not already found (steals @name) */
177 static struct cdump_type *get_type(struct cdump_definitions *defs,
178                                    enum cdump_type_kind kind,
179                                    const char *name)
180 {
181         struct cdump_map *m;
182         struct cdump_type *t;
183
184         switch (kind) {
185         case CDUMP_STRUCT:
186                 m = &defs->structs;
187                 break;
188         case CDUMP_UNION:
189                 m = &defs->unions;
190                 break;
191         case CDUMP_ENUM:
192                 m = &defs->enums;
193                 break;
194         case CDUMP_UNKNOWN:
195         case CDUMP_ARRAY:
196         case CDUMP_POINTER:
197                 m = NULL;
198         }
199
200         /* Do we already have it? */
201         if (m) {
202                 t = strmap_get(m, name);
203                 if (t)
204                         return t;
205         }
206
207         t = tal(defs, struct cdump_type);
208         t->kind = kind;
209         t->name = name ? tal_steal(t, name) : NULL;
210         /* These are actually the same, but be thorough */
211         t->u.members = NULL;
212         t->u.enum_vals = NULL;
213         if (m)
214                 strmap_add(m, t->name, t);
215
216         return t;
217 }
218
219 static void complain(struct parse_state *ps, const char *complaint)
220 {
221         unsigned int linenum;
222         const char *p = ps->code;
223
224         for (linenum = 1; p < ps->toks[0].p; linenum++) {
225                 p = strchr(p+1, '\n');
226                 if (!p)
227                         break;
228         }
229
230         tal_append_fmt(&ps->complaints,
231                        "Line %u: '%.*s': %s\n",
232                        linenum, (int)ps->toks[0].len,
233                        ps->toks[0].p, complaint);
234 }
235
236 static void tok_take_unknown_statement(struct parse_state *ps)
237 {
238         complain(ps, "Ignoring unknown statement until next semicolon");
239         tal_free(tok_take_until(NULL, &ps->toks, ";"));
240         tok_take_if(&ps->toks, ";");
241 }
242
243 /* [ ... */
244 static bool tok_take_array(struct parse_state *ps, struct cdump_type **type)
245 {
246         /* This will be some arbitrary expression! */
247         struct cdump_type *arr = get_type(ps->defs, CDUMP_ARRAY, NULL);
248
249         arr->u.arr.size = tok_take_until(arr, &ps->toks, "]");
250         if (!arr->u.arr.size) {
251                 complain(ps, "Could not find closing array size ]");
252                 return false;
253         }
254
255         arr->u.arr.type = *type;
256         *type = arr;
257
258         /* Swallow ] */
259         tok_take(&ps->toks);
260         return true;
261 }
262
263 static struct cdump_type *ptr_of(struct parse_state *ps,
264                                  const struct cdump_type *ptr_to)
265 {
266         struct cdump_type *ptr = get_type(ps->defs, CDUMP_POINTER, NULL);
267         ptr->u.ptr = ptr_to;
268         return ptr;
269 }
270
271 static bool tok_take_type(struct parse_state *ps, struct cdump_type **type)
272 {
273         const char *name;
274         const struct token *types;
275         enum cdump_type_kind kind;
276
277         /* Ignoring weird typedefs, only these can be combined. */
278         types = ps->toks;
279         while (tok_take_if(&ps->toks, "int")
280                || tok_take_if(&ps->toks, "long")
281                || tok_take_if(&ps->toks, "short")
282                || tok_take_if(&ps->toks, "double")
283                || tok_take_if(&ps->toks, "float")
284                || tok_take_if(&ps->toks, "char")
285                || tok_take_if(&ps->toks, "signed")
286                || tok_take_if(&ps->toks, "unsigned"));
287
288         /* Did we get some? */
289         if (ps->toks != types) {
290                 name = string_of_toks(NULL, types, tok_peek(&ps->toks));
291                 kind = CDUMP_UNKNOWN;
292         } else {
293                 /* Try normal types (or simple typedefs, etc). */
294                 if (tok_take_if(&ps->toks, "struct")) {
295                         kind = CDUMP_STRUCT;
296                 } else if (tok_take_if(&ps->toks, "union")) {
297                         kind = CDUMP_UNION;
298                 } else if (tok_take_if(&ps->toks, "enum")) {
299                         kind = CDUMP_ENUM;
300                 } else
301                         kind = CDUMP_UNKNOWN;
302
303                 name = tok_take_ident(ps->defs, &ps->toks);
304                 if (!name) {
305                         complain(ps, "Invalid typename");
306                         return false;
307                 }
308         }
309
310         *type = get_type(ps->defs, kind, name);
311         return true;
312 }
313
314 /* struct|union ... */
315 static bool tok_take_conglom(struct parse_state *ps,
316                              enum cdump_type_kind conglom_kind)
317 {
318         struct cdump_type *e;
319         const char *name;
320         size_t n;
321
322         assert(conglom_kind == CDUMP_STRUCT || conglom_kind == CDUMP_UNION);
323
324         name = tok_take_ident(ps->defs, &ps->toks);
325         if (!name) {
326                 complain(ps, "Invalid struct/union name");
327                 return false;
328         }
329
330         e = get_type(ps->defs, conglom_kind, name);
331         if (type_defined(e)) {
332                 complain(ps, "Type already defined");
333                 return false;
334         }
335
336         if (!tok_take_if(&ps->toks, "{")) {
337                 complain(ps, "Expected { for struct/union");
338                 return false;
339         }
340
341         e->u.members = tal_arr(e, struct cdump_member, n = 0);
342         while (!tok_is(&ps->toks, "}")) {
343                 struct cdump_type *basetype;
344                 const struct token *quals;
345                 unsigned int num_quals = 0;
346
347                 /* Anything can have these prepended. */
348                 quals = ps->toks;
349                 while (tok_take_if(&ps->toks, "const")
350                        || tok_take_if(&ps->toks, "volatile"))
351                         num_quals++;
352
353                 /* eg. "struct foo" or "varint_t" */
354                 if (!tok_take_type(ps, &basetype)) {
355                         complain(ps, "Expected typename inside struct/union");
356                         return false;
357                 }
358
359                 do {
360                         struct cdump_member *m;
361
362                         tal_resize(&e->u.members, n+1);
363                         m = &e->u.members[n++];
364                         m->type = basetype;
365                         if (num_quals) {
366                                 m->qualifiers
367                                         = string_of_toks(e, quals,
368                                                          quals + num_quals);
369                         } else
370                                 m->qualifiers = NULL;
371
372                         /* May have multiple asterisks. */
373                         while (tok_take_if(&ps->toks, "*"))
374                                 m->type = ptr_of(ps, m->type);
375
376                         m->name = tok_take_ident(e, &ps->toks);
377                         if (!m->name) {
378                                 complain(ps, "Expected name for member");
379                                 return false;
380                         }
381
382                         /* May be an array. */
383                         while (tok_take_if(&ps->toks, "[")) {
384                                 if (!tok_take_array(ps, &m->type))
385                                         return false;
386                         }
387                 } while (tok_take_if(&ps->toks, ","));
388
389                 if (!tok_take_if(&ps->toks, ";")) {
390                         complain(ps, "Expected ; at end of member");
391                         return false;
392                 }
393         }
394
395         if (tok_take_if(&ps->toks, "}") && tok_take_if(&ps->toks, ";"))
396                 return true;
397         complain(ps, "Expected }; at end of struct/union");
398         return false;
399 }
400
401 /* enum ... */
402 static bool tok_take_enum(struct parse_state *ps)
403 {
404         size_t n = 0;
405         struct cdump_type *e;
406         const char *name;
407
408         name = tok_take_ident(ps->defs, &ps->toks);
409         if (!name) {
410                 complain(ps, "Expected enum name");
411                 return false;
412         }
413
414         e = get_type(ps->defs, CDUMP_ENUM, name);
415
416         /* Duplicate name? */
417         if (type_defined(e)) {
418                 complain(ps, "enum already defined");
419                 return false;
420         }
421
422         if (!tok_take_if(&ps->toks, "{")) {
423                 complain(ps, "Expected { after enum name");
424                 return false;
425         }
426
427         e->u.enum_vals = tal_arr(e, struct cdump_enum_val, n);
428         do {
429                 struct cdump_enum_val *v;
430
431                 tal_resize(&e->u.enum_vals, n+1);
432                 v = &e->u.enum_vals[n++];
433
434                 v->name = tok_take_ident(e, &ps->toks);
435                 if (!v->name) {
436                         complain(ps, "Expected enum value name");
437                         return false;
438                 }
439                 if (tok_take_if(&ps->toks, "=")) {
440                         v->value = tok_take_until(e, &ps->toks, ",}");
441                         if (!v->value) {
442                                 complain(ps, "Expected , or } to end value");
443                                 return false;
444                         }
445                 } else
446                         v->value = NULL;
447         } while (tok_take_if(&ps->toks, ","));
448
449         if (tok_take_if(&ps->toks, "}") && tok_take_if(&ps->toks, ";"))
450                 return true;
451
452         complain(ps, "Expected }; at end of enum");
453         return false;
454 }
455
/* Map iterator callback: record in @undefs every type which has no
 * body.  Always returns true. */
static bool gather_undefines(const char *name,
                             struct cdump_type *t,
                             struct cdump_map *undefs)
{
        if (type_defined(t))
                return true;

        strmap_add(undefs, name, t);
        return true;
}
464
/* Map iterator callback: delete the entry called @name from @map.
 * The type @t itself is not looked at.  Always returns true. */
static bool remove_from_map(const char *name,
                            struct cdump_type *t,
                            struct cdump_map *map)
{
        bool keep_going = true;

        strmap_del(map, name, NULL);
        return keep_going;
}
472
473 static void remove_undefined(struct cdump_map *map)
474 {
475         struct cdump_map undefs;
476
477         /* We can't delete inside iterator, so gather all the undefs
478          * then remove them. */
479         strmap_init(&undefs);
480
481         strmap_iterate(map, gather_undefines, &undefs);
482         strmap_iterate(&undefs, remove_from_map, map);
483         strmap_clear(&undefs);
484 }
485
486 static void destroy_definitions(struct cdump_definitions *defs)
487 {
488         strmap_clear(&defs->enums);
489         strmap_clear(&defs->structs);
490         strmap_clear(&defs->unions);
491 }
492
493 /* Simple LL(1) parser, inspired by Tridge's genstruct.pl. */
494 struct cdump_definitions *cdump_extract(const tal_t *ctx, const char *code,
495                                         char **complaints)
496 {
497         struct parse_state ps;
498         const struct token *toks;
499
500         ps.defs = tal(ctx, struct cdump_definitions);
501         ps.complaints = tal_strdup(ctx, "");
502         ps.code = code;
503
504         strmap_init(&ps.defs->enums);
505         strmap_init(&ps.defs->structs);
506         strmap_init(&ps.defs->unions);
507         tal_add_destructor(ps.defs, destroy_definitions);
508
509         toks = ps.toks = tokenize(ps.defs, code);
510         while (tok_peek(&ps.toks)) {
511                 if (tok_take_if(&ps.toks, "struct")) {
512                         if (!tok_take_conglom(&ps, CDUMP_STRUCT))
513                                 goto fail;
514                 } else if (tok_take_if(&ps.toks, "union")) {
515                         if (!tok_take_conglom(&ps, CDUMP_UNION))
516                                 goto fail;
517                 } else if (tok_take_if(&ps.toks, "enum")) {
518                         if (!tok_take_enum(&ps))
519                                 goto fail;
520                 } else
521                         tok_take_unknown_statement(&ps);
522         }
523
524         /* Now, remove any undefined types! */
525         remove_undefined(&ps.defs->enums);
526         remove_undefined(&ps.defs->structs);
527         remove_undefined(&ps.defs->unions);
528         tal_free(toks);
529
530 out:
531         if (streq(ps.complaints, ""))
532                 ps.complaints = tal_free(ps.complaints);
533
534         if (complaints)
535                 *complaints = ps.complaints;
536         else
537                 tal_free(ps.complaints);
538         return ps.defs;
539
540 fail:
541         ps.defs = tal_free(ps.defs);
542         goto out;
543 }