git.ozlabs.org Git - ccan/blob - ccan/cdump/cdump.c
cdump: handle array sizes and comments better.
[ccan] / ccan / cdump / cdump.c
1 /* MIT (BSD) license - see LICENSE file for details */
2 #include "cdump.h"
3 #include <ccan/tal/str/str.h>
4 #include <assert.h>
5
/* One lexical token: a slice of the code being parsed (not NUL-terminated).
 * The token array ends with an all-zero entry (p == NULL, len == 0). */
struct token {
	/* First byte of the token inside the source text. */
	const char *p;
	/* Number of bytes in the token. */
	size_t len;
};
10
11 static void add_token(struct token **toks, const char *p, size_t len)
12 {
13         size_t n = tal_count(*toks);
14         tal_resize(toks, n+1);
15         (*toks)[n].p = p;
16         (*toks)[n].len = len;
17 }
18
19 /* Simplified tokenizer: comments and preproc directives removed,
20    identifiers are a token, others are single char tokens. */
21 static struct token *tokenize(const void *ctx, const char *code)
22 {
23         unsigned int i, len, tok_start = -1;
24         bool start_of_line = true;
25         struct token *toks = tal_arr(ctx, struct token, 0);
26
27         for (i = 0; code[i]; i += len) {
28                 if (code[i] == '#' && start_of_line) {
29                         /* Preprocessor line. */
30                         len = strcspn(code+i, "\n");
31                 } else if (code[i] == '/' && code[i+1] == '/') {
32                         /* One line comment. */
33                         len = strcspn(code+i, "\n");
34                         if (tok_start != -1U) {
35                                 add_token(&toks, code+tok_start, i - tok_start);
36                                 tok_start = -1U;
37                         }
38                 } else if (code[i] == '/' && code[i+1] == '*') {
39                         /* Multi-line comment. */
40                         const char *end = strstr(code+i+2, "*/");
41                         len = (end + 2) - (code + i);
42                         if (!end)
43                                 len = strlen(code + i);
44                         if (tok_start != -1U) {
45                                 add_token(&toks, code+tok_start, i - tok_start);
46                                 tok_start = -1U;
47                         }
48                 } else if (cisalnum(code[i]) || code[i] == '_') {
49                         /* Identifier or part thereof */
50                         if (tok_start == -1U)
51                                 tok_start = i;
52                         len = 1;
53                 } else if (!cisspace(code[i])) {
54                         /* Punctuation: treat as single char token. */
55                         if (tok_start != -1U) {
56                                 add_token(&toks, code+tok_start, i - tok_start);
57                                 tok_start = -1U;
58                         }
59                         add_token(&toks, code+i, 1);
60                         len = 1;
61                 } else {
62                         /* Whitespace. */
63                         if (tok_start != -1U) {
64                                 add_token(&toks, code+tok_start, i - tok_start);
65                                 tok_start = -1U;
66                         }
67                         len = 1;
68                 }
69                 if (code[i] == '\n')
70                         start_of_line = true;
71                 else if (!cisspace(code[i]))
72                         start_of_line = false;
73         }
74
75         /* Add terminating NULL. */
76         tal_resizez(&toks, tal_count(toks) + 1);
77         return toks;
78 }
79
/* State shared by the parsing helpers below. */
struct parse_state {
	/* Start of the source text; complain() uses it to work out line numbers. */
	const char *code;
	/* Current position in the token array; advanced as tokens are taken. */
	const struct token *toks;
	/* Definitions collected so far. */
	struct cdump_definitions *defs;
	/* Complaint text accumulated by complain(). */
	char *complaints;
};
86
87 static const struct token *tok_peek(const struct token **toks)
88 {
89         /* Ignore removed tokens (eg. comments) */
90         while (toks[0]->len == 0) {
91                 if (!toks[0]->p)
92                         return NULL;
93                 (*toks)++;
94         }
95         return toks[0];
96 }
97
98 static bool tok_is(const struct token **toks, const char *target)
99 {
100         const struct token *t = tok_peek(toks);
101         return (t && t->len == strlen(target)
102                 && memcmp(t->p, target, t->len) == 0);
103 }
104
105 static const struct token *tok_take(const struct token **toks)
106 {
107         const struct token *t = tok_peek(toks);
108         if (t)
109                 (*toks)++;
110
111         return t;
112 }
113
/* Consume the next token only if it spells @target; NULL otherwise. */
static const struct token *tok_take_if(const struct token **toks,
				       const char *target)
{
	return tok_is(toks, target) ? tok_take(toks) : NULL;
}
121
122 static const char *tok_take_ident(const tal_t *ctx, const struct token **toks)
123 {
124         const struct token *t = tok_peek(toks);
125
126         if (!t)
127                 return NULL;
128
129         if (strspn(t->p, "_0123456789"
130                    "abcdefghijklmnopqrstuvwxyz"
131                    "ABCDEFGHIJKLMNOPQRSTUVWXYZ") < t->len)
132                 return NULL;
133
134         t = tok_take(toks);
135         return tal_strndup(ctx, t->p, t->len);
136 }
137
138 static char *string_of_toks(const tal_t *ctx,
139                             const struct token *first,
140                             const struct token *until)
141 {
142         char *str, *p;
143
144         /* Careful to skip erased tokens (eg. comments) */
145         str = p = tal_arr(ctx, char, until->p - first->p + 1);
146         while (first != until) {
147                 const struct token *next = first + 1;
148
149                 if (first->len) {
150                         memcpy(p, first->p, first->len);
151                         p += first->len;
152                         /* Insert space if they weren't adjacent, unless last */
153                         if (next != until) {
154                                 if (first->p + first->len != next->p)
155                                         *(p++) = ' ';
156                         }
157                 }
158                 first = next;
159         }
160         *p = '\0';
161
162         return str;
163 }
164
165 static char *tok_take_until(const tal_t *ctx,
166                             const struct token **toks,
167                             const char *delims)
168 {
169         const struct token *t, *start;
170
171         start = tok_peek(toks);
172         while ((t = tok_peek(toks)) != NULL) {
173                 /* If this contains a delimiter, copy up to prev token. */
174                 if (strcspn(t->p, delims) < t->len)
175                         return string_of_toks(ctx, start, t);
176                 tok_take(toks);
177         };
178
179         /* EOF without finding delimiter */
180         return NULL;
181 }
182
183 static bool type_defined(const struct cdump_type *t)
184 {
185         switch (t->kind) {
186         case CDUMP_STRUCT:
187         case CDUMP_UNION:
188                 return (t->u.members != NULL);
189         case CDUMP_ENUM:
190                 return (t->u.enum_vals != NULL);
191
192         /* These shouldn't happen; we don't try to define them. */
193         case CDUMP_UNKNOWN:
194         case CDUMP_ARRAY:
195         case CDUMP_POINTER:
196                 break;
197         }
198         abort();
199 }
200
201 /* May allocate a new type if not already found (steals @name) */
202 static struct cdump_type *get_type(struct cdump_definitions *defs,
203                                    enum cdump_type_kind kind,
204                                    const char *name)
205 {
206         struct cdump_map *m;
207         struct cdump_type *t;
208
209         switch (kind) {
210         case CDUMP_STRUCT:
211                 m = &defs->structs;
212                 break;
213         case CDUMP_UNION:
214                 m = &defs->unions;
215                 break;
216         case CDUMP_ENUM:
217                 m = &defs->enums;
218                 break;
219         case CDUMP_UNKNOWN:
220         case CDUMP_ARRAY:
221         case CDUMP_POINTER:
222                 m = NULL;
223         }
224
225         /* Do we already have it? */
226         if (m) {
227                 t = strmap_get(m, name);
228                 if (t)
229                         return t;
230         }
231
232         t = tal(defs, struct cdump_type);
233         t->kind = kind;
234         t->name = name ? tal_steal(t, name) : NULL;
235         /* These are actually the same, but be thorough */
236         t->u.members = NULL;
237         t->u.enum_vals = NULL;
238         if (m)
239                 strmap_add(m, t->name, t);
240
241         return t;
242 }
243
244 static void complain(struct parse_state *ps, const char *complaint)
245 {
246         unsigned int linenum;
247         const char *p = ps->code;
248
249         for (linenum = 1; p < ps->toks[0].p; linenum++) {
250                 p = strchr(p+1, '\n');
251                 if (!p)
252                         break;
253         }
254
255         tal_append_fmt(&ps->complaints,
256                        "Line %u: '%.*s': %s\n",
257                        linenum, (int)ps->toks[0].len,
258                        ps->toks[0].p, complaint);
259 }
260
261 static void tok_take_unknown_statement(struct parse_state *ps)
262 {
263         complain(ps, "Ignoring unknown statement until next semicolon");
264         tal_free(tok_take_until(NULL, &ps->toks, ";"));
265         tok_take_if(&ps->toks, ";");
266 }
267
268 static bool tok_take_expr(struct parse_state *ps, const char *term)
269 {
270         while (!tok_is(&ps->toks, term)) {
271                 if (tok_take_if(&ps->toks, "(")) {
272                         if (!tok_take_expr(ps, ")"))
273                                 return false;
274                 } else if (tok_take_if(&ps->toks, "[")) {
275                         if (!tok_take_expr(ps, "]"))
276                                 return false;
277                 } else if (!tok_take(&ps->toks))
278                         return false;
279         }
280         return tok_take(&ps->toks);
281 }
282
283 /* [ ... */
284 static bool tok_take_array(struct parse_state *ps, struct cdump_type **type)
285 {
286         /* This will be some arbitrary expression! */
287         struct cdump_type *arr = get_type(ps->defs, CDUMP_ARRAY, NULL);
288         const struct token *start = tok_peek(&ps->toks);
289
290         if (!tok_take_expr(ps, "]")) {
291                 complain(ps, "Could not find closing array size ]");
292                 return false;
293         }
294
295         arr->u.arr.size = string_of_toks(arr, start, ps->toks - 1);
296         arr->u.arr.type = *type;
297         *type = arr;
298
299         return true;
300 }
301
302 static struct cdump_type *ptr_of(struct parse_state *ps,
303                                  const struct cdump_type *ptr_to)
304 {
305         struct cdump_type *ptr = get_type(ps->defs, CDUMP_POINTER, NULL);
306         ptr->u.ptr = ptr_to;
307         return ptr;
308 }
309
310 static bool tok_take_type(struct parse_state *ps, struct cdump_type **type)
311 {
312         const char *name;
313         const struct token *types;
314         enum cdump_type_kind kind;
315
316         /* Ignoring weird typedefs, only these can be combined. */
317         types = ps->toks;
318         while (tok_take_if(&ps->toks, "int")
319                || tok_take_if(&ps->toks, "long")
320                || tok_take_if(&ps->toks, "short")
321                || tok_take_if(&ps->toks, "double")
322                || tok_take_if(&ps->toks, "float")
323                || tok_take_if(&ps->toks, "char")
324                || tok_take_if(&ps->toks, "signed")
325                || tok_take_if(&ps->toks, "unsigned"));
326
327         /* Did we get some? */
328         if (ps->toks != types) {
329                 name = string_of_toks(NULL, types, tok_peek(&ps->toks));
330                 kind = CDUMP_UNKNOWN;
331         } else {
332                 /* Try normal types (or simple typedefs, etc). */
333                 if (tok_take_if(&ps->toks, "struct")) {
334                         kind = CDUMP_STRUCT;
335                 } else if (tok_take_if(&ps->toks, "union")) {
336                         kind = CDUMP_UNION;
337                 } else if (tok_take_if(&ps->toks, "enum")) {
338                         kind = CDUMP_ENUM;
339                 } else
340                         kind = CDUMP_UNKNOWN;
341
342                 name = tok_take_ident(ps->defs, &ps->toks);
343                 if (!name) {
344                         complain(ps, "Invalid typename");
345                         return false;
346                 }
347         }
348
349         *type = get_type(ps->defs, kind, name);
350         return true;
351 }
352
353 /* struct|union ... */
354 static bool tok_take_conglom(struct parse_state *ps,
355                              enum cdump_type_kind conglom_kind)
356 {
357         struct cdump_type *e;
358         const char *name;
359         size_t n;
360
361         assert(conglom_kind == CDUMP_STRUCT || conglom_kind == CDUMP_UNION);
362
363         name = tok_take_ident(ps->defs, &ps->toks);
364         if (!name) {
365                 complain(ps, "Invalid struct/union name");
366                 return false;
367         }
368
369         e = get_type(ps->defs, conglom_kind, name);
370         if (type_defined(e)) {
371                 complain(ps, "Type already defined");
372                 return false;
373         }
374
375         if (!tok_take_if(&ps->toks, "{")) {
376                 complain(ps, "Expected { for struct/union");
377                 return false;
378         }
379
380         e->u.members = tal_arr(e, struct cdump_member, n = 0);
381         while (!tok_is(&ps->toks, "}")) {
382                 struct cdump_type *basetype;
383                 const struct token *quals;
384                 unsigned int num_quals = 0;
385
386                 /* Anything can have these prepended. */
387                 quals = ps->toks;
388                 while (tok_take_if(&ps->toks, "const")
389                        || tok_take_if(&ps->toks, "volatile"))
390                         num_quals++;
391
392                 /* eg. "struct foo" or "varint_t" */
393                 if (!tok_take_type(ps, &basetype)) {
394                         complain(ps, "Expected typename inside struct/union");
395                         return false;
396                 }
397
398                 do {
399                         struct cdump_member *m;
400
401                         tal_resize(&e->u.members, n+1);
402                         m = &e->u.members[n++];
403                         m->type = basetype;
404                         if (num_quals) {
405                                 m->qualifiers
406                                         = string_of_toks(e, quals,
407                                                          quals + num_quals);
408                         } else
409                                 m->qualifiers = NULL;
410
411                         /* May have multiple asterisks. */
412                         while (tok_take_if(&ps->toks, "*"))
413                                 m->type = ptr_of(ps, m->type);
414
415                         m->name = tok_take_ident(e, &ps->toks);
416                         if (!m->name) {
417                                 complain(ps, "Expected name for member");
418                                 return false;
419                         }
420
421                         /* May be an array. */
422                         while (tok_take_if(&ps->toks, "[")) {
423                                 if (!tok_take_array(ps, &m->type))
424                                         return false;
425                         }
426                 } while (tok_take_if(&ps->toks, ","));
427
428                 if (!tok_take_if(&ps->toks, ";")) {
429                         complain(ps, "Expected ; at end of member");
430                         return false;
431                 }
432         }
433
434         if (tok_take_if(&ps->toks, "}") && tok_take_if(&ps->toks, ";"))
435                 return true;
436         complain(ps, "Expected }; at end of struct/union");
437         return false;
438 }
439
440 /* enum ... */
441 static bool tok_take_enum(struct parse_state *ps)
442 {
443         size_t n = 0;
444         struct cdump_type *e;
445         const char *name;
446
447         name = tok_take_ident(ps->defs, &ps->toks);
448         if (!name) {
449                 complain(ps, "Expected enum name");
450                 return false;
451         }
452
453         e = get_type(ps->defs, CDUMP_ENUM, name);
454
455         /* Duplicate name? */
456         if (type_defined(e)) {
457                 complain(ps, "enum already defined");
458                 return false;
459         }
460
461         if (!tok_take_if(&ps->toks, "{")) {
462                 complain(ps, "Expected { after enum name");
463                 return false;
464         }
465
466         e->u.enum_vals = tal_arr(e, struct cdump_enum_val, n);
467         do {
468                 struct cdump_enum_val *v;
469
470                 /* GCC extension: comma and end of enum */
471                 if (tok_is(&ps->toks, "}"))
472                         break;
473
474                 tal_resize(&e->u.enum_vals, n+1);
475                 v = &e->u.enum_vals[n++];
476
477                 v->name = tok_take_ident(e, &ps->toks);
478                 if (!v->name) {
479                         complain(ps, "Expected enum value name");
480                         return false;
481                 }
482                 if (tok_take_if(&ps->toks, "=")) {
483                         v->value = tok_take_until(e, &ps->toks, ",}");
484                         if (!v->value) {
485                                 complain(ps, "Expected , or } to end value");
486                                 return false;
487                         }
488                 } else
489                         v->value = NULL;
490         } while (tok_take_if(&ps->toks, ","));
491
492         if (tok_take_if(&ps->toks, "}") && tok_take_if(&ps->toks, ";"))
493                 return true;
494
495         complain(ps, "Expected }; at end of enum");
496         return false;
497 }
498
/* strmap_iterate() callback: add @t to @undefs if it was never given a
 * body.  Always returns true. */
static bool gather_undefines(const char *name,
			     struct cdump_type *t,
			     struct cdump_map *undefs)
{
	if (type_defined(t))
		return true;

	strmap_add(undefs, name, t);
	return true;
}
507
/* strmap_iterate() callback: delete the entry called @name from @map.
 * The type itself is not needed.  Always returns true. */
static bool remove_from_map(const char *name,
			    struct cdump_type *t,
			    struct cdump_map *map)
{
	(void)t;

	strmap_del(map, name, NULL);
	return true;
}
515
516 static void remove_undefined(struct cdump_map *map)
517 {
518         struct cdump_map undefs;
519
520         /* We can't delete inside iterator, so gather all the undefs
521          * then remove them. */
522         strmap_init(&undefs);
523
524         strmap_iterate(map, gather_undefines, &undefs);
525         strmap_iterate(&undefs, remove_from_map, map);
526         strmap_clear(&undefs);
527 }
528
529 static void destroy_definitions(struct cdump_definitions *defs)
530 {
531         strmap_clear(&defs->enums);
532         strmap_clear(&defs->structs);
533         strmap_clear(&defs->unions);
534 }
535
536 /* Simple LL(1) parser, inspired by Tridge's genstruct.pl. */
537 struct cdump_definitions *cdump_extract(const tal_t *ctx, const char *code,
538                                         char **complaints)
539 {
540         struct parse_state ps;
541         const struct token *toks;
542
543         ps.defs = tal(ctx, struct cdump_definitions);
544         ps.complaints = tal_strdup(ctx, "");
545         ps.code = code;
546
547         strmap_init(&ps.defs->enums);
548         strmap_init(&ps.defs->structs);
549         strmap_init(&ps.defs->unions);
550         tal_add_destructor(ps.defs, destroy_definitions);
551
552         toks = ps.toks = tokenize(ps.defs, code);
553         while (tok_peek(&ps.toks)) {
554                 if (tok_take_if(&ps.toks, "struct")) {
555                         if (!tok_take_conglom(&ps, CDUMP_STRUCT))
556                                 goto fail;
557                 } else if (tok_take_if(&ps.toks, "union")) {
558                         if (!tok_take_conglom(&ps, CDUMP_UNION))
559                                 goto fail;
560                 } else if (tok_take_if(&ps.toks, "enum")) {
561                         if (!tok_take_enum(&ps))
562                                 goto fail;
563                 } else
564                         tok_take_unknown_statement(&ps);
565         }
566
567         /* Now, remove any undefined types! */
568         remove_undefined(&ps.defs->enums);
569         remove_undefined(&ps.defs->structs);
570         remove_undefined(&ps.defs->unions);
571         tal_free(toks);
572
573 out:
574         if (streq(ps.complaints, ""))
575                 ps.complaints = tal_free(ps.complaints);
576
577         if (complaints)
578                 *complaints = ps.complaints;
579         else
580                 tal_free(ps.complaints);
581         return ps.defs;
582
583 fail:
584         ps.defs = tal_free(ps.defs);
585         goto out;
586 }