]> git.ozlabs.org Git - ccan/blob - ccan/cdump/cdump.c
cdump: handle multi-line preprocessor directives.
[ccan] / ccan / cdump / cdump.c
1 /* MIT (BSD) license - see LICENSE file for details */
2 #include "cdump.h"
3 #include <ccan/tal/str/str.h>
4 #include <assert.h>
5
/* One lexical token: a span inside the text being parsed, so it is not
 * NUL-terminated.  The array built by tokenize() ends with an entry
 * whose p is NULL and whose len is 0. */
struct token {
        /* First character of the token. */
        const char *p;
        /* Number of characters in the token. */
        size_t len;
};
10
11 static void add_token(struct token **toks, const char *p, size_t len)
12 {
13         size_t n = tal_count(*toks);
14         tal_resize(toks, n+1);
15         (*toks)[n].p = p;
16         (*toks)[n].len = len;
17 }
18
/*
 * Returns the number of characters from p up to, but not including, the
 * newline that ends the logical line (or up to the end of the string if
 * there is no such newline).  A newline directly preceded by a backslash
 * does not end the line: the following line is included as well.
 */
static size_t to_eol(const char *p)
{
        size_t len = strcspn(p, "\n");

        /* And any \ continuations.  Only look back when there is a
         * character before the newline inside this string: with
         * len == 0, p[len-1] would read the byte in front of p. */
        while (p[len] && len > 0 && p[len-1] == '\\')
                len += strcspn(p+len+1, "\n") + 1;
        return len;
}
28
29 /* Simplified tokenizer: comments and preproc directives removed,
30    identifiers are a token, others are single char tokens. */
31 static struct token *tokenize(const void *ctx, const char *code)
32 {
33         unsigned int i, len, tok_start = -1;
34         bool start_of_line = true;
35         struct token *toks = tal_arr(ctx, struct token, 0);
36
37         for (i = 0; code[i]; i += len) {
38                 if (code[i] == '#' && start_of_line) {
39                         /* Preprocessor line. */
40                         len = to_eol(code + i);
41                 } else if (code[i] == '/' && code[i+1] == '/') {
42                         /* One line comment. */
43                         len = to_eol(code + i);
44                         if (tok_start != -1U) {
45                                 add_token(&toks, code+tok_start, i - tok_start);
46                                 tok_start = -1U;
47                         }
48                 } else if (code[i] == '/' && code[i+1] == '*') {
49                         /* Multi-line comment. */
50                         const char *end = strstr(code+i+2, "*/");
51                         len = (end + 2) - (code + i);
52                         if (!end)
53                                 len = strlen(code + i);
54                         if (tok_start != -1U) {
55                                 add_token(&toks, code+tok_start, i - tok_start);
56                                 tok_start = -1U;
57                         }
58                 } else if (cisalnum(code[i]) || code[i] == '_') {
59                         /* Identifier or part thereof */
60                         if (tok_start == -1U)
61                                 tok_start = i;
62                         len = 1;
63                 } else if (!cisspace(code[i])) {
64                         /* Punctuation: treat as single char token. */
65                         if (tok_start != -1U) {
66                                 add_token(&toks, code+tok_start, i - tok_start);
67                                 tok_start = -1U;
68                         }
69                         add_token(&toks, code+i, 1);
70                         len = 1;
71                 } else {
72                         /* Whitespace. */
73                         if (tok_start != -1U) {
74                                 add_token(&toks, code+tok_start, i - tok_start);
75                                 tok_start = -1U;
76                         }
77                         len = 1;
78                 }
79                 if (code[i] == '\n')
80                         start_of_line = true;
81                 else if (!cisspace(code[i]))
82                         start_of_line = false;
83         }
84
85         /* Add terminating NULL. */
86         tal_resizez(&toks, tal_count(toks) + 1);
87         return toks;
88 }
89
/* State shared by the parsing functions while they walk the tokens. */
struct parse_state {
        /* The text being parsed; complain() scans it for line numbers. */
        const char *code;
        /* Current position in the token array. */
        const struct token *toks;
        /* Definitions gathered so far. */
        struct cdump_definitions *defs;
        /* Messages appended by complain(). */
        char *complaints;
};
96
97 static const struct token *tok_peek(const struct token **toks)
98 {
99         /* Ignore removed tokens (eg. comments) */
100         while (toks[0]->len == 0) {
101                 if (!toks[0]->p)
102                         return NULL;
103                 (*toks)++;
104         }
105         return toks[0];
106 }
107
108 static bool tok_is(const struct token **toks, const char *target)
109 {
110         const struct token *t = tok_peek(toks);
111         return (t && t->len == strlen(target)
112                 && memcmp(t->p, target, t->len) == 0);
113 }
114
115 static const struct token *tok_take(const struct token **toks)
116 {
117         const struct token *t = tok_peek(toks);
118         if (t)
119                 (*toks)++;
120
121         return t;
122 }
123
/* Consume and return the next token only if its text is @target;
 * otherwise leave it in place and return NULL. */
static const struct token *tok_take_if(const struct token **toks,
                                       const char *target)
{
        return tok_is(toks, target) ? tok_take(toks) : NULL;
}
131
132 static const char *tok_take_ident(const tal_t *ctx, const struct token **toks)
133 {
134         const struct token *t = tok_peek(toks);
135
136         if (!t)
137                 return NULL;
138
139         if (strspn(t->p, "_0123456789"
140                    "abcdefghijklmnopqrstuvwxyz"
141                    "ABCDEFGHIJKLMNOPQRSTUVWXYZ") < t->len)
142                 return NULL;
143
144         t = tok_take(toks);
145         return tal_strndup(ctx, t->p, t->len);
146 }
147
148 static char *string_of_toks(const tal_t *ctx,
149                             const struct token *first,
150                             const struct token *until)
151 {
152         char *str, *p;
153
154         /* Careful to skip erased tokens (eg. comments) */
155         str = p = tal_arr(ctx, char, until->p - first->p + 1);
156         while (first != until) {
157                 const struct token *next = first + 1;
158
159                 if (first->len) {
160                         memcpy(p, first->p, first->len);
161                         p += first->len;
162                         /* Insert space if they weren't adjacent, unless last */
163                         if (next != until) {
164                                 if (first->p + first->len != next->p)
165                                         *(p++) = ' ';
166                         }
167                 }
168                 first = next;
169         }
170         *p = '\0';
171
172         return str;
173 }
174
175 static char *tok_take_until(const tal_t *ctx,
176                             const struct token **toks,
177                             const char *delims)
178 {
179         const struct token *t, *start;
180
181         start = tok_peek(toks);
182         while ((t = tok_peek(toks)) != NULL) {
183                 /* If this contains a delimiter, copy up to prev token. */
184                 if (strcspn(t->p, delims) < t->len)
185                         return string_of_toks(ctx, start, t);
186                 tok_take(toks);
187         };
188
189         /* EOF without finding delimiter */
190         return NULL;
191 }
192
193 static bool type_defined(const struct cdump_type *t)
194 {
195         switch (t->kind) {
196         case CDUMP_STRUCT:
197         case CDUMP_UNION:
198                 return (t->u.members != NULL);
199         case CDUMP_ENUM:
200                 return (t->u.enum_vals != NULL);
201
202         /* These shouldn't happen; we don't try to define them. */
203         case CDUMP_UNKNOWN:
204         case CDUMP_ARRAY:
205         case CDUMP_POINTER:
206                 break;
207         }
208         abort();
209 }
210
211 /* May allocate a new type if not already found (steals @name) */
212 static struct cdump_type *get_type(struct cdump_definitions *defs,
213                                    enum cdump_type_kind kind,
214                                    const char *name)
215 {
216         struct cdump_map *m;
217         struct cdump_type *t;
218
219         switch (kind) {
220         case CDUMP_STRUCT:
221                 m = &defs->structs;
222                 break;
223         case CDUMP_UNION:
224                 m = &defs->unions;
225                 break;
226         case CDUMP_ENUM:
227                 m = &defs->enums;
228                 break;
229         case CDUMP_UNKNOWN:
230         case CDUMP_ARRAY:
231         case CDUMP_POINTER:
232                 m = NULL;
233         }
234
235         /* Do we already have it? */
236         if (m) {
237                 t = strmap_get(m, name);
238                 if (t)
239                         return t;
240         }
241
242         t = tal(defs, struct cdump_type);
243         t->kind = kind;
244         t->name = name ? tal_steal(t, name) : NULL;
245         /* These are actually the same, but be thorough */
246         t->u.members = NULL;
247         t->u.enum_vals = NULL;
248         if (m)
249                 strmap_add(m, t->name, t);
250
251         return t;
252 }
253
254 static void complain(struct parse_state *ps, const char *complaint)
255 {
256         unsigned int linenum;
257         const char *p = ps->code;
258
259         for (linenum = 1; p < ps->toks[0].p; linenum++) {
260                 p = strchr(p+1, '\n');
261                 if (!p)
262                         break;
263         }
264
265         tal_append_fmt(&ps->complaints,
266                        "Line %u: '%.*s': %s\n",
267                        linenum, (int)ps->toks[0].len,
268                        ps->toks[0].p, complaint);
269 }
270
271 static void tok_take_unknown_statement(struct parse_state *ps)
272 {
273         complain(ps, "Ignoring unknown statement until next semicolon");
274         tal_free(tok_take_until(NULL, &ps->toks, ";"));
275         tok_take_if(&ps->toks, ";");
276 }
277
278 static bool tok_take_expr(struct parse_state *ps, const char *term)
279 {
280         while (!tok_is(&ps->toks, term)) {
281                 if (tok_take_if(&ps->toks, "(")) {
282                         if (!tok_take_expr(ps, ")"))
283                                 return false;
284                 } else if (tok_take_if(&ps->toks, "[")) {
285                         if (!tok_take_expr(ps, "]"))
286                                 return false;
287                 } else if (!tok_take(&ps->toks))
288                         return false;
289         }
290         return tok_take(&ps->toks);
291 }
292
293 static char *tok_take_expr_str(const tal_t *ctx,
294                                struct parse_state *ps,
295                                const char *term)
296 {
297         const struct token *start = tok_peek(&ps->toks);
298
299         if (!tok_take_expr(ps, term))
300                 return NULL;
301
302         return string_of_toks(ctx, start, ps->toks - 1);
303 }
304
305 /* [ ... */
306 static bool tok_take_array(struct parse_state *ps, struct cdump_type **type)
307 {
308         /* This will be some arbitrary expression! */
309         struct cdump_type *arr = get_type(ps->defs, CDUMP_ARRAY, NULL);
310
311         arr->u.arr.size = tok_take_expr_str(arr, ps, "]");
312         if (!arr->u.arr.size) {
313                 complain(ps, "Could not find closing array size ]");
314                 return false;
315         }
316
317         arr->u.arr.type = *type;
318         *type = arr;
319
320         return true;
321 }
322
323 static struct cdump_type *ptr_of(struct parse_state *ps,
324                                  const struct cdump_type *ptr_to)
325 {
326         struct cdump_type *ptr = get_type(ps->defs, CDUMP_POINTER, NULL);
327         ptr->u.ptr = ptr_to;
328         return ptr;
329 }
330
331 static bool tok_take_type(struct parse_state *ps, struct cdump_type **type)
332 {
333         const char *name;
334         const struct token *types;
335         enum cdump_type_kind kind;
336
337         /* Ignoring weird typedefs, only these can be combined. */
338         types = ps->toks;
339         while (tok_take_if(&ps->toks, "int")
340                || tok_take_if(&ps->toks, "long")
341                || tok_take_if(&ps->toks, "short")
342                || tok_take_if(&ps->toks, "double")
343                || tok_take_if(&ps->toks, "float")
344                || tok_take_if(&ps->toks, "char")
345                || tok_take_if(&ps->toks, "signed")
346                || tok_take_if(&ps->toks, "unsigned"));
347
348         /* Did we get some? */
349         if (ps->toks != types) {
350                 name = string_of_toks(NULL, types, tok_peek(&ps->toks));
351                 kind = CDUMP_UNKNOWN;
352         } else {
353                 /* Try normal types (or simple typedefs, etc). */
354                 if (tok_take_if(&ps->toks, "struct")) {
355                         kind = CDUMP_STRUCT;
356                 } else if (tok_take_if(&ps->toks, "union")) {
357                         kind = CDUMP_UNION;
358                 } else if (tok_take_if(&ps->toks, "enum")) {
359                         kind = CDUMP_ENUM;
360                 } else
361                         kind = CDUMP_UNKNOWN;
362
363                 name = tok_take_ident(ps->defs, &ps->toks);
364                 if (!name) {
365                         complain(ps, "Invalid typename");
366                         return false;
367                 }
368         }
369
370         *type = get_type(ps->defs, kind, name);
371         return true;
372 }
373
374 /* CDUMP */
375 static bool tok_maybe_take_cdump_note(const tal_t *ctx,
376                                       struct parse_state *ps, const char **note)
377 {
378         *note = NULL;
379         if (tok_take_if(&ps->toks, "CDUMP")) {
380                 if (!tok_take_if(&ps->toks, "(")) {
381                         complain(ps, "Expected ( after CDUMP");
382                         return false;
383                 }
384                 *note = tok_take_expr_str(ctx, ps, ")");
385                 if (!*note) {
386                         complain(ps, "Expected ) after CDUMP(");
387                         return false;
388                 }
389         }
390         return true;
391 }
392
393 /* struct|union ... */
394 static bool tok_take_conglom(struct parse_state *ps,
395                              enum cdump_type_kind conglom_kind)
396 {
397         struct cdump_type *e;
398         const char *name;
399         size_t n;
400
401         assert(conglom_kind == CDUMP_STRUCT || conglom_kind == CDUMP_UNION);
402
403         name = tok_take_ident(ps->defs, &ps->toks);
404         if (!name) {
405                 complain(ps, "Invalid struct/union name");
406                 return false;
407         }
408
409         e = get_type(ps->defs, conglom_kind, name);
410         if (type_defined(e)) {
411                 complain(ps, "Type already defined");
412                 return false;
413         }
414
415         if (!tok_maybe_take_cdump_note(e, ps, &e->note))
416                 return false;
417
418         if (!tok_take_if(&ps->toks, "{")) {
419                 complain(ps, "Expected { for struct/union");
420                 return false;
421         }
422
423         e->u.members = tal_arr(e, struct cdump_member, n = 0);
424         while (!tok_is(&ps->toks, "}")) {
425                 struct cdump_type *basetype;
426                 const struct token *quals;
427                 unsigned int num_quals = 0;
428
429                 /* Anything can have these prepended. */
430                 quals = ps->toks;
431                 while (tok_take_if(&ps->toks, "const")
432                        || tok_take_if(&ps->toks, "volatile"))
433                         num_quals++;
434
435                 /* eg. "struct foo" or "varint_t" */
436                 if (!tok_take_type(ps, &basetype)) {
437                         complain(ps, "Expected typename inside struct/union");
438                         return false;
439                 }
440
441                 do {
442                         struct cdump_member *m;
443
444                         tal_resize(&e->u.members, n+1);
445                         m = &e->u.members[n++];
446                         m->type = basetype;
447                         if (num_quals) {
448                                 m->qualifiers
449                                         = string_of_toks(e, quals,
450                                                          quals + num_quals);
451                         } else
452                                 m->qualifiers = NULL;
453
454                         /* May have multiple asterisks. */
455                         while (tok_take_if(&ps->toks, "*"))
456                                 m->type = ptr_of(ps, m->type);
457
458                         m->name = tok_take_ident(e, &ps->toks);
459                         if (!m->name) {
460                                 complain(ps, "Expected name for member");
461                                 return false;
462                         }
463
464                         /* May be an array. */
465                         while (tok_take_if(&ps->toks, "[")) {
466                                 if (!tok_take_array(ps, &m->type))
467                                         return false;
468                         }
469
470                         /* CDUMP() */
471                         if (!tok_maybe_take_cdump_note(e->u.members,
472                                                        ps, &m->note))
473                                 return false;
474                 } while (tok_take_if(&ps->toks, ","));
475
476                 if (!tok_take_if(&ps->toks, ";")) {
477                         complain(ps, "Expected ; at end of member");
478                         return false;
479                 }
480         }
481
482         if (tok_take_if(&ps->toks, "}") && tok_take_if(&ps->toks, ";"))
483                 return true;
484         complain(ps, "Expected }; at end of struct/union");
485         return false;
486 }
487
488 /* enum ... */
489 static bool tok_take_enum(struct parse_state *ps)
490 {
491         size_t n = 0;
492         struct cdump_type *e;
493         const char *name;
494
495         name = tok_take_ident(ps->defs, &ps->toks);
496         if (!name) {
497                 complain(ps, "Expected enum name");
498                 return false;
499         }
500
501         e = get_type(ps->defs, CDUMP_ENUM, name);
502
503         /* Duplicate name? */
504         if (type_defined(e)) {
505                 complain(ps, "enum already defined");
506                 return false;
507         }
508
509         /* CDUMP() */
510         if (!tok_maybe_take_cdump_note(e, ps, &e->note))
511                 return false;
512
513         if (!tok_take_if(&ps->toks, "{")) {
514                 complain(ps, "Expected { after enum name");
515                 return false;
516         }
517
518         e->u.enum_vals = tal_arr(e, struct cdump_enum_val, n);
519         do {
520                 struct cdump_enum_val *v;
521
522                 /* GCC extension: comma and end of enum */
523                 if (tok_is(&ps->toks, "}"))
524                         break;
525
526                 tal_resize(&e->u.enum_vals, n+1);
527                 v = &e->u.enum_vals[n++];
528
529                 v->name = tok_take_ident(e, &ps->toks);
530                 if (!v->name) {
531                         complain(ps, "Expected enum value name");
532                         return false;
533                 }
534
535                 /* CDUMP() */
536                 if (!tok_maybe_take_cdump_note(e->u.enum_vals, ps, &v->note))
537                         return false;
538
539                 if (tok_take_if(&ps->toks, "=")) {
540                         v->value = tok_take_until(e, &ps->toks, ",}");
541                         if (!v->value) {
542                                 complain(ps, "Expected , or } to end value");
543                                 return false;
544                         }
545                 } else
546                         v->value = NULL;
547         } while (tok_take_if(&ps->toks, ","));
548
549         if (tok_take_if(&ps->toks, "}") && tok_take_if(&ps->toks, ";"))
550                 return true;
551
552         complain(ps, "Expected }; at end of enum");
553         return false;
554 }
555
/* strmap_iterate() callback: copies every type that has no definition
 * into @undefs.  Always returns true (presumably "keep iterating" --
 * confirm against strmap_iterate). */
static bool gather_undefines(const char *name,
                             struct cdump_type *t,
                             struct cdump_map *undefs)
{
        bool missing_body = !type_defined(t);

        if (missing_body)
                strmap_add(undefs, name, t);

        return true;
}
564
/* strmap_iterate() callback: deletes @name from @map.  The type @t is
 * not touched.  Always returns true. */
static bool remove_from_map(const char *name,
                            struct cdump_type *t,
                            struct cdump_map *map)
{
        /* The stored value is not wanted back, hence NULL. */
        strmap_del(map, name, NULL);

        return true;
}
572
573 static void remove_undefined(struct cdump_map *map)
574 {
575         struct cdump_map undefs;
576
577         /* We can't delete inside iterator, so gather all the undefs
578          * then remove them. */
579         strmap_init(&undefs);
580
581         strmap_iterate(map, gather_undefines, &undefs);
582         strmap_iterate(&undefs, remove_from_map, map);
583         strmap_clear(&undefs);
584 }
585
586 static void destroy_definitions(struct cdump_definitions *defs)
587 {
588         strmap_clear(&defs->enums);
589         strmap_clear(&defs->structs);
590         strmap_clear(&defs->unions);
591 }
592
593 /* Simple LL(1) parser, inspired by Tridge's genstruct.pl. */
594 struct cdump_definitions *cdump_extract(const tal_t *ctx, const char *code,
595                                         char **complaints)
596 {
597         struct parse_state ps;
598         const struct token *toks;
599
600         ps.defs = tal(ctx, struct cdump_definitions);
601         ps.complaints = tal_strdup(ctx, "");
602         ps.code = code;
603
604         strmap_init(&ps.defs->enums);
605         strmap_init(&ps.defs->structs);
606         strmap_init(&ps.defs->unions);
607         tal_add_destructor(ps.defs, destroy_definitions);
608
609         toks = ps.toks = tokenize(ps.defs, code);
610         while (tok_peek(&ps.toks)) {
611                 if (tok_take_if(&ps.toks, "struct")) {
612                         if (!tok_take_conglom(&ps, CDUMP_STRUCT))
613                                 goto fail;
614                 } else if (tok_take_if(&ps.toks, "union")) {
615                         if (!tok_take_conglom(&ps, CDUMP_UNION))
616                                 goto fail;
617                 } else if (tok_take_if(&ps.toks, "enum")) {
618                         if (!tok_take_enum(&ps))
619                                 goto fail;
620                 } else
621                         tok_take_unknown_statement(&ps);
622         }
623
624         /* Now, remove any undefined types! */
625         remove_undefined(&ps.defs->enums);
626         remove_undefined(&ps.defs->structs);
627         remove_undefined(&ps.defs->unions);
628         tal_free(toks);
629
630 out:
631         if (streq(ps.complaints, ""))
632                 ps.complaints = tal_free(ps.complaints);
633
634         if (complaints)
635                 *complaints = ps.complaints;
636         else
637                 tal_free(ps.complaints);
638         return ps.defs;
639
640 fail:
641         ps.defs = tal_free(ps.defs);
642         goto out;
643 }