commit - e46cabf4dabf47bee184092e4cdb372e9076f3ca
commit + 9d2eb8f8b610e4c920b0c463618df5452e4641c6
blob - bcb4b3f4cd14e31b2406f1b529508c88664fbbbc
blob + 0a017a655ec96e3cbcddaa9bd43b6fab9b3eaee6
--- Makefile
+++ Makefile
CFLAGS += -std=c99 -Wall
-TARGETS = ini_r ini_a ini_m ini_n ini_j
+TARGETS = ini_r ini_a ini_m ini_n ini_j ini_b
all: $(TARGETS)
ini_m : ini_m.c minip_m.h
ini_n : ini_n.c minip_n.h
ini_j : ini_j.c minip_n.h
+ini_b : ini_b.c minip_b.h
clean:
./ini_m test.ini | grep -v ' [01] byte '
./ini_n test.ini | grep -v ' [01] byte '
./ini_j test.ini 2>/dev/null
+ ./ini_b test.ini 2>/dev/null
.PHONY: all test clean
blob - /dev/null
blob + 5b5687f09b7c9624703a8562fb92434a01c6059f (mode 644)
--- /dev/null
+++ ini_b.c
+/*
+ * demo: (simple) ini files, converted to json
+ *
+ * EBNFish:
+ *
+ * inifile = {sect} {empty} {ws} [tail]
+ * sect = header {entry}
+ * tail = "EOF" {ws} eol {any}
+ *
+ * header = {empty} bra sname ket eol
+ * entry = {empty} key "=" value eol
+ * empty = {ws} [comment] nl
+ *
+ * (* tokens *)
+ * eol = nl | end
+ * bra = {ws} "["
+ * ket = "]" {ws}
+ * comment = ";" {lchar}
+ * sname = {lchar - "]"}+
+ * key = {lchar - "="}+
+ * value = {lchar}+
+ *
+ * (* character classes *)
+ * lchar = ? ASCII codes 9 (HT), 0x20-0x7e (SP, alnum, punct) ?
+ * ws = ? ASCII codes 32 (SP), 9 (HT) ?
+ * nl = ? ASCII code 10 (LF) ?
+ * end = ? end of input ?
+ * any = ? any character ?
+ *
+ * converted to be compatible with our combinators:
+ *
+ * inifile = sects empties wss tail
+ * sects = {header entries}
+ * tail = [eof wss eol anys]
+ *
+ * header = empties bra sname ket eol
+ * entries = {empties key eq value eol}
+ * empties = {wss comment nl}
+ *
+ * eol = nl | end
+ * bra = wss leftbr
+ * ket = rightbr wss
+ * comment = [semi lchars]
+ * sname = {schar}+
+ * key = {kchar}+
+ * value = {lchar}+
+ *
+ * eof = "EOF"
+ * eq = "="
+ * semi = ";"
+ * leftbr = "["
+ * rightbr = "]"
+ *
+ * lchars = {lchar}
+ * wss = {ws}
+ * anys = {any}
+ *
+ * lchar = ? ASCII codes 9 (HT), 0x20-0x7e (SP, alnum, punct) ?
+ * schar = ? ASCII codes 9, 0x20-0x5c, 0x5e-0x7e ?
+ * kchar = ? ASCII codes 9, 0x20-0x3c, 0x3e-0x7e ?
+ * ws = ? ASCII codes 32 (SP), 9 (HT) ?
+ * nl = ? ASCII code 10 (LF) ?
+ * end = ? end of input ?
+ * any = ? any character ?
+ *
+ * note how right-hand sides have one of the following forms:
+ *
+ * - a sequence of nonterminals (SEQ)
+ * - a sequence of nonterminals inside { } (MANY)
+ * - a sequence of nonterminals inside { }+ (MANY1)
+ * - a sequence of nonterminals inside [ ] (OPT)
+ * - a choice of nonterminals (CHOICE)
+ * - a string of characters (STRING)
+ * - a single character (CHAR)
+ *
+ * finally, we can realize the "special sequences" (? ... ?) using END,
+ * ANYCHAR, RANGE, and expression choice ||.
+ */
+
+#include "minip_b.h"
+#include <stdio.h>
+
+action trace, print, print_string;
+#define DEF_(NT, EXPR) DEF(NT, (EXPR) && ACTION(trace, #NT))
+
+DEF (lchar, CHAR('\t') || RANGE(0x20, 0x7e))
+DEF (schar, CHAR('\t') || RANGE(0x20, 0x5c) || RANGE(0x5e, 0x7e))
+ /* = !CHAR(']') && SEQ(lchar) */
+DEF (kchar, CHAR('\t') || RANGE(0x20, 0x3c) || RANGE(0x3e, 0x7e))
+ /* = !CHAR('=') && SEQ(lchar) */
+DEF (ws, CHAR('\t') || CHAR(' '))
+DEF_(nl, CHAR('\n'))
+DEF_(end, END)
+DEF_(any, ANYCHAR)
+
+DEF_(lchars, MANY(lchar))
+DEF_(wss, MANY(ws))
+DEF_(anys, MANY(any)) /* = OMEGA */
+
+DEF_(eof, STRING("EOF"))
+DEF_(eq, CHAR('=')
+ && ACTION(print, ": "))
+DEF_(semi, CHAR(';'))
+DEF_(leftbr, CHAR('['))
+DEF_(rightbr, CHAR(']'))
+
+DEF_(eol, CHOICE(nl, end))
+DEF_(bra, SEQ(wss, leftbr))
+DEF_(ket, SEQ(rightbr, wss))
+DEF_(comment, SEQ(semi, lchars))
+DEF (comment_o, OPT(comment))
+DEF_(sname, MANY1(schar)
+ && ACTION(print_string, " "))
+DEF_(key, MANY1(kchar)
+ && ACTION(print_string, " "))
+DEF_(value, MANY1(lchar)
+ && ACTION(print_string, ""))
+
+DEF_(out_comma, ACTION(print, ",\n"))
+
+DEF_(empty, SEQ(wss, comment_o, nl))
+DEF_(empties, MANY(empty))
+DEF_(header, SEQ(empties, bra, sname, ket, eol)
+ && ACTION(print, ": {\n"))
+DEF_(entry, SEQ(empties, key, eq, value, eol))
+DEF (entry_x, SEQ(out_comma, entry))
+DEF (entries_x, MANY(entry_x))
+DEF_(entries, SEQ(entry, entries_x)
+ && ACTION(print, "\n }"))
+
+DEF_(tail, SEQ(eof, wss, eol, anys))
+DEF (tail_o, OPT(tail))
+DEF_(sect, SEQ(header, entries))
+// (A)
+DEF (sect_x, SEQ(out_comma, sect))
+DEF (sects_x, MANY(sect_x))
+DEF (sects1, SEQ(sect, sects_x))
+// (B)
+//static parser sects2;
+//DEF (sect_x, SEQ(out_comma, sect))
+//DEF (sects3, SEQ(sect_x, sects2))
+//DEF (sects2, OPT(sects3))
+//DEF (sects1, SEQ(sect, sects2))
+// (C)
+//static parser sects1;
+//DEF (sects2, SEQ(sect, out_comma, sects1)) /* if only tail-recursive */
+//DEF (sects1, CHOICE(sects2, sect))
+DEF_(sects, ACTION(print, "{\n") &&
+ OPT(sects1)
+ && ACTION(print, "\n}\n"))
+DEF_(inifile, TRY(sects, empties, wss, tail_o))
+
+bool
+trace(void *ctx, void *env, const char *s, size_t len)
+{
+ const char *nt = ctx;
+ const char *begin = env;
+ size_t pos;
+
+ pos = s - begin;
+ fprintf(stderr, "%4zx: %4zu byte %s\n", pos, len, nt);
+ return true;
+}
+
+bool
+print(void *ctx, void *env, const char *s, size_t len)
+{
+ fputs(ctx, stdout);
+ return true;
+}
+
+bool
+print_string(void *ctx, void *env, const char *s, size_t len)
+{
+ size_t i;
+ unsigned char c;
+
+ fputs(ctx, stdout); /* prefix string */
+
+ putchar('"');
+ for (i = 0; i < len; i++) {
+ c = (unsigned char)s[i];
+ assert(c <= 127); // XXX only ASCII
+ if (c == '"')
+ fputs("\\\"", stdout);
+ else if (c == '\\')
+ fputs("\\\\", stdout);
+ else if (c == '\t')
+ fputs("\\t", stdout);
+ else if (c <= 0x1F || c == 0x7F) /* i.e. iscntrl */
+ printf("\\u00%.2hhx", c);
+ else
+ putchar(c);
+ }
+ putchar('"');
+
+ return true;
+}
+
+
+#include <stdio.h>
+#include <stdlib.h> /* calloc */
+#include <fcntl.h> /* open, lseek */
+#include <sys/mman.h> /* mmap */
+#include <err.h>
+#include <assert.h>
+
+extern char *__progname;
+
+/*
+ * run the 'inifile' parser on a file given on the command line.
+ */
+int
+main(int argc, char *argv[])
+{
+ struct cache cache = {};
+ const char *infile;
+ const char *input;
+ int fd;
+ off_t o;
+ size_t sz, pos;
+
+ if (argc < 2) {
+ fprintf(stderr, "usage: %s file\n", __progname);
+ return 3;
+ }
+ infile = argv[1];
+
+ /* mmap input */
+ if ((fd = open(infile, O_RDONLY)) == -1)
+ err(2, "%s", infile);
+ if ((o = lseek(fd, 0, SEEK_END)) == -1)
+ err(2, "lseek");
+ sz = o;
+ input = mmap(NULL, sz, PROT_READ, MAP_PRIVATE, fd, 0);
+ if (input == MAP_FAILED)
+ err(2, "mmap");
+
+ /* result cache */
+ assert(cache.table == NULL);
+ assert(cache.capacity == 0);
+ assert(cache.nused == 0);
+
+ /* run parser */
+ struct stream inp = {input, input + sz};
+ if (!inifile(&inp, &cache, (void *)input)) {
+ assert(inp.p == input);
+ fprintf(stderr, "%s: syntax error\n", infile);
+ return 1;
+ }
+ assert(inp.p > input);
+ assert(inp.p <= input + sz);
+ pos = inp.p - input;
+ if (pos < sz) {
+ fprintf(stderr, "%s: syntax error (after pos. %zu/%zu)\n",
+ infile, pos, sz);
+ return 1;
+ }
+ fprintf(stderr, "success (consumed %zu/%zu bytes of input)\n", pos, sz);
+
+ return 0;
+}
blob - /dev/null
blob + ff87dfcd6b28f1ac929eb32f8b579236ca0125de (mode 644)
--- /dev/null
+++ minip_b.h
+/* miniature recursive-descent parser combinators
+ * with semantic actions and omnibus negative memoization
+ * employing boolean return type and mutable input pointer
+ *
+ * pesco 2019, 2020
+ */
+#ifndef MINIP_N_H_
+#define MINIP_N_H_
+
+#include <stddef.h> /* NULL */
+#include <stdbool.h>
+
+struct stream {const char *p, *end;};
+struct cache;
+
+/* parser type -- corresponds to grammar nonterminals */
+typedef bool parser(struct stream *, struct cache *, void *);
+
+/* define a parser (nonterminal) */
+#define DEF(NT, EXPR) \
+ static bool \
+ NT(struct stream *inp_, struct cache *cac_, void *env_) \
+ { \
+ const char *pos_ = inp_->p; \
+ struct memento *mem_; \
+ \
+ /* look up (pos,sym) in cache and return if found */ \
+ if ((mem_ = lookup(cac_, pos_, __func__)) != NULL) { \
+ if (mem_->value == NULL) \
+ return false; \
+ inp_->p = mem_->value; \
+ return true; \
+ } \
+ \
+ /* else run the expression and return if it matched */ \
+ if (EXPR) \
+ return true; \
+ \
+ /* else, return and memoize a negative result */ \
+ store(cac_, pos_, __func__, NULL); \
+ return false; \
+ }
+
+/* primitive expressions */
+#define ANYCHAR anychar_(inp_)
+#define CHAR(C) char_(inp_, (C))
+#define RANGE(M, N) range_(inp_, (M), (N))
+#define STRING(S) string_(inp_, (S))
+#define END end_(inp_)
+#define EPSILON true
+#define OMEGA (inp_->p = inp_->end, true)
+
+/*
+ * logical operators allowed on expressions *sans* actions:
+ *
+ * && restriction (lookahead); rhs is consumed
+ * || ordered choice
+ * ! negation; only allowed to the left of && (to avoid consumption)
+ */
+
+/* variable-length argument lists */
+#define NARGS(T, ...) (sizeof (T[]){__VA_ARGS__} / sizeof(T))
+#define VARGS(T, ...) (T[]){__VA_ARGS__}, NARGS(T, __VA_ARGS__)
+#define PARGS(...) VARGS(parser *, __VA_ARGS__)
+
+/* basic combinators -- take a list of nonterminals as arguments */
+#define CHOICE(...) choice_(inp_, cac_, env_, PARGS(__VA_ARGS__))
+#define SEQ(...) seq_(inp_, cac_, env_, PARGS(__VA_ARGS__))
+
+/* secondary combinators -- could be derived from the basics */
+#define MANY(X) many_(inp_, cac_, env_, X)
+#define MANY1(X) many1_(inp_, cac_, env_, X)
+#define OPT(X) ((X)(inp_, cac_, env_), true)
+
+/* NB:
+ * because memoization only applies to nonterminals and the above combinators
+ * (MANY, MANY1, OPT) form implicit choices with them, we can't use variable
+ * argument lists that form an implicit SEQ here. otherwise, since those
+ * anonymous argument sequences would not be memoized (memoization only happens
+ * on actual nonterminals), actions could be executed in them, even if the
+ * sequence failed to match in a later element.
+ */
+
+/*
+ * semantic actions:
+ *
+ * - attach to an expression with &&: EXPR && ACTION(f, ctx)
+ * - receive an arbitrary context parameter via ACTION()'s second argument.
+ * - receive an arbitrary environment pointer from the top-level parser call.
+ * - execute (only) after their subordinate parser(s) matched.
+ * - can also act as validations by returning the intended result.
+ *
+ * Thanks to prerecognition and negative memoization, it is also possible to
+ * attach actions before the expression. E.g.:
+ *
+ * ACTION(print, "[") && EXPR && ACTION(print, "]")
+ */
+typedef bool action(void *, void *, const char *, size_t);
+#define ACTION(F, C) (env_ == NULL || (F)((C), env_, pos_, inp_->p - pos_))
+
+
+/*
+ * to avoid premature execution of actions, TRY(...) can be used in place of
+ * SEQ on any sequence where actions could execute before a later nonterminal
+ * fails. note:
+ *
+ * - single-element sequences do not need TRY.
+ * - multi-argument calls of MANY, MANY1, OPT may need to be replaced.
+ * - OPT(...) can be replaced with TRY(...) || EPSILON.
+ *
+ * in any case, beware of the overhead. since TRY executes the sequence twice,
+ * it can easily lead to a substantial increase in recognition time.
+ */
+#define MATCH(...) (seq_(inp_, cac_, NULL, PARGS(__VA_ARGS__)) && (inp_->p = pos_))
+#define TRY(...) ((env_ == NULL || MATCH(__VA_ARGS__)) && SEQ(__VA_ARGS__))
+
+
+/*
+ * general memoization of parse results (~ packrat parsing)
+ *
+ * handles the positive cases, but only when actions are enabled, i.e. env
+ * pointer is non-null. otherwise, actions would be lost - cf. TRY().
+ *
+ * NB: negative results are always cached - see DEF().
+ *
+ * usage: DEF(nt, EXPR && MEMO)
+ */
+#define MEMO (env_ == NULL || store(cac_, pos_, __func__, inp_->p))
+
+/* an entry in the memoization hash table */
+struct memento {
+ const void *key; /* NULL: unused entry */
+ const void *key2; /* NULL: dummy entry -- may be reused */
+ const void *value;
+};
+
+struct cache {
+ struct memento *table;
+ size_t capacity, nused;
+};
+
+
+#include <assert.h>
+
+static inline bool
+anychar_(struct stream *inp)
+{
+ if (inp->p < inp->end)
+ return (inp->p++, true);
+ else
+ return false;
+}
+
+static inline bool
+char_(struct stream *inp, char c)
+{
+ if (inp->p < inp->end && *inp->p == c)
+ return (inp->p++, true);
+ else
+ return false;
+}
+
+static inline bool
+range_(struct stream *inp, unsigned char m, unsigned char n)
+{
+ unsigned char b = *(unsigned char *)inp->p;
+
+ if (inp->p < inp->end && b >= m && b <= n)
+ return (inp->p++, true);
+ else
+ return false;
+}
+
+static inline bool
+string_(struct stream *inp, const char *s)
+{
+ const char *p = inp->p;
+
+ while (p < inp->end && *s != '\0' && *p == *s) {
+ p++;
+ s++;
+ }
+ if (*s == '\0') /* did we match the full string? */
+ return (inp->p = p, true);
+ else
+ return false; /* inp->p remains untouched */
+}
+
+static inline bool
+end_(struct stream *inp)
+{
+ if (inp->p >= inp->end)
+ return (inp->p = inp->end, true);
+ else
+ return false;
+}
+
+static inline bool
+seq_(struct stream *inp, struct cache *cac, void *env, parser *ps[], size_t n)
+{
+ const char *bak = inp->p;
+
+ for (size_t i=0; i<n; i++) {
+ if (!ps[i](inp, cac, env)) /* no match */
+ return (inp->p = bak, false);
+ }
+
+ return true;
+}
+
+static inline bool
+choice_(struct stream *inp, struct cache *cac, void *env, parser *ps[], size_t n)
+{
+ const char *bak = inp->p;
+
+ for (size_t i=0; i<n; i++) {
+ if (ps[i](inp, cac, env)) /* match */
+ return true;
+ assert(inp->p == bak);
+ }
+
+ return false;
+}
+
+static inline bool
+many_(struct stream *inp, struct cache *cac, void *env, parser *pa)
+{
+ const char *bak;
+
+ for (;;) {
+ bak = inp->p;
+ if (!pa(inp, cac, env))
+ return (inp->p = bak, true);
+ assert(inp->p > bak); /* progress */
+ }
+ /* not reached */
+}
+
+static inline bool
+many1_(struct stream *inp, struct cache *cac, void *env, parser *pa)
+{
+ const char *bak = inp->p;
+
+ if (!pa(inp, cac, env))
+ return (inp->p = bak, false);
+ return many_(inp, cac, env, pa);
+}
+
+
+#include <stdlib.h> /* calloc */
+#include <limits.h> /* UINT_MAX */
+#include <stdint.h> /* SIZE_MAX */
+
+#include <stdio.h> // XXX debug
+
+static inline unsigned int
+hash(const void *key, const void *key2)
+{
+ unsigned int h = 5831; /* djbhash; f(h,x) = 33h + x */
+ size_t i;
+
+ for (i=0; i < sizeof key; i++)
+ h = h * 33 + ((unsigned char *)&key)[i];
+ for (i=0; i < sizeof key2; i++)
+ h = h * 33 + ((unsigned char *)&key2)[i];
+
+ return h;
+}
+
+/* NB:
+ * the hash math is modulo cache->capacity, but when that is a power of
+ * two, we can run the intermediates modulo UINT_MAX+1.
+ */
+
+static inline struct memento *
+lookup(const struct cache *cache, const void *key, const void *key2)
+{
+ struct memento *mem;
+ unsigned int h, s;
+ size_t i;
+
+ if (cache == NULL)
+ return NULL;
+
+ h = hash(key, key2);
+
+ /* NB: count steps, since the table could be filled with dummies */
+ for (i=0, s=1; i < cache->capacity; i++, h+=s, s*=5) {
+ mem = cache->table + (h % cache->capacity);
+ if (mem->key == NULL) /* not in table */
+ break;
+ if (mem->key2 == NULL) /* dummy entry */
+ continue;
+ if (mem->key == key && mem->key2 == key2)
+ return mem; /* match */
+ }
+
+ return NULL;
+}
+
+static inline void
+grow(struct cache *cache)
+{
+ struct memento *mem, *newp;
+ unsigned int h, s;
+ size_t i, newcap;
+
+ /* realloc XXX nicer err handling */
+ assert(cache->capacity <= UINT_MAX / 2);
+ assert(cache->capacity <= SIZE_MAX / sizeof *cache->table);
+ if (cache->capacity == 0) /* initial alloc */
+ newcap = 4;
+ else
+ newcap = cache->capacity * 2;
+#ifdef PROFGROW
+ size_t count=0, count2=0;
+ fprintf(stderr, "\tgrow at %zu/%zu -> newcap %zu\n",
+ cache->nused, cache->capacity, newcap);
+#endif
+ newp = calloc(newcap, sizeof *cache->table);
+ assert(newp != NULL);
+
+ /* relocate items */
+ for (i=0; i < cache->capacity; i++) {
+ const void *k = cache->table[i].key;
+ const char *k2 = cache->table[i].key2;
+
+ if (k == NULL || k2 == NULL) /* unused or dummy */
+ continue;
+
+ /* find the correct place for (k,k2) in the new table */
+ for (h = hash(k, k2), s=1; ; h+=s, s*=5) {
+ h %= newcap;
+ mem = newp + h;
+ if (mem->key == NULL) /* unused */
+ break;
+ }
+#ifdef PROFGROW
+ if (s != 1)
+ count2++;
+ if (h != i)
+ count++;
+#endif
+
+ /* copy the entry to the new array */
+ *mem = cache->table[i];
+ }
+#ifdef PROFGROW
+ fprintf(stderr, "\tpos. changed on %zu/%zu entries = %.1f%%\n",
+ count, cache->nused, count * 100.0 / cache->nused);
+ fprintf(stderr, "\tnow %zu/%zu are off-base, i.e. %.1f%%\n",
+ count2, cache->nused, count2 * 100.0 / cache->nused);
+#endif
+
+ free(cache->table);
+ cache->table = newp;
+ cache->capacity = newcap;
+}
+
+static inline const void *
+store(struct cache *cache, const void *key, const void *key2, const void *val)
+{
+ struct memento *mem;
+ unsigned int h, s;
+
+ if (cache == NULL)
+ return val;
+
+ /* grow hashtable if necessary */
+ if (cache->nused >= cache->capacity * 9 / 10)
+ grow(cache);
+
+ /* find our place in the table */
+ /* NB: loop will terminate because dummies count as free here */
+ h = hash(key, key2);
+ assert(cache->nused < cache->capacity);
+ for (s=1; ; h+=s, s*=5) {
+ mem = cache->table + (h % cache->capacity);
+ if (mem->key == NULL) /* unused entry */
+ break;
+ if (mem->key2 == NULL) /* dummy entry */
+ break; /* reuse! */
+ }
+
+ /* save the result */
+ assert(mem->key == NULL || mem->key2 == NULL);
+ mem->key = key;
+ mem->key2 = key2;
+ mem->value = val;
+ cache->nused++;
+
+ return val;
+}
+
+#endif /* MINIP_N_H_ */