commit - 6bc16f3a37132ac01dc7e6c95b3397b27f78a02d
commit + 495a6e54da5bacc0cc7de703543c5ff65abfc44f
blob - f2f7833d69da7a62ef7d5fffc139278b2735576d
blob + 17edf08425ceb13ebd2f487bcfd80cdf4816b93d
--- Makefile
+++ Makefile
CFLAGS += -std=c99 -Wall
-TARGETS = ini_r ini_a ini_m
+TARGETS = ini_r ini_a ini_m ini_n
all: $(TARGETS)
ini_r : ini_r.c minip_r.h
ini_a : ini_a.c minip_a.h
ini_m : ini_m.c minip_m.h
+ini_n : ini_n.c minip_n.h
clean:
./ini_r test.ini
./ini_a test.ini | grep -v ' [01] byte '
./ini_m test.ini | grep -v ' [01] byte '
+ ./ini_n test.ini | grep -v ' [01] byte '
.PHONY: all test clean
blob - /dev/null
blob + 3c0e10b3e2b91d31bda7df666cbfbbfb056092eb (mode 644)
--- /dev/null
+++ ini_n.c
+/*
+ * demo: (simple) ini files
+ *
+ * EBNFish:
+ *
+ * inifile = {sect} {empty} {ws} [tail]
+ * sect = header {entry}
+ * tail = "EOF" {ws} eol {any}
+ *
+ * header = {empty} bra sname ket eol
+ * entry = {empty} key "=" value eol
+ * empty = {ws} [comment] nl
+ *
+ * (* tokens *)
+ * eol = nl | end
+ * bra = {ws} "["
+ * ket = "]" {ws}
+ * comment = ";" {lchar}
+ * sname = {lchar - "]"}+
+ * key = {lchar - "="}+
+ * value = {lchar}+
+ *
+ * (* character classes *)
+ * lchar = ? ASCII codes 9 (HT), 0x20-0x7e (SP, alnum, punct) ?
+ * ws = ? ASCII codes 32 (SP), 9 (HT) ?
+ * nl = ? ASCII code 10 (LF) ?
+ * end = ? end of input ?
+ * any = ? any character ?
+ *
+ * converted to be compatible with our combinators:
+ *
+ * inifile = sects empties wss tail
+ * sects = {header entries}
+ * tail = [eof wss eol anys]
+ *
+ * header = empties bra sname ket eol
+ * entries = {empties key eq value eol}
+ * empties = {wss comment nl}
+ *
+ * eol = nl | end
+ * bra = wss leftbr
+ * ket = rightbr wss
+ * comment = [semi lchars]
+ * sname = {schar}+
+ * key = {kchar}+
+ * value = {lchar}+
+ *
+ * eof = "EOF"
+ * eq = "="
+ * semi = ";"
+ * leftbr = "["
+ * rightbr = "]"
+ *
+ * lchars = {lchar}
+ * wss = {ws}
+ * anys = {any}
+ *
+ * lchar = ? ASCII codes 9 (HT), 0x20-0x7e (SP, alnum, punct) ?
+ * schar = ? ASCII codes 9, 0x20-0x5c, 0x5e-0x7e ?
+ * kchar = ? ASCII codes 9, 0x20-0x3c, 0x3e-0x7e ?
+ * ws = ? ASCII codes 32 (SP), 9 (HT) ?
+ * nl = ? ASCII code 10 (LF) ?
+ * end = ? end of input ?
+ * any = ? any character ?
+ *
+ * note how right-hand sides have one of the following forms:
+ *
+ * - a sequence of nonterminals (SEQ)
+ * - a sequence of nonterminals inside { } (MANY)
+ * - a sequence of nonterminals inside { }+ (MANY1)
+ * - a sequence of nonterminals inside [ ] (OPT)
+ * - a choice of nonterminals (CHOICE)
+ * - a string of characters (STRING)
+ * - a single character (CHAR)
+ *
+ * finally, we can realize the "special sequences" (? ... ?) using END,
+ * ANYCHAR, RANGE, and expression choice ||.
+ */
+
+#include "minip_n.h"
+#include <stdio.h>
+
+action trace;
+#define DEF_(NT, EXPR) DEF(NT, (EXPR) && ACTION(trace, #NT))
+
+DEF_(lchar, CHAR('\t') || RANGE(0x20, 0x7e))
+DEF_(schar, CHAR('\t') || RANGE(0x20, 0x5c) || RANGE(0x5e, 0x7e))
+ /* = !CHAR(']') && SEQ(lchar) */
+DEF_(kchar, !CHAR('=') && SEQ(lchar))
+DEF_(ws, CHAR('\t') || CHAR(' '))
+DEF_(nl, CHAR('\n'))
+DEF_(end, END)
+DEF_(any, ANYCHAR)
+
+DEF_(lchars, MANY(lchar))
+DEF_(wss, MANY(ws))
+DEF_(anys, MANY(any)) /* = OMEGA */
+
+DEF_(eof, STRING("EOF"))
+DEF_(eq, CHAR('='))
+DEF_(semi, CHAR(';'))
+DEF_(leftbr, CHAR('['))
+DEF_(rightbr, CHAR(']'))
+
+DEF_(eol, CHOICE(nl, end))
+DEF_(bra, MEMO(wss, leftbr))
+DEF_(ket, SEQ(rightbr, wss))
+DEF_(comment, OPT(semi, lchars))
+DEF_(sname, MANY1(schar))
+DEF_(key, MANY1(kchar))
+DEF_(value, MANY1(lchar))
+
+DEF_(empty, MEMO(wss, comment, nl))
+DEF_(empties, MANY(empty))
+DEF (header_, TRY(empties, bra, sname, ket, eol))
+DEF_(header, MEMO(header_))
+DEF (entry_, TRY(empties, key, eq, value, eol))
+DEF_(entry, MEMO(entry_))
+DEF_(entries, MANY(entry))
+
+DEF_(tail, MEMO(eof, wss, eol, anys) || EPSILON)
+DEF_(sects, MANY(header, entries))
+DEF_(inifile, SEQ(sects, empties, wss, tail))
+
+bool
+trace(void *ctx, void *env, const char *s, size_t len)
+{
+ const char *nt = ctx;
+ const char *begin = env;
+ size_t pos;
+
+ if (env == NULL)
+ return true;
+
+ pos = s - begin;
+ printf("%4zx: %4zu byte %s\n", pos, len, nt);
+ return true;
+}
+
+
+#include <stdio.h>
+#include <stdlib.h> /* calloc */
+#include <fcntl.h> /* open, lseek */
+#include <sys/mman.h> /* mmap */
+#include <err.h>
+#include <assert.h>
+
+extern char *__progname;
+
+/*
+ * run the 'inifile' parser on a file given on the command line.
+ */
+int
+main(int argc, char *argv[])
+{
+ struct cache cache = {};
+ const char *infile;
+ const char *input;
+ const char *p;
+ int fd;
+ off_t o;
+ size_t sz, pos;
+
+ if (argc < 2) {
+ fprintf(stderr, "usage: %s file\n", __progname);
+ return 3;
+ }
+ infile = argv[1];
+
+ /* mmap input */
+ if ((fd = open(infile, O_RDONLY)) == -1)
+ err(2, "%s", infile);
+ if ((o = lseek(fd, 0, SEEK_END)) == -1)
+ err(2, "lseek");
+ sz = o;
+ input = mmap(NULL, sz, PROT_READ, MAP_PRIVATE, fd, 0);
+ if (input == MAP_FAILED)
+ err(2, "mmap");
+
+ /* result cache */
+ assert(cache.table == NULL);
+ assert(cache.capacity == 0);
+ assert(cache.nused == 0);
+ //cache.capacity = 128;
+ //cache.table = calloc(cache.capacity, sizeof(cache.table[0]));
+ //assert(cache.table != NULL);
+
+ /* run parser */
+ p = inifile((struct stream){input, input + sz}, &cache, (void *)input);
+ if (p == NULL) {
+ fprintf(stderr, "%s: syntax error\n", infile);
+ return 1;
+ }
+ assert(p > input);
+ assert(p <= input + sz);
+ pos = p - input;
+ if (pos < sz) {
+ fprintf(stderr, "%s: syntax error (after pos. %zu/%zu)\n",
+ infile, pos, sz);
+ return 1;
+ }
+ printf("success (consumed %zu/%zu bytes of input)\n", pos, sz);
+
+ return 0;
+}
blob - /dev/null
blob + da50978d4e5b035c90ab5649713da061c53dc29c (mode 644)
--- /dev/null
+++ minip_n.h
+/* miniature recursive-descent parser combinators
+ * with semantic actions
+ *
+ * pesco 2019, 2020
+ */
+#ifndef MINIP_N_H_
+#define MINIP_N_H_
+
+#include <stddef.h> /* NULL */
+
+struct stream {const char *p, *end;};
+struct cache;
+
+/* parser type -- corresponds to grammar nonterminals */
+typedef const char *parser(struct stream, struct cache *, void *);
+
+/* define a parser (nonterminal) */
+#define DEF(NT, EXPR) \
+ static const char * \
+ NT(struct stream inp_, struct cache *cac_, void *env_) \
+ { \
+ const char *p_ = inp_.p; \
+ return (EXPR) ? p_ : NULL; \
+ }
+
+/* primitive expressions */
+#define ANYCHAR (p_ = anychar_(inp_))
+#define CHAR(C) (p_ = char_(inp_, (C)))
+#define RANGE(M, N) (p_ = range_(inp_, (M), (N)))
+#define STRING(S) (p_ = string_(inp_, (S)))
+#define END (p_ = end_(inp_))
+#define EPSILON (p_ = inp_.p)
+#define OMEGA (p_ = inp_.end)
+
+/*
+ * logical operators allowed on expressions:
+ *
+ * || (ordered) choice; first match wins
+ * && restriction (lookahead); rhs is consumed
+ * ! negation; only allowed to the left of &&
+ */
+
+/* variable-length argument lists */
+#define NARGS(T, ...) (sizeof (T[]){__VA_ARGS__} / sizeof(T))
+#define VARGS(T, ...) (T[]){__VA_ARGS__}, NARGS(T, __VA_ARGS__)
+#define PARGS(...) VARGS(parser *, __VA_ARGS__)
+
+/* basic combinators -- take a list of nonterminals as arguments */
+#define CHOICE(...) (p_ = choice_(inp_, cac_, env_, PARGS(__VA_ARGS__)))
+#define SEQ(...) (p_ = seq_(inp_, cac_, env_, PARGS(__VA_ARGS__)))
+
+/* derived combinators; arguments form an implicit SEQ */
+#define MANY(...) (p_ = many_(inp_, cac_, env_, PARGS(__VA_ARGS__)))
+#define MANY1(...) (p_ = many1_(inp_, cac_, env_, PARGS(__VA_ARGS__)))
+#define OPT(...) (SEQ(__VA_ARGS__) || EPSILON)
+
+
+#include <stdbool.h>
+
+/*
+ * semantic actions:
+ *
+ * - attach to an expression with &&: (EXPR) && ACTION(f, ctx)
+ * - receive an arbitrary context parameter via ACTION()'s second argument.
+ * - receive an arbitrary environment pointer from the top-level parser call.
+ * - execute (only) after their subordinate parser(s) matched.
+ * - can also act as validations by returning the intended result.
+ */
+typedef bool action(void *, void *, const char *, size_t);
+#define ACTION(F, C) (F)((C), env_, inp_.p, p_ - inp_.p)
+
+
+/*
+ * to avoid premature execution of actions, TRY(...) can be used in place of
+ * SEQ on any sequence where actions could execute before a later nonterminal
+ * fails. note:
+ *
+ * - single-element sequences do not need TRY.
+ * - multi-argument calls of MANY, MANY1, OPT may need to be replaced.
+ * - OPT(...) can be replaced with TRY(...) || EPSILON.
+ *
+ * in any case, beware of the overhead. since TRY executes the sequence twice,
+ * it can easily lead to a substantial increase in recognition time.
+ */
+#define MATCH(...) seq_(inp_, NULL, NULL, PARGS(__VA_ARGS__))
+#define TRY(...) ((env_ == NULL || MATCH(__VA_ARGS__)) && SEQ(__VA_ARGS__))
+
+
+/*
+ * memoization of parse results (~ packrat parsing)
+ */
+#define MEMO(...) (p_ = memo_(inp_, cac_, env_, __func__, PARGS(__VA_ARGS__)))
+
+/* an entry in the memoization hash table */
+struct memento {
+ const char *position; /* NULL: unused entry */
+ const char *symbol; /* NULL: dummy entry -- may be reused */
+ const char *result;
+};
+
+struct cache {
+ struct memento *table;
+ size_t capacity, nused;
+};
+
+
+#include <assert.h>
+
+static inline const char *
+anychar_(struct stream inp)
+{
+ if (inp.p < inp.end)
+ return inp.p + 1;
+ else
+ return NULL;
+}
+
+static inline const char *
+char_(struct stream inp, char c)
+{
+ if (inp.p < inp.end && *inp.p == c)
+ return inp.p + 1;
+ else
+ return NULL;
+}
+
+static inline const char *
+range_(struct stream inp, unsigned char m, unsigned char n)
+{
+ unsigned char b = *(unsigned char *)inp.p;
+
+ if (inp.p < inp.end && b >= m && b <= n)
+ return inp.p + 1;
+ else
+ return NULL;
+}
+
+static inline const char *
+string_(struct stream inp, const char *s)
+{
+ while (inp.p < inp.end && *s != '\0' && *inp.p == *s) {
+ inp.p++;
+ s++;
+ }
+ if (*s == '\0') /* did we match the full string? */
+ return inp.p;
+ else
+ return NULL;
+}
+
+static inline const char *
+end_(struct stream inp)
+{
+ if (inp.p >= inp.end)
+ return inp.end;
+ else
+ return NULL;
+}
+
+static inline const char *
+seq_(struct stream inp, struct cache *cac, void *env, parser *ps[], size_t n)
+{
+ for (size_t i=0; i<n; i++) {
+ inp.p = ps[i](inp, cac, env);
+ if (inp.p == NULL) /* no match */
+ break;
+ }
+
+ return inp.p;
+}
+
+static inline const char *
+choice_(struct stream inp, struct cache *cac, void *env, parser *ps[], size_t n)
+{
+ const char *p = NULL;
+
+ for (size_t i=0; i<n; i++) {
+ p = ps[i](inp, cac, env);
+ if (p != NULL) /* match */
+ break;
+ }
+
+ return p;
+}
+
+static inline const char *
+many_(struct stream inp, struct cache *cac, void *env, parser *ps[], size_t n)
+{
+ const char *bak;
+
+ for (;;) {
+ bak = inp.p;
+ inp.p = seq_(inp, cac, env, ps, n);
+ if (inp.p == NULL)
+ return bak;
+ assert(inp.p > bak); /* progress */
+ }
+}
+
+static inline const char *
+many1_(struct stream inp, struct cache *cac, void *env, parser *ps[], size_t n)
+{
+ inp.p = seq_(inp, cac, env, ps, n);
+ if (inp.p == NULL)
+ return NULL;
+ return many_(inp, cac, env, ps, n);
+}
+
+#include <string.h> /* strcmp */
+#include <stdlib.h> /* calloc */
+#include <limits.h> /* UINT_MAX */
+#include <stdint.h> /* SIZE_MAX */
+
+#include <stdio.h> // XXX debug
+
+static inline const char *
+memo_(struct stream inp, struct cache *cache, void *env, const char *sym,
+ parser *ps[], size_t n)
+{
+ struct memento *mem;
+ const char *res;
+ size_t i;
+ unsigned int h0 = 5831; /* djbhash; f(h,x) = 33h + x */
+ unsigned int h, s;
+
+ if (cache == NULL)
+ return seq_(inp, cache, env, ps, n);
+
+ /* NB:
+ * the hash math is modulo cache->capacity, but when that is a power of
+ * two, we can run the intermediates modulo UINT_MAX+1.
+ */
+
+ /* hash (pos,sym) */
+ for (i=0; i < sizeof inp.p; i++)
+ h0 = h0 * 33 + ((unsigned char *)&inp.p)[i];
+ for (i=0; sym[i] != '\0'; i++)
+ h0 = h0 * 33 + (unsigned char)sym[i];
+
+ /* look up (pos,sym) in cache and return if found */
+ /* NB: count steps; the table could be filled with dummies */
+ for (i=0, h=h0, s=1; i < cache->capacity; i++, h+=s, s*=5) {
+ mem = cache->table + (h % cache->capacity);
+ if (mem->position == NULL) /* not in table */
+ break;
+ if (mem->symbol == NULL) /* dummy entry */
+ continue;
+ if (mem->position == inp.p && strcmp(mem->symbol, sym) == 0)
+ return mem->result; /* match */
+ }
+
+ /* else run the argument sequence */
+ res = seq_(inp, cache, env, ps, n);
+
+ /* grow hashtable if necessary */
+ if (cache->nused >= cache->capacity * 9 / 10) {
+ struct memento *newp;
+ size_t newcap;
+ size_t j;
+
+ /* realloc XXX nicer err handling */
+ assert(cache->capacity <= UINT_MAX / 2);
+ assert(cache->capacity <= SIZE_MAX / sizeof *cache->table);
+ if (cache->capacity == 0) /* initial alloc */
+ newcap = 4;
+ else
+ newcap = cache->capacity * 2;
+#ifdef PROFGROW
+ size_t count=0, count2=0;
+ fprintf(stderr, "\tgrow at %zu/%zu -> newcap %zu\n",
+ cache->nused, cache->capacity, newcap);
+#endif
+ newp = calloc(newcap, sizeof *cache->table);
+ assert(newp != NULL);
+
+ /* relocate items */
+ for (i=0; i < cache->capacity; i++) {
+ const char *pi = cache->table[i].position;
+ const char *si = cache->table[i].symbol;
+
+ if (pi == NULL || si == NULL) /* unused or dummy */
+ continue;
+
+ /* hash the value (pi,si) in table cell i */
+ h = 5831;
+ for (j=0; j < sizeof pi; j++)
+ h = h * 33 + ((unsigned char *)&pi)[j];
+ for (j=0; si[j] != '\0'; j++)
+ h = h * 33 + (unsigned char)si[j];
+
+ /* find the correct place for it in the new table */
+ for (s=1; ; h+=s, s*=5) {
+ h %= newcap;
+ mem = newp + h;
+ if (mem->position == NULL) /* unused */
+ break;
+ }
+#ifdef PROFGROW
+ if (s != 1)
+ count2++;
+ if (h != i)
+ count++;
+#endif
+
+ /* copy the entry to the new array */
+ *mem = cache->table[i];
+ }
+#ifdef PROFGROW
+ fprintf(stderr, "\tpos. changed on %zu/%zu entries = %.1f%%\n",
+ count, cache->nused, count * 100.0 / cache->nused);
+ fprintf(stderr, "\tnow %zu/%zu are off-base, i.e. %.1f%%\n",
+ count2, cache->nused, count2 * 100.0 / cache->nused);
+#endif
+
+ free(cache->table);
+ cache->table = newp;
+ cache->capacity = newcap;
+ }
+
+ /* find our place in the table */
+ assert(cache->nused < cache->capacity);
+ /* NB: loop will terminate because dummies count as free here */
+ for (h=h0, s=1; ; h+=s, s*=5) {
+ mem = cache->table + (h % cache->capacity);
+ if (mem->position == NULL) /* unused entry */
+ break;
+ if (mem->symbol == NULL) /* dummy entry */
+ break; /* reuse! */
+ }
+
+ /* save the result */
+ assert(mem->position == NULL || mem->symbol == NULL);
+ mem->position = inp.p;
+ mem->symbol = sym;
+ mem->result = res;
+ cache->nused++;
+
+ return res;
+}
+
+#endif /* MINIP_N_H_ */