commit 2bac640ef0b48e6ad7a239c30efd1f40dea37caf
from: Sven M. Hallberg <pesco@khjk.org>
date: Fri Jun 12 16:57:09 2020 UTC

add memoizing version, first prototype

commit - f2567d1f9b90fabbcc831652cc67e265c96659bc
commit + 2bac640ef0b48e6ad7a239c30efd1f40dea37caf
blob - e00f6349d561119e2b05a05cacc61187f326a3ae
blob + f2f7833d69da7a62ef7d5fffc139278b2735576d
--- Makefile
+++ Makefile
@@ -1,11 +1,12 @@
 CFLAGS += -std=c99 -Wall
 
-TARGETS = ini_r ini_a
+TARGETS = ini_r ini_a ini_m
 
 all: $(TARGETS)
 
 ini_r : ini_r.c minip_r.h
 ini_a : ini_a.c minip_a.h
+ini_m : ini_m.c minip_m.h
 
 
 clean:
@@ -14,5 +15,6 @@ clean:
 test: all
 	./ini_r test.ini
 	./ini_a test.ini | grep -v ' [01] byte '
+	./ini_m test.ini | grep -v ' [01] byte '
 
 .PHONY: all test clean
blob - /dev/null
blob + 586cce27ebc480dfc85d92e59a3c47b353cb15ee (mode 644)
--- /dev/null
+++ ini_m.c
@@ -0,0 +1,204 @@
+/*
+ * demo: (simple) ini files
+ *
+ * EBNFish:
+ *
+ *	inifile	= {sect} {empty} {ws} [tail]
+ *	sect	= header {entry}
+ *	tail	= "EOF" {ws} eol {any}
+ *
+ *	header	= {empty} bra sname ket eol
+ *	entry	= {empty} key "=" value eol
+ *	empty	= {ws} [comment] nl
+ *
+ *	(* tokens *)
+ *	eol	= nl | end
+ *	bra	= {ws} "["
+ *	ket	= "]" {ws}
+ *	comment	= ";" {lchar}
+ *	sname	= {lchar - "]"}+
+ *	key	= {lchar - "="}+
+ *	value	= {lchar}+
+ *
+ *	(* character classes *)
+ *	lchar	= ? ASCII codes 9 (HT), 0x20-0x7e (SP, alnum, punct) ?
+ *	ws	= ? ASCII codes 32 (SP), 9 (HT) ?
+ *	nl	= ? ASCII code 10 (LF) ?
+ *	end	= ? end of input ?
+ *	any	= ? any character ?
+ *
+ * converted to be compatible with our combinators:
+ *
+ *	inifile	= sects empties wss tail
+ *	sects	= {header entries}
+ *	tail	= [eof wss eol anys]
+ *
+ *	header	= empties bra sname ket eol
+ *	entries	= {empties key eq value eol}
+ *	empties	= {wss comment nl}
+ *
+ *	eol	= nl | end
+ *	bra	= wss leftbr
+ *	ket	= rightbr wss
+ *	comment	= [semi lchars]
+ *	sname	= {schar}+
+ *	key	= {kchar}+
+ *	value	= {lchar}+
+ *
+ *	eof	= "EOF"
+ *	eq	= "="
+ *	semi	= ";"
+ *	leftbr	= "["
+ *	rightbr	= "]"
+ *
+ *	lchars	= {lchar}
+ *	wss	= {ws}
+ *	anys	= {any}
+ *
+ *	lchar	= ? ASCII codes 9 (HT), 0x20-0x7e (SP, alnum, punct) ?
+ *	schar	= ? ASCII codes 9, 0x20-0x5c, 0x5e-0x7e ?
+ *	kchar	= ? ASCII codes 9, 0x20-0x3c, 0x3e-0x7e ?
+ *	ws	= ? ASCII codes 32 (SP), 9 (HT) ?
+ *	nl	= ? ASCII code 10 (LF) ?
+ *	end	= ? end of input ?
+ *	any	= ? any character ?
+ *
+ * note how right-hand sides have one of the following forms:
+ *
+ *  - a sequence of nonterminals (SEQ)
+ *  - a sequence of nonterminals inside { } (MANY)
+ *  - a sequence of nonterminals inside { }+ (MANY1)
+ *  - a sequence of nonterminals inside [ ] (OPT)
+ *  - a choice of nonterminals (CHOICE)
+ *  - a string of characters (STRING)
+ *  - a single character (CHAR)
+ *
+ * finally, we can realize the "special sequences" (? ... ?) using END,
+ * ANYCHAR, RANGE, and expression choice ||.
+ */
+
+#include "minip_m.h"
+#include <stdio.h>
+
+action trace;
+#define DEF_(NT, EXPR)	DEF(NT, (EXPR) && ACTION(trace, #NT))
+
+DEF_(lchar,	CHAR('\t') || RANGE(0x20, 0x7e))
+DEF_(schar,	CHAR('\t') || RANGE(0x20, 0x5c) || RANGE(0x5e, 0x7e))
+				/* = !CHAR(']') && SEQ(lchar) */
+DEF_(kchar,	!CHAR('=') && SEQ(lchar))
+DEF_(ws,	CHAR('\t') || CHAR(' '))
+DEF_(nl,	CHAR('\n'))
+DEF_(end,	END)
+DEF_(any,	ANYCHAR)
+
+DEF_(lchars,	MANY(lchar))
+DEF_(wss,	MANY(ws))
+DEF_(anys,	MANY(any))	/* = OMEGA */
+
+DEF_(eof,	STRING("EOF"))
+DEF_(eq,	CHAR('='))
+DEF_(semi,	CHAR(';'))
+DEF_(leftbr,	CHAR('['))
+DEF_(rightbr,	CHAR(']'))
+
+DEF_(eol,	CHOICE(nl, end))
+DEF_(bra,	MEMO(wss, leftbr))
+DEF_(ket,	SEQ(rightbr, wss))
+DEF_(comment,	OPT(semi, lchars))
+DEF_(sname,	MANY1(schar))
+DEF_(key,	MANY1(kchar))
+DEF_(value,	MANY1(lchar))
+
+DEF (empty_,	MEMO(wss, comment, nl))
+DEF_(empty,	TRY(empty_))
+DEF_(empties,	MANY(empty))
+DEF_(header,	MEMO(empties, bra, sname, ket, eol))
+DEF_(entry,	MEMO(empties, key, eq, value, eol))
+DEF_(entries,	MANY(entry))
+
+DEF_(tail,	MEMO(eof, wss, eol, anys) || EPSILON)
+DEF_(sects,	MANY(header, entries))
+DEF_(inifile,	SEQ(sects, empties, wss, tail))
+
+bool
+trace(void *ctx, void *env, const char *s, size_t len)
+{
+	const char *nt = ctx;
+	const char *begin = env;
+	size_t pos;
+
+	if (env == NULL)
+		return true;
+
+	pos = s - begin;
+	printf("%4zx: %4zu byte %s\n", pos, len, nt);
+	return true;
+}
+
+
+#include <stdio.h>
+#include <stdlib.h>	/* calloc */
+#include <fcntl.h>	/* open, lseek */
+#include <sys/mman.h>	/* mmap */
+#include <err.h>
+#include <assert.h>
+
+extern char *__progname;
+
+/*
+ * run the 'inifile' parser on a file given on the command line.
+ */
+int
+main(int argc, char *argv[])
+{
+	struct cache cache = {};
+	const char *infile;
+	const char *input;
+	const char *p;
+	int fd;
+	off_t o;
+	size_t sz, pos;
+
+	if (argc < 2) {
+		fprintf(stderr, "usage: %s file\n", __progname);
+		return 3;
+	}
+	infile = argv[1];
+
+	/* mmap input */
+	if ((fd = open(infile, O_RDONLY)) == -1)
+		err(2, "%s", infile);
+	if ((o = lseek(fd, 0, SEEK_END)) == -1)
+		err(2, "lseek");
+	sz = o;
+	input = mmap(NULL, sz, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (input == MAP_FAILED)
+		err(2, "mmap");
+
+	/* result cache */
+	assert(cache.table == NULL);
+	assert(cache.capacity == 0);
+	assert(cache.nused == 0);
+	//cache.capacity = 128;
+	//cache.table = calloc(cache.capacity, sizeof(cache.table[0]));
+	//assert(cache.table != NULL);
+
+	/* run parser */
+	p = inifile((struct stream){input, input + sz}, &cache, (void *)input);
+	if (p == NULL) {
+		fprintf(stderr, "%s: syntax error\n", infile);
+		return 1;
+	}
+	assert(p > input);
+	assert(p <= input + sz);
+	pos = p - input;
+	if (pos < sz) {
+		fprintf(stderr, "%s: syntax error (after pos. %zu/%zu)\n",
+		    infile, pos, sz);
+		return 1;
+	}
+	printf("success (consumed %zu/%zu bytes of input)\n", pos, sz);
+
+	return 0;
+}
blob - /dev/null
blob + 0c3683eb3845e8842f541548139e119e4fa502f6 (mode 644)
--- /dev/null
+++ minip_m.h
@@ -0,0 +1,346 @@
+/* miniature recursive-descent parser combinators
+ * with semantic actions
+ *
+ * pesco 2019, 2020
+ */
+#ifndef MINIP_M_H_
+#define MINIP_M_H_
+
+#include <stddef.h>	/* NULL */
+
+struct stream {const char *p, *end;};
+struct cache;
+
+/* parser type -- corresponds to grammar nonterminals */
+typedef const char *parser(struct stream, struct cache *, void *);
+
+/* define a parser (nonterminal) */
+#define DEF(NT, EXPR)						\
+	static const char *					\
+	NT(struct stream inp_, struct cache *cac_, void *env_)	\
+	{							\
+		const char *p_ = inp_.p;			\
+		return (EXPR) ? p_ : NULL;			\
+	}
+
+/* primitive expressions */
+#define ANYCHAR		(p_ = anychar_(inp_))
+#define CHAR(C)		(p_ = char_(inp_, (C)))
+#define RANGE(M, N)	(p_ = range_(inp_, (M), (N)))
+#define STRING(S)	(p_ = string_(inp_, (S)))
+#define END		(p_ = end_(inp_))
+#define EPSILON		(p_ = inp_.p)
+#define OMEGA		(p_ = inp_.end)
+
+/*
+ * logical operators allowed on expressions:
+ *
+ *  ||	(ordered) choice; first match wins
+ *  &&	restriction (lookahead); rhs is consumed
+ *  !	negation; only allowed to the left of &&
+ */
+
+/* variable-length argument lists */
+#define NARGS(T, ...)	(sizeof (T[]){__VA_ARGS__} / sizeof(T))
+#define VARGS(T, ...)	(T[]){__VA_ARGS__}, NARGS(T, __VA_ARGS__)
+#define PARGS(...)	VARGS(parser *, __VA_ARGS__)
+
+/* basic combinators -- take a list of nonterminals as arguments */
+#define CHOICE(...)	(p_ = choice_(inp_, cac_, env_, PARGS(__VA_ARGS__)))
+#define SEQ(...)	(p_ = seq_(inp_, cac_, env_, PARGS(__VA_ARGS__)))
+
+/* derived combinators; arguments form an implicit SEQ */
+#define MANY(...)	(p_ = many_(inp_, cac_, env_, PARGS(__VA_ARGS__)))
+#define MANY1(...)	(p_ = many1_(inp_, cac_, env_, PARGS(__VA_ARGS__)))
+#define OPT(...)	(SEQ(__VA_ARGS__) || EPSILON)
+
+
+#include <stdbool.h>
+
+/*
+ * semantic actions:
+ *
+ * - attach to an expression with &&: (EXPR) && ACTION(f, ctx)
+ * - receive an arbitrary context parameter via ACTION()'s second argument.
+ * - receive an arbitrary environment pointer from the top-level parser call.
+ * - execute (only) after their subordinate parser(s) matched.
+ * - can also act as validations by returning the intended result.
+ */
+typedef bool action(void *, void *, const char *, size_t);
+#define ACTION(F, C)	(F)((C), env_, inp_.p, p_ - inp_.p)
+
+
+/*
+ * to avoid premature execution of actions, TRY(...) can be used in place of
+ * SEQ on any sequence where actions could execute before a later nonterminal
+ * fails. note:
+ *
+ * - single-element sequences do not need TRY.
+ * - multi-argument calls of MANY, MANY1, OPT may need to be replaced.
+ * - OPT(...) can be replaced with TRY(...) || EPSILON.
+ *
+ * in any case, beware of the overhead. since TRY executes the sequence twice,
+ * it can easily lead to a substantial increase in recognition time.
+ */
+#define MATCH(...)	seq_(inp_, NULL, NULL, PARGS(__VA_ARGS__))
+#define TRY(...)	((env_ == NULL || MATCH(__VA_ARGS__)) && SEQ(__VA_ARGS__))
+
+
+/*
+ * memoization of parse results (~ packrat parsing)
+ */
+#define MEMO(...)	(p_ = memo_(inp_, cac_, env_, __func__, PARGS(__VA_ARGS__)))
+
+/* an entry in the memoization hash table */
+struct memento {
+	const char *position;	/* NULL: unused entry */
+	const char *symbol;	/* NULL: dummy entry -- may be reused */
+	const char *result;
+};
+
+struct cache {
+	struct memento *table;
+	size_t capacity, nused;
+};
+
+
+#include <assert.h>
+
+static inline const char *
+anychar_(struct stream inp)
+{
+	if (inp.p < inp.end)
+		return inp.p + 1;
+	else
+		return NULL;
+}
+
+static inline const char *
+char_(struct stream inp, char c)
+{
+	if (inp.p < inp.end && *inp.p == c)
+		return inp.p + 1;
+	else
+		return NULL;
+}
+
+static inline const char *
+range_(struct stream inp, unsigned char m, unsigned char n)
+{
+	unsigned char b = *(unsigned char *)inp.p;
+
+	if (inp.p < inp.end && b >= m && b <= n)
+		return inp.p + 1;
+	else
+		return NULL;
+}
+
+static inline const char *
+string_(struct stream inp, const char *s)
+{
+	while (inp.p < inp.end && *s != '\0' && *inp.p == *s) {
+		inp.p++;
+		s++;
+	}
+	if (*s == '\0')		/* did we match the full string? */
+		return inp.p;
+	else
+		return NULL;
+}
+
+static inline const char *
+end_(struct stream inp)
+{
+	if (inp.p >= inp.end)
+		return inp.end;
+	else
+		return NULL;
+}
+
+static inline const char *
+seq_(struct stream inp, struct cache *cac, void *env, parser *ps[], size_t n)
+{
+	for (size_t i=0; i<n; i++) {
+		inp.p = ps[i](inp, cac, env);
+		if (inp.p == NULL)	/* no match */
+			break;
+	}
+
+	return inp.p;
+}
+
+static inline const char *
+choice_(struct stream inp, struct cache *cac, void *env, parser *ps[], size_t n)
+{
+	const char *p = NULL;
+
+	for (size_t i=0; i<n; i++) {
+		p = ps[i](inp, cac, env);
+		if (p != NULL)	/* match */
+			break;
+	}
+
+	return p;
+}
+
+static inline const char *
+many_(struct stream inp, struct cache *cac, void *env, parser *ps[], size_t n)
+{
+	const char *bak;
+
+	for (;;) {
+		bak = inp.p;
+		inp.p = seq_(inp, cac, env, ps, n);
+		if (inp.p == NULL)
+			return bak;
+		assert(inp.p > bak);	/* progress */
+	}
+}
+
+static inline const char *
+many1_(struct stream inp, struct cache *cac, void *env, parser *ps[], size_t n)
+{
+	inp.p = seq_(inp, cac, env, ps, n);
+	if (inp.p == NULL)
+		return NULL;
+	return many_(inp, cac, env, ps, n);
+}
+
+#include <string.h>	/* strcmp */
+#include <stdlib.h>	/* realloc */
+#include <limits.h>	/* UINT_MAX */
+#include <stdint.h>	/* SIZE_MAX */
+
+#include <stdio.h>	// XXX debug
+
+static inline const char *
+memo_(struct stream inp, struct cache *cache, void *env, const char *sym,
+    parser *ps[], size_t n)
+{
+	struct memento *mem;
+	const char *res;
+	size_t i;
+	unsigned int h0 = 5831;	/* djbhash; f(h,x) = 33h + x */
+	unsigned int h, s;
+
+	if (cache == NULL)
+		return seq_(inp, cache, env, ps, n);
+
+	/* NB:
+	 * the hash math is modulo cache->capacity, but when that is a power of
+	 * two, we can run the intermediates modulo UINT_MAX+1.
+	 */
+
+	/* hash (inp,sym) */
+	for (i=0; i < sizeof inp; i++)
+		h0 = h0 * 33 + ((unsigned char *)&inp.p)[i];
+	for (i=0; sym[i] != '\0'; i++)
+		h0 = h0 * 33 + (unsigned char)sym[i];
+
+	/* look up (inp,sym) in cache and return if found */
+	/* NB: count steps; the table could be filled with dummies */
+	for (i=0, h=h0, s=1; i < cache->capacity; i++, h+=s, s*=5) {
+		mem = cache->table + (h % cache->capacity);
+		if (mem->position == NULL)	/* not in table */
+			break;
+		if (mem->symbol == NULL)	/* dummy entry */
+			continue;
+		if (mem->position == inp.p && strcmp(mem->symbol, sym) == 0)
+			return mem->result;	/* match */
+	}
+
+	/* else run the argument sequence */
+	res = seq_(inp, cache, env, ps, n);
+
+	/* grow hashtable if necessary */
+	if (cache->nused >= cache->capacity * 9 / 10) {
+		void *newp;
+		size_t newcap;
+		size_t j;
+
+		/* realloc  XXX nicer err handling */
+		assert(cache->capacity <= UINT_MAX / 2);
+		assert(cache->capacity <= SIZE_MAX / sizeof *cache->table);
+		if (cache->capacity == 0)	/* initial alloc */
+			newcap = 4;
+		else
+			newcap = cache->capacity * 2;
+		fprintf(stderr, "XXX grow at %zu/%zu -> newcap %zu\n",
+		    cache->nused, cache->capacity, newcap);
+		newp = realloc(cache->table, newcap * sizeof *cache->table);
+		memset(newp + cache->capacity, 0,
+		    (newcap - cache->capacity) * sizeof *newp);
+		assert(newp != NULL);
+		if (newp != cache->table)
+			fprintf(stderr, "XXX TABLE MOVE!\n");
+		cache->table = newp;
+
+		/* relocate items as necessary */
+		size_t count=0, count2=0;	// XXX
+		for (i=0; i < cache->capacity; i++) {
+			const char *pi = cache->table[i].position;
+			const char *si = cache->table[i].symbol;
+
+			if (pi == NULL || si == NULL)	/* unused or dummy */
+				continue;
+
+			/* hash the value (pi,si) in table cell i */
+			h = 5831;
+			for (j=0; j < sizeof pi; j++)
+				h = h * 33 + ((unsigned char *)&pi)[j];
+			for (j=0; si[j] != '\0'; j++)
+				h = h * 33 + (unsigned char)si[j];
+
+			/* find the correct place for it in the new table */
+			for (s=1; ; h+=s, s*=5) {
+				h %= newcap;
+				mem = cache->table + h;
+				if (h == i)			/* same spot */
+					break;
+				if (mem->position == NULL)	/* unused */
+					break;
+				//if (mem->symbol == NULL)	/* dummy */
+				//	break;			/* reuse! */
+			}
+			if (s != 1)
+				count2++;
+
+			/* move the entry if needed, leaving a dummy behind */
+			if (h != i) {
+				fprintf(stderr, "XXX relocate %zu -> %u %s\n",
+				    i, h, h < cache->capacity ? "(!)" : "");
+				count++; // XXX
+				*mem = cache->table[i];
+				cache->table[i].symbol = NULL;
+			}
+		}
+		fprintf(stderr, "XXX had to move %zu/%zu entries, i.e. %.1f%%\n",
+		    count, cache->nused, count * 100.0 / cache->nused);
+		fprintf(stderr, "XXX now %zu/%zu are off-base, i.e. %.1f%%\n",
+		    count2, cache->nused, count2 * 100.0 / cache->nused);
+
+		cache->capacity = newcap;
+	}
+
+	/* find our place in the table */
+	assert(cache->nused < cache->capacity);
+	/* NB: loop will terminate because dummies count as free here */
+	for (h=h0, s=1; ; h+=s, s*=5) {
+		mem = cache->table + (h % cache->capacity);
+		if (mem->position == NULL)	/* unused entry */
+			break;
+		if (mem->symbol == NULL)	/* dummy entry */
+			break;			/* reuse! */
+	}
+
+	/* save the result */
+	assert(mem->position == NULL || mem->symbol == NULL);
+	mem->position = inp.p;
+	mem->symbol = sym;
+	mem->result = res;
+	cache->nused++;
+
+	return res;
+}
+
+#endif /* MINIP_M_H_ */