KHJK gotweb

Commit Diff

Commit:: 9d2eb8f8b610e4c920b0c463618df5452e4641c6
From:: Sven M. Hallberg <pesco@khjk.org>
Date:: Mon Dec 5 04:19:41 2022 UTC
Message:: add minip_b, ini_b (boolean return value variant)
Actions:: Patch | Tree
commit - e46cabf4dabf47bee184092e4cdb372e9076f3ca
commit + 9d2eb8f8b610e4c920b0c463618df5452e4641c6
blob - bcb4b3f4cd14e31b2406f1b529508c88664fbbbc
blob + 0a017a655ec96e3cbcddaa9bd43b6fab9b3eaee6
--- Makefile
+++ Makefile
@@ -1,6 +1,6 @@
 CFLAGS += -std=c99 -Wall
 
-TARGETS = ini_r ini_a ini_m ini_n ini_j
+TARGETS = ini_r ini_a ini_m ini_n ini_j ini_b
 
 all: $(TARGETS)
 
@@ -9,6 +9,7 @@ ini_a : ini_a.c minip_a.h
 ini_m : ini_m.c minip_m.h
 ini_n : ini_n.c minip_n.h
 ini_j : ini_j.c minip_n.h
+ini_b : ini_b.c minip_b.h
 
 
 clean:
@@ -20,5 +21,6 @@ test: all
 	./ini_m test.ini | grep -v ' [01] byte '
 	./ini_n test.ini | grep -v ' [01] byte '
 	./ini_j test.ini 2>/dev/null
+	./ini_b test.ini 2>/dev/null
 
 .PHONY: all test clean
blob - /dev/null
blob + 5b5687f09b7c9624703a8562fb92434a01c6059f (mode 644)
--- /dev/null
+++ ini_b.c
@@ -0,0 +1,262 @@
+/*
+ * demo: (simple) ini files, converted to json
+ *
+ * EBNFish:
+ *
+ *	inifile	= {sect} {empty} {ws} [tail]
+ *	sect	= header {entry}
+ *	tail	= "EOF" {ws} eol {any}
+ *
+ *	header	= {empty} bra sname ket eol
+ *	entry	= {empty} key "=" value eol
+ *	empty	= {ws} [comment] nl
+ *
+ *	(* tokens *)
+ *	eol	= nl | end
+ *	bra	= {ws} "["
+ *	ket	= "]" {ws}
+ *	comment	= ";" {lchar}
+ *	sname	= {lchar - "]"}+
+ *	key	= {lchar - "="}+
+ *	value	= {lchar}+
+ *
+ *	(* character classes *)
+ *	lchar	= ? ASCII codes 9 (HT), 0x20-0x7e (SP, alnum, punct) ?
+ *	ws	= ? ASCII codes 32 (SP), 9 (HT) ?
+ *	nl	= ? ASCII code 10 (LF) ?
+ *	end	= ? end of input ?
+ *	any	= ? any character ?
+ *
+ * converted to be compatible with our combinators:
+ *
+ *	inifile	= sects empties wss tail
+ *	sects	= {header entries}
+ *	tail	= [eof wss eol anys]
+ *
+ *	header	= empties bra sname ket eol
+ *	entries	= {empties key eq value eol}
+ *	empties	= {wss comment nl}
+ *
+ *	eol	= nl | end
+ *	bra	= wss leftbr
+ *	ket	= rightbr wss
+ *	comment	= [semi lchars]
+ *	sname	= {schar}+
+ *	key	= {kchar}+
+ *	value	= {lchar}+
+ *
+ *	eof	= "EOF"
+ *	eq	= "="
+ *	semi	= ";"
+ *	leftbr	= "["
+ *	rightbr	= "]"
+ *
+ *	lchars	= {lchar}
+ *	wss	= {ws}
+ *	anys	= {any}
+ *
+ *	lchar	= ? ASCII codes 9 (HT), 0x20-0x7e (SP, alnum, punct) ?
+ *	schar	= ? ASCII codes 9, 0x20-0x5c, 0x5e-0x7e ?
+ *	kchar	= ? ASCII codes 9, 0x20-0x3c, 0x3e-0x7e ?
+ *	ws	= ? ASCII codes 32 (SP), 9 (HT) ?
+ *	nl	= ? ASCII code 10 (LF) ?
+ *	end	= ? end of input ?
+ *	any	= ? any character ?
+ *
+ * note how right-hand sides have one of the following forms:
+ *
+ *  - a sequence of nonterminals (SEQ)
+ *  - a sequence of nonterminals inside { } (MANY)
+ *  - a sequence of nonterminals inside { }+ (MANY1)
+ *  - a sequence of nonterminals inside [ ] (OPT)
+ *  - a choice of nonterminals (CHOICE)
+ *  - a string of characters (STRING)
+ *  - a single character (CHAR)
+ *
+ * finally, we can realize the "special sequences" (? ... ?) using END,
+ * ANYCHAR, RANGE, and expression choice ||.
+ */
+
+#include "minip_b.h"
+#include <stdio.h>
+
+action trace, print, print_string;
+#define DEF_(NT, EXPR)	DEF(NT, (EXPR) && ACTION(trace, #NT))
+
+DEF (lchar,	CHAR('\t') || RANGE(0x20, 0x7e))
+DEF (schar,	CHAR('\t') || RANGE(0x20, 0x5c) || RANGE(0x5e, 0x7e))
+				/* = !CHAR(']') && SEQ(lchar) */
+DEF (kchar,	CHAR('\t') || RANGE(0x20, 0x3c) || RANGE(0x3e, 0x7e))
+				/* = !CHAR('=') && SEQ(lchar) */
+DEF (ws,	CHAR('\t') || CHAR(' '))
+DEF_(nl,	CHAR('\n'))
+DEF_(end,	END)
+DEF_(any,	ANYCHAR)
+
+DEF_(lchars,	MANY(lchar))
+DEF_(wss,	MANY(ws))
+DEF_(anys,	MANY(any))	/* = OMEGA */
+
+DEF_(eof,	STRING("EOF"))
+DEF_(eq,	CHAR('=')
+		    && ACTION(print, ": "))
+DEF_(semi,	CHAR(';'))
+DEF_(leftbr,	CHAR('['))
+DEF_(rightbr,	CHAR(']'))
+
+DEF_(eol,	CHOICE(nl, end))
+DEF_(bra,	SEQ(wss, leftbr))
+DEF_(ket,	SEQ(rightbr, wss))
+DEF_(comment,	SEQ(semi, lchars))
+DEF (comment_o,	OPT(comment))
+DEF_(sname,	MANY1(schar)
+		    && ACTION(print_string, "  "))
+DEF_(key,	MANY1(kchar)
+		    && ACTION(print_string, "    "))
+DEF_(value,	MANY1(lchar)
+		    && ACTION(print_string, ""))
+
+DEF_(out_comma,	ACTION(print, ",\n"))
+
+DEF_(empty,	SEQ(wss, comment_o, nl))
+DEF_(empties,	MANY(empty))
+DEF_(header,	SEQ(empties, bra, sname, ket, eol)
+		    && ACTION(print, ": {\n"))
+DEF_(entry,	SEQ(empties, key, eq, value, eol))
+DEF (entry_x,	SEQ(out_comma, entry))
+DEF (entries_x,	MANY(entry_x))
+DEF_(entries,	SEQ(entry, entries_x)
+		    && ACTION(print, "\n  }"))
+
+DEF_(tail,	SEQ(eof, wss, eol, anys))
+DEF (tail_o,	OPT(tail))
+DEF_(sect,	SEQ(header, entries))
+// (A)
+DEF (sect_x,	SEQ(out_comma, sect))
+DEF (sects_x,	MANY(sect_x))
+DEF (sects1,	SEQ(sect, sects_x))
+// (B)
+//static parser sects2;
+//DEF (sect_x,	SEQ(out_comma, sect))
+//DEF (sects3,	SEQ(sect_x, sects2))
+//DEF (sects2,	OPT(sects3))
+//DEF (sects1,	SEQ(sect, sects2))
+// (C)
+//static parser sects1;
+//DEF (sects2,	SEQ(sect, out_comma, sects1))	/* if only tail-recursive */
+//DEF (sects1,	CHOICE(sects2, sect))
+DEF_(sects,	ACTION(print, "{\n") &&
+		    OPT(sects1)
+		    && ACTION(print, "\n}\n"))
+DEF_(inifile,	TRY(sects, empties, wss, tail_o))
+
+bool
+trace(void *ctx, void *env, const char *s, size_t len)
+{
+	const char *nt = ctx;
+	const char *begin = env;
+	size_t pos;
+
+	pos = s - begin;
+	fprintf(stderr, "%4zx: %4zu byte %s\n", pos, len, nt);
+	return true;
+}
+
+bool
+print(void *ctx, void *env, const char *s, size_t len)
+{
+	fputs(ctx, stdout);
+	return true;
+}
+
+bool
+print_string(void *ctx, void *env, const char *s, size_t len)
+{
+	size_t i;
+	unsigned char c;
+
+	fputs(ctx, stdout);	/* prefix string */
+
+	putchar('"');
+	for (i = 0; i < len; i++) {
+		c = (unsigned char)s[i];
+		assert(c <= 127);			// XXX only ASCII
+		if (c == '"')
+			fputs("\\\"", stdout);
+		else if (c == '\\')
+			fputs("\\\\", stdout);
+		else if (c == '\t')
+			fputs("\\t", stdout);
+		else if (c <= 0x1F || c == 0x7F)	/* i.e. iscntrl */
+			printf("\\u00%.2hhx", c);
+		else
+			putchar(c);
+	}
+	putchar('"');
+
+	return true;
+}
+
+
+#include <stdio.h>
+#include <stdlib.h>	/* calloc */
+#include <fcntl.h>	/* open, lseek */
+#include <sys/mman.h>	/* mmap */
+#include <err.h>
+#include <assert.h>
+
+extern char *__progname;
+
+/*
+ * run the 'inifile' parser on a file given on the command line.
+ */
+int
+main(int argc, char *argv[])
+{
+	struct cache cache = {};
+	const char *infile;
+	const char *input;
+	int fd;
+	off_t o;
+	size_t sz, pos;
+
+	if (argc < 2) {
+		fprintf(stderr, "usage: %s file\n", __progname);
+		return 3;
+	}
+	infile = argv[1];
+
+	/* mmap input */
+	if ((fd = open(infile, O_RDONLY)) == -1)
+		err(2, "%s", infile);
+	if ((o = lseek(fd, 0, SEEK_END)) == -1)
+		err(2, "lseek");
+	sz = o;
+	input = mmap(NULL, sz, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (input == MAP_FAILED)
+		err(2, "mmap");
+
+	/* result cache */
+	assert(cache.table == NULL);
+	assert(cache.capacity == 0);
+	assert(cache.nused == 0);
+
+	/* run parser */
+	struct stream inp = {input, input + sz};
+	if (!inifile(&inp, &cache, (void *)input)) {
+		assert(inp.p == input);
+		fprintf(stderr, "%s: syntax error\n", infile);
+		return 1;
+	}
+	assert(inp.p > input);
+	assert(inp.p <= input + sz);
+	pos = inp.p - input;
+	if (pos < sz) {
+		fprintf(stderr, "%s: syntax error (after pos. %zu/%zu)\n",
+		    infile, pos, sz);
+		return 1;
+	}
+	fprintf(stderr, "success (consumed %zu/%zu bytes of input)\n", pos, sz);
+
+	return 0;
+}
blob - /dev/null
blob + ff87dfcd6b28f1ac929eb32f8b579236ca0125de (mode 644)
--- /dev/null
+++ minip_b.h
@@ -0,0 +1,395 @@
+/* miniature recursive-descent parser combinators
+ * with semantic actions and omnibus negative memoization
+ * employing boolean return type and mutable input pointer
+ *
+ * pesco 2019, 2020
+ */
+#ifndef MINIP_N_H_
+#define MINIP_N_H_
+
+#include <stddef.h>	/* NULL */
+#include <stdbool.h>
+
+struct stream {const char *p, *end;};
+struct cache;
+
+/* parser type -- corresponds to grammar nonterminals */
+typedef bool parser(struct stream *, struct cache *, void *);
+
+/* define a parser (nonterminal) */
+#define DEF(NT, EXPR)							\
+	static bool							\
+	NT(struct stream *inp_, struct cache *cac_, void *env_)		\
+	{								\
+		const char *pos_ = inp_->p;				\
+		struct memento *mem_;					\
+									\
+		/* look up (pos,sym) in cache and return if found */	\
+		if ((mem_ = lookup(cac_, pos_, __func__)) != NULL) {	\
+			if (mem_->value == NULL)			\
+				return false;				\
+			inp_->p = mem_->value;				\
+			return true;					\
+		}							\
+									\
+		/* else run the expression and return if it matched */	\
+		if (EXPR)						\
+			return true;					\
+									\
+		/* else, return and memoize a negative result */	\
+		store(cac_, pos_, __func__, NULL);			\
+		return false;						\
+	}
+
+/* primitive expressions */
+#define ANYCHAR		anychar_(inp_)
+#define CHAR(C)		char_(inp_, (C))
+#define RANGE(M, N)	range_(inp_, (M), (N))
+#define STRING(S)	string_(inp_, (S))
+#define END		end_(inp_)
+#define EPSILON		true
+#define OMEGA		(inp_->p = inp_->end, true)
+
+/*
+ * logical operators allowed on expressions *sans* actions:
+ *
+ *  &&	restriction (lookahead); rhs is consumed
+ *  ||	ordered choice
+ *  !	negation; only allowed to the left of && (to avoid consumption)
+ */
+
+/* variable-length argument lists */
+#define NARGS(T, ...)	(sizeof (T[]){__VA_ARGS__} / sizeof(T))
+#define VARGS(T, ...)	(T[]){__VA_ARGS__}, NARGS(T, __VA_ARGS__)
+#define PARGS(...)	VARGS(parser *, __VA_ARGS__)
+
+/* basic combinators -- take a list of nonterminals as arguments */
+#define CHOICE(...)	choice_(inp_, cac_, env_, PARGS(__VA_ARGS__))
+#define SEQ(...)	seq_(inp_, cac_, env_, PARGS(__VA_ARGS__))
+
+/* secondary combinators -- could be derived from the basics */
+#define MANY(X)		many_(inp_, cac_, env_, X)
+#define MANY1(X)	many1_(inp_, cac_, env_, X)
+#define OPT(X)		((X)(inp_, cac_, env_), true)
+
+/* NB:
+ * because memoization only applies to nonterminals and the above combinators
+ * (MANY, MANY1, OPT) form implicit choices with them, we can't use variable
+ * argument lists that form an implicit SEQ here. otherwise, since those
+ * anonymous argument sequences would not be memoized (memoization only happens
+ * on actual nonterminals), actions could be executed in them, even if the
+ * sequence failed to match in a later element.
+ */
+
+/*
+ * semantic actions:
+ *
+ * - attach to an expression with &&: EXPR && ACTION(f, ctx)
+ * - receive an arbitrary context parameter via ACTION()'s second argument.
+ * - receive an arbitrary environment pointer from the top-level parser call.
+ * - execute (only) after their subordinate parser(s) matched.
+ * - can also act as validations by returning the intended result.
+ *
+ * Thanks to prerecognition and negative memoization, it is also possible to
+ * attach actions before the expression. E.g.:
+ *
+ *     ACTION(print, "[") && EXPR && ACTION(print, "]")
+ */
+typedef bool action(void *, void *, const char *, size_t);
+#define ACTION(F, C)	(env_ == NULL || (F)((C), env_, pos_, inp_->p - pos_))
+
+
+/*
+ * to avoid premature execution of actions, TRY(...) can be used in place of
+ * SEQ on any sequence where actions could execute before a later nonterminal
+ * fails. note:
+ *
+ * - single-element sequences do not need TRY.
+ * - multi-argument calls of MANY, MANY1, OPT may need to be replaced.
+ * - OPT(...) can be replaced with TRY(...) || EPSILON.
+ *
+ * in any case, beware of the overhead. since TRY executes the sequence twice,
+ * it can easily lead to a substantial increase in recognition time.
+ */
+#define MATCH(...)	(seq_(inp_, cac_, NULL, PARGS(__VA_ARGS__)) && (inp_->p = pos_))
+#define TRY(...)	((env_ == NULL || MATCH(__VA_ARGS__)) && SEQ(__VA_ARGS__))
+
+
+/*
+ * general memoization of parse results (~ packrat parsing)
+ *
+ * handles the positive cases, but only when actions are enabled, i.e. env
+ * pointer is non-null. otherwise, actions would be lost - cf. TRY().
+ *
+ * NB: negative results are always cached - see DEF().
+ *
+ * usage: DEF(nt, EXPR && MEMO)
+ */
+#define MEMO	(env_ == NULL || store(cac_, pos_, __func__, inp_->p))
+
+/* an entry in the memoization hash table */
+struct memento {
+	const void *key;	/* NULL: unused entry */
+	const void *key2;	/* NULL: dummy entry -- may be reused */
+	const void *value;
+};
+
+struct cache {
+	struct memento *table;
+	size_t capacity, nused;
+};
+
+
+#include <assert.h>
+
+static inline bool
+anychar_(struct stream *inp)
+{
+	if (inp->p < inp->end)
+		return (inp->p++, true);
+	else
+		return false;
+}
+
+static inline bool
+char_(struct stream *inp, char c)
+{
+	if (inp->p < inp->end && *inp->p == c)
+		return (inp->p++, true);
+	else
+		return false;
+}
+
+static inline bool
+range_(struct stream *inp, unsigned char m, unsigned char n)
+{
+	unsigned char b = *(unsigned char *)inp->p;
+
+	if (inp->p < inp->end && b >= m && b <= n)
+		return (inp->p++, true);
+	else
+		return false;
+}
+
+static inline bool
+string_(struct stream *inp, const char *s)
+{
+	const char *p = inp->p;
+
+	while (p < inp->end && *s != '\0' && *p == *s) {
+		p++;
+		s++;
+	}
+	if (*s == '\0')		/* did we match the full string? */
+		return (inp->p = p, true);
+	else
+		return false;	/* inp->p remains untouched */
+}
+
+static inline bool
+end_(struct stream *inp)
+{
+	if (inp->p >= inp->end)
+		return (inp->p = inp->end, true);
+	else
+		return false;
+}
+
+static inline bool
+seq_(struct stream *inp, struct cache *cac, void *env, parser *ps[], size_t n)
+{
+	const char *bak = inp->p;
+
+	for (size_t i=0; i<n; i++) {
+		if (!ps[i](inp, cac, env))	/* no match */
+			return (inp->p = bak, false);
+	}
+
+	return true;
+}
+
+static inline bool
+choice_(struct stream *inp, struct cache *cac, void *env, parser *ps[], size_t n)
+{
+	const char *bak = inp->p;
+
+	for (size_t i=0; i<n; i++) {
+		if (ps[i](inp, cac, env))	/* match */
+			return true;
+		assert(inp->p == bak);
+	}
+
+	return false;
+}
+
+static inline bool
+many_(struct stream *inp, struct cache *cac, void *env, parser *pa)
+{
+	const char *bak;
+
+	for (;;) {
+		bak = inp->p;
+		if (!pa(inp, cac, env))
+			return (inp->p = bak, true);
+		assert(inp->p > bak);	/* progress */
+	}
+	/* not reached */
+}
+
+static inline bool
+many1_(struct stream *inp, struct cache *cac, void *env, parser *pa)
+{
+	const char *bak = inp->p;
+
+	if (!pa(inp, cac, env))
+		return (inp->p = bak, false);
+	return many_(inp, cac, env, pa);
+}
+
+
+#include <stdlib.h>	/* calloc */
+#include <limits.h>	/* UINT_MAX */
+#include <stdint.h>	/* SIZE_MAX */
+
+#include <stdio.h>	// XXX debug
+
+static inline unsigned int
+hash(const void *key, const void *key2)
+{
+	unsigned int h = 5831;	/* djbhash; f(h,x) = 33h + x */
+	size_t i;
+
+	for (i=0; i < sizeof key; i++)
+		h = h * 33 + ((unsigned char *)&key)[i];
+	for (i=0; i < sizeof key2; i++)
+		h = h * 33 + ((unsigned char *)&key2)[i];
+
+	return h;
+}
+
+/* NB:
+ * the hash math is modulo cache->capacity, but when that is a power of
+ * two, we can run the intermediates modulo UINT_MAX+1.
+ */
+
+static inline struct memento *
+lookup(const struct cache *cache, const void *key, const void *key2)
+{
+	struct memento *mem;
+	unsigned int h, s;
+	size_t i;
+
+	if (cache == NULL)
+		return NULL;
+
+	h = hash(key, key2);
+
+	/* NB: count steps, since the table could be filled with dummies */
+	for (i=0, s=1; i < cache->capacity; i++, h+=s, s*=5) {
+		mem = cache->table + (h % cache->capacity);
+		if (mem->key == NULL)	/* not in table */
+			break;
+		if (mem->key2 == NULL)	/* dummy entry */
+			continue;
+		if (mem->key == key && mem->key2 == key2)
+			return mem;		/* match */
+	}
+
+	return NULL;
+}
+
+static inline void
+grow(struct cache *cache)
+{
+	struct memento *mem, *newp;
+	unsigned int h, s;
+	size_t i, newcap;
+
+	/* realloc  XXX nicer err handling */
+	assert(cache->capacity <= UINT_MAX / 2);
+	assert(cache->capacity <= SIZE_MAX / sizeof *cache->table);
+	if (cache->capacity == 0)	/* initial alloc */
+		newcap = 4;
+	else
+		newcap = cache->capacity * 2;
+#ifdef PROFGROW
+	size_t count=0, count2=0;
+	fprintf(stderr, "\tgrow at %zu/%zu -> newcap %zu\n",
+	    cache->nused, cache->capacity, newcap);
+#endif
+	newp = calloc(newcap, sizeof *cache->table);
+	assert(newp != NULL);
+
+	/* relocate items */
+	for (i=0; i < cache->capacity; i++) {
+		const void *k = cache->table[i].key;
+		const char *k2 = cache->table[i].key2;
+
+		if (k == NULL || k2 == NULL)	/* unused or dummy */
+			continue;
+
+		/* find the correct place for (k,k2) in the new table */
+		for (h = hash(k, k2), s=1; ; h+=s, s*=5) {
+			h %= newcap;
+			mem = newp + h;
+			if (mem->key == NULL)	/* unused */
+				break;
+		}
+#ifdef PROFGROW
+		if (s != 1)
+			count2++;
+		if (h != i)
+			count++;
+#endif
+
+		/* copy the entry to the new array */
+		*mem = cache->table[i];
+	}
+#ifdef PROFGROW
+	fprintf(stderr, "\tpos. changed on %zu/%zu entries = %.1f%%\n",
+	    count, cache->nused, count * 100.0 / cache->nused);
+	fprintf(stderr, "\tnow %zu/%zu are off-base, i.e. %.1f%%\n",
+	    count2, cache->nused, count2 * 100.0 / cache->nused);
+#endif
+
+	free(cache->table);
+	cache->table = newp;
+	cache->capacity = newcap;
+}
+
+static inline const void *
+store(struct cache *cache, const void *key, const void *key2, const void *val)
+{
+	struct memento *mem;
+	unsigned int h, s;
+
+	if (cache == NULL)
+		return val;
+
+	/* grow hashtable if necessary */
+	if (cache->nused >= cache->capacity * 9 / 10)
+		grow(cache);
+
+	/* find our place in the table */
+	/* NB: loop will terminate because dummies count as free here */
+	h = hash(key, key2);
+	assert(cache->nused < cache->capacity);
+	for (s=1; ; h+=s, s*=5) {
+		mem = cache->table + (h % cache->capacity);
+		if (mem->key == NULL)	/* unused entry */
+			break;
+		if (mem->key2 == NULL)	/* dummy entry */
+			break;		/* reuse! */
+	}
+
+	/* save the result */
+	assert(mem->key == NULL || mem->key2 == NULL);
+	mem->key = key;
+	mem->key2 = key2;
+	mem->value = val;
+	cache->nused++;
+
+	return val;
+}
+
+#endif /* MINIP_N_H_ */