commit 2bac640ef0b48e6ad7a239c30efd1f40dea37caf from: Sven M. Hallberg date: Fri Jun 12 16:57:09 2020 UTC add memoizing version, first prototype commit - f2567d1f9b90fabbcc831652cc67e265c96659bc commit + 2bac640ef0b48e6ad7a239c30efd1f40dea37caf blob - e00f6349d561119e2b05a05cacc61187f326a3ae blob + f2f7833d69da7a62ef7d5fffc139278b2735576d --- Makefile +++ Makefile @@ -1,11 +1,12 @@ CFLAGS += -std=c99 -Wall -TARGETS = ini_r ini_a +TARGETS = ini_r ini_a ini_m all: $(TARGETS) ini_r : ini_r.c minip_r.h ini_a : ini_a.c minip_a.h +ini_m : ini_m.c minip_m.h clean: @@ -14,5 +15,6 @@ clean: test: all ./ini_r test.ini ./ini_a test.ini | grep -v ' [01] byte ' + ./ini_m test.ini | grep -v ' [01] byte ' .PHONY: all test clean blob - /dev/null blob + 586cce27ebc480dfc85d92e59a3c47b353cb15ee (mode 644) --- /dev/null +++ ini_m.c @@ -0,0 +1,204 @@ +/* + * demo: (simple) ini files + * + * EBNFish: + * + * inifile = {sect} {empty} {ws} [tail] + * sect = header {entry} + * tail = "EOF" {ws} eol {any} + * + * header = {empty} bra sname ket eol + * entry = {empty} key "=" value eol + * empty = {ws} [comment] nl + * + * (* tokens *) + * eol = nl | end + * bra = {ws} "[" + * ket = "]" {ws} + * comment = ";" {lchar} + * sname = {lchar - "]"}+ + * key = {lchar - "="}+ + * value = {lchar}+ + * + * (* character classes *) + * lchar = ? ASCII codes 9 (HT), 0x20-0x7e (SP, alnum, punct) ? + * ws = ? ASCII codes 32 (SP), 9 (HT) ? + * nl = ? ASCII code 10 (LF) ? + * end = ? end of input ? + * any = ? any character ? + * + * converted to be compatible with our combinators: + * + * inifile = sects empties wss tail + * sects = {header entries} + * tail = [eof wss eol anys] + * + * header = empties bra sname ket eol + * entries = {empties key eq value eol} + * empties = {wss comment nl} + * + * eol = nl | end + * bra = wss leftbr + * ket = rightbr wss + * comment = [semi lchars] + * sname = {schar}+ + * key = {kchar}+ + * value = {lchar}+ + * + * eof = "EOF" + * eq = "=" + * semi = ";" + * leftbr = "[" + * rightbr = "]" + * + * lchars = {lchar} + * wss = {ws} + * anys = {any} + * + * lchar = ? ASCII codes 9 (HT), 0x20-0x7e (SP, alnum, punct) ? + * schar = ? ASCII codes 9, 0x20-0x5c, 0x5e-0x7e ? + * kchar = ? ASCII codes 9, 0x20-0x3c, 0x3e-0x7e ? + * ws = ? ASCII codes 32 (SP), 9 (HT) ? + * nl = ? ASCII code 10 (LF) ? + * end = ? end of input ? + * any = ? any character ? + * + * note how right-hand sides have one of the following forms: + * + * - a sequence of nonterminals (SEQ) + * - a sequence of nonterminals inside { } (MANY) + * - a sequence of nonterminals inside { }+ (MANY1) + * - a sequence of nonterminals inside [ ] (OPT) + * - a choice of nonterminals (CHOICE) + * - a string of characters (STRING) + * - a single character (CHAR) + * + * finally, we can realize the "special sequences" (? ... ?) using END, + * ANYCHAR, RANGE, and expression choice ||. + */ + +#include "minip_m.h" +#include + +action trace; +#define DEF_(NT, EXPR) DEF(NT, (EXPR) && ACTION(trace, #NT)) + +DEF_(lchar, CHAR('\t') || RANGE(0x20, 0x7e)) +DEF_(schar, CHAR('\t') || RANGE(0x20, 0x5c) || RANGE(0x5e, 0x7e)) + /* = !CHAR(']') && SEQ(lchar) */ +DEF_(kchar, !CHAR('=') && SEQ(lchar)) +DEF_(ws, CHAR('\t') || CHAR(' ')) +DEF_(nl, CHAR('\n')) +DEF_(end, END) +DEF_(any, ANYCHAR) + +DEF_(lchars, MANY(lchar)) +DEF_(wss, MANY(ws)) +DEF_(anys, MANY(any)) /* = OMEGA */ + +DEF_(eof, STRING("EOF")) +DEF_(eq, CHAR('=')) +DEF_(semi, CHAR(';')) +DEF_(leftbr, CHAR('[')) +DEF_(rightbr, CHAR(']')) + +DEF_(eol, CHOICE(nl, end)) +DEF_(bra, MEMO(wss, leftbr)) +DEF_(ket, SEQ(rightbr, wss)) +DEF_(comment, OPT(semi, lchars)) +DEF_(sname, MANY1(schar)) +DEF_(key, MANY1(kchar)) +DEF_(value, MANY1(lchar)) + +DEF (empty_, MEMO(wss, comment, nl)) +DEF_(empty, TRY(empty_)) +DEF_(empties, MANY(empty)) +DEF_(header, MEMO(empties, bra, sname, ket, eol)) +DEF_(entry, MEMO(empties, key, eq, value, eol)) +DEF_(entries, MANY(entry)) + +DEF_(tail, MEMO(eof, wss, eol, anys) || EPSILON) +DEF_(sects, MANY(header, entries)) +DEF_(inifile, SEQ(sects, empties, wss, tail)) + +bool +trace(void *ctx, void *env, const char *s, size_t len) +{ + const char *nt = ctx; + const char *begin = env; + size_t pos; + + if (env == NULL) + return true; + + pos = s - begin; + printf("%4zx: %4zu byte %s\n", pos, len, nt); + return true; +} + + +#include +#include /* calloc */ +#include /* open, lseek */ +#include /* mmap */ +#include +#include + +extern char *__progname; + +/* + * run the 'inifile' parser on a file given on the command line. + */ +int +main(int argc, char *argv[]) +{ + struct cache cache = {}; + const char *infile; + const char *input; + const char *p; + int fd; + off_t o; + size_t sz, pos; + + if (argc < 2) { + fprintf(stderr, "usage: %s file\n", __progname); + return 3; + } + infile = argv[1]; + + /* mmap input */ + if ((fd = open(infile, O_RDONLY)) == -1) + err(2, "%s", infile); + if ((o = lseek(fd, 0, SEEK_END)) == -1) + err(2, "lseek"); + sz = o; + input = mmap(NULL, sz, PROT_READ, MAP_PRIVATE, fd, 0); + if (input == MAP_FAILED) + err(2, "mmap"); + + /* result cache */ + assert(cache.table == NULL); + assert(cache.capacity == 0); + assert(cache.nused == 0); + //cache.capacity = 128; + //cache.table = calloc(cache.capacity, sizeof(cache.table[0])); + //assert(cache.table != NULL); + + /* run parser */ + p = inifile((struct stream){input, input + sz}, &cache, (void *)input); + if (p == NULL) { + fprintf(stderr, "%s: syntax error\n", infile); + return 1; + } + assert(p > input); + assert(p <= input + sz); + pos = p - input; + if (pos < sz) { + fprintf(stderr, "%s: syntax error (after pos. %zu/%zu)\n", + infile, pos, sz); + return 1; + } + printf("success (consumed %zu/%zu bytes of input)\n", pos, sz); + + return 0; +} blob - /dev/null blob + 0c3683eb3845e8842f541548139e119e4fa502f6 (mode 644) --- /dev/null +++ minip_m.h @@ -0,0 +1,346 @@ +/* miniature recursive-descent parser combinators + * with semantic actions + * + * pesco 2019, 2020 + */ +#ifndef MINIP_M_H_ +#define MINIP_M_H_ + +#include /* NULL */ + +struct stream {const char *p, *end;}; +struct cache; + +/* parser type -- corresponds to grammar nonterminals */ +typedef const char *parser(struct stream, struct cache *, void *); + +/* define a parser (nonterminal) */ +#define DEF(NT, EXPR) \ + static const char * \ + NT(struct stream inp_, struct cache *cac_, void *env_) \ + { \ + const char *p_ = inp_.p; \ + return (EXPR) ? p_ : NULL; \ + } + +/* primitive expressions */ +#define ANYCHAR (p_ = anychar_(inp_)) +#define CHAR(C) (p_ = char_(inp_, (C))) +#define RANGE(M, N) (p_ = range_(inp_, (M), (N))) +#define STRING(S) (p_ = string_(inp_, (S))) +#define END (p_ = end_(inp_)) +#define EPSILON (p_ = inp_.p) +#define OMEGA (p_ = inp_.end) + +/* + * logical operators allowed on expressions: + * + * || (ordered) choice; first match wins + * && restriction (lookahead); rhs is consumed + * ! negation; only allowed to the left of && + */ + +/* variable-length argument lists */ +#define NARGS(T, ...) (sizeof (T[]){__VA_ARGS__} / sizeof(T)) +#define VARGS(T, ...) (T[]){__VA_ARGS__}, NARGS(T, __VA_ARGS__) +#define PARGS(...) VARGS(parser *, __VA_ARGS__) + +/* basic combinators -- take a list of nonterminals as arguments */ +#define CHOICE(...) (p_ = choice_(inp_, cac_, env_, PARGS(__VA_ARGS__))) +#define SEQ(...) (p_ = seq_(inp_, cac_, env_, PARGS(__VA_ARGS__))) + +/* derived combinators; arguments form an implicit SEQ */ +#define MANY(...) (p_ = many_(inp_, cac_, env_, PARGS(__VA_ARGS__))) +#define MANY1(...) (p_ = many1_(inp_, cac_, env_, PARGS(__VA_ARGS__))) +#define OPT(...) (SEQ(__VA_ARGS__) || EPSILON) + + +#include + +/* + * semantic actions: + * + * - attach to an expression with &&: (EXPR) && ACTION(f, ctx) + * - receive an arbitrary context parameter via ACTION()'s second argument. + * - receive an arbitrary environment pointer from the top-level parser call. + * - execute (only) after their subordinate parser(s) matched. + * - can also act as validations by returning the intended result. + */ +typedef bool action(void *, void *, const char *, size_t); +#define ACTION(F, C) (F)((C), env_, inp_.p, p_ - inp_.p) + + +/* + * to avoid premature execution of actions, TRY(...) can be used in place of + * SEQ on any sequence where actions could execute before a later nonterminal + * fails. note: + * + * - single-element sequences do not need TRY. + * - multi-argument calls of MANY, MANY1, OPT may need to be replaced. + * - OPT(...) can be replaced with TRY(...) || EPSILON. + * + * in any case, beware of the overhead. since TRY executes the sequence twice, + * it can easily lead to a substantial increase in recognition time. + */ +#define MATCH(...) seq_(inp_, NULL, NULL, PARGS(__VA_ARGS__)) +#define TRY(...) ((env_ == NULL || MATCH(__VA_ARGS__)) && SEQ(__VA_ARGS__)) + + +/* + * memoization of parse results (~ packrat parsing) + */ +#define MEMO(...) (p_ = memo_(inp_, cac_, env_, __func__, PARGS(__VA_ARGS__))) + +/* an entry in the memoization hash table */ +struct memento { + const char *position; /* NULL: unused entry */ + const char *symbol; /* NULL: dummy entry -- may be reused */ + const char *result; +}; + +struct cache { + struct memento *table; + size_t capacity, nused; +}; + + +#include + +static inline const char * +anychar_(struct stream inp) +{ + if (inp.p < inp.end) + return inp.p + 1; + else + return NULL; +} + +static inline const char * +char_(struct stream inp, char c) +{ + if (inp.p < inp.end && *inp.p == c) + return inp.p + 1; + else + return NULL; +} + +static inline const char * +range_(struct stream inp, unsigned char m, unsigned char n) +{ + unsigned char b = *(unsigned char *)inp.p; + + if (inp.p < inp.end && b >= m && b <= n) + return inp.p + 1; + else + return NULL; +} + +static inline const char * +string_(struct stream inp, const char *s) +{ + while (inp.p < inp.end && *s != '\0' && *inp.p == *s) { + inp.p++; + s++; + } + if (*s == '\0') /* did we match the full string? */ + return inp.p; + else + return NULL; +} + +static inline const char * +end_(struct stream inp) +{ + if (inp.p >= inp.end) + return inp.end; + else + return NULL; +} + +static inline const char * +seq_(struct stream inp, struct cache *cac, void *env, parser *ps[], size_t n) +{ + for (size_t i=0; i bak); /* progress */ + } +} + +static inline const char * +many1_(struct stream inp, struct cache *cac, void *env, parser *ps[], size_t n) +{ + inp.p = seq_(inp, cac, env, ps, n); + if (inp.p == NULL) + return NULL; + return many_(inp, cac, env, ps, n); +} + +#include /* strcmp */ +#include /* realloc */ +#include /* UINT_MAX */ +#include /* SIZE_MAX */ + +#include // XXX debug + +static inline const char * +memo_(struct stream inp, struct cache *cache, void *env, const char *sym, + parser *ps[], size_t n) +{ + struct memento *mem; + const char *res; + size_t i; + unsigned int h0 = 5831; /* djbhash; f(h,x) = 33h + x */ + unsigned int h, s; + + if (cache == NULL) + return seq_(inp, cache, env, ps, n); + + /* NB: + * the hash math is modulo cache->capacity, but when that is a power of + * two, we can run the intermediates modulo UINT_MAX+1. + */ + + /* hash (inp,sym) */ + for (i=0; i < sizeof inp; i++) + h0 = h0 * 33 + ((unsigned char *)&inp.p)[i]; + for (i=0; sym[i] != '\0'; i++) + h0 = h0 * 33 + (unsigned char)sym[i]; + + /* look up (inp,sym) in cache and return if found */ + /* NB: count steps; the table could be filled with dummies */ + for (i=0, h=h0, s=1; i < cache->capacity; i++, h+=s, s*=5) { + mem = cache->table + (h % cache->capacity); + if (mem->position == NULL) /* not in table */ + break; + if (mem->symbol == NULL) /* dummy entry */ + continue; + if (mem->position == inp.p && strcmp(mem->symbol, sym) == 0) + return mem->result; /* match */ + } + + /* else run the argument sequence */ + res = seq_(inp, cache, env, ps, n); + + /* grow hashtable if necessary */ + if (cache->nused >= cache->capacity * 9 / 10) { + void *newp; + size_t newcap; + size_t j; + + /* realloc XXX nicer err handling */ + assert(cache->capacity <= UINT_MAX / 2); + assert(cache->capacity <= SIZE_MAX / sizeof *cache->table); + if (cache->capacity == 0) /* initial alloc */ + newcap = 4; + else + newcap = cache->capacity * 2; + fprintf(stderr, "XXX grow at %zu/%zu -> newcap %zu\n", + cache->nused, cache->capacity, newcap); + newp = realloc(cache->table, newcap * sizeof *cache->table); + memset(newp + cache->capacity, 0, + (newcap - cache->capacity) * sizeof *newp); + assert(newp != NULL); + if (newp != cache->table) + fprintf(stderr, "XXX TABLE MOVE!\n"); + cache->table = newp; + + /* relocate items as necessary */ + size_t count=0, count2=0; // XXX + for (i=0; i < cache->capacity; i++) { + const char *pi = cache->table[i].position; + const char *si = cache->table[i].symbol; + + if (pi == NULL || si == NULL) /* unused or dummy */ + continue; + + /* hash the value (pi,si) in table cell i */ + h = 5831; + for (j=0; j < sizeof pi; j++) + h = h * 33 + ((unsigned char *)&pi)[j]; + for (j=0; si[j] != '\0'; j++) + h = h * 33 + (unsigned char)si[j]; + + /* find the correct place for it in the new table */ + for (s=1; ; h+=s, s*=5) { + h %= newcap; + mem = cache->table + h; + if (h == i) /* same spot */ + break; + if (mem->position == NULL) /* unused */ + break; + //if (mem->symbol == NULL) /* dummy */ + // break; /* reuse! */ + } + if (s != 1) + count2++; + + /* move the entry if needed, leaving a dummy behind */ + if (h != i) { + fprintf(stderr, "XXX relocate %zu -> %u %s\n", + i, h, h < cache->capacity ? "(!)" : ""); + count++; // XXX + *mem = cache->table[i]; + cache->table[i].symbol = NULL; + } + } + fprintf(stderr, "XXX had to move %zu/%zu entries, i.e. %.1f%%\n", + count, cache->nused, count * 100.0 / cache->nused); + fprintf(stderr, "XXX now %zu/%zu are off-base, i.e. %.1f%%\n", + count2, cache->nused, count2 * 100.0 / cache->nused); + + cache->capacity = newcap; + } + + /* find our place in the table */ + assert(cache->nused < cache->capacity); + /* NB: loop will terminate because dummies count as free here */ + for (h=h0, s=1; ; h+=s, s*=5) { + mem = cache->table + (h % cache->capacity); + if (mem->position == NULL) /* unused entry */ + break; + if (mem->symbol == NULL) /* dummy entry */ + break; /* reuse! */ + } + + /* save the result */ + assert(mem->position == NULL || mem->symbol == NULL); + mem->position = inp.p; + mem->symbol = sym; + mem->result = res; + cache->nused++; + + return res; +} + +#endif /* MINIP_M_H_ */