commit 9d2eb8f8b610e4c920b0c463618df5452e4641c6 from: Sven M. Hallberg date: Mon Dec 05 04:19:41 2022 UTC add minip_b, ini_b (boolean return value variant) commit - e46cabf4dabf47bee184092e4cdb372e9076f3ca commit + 9d2eb8f8b610e4c920b0c463618df5452e4641c6 blob - bcb4b3f4cd14e31b2406f1b529508c88664fbbbc blob + 0a017a655ec96e3cbcddaa9bd43b6fab9b3eaee6 --- Makefile +++ Makefile @@ -1,6 +1,6 @@ CFLAGS += -std=c99 -Wall -TARGETS = ini_r ini_a ini_m ini_n ini_j +TARGETS = ini_r ini_a ini_m ini_n ini_j ini_b all: $(TARGETS) @@ -9,6 +9,7 @@ ini_a : ini_a.c minip_a.h ini_m : ini_m.c minip_m.h ini_n : ini_n.c minip_n.h ini_j : ini_j.c minip_n.h +ini_b : ini_b.c minip_b.h clean: @@ -20,5 +21,6 @@ test: all ./ini_m test.ini | grep -v ' [01] byte ' ./ini_n test.ini | grep -v ' [01] byte ' ./ini_j test.ini 2>/dev/null + ./ini_b test.ini 2>/dev/null .PHONY: all test clean blob - /dev/null blob + 5b5687f09b7c9624703a8562fb92434a01c6059f (mode 644) --- /dev/null +++ ini_b.c @@ -0,0 +1,262 @@ +/* + * demo: (simple) ini files, converted to json + * + * EBNFish: + * + * inifile = {sect} {empty} {ws} [tail] + * sect = header {entry} + * tail = "EOF" {ws} eol {any} + * + * header = {empty} bra sname ket eol + * entry = {empty} key "=" value eol + * empty = {ws} [comment] nl + * + * (* tokens *) + * eol = nl | end + * bra = {ws} "[" + * ket = "]" {ws} + * comment = ";" {lchar} + * sname = {lchar - "]"}+ + * key = {lchar - "="}+ + * value = {lchar}+ + * + * (* character classes *) + * lchar = ? ASCII codes 9 (HT), 0x20-0x7e (SP, alnum, punct) ? + * ws = ? ASCII codes 32 (SP), 9 (HT) ? + * nl = ? ASCII code 10 (LF) ? + * end = ? end of input ? + * any = ? any character ? + * + * converted to be compatible with our combinators: + * + * inifile = sects empties wss tail + * sects = {header entries} + * tail = [eof wss eol anys] + * + * header = empties bra sname ket eol + * entries = {empties key eq value eol} + * empties = {wss comment nl} + * + * eol = nl | end + * bra = wss leftbr + * ket = rightbr wss + * comment = [semi lchars] + * sname = {schar}+ + * key = {kchar}+ + * value = {lchar}+ + * + * eof = "EOF" + * eq = "=" + * semi = ";" + * leftbr = "[" + * rightbr = "]" + * + * lchars = {lchar} + * wss = {ws} + * anys = {any} + * + * lchar = ? ASCII codes 9 (HT), 0x20-0x7e (SP, alnum, punct) ? + * schar = ? ASCII codes 9, 0x20-0x5c, 0x5e-0x7e ? + * kchar = ? ASCII codes 9, 0x20-0x3c, 0x3e-0x7e ? + * ws = ? ASCII codes 32 (SP), 9 (HT) ? + * nl = ? ASCII code 10 (LF) ? + * end = ? end of input ? + * any = ? any character ? + * + * note how right-hand sides have one of the following forms: + * + * - a sequence of nonterminals (SEQ) + * - a sequence of nonterminals inside { } (MANY) + * - a sequence of nonterminals inside { }+ (MANY1) + * - a sequence of nonterminals inside [ ] (OPT) + * - a choice of nonterminals (CHOICE) + * - a string of characters (STRING) + * - a single character (CHAR) + * + * finally, we can realize the "special sequences" (? ... ?) using END, + * ANYCHAR, RANGE, and expression choice ||. + */ + +#include "minip_b.h" +#include + +action trace, print, print_string; +#define DEF_(NT, EXPR) DEF(NT, (EXPR) && ACTION(trace, #NT)) + +DEF (lchar, CHAR('\t') || RANGE(0x20, 0x7e)) +DEF (schar, CHAR('\t') || RANGE(0x20, 0x5c) || RANGE(0x5e, 0x7e)) + /* = !CHAR(']') && SEQ(lchar) */ +DEF (kchar, CHAR('\t') || RANGE(0x20, 0x3c) || RANGE(0x3e, 0x7e)) + /* = !CHAR('=') && SEQ(lchar) */ +DEF (ws, CHAR('\t') || CHAR(' ')) +DEF_(nl, CHAR('\n')) +DEF_(end, END) +DEF_(any, ANYCHAR) + +DEF_(lchars, MANY(lchar)) +DEF_(wss, MANY(ws)) +DEF_(anys, MANY(any)) /* = OMEGA */ + +DEF_(eof, STRING("EOF")) +DEF_(eq, CHAR('=') + && ACTION(print, ": ")) +DEF_(semi, CHAR(';')) +DEF_(leftbr, CHAR('[')) +DEF_(rightbr, CHAR(']')) + +DEF_(eol, CHOICE(nl, end)) +DEF_(bra, SEQ(wss, leftbr)) +DEF_(ket, SEQ(rightbr, wss)) +DEF_(comment, SEQ(semi, lchars)) +DEF (comment_o, OPT(comment)) +DEF_(sname, MANY1(schar) + && ACTION(print_string, " ")) +DEF_(key, MANY1(kchar) + && ACTION(print_string, " ")) +DEF_(value, MANY1(lchar) + && ACTION(print_string, "")) + +DEF_(out_comma, ACTION(print, ",\n")) + +DEF_(empty, SEQ(wss, comment_o, nl)) +DEF_(empties, MANY(empty)) +DEF_(header, SEQ(empties, bra, sname, ket, eol) + && ACTION(print, ": {\n")) +DEF_(entry, SEQ(empties, key, eq, value, eol)) +DEF (entry_x, SEQ(out_comma, entry)) +DEF (entries_x, MANY(entry_x)) +DEF_(entries, SEQ(entry, entries_x) + && ACTION(print, "\n }")) + +DEF_(tail, SEQ(eof, wss, eol, anys)) +DEF (tail_o, OPT(tail)) +DEF_(sect, SEQ(header, entries)) +// (A) +DEF (sect_x, SEQ(out_comma, sect)) +DEF (sects_x, MANY(sect_x)) +DEF (sects1, SEQ(sect, sects_x)) +// (B) +//static parser sects2; +//DEF (sect_x, SEQ(out_comma, sect)) +//DEF (sects3, SEQ(sect_x, sects2)) +//DEF (sects2, OPT(sects3)) +//DEF (sects1, SEQ(sect, sects2)) +// (C) +//static parser sects1; +//DEF (sects2, SEQ(sect, out_comma, sects1)) /* if only tail-recursive */ +//DEF (sects1, CHOICE(sects2, sect)) +DEF_(sects, ACTION(print, "{\n") && + OPT(sects1) + && ACTION(print, "\n}\n")) +DEF_(inifile, TRY(sects, empties, wss, tail_o)) + +bool +trace(void *ctx, void *env, const char *s, size_t len) +{ + const char *nt = ctx; + const char *begin = env; + size_t pos; + + pos = s - begin; + fprintf(stderr, "%4zx: %4zu byte %s\n", pos, len, nt); + return true; +} + +bool +print(void *ctx, void *env, const char *s, size_t len) +{ + fputs(ctx, stdout); + return true; +} + +bool +print_string(void *ctx, void *env, const char *s, size_t len) +{ + size_t i; + unsigned char c; + + fputs(ctx, stdout); /* prefix string */ + + putchar('"'); + for (i = 0; i < len; i++) { + c = (unsigned char)s[i]; + assert(c <= 127); // XXX only ASCII + if (c == '"') + fputs("\\\"", stdout); + else if (c == '\\') + fputs("\\\\", stdout); + else if (c == '\t') + fputs("\\t", stdout); + else if (c <= 0x1F || c == 0x7F) /* i.e. iscntrl */ + printf("\\u00%.2hhx", c); + else + putchar(c); + } + putchar('"'); + + return true; +} + + +#include +#include /* calloc */ +#include /* open, lseek */ +#include /* mmap */ +#include +#include + +extern char *__progname; + +/* + * run the 'inifile' parser on a file given on the command line. + */ +int +main(int argc, char *argv[]) +{ + struct cache cache = {}; + const char *infile; + const char *input; + int fd; + off_t o; + size_t sz, pos; + + if (argc < 2) { + fprintf(stderr, "usage: %s file\n", __progname); + return 3; + } + infile = argv[1]; + + /* mmap input */ + if ((fd = open(infile, O_RDONLY)) == -1) + err(2, "%s", infile); + if ((o = lseek(fd, 0, SEEK_END)) == -1) + err(2, "lseek"); + sz = o; + input = mmap(NULL, sz, PROT_READ, MAP_PRIVATE, fd, 0); + if (input == MAP_FAILED) + err(2, "mmap"); + + /* result cache */ + assert(cache.table == NULL); + assert(cache.capacity == 0); + assert(cache.nused == 0); + + /* run parser */ + struct stream inp = {input, input + sz}; + if (!inifile(&inp, &cache, (void *)input)) { + assert(inp.p == input); + fprintf(stderr, "%s: syntax error\n", infile); + return 1; + } + assert(inp.p > input); + assert(inp.p <= input + sz); + pos = inp.p - input; + if (pos < sz) { + fprintf(stderr, "%s: syntax error (after pos. %zu/%zu)\n", + infile, pos, sz); + return 1; + } + fprintf(stderr, "success (consumed %zu/%zu bytes of input)\n", pos, sz); + + return 0; +} blob - /dev/null blob + ff87dfcd6b28f1ac929eb32f8b579236ca0125de (mode 644) --- /dev/null +++ minip_b.h @@ -0,0 +1,395 @@ +/* miniature recursive-descent parser combinators + * with semantic actions and omnibus negative memoization + * employing boolean return type and mutable input pointer + * + * pesco 2019, 2020 + */ +#ifndef MINIP_N_H_ +#define MINIP_N_H_ + +#include /* NULL */ +#include + +struct stream {const char *p, *end;}; +struct cache; + +/* parser type -- corresponds to grammar nonterminals */ +typedef bool parser(struct stream *, struct cache *, void *); + +/* define a parser (nonterminal) */ +#define DEF(NT, EXPR) \ + static bool \ + NT(struct stream *inp_, struct cache *cac_, void *env_) \ + { \ + const char *pos_ = inp_->p; \ + struct memento *mem_; \ + \ + /* look up (pos,sym) in cache and return if found */ \ + if ((mem_ = lookup(cac_, pos_, __func__)) != NULL) { \ + if (mem_->value == NULL) \ + return false; \ + inp_->p = mem_->value; \ + return true; \ + } \ + \ + /* else run the expression and return if it matched */ \ + if (EXPR) \ + return true; \ + \ + /* else, return and memoize a negative result */ \ + store(cac_, pos_, __func__, NULL); \ + return false; \ + } + +/* primitive expressions */ +#define ANYCHAR anychar_(inp_) +#define CHAR(C) char_(inp_, (C)) +#define RANGE(M, N) range_(inp_, (M), (N)) +#define STRING(S) string_(inp_, (S)) +#define END end_(inp_) +#define EPSILON true +#define OMEGA (inp_->p = inp_->end, true) + +/* + * logical operators allowed on expressions *sans* actions: + * + * && restriction (lookahead); rhs is consumed + * || ordered choice + * ! negation; only allowed to the left of && (to avoid consumption) + */ + +/* variable-length argument lists */ +#define NARGS(T, ...) (sizeof (T[]){__VA_ARGS__} / sizeof(T)) +#define VARGS(T, ...) (T[]){__VA_ARGS__}, NARGS(T, __VA_ARGS__) +#define PARGS(...) VARGS(parser *, __VA_ARGS__) + +/* basic combinators -- take a list of nonterminals as arguments */ +#define CHOICE(...) choice_(inp_, cac_, env_, PARGS(__VA_ARGS__)) +#define SEQ(...) seq_(inp_, cac_, env_, PARGS(__VA_ARGS__)) + +/* secondary combinators -- could be derived from the basics */ +#define MANY(X) many_(inp_, cac_, env_, X) +#define MANY1(X) many1_(inp_, cac_, env_, X) +#define OPT(X) ((X)(inp_, cac_, env_), true) + +/* NB: + * because memoization only applies to nonterminals and the above combinators + * (MANY, MANY1, OPT) form implicit choices with them, we can't use variable + * argument lists that form an implicit SEQ here. otherwise, since those + * anonymous argument sequences would not be memoized (memoization only happens + * on actual nonterminals), actions could be executed in them, even if the + * sequence failed to match in a later element. + */ + +/* + * semantic actions: + * + * - attach to an expression with &&: EXPR && ACTION(f, ctx) + * - receive an arbitrary context parameter via ACTION()'s second argument. + * - receive an arbitrary environment pointer from the top-level parser call. + * - execute (only) after their subordinate parser(s) matched. + * - can also act as validations by returning the intended result. + * + * Thanks to prerecognition and negative memoization, it is also possible to + * attach actions before the expression. E.g.: + * + * ACTION(print, "[") && EXPR && ACTION(print, "]") + */ +typedef bool action(void *, void *, const char *, size_t); +#define ACTION(F, C) (env_ == NULL || (F)((C), env_, pos_, inp_->p - pos_)) + + +/* + * to avoid premature execution of actions, TRY(...) can be used in place of + * SEQ on any sequence where actions could execute before a later nonterminal + * fails. note: + * + * - single-element sequences do not need TRY. + * - multi-argument calls of MANY, MANY1, OPT may need to be replaced. + * - OPT(...) can be replaced with TRY(...) || EPSILON. + * + * in any case, beware of the overhead. since TRY executes the sequence twice, + * it can easily lead to a substantial increase in recognition time. + */ +#define MATCH(...) (seq_(inp_, cac_, NULL, PARGS(__VA_ARGS__)) && (inp_->p = pos_)) +#define TRY(...) ((env_ == NULL || MATCH(__VA_ARGS__)) && SEQ(__VA_ARGS__)) + + +/* + * general memoization of parse results (~ packrat parsing) + * + * handles the positive cases, but only when actions are enabled, i.e. env + * pointer is non-null. otherwise, actions would be lost - cf. TRY(). + * + * NB: negative results are always cached - see DEF(). + * + * usage: DEF(nt, EXPR && MEMO) + */ +#define MEMO (env_ == NULL || store(cac_, pos_, __func__, inp_->p)) + +/* an entry in the memoization hash table */ +struct memento { + const void *key; /* NULL: unused entry */ + const void *key2; /* NULL: dummy entry -- may be reused */ + const void *value; +}; + +struct cache { + struct memento *table; + size_t capacity, nused; +}; + + +#include + +static inline bool +anychar_(struct stream *inp) +{ + if (inp->p < inp->end) + return (inp->p++, true); + else + return false; +} + +static inline bool +char_(struct stream *inp, char c) +{ + if (inp->p < inp->end && *inp->p == c) + return (inp->p++, true); + else + return false; +} + +static inline bool +range_(struct stream *inp, unsigned char m, unsigned char n) +{ + unsigned char b = *(unsigned char *)inp->p; + + if (inp->p < inp->end && b >= m && b <= n) + return (inp->p++, true); + else + return false; +} + +static inline bool +string_(struct stream *inp, const char *s) +{ + const char *p = inp->p; + + while (p < inp->end && *s != '\0' && *p == *s) { + p++; + s++; + } + if (*s == '\0') /* did we match the full string? */ + return (inp->p = p, true); + else + return false; /* inp->p remains untouched */ +} + +static inline bool +end_(struct stream *inp) +{ + if (inp->p >= inp->end) + return (inp->p = inp->end, true); + else + return false; +} + +static inline bool +seq_(struct stream *inp, struct cache *cac, void *env, parser *ps[], size_t n) +{ + const char *bak = inp->p; + + for (size_t i=0; ip = bak, false); + } + + return true; +} + +static inline bool +choice_(struct stream *inp, struct cache *cac, void *env, parser *ps[], size_t n) +{ + const char *bak = inp->p; + + for (size_t i=0; ip == bak); + } + + return false; +} + +static inline bool +many_(struct stream *inp, struct cache *cac, void *env, parser *pa) +{ + const char *bak; + + for (;;) { + bak = inp->p; + if (!pa(inp, cac, env)) + return (inp->p = bak, true); + assert(inp->p > bak); /* progress */ + } + /* not reached */ +} + +static inline bool +many1_(struct stream *inp, struct cache *cac, void *env, parser *pa) +{ + const char *bak = inp->p; + + if (!pa(inp, cac, env)) + return (inp->p = bak, false); + return many_(inp, cac, env, pa); +} + + +#include /* calloc */ +#include /* UINT_MAX */ +#include /* SIZE_MAX */ + +#include // XXX debug + +static inline unsigned int +hash(const void *key, const void *key2) +{ + unsigned int h = 5831; /* djbhash; f(h,x) = 33h + x */ + size_t i; + + for (i=0; i < sizeof key; i++) + h = h * 33 + ((unsigned char *)&key)[i]; + for (i=0; i < sizeof key2; i++) + h = h * 33 + ((unsigned char *)&key2)[i]; + + return h; +} + +/* NB: + * the hash math is modulo cache->capacity, but when that is a power of + * two, we can run the intermediates modulo UINT_MAX+1. + */ + +static inline struct memento * +lookup(const struct cache *cache, const void *key, const void *key2) +{ + struct memento *mem; + unsigned int h, s; + size_t i; + + if (cache == NULL) + return NULL; + + h = hash(key, key2); + + /* NB: count steps, since the table could be filled with dummies */ + for (i=0, s=1; i < cache->capacity; i++, h+=s, s*=5) { + mem = cache->table + (h % cache->capacity); + if (mem->key == NULL) /* not in table */ + break; + if (mem->key2 == NULL) /* dummy entry */ + continue; + if (mem->key == key && mem->key2 == key2) + return mem; /* match */ + } + + return NULL; +} + +static inline void +grow(struct cache *cache) +{ + struct memento *mem, *newp; + unsigned int h, s; + size_t i, newcap; + + /* realloc XXX nicer err handling */ + assert(cache->capacity <= UINT_MAX / 2); + assert(cache->capacity <= SIZE_MAX / sizeof *cache->table); + if (cache->capacity == 0) /* initial alloc */ + newcap = 4; + else + newcap = cache->capacity * 2; +#ifdef PROFGROW + size_t count=0, count2=0; + fprintf(stderr, "\tgrow at %zu/%zu -> newcap %zu\n", + cache->nused, cache->capacity, newcap); +#endif + newp = calloc(newcap, sizeof *cache->table); + assert(newp != NULL); + + /* relocate items */ + for (i=0; i < cache->capacity; i++) { + const void *k = cache->table[i].key; + const char *k2 = cache->table[i].key2; + + if (k == NULL || k2 == NULL) /* unused or dummy */ + continue; + + /* find the correct place for (k,k2) in the new table */ + for (h = hash(k, k2), s=1; ; h+=s, s*=5) { + h %= newcap; + mem = newp + h; + if (mem->key == NULL) /* unused */ + break; + } +#ifdef PROFGROW + if (s != 1) + count2++; + if (h != i) + count++; +#endif + + /* copy the entry to the new array */ + *mem = cache->table[i]; + } +#ifdef PROFGROW + fprintf(stderr, "\tpos. changed on %zu/%zu entries = %.1f%%\n", + count, cache->nused, count * 100.0 / cache->nused); + fprintf(stderr, "\tnow %zu/%zu are off-base, i.e. %.1f%%\n", + count2, cache->nused, count2 * 100.0 / cache->nused); +#endif + + free(cache->table); + cache->table = newp; + cache->capacity = newcap; +} + +static inline const void * +store(struct cache *cache, const void *key, const void *key2, const void *val) +{ + struct memento *mem; + unsigned int h, s; + + if (cache == NULL) + return val; + + /* grow hashtable if necessary */ + if (cache->nused >= cache->capacity * 9 / 10) + grow(cache); + + /* find our place in the table */ + /* NB: loop will terminate because dummies count as free here */ + h = hash(key, key2); + assert(cache->nused < cache->capacity); + for (s=1; ; h+=s, s*=5) { + mem = cache->table + (h % cache->capacity); + if (mem->key == NULL) /* unused entry */ + break; + if (mem->key2 == NULL) /* dummy entry */ + break; /* reuse! */ + } + + /* save the result */ + assert(mem->key == NULL || mem->key2 == NULL); + mem->key = key; + mem->key2 = key2; + mem->value = val; + cache->nused++; + + return val; +} + +#endif /* MINIP_N_H_ */