commit 43cc6062210e39cc517683aa2b419b2dc1a4d944 from: Sven M. Hallberg date: Thu Feb 10 22:04:46 2022 UTC support suspend / chunked operation (split out END processing) commit - ed4d5d022fc7fef0a97e5bcd973f93d3f5eaa331 commit + 43cc6062210e39cc517683aa2b419b2dc1a4d944 blob - 1722e03753bff277083f0fa53e6b78aab8d46692 blob + c7323c7d2b5a930b765a19eda9edbed6106bdd7e --- nfa.c +++ nfa.c @@ -421,6 +421,9 @@ struct prep { struct set next; /* set of "next active" states */ struct set eps; /* which states carry epsilon edges */ + int pos; /* current input position */ + int match; /* length of longest match so far */ + int sz; bunch_t buf[]; }; @@ -432,6 +435,8 @@ nfareset(struct prep *pr) pr->act.sz = pr->next.sz = pr->sz; pr->act.bits = pr->buf; pr->next.bits = pr->buf + pr->sz; + pr->pos = 0; + pr->match = -1; /* no match */ memset(pr->act.bits, 0, pr->act.sz * sizeof(bunch_t)); setinsert(&pr->act, 0); @@ -482,7 +487,7 @@ nfaprep(NFA a, sub_t *sub, out_t *out, void *env) * returns 1 iff any states remain in the 'work' set, 0 otherwise. */ static int -epsilon_step(struct prep *pr, struct set *act, struct set *work, const char *p) +epsilon_step(struct prep *pr, struct set *act, struct set *work) { NFA a = pr->a; struct state s; @@ -519,7 +524,7 @@ epsilon_step(struct prep *pr, struct set *act, struct */ if (s.chr.base == 0 || (pr->sub != NULL && - pr->sub(s.chr.base, k, p, pr->env))) { + pr->sub(s.chr.base, k, pr->pos, pr->env))) { x = k + s.edge_chr; setinsert(work, x); setinsert(act, x); @@ -552,13 +557,13 @@ epsilon_step(struct prep *pr, struct set *act, struct } static void -epsilon_closure(struct prep *pr, struct set *act, struct set *wrk, const char *p) +epsilon_closure(struct prep *pr, struct set *act, struct set *wrk) { /* initial workset = initial active set */ assert(wrk->sz == act->sz); memcpy(wrk->bits, act->bits, act->sz * sizeof(bunch_t)); - while (epsilon_step(pr, act, wrk, p)); + while (epsilon_step(pr, act, wrk)); /* NB: wrk is now empty */ } @@ -625,7 +630,7 @@ matchchr(NFA a, const struct set *act, struct set *nex * returns the symbol produced or -1 if no output is possible. */ static int -generate(struct prep *pr, const struct set *act, const char *p) +generate(struct prep *pr, const struct set *act) { struct range r; size_t k; @@ -643,7 +648,8 @@ generate(struct prep *pr, const struct set *act, const assert(r.base + r.n >= OUTPUT); /* all output */ if (pr->out != NULL) { - x = pr->out(r, k, p, pr->env); /* attempt output */ + /* attempt output */ + x = pr->out(r, k, pr->pos, pr->env); if (x != -1) /* success? */ return x; } @@ -652,66 +658,72 @@ generate(struct prep *pr, const struct set *act, const return -1; /* no output possible */ } -int -nfacont(struct prep *pr, const char *input, size_t sz) +static int +nfastep(struct prep *pr, int x, struct set *act, struct set *next) { NFA a = pr->a; - struct set act, next; bunch_t *tmp; - int i, x; - int result = -1; /* no match */ - const char *p; + int r = 0; /* consume input? */ - /* get state sets from prep */ - act = pr->act; - next = pr->next; + printstates(act, x); /* diagnostic output */ - /* iterate over the input */ - for(i = 0; i <= sz; ) { - p = input + i; /* input pointer */ - x = (i < sz ? *p : END); /* input symbol */ - - printstates(&act, x); /* diagnostic output */ - - /* - * form epsilon closure of act - also handles inner input. - * uses, and incidentally clears, our 'next' set for its work. - */ - // XXX inner input not possible after last outer input - // - but since last input symbol is END, this is okay - epsilon_closure(pr, &act, &next, p); - - if (inset(&act, a.size)) /* final state? */ - result = i; /* save position */ - - if (matchchr(a, &act, &next, x)) /* consume input? */ - i++; - else { - x = generate(pr, &act, p); /* generate output? */ - if (x == -1) - break; /* no output edges here */ - if (!matchchr(a, &act, &next, x)) /* fill next */ - break; /* should not happen */ - } + /* + * form epsilon closure of act - also handles inner input. + * uses, and incidentally clears, our 'next' set for its work. + */ + epsilon_closure(pr, act, next); - /* swap sets */ - tmp = act.bits; - act.bits = next.bits; - next.bits = tmp; + if (inset(act, a.size)) /* final state? */ + pr->match = pr->pos; /* save position */ + + if (matchchr(a, act, next, x)) /* consume input? */ + r = 1; + else { + x = generate(pr, act); /* generate output? */ + if (x == -1) + r = -1; /* no output edges here */ + if (!matchchr(a, act, next, x)) /* fill next */ + r = -1; /* should not happen */ } - /* store state sets back into prep for future use */ - pr->act = act; - pr->next = next; + /* swap sets */ + tmp = act->bits; + act->bits = next->bits; + next->bits = tmp; - return result; + return r; } int +nfacont(struct prep *pr, const char *input, size_t sz) +{ + int i, r; + + for(i = 0; i < sz; i += r) { + r = nfastep(pr, input[i], &pr->act, &pr->next); + if (r == -1) /* no match */ + break; + pr->pos += r; + } + + return pr->match; +} + +int +nfaend(struct prep *pr) +{ + /* run until output ceases */ + while(nfastep(pr, END, &pr->act, &pr->next) == 0); + return pr->match; +} + +int nfarun(struct prep *pr, const char *input, size_t sz) { nfareset(pr); - return nfacont(pr, input, sz); + nfacont(pr, input, sz); + nfaend(pr); + return pr->match; } int blob - 22a469b798cb78f7b90b30d82f3a274e4ddf577b blob + 32fe3ed0306698b10a5c55d7fc0a7f170c32983d --- nfa.h +++ nfa.h @@ -63,13 +63,14 @@ NFA star(NFA); /* running NFAs */ -typedef int sub_t(int symbol, size_t state, const char *pos, void *env); -typedef int out_t(struct range r, size_t state, const char *pos, void *env); +typedef int sub_t(int symbol, size_t state, int, void *env); +typedef int out_t(struct range r, size_t state, int, void *env); struct prep; struct prep *nfaprep(NFA, sub_t *, out_t *, void *); void nfareset(struct prep *); int nfarun(struct prep *, const char *, size_t); int nfacont(struct prep *, const char *, size_t); +int nfaend(struct prep *); int nfamatch(NFA, sub_t *, out_t *, void *, const char *, size_t); blob - 109812125ad9e36833c3be6cb874930d70a822b6 blob + 5a9bec5e272c9ba8094181042bf6d785a5e6da23 --- test_nfa.c +++ test_nfa.c @@ -92,21 +92,16 @@ struct mind { }; int -sub(int symbol, size_t state, const char *pos, void *env_) +sub(int symbol, size_t state, int i, void *env_) { struct mind *env = env_; -#if 0 /* diagnostic output */ - size_t i; assert(env->input != NULL); - assert(pos >= env->input); - i = pos - env->input; -#endif switch (symbol) { case BEGWORD: - //printf("%8s[%3zu] pos %zu\n", "BEGWORD", state, i); - env->begword = pos; + //printf("%8s[%3zu] pos %d\n", "BEGWORD", state, i); + env->begword = env->input + i; return 1; } @@ -114,29 +109,27 @@ sub(int symbol, size_t state, const char *pos, void *e } int -out_print(struct range r, size_t state, const char *pos, void *env_) +out_print(struct range r, size_t state, int i, void *env_) { - size_t i, l; + int l; struct mind *env = env_; assert(env->input != NULL); - assert(pos >= env->input); - i = pos - env->input; /* inward output - actions */ assert(r.n == 1); switch (r.base) { case ATSIGN: - env->atsign = pos; - printf("%8s[%3zu] pos %zu\n", "ATSIGN", state, i - 1); + env->atsign = env->input + i; + printf("%8s[%3zu] pos %d\n", "ATSIGN", state, i - 1); return r.base; case WORD: + env->word = env->input + i; assert(env->begword != NULL); - assert(env->begword < pos); - env->word = pos; + assert(env->begword < env->word); l = env->word - env->begword; - printf("%8s[%3zu] pos %zu\t%.*s\n", "WORD", state, i - l, - (int)l, env->begword); + printf("%8s[%3zu] pos %d\t%.*s\n", "WORD", state, i - l, + l, env->begword); return r.base; } @@ -144,23 +137,22 @@ out_print(struct range r, size_t state, const char *po } int -out_bench(struct range r, size_t state, const char *pos, void *env_) +out_bench(struct range r, size_t state, int i, void *env_) { struct mind *env = env_; assert(env->input != NULL); - assert(pos >= env->input); /* inward output - actions */ assert(r.n == 1); switch (r.base) { case ATSIGN: - env->atsign = pos; + env->atsign = env->input + i; return r.base; case WORD: + env->word = env->input + i; assert(env->begword != NULL); - assert(env->begword < pos); - env->word = pos; + assert(env->begword < env->word); return r.base; }