commit b02bcc1b1c826553fdcd0ffe6421d1d636f4665c from: Sven M. Hallberg date: Tue Feb 08 11:49:07 2022 UTC factor nfa.h and test_nfa.c out of nfa.c commit - eb08a8335cd7f30115fa269134f3ca646fe30a31 commit + b02bcc1b1c826553fdcd0ffe6421d1d636f4665c blob - 01ee473730fa2fdf382b3297129d499c70e7bf3e blob + 83fd94c11f6beaf00dc41c3d95c2886c3d8df0d3 --- Makefile +++ Makefile @@ -1,15 +1,24 @@ -all: nfa regex pcre hammer +TESTS = test_nfa test_regex test_pcre test_hammer +OBJS = nfa.o -benchmark: nfa regex pcre hammer - ./nfa -b - ./regex -b - ./pcre -b - ./hammer -b +.PHONY: all clean benchmark +all: ${OBJS} ${TESTS} +clean: + rm -f ${OBJS} + rm -f ${TESTS} +benchmark: ${TESTS} + ./test_nfa -b + ./test_regex -b + ./test_pcre -b + ./test_hammer -b -pcre: pcre.c +test_nfa: test_nfa.c nfa.o + $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $< nfa.o + +test_pcre: test_pcre.c $(CC) $(CFLAGS) -I/usr/local/include $(LDFLAGS) -o $@ $< \ -lpcreposix -lpcre -hammer: hammer.c +test_hammer: test_hammer.c $(CC) $(CFLAGS) -I/usr/local/include $(LDFLAGS) -o $@ $< \ -lhammer blob - aa437262e9ea4f42f9541969ea23348b3ab295ec (mode 644) blob + /dev/null --- hammer.c +++ /dev/null @@ -1,146 +0,0 @@ -#include -#include -#include -#include -#include /* clock_gettime */ -#include /* timespecsub */ - -#include - -int -match(const HParser *parser, const char *input, size_t sz) -{ - HParseResult *res; - - res = h_parse(parser, input, sz); - if (res == NULL) - return -1; - sz = res->bit_length / 8; - h_parse_result_free(res); - - return sz; -} - -int -benchmark(const HParser *parser) -{ - static const char *inputs[] = { - "test@example.com", - "hans.wurst+kaese@gmail.com", - "!#$%&@valid.email", - "...@invalid", - "total@falsch.", - "auch@.falsch", - "genauso.@falsch.ey", - "bis@hier.richtig ABER", - "mett@wurst", - "hallo@", - "@dingens", - "@", - "", - NULL }; - - static const long N = 5000; /* runs per input */ - - struct timespec t, t0, td; - const char **p; - int i, n; - size_t sz; - - /* warmup */ - sz = strlen(inputs[0]); - for (i = 0; i < N; i++) - match(parser, inputs[0], sz); - - for (p = inputs; *p != NULL; p++) { - printf("%30.30s ", *p); - fflush(stdout); - - sz = strlen(*p); - clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t0); - for (i = 0; i < N; i++) - n = match(parser, *p, sz); - clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t); - timespecsub(&t, &t0, &td); - - if (n == sz) - printf("match "); - else if (n < 0) - printf("no match "); - else - printf("partial "); - - printf("%9.0f ns\n", (td.tv_sec * 1e9 + td.tv_nsec) / N); - } - - return 0; -} - -#include - -#define OTHER "!#$%&'*+/=?^_`{|}~-" - -int -main(int argc, char **argv) -{ - int ch, x; - - /* character classes */ - HParser *lower = h_ch_range('a', 'z'); - HParser *upper = h_ch_range('Z', 'Z'); - HParser *digit = h_ch_range('0', '9'); - HParser *alpha = h_choice(lower, upper, NULL); - HParser *other = h_in(OTHER, sizeof OTHER - 1); - - /* email addresses. cf. rfc 5322, section 3.4 */ - HParser *atext = h_choice(alpha, digit, other, NULL); - HParser *word = h_many1(atext); - HParser *dword = h_sequence(h_ch('.'), word, NULL); - HParser *dotted = h_sequence(word, h_many(dword), NULL); - HParser *address = h_sequence(dotted, h_ch('@'), dotted, NULL); - - HParser *parser = address; - - x = h_compile(parser, PB_REGULAR, NULL); - if (x != 0) { - fprintf(stderr, "h_compile: %d, ", x); - switch (x) { - case -1: - fprintf(stderr, "parser not regular\n"); - break; - case -2: - fprintf(stderr, "invalid parameters\n"); - break; - default: - fprintf(stderr, "internal error\n"); - return 1; - } - } - - while ((ch = getopt(argc, argv, "b")) != -1) { - switch (ch) { - case 'b': - return benchmark(parser); - default: - printf("usage: hammer [-b] string ...\n"); - return 1; - } - } - - for (int i = 1; i < argc; i++) { - char *inp; - size_t sz; - int n; - - inp = argv[i]; - sz = strlen(inp); - n = match(parser, inp, sz); - if (n == -1) - printf("no "); - else if (n < sz) - printf("partial "); - printf("match: %.*s\n", n, inp); - } - - return 0; -} blob - b3dafac618072d85714de9d1caca101445ee6f14 blob + 2c83d11b339042c80f64d6821bbabca4da703f5a --- nfa.c +++ nfa.c @@ -1,49 +1,18 @@ /* simple NFA representation - * pesco 2021, isc license + * pesco 2021-2022, isc license */ -#include /* INT_MAX, INT_MIN */ +#include "nfa.h" +#include -#if CHAR_MAX == INT_MAX /* it could happen */ -#error need INT_MAX > CHAR_MAX -#endif -/* - * Note: INT_MAX > CHAR_MAX implies sizeof(int) > sizeof(char), and since - * sizeof(char) = 1, the number of bits in an int is n * CHAR_BIT with n > 1. - * In other words, int has at least CHAR_BIT bits more than char. CHAR_BIT is - * greater than 1 (actually, at least 8), so it follows that the number - * INT_MAX / 2 is still greater than CHAR_MAX (by at least 7 bits). - */ -#define END (INT_MAX / 2) /* special input symbol meaning end of input */ -#define OUTPUT (END + 1) /* start of output range */ -#define INNER INT_MIN /* start of "inner input" range */ +/* replaceable default allocator */ -struct range { - int base; - int n; /* interval length; negative = inverted */ -}; -struct state { - struct range chr; - unsigned int edge_chr; /* advance (positive offset) only */ - int edge_eps; /* can point back (negative offset) */ -}; -struct nfa { - struct state *state; - int size; /* allocated size, i.e. num. of states - 1 */ -}; -typedef struct nfa NFA; - NFA nfa_alloc(int); /* must not return on error */ - -/* replaceable default allocator */ - #ifndef NOALLOC -#include /* malloc */ #include /* SIZE_MAX */ -#include #include NFA @@ -241,7 +210,6 @@ seq_(NFA a, ...) return sq; } -#define seq(...) seq_(__VA_ARGS__, (NFA){NULL, -1}) NFA choice_(NFA a, ...) @@ -313,7 +281,6 @@ choice_(NFA a, ...) return sq; } -#define choice(...) choice_(__VA_ARGS__, (NFA){NULL, -1}) NFA opt(NFA a) @@ -374,8 +341,6 @@ star(NFA a) } -#include - #ifdef BITSET typedef unsigned long bunch_t; /* a bunch of bits */ #define BUNCH_BIT (sizeof(bunch_t) * CHAR_BIT) @@ -390,7 +355,7 @@ struct set { size_t sz; /* number of bunches */ }; -int +static int inset(const struct set *set, int x) { #ifdef BITSET @@ -400,7 +365,7 @@ inset(const struct set *set, int x) #endif } -void +static void setinsert(struct set *set, int x) { #ifdef BITSET @@ -410,7 +375,7 @@ setinsert(struct set *set, int x) #endif } -void +static void setremove(struct set *set, int x) { #ifdef BITSET @@ -426,7 +391,8 @@ setremove(struct set *set, int x) for (I = 0; I < (A).size; I++) if (inset(SET, I)) /* diagnostic output: print input symbol and state set */ -void +#include +static void printstates(const struct set *set, int x) { #if 0 @@ -445,9 +411,6 @@ printstates(const struct set *set, int x) } -typedef int sub_t(int symbol, size_t state, const char *pos, void *env); -typedef int out_t(struct range r, size_t state, const char *pos, void *env); - struct prep { NFA a; sub_t *sub; /* the "subconscious" mind of the automaton */ @@ -501,7 +464,7 @@ nfaprep(NFA a, sub_t *sub, out_t *out, void *env) * * returns 1 iff any states remain in the 'work' set, 0 otherwise. */ -int +static int epsilon_step(struct prep *pr, struct set *act, struct set *work, const char *p) { NFA a = pr->a; @@ -571,7 +534,7 @@ epsilon_step(struct prep *pr, struct set *act, struct return workleft; } -void +static void epsilon_closure(struct prep *pr, struct set *act, struct set *wrk, const char *p) { /* initial workset = initial active set */ @@ -582,7 +545,7 @@ epsilon_closure(struct prep *pr, struct set *act, stru /* NB: wrk is now empty */ } -void +static void markactive(NFA a, struct set *next, int x) { struct state s; @@ -602,7 +565,7 @@ markactive(NFA a, struct set *next, int x) setinsert(next, x); /* mark end of chain */ } -int +static int matchchr(NFA a, const struct set *act, struct set *next, int x) { size_t i; @@ -644,7 +607,7 @@ matchchr(NFA a, const struct set *act, struct set *nex * * returns the symbol produced or -1 if no output is possible. */ -int +static int generate(struct prep *pr, const struct set *act, const char *p) { struct range r; @@ -738,260 +701,4 @@ match(NFA a, sub_t *sub, out_t *out, void *env, const free(pr); return result; -} - - -#include -#include - -void -print_nfa(NFA a) -{ - struct state s; - int i; - - for(i = 0; i < a.size; i++) { - s = a.state[i]; - -#if 0 - /* a simple uniform display format for the raw state data */ - printf("%5d: [%2Xh, %2Xh) -> %d, e -> %d\n", i, - (unsigned)s.chr.base, (unsigned)s.chr.base + s.chr.n, - i + s.edge_chr, i + s.edge_eps); -#else - /* - * a big mess of conditionals to make the possible cases easy - * to read and distinguish for hunams - */ - printf("%5d: ", i); - if (s.chr.n == 0) { - /* edge_chr is an epsilon or inner input edge */ - if (s.chr.base != 0) - printf("", s.chr.base - INNER); - else if (s.edge_chr != 1) - printf("e"); /* epsilon */ - if (s.edge_chr != 1) - printf(" -> %d", i + s.edge_chr); - } else { - if (s.chr.n < 0) /* negative range */ - printf(")%2Xh, %2Xh(", - (unsigned)s.chr.base + s.chr.n, - (unsigned)s.chr.base - 1); - else if (s.chr.n == 1) { /* singleton */ - if (s.chr.base >= OUTPUT) { - printf(">output %d<", - s.chr.base - OUTPUT); - } else { - printf("%2Xh", (unsigned)s.chr.base); - if (isprint(s.chr.base)) - printf(" '%c'", s.chr.base); - else if (s.chr.base == END) - printf(" END"); - } - } else - printf("[%2Xh, %2Xh]", (unsigned)s.chr.base, - (unsigned)s.chr.base + s.chr.n - 1); - if (s.edge_chr != 1) - printf(" -> %d", i + s.edge_chr); - } - if (s.edge_eps != 0) { - if (s.chr.n != 0 || s.edge_chr != 1) - printf(", "); - printf("e -> %d", i + s.edge_eps); - } else if (s.chr.n != 0) { /* no epsilon edges */ - if (s.edge_chr == 0) /* no future */ - printf(" (fail!)"); - } - putchar('\n'); -#endif - } - - /* show the virtual final state for completeness */ - printf("%5d: (final)\n", i); } - -/* the parser's mind */ - -enum gedanken { - BEGWORD = INNER -}; -enum ausgaben { - WORD = OUTPUT, - ATSIGN -}; - -struct mind { - const char *input, *begword, *word, *atsign; -}; - -int -sub(int symbol, size_t state, const char *pos, void *env_) -{ - struct mind *env = env_; - -#if 0 /* diagnostic output */ - size_t i; - assert(env->input != NULL); - assert(pos >= env->input); - i = pos - env->input; -#endif - - switch (symbol) { - case BEGWORD: - //printf("%8s[%3zu] pos %zu\n", "BEGWORD", state, i); - env->begword = pos; - return 1; - } - - return 0; /* my mind is blank */ -} - -int -out(struct range r, size_t state, const char *pos, void *env_) -{ - size_t i, l; - struct mind *env = env_; - - assert(env->input != NULL); - assert(pos >= env->input); - i = pos - env->input; - - /* inward output - actions */ - assert(r.n == 1); - switch (r.base) { - case ATSIGN: - env->atsign = pos; - printf("%8s[%3zu] pos %zu\n", "ATSIGN", state, i - 1); - return r.base; - case WORD: - assert(env->begword != NULL); - assert(env->begword < pos); - env->word = pos; - l = env->word - env->begword; - printf("%8s[%3zu] pos %zu\t%.*s\n", "WORD", state, i - l, - (int)l, env->begword); - return r.base; - } - - return -1; /* what can i do? */ -} - -#include /* clock_gettime */ -#include /* timespecsub */ - -int -benchmark(NFA a) -{ - static const char *inputs[] = { - "test@example.com", - "hans.wurst+kaese@gmail.com", - "!#$%&@valid.email", - "...@invalid", - "total@falsch.", - "auch@.falsch", - "genauso.@falsch.ey", - "bis@hier.richtig ABER", - "mett@wurst", - "hallo@", - "@dingens", - "@", - "", - NULL }; - - static const long N = 10000; /* runs per input */ - - struct prep *pr; - struct mind env; - struct timespec t, t0, td; - const char **p; - int i, n; - size_t sz; - - env.input = inputs[0]; - pr = nfaprep(a, sub, out, &env); - - /* warmup */ - sz = strlen(inputs[0]); - for (i = 0; i < N; i++) - match_(pr, inputs[0], sz); - - for (p = inputs; *p != NULL; p++) { - printf("%30.30s ", *p); - fflush(stdout); - - sz = strlen(*p); - env.input = *p; - clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t0); - for (i = 0; i < N; i++) - n = match_(pr, *p, sz); - clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t); - timespecsub(&t, &t0, &td); - - if (n == sz) - printf("match "); - else if (n < 0) - printf("no match "); - else - printf("partial "); - - printf("%9.0f ns\n", (td.tv_sec * 1e9 + td.tv_nsec) / N); - } - - return 0; -} - -#include - -int -main(int argc, char **argv) -{ - /* character classes */ - NFA lower = range('a', 'z'); - NFA upper = range('A', 'Z'); - NFA digit = range('0', '9'); - NFA alpha = choice(lower, upper); - NFA other = choice(chr('!'), range('#', '\''), range('*', '-'), - chr('/'), chr('='), chr('?'), range('^', '`'), range('{', '~')); - /* = oneof("!#$%&'*+-/=?^_`{|}~") */ - - /* email addresses. cf. rfc 5322, section 3.4 */ - NFA atext = choice(alpha, digit, other); - NFA word = seq(think(BEGWORD), plus(atext), output(WORD)); - NFA atsign = seq(chr('@'), output(ATSIGN)); - NFA dotted = seq(word, star(seq(chr('.'), word))); - NFA address = seq(dotted, atsign, dotted); - - NFA nfa = address; - int ch; - - while ((ch = getopt(argc, argv, "b")) != -1) { - switch (ch) { - case 'b': - return benchmark(nfa); - default: - printf("usage: nfa [-b] string ...\n"); - return 1; - } - } - - print_nfa(nfa); - putchar('\n'); - for (int i = 1; i < argc; i++) { - char *inp; - size_t sz; - int n; - struct mind env; - - inp = argv[i]; - sz = strlen(inp); - env.input = inp; - n = match(nfa, sub, out, &env, inp, sz); - if (n == -1) - printf("no "); - else if (n < sz) - printf("partial "); - printf("match: %.*s\n", n, inp); - } - - return 0; -} blob - /dev/null blob + 4ab6613d29bc4385fdca19d300caef5972951859 (mode 644) --- /dev/null +++ nfa.h @@ -0,0 +1,73 @@ +/* simple NFA representation + * pesco 2021-2022, isc license + */ + +#include /* INT_MAX, INT_MIN */ +#include /* size_t */ + +#if CHAR_MAX == INT_MAX /* it could happen */ +#error need INT_MAX > CHAR_MAX +#endif + +/* + * Note: INT_MAX > CHAR_MAX implies sizeof(int) > sizeof(char), and since + * sizeof(char) = 1, the number of bits in an int is n * CHAR_BIT with n > 1. + * In other words, int has at least CHAR_BIT bits more than char. CHAR_BIT is + * greater than 1 (actually, at least 8), so it follows that the number + * INT_MAX / 2 is still greater than CHAR_MAX (by at least 7 bits). + */ +#define END (INT_MAX / 2) /* special input symbol meaning end of input */ +#define OUTPUT (END + 1) /* start of output range */ +#define INNER INT_MIN /* start of "inner input" range */ + +struct range { + int base; + int n; /* interval length; negative = inverted */ +}; +struct state { + struct range chr; + unsigned int edge_chr; /* advance (positive offset) only */ + int edge_eps; /* can point back (negative offset) */ +}; +struct nfa { + struct state *state; + int size; /* allocated size, i.e. num. of states - 1 */ +}; +typedef struct nfa NFA; + + +/* elementary NFAs */ +NFA fail(void); +NFA range(char, char); +NFA nrange(char, char); +NFA chr(char); +NFA any(void); +NFA symbol(int); /* input only */ +NFA output(int); +NFA end(void); +NFA nop(void); +NFA think(int); +NFA epsilon(void); + +/* combinators */ +NFA str(const char *); +NFA seq_(NFA, ...); /* -> macro seq() */ +NFA choice_(NFA, ...); /* -> macro choice() */ +NFA opt(NFA); +NFA rep(int, NFA); +NFA plus(NFA); +NFA star(NFA); +#define seq(...) seq_(__VA_ARGS__, (NFA){NULL, -1}) +#define choice(...) choice_(__VA_ARGS__, (NFA){NULL, -1}) + + +/* running NFAs */ + +typedef int sub_t(int symbol, size_t state, const char *pos, void *env); +typedef int out_t(struct range r, size_t state, const char *pos, void *env); + +struct prep; +struct prep *nfaprep(NFA, sub_t *, out_t *, void *); + +int match_(struct prep *, const char *, size_t); +int match(NFA, sub_t *, out_t *, void *, const char *, size_t); blob - be5e3d820ddc2fb2bda2a6ca14b23bc418d813cb (mode 644) blob + /dev/null --- pcre.c +++ /dev/null @@ -1,131 +0,0 @@ -#include -#include -#include -#include -#include /* clock_gettime */ -#include /* timespecsub */ - -#include - -int -match(const regex_t *re, const char *input, size_t sz) -{ - static char buf[128]; - regmatch_t pmatch; - int x; - - assert (input[sz] == '\0'); - x = regexec(re, input, 1, &pmatch, 0); - if (x == REG_NOMATCH) - return -1; - else if (x != 0) { - regerror(x, re, buf, sizeof buf); - errx(100, "regexec: %s", buf); - } - - return (int)(pmatch.rm_eo - pmatch.rm_so); -} - -int -benchmark(regex_t *re) -{ - static const char *inputs[] = { - "test@example.com", - "hans.wurst+kaese@gmail.com", - "!#$%&@valid.email", - "...@invalid", - "total@falsch.", - "auch@.falsch", - "genauso.@falsch.ey", - "bis@hier.richtig ABER", - "mett@wurst", - "hallo@", - "@dingens", - "@", - "", - NULL }; - - static const long N = 200000; /* runs per input */ - - struct timespec t, t0, td; - const char **p; - int i, n; - size_t sz; - - /* warmup */ - sz = strlen(inputs[0]); - for (i = 0; i < N; i++) - match(re, inputs[0], sz); - - for (p = inputs; *p != NULL; p++) { - printf("%30.30s ", *p); - fflush(stdout); - - sz = strlen(*p); - clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t0); - for (i = 0; i < N; i++) - n = match(re, *p, sz); - clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t); - timespecsub(&t, &t0, &td); - - if (n == sz) - printf("match "); - else if (n < 0) - printf("no match "); - else - printf("partial "); - - printf("%9.0f ns\n", (td.tv_sec * 1e9 + td.tv_nsec) / N); - } - - return 0; -} - -#include - -/* email addresses. cf. rfc 5322, section 3.4 */ -#define ATEXT "[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]" -#define WORD ATEXT "+" -#define DOTTED WORD "(\\." WORD ")*" -#define ADDRESS DOTTED "@" DOTTED - -int -main(int argc, char **argv) -{ - int ch, x; - char buf[128]; - regex_t re; - - if ((x = regcomp(&re, "^" ADDRESS, REG_EXTENDED)) != 0) { - regerror(x, &re, buf, sizeof buf); - fprintf(stderr, "regcomp: %s\n", buf); - return 1; - } - - while ((ch = getopt(argc, argv, "b")) != -1) { - switch (ch) { - case 'b': - return benchmark(&re); - default: - printf("usage: pcre [-b] string ...\n"); - return 1; - } - } - - for (int i = 1; i < argc; i++) { - char *inp; - size_t sz; - int n; - - inp = argv[i]; - sz = strlen(inp); - n = match(&re, inp, sz); - if (n == -1) - printf("no "); - else if (n < sz) - printf("partial "); - printf("match: %.*s\n", n, inp); - } - - return 0; -} blob - /dev/null blob + aa437262e9ea4f42f9541969ea23348b3ab295ec (mode 644) --- /dev/null +++ test_hammer.c @@ -0,0 +1,146 @@ +#include +#include +#include +#include +#include /* clock_gettime */ +#include /* timespecsub */ + +#include + +int +match(const HParser *parser, const char *input, size_t sz) +{ + HParseResult *res; + + res = h_parse(parser, input, sz); + if (res == NULL) + return -1; + sz = res->bit_length / 8; + h_parse_result_free(res); + + return sz; +} + +int +benchmark(const HParser *parser) +{ + static const char *inputs[] = { + "test@example.com", + "hans.wurst+kaese@gmail.com", + "!#$%&@valid.email", + "...@invalid", + "total@falsch.", + "auch@.falsch", + "genauso.@falsch.ey", + "bis@hier.richtig ABER", + "mett@wurst", + "hallo@", + "@dingens", + "@", + "", + NULL }; + + static const long N = 5000; /* runs per input */ + + struct timespec t, t0, td; + const char **p; + int i, n; + size_t sz; + + /* warmup */ + sz = strlen(inputs[0]); + for (i = 0; i < N; i++) + match(parser, inputs[0], sz); + + for (p = inputs; *p != NULL; p++) { + printf("%30.30s ", *p); + fflush(stdout); + + sz = strlen(*p); + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t0); + for (i = 0; i < N; i++) + n = match(parser, *p, sz); + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t); + timespecsub(&t, &t0, &td); + + if (n == sz) + printf("match "); + else if (n < 0) + printf("no match "); + else + printf("partial "); + + printf("%9.0f ns\n", (td.tv_sec * 1e9 + td.tv_nsec) / N); + } + + return 0; +} + +#include + +#define OTHER "!#$%&'*+/=?^_`{|}~-" + +int +main(int argc, char **argv) +{ + int ch, x; + + /* character classes */ + HParser *lower = h_ch_range('a', 'z'); + HParser *upper = h_ch_range('Z', 'Z'); + HParser *digit = h_ch_range('0', '9'); + HParser *alpha = h_choice(lower, upper, NULL); + HParser *other = h_in(OTHER, sizeof OTHER - 1); + + /* email addresses. cf. rfc 5322, section 3.4 */ + HParser *atext = h_choice(alpha, digit, other, NULL); + HParser *word = h_many1(atext); + HParser *dword = h_sequence(h_ch('.'), word, NULL); + HParser *dotted = h_sequence(word, h_many(dword), NULL); + HParser *address = h_sequence(dotted, h_ch('@'), dotted, NULL); + + HParser *parser = address; + + x = h_compile(parser, PB_REGULAR, NULL); + if (x != 0) { + fprintf(stderr, "h_compile: %d, ", x); + switch (x) { + case -1: + fprintf(stderr, "parser not regular\n"); + break; + case -2: + fprintf(stderr, "invalid parameters\n"); + break; + default: + fprintf(stderr, "internal error\n"); + return 1; + } + } + + while ((ch = getopt(argc, argv, "b")) != -1) { + switch (ch) { + case 'b': + return benchmark(parser); + default: + printf("usage: hammer [-b] string ...\n"); + return 1; + } + } + + for (int i = 1; i < argc; i++) { + char *inp; + size_t sz; + int n; + + inp = argv[i]; + sz = strlen(inp); + n = match(parser, inp, sz); + if (n == -1) + printf("no "); + else if (n < sz) + printf("partial "); + printf("match: %.*s\n", n, inp); + } + + return 0; +} blob - 33fb0717be9d812d3453c116a9a0785ba0558234 (mode 644) blob + /dev/null --- regex.c +++ /dev/null @@ -1,131 +0,0 @@ -#include -#include -#include -#include -#include /* clock_gettime */ -#include /* timespecsub */ - -#include - -int -match(const regex_t *re, const char *input, size_t sz) -{ - static char buf[128]; - regmatch_t pmatch; - int x; - - assert (input[sz] == '\0'); - x = regexec(re, input, 1, &pmatch, 0); - if (x == REG_NOMATCH) - return -1; - else if (x != 0) { - regerror(x, re, buf, sizeof buf); - errx(100, "regexec: %s", buf); - } - - return (int)(pmatch.rm_eo - pmatch.rm_so); -} - -int -benchmark(regex_t *re) -{ - static const char *inputs[] = { - "test@example.com", - "hans.wurst+kaese@gmail.com", - "!#$%&@valid.email", - "...@invalid", - "total@falsch.", - "auch@.falsch", - "genauso.@falsch.ey", - "bis@hier.richtig ABER", - "mett@wurst", - "hallo@", - "@dingens", - "@", - "", - NULL }; - - static const long N = 10000; /* runs per input */ - - struct timespec t, t0, td; - const char **p; - int i, n; - size_t sz; - - /* warmup */ - sz = strlen(inputs[0]); - for (i = 0; i < N; i++) - match(re, inputs[0], sz); - - for (p = inputs; *p != NULL; p++) { - printf("%30.30s ", *p); - fflush(stdout); - - sz = strlen(*p); - clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t0); - for (i = 0; i < N; i++) - n = match(re, *p, sz); - clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t); - timespecsub(&t, &t0, &td); - - if (n == sz) - printf("match "); - else if (n < 0) - printf("no match "); - else - printf("partial "); - - printf("%9.0f ns\n", (td.tv_sec * 1e9 + td.tv_nsec) / N); - } - - return 0; -} - -#include - -/* email addresses. cf. rfc 5322, section 3.4 */ -#define ATEXT "[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]" -#define WORD ATEXT "+" -#define DOTTED WORD "(\\." WORD ")*" -#define ADDRESS DOTTED "@" DOTTED - -int -main(int argc, char **argv) -{ - int ch, x; - char buf[128]; - regex_t re; - - if ((x = regcomp(&re, "^" ADDRESS, REG_EXTENDED)) != 0) { - regerror(x, &re, buf, sizeof buf); - fprintf(stderr, "regcomp: %s\n", buf); - return 1; - } - - while ((ch = getopt(argc, argv, "b")) != -1) { - switch (ch) { - case 'b': - return benchmark(&re); - default: - printf("usage: nfa [-b] string ...\n"); - return 1; - } - } - - for (int i = 1; i < argc; i++) { - char *inp; - size_t sz; - int n; - - inp = argv[i]; - sz = strlen(inp); - n = match(&re, inp, sz); - if (n == -1) - printf("no "); - else if (n < sz) - printf("partial "); - printf("match: %.*s\n", n, inp); - } - - return 0; -} blob - /dev/null blob + 6440ec0c1349aaf39fb4f74a3a0495b6cfc7cc32 (mode 644) --- /dev/null +++ test_nfa.c @@ -0,0 +1,266 @@ +/* test/example/benchmark for nfa.c + * pesco 2021-2022, isc license + */ + +#include "nfa.h" +#include +#include + + +#include +#include + +void +print_nfa(NFA a) +{ + struct state s; + int i; + + for(i = 0; i < a.size; i++) { + s = a.state[i]; + +#if 0 + /* a simple uniform display format for the raw state data */ + printf("%5d: [%2Xh, %2Xh) -> %d, e -> %d\n", i, + (unsigned)s.chr.base, (unsigned)s.chr.base + s.chr.n, + i + s.edge_chr, i + s.edge_eps); +#else + /* + * a big mess of conditionals to make the possible cases easy + * to read and distinguish for hunams + */ + printf("%5d: ", i); + if (s.chr.n == 0) { + /* edge_chr is an epsilon or inner input edge */ + if (s.chr.base != 0) + printf("", s.chr.base - INNER); + else if (s.edge_chr != 1) + printf("e"); /* epsilon */ + if (s.edge_chr != 1) + printf(" -> %d", i + s.edge_chr); + } else { + if (s.chr.n < 0) /* negative range */ + printf(")%2Xh, %2Xh(", + (unsigned)s.chr.base + s.chr.n, + (unsigned)s.chr.base - 1); + else if (s.chr.n == 1) { /* singleton */ + if (s.chr.base >= OUTPUT) { + printf(">output %d<", + s.chr.base - OUTPUT); + } else { + printf("%2Xh", (unsigned)s.chr.base); + if (isprint(s.chr.base)) + printf(" '%c'", s.chr.base); + else if (s.chr.base == END) + printf(" END"); + } + } else + printf("[%2Xh, %2Xh]", (unsigned)s.chr.base, + (unsigned)s.chr.base + s.chr.n - 1); + if (s.edge_chr != 1) + printf(" -> %d", i + s.edge_chr); + } + if (s.edge_eps != 0) { + if (s.chr.n != 0 || s.edge_chr != 1) + printf(", "); + printf("e -> %d", i + s.edge_eps); + } else if (s.chr.n != 0) { /* no epsilon edges */ + if (s.edge_chr == 0) /* no future */ + printf(" (fail!)"); + } + putchar('\n'); +#endif + } + + /* show the virtual final state for completeness */ + printf("%5d: (final)\n", i); +} + + +/* the parser's mind */ + +enum gedanken { + BEGWORD = INNER +}; +enum ausgaben { + WORD = OUTPUT, + ATSIGN +}; + +struct mind { + const char *input, *begword, *word, *atsign; +}; + +int +sub(int symbol, size_t state, const char *pos, void *env_) +{ + struct mind *env = env_; + +#if 0 /* diagnostic output */ + size_t i; + assert(env->input != NULL); + assert(pos >= env->input); + i = pos - env->input; +#endif + + switch (symbol) { + case BEGWORD: + //printf("%8s[%3zu] pos %zu\n", "BEGWORD", state, i); + env->begword = pos; + return 1; + } + + return 0; /* my mind is blank */ +} + +int +out(struct range r, size_t state, const char *pos, void *env_) +{ + size_t i, l; + struct mind *env = env_; + + assert(env->input != NULL); + assert(pos >= env->input); + i = pos - env->input; + + /* inward output - actions */ + assert(r.n == 1); + switch (r.base) { + case ATSIGN: + env->atsign = pos; + printf("%8s[%3zu] pos %zu\n", "ATSIGN", state, i - 1); + return r.base; + case WORD: + assert(env->begword != NULL); + assert(env->begword < pos); + env->word = pos; + l = env->word - env->begword; + printf("%8s[%3zu] pos %zu\t%.*s\n", "WORD", state, i - l, + (int)l, env->begword); + return r.base; + } + + return -1; /* what can i do? */ +} + + +#include /* clock_gettime */ +#include /* timespecsub */ + +int +benchmark(NFA a) +{ + static const char *inputs[] = { + "test@example.com", + "hans.wurst+kaese@gmail.com", + "!#$%&@valid.email", + "...@invalid", + "total@falsch.", + "auch@.falsch", + "genauso.@falsch.ey", + "bis@hier.richtig ABER", + "mett@wurst", + "hallo@", + "@dingens", + "@", + "", + NULL }; + + static const long N = 10000; /* runs per input */ + + struct prep *pr; + struct mind env; + struct timespec t, t0, td; + const char **p; + int i, n; + size_t sz; + + env.input = inputs[0]; + pr = nfaprep(a, sub, out, &env); + + /* warmup */ + sz = strlen(inputs[0]); + for (i = 0; i < N; i++) + match_(pr, inputs[0], sz); + + for (p = inputs; *p != NULL; p++) { + printf("%30.30s ", *p); + fflush(stdout); + + sz = strlen(*p); + env.input = *p; + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t0); + for (i = 0; i < N; i++) + n = match_(pr, *p, sz); + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t); + timespecsub(&t, &t0, &td); + + if (n == sz) + printf("match "); + else if (n < 0) + printf("no match "); + else + printf("partial "); + + printf("%9.0f ns\n", (td.tv_sec * 1e9 + td.tv_nsec) / N); + } + + return 0; +} + + +#include + +int +main(int argc, char **argv) +{ + /* character classes */ + NFA lower = range('a', 'z'); + NFA upper = range('A', 'Z'); + NFA digit = range('0', '9'); + NFA alpha = choice(lower, upper); + NFA other = choice(chr('!'), range('#', '\''), range('*', '-'), + chr('/'), chr('='), chr('?'), range('^', '`'), range('{', '~')); + /* = oneof("!#$%&'*+-/=?^_`{|}~") */ + + /* email addresses. cf. rfc 5322, section 3.4 */ + NFA atext = choice(alpha, digit, other); + NFA word = seq(think(BEGWORD), plus(atext), output(WORD)); + NFA atsign = seq(chr('@'), output(ATSIGN)); + NFA dotted = seq(word, star(seq(chr('.'), word))); + NFA address = seq(dotted, atsign, dotted); + + NFA nfa = address; + int ch; + + while ((ch = getopt(argc, argv, "b")) != -1) { + switch (ch) { + case 'b': + return benchmark(nfa); + default: + printf("usage: nfa [-b] string ...\n"); + return 1; + } + } + + print_nfa(nfa); + putchar('\n'); + for (int i = 1; i < argc; i++) { + char *inp; + size_t sz; + int n; + struct mind env; + + inp = argv[i]; + sz = strlen(inp); + env.input = inp; + n = match(nfa, sub, out, &env, inp, sz); + if (n == -1) + printf("no "); + else if (n < sz) + printf("partial "); + printf("match: %.*s\n", n, inp); + } + + return 0; +} blob - /dev/null blob + be5e3d820ddc2fb2bda2a6ca14b23bc418d813cb (mode 644) --- /dev/null +++ test_pcre.c @@ -0,0 +1,131 @@ +#include +#include +#include +#include +#include /* clock_gettime */ +#include /* timespecsub */ + +#include + +int +match(const regex_t *re, const char *input, size_t sz) +{ + static char buf[128]; + regmatch_t pmatch; + int x; + + assert (input[sz] == '\0'); + x = regexec(re, input, 1, &pmatch, 0); + if (x == REG_NOMATCH) + return -1; + else if (x != 0) { + regerror(x, re, buf, sizeof buf); + errx(100, "regexec: %s", buf); + } + + return (int)(pmatch.rm_eo - pmatch.rm_so); +} + +int +benchmark(regex_t *re) +{ + static const char *inputs[] = { + "test@example.com", + "hans.wurst+kaese@gmail.com", + "!#$%&@valid.email", + "...@invalid", + "total@falsch.", + "auch@.falsch", + "genauso.@falsch.ey", + "bis@hier.richtig ABER", + "mett@wurst", + "hallo@", + "@dingens", + "@", + "", + NULL }; + + static const long N = 200000; /* runs per input */ + + struct timespec t, t0, td; + const char **p; + int i, n; + size_t sz; + + /* warmup */ + sz = strlen(inputs[0]); + for (i = 0; i < N; i++) + match(re, inputs[0], sz); + + for (p = inputs; *p != NULL; p++) { + printf("%30.30s ", *p); + fflush(stdout); + + sz = strlen(*p); + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t0); + for (i = 0; i < N; i++) + n = match(re, *p, sz); + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t); + timespecsub(&t, &t0, &td); + + if (n == sz) + printf("match "); + else if (n < 0) + printf("no match "); + else + printf("partial "); + + printf("%9.0f ns\n", (td.tv_sec * 1e9 + td.tv_nsec) / N); + } + + return 0; +} + +#include + +/* email addresses. cf. rfc 5322, section 3.4 */ +#define ATEXT "[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]" +#define WORD ATEXT "+" +#define DOTTED WORD "(\\." WORD ")*" +#define ADDRESS DOTTED "@" DOTTED + +int +main(int argc, char **argv) +{ + int ch, x; + char buf[128]; + regex_t re; + + if ((x = regcomp(&re, "^" ADDRESS, REG_EXTENDED)) != 0) { + regerror(x, &re, buf, sizeof buf); + fprintf(stderr, "regcomp: %s\n", buf); + return 1; + } + + while ((ch = getopt(argc, argv, "b")) != -1) { + switch (ch) { + case 'b': + return benchmark(&re); + default: + printf("usage: pcre [-b] string ...\n"); + return 1; + } + } + + for (int i = 1; i < argc; i++) { + char *inp; + size_t sz; + int n; + + inp = argv[i]; + sz = strlen(inp); + n = match(&re, inp, sz); + if (n == -1) + printf("no "); + else if (n < sz) + printf("partial "); + printf("match: %.*s\n", n, inp); + } + + return 0; +} blob - /dev/null blob + 33fb0717be9d812d3453c116a9a0785ba0558234 (mode 644) --- /dev/null +++ test_regex.c @@ -0,0 +1,131 @@ +#include +#include +#include +#include +#include /* clock_gettime */ +#include /* timespecsub */ + +#include + +int +match(const regex_t *re, const char *input, size_t sz) +{ + static char buf[128]; + regmatch_t pmatch; + int x; + + assert (input[sz] == '\0'); + x = regexec(re, input, 1, &pmatch, 0); + if (x == REG_NOMATCH) + return -1; + else if (x != 0) { + regerror(x, re, buf, sizeof buf); + errx(100, "regexec: %s", buf); + } + + return (int)(pmatch.rm_eo - pmatch.rm_so); +} + +int +benchmark(regex_t *re) +{ + static const char *inputs[] = { + "test@example.com", + "hans.wurst+kaese@gmail.com", + "!#$%&@valid.email", + "...@invalid", + "total@falsch.", + "auch@.falsch", + "genauso.@falsch.ey", + "bis@hier.richtig ABER", + "mett@wurst", + "hallo@", + "@dingens", + "@", + "", + NULL }; + + static const long N = 10000; /* runs per input */ + + struct timespec t, t0, td; + const char **p; + int i, n; + size_t sz; + + /* warmup */ + sz = strlen(inputs[0]); + for (i = 0; i < N; i++) + match(re, inputs[0], sz); + + for (p = inputs; *p != NULL; p++) { + printf("%30.30s ", *p); + fflush(stdout); + + sz = strlen(*p); + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t0); + for (i = 0; i < N; i++) + n = match(re, *p, sz); + clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t); + timespecsub(&t, &t0, &td); + + if (n == sz) + printf("match "); + else if (n < 0) + printf("no match "); + else + printf("partial "); + + printf("%9.0f ns\n", (td.tv_sec * 1e9 + td.tv_nsec) / N); + } + + return 0; +} + +#include + +/* email addresses. cf. rfc 5322, section 3.4 */ +#define ATEXT "[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]" +#define WORD ATEXT "+" +#define DOTTED WORD "(\\." WORD ")*" +#define ADDRESS DOTTED "@" DOTTED + +int +main(int argc, char **argv) +{ + int ch, x; + char buf[128]; + regex_t re; + + if ((x = regcomp(&re, "^" ADDRESS, REG_EXTENDED)) != 0) { + regerror(x, &re, buf, sizeof buf); + fprintf(stderr, "regcomp: %s\n", buf); + return 1; + } + + while ((ch = getopt(argc, argv, "b")) != -1) { + switch (ch) { + case 'b': + return benchmark(&re); + default: + printf("usage: nfa [-b] string ...\n"); + return 1; + } + } + + for (int i = 1; i < argc; i++) { + char *inp; + size_t sz; + int n; + + inp = argv[i]; + sz = strlen(inp); + n = match(&re, inp, sz); + if (n == -1) + printf("no "); + else if (n < sz) + printf("partial "); + printf("match: %.*s\n", n, inp); + } + + return 0; +}