commit - eb08a8335cd7f30115fa269134f3ca646fe30a31
commit + b02bcc1b1c826553fdcd0ffe6421d1d636f4665c
blob - 01ee473730fa2fdf382b3297129d499c70e7bf3e
blob + 83fd94c11f6beaf00dc41c3d95c2886c3d8df0d3
--- Makefile
+++ Makefile
-all: nfa regex pcre hammer
+TESTS = test_nfa test_regex test_pcre test_hammer
+OBJS = nfa.o
-benchmark: nfa regex pcre hammer
- ./nfa -b
- ./regex -b
- ./pcre -b
- ./hammer -b
+.PHONY: all clean benchmark
+all: ${OBJS} ${TESTS}
+clean:
+ rm -f ${OBJS}
+ rm -f ${TESTS}
+benchmark: ${TESTS}
+ ./test_nfa -b
+ ./test_regex -b
+ ./test_pcre -b
+ ./test_hammer -b
-pcre: pcre.c
+test_nfa: test_nfa.c nfa.o
+ $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $< nfa.o
+
+test_pcre: test_pcre.c
$(CC) $(CFLAGS) -I/usr/local/include $(LDFLAGS) -o $@ $< \
-lpcreposix -lpcre
-hammer: hammer.c
+test_hammer: test_hammer.c
$(CC) $(CFLAGS) -I/usr/local/include $(LDFLAGS) -o $@ $< \
-lhammer
blob - aa437262e9ea4f42f9541969ea23348b3ab295ec (mode 644)
blob + /dev/null
--- hammer.c
+++ /dev/null
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <err.h>
-#include <time.h> /* clock_gettime */
-#include <sys/time.h> /* timespecsub */
-
-#include <hammer/hammer.h>
-
-int
-match(const HParser *parser, const char *input, size_t sz)
-{
- HParseResult *res;
-
- res = h_parse(parser, input, sz);
- if (res == NULL)
- return -1;
- sz = res->bit_length / 8;
- h_parse_result_free(res);
-
- return sz;
-}
-
-int
-benchmark(const HParser *parser)
-{
- static const char *inputs[] = {
- "test@example.com",
- "hans.wurst+kaese@gmail.com",
- "!#$%&@valid.email",
- "...@invalid",
- "total@falsch.",
- "auch@.falsch",
- "genauso.@falsch.ey",
- "bis@hier.richtig ABER",
- "mett@wurst",
- "hallo@",
- "@dingens",
- "@",
- "",
- NULL };
-
- static const long N = 5000; /* runs per input */
-
- struct timespec t, t0, td;
- const char **p;
- int i, n;
- size_t sz;
-
- /* warmup */
- sz = strlen(inputs[0]);
- for (i = 0; i < N; i++)
- match(parser, inputs[0], sz);
-
- for (p = inputs; *p != NULL; p++) {
- printf("%30.30s ", *p);
- fflush(stdout);
-
- sz = strlen(*p);
- clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t0);
- for (i = 0; i < N; i++)
- n = match(parser, *p, sz);
- clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t);
- timespecsub(&t, &t0, &td);
-
- if (n == sz)
- printf("match ");
- else if (n < 0)
- printf("no match ");
- else
- printf("partial ");
-
- printf("%9.0f ns\n", (td.tv_sec * 1e9 + td.tv_nsec) / N);
- }
-
- return 0;
-}
-
-#include <getopt.h>
-
-#define OTHER "!#$%&'*+/=?^_`{|}~-"
-
-int
-main(int argc, char **argv)
-{
- int ch, x;
-
- /* character classes */
- HParser *lower = h_ch_range('a', 'z');
- HParser *upper = h_ch_range('Z', 'Z');
- HParser *digit = h_ch_range('0', '9');
- HParser *alpha = h_choice(lower, upper, NULL);
- HParser *other = h_in(OTHER, sizeof OTHER - 1);
-
- /* email addresses. cf. rfc 5322, section 3.4 */
- HParser *atext = h_choice(alpha, digit, other, NULL);
- HParser *word = h_many1(atext);
- HParser *dword = h_sequence(h_ch('.'), word, NULL);
- HParser *dotted = h_sequence(word, h_many(dword), NULL);
- HParser *address = h_sequence(dotted, h_ch('@'), dotted, NULL);
-
- HParser *parser = address;
-
- x = h_compile(parser, PB_REGULAR, NULL);
- if (x != 0) {
- fprintf(stderr, "h_compile: %d, ", x);
- switch (x) {
- case -1:
- fprintf(stderr, "parser not regular\n");
- break;
- case -2:
- fprintf(stderr, "invalid parameters\n");
- break;
- default:
- fprintf(stderr, "internal error\n");
- return 1;
- }
- }
-
- while ((ch = getopt(argc, argv, "b")) != -1) {
- switch (ch) {
- case 'b':
- return benchmark(parser);
- default:
- printf("usage: hammer [-b] string ...\n");
- return 1;
- }
- }
-
- for (int i = 1; i < argc; i++) {
- char *inp;
- size_t sz;
- int n;
-
- inp = argv[i];
- sz = strlen(inp);
- n = match(parser, inp, sz);
- if (n == -1)
- printf("no ");
- else if (n < sz)
- printf("partial ");
- printf("match: %.*s\n", n, inp);
- }
-
- return 0;
-}
blob - b3dafac618072d85714de9d1caca101445ee6f14
blob + 2c83d11b339042c80f64d6821bbabca4da703f5a
--- nfa.c
+++ nfa.c
/* simple NFA representation
- * pesco 2021, isc license
+ * pesco 2021-2022, isc license
*/
-#include <limits.h> /* INT_MAX, INT_MIN */
+#include "nfa.h"
+#include <assert.h>
-#if CHAR_MAX == INT_MAX /* it could happen */
-#error need INT_MAX > CHAR_MAX
-#endif
-/*
- * Note: INT_MAX > CHAR_MAX implies sizeof(int) > sizeof(char), and since
- * sizeof(char) = 1, the number of bits in an int is n * CHAR_BIT with n > 1.
- * In other words, int has at least CHAR_BIT bits more than char. CHAR_BIT is
- * greater than 1 (actually, at least 8), so it follows that the number
- * INT_MAX / 2 is still greater than CHAR_MAX (by at least 7 bits).
- */
-#define END (INT_MAX / 2) /* special input symbol meaning end of input */
-#define OUTPUT (END + 1) /* start of output range */
-#define INNER INT_MIN /* start of "inner input" range */
+/* replaceable default allocator */
-struct range {
- int base;
- int n; /* interval length; negative = inverted */
-};
-struct state {
- struct range chr;
- unsigned int edge_chr; /* advance (positive offset) only */
- int edge_eps; /* can point back (negative offset) */
-};
-struct nfa {
- struct state *state;
- int size; /* allocated size, i.e. num. of states - 1 */
-};
-typedef struct nfa NFA;
-
NFA nfa_alloc(int); /* must not return on error */
-
-/* replaceable default allocator */
-
#ifndef NOALLOC
-#include <stdlib.h> /* malloc */
#include <stdint.h> /* SIZE_MAX */
-#include <assert.h>
#include <err.h>
NFA
return sq;
}
-#define seq(...) seq_(__VA_ARGS__, (NFA){NULL, -1})
NFA
choice_(NFA a, ...)
return sq;
}
-#define choice(...) choice_(__VA_ARGS__, (NFA){NULL, -1})
NFA
opt(NFA a)
}
-#include <stdio.h>
-
#ifdef BITSET
typedef unsigned long bunch_t; /* a bunch of bits */
#define BUNCH_BIT (sizeof(bunch_t) * CHAR_BIT)
size_t sz; /* number of bunches */
};
-int
+static int
inset(const struct set *set, int x)
{
#ifdef BITSET
#endif
}
-void
+static void
setinsert(struct set *set, int x)
{
#ifdef BITSET
#endif
}
-void
+static void
setremove(struct set *set, int x)
{
#ifdef BITSET
for (I = 0; I < (A).size; I++) if (inset(SET, I))
/* diagnostic output: print input symbol and state set */
-void
+#include <stdio.h>
+static void
printstates(const struct set *set, int x)
{
#if 0
}
-typedef int sub_t(int symbol, size_t state, const char *pos, void *env);
-typedef int out_t(struct range r, size_t state, const char *pos, void *env);
-
struct prep {
NFA a;
sub_t *sub; /* the "subconscious" mind of the automaton */
*
* returns 1 iff any states remain in the 'work' set, 0 otherwise.
*/
-int
+static int
epsilon_step(struct prep *pr, struct set *act, struct set *work, const char *p)
{
NFA a = pr->a;
return workleft;
}
-void
+static void
epsilon_closure(struct prep *pr, struct set *act, struct set *wrk, const char *p)
{
/* initial workset = initial active set */
/* NB: wrk is now empty */
}
-void
+static void
markactive(NFA a, struct set *next, int x)
{
struct state s;
setinsert(next, x); /* mark end of chain */
}
-int
+static int
matchchr(NFA a, const struct set *act, struct set *next, int x)
{
size_t i;
*
* returns the symbol produced or -1 if no output is possible.
*/
-int
+static int
generate(struct prep *pr, const struct set *act, const char *p)
{
struct range r;
free(pr);
return result;
-}
-
-
-#include <stdio.h>
-#include <ctype.h>
-
-void
-print_nfa(NFA a)
-{
- struct state s;
- int i;
-
- for(i = 0; i < a.size; i++) {
- s = a.state[i];
-
-#if 0
- /* a simple uniform display format for the raw state data */
- printf("%5d: [%2Xh, %2Xh) -> %d, e -> %d\n", i,
- (unsigned)s.chr.base, (unsigned)s.chr.base + s.chr.n,
- i + s.edge_chr, i + s.edge_eps);
-#else
- /*
- * a big mess of conditionals to make the possible cases easy
- * to read and distinguish for hunams
- */
- printf("%5d: ", i);
- if (s.chr.n == 0) {
- /* edge_chr is an epsilon or inner input edge */
- if (s.chr.base != 0)
- printf("<inner %d>", s.chr.base - INNER);
- else if (s.edge_chr != 1)
- printf("e"); /* epsilon */
- if (s.edge_chr != 1)
- printf(" -> %d", i + s.edge_chr);
- } else {
- if (s.chr.n < 0) /* negative range */
- printf(")%2Xh, %2Xh(",
- (unsigned)s.chr.base + s.chr.n,
- (unsigned)s.chr.base - 1);
- else if (s.chr.n == 1) { /* singleton */
- if (s.chr.base >= OUTPUT) {
- printf(">output %d<",
- s.chr.base - OUTPUT);
- } else {
- printf("%2Xh", (unsigned)s.chr.base);
- if (isprint(s.chr.base))
- printf(" '%c'", s.chr.base);
- else if (s.chr.base == END)
- printf(" END");
- }
- } else
- printf("[%2Xh, %2Xh]", (unsigned)s.chr.base,
- (unsigned)s.chr.base + s.chr.n - 1);
- if (s.edge_chr != 1)
- printf(" -> %d", i + s.edge_chr);
- }
- if (s.edge_eps != 0) {
- if (s.chr.n != 0 || s.edge_chr != 1)
- printf(", ");
- printf("e -> %d", i + s.edge_eps);
- } else if (s.chr.n != 0) { /* no epsilon edges */
- if (s.edge_chr == 0) /* no future */
- printf(" (fail!)");
- }
- putchar('\n');
-#endif
- }
-
- /* show the virtual final state for completeness */
- printf("%5d: (final)\n", i);
}
-
-/* the parser's mind */
-
-enum gedanken {
- BEGWORD = INNER
-};
-enum ausgaben {
- WORD = OUTPUT,
- ATSIGN
-};
-
-struct mind {
- const char *input, *begword, *word, *atsign;
-};
-
-int
-sub(int symbol, size_t state, const char *pos, void *env_)
-{
- struct mind *env = env_;
-
-#if 0 /* diagnostic output */
- size_t i;
- assert(env->input != NULL);
- assert(pos >= env->input);
- i = pos - env->input;
-#endif
-
- switch (symbol) {
- case BEGWORD:
- //printf("%8s[%3zu] pos %zu\n", "BEGWORD", state, i);
- env->begword = pos;
- return 1;
- }
-
- return 0; /* my mind is blank */
-}
-
-int
-out(struct range r, size_t state, const char *pos, void *env_)
-{
- size_t i, l;
- struct mind *env = env_;
-
- assert(env->input != NULL);
- assert(pos >= env->input);
- i = pos - env->input;
-
- /* inward output - actions */
- assert(r.n == 1);
- switch (r.base) {
- case ATSIGN:
- env->atsign = pos;
- printf("%8s[%3zu] pos %zu\n", "ATSIGN", state, i - 1);
- return r.base;
- case WORD:
- assert(env->begword != NULL);
- assert(env->begword < pos);
- env->word = pos;
- l = env->word - env->begword;
- printf("%8s[%3zu] pos %zu\t%.*s\n", "WORD", state, i - l,
- (int)l, env->begword);
- return r.base;
- }
-
- return -1; /* what can i do? */
-}
-
-#include <time.h> /* clock_gettime */
-#include <sys/time.h> /* timespecsub */
-
-int
-benchmark(NFA a)
-{
- static const char *inputs[] = {
- "test@example.com",
- "hans.wurst+kaese@gmail.com",
- "!#$%&@valid.email",
- "...@invalid",
- "total@falsch.",
- "auch@.falsch",
- "genauso.@falsch.ey",
- "bis@hier.richtig ABER",
- "mett@wurst",
- "hallo@",
- "@dingens",
- "@",
- "",
- NULL };
-
- static const long N = 10000; /* runs per input */
-
- struct prep *pr;
- struct mind env;
- struct timespec t, t0, td;
- const char **p;
- int i, n;
- size_t sz;
-
- env.input = inputs[0];
- pr = nfaprep(a, sub, out, &env);
-
- /* warmup */
- sz = strlen(inputs[0]);
- for (i = 0; i < N; i++)
- match_(pr, inputs[0], sz);
-
- for (p = inputs; *p != NULL; p++) {
- printf("%30.30s ", *p);
- fflush(stdout);
-
- sz = strlen(*p);
- env.input = *p;
- clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t0);
- for (i = 0; i < N; i++)
- n = match_(pr, *p, sz);
- clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t);
- timespecsub(&t, &t0, &td);
-
- if (n == sz)
- printf("match ");
- else if (n < 0)
- printf("no match ");
- else
- printf("partial ");
-
- printf("%9.0f ns\n", (td.tv_sec * 1e9 + td.tv_nsec) / N);
- }
-
- return 0;
-}
-
-#include <getopt.h>
-
-int
-main(int argc, char **argv)
-{
- /* character classes */
- NFA lower = range('a', 'z');
- NFA upper = range('A', 'Z');
- NFA digit = range('0', '9');
- NFA alpha = choice(lower, upper);
- NFA other = choice(chr('!'), range('#', '\''), range('*', '-'),
- chr('/'), chr('='), chr('?'), range('^', '`'), range('{', '~'));
- /* = oneof("!#$%&'*+-/=?^_`{|}~") */
-
- /* email addresses. cf. rfc 5322, section 3.4 */
- NFA atext = choice(alpha, digit, other);
- NFA word = seq(think(BEGWORD), plus(atext), output(WORD));
- NFA atsign = seq(chr('@'), output(ATSIGN));
- NFA dotted = seq(word, star(seq(chr('.'), word)));
- NFA address = seq(dotted, atsign, dotted);
-
- NFA nfa = address;
- int ch;
-
- while ((ch = getopt(argc, argv, "b")) != -1) {
- switch (ch) {
- case 'b':
- return benchmark(nfa);
- default:
- printf("usage: nfa [-b] string ...\n");
- return 1;
- }
- }
-
- print_nfa(nfa);
- putchar('\n');
- for (int i = 1; i < argc; i++) {
- char *inp;
- size_t sz;
- int n;
- struct mind env;
-
- inp = argv[i];
- sz = strlen(inp);
- env.input = inp;
- n = match(nfa, sub, out, &env, inp, sz);
- if (n == -1)
- printf("no ");
- else if (n < sz)
- printf("partial ");
- printf("match: %.*s\n", n, inp);
- }
-
- return 0;
-}
blob - /dev/null
blob + 4ab6613d29bc4385fdca19d300caef5972951859 (mode 644)
--- /dev/null
+++ nfa.h
+/* simple NFA representation
+ * pesco 2021-2022, isc license
+ */
+
+#include <limits.h> /* INT_MAX, INT_MIN */
+#include <stdlib.h> /* size_t */
+
+#if CHAR_MAX == INT_MAX /* it could happen */
+#error need INT_MAX > CHAR_MAX
+#endif
+
+/*
+ * Note: INT_MAX > CHAR_MAX implies sizeof(int) > sizeof(char), and since
+ * sizeof(char) = 1, the number of bits in an int is n * CHAR_BIT with n > 1.
+ * In other words, int has at least CHAR_BIT bits more than char. CHAR_BIT is
+ * greater than 1 (actually, at least 8), so it follows that the number
+ * INT_MAX / 2 is still greater than CHAR_MAX (by at least 7 bits).
+ */
+#define END (INT_MAX / 2) /* special input symbol meaning end of input */
+#define OUTPUT (END + 1) /* start of output range */
+#define INNER INT_MIN /* start of "inner input" range */
+
+struct range {
+ int base;
+ int n; /* interval length; negative = inverted */
+};
+struct state {
+ struct range chr;
+ unsigned int edge_chr; /* advance (positive offset) only */
+ int edge_eps; /* can point back (negative offset) */
+};
+struct nfa {
+ struct state *state;
+ int size; /* allocated size, i.e. num. of states - 1 */
+};
+typedef struct nfa NFA;
+
+
+/* elementary NFAs */
+NFA fail(void);
+NFA range(char, char);
+NFA nrange(char, char);
+NFA chr(char);
+NFA any(void);
+NFA symbol(int); /* input only */
+NFA output(int);
+NFA end(void);
+NFA nop(void);
+NFA think(int);
+NFA epsilon(void);
+
+/* combinators */
+NFA str(const char *);
+NFA seq_(NFA, ...); /* -> macro seq() */
+NFA choice_(NFA, ...); /* -> macro choice() */
+NFA opt(NFA);
+NFA rep(int, NFA);
+NFA plus(NFA);
+NFA star(NFA);
+#define seq(...) seq_(__VA_ARGS__, (NFA){NULL, -1})
+#define choice(...) choice_(__VA_ARGS__, (NFA){NULL, -1})
+
+
+/* running NFAs */
+
+typedef int sub_t(int symbol, size_t state, const char *pos, void *env);
+typedef int out_t(struct range r, size_t state, const char *pos, void *env);
+
+struct prep;
+struct prep *nfaprep(NFA, sub_t *, out_t *, void *);
+
+int match_(struct prep *, const char *, size_t);
+int match(NFA, sub_t *, out_t *, void *, const char *, size_t);
blob - be5e3d820ddc2fb2bda2a6ca14b23bc418d813cb (mode 644)
blob + /dev/null
--- pcre.c
+++ /dev/null
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <err.h>
-#include <time.h> /* clock_gettime */
-#include <sys/time.h> /* timespecsub */
-
-#include <pcreposix.h>
-
-int
-match(const regex_t *re, const char *input, size_t sz)
-{
- static char buf[128];
- regmatch_t pmatch;
- int x;
-
- assert (input[sz] == '\0');
- x = regexec(re, input, 1, &pmatch, 0);
- if (x == REG_NOMATCH)
- return -1;
- else if (x != 0) {
- regerror(x, re, buf, sizeof buf);
- errx(100, "regexec: %s", buf);
- }
-
- return (int)(pmatch.rm_eo - pmatch.rm_so);
-}
-
-int
-benchmark(regex_t *re)
-{
- static const char *inputs[] = {
- "test@example.com",
- "hans.wurst+kaese@gmail.com",
- "!#$%&@valid.email",
- "...@invalid",
- "total@falsch.",
- "auch@.falsch",
- "genauso.@falsch.ey",
- "bis@hier.richtig ABER",
- "mett@wurst",
- "hallo@",
- "@dingens",
- "@",
- "",
- NULL };
-
- static const long N = 200000; /* runs per input */
-
- struct timespec t, t0, td;
- const char **p;
- int i, n;
- size_t sz;
-
- /* warmup */
- sz = strlen(inputs[0]);
- for (i = 0; i < N; i++)
- match(re, inputs[0], sz);
-
- for (p = inputs; *p != NULL; p++) {
- printf("%30.30s ", *p);
- fflush(stdout);
-
- sz = strlen(*p);
- clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t0);
- for (i = 0; i < N; i++)
- n = match(re, *p, sz);
- clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t);
- timespecsub(&t, &t0, &td);
-
- if (n == sz)
- printf("match ");
- else if (n < 0)
- printf("no match ");
- else
- printf("partial ");
-
- printf("%9.0f ns\n", (td.tv_sec * 1e9 + td.tv_nsec) / N);
- }
-
- return 0;
-}
-
-#include <getopt.h>
-
-/* email addresses. cf. rfc 5322, section 3.4 */
-#define ATEXT "[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]"
-#define WORD ATEXT "+"
-#define DOTTED WORD "(\\." WORD ")*"
-#define ADDRESS DOTTED "@" DOTTED
-
-int
-main(int argc, char **argv)
-{
- int ch, x;
- char buf[128];
- regex_t re;
-
- if ((x = regcomp(&re, "^" ADDRESS, REG_EXTENDED)) != 0) {
- regerror(x, &re, buf, sizeof buf);
- fprintf(stderr, "regcomp: %s\n", buf);
- return 1;
- }
-
- while ((ch = getopt(argc, argv, "b")) != -1) {
- switch (ch) {
- case 'b':
- return benchmark(&re);
- default:
- printf("usage: pcre [-b] string ...\n");
- return 1;
- }
- }
-
- for (int i = 1; i < argc; i++) {
- char *inp;
- size_t sz;
- int n;
-
- inp = argv[i];
- sz = strlen(inp);
- n = match(&re, inp, sz);
- if (n == -1)
- printf("no ");
- else if (n < sz)
- printf("partial ");
- printf("match: %.*s\n", n, inp);
- }
-
- return 0;
-}
blob - /dev/null
blob + aa437262e9ea4f42f9541969ea23348b3ab295ec (mode 644)
--- /dev/null
+++ test_hammer.c
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <err.h>
+#include <time.h> /* clock_gettime */
+#include <sys/time.h> /* timespecsub */
+
+#include <hammer/hammer.h>
+
+int
+match(const HParser *parser, const char *input, size_t sz)
+{
+ HParseResult *res;
+
+ res = h_parse(parser, input, sz);
+ if (res == NULL)
+ return -1;
+ sz = res->bit_length / 8;
+ h_parse_result_free(res);
+
+ return sz;
+}
+
+int
+benchmark(const HParser *parser)
+{
+ static const char *inputs[] = {
+ "test@example.com",
+ "hans.wurst+kaese@gmail.com",
+ "!#$%&@valid.email",
+ "...@invalid",
+ "total@falsch.",
+ "auch@.falsch",
+ "genauso.@falsch.ey",
+ "bis@hier.richtig ABER",
+ "mett@wurst",
+ "hallo@",
+ "@dingens",
+ "@",
+ "",
+ NULL };
+
+ static const long N = 5000; /* runs per input */
+
+ struct timespec t, t0, td;
+ const char **p;
+ int i, n;
+ size_t sz;
+
+ /* warmup */
+ sz = strlen(inputs[0]);
+ for (i = 0; i < N; i++)
+ match(parser, inputs[0], sz);
+
+ for (p = inputs; *p != NULL; p++) {
+ printf("%30.30s ", *p);
+ fflush(stdout);
+
+ sz = strlen(*p);
+ clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t0);
+ for (i = 0; i < N; i++)
+ n = match(parser, *p, sz);
+ clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t);
+ timespecsub(&t, &t0, &td);
+
+ if (n == sz)
+ printf("match ");
+ else if (n < 0)
+ printf("no match ");
+ else
+ printf("partial ");
+
+ printf("%9.0f ns\n", (td.tv_sec * 1e9 + td.tv_nsec) / N);
+ }
+
+ return 0;
+}
+
+#include <getopt.h>
+
+#define OTHER "!#$%&'*+/=?^_`{|}~-"
+
+int
+main(int argc, char **argv)
+{
+ int ch, x;
+
+ /* character classes */
+ HParser *lower = h_ch_range('a', 'z');
+ HParser *upper = h_ch_range('Z', 'Z');
+ HParser *digit = h_ch_range('0', '9');
+ HParser *alpha = h_choice(lower, upper, NULL);
+ HParser *other = h_in(OTHER, sizeof OTHER - 1);
+
+ /* email addresses. cf. rfc 5322, section 3.4 */
+ HParser *atext = h_choice(alpha, digit, other, NULL);
+ HParser *word = h_many1(atext);
+ HParser *dword = h_sequence(h_ch('.'), word, NULL);
+ HParser *dotted = h_sequence(word, h_many(dword), NULL);
+ HParser *address = h_sequence(dotted, h_ch('@'), dotted, NULL);
+
+ HParser *parser = address;
+
+ x = h_compile(parser, PB_REGULAR, NULL);
+ if (x != 0) {
+ fprintf(stderr, "h_compile: %d, ", x);
+ switch (x) {
+ case -1:
+ fprintf(stderr, "parser not regular\n");
+ break;
+ case -2:
+ fprintf(stderr, "invalid parameters\n");
+ break;
+ default:
+ fprintf(stderr, "internal error\n");
+ return 1;
+ }
+ }
+
+ while ((ch = getopt(argc, argv, "b")) != -1) {
+ switch (ch) {
+ case 'b':
+ return benchmark(parser);
+ default:
+ printf("usage: hammer [-b] string ...\n");
+ return 1;
+ }
+ }
+
+ for (int i = 1; i < argc; i++) {
+ char *inp;
+ size_t sz;
+ int n;
+
+ inp = argv[i];
+ sz = strlen(inp);
+ n = match(parser, inp, sz);
+ if (n == -1)
+ printf("no ");
+ else if (n < sz)
+ printf("partial ");
+ printf("match: %.*s\n", n, inp);
+ }
+
+ return 0;
+}
blob - 33fb0717be9d812d3453c116a9a0785ba0558234 (mode 644)
blob + /dev/null
--- regex.c
+++ /dev/null
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <err.h>
-#include <time.h> /* clock_gettime */
-#include <sys/time.h> /* timespecsub */
-
-#include <regex.h>
-
-int
-match(const regex_t *re, const char *input, size_t sz)
-{
- static char buf[128];
- regmatch_t pmatch;
- int x;
-
- assert (input[sz] == '\0');
- x = regexec(re, input, 1, &pmatch, 0);
- if (x == REG_NOMATCH)
- return -1;
- else if (x != 0) {
- regerror(x, re, buf, sizeof buf);
- errx(100, "regexec: %s", buf);
- }
-
- return (int)(pmatch.rm_eo - pmatch.rm_so);
-}
-
-int
-benchmark(regex_t *re)
-{
- static const char *inputs[] = {
- "test@example.com",
- "hans.wurst+kaese@gmail.com",
- "!#$%&@valid.email",
- "...@invalid",
- "total@falsch.",
- "auch@.falsch",
- "genauso.@falsch.ey",
- "bis@hier.richtig ABER",
- "mett@wurst",
- "hallo@",
- "@dingens",
- "@",
- "",
- NULL };
-
- static const long N = 10000; /* runs per input */
-
- struct timespec t, t0, td;
- const char **p;
- int i, n;
- size_t sz;
-
- /* warmup */
- sz = strlen(inputs[0]);
- for (i = 0; i < N; i++)
- match(re, inputs[0], sz);
-
- for (p = inputs; *p != NULL; p++) {
- printf("%30.30s ", *p);
- fflush(stdout);
-
- sz = strlen(*p);
- clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t0);
- for (i = 0; i < N; i++)
- n = match(re, *p, sz);
- clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t);
- timespecsub(&t, &t0, &td);
-
- if (n == sz)
- printf("match ");
- else if (n < 0)
- printf("no match ");
- else
- printf("partial ");
-
- printf("%9.0f ns\n", (td.tv_sec * 1e9 + td.tv_nsec) / N);
- }
-
- return 0;
-}
-
-#include <getopt.h>
-
-/* email addresses. cf. rfc 5322, section 3.4 */
-#define ATEXT "[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]"
-#define WORD ATEXT "+"
-#define DOTTED WORD "(\\." WORD ")*"
-#define ADDRESS DOTTED "@" DOTTED
-
-int
-main(int argc, char **argv)
-{
- int ch, x;
- char buf[128];
- regex_t re;
-
- if ((x = regcomp(&re, "^" ADDRESS, REG_EXTENDED)) != 0) {
- regerror(x, &re, buf, sizeof buf);
- fprintf(stderr, "regcomp: %s\n", buf);
- return 1;
- }
-
- while ((ch = getopt(argc, argv, "b")) != -1) {
- switch (ch) {
- case 'b':
- return benchmark(&re);
- default:
- printf("usage: nfa [-b] string ...\n");
- return 1;
- }
- }
-
- for (int i = 1; i < argc; i++) {
- char *inp;
- size_t sz;
- int n;
-
- inp = argv[i];
- sz = strlen(inp);
- n = match(&re, inp, sz);
- if (n == -1)
- printf("no ");
- else if (n < sz)
- printf("partial ");
- printf("match: %.*s\n", n, inp);
- }
-
- return 0;
-}
blob - /dev/null
blob + 6440ec0c1349aaf39fb4f74a3a0495b6cfc7cc32 (mode 644)
--- /dev/null
+++ test_nfa.c
+/* test/example/benchmark for nfa.c
+ * pesco 2021-2022, isc license
+ */
+
+#include "nfa.h"
+#include <assert.h>
+#include <string.h>
+
+
+#include <stdio.h>
+#include <ctype.h>
+
+void
+print_nfa(NFA a)
+{
+ struct state s;
+ int i;
+
+ for(i = 0; i < a.size; i++) {
+ s = a.state[i];
+
+#if 0
+ /* a simple uniform display format for the raw state data */
+ printf("%5d: [%2Xh, %2Xh) -> %d, e -> %d\n", i,
+ (unsigned)s.chr.base, (unsigned)s.chr.base + s.chr.n,
+ i + s.edge_chr, i + s.edge_eps);
+#else
+ /*
+ * a big mess of conditionals to make the possible cases easy
+ * to read and distinguish for hunams
+ */
+ printf("%5d: ", i);
+ if (s.chr.n == 0) {
+ /* edge_chr is an epsilon or inner input edge */
+ if (s.chr.base != 0)
+ printf("<inner %d>", s.chr.base - INNER);
+ else if (s.edge_chr != 1)
+ printf("e"); /* epsilon */
+ if (s.edge_chr != 1)
+ printf(" -> %d", i + s.edge_chr);
+ } else {
+ if (s.chr.n < 0) /* negative range */
+ printf(")%2Xh, %2Xh(",
+ (unsigned)s.chr.base + s.chr.n,
+ (unsigned)s.chr.base - 1);
+ else if (s.chr.n == 1) { /* singleton */
+ if (s.chr.base >= OUTPUT) {
+ printf(">output %d<",
+ s.chr.base - OUTPUT);
+ } else {
+ printf("%2Xh", (unsigned)s.chr.base);
+ if (isprint(s.chr.base))
+ printf(" '%c'", s.chr.base);
+ else if (s.chr.base == END)
+ printf(" END");
+ }
+ } else
+ printf("[%2Xh, %2Xh]", (unsigned)s.chr.base,
+ (unsigned)s.chr.base + s.chr.n - 1);
+ if (s.edge_chr != 1)
+ printf(" -> %d", i + s.edge_chr);
+ }
+ if (s.edge_eps != 0) {
+ if (s.chr.n != 0 || s.edge_chr != 1)
+ printf(", ");
+ printf("e -> %d", i + s.edge_eps);
+ } else if (s.chr.n != 0) { /* no epsilon edges */
+ if (s.edge_chr == 0) /* no future */
+ printf(" (fail!)");
+ }
+ putchar('\n');
+#endif
+ }
+
+ /* show the virtual final state for completeness */
+ printf("%5d: (final)\n", i);
+}
+
+
+/* the parser's mind */
+
+enum gedanken {
+ BEGWORD = INNER
+};
+enum ausgaben {
+ WORD = OUTPUT,
+ ATSIGN
+};
+
+struct mind {
+ const char *input, *begword, *word, *atsign;
+};
+
+int
+sub(int symbol, size_t state, const char *pos, void *env_)
+{
+ struct mind *env = env_;
+
+#if 0 /* diagnostic output */
+ size_t i;
+ assert(env->input != NULL);
+ assert(pos >= env->input);
+ i = pos - env->input;
+#endif
+
+ switch (symbol) {
+ case BEGWORD:
+ //printf("%8s[%3zu] pos %zu\n", "BEGWORD", state, i);
+ env->begword = pos;
+ return 1;
+ }
+
+ return 0; /* my mind is blank */
+}
+
+int
+out(struct range r, size_t state, const char *pos, void *env_)
+{
+ size_t i, l;
+ struct mind *env = env_;
+
+ assert(env->input != NULL);
+ assert(pos >= env->input);
+ i = pos - env->input;
+
+ /* inward output - actions */
+ assert(r.n == 1);
+ switch (r.base) {
+ case ATSIGN:
+ env->atsign = pos;
+ printf("%8s[%3zu] pos %zu\n", "ATSIGN", state, i - 1);
+ return r.base;
+ case WORD:
+ assert(env->begword != NULL);
+ assert(env->begword < pos);
+ env->word = pos;
+ l = env->word - env->begword;
+ printf("%8s[%3zu] pos %zu\t%.*s\n", "WORD", state, i - l,
+ (int)l, env->begword);
+ return r.base;
+ }
+
+ return -1; /* what can i do? */
+}
+
+
+#include <time.h> /* clock_gettime */
+#include <sys/time.h> /* timespecsub */
+
+int
+benchmark(NFA a)
+{
+ static const char *inputs[] = {
+ "test@example.com",
+ "hans.wurst+kaese@gmail.com",
+ "!#$%&@valid.email",
+ "...@invalid",
+ "total@falsch.",
+ "auch@.falsch",
+ "genauso.@falsch.ey",
+ "bis@hier.richtig ABER",
+ "mett@wurst",
+ "hallo@",
+ "@dingens",
+ "@",
+ "",
+ NULL };
+
+ static const long N = 10000; /* runs per input */
+
+ struct prep *pr;
+ struct mind env;
+ struct timespec t, t0, td;
+ const char **p;
+ int i, n;
+ size_t sz;
+
+ env.input = inputs[0];
+ pr = nfaprep(a, sub, out, &env);
+
+ /* warmup */
+ sz = strlen(inputs[0]);
+ for (i = 0; i < N; i++)
+ match_(pr, inputs[0], sz);
+
+ for (p = inputs; *p != NULL; p++) {
+ printf("%30.30s ", *p);
+ fflush(stdout);
+
+ sz = strlen(*p);
+ env.input = *p;
+ clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t0);
+ for (i = 0; i < N; i++)
+ n = match_(pr, *p, sz);
+ clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t);
+ timespecsub(&t, &t0, &td);
+
+ if (n == sz)
+ printf("match ");
+ else if (n < 0)
+ printf("no match ");
+ else
+ printf("partial ");
+
+ printf("%9.0f ns\n", (td.tv_sec * 1e9 + td.tv_nsec) / N);
+ }
+
+ return 0;
+}
+
+
+#include <getopt.h>
+
+int
+main(int argc, char **argv)
+{
+ /* character classes */
+ NFA lower = range('a', 'z');
+ NFA upper = range('A', 'Z');
+ NFA digit = range('0', '9');
+ NFA alpha = choice(lower, upper);
+ NFA other = choice(chr('!'), range('#', '\''), range('*', '-'),
+ chr('/'), chr('='), chr('?'), range('^', '`'), range('{', '~'));
+ /* = oneof("!#$%&'*+-/=?^_`{|}~") */
+
+ /* email addresses. cf. rfc 5322, section 3.4 */
+ NFA atext = choice(alpha, digit, other);
+ NFA word = seq(think(BEGWORD), plus(atext), output(WORD));
+ NFA atsign = seq(chr('@'), output(ATSIGN));
+ NFA dotted = seq(word, star(seq(chr('.'), word)));
+ NFA address = seq(dotted, atsign, dotted);
+
+ NFA nfa = address;
+ int ch;
+
+ while ((ch = getopt(argc, argv, "b")) != -1) {
+ switch (ch) {
+ case 'b':
+ return benchmark(nfa);
+ default:
+ printf("usage: nfa [-b] string ...\n");
+ return 1;
+ }
+ }
+
+ print_nfa(nfa);
+ putchar('\n');
+ for (int i = 1; i < argc; i++) {
+ char *inp;
+ size_t sz;
+ int n;
+ struct mind env;
+
+ inp = argv[i];
+ sz = strlen(inp);
+ env.input = inp;
+ n = match(nfa, sub, out, &env, inp, sz);
+ if (n == -1)
+ printf("no ");
+ else if (n < sz)
+ printf("partial ");
+ printf("match: %.*s\n", n, inp);
+ }
+
+ return 0;
+}
blob - /dev/null
blob + be5e3d820ddc2fb2bda2a6ca14b23bc418d813cb (mode 644)
--- /dev/null
+++ test_pcre.c
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <err.h>
+#include <time.h> /* clock_gettime */
+#include <sys/time.h> /* timespecsub */
+
+#include <pcreposix.h>
+
+int
+match(const regex_t *re, const char *input, size_t sz)
+{
+ static char buf[128];
+ regmatch_t pmatch;
+ int x;
+
+ assert (input[sz] == '\0');
+ x = regexec(re, input, 1, &pmatch, 0);
+ if (x == REG_NOMATCH)
+ return -1;
+ else if (x != 0) {
+ regerror(x, re, buf, sizeof buf);
+ errx(100, "regexec: %s", buf);
+ }
+
+ return (int)(pmatch.rm_eo - pmatch.rm_so);
+}
+
+int
+benchmark(regex_t *re)
+{
+ static const char *inputs[] = {
+ "test@example.com",
+ "hans.wurst+kaese@gmail.com",
+ "!#$%&@valid.email",
+ "...@invalid",
+ "total@falsch.",
+ "auch@.falsch",
+ "genauso.@falsch.ey",
+ "bis@hier.richtig ABER",
+ "mett@wurst",
+ "hallo@",
+ "@dingens",
+ "@",
+ "",
+ NULL };
+
+ static const long N = 200000; /* runs per input */
+
+ struct timespec t, t0, td;
+ const char **p;
+ int i, n;
+ size_t sz;
+
+ /* warmup */
+ sz = strlen(inputs[0]);
+ for (i = 0; i < N; i++)
+ match(re, inputs[0], sz);
+
+ for (p = inputs; *p != NULL; p++) {
+ printf("%30.30s ", *p);
+ fflush(stdout);
+
+ sz = strlen(*p);
+ clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t0);
+ for (i = 0; i < N; i++)
+ n = match(re, *p, sz);
+ clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t);
+ timespecsub(&t, &t0, &td);
+
+ if (n == sz)
+ printf("match ");
+ else if (n < 0)
+ printf("no match ");
+ else
+ printf("partial ");
+
+ printf("%9.0f ns\n", (td.tv_sec * 1e9 + td.tv_nsec) / N);
+ }
+
+ return 0;
+}
+
+#include <getopt.h>
+
+/* email addresses. cf. rfc 5322, section 3.4 */
+#define ATEXT "[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]"
+#define WORD ATEXT "+"
+#define DOTTED WORD "(\\." WORD ")*"
+#define ADDRESS DOTTED "@" DOTTED
+
+int
+main(int argc, char **argv)
+{
+ int ch, x;
+ char buf[128];
+ regex_t re;
+
+ if ((x = regcomp(&re, "^" ADDRESS, REG_EXTENDED)) != 0) {
+ regerror(x, &re, buf, sizeof buf);
+ fprintf(stderr, "regcomp: %s\n", buf);
+ return 1;
+ }
+
+ while ((ch = getopt(argc, argv, "b")) != -1) {
+ switch (ch) {
+ case 'b':
+ return benchmark(&re);
+ default:
+ printf("usage: pcre [-b] string ...\n");
+ return 1;
+ }
+ }
+
+ for (int i = 1; i < argc; i++) {
+ char *inp;
+ size_t sz;
+ int n;
+
+ inp = argv[i];
+ sz = strlen(inp);
+ n = match(&re, inp, sz);
+ if (n == -1)
+ printf("no ");
+ else if (n < sz)
+ printf("partial ");
+ printf("match: %.*s\n", n, inp);
+ }
+
+ return 0;
+}
blob - /dev/null
blob + 33fb0717be9d812d3453c116a9a0785ba0558234 (mode 644)
--- /dev/null
+++ test_regex.c
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <err.h>
+#include <time.h> /* clock_gettime */
+#include <sys/time.h> /* timespecsub */
+
+#include <regex.h>
+
+int
+match(const regex_t *re, const char *input, size_t sz)
+{
+ static char buf[128];
+ regmatch_t pmatch;
+ int x;
+
+ assert (input[sz] == '\0');
+ x = regexec(re, input, 1, &pmatch, 0);
+ if (x == REG_NOMATCH)
+ return -1;
+ else if (x != 0) {
+ regerror(x, re, buf, sizeof buf);
+ errx(100, "regexec: %s", buf);
+ }
+
+ return (int)(pmatch.rm_eo - pmatch.rm_so);
+}
+
+int
+benchmark(regex_t *re)
+{
+ static const char *inputs[] = {
+ "test@example.com",
+ "hans.wurst+kaese@gmail.com",
+ "!#$%&@valid.email",
+ "...@invalid",
+ "total@falsch.",
+ "auch@.falsch",
+ "genauso.@falsch.ey",
+ "bis@hier.richtig ABER",
+ "mett@wurst",
+ "hallo@",
+ "@dingens",
+ "@",
+ "",
+ NULL };
+
+ static const long N = 10000; /* runs per input */
+
+ struct timespec t, t0, td;
+ const char **p;
+ int i, n;
+ size_t sz;
+
+ /* warmup */
+ sz = strlen(inputs[0]);
+ for (i = 0; i < N; i++)
+ match(re, inputs[0], sz);
+
+ for (p = inputs; *p != NULL; p++) {
+ printf("%30.30s ", *p);
+ fflush(stdout);
+
+ sz = strlen(*p);
+ clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t0);
+ for (i = 0; i < N; i++)
+ n = match(re, *p, sz);
+ clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t);
+ timespecsub(&t, &t0, &td);
+
+ if (n == sz)
+ printf("match ");
+ else if (n < 0)
+ printf("no match ");
+ else
+ printf("partial ");
+
+ printf("%9.0f ns\n", (td.tv_sec * 1e9 + td.tv_nsec) / N);
+ }
+
+ return 0;
+}
+
+#include <getopt.h>
+
+/* email addresses. cf. rfc 5322, section 3.4 */
+#define ATEXT "[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]"
+#define WORD ATEXT "+"
+#define DOTTED WORD "(\\." WORD ")*"
+#define ADDRESS DOTTED "@" DOTTED
+
+int
+main(int argc, char **argv)
+{
+ int ch, x;
+ char buf[128];
+ regex_t re;
+
+ if ((x = regcomp(&re, "^" ADDRESS, REG_EXTENDED)) != 0) {
+ regerror(x, &re, buf, sizeof buf);
+ fprintf(stderr, "regcomp: %s\n", buf);
+ return 1;
+ }
+
+ while ((ch = getopt(argc, argv, "b")) != -1) {
+ switch (ch) {
+ case 'b':
+ return benchmark(&re);
+ default:
+ printf("usage: nfa [-b] string ...\n");
+ return 1;
+ }
+ }
+
+ for (int i = 1; i < argc; i++) {
+ char *inp;
+ size_t sz;
+ int n;
+
+ inp = argv[i];
+ sz = strlen(inp);
+ n = match(&re, inp, sz);
+ if (n == -1)
+ printf("no ");
+ else if (n < sz)
+ printf("partial ");
+ printf("match: %.*s\n", n, inp);
+ }
+
+ return 0;
+}