KHJK gotweb

Commit Diff

Commit:: f31903c8acf3479533aa88556b5632ef54159fe3
From:: pompolic <pompolic@special-circumstanc.es>
Date:: Mon May 4 15:06:04 2020 UTC
Message:: Merge branch '2020-04-27_RELEASE' into 'master' Bring master up to date See merge request pesco/pdf!12
Actions:: Patch | Tree
commit - 1191d61e7323c479a891b8810b0d56996ab0075f
commit + f31903c8acf3479533aa88556b5632ef54159fe3
blob - d9374c2925bf1ddb2966d2d99dba2cbc29f8989c
blob + f39a06d3076d76b1399fdc6a8823d602bfbd097a
--- LICENSE
+++ LICENSE
@@ -1,4 +1,6 @@
 Copyright (c) 2019, 2020 Sven M. Hallberg <pesco@khjk.org>
+Copyright (c) 2020 pompolic <pompolic@special-circumstanc.es>
+Copyright (c) 2020 Paul Vines <paul.vines@baesystems.com>
 
 Permission to use, copy, modify, and distribute this software for any
 purpose with or without fee is hereby granted, provided that the above
blob - 9ca933445c2668987c6aa7bc34117e0f70f47fbd
blob + 6154e1d8363343e12153e52a27e9826581afdc3a
--- Makefile
+++ Makefile
@@ -1,4 +1,4 @@
-CFLAGS += -std=c99 -Wall -Werror
+CFLAGS += -std=c99 -Wall -Werror -DLOG
 
 # find our hammer build - adjust this to your needs
 # i have, for instance, symlinks:
@@ -8,6 +8,7 @@ HAMMER_INCLUDE = .
 HAMMER_LIB = ./lib
 CFLAGS += -I$(HAMMER_INCLUDE)
 LDFLAGS += -L$(HAMMER_LIB)
+SOURCES = pdf.c lzw-lib.c
 
 .PHONY: all test clean
 all: pdf
@@ -17,8 +18,8 @@ test: pdf
 	'for x in t/*.pdf; do ./pdf "$$x" >/dev/null && echo OK: "$$x"; done'
 	@true
 
-pdf: pdf.c
-	$(CC) -o $@ $(CFLAGS) $(LDFLAGS) $< -lhammer -lz
+pdf: $(SOURCES)
+	$(CC) -o $@ $(CFLAGS) $(LDFLAGS) $(SOURCES) -lhammer -lz
 
 clean:
 	rm -f pdf
blob - 0b2cc8912c3730af5961ac3d76eaf5497e51a5f1
blob + a2da3272b5216acdead625a852d6cd6fdf91a017
--- README
+++ README
@@ -5,15 +5,61 @@ Beginnings of a PDF parser in Hammer
 
    https://gitlab.special-circumstanc.es/pesco/hammer/tree/pdf
 
+   For detailed build instructions, see README.md in that repository.
+
  - Help the default Makefile find Hammer
 
-       $ ln -s ../hammer/src hammer
-       $ ln -s ../hammer/build/opt/src lib
+       $ ln -s ../hammer/src hammer         # needed for building pdf, include files
+       $ ln -s ../hammer/build/opt/src lib  # needed for running pdf, to locate libhammer.so
 
- - Build/Usage:
+ - Notes for 2020-04-27 release:
 
+    The release branch has been tested to build with the 2020-04-27_RELEASE` branch located at https://gitlab.special-circumstanc.es/pesco/hammer/tree/2020-04-27_RELEASE
+
+ - Build:
+
+       $ pushd ../hammer; scons; popd       # build Hammer
        $ make pdf
-       $ ./pdf test.pdf
 
+ - Usage:
+
+       $ export LD_LIBRARY_PATH=./lib       # see Troubleshooting section below to see if this is needed
+       $ ldd ./pdf | grep libhammer         # verify that libhammer.so was found
+       $ ./pdf <filename>
+
        # place some test files in the t/ directory...
        $ make test
+
+ - Troubleshooting:
+
+       libhammer.so not found:
+
+           If Hammer is not installed as a system library, ld may fail to locate libhammer.so. The quick fix for this is altering LD_LIBRARY_PATH before running pdf:
+
+           $ export LD_LIBRARY_PATH=./lib
+           $ make test
+
+           The second solution is executing "scons install" when building Hammer, which will install it in ld's usual search path:
+
+           $ pushd ../hammer; scons install; popd
+           # ... Update ldconfig cache if needed
+           $ make pdf
+           $ make test
+
+ - Evaluating test results:
+ 
+   For every file in the t/ directory, the pdf parser is executed. On successful parse, a message of the following form is displayed:
+
+   OK: t/<filename>
+
+   In case of a non-fatal parse error, error messages may be displayed, but presence of the "OK" indicates pdf exited successfully. On a failed test run, only parse error messages are displayed.
+
+ - Copyright:
+
+  - pesco 2019,2020
+  - pompolic 2020
+  - Paul Vines 2020
+  - David Bryant (modified lzw-ab code)
+
+  See LICENSE and lzw-ab-license.txt for full copyright and licensing notice.
+
blob - f363e541d5613ebce67d335fc00c4288801a1982
blob + d649711646645c3e6fb1a86dac043752e2e4f12a
--- pdf.c
+++ pdf.c
@@ -1,8 +1,11 @@
 /* beginnings of a PDF parser in hammer
  * pesco 2019,2020
+ * pompolic 2020
+ * Paul Vines 2020
  */
 
-#include <string.h>	/* strncmp(), memset() */
+#include <string.h>	/* strncmp(), memset(), memcpy() */
+#include <stdlib.h>	/* exit() */
 
 #include <hammer/hammer.h>
 #include <hammer/glue.h>
@@ -17,7 +20,14 @@
 #define IN(STR)		h_in((const uint8_t *)(STR), sizeof(STR) - 1)
 #define NOT_IN(STR)	h_not_in((const uint8_t *)(STR), sizeof(STR) - 1)
 
+#ifdef LOG
+#define VIOL(P,VIOL)	h_action(h_sequence(P, h_tell(), NULL), act_viol, VIOL)
+#else
+#define VIOL(P,VIOL)	P
+#endif
 
+
+
 /*
  * some helpers
  */
@@ -26,6 +36,7 @@ HParser *p_fail;
 HParser *p_epsilon;
 HParser *p_return_0;
 HParser *p_return_1;
+uint8_t strictness = 0;
 
 /* a combinator to parse a given character but return a different value */
 
@@ -62,6 +73,20 @@ p_return_uint__m(HAllocator *mm__, uint64_t x)
 	return h_action__m(mm__, p_epsilon, act_return_uint, (void *)x);
 }
 
+/* like h_sepBy but parses a fixed number of elements */
+HParser *
+p_sepBy_n__m(HAllocator *mm__, HParser *p, HParser *sep, size_t n)
+{
+	if (n == 0)
+		return p_epsilon;
+
+	HParser *sep_p = h_sequence__m(mm__, sep, p, NULL);
+	HParser *tail = h_repeat_n__m(mm__, sep_p, n - 1);
+	HParser *seq = h_sequence__m(mm__, p, tail, NULL);
+
+	return h_action__m(mm__, seq, h_act_flatten, NULL);
+}
+
 /* a helper to compare an HBytes to a string */
 bool
 bytes_eq(HBytes b, const char *s)
@@ -169,7 +194,10 @@ pp_parseresult(FILE *stream, const HParsedToken *tok, 
 {
 	HParseResult *res = H_CAST(HParseResult, tok);
 
-	h_pprint(stream, res->ast, indent, delta);
+	if (res == NULL)
+		fprintf(stream, "null");
+	else
+		h_pprint(stream, res->ast, indent, delta);
 }
 
 void
@@ -213,16 +241,290 @@ act_digit(const HParseResult *p, void *u)
 HParsedToken *
 act_hlower(const HParseResult *p, void *u)
 {
-	return H_MAKE_UINT(H_CAST_UINT(p->ast) - 'a');
+	return H_MAKE_UINT(10 + H_CAST_UINT(p->ast) - 'a');
 }
 
 HParsedToken *
 act_hupper(const HParseResult *p, void *u)
 {
-	return H_MAKE_UINT(H_CAST_UINT(p->ast) - 'A');
+	return H_MAKE_UINT(10 + H_CAST_UINT(p->ast) - 'A');
 }
 
+HParsedToken*
+act_hdigitpair(const HParseResult *p, void *u)
+{
+	uint8_t b = 0;
+	HCountedArray *seq = H_CAST_SEQ(p->ast);
+	size_t digits_processed = 0;
+	uint8_t digits[2];
+	for(size_t i = 0; i < seq->used; ++i)
+	{
+		switch(seq->elements[i]->token_type)
+		{
+			case TT_UINT:
+				digits[digits_processed] = H_CAST_UINT(seq->elements[i]);
+				digits_processed++;
+				break;
+			default:
+				break;
+		}
+		assert(digits_processed == 2);
+	}
+
+	b = (digits[0] << 4) + digits[1];
+	return H_MAKE_UINT(b);
+}
+
 HParsedToken *
+act_ahextruncated(const HParseResult *p, void *u)
+{
+	uint8_t b = 0;
+	HCountedArray *seq = H_CAST_SEQ(p->ast);
+
+	/* Assumption: At this point seq->elements[0] is a hex digit
+	 * and seq->elements[1] holds '>' (EOD)
+	 */
+	// XXX figure out how to compare to '>'
+	assert(seq->used == 2);
+	b = H_CAST_UINT(seq->elements[0]) << 4;
+	return H_MAKE_UINT(b);
+}
+
+HParsedToken *
+act_hs_end(const HParseResult *p, void *u)
+{
+	HParsedToken *res;
+	HCountedArray *seq = H_CAST_SEQ(p->ast);
+	assert(seq->used >= 1);
+
+	res = H_MAKE_UINT(H_CAST_UINT(seq->elements[0]));
+	return res;
+}
+
+HParsedToken *
+act_ahexstream(const HParseResult *p, void *u)
+{
+	uint8_t *result_bytes;
+	size_t required_bytes;
+	HCountedArray *seq = H_CAST_SEQ(p->ast);
+	HParsedToken *res;
+
+	/* Ignore the the last element, which is EOD */
+	required_bytes = (seq->used - 1);
+
+	result_bytes = h_arena_malloc(p->arena, sizeof(uint8_t) * required_bytes);
+
+	/* memcpy all but the last group's bytes into a single array */
+	for (size_t i = 0; i < seq->used-1; ++i)
+	{
+		assert(i < required_bytes);
+		result_bytes[i] = H_CAST_UINT(seq->elements[i]);
+	}
+
+
+	res = H_MAKE_BYTES(result_bytes, required_bytes);
+	return res;
+}
+
+
+HParsedToken *
+act_a85zero(const HParseResult *p, void *u)
+{
+	uint32_t b = 0;
+	return H_MAKE_UINT(b);
+}
+
+HParsedToken *
+act_a85digit(const HParseResult *p, void *u)
+{
+	uint8_t b = H_CAST_UINT(p->ast);
+	b -= '!';
+
+	/* At this point we have the base85 value of one byte out of 5 */
+	return H_MAKE_UINT(b);
+}
+
+HParsedToken *
+act_a85fivedigits(const HParseResult *p, void *u)
+{
+	uint64_t fourbytes = 0;
+	HCountedArray *seq = H_CAST_SEQ(p->ast);
+	HParsedToken **digits = h_seq_elements(p->ast);
+
+	/* 2^32-1, the max value the group can hold as per spec */
+	#define A85GRPMAX 4294967295
+
+	/* Only for groups that do not need to padded to 5 */
+	assert(seq->used == 5);
+	fourbytes = H_CAST_UINT(digits[0]);
+	fourbytes = fourbytes * 85 + H_CAST_UINT(digits[1]);
+	fourbytes = fourbytes * 85 + H_CAST_UINT(digits[2]);
+	fourbytes = fourbytes * 85 + H_CAST_UINT(digits[3]);
+	fourbytes = fourbytes * 85 + H_CAST_UINT(digits[4]);
+
+	assert(fourbytes <= A85GRPMAX);
+	return H_MAKE_UINT(fourbytes);
+}
+
+/* Checking the following condition in the spec:
+ * The value represented by a group of 5 characters is greater than 2^32 - 1.
+ */
+bool
+validate_a85fivedigits(HParseResult *p, void *u)
+{
+	/* "s8W-!" should be the highest accepted value */
+	return H_CAST_UINT(p->ast) <= A85GRPMAX;
+}
+
+HParsedToken *
+act_a85group(const HParseResult *p, void *u)
+{
+	uint8_t *bytes = h_arena_malloc(p->arena, sizeof(uint8_t) * 4);
+	uint32_t fourbytes = H_CAST_UINT(p->ast);
+
+	bytes[0] = (fourbytes & 0xFF000000) >> 24;
+	bytes[1] = (fourbytes & 0x00FF0000) >> 16;
+	bytes[2] = (fourbytes & 0x0000FF00) >> 8;
+	bytes[3] = (fourbytes & 0x000000FF);
+
+	HParsedToken *b = H_MAKE_BYTES(bytes, 4);
+	return b;
+}
+
+HParsedToken *
+act_a85partial2group(const HParseResult *p, void *u)
+{
+	uint64_t fourbytes = 0;
+	HCountedArray *seq = H_CAST_SEQ(p->ast);
+	HParsedToken **digits = h_seq_elements(p->ast);
+
+	assert(seq->used == 2);
+	fourbytes = H_CAST_UINT(digits[0]);
+	fourbytes = fourbytes * 85 + H_CAST_UINT(digits[1]);
+	fourbytes *= 85 * 85 * 85;
+
+	assert(fourbytes <= A85GRPMAX);
+	return H_MAKE_UINT(fourbytes);
+}
+
+bool
+validate_a85partial2group(HParseResult *p, void *u)
+{
+	return H_CAST_UINT(p->ast) <= A85GRPMAX;
+}
+
+HParsedToken *
+act_a85partial3group(const HParseResult *p, void *u)
+{
+	uint64_t fourbytes = 0;
+	HCountedArray *seq = H_CAST_SEQ(p->ast);
+	HParsedToken **digits = h_seq_elements(p->ast);
+
+	assert(seq->used == 3);
+	fourbytes = H_CAST_UINT(digits[0]);
+	fourbytes = fourbytes * 85 + H_CAST_UINT(digits[1]);
+	fourbytes = fourbytes * 85 + H_CAST_UINT(digits[2]);
+	fourbytes *= 85 * 85;
+
+	assert(fourbytes <= A85GRPMAX);
+	return H_MAKE_UINT(fourbytes);
+}
+
+bool
+validate_a85partial3group(HParseResult *p, void *u)
+{
+	return H_CAST_UINT(p->ast) <= A85GRPMAX;
+}
+
+HParsedToken *
+act_a85partial4group(const HParseResult *p, void *u)
+{
+	uint64_t fourbytes = 0;
+	HCountedArray *seq = H_CAST_SEQ(p->ast);
+	HParsedToken **digits = h_seq_elements(p->ast);
+
+	assert(seq->used == 4);
+	fourbytes = H_CAST_UINT(digits[0]);
+	fourbytes = fourbytes * 85 + H_CAST_UINT(digits[1]);
+	fourbytes = fourbytes * 85 + H_CAST_UINT(digits[2]);
+	fourbytes = fourbytes * 85 + H_CAST_UINT(digits[3]);
+	fourbytes *= 85;
+
+	assert(fourbytes <= A85GRPMAX);
+	return H_MAKE_UINT(fourbytes);
+}
+
+bool
+validate_a85partial4group(HParseResult *p, void *u)
+{
+	return H_CAST_UINT(p->ast) <= A85GRPMAX;
+}
+
+HParsedToken *
+act_a85partialgroup(const HParseResult *p, void *u)
+{
+	uint8_t bytes_helper[4];
+	size_t bytes_used = 0;
+	uint8_t *bytes;
+
+	uint32_t fourbytes = H_CAST_UINT(p->ast);
+
+	/* Scan the uint backwards to find the first non-zero byte */
+	for (size_t i = 3; i > 0; --i)
+	{
+		/* Shift by 0, 8, 16 and 24 to get the correct byte */
+		bytes_helper[i] = (fourbytes >> ((3-i) * 8)) & 0xFF;
+		/* If we haven't set bytes_used yet, and the particular byte is nonzero */
+		if (!bytes_used && bytes_helper[i])
+			bytes_used = i;
+	}
+	assert(bytes_used > 0);
+
+	bytes = h_arena_malloc(p->arena, bytes_used);
+	return H_MAKE_BYTES(bytes, bytes_used);
+}
+
+HParsedToken *
+act_a85string(const HParseResult *p, void *u)
+{
+	uint8_t *result_bytes;
+	size_t chunk_number;
+	size_t required_bytes;
+	size_t out_pos = 0;
+	HCountedArray *seq = H_CAST_SEQ(p->ast);
+	HParsedToken *res;
+
+	/* Number of 4-byte chunks, minus the potential last partial group and EOD */
+	chunk_number = seq->used - 2;
+
+	/* Special-case: last chunk before EOD may be 4, 3, 2 or 1 bytes
+	 * The latter two happening if the group was parsed from a partial
+	 * group consisting less than 5 chars */
+	HBytes *last_chunk = &H_CAST_BYTES(seq->elements[seq->used-1]);
+	required_bytes = (chunk_number * 4 + last_chunk->len);
+
+	result_bytes = h_arena_malloc(p->arena, sizeof(uint8_t) * required_bytes);
+
+	/* memcpy all but the last group's bytes into a single array */
+	for (size_t i = 0; i < seq->used-1; ++i)
+	{
+		HBytes *chunk = &H_CAST_BYTES(seq->elements[i]);
+		assert(out_pos < required_bytes);
+		memcpy(&(result_bytes[out_pos]), chunk->token, 4);
+		out_pos += 4;
+		assert(out_pos < required_bytes);
+	}
+
+	memcpy(&(result_bytes[out_pos]), last_chunk->token, last_chunk->len);
+	out_pos += last_chunk->len;
+	/* We should have filled the array exactly by this point */
+	assert(out_pos == required_bytes-1);
+
+	res = H_MAKE_BYTES(result_bytes, required_bytes);
+	return res;
+}
+
+HParsedToken *
 act_nat(const HParseResult *p, void *u)
 {
 	uint64_t x = 0;
@@ -237,6 +539,34 @@ act_nat(const HParseResult *p, void *u)
 #define act_xroff act_nat
 #define act_xrgen act_nat
 
+HParser *p_violsev;
+HParsedToken *
+act_viol(const HParseResult *p, void *viol)
+{
+	uint32_t severity;
+	uint32_t offset;
+	HParseResult *severity_parse;
+	viol = (uint8_t *) viol;
+	severity_parse = h_parse(p_violsev, viol, strlen((char *)viol));
+	if (!severity_parse) {
+	        fprintf(stderr, "Severity for violaiton %s not assigned!\n", (char *)viol);
+	        severity = 99999;
+	}
+	else {
+	        severity = severity_parse->ast->seq->elements[0]->uint;
+	}
+	offset = p->ast->seq->elements[p->ast->seq->used-1]->uint / 8;
+	fprintf(stderr, "VIOLATION[%d]@%d (0x%x): %s\n", severity, offset, offset, (char *) viol);
+	if (strictness && severity > strictness) {
+	        exit(1);
+	}
+	/* Just return the parse AST, drop the h_tell */
+	if (p->ast->seq->used == 1) {
+		return (HParsedToken *) NULL;
+	}
+	return (HParsedToken *) p->ast->seq->elements[0];
+}
+
 bool
 validate_pnat(HParseResult *p, void *u)
 {
@@ -327,7 +657,7 @@ act_nesc(const HParseResult *p, void *u)
 	return H_MAKE_UINT(H_FIELD_UINT(1)*16 + H_FIELD_UINT(2));
 }
 
-#define act_str_ h_act_flatten
+#define act_schars h_act_flatten
 #define act_string act_token
 
 HParsedToken *
@@ -341,9 +671,6 @@ act_octal(const HParseResult *p, void *u)
 
 	return H_MAKE_UINT(x);
 }
-#define act_oct3 act_octal
-#define act_oct2 act_octal
-#define act_oct1 act_octal
 
 HParsedToken *
 act_xrent(const HParseResult *p, void *u)
@@ -458,7 +785,61 @@ act_dict_(const HParseResult *p, void *env)
 
 #define act_array_ h_act_flatten
 
+HParsedToken *
+act_shortlength(const HParseResult *p, void *u)
+{
+	uint8_t length = H_CAST_UINT(p->ast);
+	/* Length can range from 0-127, corresponding to the range 1-128, inclusive */
+	uint8_t finallength = length+1;
 
+	return H_MAKE_UINT(finallength);
+}
+
+HParsedToken *
+act_longlength(const HParseResult *p, void *u)
+{
+	uint8_t length = H_CAST_UINT(p->ast);
+	uint8_t finallength = 257-length;
+
+	return H_MAKE_UINT(finallength);
+}
+
+HParsedToken *
+act_longrun(const HParseResult *p, void *u)
+{
+	HParsedToken **elements = h_seq_elements(p->ast);
+	HParsedToken *res = H_MAKE_SEQ();
+
+	uint8_t length = H_CAST_UINT(elements[0]);
+	uint8_t data = H_CAST_UINT(elements[1]);
+
+	for (size_t len = 0; len < length; ++len)
+	{
+		h_seq_snoc(res, H_MAKE_UINT(data));
+	}
+
+	return res;
+}
+
+HParsedToken *
+act_rldstring(const HParseResult *p, void *u)
+{
+	const HParsedToken *flattened = h_seq_flatten(p->arena, p->ast);
+	HCountedArray *flattened_seq = H_CAST_SEQ(flattened);
+	uint8_t bytes_required;
+	uint8_t *result_bytes;
+
+	bytes_required = flattened_seq->used - 1;
+	result_bytes = h_arena_malloc(p->arena, sizeof(uint8_t) * bytes_required);
+
+	for (size_t i = 0; i < flattened_seq->used-1; ++i)
+	{
+		result_bytes[i] = H_CAST_UINT(flattened_seq->elements[i]);
+	}
+
+	return H_MAKE_BYTES(result_bytes, bytes_required);
+}
+
 /*
  * input grammar
  */
@@ -468,12 +849,37 @@ HParser *p_pdfdbg;
 HParser *p_startxref;
 HParser *p_xref;
 HParser *p_objdef;
+HParser *p_a85string;
+HParser *p_ahexstream;
+HParser *p_rldstring;
+HParser *p_ws;
+HParser *p_wel;
+HParser *p_elemr;
+HParser *p_npair;
 
 /* continuations for h_bind() */
 HParser *kstream(HAllocator *, const HParsedToken *, void *);
 HParser *kxstream(HAllocator *, const HParsedToken *, void *);
 
 void
+init_runlengthdecode_parser(struct Env *aux)
+{
+	H_RULE(rldeod, h_ch(0x80));
+	H_ARULE(longlength, h_ch_range(0x81, 0xFF));
+	H_ARULE(shortlength, h_ch_range(0x0, 0x7F));
+
+	H_RULE(shortdata, h_uint8());
+	H_RULE(longdata, h_uint8());
+
+	H_RULE(shortrun, h_length_value(shortlength, shortdata));
+	H_ARULE(longrun, SEQ(longlength, longdata));
+
+	H_ARULE(rldstring, SEQ(h_many(CHX(shortrun, longrun)), IGN(rldeod)));
+
+	p_rldstring = rldstring;
+}
+
+void
 init_parser(struct Env *aux)
 {
 	TT_HParseResult = h_allocate_token_new("HParseResult", NULL, pp_parseresult);
@@ -497,6 +903,7 @@ init_parser(struct Env *aux)
 	//H_RULE(dchar,	IN(DCHARS));			/* delimiter */
 	H_RULE(rchar,	NOT_IN(WCHARS DCHARS));		/* regular */
 	H_RULE(nchar,	NOT_IN(WCHARS DCHARS "#"));	/* name */
+	H_RULE(schar,	NOT_IN("()\n\r\\"));		/* string literal */
 	H_ARULE(digit,	h_ch_range('0', '9'));
 	H_ARULE(pdigit,	h_ch_range('1', '9'));
 	H_ARULE(hlower,	h_ch_range('a', 'f'));
@@ -551,16 +958,10 @@ init_parser(struct Env *aux)
 	/* numbers */
 	H_ARULE(sign,	CHX(minus, IGN(plus)));
 	H_VRULE(intnn,	nat);
-	#if 1
 	H_ARULE(realnn,	CHX(SEQ(digits, period, digits),	/* 12.3 */
 			    SEQ(digits, period, empty),		/* 123. */
 			    SEQ(empty, period, digits)));	/* .123 */
 		// XXX ^ we _could_ move the "123." case into intnn...
-	#else
-	// XXX the .123 case above somehow leads to a conflict with litstr...
-	H_ARULE(realnn,	CHX(SEQ(digits, period, digits),	/* 12.3 */
-			    SEQ(digits, period, empty)));	/* 123. */
-	#endif
 	H_RULE(numbnn,	CHX(realnn, intnn));
 	H_RULE(snumb,	SEQ(sign, numbnn));
 	H_ARULE(numb,	CHX(snumb, numbnn));
@@ -570,59 +971,24 @@ init_parser(struct Env *aux)
 	H_ARULE(nstr,	h_many(CHX(nchar, nesc)));	/* '/' is valid */
 	H_RULE(name,	h_right(slash, nstr));
 
-	/* strings
-	 *
-	 * this is so convoluted in order to make it LALR including the
-	 * precedence rules for octal escapes ("\123" vs "\12 3" vs "\1 23")
-	 * and end-of-line ("CRLF" vs "CR LF").
-	 *
-	 * we have to split the base rule 'str' into variants 'str_o' and
-	 * 'str_l' depending on whether they may start with an octal digit or
-	 * linefeed, respectively.
-	 */
-	H_RULE(str_ol,	h_indirect());
-	H_RULE(str_o,	h_indirect());
-	H_RULE(str_l,	h_indirect());
-	H_RULE(str,	h_indirect());
+	/* strings */
+	H_RULE(snest,	h_indirect());
 	H_RULE(bsn,	p_mapch('n', 0x0a));	/* LF */
 	H_RULE(bsr,	p_mapch('r', 0x0d));	/* CR */
 	H_RULE(bst,	p_mapch('t', 0x09));	/* HT */
 	H_RULE(bsb,	p_mapch('b', 0x08));	/* BS (backspace) */
 	H_RULE(bsf,	p_mapch('f', 0x0c));	/* FF */
 	H_RULE(escape,	CHX(bsn, bsr, bst, bsb, bsf, lparen, rparen, bslash));
-	H_ARULE(oct3,	REP(odigit,3));
-	H_ARULE(oct2,	REP(odigit,2));
-	H_ARULE(oct1,	REP(odigit,1));
-	H_RULE(octesc,	CHX(SEQ(oct3, str),
-			    SEQ(oct2, str_o),
-			    SEQ(oct1, str_o)));
-	H_RULE(eolesc,	CHX(SEQ(IGN(crlf), str),
-			    SEQ(IGN(cr), str_l),
-			    SEQ(IGN(lf), str)));
-	H_RULE(schar_o,	NOT_IN("()\n\r\\" "01234567"));
-	H_RULE(schar_e,	NOT_IN("()\n\r\\" "01234567" "nrtbf"));
-	H_RULE(str_o_,	CHX(SEQ(lf, str), str_ol));	/* str "but not" odigit */
-	H_RULE(str_l_,	CHX(SEQ(odigit, str), str_ol));	/* str "but not" lf */
-	H_RULE(str_ol_,	CHX(SEQ(cr, str_l),		/* str "but neither" */
-			    SEQ(crlf, str),
-			    SEQ(schar_o, str),
-			    SEQ(lparen, str, rparen, str),
-			    SEQ(IGN(bslash), escape, str),
-			    SEQ(IGN(bslash), schar_e, str),	/* "lone" bs */
-		/* NB: ^ lone backslashes are to be ignored per spec, but we
-		 * let them "escape" with the following character. this works
-		 * because they are never truly alone. */
-			    SEQ(IGN(bslash), octesc),
-			    SEQ(IGN(bslash), eolesc),		/* line split */
-			    epsilon));
-	H_ARULE(str_,	CHX(SEQ(lf, str), SEQ(odigit, str), str_ol));
-	H_RULE(litstr,	h_middle(lparen, str, rparen));
+	H_ARULE(octal,	CHX(REP(odigit,3), REP(odigit,2), REP(odigit,1)));
+	H_RULE(wrap,	IGN(eol));
+	H_RULE(sesc,	h_right(bslash, CHX(escape, octal, wrap, epsilon)));
+		/* NB: lone backslashes and escaped newlines are ignored */
+	H_ARULE(schars,	h_many(CHX(schar, snest, sesc, eol)));
+	H_RULE(snest_,	SEQ(lparen, schars, rparen));
+	H_RULE(litstr,	h_middle(lparen, schars, rparen));
 	H_RULE(hexstr,	h_middle(langle, MANY_WS(hdigit), rangle));
 	H_ARULE(string,	CHX(litstr, hexstr));
-	h_bind_indirect(str_ol,	str_ol_);
-	h_bind_indirect(str_o,	str_o_);
-	h_bind_indirect(str_l,	str_l_);
-	h_bind_indirect(str,	str_);
+	h_bind_indirect(snest, snest_);
 
 	H_RULE(array,	h_indirect());
 	H_RULE(dict,	h_indirect());
@@ -635,8 +1001,9 @@ init_parser(struct Env *aux)
 	/* dictionaries */
 	H_RULE(dopen,	LIT("<<"));
 	H_RULE(dclose,	LIT(">>"));
-	H_RULE(k_v,	CHX(SEQ(name, wel,ws, obj),
-			    SEQ(name, CHX(name,dobj))));
+	H_RULE(k_v,	CHX(CHX(SEQ(name, wel,ws, obj),
+				SEQ(name, CHX(name,dobj))),
+			    VIOL(SEQ(name, wel,ws), "Key with no value (severity=2)")));
 	H_ARULE(dict_,	h_middle(dopen, MANY_WS(k_v), dclose));
 		// XXX this allows, for instance, "<<<<" to be parsed as "<< <<". ok?
 		// XXX validate: dict keys must be unique
@@ -659,34 +1026,58 @@ init_parser(struct Env *aux)
 	h_bind_indirect(array, array_);
 
 	/* streams */
-	H_RULE(stmbeg,	SEQ(dict, ws, LIT("stream"), OPT(cr), lf));
-	H_RULE(stmend,	SEQ(OPT(eol), LIT("endstream")));
+	H_RULE(stmbeg,	SEQ(dict, OPT(ws), LIT("stream"), OPT(cr), lf));
+	H_RULE(stmend, CHX(SEQ(eol, LIT("endstream")),
+			   VIOL(LIT("ndstream"), "Stream length >1-too-long (severity=10)"),
+			   VIOL(SEQ(h_many1(wchar), LIT("endstream")),
+				"No newline before endstream (severity=7)"),
+			   VIOL(LIT("endstream"), "Stream length 1-too-long (severity=9)"),
+			   VIOL(SEQ(OPT(h_ch_range(0, 255)), OPT(eol), LIT("endstream")),
+				"Stream length 1-too-short (severity=4)"),
+			   VIOL(SEQ(h_many1(h_butnot(h_ch_range(0, 255), CHX(KW("endobj"),
+									     SEQ(npair, wel, KW("obj")),
+									     KW("xref"),
+									     LIT("endstream")))), LIT("endstream")),
+				"Stream length >1-too-short (severity=5)"),
+			   VIOL(h_many1(h_butnot(h_ch_range(0, 255), CHX(KW("endobj"),
+									 SEQ(npair, wel, KW("obj")),
+									 KW("xref")))),
+				"Missing endstream token (severity=7)")));
+
 	H_RULE(stream,	h_left(h_bind(stmbeg, kstream, aux), stmend));
 		// XXX is whitespace allowed between the eol and "endstream"?
+		// peter wyatt says no. (2020-03-25)
 
 	/*
 	 * file structure
 	 */
 
 	/* header */
-	H_RULE(version,	SEQ(pdigit, IGN(period), pdigit));
+	H_RULE(version,	SEQ(pdigit, IGN(period), digit));
 	H_RULE(header,	h_middle(LIT("%PDF-"), version, nl));
 
 	/* body */
 	H_RULE(indobj,	CHX(stream, obj));
-	H_RULE(objdef,	SEQ(ws, npair, wel, KW("obj"), ws, indobj, KW("endobj")));
+	H_RULE(objdef,	SEQ(ws, npair, wel, KW("obj"), ws, indobj,
+			    CHX(VIOL(SEQ(OPT(ws), OPT(lws), KW("endobj"), h_many(CHX(wel, eol)), h_many1(KW("endobj"))),
+				     "More than 1 endobj token (severity=1)"),
+				VIOL(SEQ(OPT(ws), OPT(lws), KW("endobj"), h_many(CHX(wel, eol)), h_many1(SEQ(dclose, h_many1(CHX(wchar, eol)), KW("endobj")))),
+				     "More than 1 >> and endobj token (severity=2)"),
+				SEQ(OPT(ws), OPT(lws), KW("endobj")),
+				VIOL(h_optional(KW("endobj")), "Missing endobj token (severity=1)"))));
 	H_RULE(body,	h_many(objdef));
 
-	/* for object streams */
-	//H_RULE(osidx,	h_sepBy(npair, SEQ(wel,ws)));
-
 	/* cross-reference section */
 	H_RULE(xreol,	CHX(SEQ(sp, cr), SEQ(sp, lf), crlf));
 		// ^ XXX does the real world follow this rule?! cf. loop.pdf
 	H_RULE(xrtyp,	CHX(h_ch('n'), h_ch('f')));
 	H_ARULE(xroff,	REP(digit, 10));
 	H_ARULE(xrgen,	REP(digit, 5));
-	H_ARULE(xrent,	SEQ(xroff, IGN(sp), xrgen, IGN(sp), xrtyp, IGN(xreol)));
+	H_ARULE(xrent,	SEQ(xroff, IGN(CHX(VIOL(SEQ(lwchar, h_many1(lwchar)), "Multi-WS in xref offset_gen entry (severity=1)"), sp)),
+	    xrgen, IGN(CHX(VIOL(SEQ(lwchar, h_many1(lwchar)), "Multi-WS in xref gen_use entry (severity=1)"), sp)),
+	    xrtyp, IGN(CHX(VIOL(SEQ(wchar, wchar, h_many1(wchar)), "Greater-than-2-byte WS at end of xref entry (severity=1)"),
+	    xreol,
+	    VIOL(SEQ(h_many1(wchar)), "Nonconformant WS at end of xref entry (severity=1)")))));
 	H_RULE(xrhead,	SEQ(nat, IGN(sp), nat, nl));
 	H_RULE(xrsub,	SEQ(xrhead, h_many(xrent)));
 	H_ARULE(xrefs,	SEQ(KW("xref"), nl, h_many(xrsub)));
@@ -697,40 +1088,108 @@ init_parser(struct Env *aux)
 		// XXX skip however much we consumed and check for "endstream endobj"?
 
 	/* trailer */
-	H_RULE(startxr,	SEQ(nl, KW("startxref"), nl,
+	H_RULE(startxr, SEQ(nl, KW("startxref"), nl,
 			    lws, nat, nl,
-			    LIT("%%EOF"), CHX(nl, end)));
+			    LIT("%%EOF"), OPT(nl)));
+
+	/* used for the backwards search */
+	H_RULE(lasteof, SEQ(nl, KW("startxref"), nl,
+			    lws, nat, nl,
 		// XXX the real world sometimes omits nl after %%EOF inside the file.
 		//     the next 'tail' would be appended right after the 'F',
 		//     presumably because the previous version of the file
 		//     ended without a trailing newline. m)
-		//     this is invalid per spec, because it creates a run-on 
+		//     this is invalid per spec, because it creates a run-on
 		//     comment, but we should probably accept-and-warn.
 		// XXX should lws be allowed before EOF marker?
 		// NB: lws before xref offset is allowed, cf. p.48 (example 4)
+			    LIT("%%EOF"),
+			    CHX(VIOL(SEQ(nl, h_many1(nl), end),
+				     "(offset FROM END) Multiple newlines after final %%EOF (severity=4)"),
+				SEQ(h_many(nl), end),
+				VIOL(SEQ(h_butnot(h_ch_range(0, 255), LIT("%%EOF"))),
+				     "(offset FROM END) Data after final  %%EOF (severity=7)"))));
+
 	H_RULE(xr_td,	SEQ(xrefs, KW("trailer"), ws, dict));
 
-	H_RULE(tail,	SEQ(body, h_optional(xr_td), startxr));
-		// XXX the real world likes to omit 'startxr' from all but the
-		// last trailer. we should accept-and-warn in that case.
-	H_RULE(pdf,	SEQ(header, h_many1(tail), end));
+	H_RULE(hdr_junk,    CHX(comment,
+	    VIOL(h_many1(h_butnot(h_ch_range(0, 255), SEQ(npair, wel, KW("obj")))),
+	    "Uncommented junk after header (severity=1)")));
+	H_RULE(tail,	SEQ(body, CHX(SEQ(h_optional(xr_td), startxr),
+				      VIOL(SEQ(xr_td, OPT(SEQ(nl, KW("startxref"), nl, lws, nat, nl)),
+					       OPT(nl), OPT(LIT("%%EOF")), OPT(nl)),
+					   "Improper end of trailer - missing startxref and/or %%EOF (severity=5)"))));
+	H_RULE(final_eof_junk, CHX(VIOL(SEQ(h_many1(nl), end), "Multiple newlines after final %%EOF (severity=4)"),
+				   VIOL(h_many1(h_butnot(h_ch_range(0, 255), LIT("%%EOF"))),
+					"Data after final %%EOF (severity=7)"),
+				   end));
+	H_RULE(pdf,	SEQ(header, OPT(hdr_junk), h_many1(tail), final_eof_junk));
 
 	/* debug parser to consume as much as possible */
-	H_RULE(pdfdbg,	SEQ(header, h_many(tail), body, OPT(xr_td), OPT(startxr)));
+	H_RULE(pdfdbg,	SEQ(header, OPT(hdr_junk), h_many(tail), body, OPT(xr_td), OPT(SEQ(startxr, final_eof_junk))));
 
+	/*
+	 * filters
+	 */
 
+	/* Ascii85Decode */
+	H_RULE(a85eod,	SEQ(h_ch('~'), OPT(h_many(lwchar)), h_ch('>')));
+	H_ARULE(a85zero,	h_ch('z'));
+	H_ARULE(a85digit,	h_ch_range('!', 'u'));
+
+	/* Line whitespace can occur between any digit and has to be ignored, */
+	/* Comments are not allowed inside streams, and % character should cause
+	 * a parse error. */
+	#define MANY_LWS(X) h_many(CHX(lws, X))
+
+	/* This encoding of zero is not allowed */
+	H_RULE(a85fiveexcl, h_repeat_n(MANY_LWS(h_ch('!')), 5));
+	H_VARULE(a85fivedigits,	SEQ(h_and(h_not(a85fiveexcl)), h_repeat_n(MANY_LWS(a85digit), 5)));
+	H_ARULE(a85group,	CHX(a85zero, a85fivedigits));
+
+	H_VARULE(a85partial2group,	h_repeat_n(MANY_LWS(a85digit), 2));
+	H_VARULE(a85partial3group,	h_repeat_n(MANY_LWS(a85digit), 3));
+	H_VARULE(a85partial4group,	h_repeat_n(MANY_LWS(a85digit), 4));
+	H_ARULE(a85partialgroup,	CHX(a85partial4group, a85partial3group, a85partial2group));
+
+	H_ARULE(a85string,	SEQ(h_many(a85group), OPT(a85partialgroup), IGN(a85eod)));
+
+	/* AsciiHexDecode */
+	H_RULE(ahexeod,	h_ch('>'));
+	H_ARULE(hdigitpair, SEQ(IGN(OPT(h_many(lwchar))), hdigit, IGN(OPT(h_many(lwchar))), hdigit));
+	H_ARULE(ahextruncated, SEQ(IGN(OPT(h_many(lwchar))), hdigit, IGN(OPT(h_many(lwchar)))));
+
+	H_ARULE(hs_end, SEQ(CHX(hdigitpair, ahextruncated), ahexeod));
+	H_ARULE(ahexstream, SEQ(h_many(hdigitpair), hs_end));
+
+	init_runlengthdecode_parser(aux);
+
+
 	/* global parser variables */
 	p_pdf = pdf;
 	p_pdfdbg = pdfdbg;
-	p_startxref = startxr;
+	p_startxref = lasteof; //startxr;
 	p_xref = CHX(xr_td, xrstm);
 	p_objdef = objdef;
+	p_a85string = a85string;
+	p_ahexstream = ahexstream;
+	p_ws = ws;
+	p_wel = wel;
+	p_elemr = h_action(elemr, h_act_flatten, NULL);
+	p_npair = npair;
 
 	p_fail = h_nothing_p();
 	p_epsilon = epsilon;
 	p_return_0 = h_action(epsilon, act_return_uint, (void *)0);
 	p_return_1 = h_action(epsilon, act_return_uint, (void *)1);
 
+	/* Parsing of severity messages */
+	H_RULE(viol_preamble, SEQ(h_many(NOT_IN("=")), LIT("=")));
+	H_RULE(severity_num, h_action(h_many1(h_action(h_ch_range('0', '9'), act_digit, NULL)),
+			     act_nat, NULL));
+	H_RULE(violsev, SEQ(IGN(viol_preamble), severity_num));
+	p_violsev = violsev;
+
 #if 0
 	// XXX testing
 	int r;
@@ -910,12 +1369,25 @@ struct predictor {
 	uint8_t	*buf;		/* previous row of input */
 	uint8_t	 c;		/* byte 'c' (upper left) */
 	int	 x;		/* current position */
+
+#ifndef ITERATIVE // XXX
+	uint8_t	*out;
+	size_t	 nout;
+#endif
 };
 
 int
 depred_none(struct predictor *pred, uint8_t *inp, size_t sz)
 {
+#ifdef ITERATIVE // XXX
 	return h_parse_chunk(pred->sp, inp, sz);
+#else
+	pred->out = realloc(pred->out, pred->nout + sz);
+	assert(pred->out != NULL);
+	memcpy(pred->out + pred->nout, inp, sz);
+	pred->nout += sz;
+	return false;
+#endif
 }
 
 uint8_t pp_none(int a, int b, int c)	{ return 0; }
@@ -982,7 +1454,14 @@ depred_png(struct predictor *pred, uint8_t *inp, size_
 
 		/* when row complete, pass it to parser and start a new row */
 		if (x == pred->rowsz) {
+#ifdef ITERATIVE // XXX
 			done = h_parse_chunk(pred->sp, pred->buf, pred->rowsz);
+#else
+			pred->out = realloc(pred->out, pred->nout + pred->rowsz);
+			assert(pred->out != NULL);
+			memcpy(pred->out + pred->nout, pred->buf, pred->rowsz);
+			pred->nout += pred->rowsz;
+#endif
 			pred->c = pred->x = 0;
 			if (pred->num != 2)	/* support for 8-bpc TIFF */
 				pred->predfun = NULL;
@@ -997,7 +1476,9 @@ FlateDecode(const Dict *parms, HBytes b, HParser *p)
 {
 	size_t const BUFSIZE = 8 * 1024;
 	uint8_t *buf;
+#ifdef ITERATIVE // XXX
 	HSuspendedParser *sp;
+#endif
 	HParseResult *res;
 	const HParsedToken *v;
 	size_t sz;
@@ -1063,10 +1544,12 @@ FlateDecode(const Dict *parms, HBytes b, HParser *p)
 	if (buf == NULL)
 		err(1, "FlateDecode");
 
+#ifdef ITERATIVE // XXX
 	/* initialize target parser */
 	sp = h_parse_start(p);
 	assert(sp != NULL);
 	pred.sp = sp;
+#endif
 
 	done = 0;
 	strm.avail_in = b.len;
@@ -1085,8 +1568,13 @@ FlateDecode(const Dict *parms, HBytes b, HParser *p)
 		done = depredict(&pred, buf, sz);
 	} while (done == 0 && ret == Z_OK);
 
+#ifdef ITERATIVE // XXX
 	res = h_parse_finish(sp);
 		// XXX always return NULL on error?
+#else
+	res = h_parse(p, pred.out, pred.nout);
+	free(pred.out);
+#endif
 	inflateEnd(&strm);
 	free(pred.buf);
 	free(buf);
@@ -1096,7 +1584,271 @@ FlateDecode(const Dict *parms, HBytes b, HParser *p)
 	return res;
 }
 
+/* LZW helpers */
+
+typedef struct
+{
+	uint8_t *lzw_buf;
+	size_t total_buf_size;
+	size_t write_head;
+	size_t write_tail;
+	uint8_t write_checksum;
+	size_t eof_loc;
+
+	HBytes *input_stream;
+	size_t read_head;
+	size_t read_tail;
+	uint8_t read_checksum;
+} lzwspec;
+
+lzwspec *cur_lzw_spec;
+
+/* used by write_lzw_buffer to get more space for decoding if needed */
+void
+grow_lzw_buffer(size_t amount)
+{
+	uint8_t *ret_buf = realloc(cur_lzw_spec->lzw_buf, (cur_lzw_spec->total_buf_size+amount) * sizeof(uint8_t));
+	if(ret_buf != NULL)
+	{
+		cur_lzw_spec->total_buf_size += amount;
+		cur_lzw_spec->lzw_buf = ret_buf;
+	}
+	else
+	{
+		fprintf(stderr, "LZWDecode: h_arena_realloc() failed");
+		return;
+	}
+}
+
+lzwspec *
+new_lzw_spec(HBytes *bytes)
+{
+	size_t const BUFSIZE = sizeof(uint8_t) * 1024;
+	lzwspec *ret = malloc(sizeof(lzwspec));
+	ret->input_stream = bytes;
+	ret->lzw_buf = malloc(BUFSIZE);
+	ret->total_buf_size = BUFSIZE;
+	return ret;
+}
+
+void
+delete_lzw_spec(lzwspec *spec)
+{
+	free(spec->lzw_buf);
+	free(spec);
+}
+
+void
+bind_lzw_spec(lzwspec *spec)
+{
+	cur_lzw_spec = spec;
+}
+
+
+#include "lzw-lib.h"
+
+/* Buffer writer function for the lzw-ab implementation, with a fixed signature.
+ * Although the type is defined as int, it is expected to write one byte at a time.
+ * Modifies cur_lzw_spec. Set up the lzw spec to use with bind_lzw_spec() */
+
+void
+write_lzw_buffer(int value)
+{
+	size_t const BUFSIZE = sizeof(uint8_t) * 1024;
+
+	if(!cur_lzw_spec->lzw_buf)
+	{
+		fprintf(stderr, "LZWDecode: lzw_buf is null!");
+		assert(cur_lzw_spec->lzw_buf != NULL);
+	}
+
+	assert(cur_lzw_spec->write_head <= cur_lzw_spec->total_buf_size);
+
+	if (value == EOF) {
+        cur_lzw_spec->lzw_buf[cur_lzw_spec->write_head] = (uint8_t) value;
+        cur_lzw_spec->eof_loc = cur_lzw_spec->write_head;
+        cur_lzw_spec->write_head++;
+        return;
+    }
+
+	/* We can get away with this cast due to writing single bytes. */
+    cur_lzw_spec->lzw_buf[cur_lzw_spec->write_head++] = (uint8_t) value;
+
+	/* If you looked at lzw-ab's code, the write head is reset here
+	 * This function uses write_head as the offset of the last written item */
+    if (cur_lzw_spec->write_head >= cur_lzw_spec->total_buf_size)
+    {
+        grow_lzw_buffer(BUFSIZE);
+    }
+
+    cur_lzw_spec->write_checksum = cur_lzw_spec->write_checksum * 3 + (uint8_t) value;
+}
+
+
+/* Fixed signature function for reading bytes. Modifies cur_lzw_spec. Set cur_lzw_spec
+ * with bind_lzw_spec() */
+int read_lzw_buffer(void)
+{
+	uint8_t byte_read;
+	int ret_value;
+
+	/* Input data is already waiting in the buffer */
+    if (cur_lzw_spec->read_head == cur_lzw_spec->read_tail)
+        cur_lzw_spec->read_tail = cur_lzw_spec->input_stream->len;
+
+    if (cur_lzw_spec->read_head < cur_lzw_spec->read_tail)
+    {
+        byte_read = cur_lzw_spec->input_stream->token[cur_lzw_spec->read_head++];
+        cur_lzw_spec->read_checksum = cur_lzw_spec->read_checksum * 3 + byte_read;
+        ret_value = byte_read;
+    }
+    else
+        ret_value = EOF;
+
+    return ret_value;
+}
+
+
+HParseResult *
+LZWDecode(const Dict *parms, HBytes b, HParser *p)
+{
+	struct predictor pred = {1, 1, 8, 1};
+	int (*depredict)(struct predictor *, uint8_t *, size_t);
+	HParseResult *res;
+	int done;
+	int ret;
+	const HParsedToken *v;
+
+	/* set up the predictor (if any) */
+	#define SETPARM(VAR,STR) do {					\
+		v = dictentry(parms, (STR));				\
+		if (v != NULL) {					\
+			if (v->token_type != TT_SINT || v->sint < 0)	\
+				return NULL;				\
+			VAR = v->sint;					\
+		} } while(0)
+	SETPARM(pred.num,	"Predictor");
+	SETPARM(pred.colors,	"Colors");
+	SETPARM(pred.bpc,	"BitsPerComponent");
+	SETPARM(pred.columns,	"Columns");
+	#undef SETPARM
+	if (pred.num == 1)
+		depredict = depred_none;
+	else {
+		if (pred.num >= 10 && pred.num <= 15)
+			depredict = depred_png;
+		else if (pred.num == 2) {
+			/* for 8-bpc TIFF pred. 2, we can reuse PNG Sub */
+			if (pred.bpc == 8) {
+				pred.predfun = pp_sub;	/* predict left */
+				depredict = depred_png;
+			} else {
+				// XXX add general TIFF predictor (bpc != 8)
+				fprintf(stderr, "LZWDecode: /Predictor %d "
+				    "not supported for /BitsPerComponent %d\n",
+				    pred.num, pred.bpc);
+				return NULL;
+			}
+		} else {
+			fprintf(stderr, "LZWDecode: /Predictor %d"
+			    " not supported\n", pred.num);
+			return NULL;
+		}
+
+		/* allocate row buffer */
+		if (pred.columns > (INT_MAX - 7) / pred.colors / pred.bpc) {
+			fprintf(stderr, "LZWDecode: overflow\n");
+			return NULL;
+		}
+		pred.rowsz = (pred.colors * pred.bpc * pred.columns + 7) / 8;
+		pred.buf = calloc(1, pred.rowsz);
+		if (pred.buf == NULL)
+			err(1, "LZWDecode");
+	}
+
+	lzwspec *lzw_spec = new_lzw_spec(&b);
+	bind_lzw_spec(lzw_spec);
+
+	ret = lzw_decompress(write_lzw_buffer, read_lzw_buffer);
+	if (ret) {
+		fprintf(stderr, "lzw_decompress: error (%d)\n", ret);
+		assert(!"LZWDecode: failed to decompress\n");
+	}
+	done = depredict(&pred, cur_lzw_spec->lzw_buf, cur_lzw_spec->write_head-1);
+	assert(!done);	// XXX ITERATIVE
+
+	res = h_parse(p, pred.out, pred.nout);
+	free(pred.out);
+
+	bind_lzw_spec(NULL);
+	delete_lzw_spec(lzw_spec);
+
+	return res;
+}
+
+HParseResult *
+RunLengthDecode(const Dict *parms, HBytes b, HParser *p)
+{
+	HParseResult *res;
+
+	res = h_parse(p_rldstring, b.token, b.len);
+	if(!res)
+	{
+		fprintf(stderr, "parse error in RunLengthDecode filter\n");
+		return NULL;
+	}
+
+	assert(res->ast && res->ast->token_type == TT_BYTES);
+	res = h_parse(p, res->ast->bytes.token, res->ast->bytes.len);
+
+	return res;
+}
+
 /*
+ * Decodes ASCII hexadecimal data into binary data.
+ * parms should be empty, because the filter has no parameters
+ */
+HParseResult *
+ASCIIHexDecode(const Dict *parms, HBytes b, HParser *p)
+{
+	HParseResult *res;
+
+	res = h_parse(p_ahexstream, b.token, b.len);
+	if(!res)
+	{
+		fprintf(stderr, "parse error in ASCIIHexDecode filter\n");
+		return NULL;
+	}
+
+	assert(res->ast && res->ast->token_type == TT_BYTES);
+	res = h_parse(p, res->ast->bytes.token, res->ast->bytes.len);
+
+	return res;
+}
+
+/*
+ * Decodes ASCII base-85 encoded data and produces binary data.
+ * parms should be empty, because the filter has no parameters
+ */
+HParseResult*
+ASCII85Decode(const Dict *parms, HBytes b, HParser *p)
+{
+	HParseResult *res;
+
+	res = h_parse(p_a85string, b.token, b.len);
+	if(!res)
+	{
+		fprintf(stderr, "parse error in ASCII85Decode filter\n");
+		return NULL;
+	}
+
+	assert(res->ast && res->ast->token_type == TT_BYTES);
+	res = h_parse(p, res->ast->bytes.token, res->ast->bytes.len);
+
+	return res;
+}
+
+/*
  * decode the bytes in 'b' according to metadata in the stream dictionary 'd'
  * and parse the result with 'p'.
  */
@@ -1111,15 +1863,25 @@ decode_stream(const Dict *d, HBytes b, HParser *p)
 	if (v == NULL)
 		return h_parse(p, b.token, b.len);
 
+#ifdef ITERATIVE // XXX
 	/* compile to a CF backend to enable incremental parsing */
 	if (h_compile(p, PB_LLk, NULL) == -1)
 		errx(1, "stream data parser: LL(1) compile failed");
+#endif
 
 	if (v->token_type == TT_SEQUENCE)
 		return NULL;	// XXX filter chains not supported, yet
 	assert(v->token_type == TT_BYTES);
 	if (bytes_eq(v->bytes, "FlateDecode"))
 		filter = FlateDecode;
+	else if (bytes_eq(v->bytes, "ASCIIHexDecode"))
+		filter = ASCIIHexDecode;
+	else if (bytes_eq(v->bytes, "ASCII85Decode"))
+		filter = ASCII85Decode;
+	else if (bytes_eq(v->bytes, "RunLengthDecode"))
+		filter = RunLengthDecode;
+	else if (bytes_eq(v->bytes, "LZWDecode"))
+		filter = LZWDecode;
 	else
 		return NULL;		/* filter not supported */
 
@@ -1177,8 +1939,8 @@ p_take__m(HAllocator *mm__, size_t n, struct Env *aux)
 	return h_left__m(mm__, bytes, skip);
 }
 
-HParser *
-p_xrefdata__m(HAllocator *mm__, const Dict *dict);
+HParser *p_xrefdata__m(HAllocator *, const Dict *);
+HParser *p_objstm__m(HAllocator *, const Dict *);
 
 HParser *
 p_stream_data__m(HAllocator *mm__, const Dict *dict)
@@ -1192,9 +1954,10 @@ p_stream_data__m(HAllocator *mm__, const Dict *dict)
 	/* interpret known stream types */
 	if (bytes_eq(v->bytes, "XRef"))
 		return p_xrefdata__m(mm__, dict);
-	// XXX
-	//if (bytes_eq(v->bytes, "ObjStm"))
-	//	return p_objstm__m(mm__, dict);
+#ifndef NOOBJSTM
+	if (bytes_eq(v->bytes, "ObjStm"))
+		return p_objstm__m(mm__, dict);
+#endif
 
 	return NULL;					/* unrecognized type */
 }
@@ -1213,7 +1976,17 @@ act_ks_value(const HParseResult *p, void *u)
 
 	/* decode and parse the stream data */
 	res = decode_stream(spec->dict, bytes, spec->parser);
-	assert(res != NULL);	// XXX parse failure!
+	if (res == NULL) {
+		HBytes b = {NULL, 0};
+		const HParsedToken *v = dictentry(spec->dict, "Type");
+		if (v != NULL && v->token_type == TT_BYTES)
+			b = v->bytes;
+		if (b.len > INT_MAX)
+			b.len = INT_MAX;
+		fprintf(stderr, "parse error in stream (%*s)\n",
+		    (int)b.len, b.token);
+		// XXX return the undecoded stream (p->ast)?
+	}
 
 	return H_MAKE(HParseResult, res);
 }
@@ -1245,7 +2018,7 @@ kstream(HAllocator *mm__, const HParsedToken *x, void 
 
 	//fprintf(stderr, "parsing stream object, length %zu.\n", sz);	// XXX debug
 
-	dict_p  = p_return__m(mm__, dict_t);
+	dict_p	= p_return__m(mm__, dict_t);
 	bytes_p = p_take__m(mm__, sz, aux);
 
 	spec = h_alloc(mm__, sizeof(struct streamspec));
@@ -1436,6 +2209,29 @@ p_xrefdata__m(HAllocator *mm__, const Dict *dict)
 	return h_sequence__ma(mm__, (void **)p_subs);
 }
 
+HParser *
+p_objstm__m(HAllocator *mm__, const Dict *dict)
+{
+	const HParsedToken *v;
+	size_t N;
+
+	v = dictentry(dict, "N");
+	if (v == NULL || v->token_type != TT_SINT || v->sint < 0 ||
+	    (uint64_t)v->sint > SIZE_MAX) {
+		fprintf(stderr, "missing /N on object stream\n");
+		return p_fail;
+	}
+	N = v->sint;
+
+	HParser *wel_ws = h_sequence__m(mm__, p_wel, p_ws, NULL);
+	HParser *idx = p_sepBy_n__m(mm__, p_npair, wel_ws, N);
+
+	return h_sequence__m(mm__, p_ws, idx, p_elemr, p_ws, NULL);
+		// XXX leading and trailing ws OK?
+
+	// XXX consistency-check against /First, idx, /N
+}
+
 /*
  * This continuation is very similar to kstream, except that it does not
  * rely on /Length to consume the right amount of input. If /Length is
@@ -1589,10 +2385,15 @@ main(int argc, char *argv[])
 	int fd;
 
 	/* command line handling */
-	if (argc != 2) {
+	if (argc > 3) {
 		fprintf(stderr, "usage: %s file\n", argv[0]);
 		return 1;
 	}
+	if (argc == 3) {
+	  H_RULE(nat, h_action(h_many1(h_action(h_ch_range('0', '9'), act_digit, NULL)),
+			       act_nat, NULL));
+	  strictness = h_parse(nat, (uint8_t *)argv[2], strlen(argv[2]))->ast->uint;
+	}
 	infile = argv[1];
 
 	/* mmap the input file */
blob - /dev/null
blob + 65d4a2e4b96304208852290e3d3bf6ee7ce3dde8 (mode 644)
--- /dev/null
+++ lzw-ab-license.txt
@@ -0,0 +1,25 @@
+                       Copyright (c) David Bryant
+                          All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Conifer Software nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
blob - /dev/null
blob + 8283dfc52f1df905e2c0cce028eee9969684864b (mode 644)
--- /dev/null
+++ t/hello_ascii85.pdf
@@ -0,0 +1,62 @@
+%PDF-1.4
+1 0 obj
+<< /Type /Catalog
+/Outlines 2 0 R
+/Pages 3 0 R
+>>
+endobj
+2 0 obj
+<< /Type /Outlines
+/Count 0
+>>
+endobj
+3 0 obj
+<< /Type /Pages
+/Kids [ 4 0 R ]
+/Count 1
+>>
+endobj
+4 0 obj
+<< /Type /Page
+/Parent 3 0 R
+/MediaBox [ 0 0 612 792 ]
+/Contents 5 0 R
+/Resources << /ProcSet 6 0 R
+/Font << /F1 7 0 R >>
+>>
+>>
+endobj
+5 0 obj
+<< /Length 59 /Filter /ASCII85Decode>>
+stream
+<!k9J0d&5.+B2q[0eb:(0eb:(<+H=a+@p'[Ci:FtDfTZ)+=SF4C'mC/$3~>
+endstream
+endobj
+6 0 obj
+[ /PDF /Text ]
+endobj
+7 0 obj
+<< /Type /Font
+/Subtype /Type1
+/Name /F1
+/BaseFont /Helvetica
+/Encoding /MacRomanEncoding
+>>
+endobj
+xref
+0 8
+0000000000 65535 f 
+0000000009 00000 n 
+0000000074 00000 n 
+0000000120 00000 n 
+0000000179 00000 n 
+0000000322 00000 n 
+0000000453 00000 n 
+0000000483 00000 n 
+trailer
+<< /Size 8
+/Root 1 0 R
+>>
+startxref
+592
+%%EOF
blob - /dev/null
blob + 4f54996cce84b86df745dc4912ca558e1760b557 (mode 644)
--- /dev/null
+++ t/hello_ascii85_lf.pdf
@@ -0,0 +1,63 @@
+%PDF-1.4
+1 0 obj
+<< /Type /Catalog
+/Outlines 2 0 R
+/Pages 3 0 R
+>>
+endobj
+2 0 obj
+<< /Type /Outlines
+/Count 0
+>>
+endobj
+3 0 obj
+<< /Type /Pages
+/Kids [ 4 0 R ]
+/Count 1
+>>
+endobj
+4 0 obj
+<< /Type /Page
+/Parent 3 0 R
+/MediaBox [ 0 0 612 792 ]
+/Contents 5 0 R
+/Resources << /ProcSet 6 0 R
+/Font << /F1 7 0 R >>
+>>
+>>
+endobj
+5 0 obj
+<< /Length 60 /Filter /ASCII85Decode>>
+stream
+<!k9J0d&5.+B2q[0eb:(0eb:(<+H=
+a+@p'[Ci:FtDfTZ)+=SF4C'mC/$3~>
+endstream
+endobj
+6 0 obj
+[ /PDF /Text ]
+endobj
+7 0 obj
+<< /Type /Font
+/Subtype /Type1
+/Name /F1
+/BaseFont /Helvetica
+/Encoding /MacRomanEncoding
+>>
+endobj
+xref
+0 8
+0000000000 65535 f 
+0000000009 00000 n 
+0000000074 00000 n 
+0000000120 00000 n 
+0000000179 00000 n 
+0000000322 00000 n 
+0000000454 00000 n 
+0000000484 00000 n 
+trailer
+<< /Size 8
+/Root 1 0 R
+>>
+startxref
+593
+%%EOF
blob - /dev/null
blob + 7496f59864d8d8b22ac06557f7ce3d0e3ed2e2c3 (mode 644)
--- /dev/null
+++ t/hello_asciihex.pdf
@@ -0,0 +1,62 @@
+%PDF-1.4
+1 0 obj
+<< /Type /Catalog
+/Outlines 2 0 R
+/Pages 3 0 R
+>>
+endobj
+2 0 obj
+<< /Type /Outlines
+/Count 0
+>>
+endobj
+3 0 obj
+<< /Type /Pages
+/Kids [ 4 0 R ]
+/Count 1
+>>
+endobj
+4 0 obj
+<< /Type /Page
+/Parent 3 0 R
+/MediaBox [ 0 0 612 792 ]
+/Contents 5 0 R
+/Resources << /ProcSet 6 0 R
+/Font << /F1 7 0 R >>
+>>
+>>
+endobj
+5 0 obj
+<< /Length 91 /Filter /ASCIIHexDecode>>
+stream
+540a2f46312032342054660a313030203130302054640a282048656c6c6f20576f726c64202920546a0a45540a>
+endstream
+endobj
+6 0 obj
+[ /PDF /Text ]
+endobj
+7 0 obj
+<< /Type /Font
+/Subtype /Type1
+/Name /F1
+/BaseFont /Helvetica
+/Encoding /MacRomanEncoding
+>>
+endobj
+xref
+0 8
+0000000000 65535 f 
+0000000009 00000 n 
+0000000074 00000 n 
+0000000120 00000 n 
+0000000179 00000 n 
+0000000322 00000 n 
+0000000486 00000 n 
+0000000516 00000 n 
+trailer
+<< /Size 8
+/Root 1 0 R
+>>
+startxref
+624
+%%EOF
blob - /dev/null
blob + b077906ba43c89935d5d6fbe226777cc84afb0bb (mode 644)
--- /dev/null
+++ t/hello_asciihex_lf.pdf
@@ -0,0 +1,63 @@
+%PDF-1.4
+1 0 obj
+<< /Type /Catalog
+/Outlines 2 0 R
+/Pages 3 0 R
+>>
+endobj
+2 0 obj
+<< /Type /Outlines
+/Count 0
+>>
+endobj
+3 0 obj
+<< /Type /Pages
+/Kids [ 4 0 R ]
+/Count 1
+>>
+endobj
+4 0 obj
+<< /Type /Page
+/Parent 3 0 R
+/MediaBox [ 0 0 612 792 ]
+/Contents 5 0 R
+/Resources << /ProcSet 6 0 R
+/Font << /F1 7 0 R >>
+>>
+>>
+endobj
+5 0 obj
+<< /Length 92 /Filter /ASCIIHexDecode>>
+stream
+540a2f46312032342054660a3130302031303
+02054640a282048656c6c6f20576f726c64202920546a0a45540a>
+endstream
+endobj
+6 0 obj
+[ /PDF /Text ]
+endobj
+7 0 obj
+<< /Type /Font
+/Subtype /Type1
+/Name /F1
+/BaseFont /Helvetica
+/Encoding /MacRomanEncoding
+>>
+endobj
+xref
+0 8
+0000000000 65535 f 
+0000000009 00000 n 
+0000000074 00000 n 
+0000000120 00000 n 
+0000000179 00000 n 
+0000000322 00000 n 
+0000000487 00000 n 
+0000000517 00000 n 
+trailer
+<< /Size 8
+/Root 1 0 R
+>>
+startxref
+625
+%%EOF
blob - /dev/null
blob + 7f887ccce2c1c995ea65f1524e28685713f435d3 (mode 644)
Binary files /dev/null and t/hello_lzwdecode_march.pdf differ
blob - /dev/null
blob + 8f14de953d0952359834e9c7e67874246996a6ec (mode 644)
--- /dev/null
+++ t/hello_runlength.pdf
@@ -0,0 +1,62 @@
+%PDF-1.4
+1 0 obj
+<< /Type /Catalog
+/Outlines 2 0 R
+/Pages 3 0 R
+>>
+endobj
+2 0 obj
+<< /Type /Outlines
+/Count 0
+>>
+endobj
+3 0 obj
+<< /Type /Pages
+/Kids [ 4 0 R ]
+/Count 1
+>>
+endobj
+4 0 obj
+<< /Type /Page
+/Parent 3 0 R
+/MediaBox [ 0 0 612 792 ]
+/Contents 5 0 R
+/Resources << /ProcSet 6 0 R
+/Font << /F1 7 0 R >>
+>>
+>>
+endobj
+5 0 obj
+<< /Length 23 /Filter /RunLengthDecode>>
+stream
+ AAAAAAAAAAAAAAAAAAAA@B
+endstream
+endobj
+6 0 obj
+[ /PDF /Text ]
+endobj
+7 0 obj
+<< /Type /Font
+/Subtype /Type1
+/Name /F1
+/BaseFont /Helvetica
+/Encoding /MacRomanEncoding
+>>
+endobj
+xref
+0 8
+0000000000 65535 f 
+0000000009 00000 n 
+0000000074 00000 n 
+0000000120 00000 n 
+0000000179 00000 n 
+0000000322 00000 n 
+0000000419 00000 n 
+0000000450 00000 n 
+trailer
+<< /Size 8
+/Root 1 0 R
+>>
+startxref
+557
+%%EOF
blob - /dev/null
blob + a7919358495066d05aba0366b1c55e4b38f109d2 (mode 644)
Binary files /dev/null and t/lzw.pdf differ
blob - /dev/null
blob + 38ecd30a3ed38605d2cb752efb66dc8ecb4861c6 (mode 644)
--- /dev/null
+++ lzw-lib.c
@@ -0,0 +1,318 @@
+////////////////////////////////////////////////////////////////////////////

+//                            **** LZW-AB ****                            //

+//               Adjusted Binary LZW Compressor/Decompressor              //

+//                     Copyright (c) 2016 David Bryant                    //

+//                           All Rights Reserved                          //

+//      Distributed under the BSD Software License (see license.txt)      //

+////////////////////////////////////////////////////////////////////////////

+

+#include <stdio.h>

+#include <stdlib.h>

+#include <string.h>

+

+#include "lzw-lib.h"

+

+/* This library implements the LZW general-purpose data compression algorithm.

+ * The algorithm was originally described as a hardware implementation by

+ * Terry Welsh here:

+ *

+ *   Welch, T.A. “A Technique for High-Performance Data Compression.”

+ *   IEEE Computer 17,6 (June 1984), pp. 8-19.

+ *

+ * Since then there have been enumerable refinements and variations on the

+ * basic technique, and this implementation is no different. The target of

+ * the present implementation is embedded systems, and so emphasis was placed

+ * on simplicity, fast execution, and minimal RAM usage.

+ *

+ * The symbols are stored in adjusted binary, which provides considerably

+ * better compression performance with virtually no speed penalty compared to

+ * the fixed sizes normally used. To ensure good performance on data with

+ * varying characteristics (like executable images) the encoder resets as

+ * soon as the dictionary is full. Also, worst-case performance is limited

+ * to about 8% inflation by catching poor performance and forcing an early

+ * reset before longer symbols are sent.

+ *

+ * The maximum symbol size is configurable on the encode side (from 9 bits

+ * to 12 bits) and determines the RAM footprint required by both sides and,

+ * to a large extent, the compression performance. This information is

+ * communicated to the decoder in the first stream byte so that it can

+ * allocate accordingly. The RAM requirements are as follows:

+ *

+ *    maximum    encoder RAM   decoder RAM

+ *  symbol size  requirement   requirement

+ * -----------------------------------------

+ *     9-bit     1792 bytes    1024 bytes

+ *    10-bit     4352 bytes    3072 bytes

+ *    11-bit     9472 bytes    7168 bytes

+ *    12-bit     19712 bytes   15360 bytes

+ * 

+ * This implementation uses malloc(), but obviously an embedded version could

+ * use static arrays instead if desired (assuming that the maxbits was

+ * controlled outside).

+ */

+

+#define NULL_CODE       -1      // indicates a NULL prefix

+#define CLEAR_CODE      256     // code to flush dictionary and restart decoder

+#define EOD_CODE        257     // used in PDF's LZWDecode to signal end of data

+#define FIRST_STRING    258     // code of first dictionary string, PDF edition

+

+/* This macro writes the adjusted-binary symbol "code" given the maximum

+ * symbol "maxcode". A macro is used here just to avoid the duplication in

+ * the lzw_compress() function. The idea is that if "maxcode" is not one

+ * less than a power of two (which it rarely will be) then this code can

+ * often send fewer bits that would be required with a fixed-sized code.

+ *

+ * For example, the first code we send will have a "maxcode" of 257, so

+ * every "code" would normally consume 9 bits. But with adjusted binary we

+ * can actually represent any code from 0 to 253 with just 8 bits -- only

+ * the 4 codes from 254 to 257 take 9 bits.

+ */

+

+#define WRITE_CODE(code,maxcode) do {                           \

+    int code_bits = (maxcode) < 1024 ?                          \

+        ((maxcode) < 512 ? 8 : 9) :                             \

+        ((maxcode) < 2048 ? 10 : 11);                           \

+    int extras = (1 << (code_bits + 1)) - (maxcode) - 1;        \

+    if ((code) < extras) {                                      \

+        shifter |= ((long)(code) << bits);                      \

+        bits += code_bits;                                      \

+    }                                                           \

+    else {                                                      \

+        shifter |= ((long)(((code) + extras) >> 1) << bits);    \

+        bits += code_bits;                                      \

+        shifter |= ((long)(((code) + extras) & 1) << bits++);   \

+    }                                                           \

+    do { (*dst)(shifter); shifter >>= 8; output_bytes++;        \

+    } while ((bits -= 8) >= 8);                                 \

+} while (0)

+

+/* LZW compression function. Bytes (8-bit) are read and written through callbacks and the

+ * "maxbits" parameter specifies the maximum symbol size (9-12), which in turn determines

+ * the RAM requirement and, to a large extent, the level of compression achievable. A return

+ * value of EOF from the "src" callback terminates the compression process. A non-zero return

+ * value indicates one of the two possible errors -- bad "maxbits" param or failed malloc().

+ */

+

+int lzw_compress (void (*dst)(int), int (*src)(void), int maxbits)

+{

+    int next = FIRST_STRING, prefix = NULL_CODE, bits = 0, total_codes, c;

+    unsigned long input_bytes = 0, output_bytes = 0;

+    short *first_references, *next_references;

+    unsigned char *terminators;

+    unsigned long shifter = 0;

+

+    if (maxbits < 9 || maxbits > 12)    // check for valid "maxbits" setting

+        return 1;

+

+    // based on the "maxbits" parameter, compute total codes and allocate dictionary storage

+

+    total_codes = 1 << maxbits;

+    first_references = malloc (total_codes * sizeof (first_references [0]));

+    next_references = malloc ((total_codes - 256) * sizeof (next_references [0]));

+    terminators = malloc ((total_codes - 256) * sizeof (terminators [0]));

+

+    if (!first_references || !next_references || !terminators)

+        return 1;                       // failed malloc()

+

+    // clear the dictionary

+

+    memset (first_references, 0, total_codes * sizeof (first_references [0]));

+    memset (next_references, 0, (total_codes - 256) * sizeof (next_references [0]));

+    memset (terminators, 0, (total_codes - 256) * sizeof (terminators [0]));

+

+    (*dst)(maxbits - 9);    // first byte in output stream indicates the maximum symbol bits

+

+    // This is the main loop where we read input bytes and compress them. We always keep track of the

+    // "prefix", which represents a pending byte (if < 256) or string entry (if >= FIRST_STRING) that

+    // has not been sent to the decoder yet. The output symbols are kept in the "shifter" and "bits"

+    // variables and are sent to the output every time 8 bits are available (done in the macro).

+

+    while ((c = (*src)()) != EOF) {

+        int cti;                            // coding table index

+

+        input_bytes++;

+

+        if (prefix == NULL_CODE) {          // this only happens the very first byte when we don't yet have a prefix

+            prefix = c;

+            continue;

+        }

+

+        if ((cti = first_references [prefix])) {    // if any longer strings are built on the current prefix...

+            while (1)

+                if (terminators [cti - 256] == c) { // we found a matching string, so we just update the prefix

+                    prefix = cti;                   // to that string and continue without sending anything

+                    break;

+                }

+                else if (!next_references [cti - 256]) {    // this string did not match the new character and

+                    next_references [cti - 256] = next;     // there aren't any more, so we'll add a new string

+                    cti = 0;                                // and point to it with "next_reference"

+                    break;

+                }

+                else

+                    cti = next_references [cti - 256];      // there are more possible matches to check, so loop back

+        }

+        else                                        // no longer strings are based on the current prefix, so now

+            first_references [prefix] = next;       // the current prefix plus the new byte will be the next string

+

+        // If "cti" is zero, we could not simply extend our "prefix" to a longer string because we did not find a

+        // dictionary match, so we send the symbol representing the current "prefix" and add the new string to the

+        // dictionary. Since the current byte "c" was not included in the prefix, that now becomes our new prefix.

+

+        if (!cti) {

+            WRITE_CODE (prefix, next);              // send symbol for current prefix (0 to next-1)

+            terminators [next - 256] = c;           // newly created string has current byte as the terminator

+            prefix = c;                             // current byte also becomes new prefix for next string

+

+            // This is where we bump the next string index and decide whether to clear the dictionary and start over.

+            // The triggers for that are either the dictionary is full or we've been outputting too many bytes and

+            // decide to cut our losses before the symbols get any larger. Note that for the dictionary full case we

+            // do NOT send the CLEAR_CODE because the decoder knows about this and we don't want to be redundant.

+

+            if (++next == total_codes || output_bytes > 8 + input_bytes + (input_bytes >> 4)) {

+                if (next < total_codes)

+                    WRITE_CODE (CLEAR_CODE, next);

+

+                // clear the dictionary and reset the byte counters -- basically everything starts over

+                // except that we keep the last pending "prefix" (which, of course, was never sent)

+

+                memset (first_references, 0, total_codes * sizeof (first_references [0]));

+                memset (next_references, 0, (total_codes - 256) * sizeof (next_references [0]));

+                memset (terminators, 0, (total_codes - 256) * sizeof (terminators [0]));

+                input_bytes = output_bytes = 0;

+                next = FIRST_STRING;

+            }

+        }

+    }

+

+    // we're done with input, so if we've received anything we still need to send that pesky pending prefix...

+

+    if (prefix != NULL_CODE) {

+        WRITE_CODE (prefix, next);

+

+        if (++next == total_codes)  // watch for clearing to the first string to stay in step with the decoder!

+            next = FIRST_STRING;    // (this was actually a corner-case bug that did not trigger often)

+    }

+

+    WRITE_CODE (next, next);        // the maximum possible code is always reserved for our END_CODE

+

+    if (bits)                       // finally, flush any pending bits from the shifter

+        (*dst)(shifter);

+

+    free (terminators); free (next_references); free (first_references);

+    return 0;

+}

+

+/* LZW decompression function. Bytes (8-bit) are read and written through callbacks.

+ * A return value of EOF from the "src" callback terminates the compression process 

+ * (although this should not normally occur). A non-zero return value

+ * indicates an error, which in this case can be  a

+ * failed malloc(), or if an EOF is read from the input stream before the compression

+ * terminates naturally with END_CODE.

+ */

+

+int lzw_decompress (void (*dst)(int), int (*src)(void))

+{

+    int read_byte, next = FIRST_STRING, prefix = CLEAR_CODE, bits = 0, total_codes;

+    unsigned char *terminators, *reverse_buffer;

+    unsigned long shifter = 0;

+    short *prefixes;

+

+    // PDF specific change: maxbits is not in the input stream

+    // we'll just be pessimistic and allocate the maximal size buffer

+

+    total_codes = 4096;

+    reverse_buffer = malloc ((total_codes - 256) * sizeof (reverse_buffer [0]));

+    prefixes = malloc ((total_codes - 256) * sizeof (prefixes [0]));

+    terminators = malloc ((total_codes - 256) * sizeof (terminators [0]));

+

+    if (!reverse_buffer || !prefixes || !terminators)       // check for mallco() failure

+        return 1;

+

+    // This is the main loop where we read input symbols. The values range from 0 to the code value

+    // of the "next" string in the dictionary. Note that receiving an EOF from the input

+    // stream is actually an error because we should have gotten the END_CODE first.

+

+    while (1) {

+        int code_bits = next < 512 ? 9 : (next < 1024 ? 10 : (next < 2048 ? 11 : 12) ), code;

+

+        #define TOP_BITMASK  (((1 << code_bits) - 1) << (bits - code_bits) )

+        #define BOTTOM_BITMASK ((1 << (bits - code_bits)) - 1)

+

+        do {

+            if ((read_byte = ((*src)())) == EOF) {

+                free (terminators); free (prefixes); free (reverse_buffer);

+                return 1;

+            }

+

+            /* shifter reworked: everything shifted left by a byte,

+             * and the byte we just read becomes the least significant

+             * byte */

+

+            // prepare to shift in next byte

+            shifter <<= 8;

+            /* the bitstrings forming the symbols are stored MSB first,

+            *  so we can just OR in the next */

+            shifter |= (unsigned long) read_byte;

+        } while ((bits += 8) < code_bits);

+

+

+        /* for a 12-bit code, the shifter's bits now look like 

+         * from MSB to LSB: 00...0cccccccccn...n

+         * where c are the bits of our code

+         * and n are the bits we're not yet interested in

+         * the number of times n is repeated is bits - code_bits 

+         * ie. the number of bits read in minus the bits we're interested in */

+

+        // shift our code bits into thier proper place, and save it as the final code

+        code = (int) shifter >> (bits - code_bits);

+        /* we can now clear the shifter's top bits. the result looks like:

+         * 00...0n...n

+         * number of n is bits-code_bits

+         * */

+        shifter &= BOTTOM_BITMASK;

+        // update the count of bytes in the shifter

+        bits -= code_bits;

+

+        if (code == EOD_CODE)                   // In PDF, EOD is signalled by 257, rather than the max code

+            break;

+        else if (code == CLEAR_CODE)        // otherwise check for a CLEAR_CODE to start over early

+            next = FIRST_STRING;

+        else if (prefix == CLEAR_CODE) {    // this only happens at the first symbol which is always sent

+            (*dst)(code);                   // literally and becomes our initial prefix

+            next++;

+        }

+        // Otherwise we have a valid prefix so we step through the string from end to beginning storing the

+        // bytes in the "reverse_buffer", and then we send them out in the proper order. One corner-case

+        // we have to handle here is that the string might be the same one that is actually being defined

+        // now (code == next-1). Also, the first 256 entries of "terminators" and "prefixes" are fixed and

+        // not allocated, so that messes things up a bit.

+        else {

+            int cti = (code == next-1) ? prefix : code;

+            unsigned char *rbp = reverse_buffer, c;

+

+            do *rbp++ = cti < 256 ? cti : terminators [cti - 256];      // step backward through string...

+            while ((cti = (cti < 256) ? NULL_CODE : prefixes [cti - 256]) != NULL_CODE);

+

+            c = *--rbp;     // the first byte in this string is the terminator for the last string, which is

+                            // the one that we'll create a new dictionary entry for this time

+

+            do (*dst)(*rbp);                        // send string in corrected order (except for the terminator

+            while (rbp-- != reverse_buffer);        // which we don't know yet)

+

+            if (code == next-1)

+                (*dst)(c);

+

+            prefixes [next - 1 - 256] = prefix;     // now update the next dictionary entry with the new string

+            terminators [next - 1 - 256] = c;       // (but we're always one behind, so it's not the string just sent)

+

+            if (++next == total_codes)              // check for full dictionary, which forces a reset (and, BTW,

+                next = FIRST_STRING;                // means we'll never use the dictionary entry we just wrote)

+        }

+

+        prefix = code;      // the code we just received becomes the prefix for the next dictionary string entry

+                            // (which we'll create once we find out the terminator)

+    }

+

+    free (terminators); free (prefixes); free (reverse_buffer);

+    return 0;

+}

blob - /dev/null
blob + 81fdeb15e6ade7ef0fd6089a4fdc3d3f2d593578 (mode 644)
--- /dev/null
+++ lzw-lib.h
@@ -0,0 +1,15 @@
+////////////////////////////////////////////////////////////////////////////

+//                            **** LZW-AB ****                            //

+//               Adjusted Binary LZW Compressor/Decompressor              //

+//                     Copyright (c) 2016 David Bryant                    //

+//                           All Rights Reserved                          //

+//      Distributed under the BSD Software License (see license.txt)      //

+////////////////////////////////////////////////////////////////////////////

+

+#ifndef LZWLIB_H_

+#define LZWLIB_H_

+

+int lzw_compress (void (*dst)(int), int (*src)(void), int maxbits);

+int lzw_decompress (void (*dst)(int), int (*src)(void));

+

+#endif /* LZWLIB_H_ */