commit - 1191d61e7323c479a891b8810b0d56996ab0075f
commit + f31903c8acf3479533aa88556b5632ef54159fe3
blob - d9374c2925bf1ddb2966d2d99dba2cbc29f8989c
blob + f39a06d3076d76b1399fdc6a8823d602bfbd097a
--- LICENSE
+++ LICENSE
Copyright (c) 2019, 2020 Sven M. Hallberg <pesco@khjk.org>
+Copyright (c) 2020 pompolic <pompolic@special-circumstanc.es>
+Copyright (c) 2020 Paul Vines <paul.vines@baesystems.com>
Permission to use, copy, modify, and distribute this software for any
purpose with or without fee is hereby granted, provided that the above
blob - 9ca933445c2668987c6aa7bc34117e0f70f47fbd
blob + 6154e1d8363343e12153e52a27e9826581afdc3a
--- Makefile
+++ Makefile
-CFLAGS += -std=c99 -Wall -Werror
+CFLAGS += -std=c99 -Wall -Werror -DLOG
# find our hammer build - adjust this to your needs
# i have, for instance, symlinks:
HAMMER_LIB = ./lib
CFLAGS += -I$(HAMMER_INCLUDE)
LDFLAGS += -L$(HAMMER_LIB)
+SOURCES = pdf.c lzw-lib.c
.PHONY: all test clean
all: pdf
'for x in t/*.pdf; do ./pdf "$$x" >/dev/null && echo OK: "$$x"; done'
@true
-pdf: pdf.c
- $(CC) -o $@ $(CFLAGS) $(LDFLAGS) $< -lhammer -lz
+pdf: $(SOURCES)
+ $(CC) -o $@ $(CFLAGS) $(LDFLAGS) $(SOURCES) -lhammer -lz
clean:
rm -f pdf
blob - 0b2cc8912c3730af5961ac3d76eaf5497e51a5f1
blob + a2da3272b5216acdead625a852d6cd6fdf91a017
--- README
+++ README
https://gitlab.special-circumstanc.es/pesco/hammer/tree/pdf
+ For detailed build instructions, see README.md in that repository.
+
- Help the default Makefile find Hammer
- $ ln -s ../hammer/src hammer
- $ ln -s ../hammer/build/opt/src lib
+ $ ln -s ../hammer/src hammer # needed for building pdf, include files
+ $ ln -s ../hammer/build/opt/src lib # needed for running pdf, to locate libhammer.so
- - Build/Usage:
+ - Notes for 2020-04-27 release:
+ The release branch has been tested to build with the 2020-04-27_RELEASE` branch located at https://gitlab.special-circumstanc.es/pesco/hammer/tree/2020-04-27_RELEASE
+
+ - Build:
+
+ $ pushd ../hammer; scons; popd # build Hammer
$ make pdf
- $ ./pdf test.pdf
+ - Usage:
+
+ $ export LD_LIBRARY_PATH=./lib # see Troubleshooting section below to see if this is needed
+ $ ldd ./pdf | grep libhammer # verify that libhammer.so was found
+ $ ./pdf <filename>
+
# place some test files in the t/ directory...
$ make test
+
+ - Troubleshooting:
+
+ libhammer.so not found:
+
+ If Hammer is not installed as a system library, ld may fail to locate libhammer.so. The quick fix for this is altering LD_LIBRARY_PATH before running pdf:
+
+ $ export LD_LIBRARY_PATH=./lib
+ $ make test
+
+ The second solution is executing "scons install" when building Hammer, which will install it in ld's usual search path:
+
+ $ pushd ../hammer; scons install; popd
+ # ... Update ldconfig cache if needed
+ $ make pdf
+ $ make test
+
+ - Evaluating test results:
+
+ For every file in the t/ directory, the pdf parser is executed. On successful parse, a message of the following form is displayed:
+
+ OK: t/<filename>
+
+ In case of a non-fatal parse error, error messages may be displayed, but presence of the "OK" indicates pdf exited successfully. On a failed test run, only parse error messages are displayed.
+
+ - Copyright:
+
+ - pesco 2019,2020
+ - pompolic 2020
+ - Paul Vines 2020
+ - David Bryant (modified lzw-ab code)
+
+ See LICENSE and lzw-ab-license.txt for full copyright and licensing notice.
+
blob - f363e541d5613ebce67d335fc00c4288801a1982
blob + d649711646645c3e6fb1a86dac043752e2e4f12a
--- pdf.c
+++ pdf.c
/* beginnings of a PDF parser in hammer
* pesco 2019,2020
+ * pompolic 2020
+ * Paul Vines 2020
*/
-#include <string.h> /* strncmp(), memset() */
+#include <string.h> /* strncmp(), memset(), memcpy() */
+#include <stdlib.h> /* exit() */
#include <hammer/hammer.h>
#include <hammer/glue.h>
#define IN(STR) h_in((const uint8_t *)(STR), sizeof(STR) - 1)
#define NOT_IN(STR) h_not_in((const uint8_t *)(STR), sizeof(STR) - 1)
+#ifdef LOG
+#define VIOL(P,VIOL) h_action(h_sequence(P, h_tell(), NULL), act_viol, VIOL)
+#else
+#define VIOL(P,VIOL) P
+#endif
+
+
/*
* some helpers
*/
HParser *p_epsilon;
HParser *p_return_0;
HParser *p_return_1;
+uint8_t strictness = 0;
/* a combinator to parse a given character but return a different value */
return h_action__m(mm__, p_epsilon, act_return_uint, (void *)x);
}
+/* like h_sepBy but parses a fixed number of elements */
+HParser *
+p_sepBy_n__m(HAllocator *mm__, HParser *p, HParser *sep, size_t n)
+{
+ if (n == 0)
+ return p_epsilon;
+
+ HParser *sep_p = h_sequence__m(mm__, sep, p, NULL);
+ HParser *tail = h_repeat_n__m(mm__, sep_p, n - 1);
+ HParser *seq = h_sequence__m(mm__, p, tail, NULL);
+
+ return h_action__m(mm__, seq, h_act_flatten, NULL);
+}
+
/* a helper to compare an HBytes to a string */
bool
bytes_eq(HBytes b, const char *s)
{
HParseResult *res = H_CAST(HParseResult, tok);
- h_pprint(stream, res->ast, indent, delta);
+ if (res == NULL)
+ fprintf(stream, "null");
+ else
+ h_pprint(stream, res->ast, indent, delta);
}
void
HParsedToken *
act_hlower(const HParseResult *p, void *u)
{
- return H_MAKE_UINT(H_CAST_UINT(p->ast) - 'a');
+ return H_MAKE_UINT(10 + H_CAST_UINT(p->ast) - 'a');
}
HParsedToken *
act_hupper(const HParseResult *p, void *u)
{
- return H_MAKE_UINT(H_CAST_UINT(p->ast) - 'A');
+ return H_MAKE_UINT(10 + H_CAST_UINT(p->ast) - 'A');
+}
+
+HParsedToken*
+act_hdigitpair(const HParseResult *p, void *u)
+{
+ uint8_t b = 0;
+ HCountedArray *seq = H_CAST_SEQ(p->ast);
+ size_t digits_processed = 0;
+ uint8_t digits[2];
+ for(size_t i = 0; i < seq->used; ++i)
+ {
+ switch(seq->elements[i]->token_type)
+ {
+ case TT_UINT:
+ digits[digits_processed] = H_CAST_UINT(seq->elements[i]);
+ digits_processed++;
+ break;
+ default:
+ break;
+ }
+ assert(digits_processed == 2);
+ }
+
+ b = (digits[0] << 4) + digits[1];
+ return H_MAKE_UINT(b);
+}
+
+HParsedToken *
+act_ahextruncated(const HParseResult *p, void *u)
+{
+ uint8_t b = 0;
+ HCountedArray *seq = H_CAST_SEQ(p->ast);
+
+ /* Assumption: At this point seq->elements[0] is a hex digit
+ * and seq->elements[1] holds '>' (EOD)
+ */
+ // XXX figure out how to compare to '>'
+ assert(seq->used == 2);
+ b = H_CAST_UINT(seq->elements[0]) << 4;
+ return H_MAKE_UINT(b);
+}
+
+HParsedToken *
+act_hs_end(const HParseResult *p, void *u)
+{
+ HParsedToken *res;
+ HCountedArray *seq = H_CAST_SEQ(p->ast);
+ assert(seq->used >= 1);
+
+ res = H_MAKE_UINT(H_CAST_UINT(seq->elements[0]));
+ return res;
+}
+
+HParsedToken *
+act_ahexstream(const HParseResult *p, void *u)
+{
+ uint8_t *result_bytes;
+ size_t required_bytes;
+ HCountedArray *seq = H_CAST_SEQ(p->ast);
+ HParsedToken *res;
+
+ /* Ignore the the last element, which is EOD */
+ required_bytes = (seq->used - 1);
+
+ result_bytes = h_arena_malloc(p->arena, sizeof(uint8_t) * required_bytes);
+
+ /* memcpy all but the last group's bytes into a single array */
+ for (size_t i = 0; i < seq->used-1; ++i)
+ {
+ assert(i < required_bytes);
+ result_bytes[i] = H_CAST_UINT(seq->elements[i]);
+ }
+
+
+ res = H_MAKE_BYTES(result_bytes, required_bytes);
+ return res;
+}
+
+
+HParsedToken *
+act_a85zero(const HParseResult *p, void *u)
+{
+ uint32_t b = 0;
+ return H_MAKE_UINT(b);
+}
+
+HParsedToken *
+act_a85digit(const HParseResult *p, void *u)
+{
+ uint8_t b = H_CAST_UINT(p->ast);
+ b -= '!';
+
+ /* At this point we have the base85 value of one byte out of 5 */
+ return H_MAKE_UINT(b);
}
HParsedToken *
+act_a85fivedigits(const HParseResult *p, void *u)
+{
+ uint64_t fourbytes = 0;
+ HCountedArray *seq = H_CAST_SEQ(p->ast);
+ HParsedToken **digits = h_seq_elements(p->ast);
+
+ /* 2^32-1, the max value the group can hold as per spec */
+ #define A85GRPMAX 4294967295
+
+ /* Only for groups that do not need to padded to 5 */
+ assert(seq->used == 5);
+ fourbytes = H_CAST_UINT(digits[0]);
+ fourbytes = fourbytes * 85 + H_CAST_UINT(digits[1]);
+ fourbytes = fourbytes * 85 + H_CAST_UINT(digits[2]);
+ fourbytes = fourbytes * 85 + H_CAST_UINT(digits[3]);
+ fourbytes = fourbytes * 85 + H_CAST_UINT(digits[4]);
+
+ assert(fourbytes <= A85GRPMAX);
+ return H_MAKE_UINT(fourbytes);
+}
+
+/* Checking the following condition in the spec:
+ * The value represented by a group of 5 characters is greater than 2^32 - 1.
+ */
+bool
+validate_a85fivedigits(HParseResult *p, void *u)
+{
+ /* "s8W-!" should be the highest accepted value */
+ return H_CAST_UINT(p->ast) <= A85GRPMAX;
+}
+
+HParsedToken *
+act_a85group(const HParseResult *p, void *u)
+{
+ uint8_t *bytes = h_arena_malloc(p->arena, sizeof(uint8_t) * 4);
+ uint32_t fourbytes = H_CAST_UINT(p->ast);
+
+ bytes[0] = (fourbytes & 0xFF000000) >> 24;
+ bytes[1] = (fourbytes & 0x00FF0000) >> 16;
+ bytes[2] = (fourbytes & 0x0000FF00) >> 8;
+ bytes[3] = (fourbytes & 0x000000FF);
+
+ HParsedToken *b = H_MAKE_BYTES(bytes, 4);
+ return b;
+}
+
+HParsedToken *
+act_a85partial2group(const HParseResult *p, void *u)
+{
+ uint64_t fourbytes = 0;
+ HCountedArray *seq = H_CAST_SEQ(p->ast);
+ HParsedToken **digits = h_seq_elements(p->ast);
+
+ assert(seq->used == 2);
+ fourbytes = H_CAST_UINT(digits[0]);
+ fourbytes = fourbytes * 85 + H_CAST_UINT(digits[1]);
+ fourbytes *= 85 * 85 * 85;
+
+ assert(fourbytes <= A85GRPMAX);
+ return H_MAKE_UINT(fourbytes);
+}
+
+bool
+validate_a85partial2group(HParseResult *p, void *u)
+{
+ return H_CAST_UINT(p->ast) <= A85GRPMAX;
+}
+
+HParsedToken *
+act_a85partial3group(const HParseResult *p, void *u)
+{
+ uint64_t fourbytes = 0;
+ HCountedArray *seq = H_CAST_SEQ(p->ast);
+ HParsedToken **digits = h_seq_elements(p->ast);
+
+ assert(seq->used == 3);
+ fourbytes = H_CAST_UINT(digits[0]);
+ fourbytes = fourbytes * 85 + H_CAST_UINT(digits[1]);
+ fourbytes = fourbytes * 85 + H_CAST_UINT(digits[2]);
+ fourbytes *= 85 * 85;
+
+ assert(fourbytes <= A85GRPMAX);
+ return H_MAKE_UINT(fourbytes);
+}
+
+bool
+validate_a85partial3group(HParseResult *p, void *u)
+{
+ return H_CAST_UINT(p->ast) <= A85GRPMAX;
+}
+
+HParsedToken *
+act_a85partial4group(const HParseResult *p, void *u)
+{
+ uint64_t fourbytes = 0;
+ HCountedArray *seq = H_CAST_SEQ(p->ast);
+ HParsedToken **digits = h_seq_elements(p->ast);
+
+ assert(seq->used == 4);
+ fourbytes = H_CAST_UINT(digits[0]);
+ fourbytes = fourbytes * 85 + H_CAST_UINT(digits[1]);
+ fourbytes = fourbytes * 85 + H_CAST_UINT(digits[2]);
+ fourbytes = fourbytes * 85 + H_CAST_UINT(digits[3]);
+ fourbytes *= 85;
+
+ assert(fourbytes <= A85GRPMAX);
+ return H_MAKE_UINT(fourbytes);
+}
+
+bool
+validate_a85partial4group(HParseResult *p, void *u)
+{
+ return H_CAST_UINT(p->ast) <= A85GRPMAX;
+}
+
+HParsedToken *
+act_a85partialgroup(const HParseResult *p, void *u)
+{
+ uint8_t bytes_helper[4];
+ size_t bytes_used = 0;
+ uint8_t *bytes;
+
+ uint32_t fourbytes = H_CAST_UINT(p->ast);
+
+ /* Scan the uint backwards to find the first non-zero byte */
+ for (size_t i = 3; i > 0; --i)
+ {
+ /* Shift by 0, 8, 16 and 24 to get the correct byte */
+ bytes_helper[i] = (fourbytes >> ((3-i) * 8)) & 0xFF;
+ /* If we haven't set bytes_used yet, and the particular byte is nonzero */
+ if (!bytes_used && bytes_helper[i])
+ bytes_used = i;
+ }
+ assert(bytes_used > 0);
+
+ bytes = h_arena_malloc(p->arena, bytes_used);
+ return H_MAKE_BYTES(bytes, bytes_used);
+}
+
+HParsedToken *
+act_a85string(const HParseResult *p, void *u)
+{
+ uint8_t *result_bytes;
+ size_t chunk_number;
+ size_t required_bytes;
+ size_t out_pos = 0;
+ HCountedArray *seq = H_CAST_SEQ(p->ast);
+ HParsedToken *res;
+
+ /* Number of 4-byte chunks, minus the potential last partial group and EOD */
+ chunk_number = seq->used - 2;
+
+ /* Special-case: last chunk before EOD may be 4, 3, 2 or 1 bytes
+ * The latter two happening if the group was parsed from a partial
+ * group consisting less than 5 chars */
+ HBytes *last_chunk = &H_CAST_BYTES(seq->elements[seq->used-1]);
+ required_bytes = (chunk_number * 4 + last_chunk->len);
+
+ result_bytes = h_arena_malloc(p->arena, sizeof(uint8_t) * required_bytes);
+
+ /* memcpy all but the last group's bytes into a single array */
+ for (size_t i = 0; i < seq->used-1; ++i)
+ {
+ HBytes *chunk = &H_CAST_BYTES(seq->elements[i]);
+ assert(out_pos < required_bytes);
+ memcpy(&(result_bytes[out_pos]), chunk->token, 4);
+ out_pos += 4;
+ assert(out_pos < required_bytes);
+ }
+
+ memcpy(&(result_bytes[out_pos]), last_chunk->token, last_chunk->len);
+ out_pos += last_chunk->len;
+ /* We should have filled the array exactly by this point */
+ assert(out_pos == required_bytes-1);
+
+ res = H_MAKE_BYTES(result_bytes, required_bytes);
+ return res;
+}
+
+HParsedToken *
act_nat(const HParseResult *p, void *u)
{
uint64_t x = 0;
}
#define act_xroff act_nat
#define act_xrgen act_nat
+
+HParser *p_violsev;
+HParsedToken *
+act_viol(const HParseResult *p, void *viol)
+{
+ uint32_t severity;
+ uint32_t offset;
+ HParseResult *severity_parse;
+ viol = (uint8_t *) viol;
+ severity_parse = h_parse(p_violsev, viol, strlen((char *)viol));
+ if (!severity_parse) {
+ fprintf(stderr, "Severity for violaiton %s not assigned!\n", (char *)viol);
+ severity = 99999;
+ }
+ else {
+ severity = severity_parse->ast->seq->elements[0]->uint;
+ }
+ offset = p->ast->seq->elements[p->ast->seq->used-1]->uint / 8;
+ fprintf(stderr, "VIOLATION[%d]@%d (0x%x): %s\n", severity, offset, offset, (char *) viol);
+ if (strictness && severity > strictness) {
+ exit(1);
+ }
+ /* Just return the parse AST, drop the h_tell */
+ if (p->ast->seq->used == 1) {
+ return (HParsedToken *) NULL;
+ }
+ return (HParsedToken *) p->ast->seq->elements[0];
+}
bool
validate_pnat(HParseResult *p, void *u)
return H_MAKE_UINT(H_FIELD_UINT(1)*16 + H_FIELD_UINT(2));
}
-#define act_str_ h_act_flatten
+#define act_schars h_act_flatten
#define act_string act_token
HParsedToken *
return H_MAKE_UINT(x);
}
-#define act_oct3 act_octal
-#define act_oct2 act_octal
-#define act_oct1 act_octal
HParsedToken *
act_xrent(const HParseResult *p, void *u)
}
#define act_array_ h_act_flatten
+
+HParsedToken *
+act_shortlength(const HParseResult *p, void *u)
+{
+ uint8_t length = H_CAST_UINT(p->ast);
+ /* Length can range from 0-127, corresponding to the range 1-128, inclusive */
+ uint8_t finallength = length+1;
+
+ return H_MAKE_UINT(finallength);
+}
+
+HParsedToken *
+act_longlength(const HParseResult *p, void *u)
+{
+ uint8_t length = H_CAST_UINT(p->ast);
+ uint8_t finallength = 257-length;
+
+ return H_MAKE_UINT(finallength);
+}
+
+HParsedToken *
+act_longrun(const HParseResult *p, void *u)
+{
+ HParsedToken **elements = h_seq_elements(p->ast);
+ HParsedToken *res = H_MAKE_SEQ();
+ uint8_t length = H_CAST_UINT(elements[0]);
+ uint8_t data = H_CAST_UINT(elements[1]);
+ for (size_t len = 0; len < length; ++len)
+ {
+ h_seq_snoc(res, H_MAKE_UINT(data));
+ }
+
+ return res;
+}
+
+HParsedToken *
+act_rldstring(const HParseResult *p, void *u)
+{
+ const HParsedToken *flattened = h_seq_flatten(p->arena, p->ast);
+ HCountedArray *flattened_seq = H_CAST_SEQ(flattened);
+ uint8_t bytes_required;
+ uint8_t *result_bytes;
+
+ bytes_required = flattened_seq->used - 1;
+ result_bytes = h_arena_malloc(p->arena, sizeof(uint8_t) * bytes_required);
+
+ for (size_t i = 0; i < flattened_seq->used-1; ++i)
+ {
+ result_bytes[i] = H_CAST_UINT(flattened_seq->elements[i]);
+ }
+
+ return H_MAKE_BYTES(result_bytes, bytes_required);
+}
+
/*
* input grammar
*/
HParser *p_startxref;
HParser *p_xref;
HParser *p_objdef;
+HParser *p_a85string;
+HParser *p_ahexstream;
+HParser *p_rldstring;
+HParser *p_ws;
+HParser *p_wel;
+HParser *p_elemr;
+HParser *p_npair;
/* continuations for h_bind() */
HParser *kstream(HAllocator *, const HParsedToken *, void *);
HParser *kxstream(HAllocator *, const HParsedToken *, void *);
void
+init_runlengthdecode_parser(struct Env *aux)
+{
+ H_RULE(rldeod, h_ch(0x80));
+ H_ARULE(longlength, h_ch_range(0x81, 0xFF));
+ H_ARULE(shortlength, h_ch_range(0x0, 0x7F));
+
+ H_RULE(shortdata, h_uint8());
+ H_RULE(longdata, h_uint8());
+
+ H_RULE(shortrun, h_length_value(shortlength, shortdata));
+ H_ARULE(longrun, SEQ(longlength, longdata));
+
+ H_ARULE(rldstring, SEQ(h_many(CHX(shortrun, longrun)), IGN(rldeod)));
+
+ p_rldstring = rldstring;
+}
+
+void
init_parser(struct Env *aux)
{
TT_HParseResult = h_allocate_token_new("HParseResult", NULL, pp_parseresult);
//H_RULE(dchar, IN(DCHARS)); /* delimiter */
H_RULE(rchar, NOT_IN(WCHARS DCHARS)); /* regular */
H_RULE(nchar, NOT_IN(WCHARS DCHARS "#")); /* name */
+ H_RULE(schar, NOT_IN("()\n\r\\")); /* string literal */
H_ARULE(digit, h_ch_range('0', '9'));
H_ARULE(pdigit, h_ch_range('1', '9'));
H_ARULE(hlower, h_ch_range('a', 'f'));
/* numbers */
H_ARULE(sign, CHX(minus, IGN(plus)));
H_VRULE(intnn, nat);
- #if 1
H_ARULE(realnn, CHX(SEQ(digits, period, digits), /* 12.3 */
SEQ(digits, period, empty), /* 123. */
SEQ(empty, period, digits))); /* .123 */
// XXX ^ we _could_ move the "123." case into intnn...
- #else
- // XXX the .123 case above somehow leads to a conflict with litstr...
- H_ARULE(realnn, CHX(SEQ(digits, period, digits), /* 12.3 */
- SEQ(digits, period, empty))); /* 123. */
- #endif
H_RULE(numbnn, CHX(realnn, intnn));
H_RULE(snumb, SEQ(sign, numbnn));
H_ARULE(numb, CHX(snumb, numbnn));
H_ARULE(nstr, h_many(CHX(nchar, nesc))); /* '/' is valid */
H_RULE(name, h_right(slash, nstr));
- /* strings
- *
- * this is so convoluted in order to make it LALR including the
- * precedence rules for octal escapes ("\123" vs "\12 3" vs "\1 23")
- * and end-of-line ("CRLF" vs "CR LF").
- *
- * we have to split the base rule 'str' into variants 'str_o' and
- * 'str_l' depending on whether they may start with an octal digit or
- * linefeed, respectively.
- */
- H_RULE(str_ol, h_indirect());
- H_RULE(str_o, h_indirect());
- H_RULE(str_l, h_indirect());
- H_RULE(str, h_indirect());
+ /* strings */
+ H_RULE(snest, h_indirect());
H_RULE(bsn, p_mapch('n', 0x0a)); /* LF */
H_RULE(bsr, p_mapch('r', 0x0d)); /* CR */
H_RULE(bst, p_mapch('t', 0x09)); /* HT */
H_RULE(bsb, p_mapch('b', 0x08)); /* BS (backspace) */
H_RULE(bsf, p_mapch('f', 0x0c)); /* FF */
H_RULE(escape, CHX(bsn, bsr, bst, bsb, bsf, lparen, rparen, bslash));
- H_ARULE(oct3, REP(odigit,3));
- H_ARULE(oct2, REP(odigit,2));
- H_ARULE(oct1, REP(odigit,1));
- H_RULE(octesc, CHX(SEQ(oct3, str),
- SEQ(oct2, str_o),
- SEQ(oct1, str_o)));
- H_RULE(eolesc, CHX(SEQ(IGN(crlf), str),
- SEQ(IGN(cr), str_l),
- SEQ(IGN(lf), str)));
- H_RULE(schar_o, NOT_IN("()\n\r\\" "01234567"));
- H_RULE(schar_e, NOT_IN("()\n\r\\" "01234567" "nrtbf"));
- H_RULE(str_o_, CHX(SEQ(lf, str), str_ol)); /* str "but not" odigit */
- H_RULE(str_l_, CHX(SEQ(odigit, str), str_ol)); /* str "but not" lf */
- H_RULE(str_ol_, CHX(SEQ(cr, str_l), /* str "but neither" */
- SEQ(crlf, str),
- SEQ(schar_o, str),
- SEQ(lparen, str, rparen, str),
- SEQ(IGN(bslash), escape, str),
- SEQ(IGN(bslash), schar_e, str), /* "lone" bs */
- /* NB: ^ lone backslashes are to be ignored per spec, but we
- * let them "escape" with the following character. this works
- * because they are never truly alone. */
- SEQ(IGN(bslash), octesc),
- SEQ(IGN(bslash), eolesc), /* line split */
- epsilon));
- H_ARULE(str_, CHX(SEQ(lf, str), SEQ(odigit, str), str_ol));
- H_RULE(litstr, h_middle(lparen, str, rparen));
+ H_ARULE(octal, CHX(REP(odigit,3), REP(odigit,2), REP(odigit,1)));
+ H_RULE(wrap, IGN(eol));
+ H_RULE(sesc, h_right(bslash, CHX(escape, octal, wrap, epsilon)));
+ /* NB: lone backslashes and escaped newlines are ignored */
+ H_ARULE(schars, h_many(CHX(schar, snest, sesc, eol)));
+ H_RULE(snest_, SEQ(lparen, schars, rparen));
+ H_RULE(litstr, h_middle(lparen, schars, rparen));
H_RULE(hexstr, h_middle(langle, MANY_WS(hdigit), rangle));
H_ARULE(string, CHX(litstr, hexstr));
- h_bind_indirect(str_ol, str_ol_);
- h_bind_indirect(str_o, str_o_);
- h_bind_indirect(str_l, str_l_);
- h_bind_indirect(str, str_);
+ h_bind_indirect(snest, snest_);
H_RULE(array, h_indirect());
H_RULE(dict, h_indirect());
/* dictionaries */
H_RULE(dopen, LIT("<<"));
H_RULE(dclose, LIT(">>"));
- H_RULE(k_v, CHX(SEQ(name, wel,ws, obj),
- SEQ(name, CHX(name,dobj))));
+ H_RULE(k_v, CHX(CHX(SEQ(name, wel,ws, obj),
+ SEQ(name, CHX(name,dobj))),
+ VIOL(SEQ(name, wel,ws), "Key with no value (severity=2)")));
H_ARULE(dict_, h_middle(dopen, MANY_WS(k_v), dclose));
// XXX this allows, for instance, "<<<<" to be parsed as "<< <<". ok?
// XXX validate: dict keys must be unique
h_bind_indirect(array, array_);
/* streams */
- H_RULE(stmbeg, SEQ(dict, ws, LIT("stream"), OPT(cr), lf));
- H_RULE(stmend, SEQ(OPT(eol), LIT("endstream")));
+ H_RULE(stmbeg, SEQ(dict, OPT(ws), LIT("stream"), OPT(cr), lf));
+ H_RULE(stmend, CHX(SEQ(eol, LIT("endstream")),
+ VIOL(LIT("ndstream"), "Stream length >1-too-long (severity=10)"),
+ VIOL(SEQ(h_many1(wchar), LIT("endstream")),
+ "No newline before endstream (severity=7)"),
+ VIOL(LIT("endstream"), "Stream length 1-too-long (severity=9)"),
+ VIOL(SEQ(OPT(h_ch_range(0, 255)), OPT(eol), LIT("endstream")),
+ "Stream length 1-too-short (severity=4)"),
+ VIOL(SEQ(h_many1(h_butnot(h_ch_range(0, 255), CHX(KW("endobj"),
+ SEQ(npair, wel, KW("obj")),
+ KW("xref"),
+ LIT("endstream")))), LIT("endstream")),
+ "Stream length >1-too-short (severity=5)"),
+ VIOL(h_many1(h_butnot(h_ch_range(0, 255), CHX(KW("endobj"),
+ SEQ(npair, wel, KW("obj")),
+ KW("xref")))),
+ "Missing endstream token (severity=7)")));
+
H_RULE(stream, h_left(h_bind(stmbeg, kstream, aux), stmend));
// XXX is whitespace allowed between the eol and "endstream"?
+ // peter wyatt says no. (2020-03-25)
/*
* file structure
*/
/* header */
- H_RULE(version, SEQ(pdigit, IGN(period), pdigit));
+ H_RULE(version, SEQ(pdigit, IGN(period), digit));
H_RULE(header, h_middle(LIT("%PDF-"), version, nl));
/* body */
H_RULE(indobj, CHX(stream, obj));
- H_RULE(objdef, SEQ(ws, npair, wel, KW("obj"), ws, indobj, KW("endobj")));
+ H_RULE(objdef, SEQ(ws, npair, wel, KW("obj"), ws, indobj,
+ CHX(VIOL(SEQ(OPT(ws), OPT(lws), KW("endobj"), h_many(CHX(wel, eol)), h_many1(KW("endobj"))),
+ "More than 1 endobj token (severity=1)"),
+ VIOL(SEQ(OPT(ws), OPT(lws), KW("endobj"), h_many(CHX(wel, eol)), h_many1(SEQ(dclose, h_many1(CHX(wchar, eol)), KW("endobj")))),
+ "More than 1 >> and endobj token (severity=2)"),
+ SEQ(OPT(ws), OPT(lws), KW("endobj")),
+ VIOL(h_optional(KW("endobj")), "Missing endobj token (severity=1)"))));
H_RULE(body, h_many(objdef));
- /* for object streams */
- //H_RULE(osidx, h_sepBy(npair, SEQ(wel,ws)));
-
/* cross-reference section */
H_RULE(xreol, CHX(SEQ(sp, cr), SEQ(sp, lf), crlf));
// ^ XXX does the real world follow this rule?! cf. loop.pdf
H_RULE(xrtyp, CHX(h_ch('n'), h_ch('f')));
H_ARULE(xroff, REP(digit, 10));
H_ARULE(xrgen, REP(digit, 5));
- H_ARULE(xrent, SEQ(xroff, IGN(sp), xrgen, IGN(sp), xrtyp, IGN(xreol)));
+ H_ARULE(xrent, SEQ(xroff, IGN(CHX(VIOL(SEQ(lwchar, h_many1(lwchar)), "Multi-WS in xref offset_gen entry (severity=1)"), sp)),
+ xrgen, IGN(CHX(VIOL(SEQ(lwchar, h_many1(lwchar)), "Multi-WS in xref gen_use entry (severity=1)"), sp)),
+ xrtyp, IGN(CHX(VIOL(SEQ(wchar, wchar, h_many1(wchar)), "Greater-than-2-byte WS at end of xref entry (severity=1)"),
+ xreol,
+ VIOL(SEQ(h_many1(wchar)), "Nonconformant WS at end of xref entry (severity=1)")))));
H_RULE(xrhead, SEQ(nat, IGN(sp), nat, nl));
H_RULE(xrsub, SEQ(xrhead, h_many(xrent)));
H_ARULE(xrefs, SEQ(KW("xref"), nl, h_many(xrsub)));
// XXX skip however much we consumed and check for "endstream endobj"?
/* trailer */
- H_RULE(startxr, SEQ(nl, KW("startxref"), nl,
+ H_RULE(startxr, SEQ(nl, KW("startxref"), nl,
lws, nat, nl,
- LIT("%%EOF"), CHX(nl, end)));
+ LIT("%%EOF"), OPT(nl)));
+
+ /* used for the backwards search */
+ H_RULE(lasteof, SEQ(nl, KW("startxref"), nl,
+ lws, nat, nl,
// XXX the real world sometimes omits nl after %%EOF inside the file.
// the next 'tail' would be appended right after the 'F',
// presumably because the previous version of the file
// ended without a trailing newline. m)
- // this is invalid per spec, because it creates a run-on
+ // this is invalid per spec, because it creates a run-on
// comment, but we should probably accept-and-warn.
// XXX should lws be allowed before EOF marker?
// NB: lws before xref offset is allowed, cf. p.48 (example 4)
+ LIT("%%EOF"),
+ CHX(VIOL(SEQ(nl, h_many1(nl), end),
+ "(offset FROM END) Multiple newlines after final %%EOF (severity=4)"),
+ SEQ(h_many(nl), end),
+ VIOL(SEQ(h_butnot(h_ch_range(0, 255), LIT("%%EOF"))),
+ "(offset FROM END) Data after final %%EOF (severity=7)"))));
+
H_RULE(xr_td, SEQ(xrefs, KW("trailer"), ws, dict));
- H_RULE(tail, SEQ(body, h_optional(xr_td), startxr));
- // XXX the real world likes to omit 'startxr' from all but the
- // last trailer. we should accept-and-warn in that case.
- H_RULE(pdf, SEQ(header, h_many1(tail), end));
+ H_RULE(hdr_junk, CHX(comment,
+ VIOL(h_many1(h_butnot(h_ch_range(0, 255), SEQ(npair, wel, KW("obj")))),
+ "Uncommented junk after header (severity=1)")));
+ H_RULE(tail, SEQ(body, CHX(SEQ(h_optional(xr_td), startxr),
+ VIOL(SEQ(xr_td, OPT(SEQ(nl, KW("startxref"), nl, lws, nat, nl)),
+ OPT(nl), OPT(LIT("%%EOF")), OPT(nl)),
+ "Improper end of trailer - missing startxref and/or %%EOF (severity=5)"))));
+ H_RULE(final_eof_junk, CHX(VIOL(SEQ(h_many1(nl), end), "Multiple newlines after final %%EOF (severity=4)"),
+ VIOL(h_many1(h_butnot(h_ch_range(0, 255), LIT("%%EOF"))),
+ "Data after final %%EOF (severity=7)"),
+ end));
+ H_RULE(pdf, SEQ(header, OPT(hdr_junk), h_many1(tail), final_eof_junk));
/* debug parser to consume as much as possible */
- H_RULE(pdfdbg, SEQ(header, h_many(tail), body, OPT(xr_td), OPT(startxr)));
+ H_RULE(pdfdbg, SEQ(header, OPT(hdr_junk), h_many(tail), body, OPT(xr_td), OPT(SEQ(startxr, final_eof_junk))));
+ /*
+ * filters
+ */
+ /* Ascii85Decode */
+ H_RULE(a85eod, SEQ(h_ch('~'), OPT(h_many(lwchar)), h_ch('>')));
+ H_ARULE(a85zero, h_ch('z'));
+ H_ARULE(a85digit, h_ch_range('!', 'u'));
+
+ /* Line whitespace can occur between any digit and has to be ignored, */
+ /* Comments are not allowed inside streams, and % character should cause
+ * a parse error. */
+ #define MANY_LWS(X) h_many(CHX(lws, X))
+
+ /* This encoding of zero is not allowed */
+ H_RULE(a85fiveexcl, h_repeat_n(MANY_LWS(h_ch('!')), 5));
+ H_VARULE(a85fivedigits, SEQ(h_and(h_not(a85fiveexcl)), h_repeat_n(MANY_LWS(a85digit), 5)));
+ H_ARULE(a85group, CHX(a85zero, a85fivedigits));
+
+ H_VARULE(a85partial2group, h_repeat_n(MANY_LWS(a85digit), 2));
+ H_VARULE(a85partial3group, h_repeat_n(MANY_LWS(a85digit), 3));
+ H_VARULE(a85partial4group, h_repeat_n(MANY_LWS(a85digit), 4));
+ H_ARULE(a85partialgroup, CHX(a85partial4group, a85partial3group, a85partial2group));
+
+ H_ARULE(a85string, SEQ(h_many(a85group), OPT(a85partialgroup), IGN(a85eod)));
+
+ /* AsciiHexDecode */
+ H_RULE(ahexeod, h_ch('>'));
+ H_ARULE(hdigitpair, SEQ(IGN(OPT(h_many(lwchar))), hdigit, IGN(OPT(h_many(lwchar))), hdigit));
+ H_ARULE(ahextruncated, SEQ(IGN(OPT(h_many(lwchar))), hdigit, IGN(OPT(h_many(lwchar)))));
+
+ H_ARULE(hs_end, SEQ(CHX(hdigitpair, ahextruncated), ahexeod));
+ H_ARULE(ahexstream, SEQ(h_many(hdigitpair), hs_end));
+
+ init_runlengthdecode_parser(aux);
+
+
/* global parser variables */
p_pdf = pdf;
p_pdfdbg = pdfdbg;
- p_startxref = startxr;
+ p_startxref = lasteof; //startxr;
p_xref = CHX(xr_td, xrstm);
p_objdef = objdef;
+ p_a85string = a85string;
+ p_ahexstream = ahexstream;
+ p_ws = ws;
+ p_wel = wel;
+ p_elemr = h_action(elemr, h_act_flatten, NULL);
+ p_npair = npair;
p_fail = h_nothing_p();
p_epsilon = epsilon;
p_return_0 = h_action(epsilon, act_return_uint, (void *)0);
p_return_1 = h_action(epsilon, act_return_uint, (void *)1);
+
+ /* Parsing of severity messages */
+ H_RULE(viol_preamble, SEQ(h_many(NOT_IN("=")), LIT("=")));
+ H_RULE(severity_num, h_action(h_many1(h_action(h_ch_range('0', '9'), act_digit, NULL)),
+ act_nat, NULL));
+ H_RULE(violsev, SEQ(IGN(viol_preamble), severity_num));
+ p_violsev = violsev;
#if 0
// XXX testing
uint8_t *buf; /* previous row of input */
uint8_t c; /* byte 'c' (upper left) */
int x; /* current position */
+
+#ifndef ITERATIVE // XXX
+ uint8_t *out;
+ size_t nout;
+#endif
};
int
depred_none(struct predictor *pred, uint8_t *inp, size_t sz)
{
+#ifdef ITERATIVE // XXX
return h_parse_chunk(pred->sp, inp, sz);
+#else
+ pred->out = realloc(pred->out, pred->nout + sz);
+ assert(pred->out != NULL);
+ memcpy(pred->out + pred->nout, inp, sz);
+ pred->nout += sz;
+ return false;
+#endif
}
uint8_t pp_none(int a, int b, int c) { return 0; }
/* when row complete, pass it to parser and start a new row */
if (x == pred->rowsz) {
+#ifdef ITERATIVE // XXX
done = h_parse_chunk(pred->sp, pred->buf, pred->rowsz);
+#else
+ pred->out = realloc(pred->out, pred->nout + pred->rowsz);
+ assert(pred->out != NULL);
+ memcpy(pred->out + pred->nout, pred->buf, pred->rowsz);
+ pred->nout += pred->rowsz;
+#endif
pred->c = pred->x = 0;
if (pred->num != 2) /* support for 8-bpc TIFF */
pred->predfun = NULL;
{
size_t const BUFSIZE = 8 * 1024;
uint8_t *buf;
+#ifdef ITERATIVE // XXX
HSuspendedParser *sp;
+#endif
HParseResult *res;
const HParsedToken *v;
size_t sz;
if (buf == NULL)
err(1, "FlateDecode");
+#ifdef ITERATIVE // XXX
/* initialize target parser */
sp = h_parse_start(p);
assert(sp != NULL);
pred.sp = sp;
+#endif
done = 0;
strm.avail_in = b.len;
done = depredict(&pred, buf, sz);
} while (done == 0 && ret == Z_OK);
+#ifdef ITERATIVE // XXX
res = h_parse_finish(sp);
// XXX always return NULL on error?
+#else
+ res = h_parse(p, pred.out, pred.nout);
+ free(pred.out);
+#endif
inflateEnd(&strm);
free(pred.buf);
free(buf);
if (done == -1)
+ return NULL;
+ return res;
+}
+
+/* LZW helpers */
+
+typedef struct
+{
+ uint8_t *lzw_buf;
+ size_t total_buf_size;
+ size_t write_head;
+ size_t write_tail;
+ uint8_t write_checksum;
+ size_t eof_loc;
+
+ HBytes *input_stream;
+ size_t read_head;
+ size_t read_tail;
+ uint8_t read_checksum;
+} lzwspec;
+
+lzwspec *cur_lzw_spec;
+
+/* used by write_lzw_buffer to get more space for decoding if needed */
+void
+grow_lzw_buffer(size_t amount)
+{
+ uint8_t *ret_buf = realloc(cur_lzw_spec->lzw_buf, (cur_lzw_spec->total_buf_size+amount) * sizeof(uint8_t));
+ if(ret_buf != NULL)
+ {
+ cur_lzw_spec->total_buf_size += amount;
+ cur_lzw_spec->lzw_buf = ret_buf;
+ }
+ else
+ {
+ fprintf(stderr, "LZWDecode: h_arena_realloc() failed");
+ return;
+ }
+}
+
+lzwspec *
+new_lzw_spec(HBytes *bytes)
+{
+ size_t const BUFSIZE = sizeof(uint8_t) * 1024;
+ lzwspec *ret = malloc(sizeof(lzwspec));
+ ret->input_stream = bytes;
+ ret->lzw_buf = malloc(BUFSIZE);
+ ret->total_buf_size = BUFSIZE;
+ return ret;
+}
+
+void
+delete_lzw_spec(lzwspec *spec)
+{
+ free(spec->lzw_buf);
+ free(spec);
+}
+
+void
+bind_lzw_spec(lzwspec *spec)
+{
+ cur_lzw_spec = spec;
+}
+
+
+#include "lzw-lib.h"
+
+/* Buffer writer function for the lzw-ab implementation, with a fixed signature.
+ * Although the type is defined as int, it is expected to write one byte at a time.
+ * Modifies cur_lzw_spec. Set up the lzw spec to use with bind_lzw_spec() */
+
+void
+write_lzw_buffer(int value)
+{
+ size_t const BUFSIZE = sizeof(uint8_t) * 1024;
+
+ if(!cur_lzw_spec->lzw_buf)
+ {
+ fprintf(stderr, "LZWDecode: lzw_buf is null!");
+ assert(cur_lzw_spec->lzw_buf != NULL);
+ }
+
+ assert(cur_lzw_spec->write_head <= cur_lzw_spec->total_buf_size);
+
+ if (value == EOF) {
+ cur_lzw_spec->lzw_buf[cur_lzw_spec->write_head] = (uint8_t) value;
+ cur_lzw_spec->eof_loc = cur_lzw_spec->write_head;
+ cur_lzw_spec->write_head++;
+ return;
+ }
+
+ /* We can get away with this cast due to writing single bytes. */
+ cur_lzw_spec->lzw_buf[cur_lzw_spec->write_head++] = (uint8_t) value;
+
+ /* If you looked at lzw-ab's code, the write head is reset here
+ * This function uses write_head as the offset of the last written item */
+ if (cur_lzw_spec->write_head >= cur_lzw_spec->total_buf_size)
+ {
+ grow_lzw_buffer(BUFSIZE);
+ }
+
+ cur_lzw_spec->write_checksum = cur_lzw_spec->write_checksum * 3 + (uint8_t) value;
+}
+
+
+/* Fixed signature function for reading bytes. Modifies cur_lzw_spec. Set cur_lzw_spec
+ * with bind_lzw_spec() */
+int read_lzw_buffer(void)
+{
+ uint8_t byte_read;
+ int ret_value;
+
+ /* Input data is already waiting in the buffer */
+ if (cur_lzw_spec->read_head == cur_lzw_spec->read_tail)
+ cur_lzw_spec->read_tail = cur_lzw_spec->input_stream->len;
+
+ if (cur_lzw_spec->read_head < cur_lzw_spec->read_tail)
+ {
+ byte_read = cur_lzw_spec->input_stream->token[cur_lzw_spec->read_head++];
+ cur_lzw_spec->read_checksum = cur_lzw_spec->read_checksum * 3 + byte_read;
+ ret_value = byte_read;
+ }
+ else
+ ret_value = EOF;
+
+ return ret_value;
+}
+
+
+HParseResult *
+LZWDecode(const Dict *parms, HBytes b, HParser *p)
+{
+ struct predictor pred = {1, 1, 8, 1};
+ int (*depredict)(struct predictor *, uint8_t *, size_t);
+ HParseResult *res;
+ int done;
+ int ret;
+ const HParsedToken *v;
+
+ /* set up the predictor (if any) */
+ #define SETPARM(VAR,STR) do { \
+ v = dictentry(parms, (STR)); \
+ if (v != NULL) { \
+ if (v->token_type != TT_SINT || v->sint < 0) \
+ return NULL; \
+ VAR = v->sint; \
+ } } while(0)
+ SETPARM(pred.num, "Predictor");
+ SETPARM(pred.colors, "Colors");
+ SETPARM(pred.bpc, "BitsPerComponent");
+ SETPARM(pred.columns, "Columns");
+ #undef SETPARM
+ if (pred.num == 1)
+ depredict = depred_none;
+ else {
+ if (pred.num >= 10 && pred.num <= 15)
+ depredict = depred_png;
+ else if (pred.num == 2) {
+ /* for 8-bpc TIFF pred. 2, we can reuse PNG Sub */
+ if (pred.bpc == 8) {
+ pred.predfun = pp_sub; /* predict left */
+ depredict = depred_png;
+ } else {
+ // XXX add general TIFF predictor (bpc != 8)
+ fprintf(stderr, "LZWDecode: /Predictor %d "
+ "not supported for /BitsPerComponent %d\n",
+ pred.num, pred.bpc);
+ return NULL;
+ }
+ } else {
+ fprintf(stderr, "LZWDecode: /Predictor %d"
+ " not supported\n", pred.num);
+ return NULL;
+ }
+
+ /* allocate row buffer */
+ if (pred.columns > (INT_MAX - 7) / pred.colors / pred.bpc) {
+ fprintf(stderr, "LZWDecode: overflow\n");
+ return NULL;
+ }
+ pred.rowsz = (pred.colors * pred.bpc * pred.columns + 7) / 8;
+ pred.buf = calloc(1, pred.rowsz);
+ if (pred.buf == NULL)
+ err(1, "LZWDecode");
+ }
+
+ lzwspec *lzw_spec = new_lzw_spec(&b);
+ bind_lzw_spec(lzw_spec);
+
+ ret = lzw_decompress(write_lzw_buffer, read_lzw_buffer);
+ if (ret) {
+ fprintf(stderr, "lzw_decompress: error (%d)\n", ret);
+ assert(!"LZWDecode: failed to decompress\n");
+ }
+ done = depredict(&pred, cur_lzw_spec->lzw_buf, cur_lzw_spec->write_head-1);
+ assert(!done); // XXX ITERATIVE
+
+ res = h_parse(p, pred.out, pred.nout);
+ free(pred.out);
+
+ bind_lzw_spec(NULL);
+ delete_lzw_spec(lzw_spec);
+
+ return res;
+}
+
+HParseResult *
+RunLengthDecode(const Dict *parms, HBytes b, HParser *p)
+{
+ HParseResult *res;
+
+ res = h_parse(p_rldstring, b.token, b.len);
+ if(!res)
+ {
+ fprintf(stderr, "parse error in RunLengthDecode filter\n");
return NULL;
+ }
+
+ assert(res->ast && res->ast->token_type == TT_BYTES);
+ res = h_parse(p, res->ast->bytes.token, res->ast->bytes.len);
+
return res;
}
/*
+ * Decodes ASCII hexadecimal data into binary data.
+ * parms should be empty, because the filter has no parameters
+ */
+HParseResult *
+ASCIIHexDecode(const Dict *parms, HBytes b, HParser *p)
+{
+ HParseResult *res;
+
+ res = h_parse(p_ahexstream, b.token, b.len);
+ if(!res)
+ {
+ fprintf(stderr, "parse error in ASCIIHexDecode filter\n");
+ return NULL;
+ }
+
+ assert(res->ast && res->ast->token_type == TT_BYTES);
+ res = h_parse(p, res->ast->bytes.token, res->ast->bytes.len);
+
+ return res;
+}
+
+/*
+ * Decodes ASCII base-85 encoded data and produces binary data.
+ * parms should be empty, because the filter has no parameters
+ */
+HParseResult*
+ASCII85Decode(const Dict *parms, HBytes b, HParser *p)
+{
+ HParseResult *res;
+
+ res = h_parse(p_a85string, b.token, b.len);
+ if(!res)
+ {
+ fprintf(stderr, "parse error in ASCII85Decode filter\n");
+ return NULL;
+ }
+
+ assert(res->ast && res->ast->token_type == TT_BYTES);
+ res = h_parse(p, res->ast->bytes.token, res->ast->bytes.len);
+
+ return res;
+}
+
+/*
* decode the bytes in 'b' according to metadata in the stream dictionary 'd'
* and parse the result with 'p'.
*/
if (v == NULL)
return h_parse(p, b.token, b.len);
+#ifdef ITERATIVE // XXX
/* compile to a CF backend to enable incremental parsing */
if (h_compile(p, PB_LLk, NULL) == -1)
errx(1, "stream data parser: LL(1) compile failed");
+#endif
if (v->token_type == TT_SEQUENCE)
return NULL; // XXX filter chains not supported, yet
assert(v->token_type == TT_BYTES);
if (bytes_eq(v->bytes, "FlateDecode"))
filter = FlateDecode;
+ else if (bytes_eq(v->bytes, "ASCIIHexDecode"))
+ filter = ASCIIHexDecode;
+ else if (bytes_eq(v->bytes, "ASCII85Decode"))
+ filter = ASCII85Decode;
+ else if (bytes_eq(v->bytes, "RunLengthDecode"))
+ filter = RunLengthDecode;
+ else if (bytes_eq(v->bytes, "LZWDecode"))
+ filter = LZWDecode;
else
return NULL; /* filter not supported */
return h_left__m(mm__, bytes, skip);
}
-HParser *
-p_xrefdata__m(HAllocator *mm__, const Dict *dict);
+HParser *p_xrefdata__m(HAllocator *, const Dict *);
+HParser *p_objstm__m(HAllocator *, const Dict *);
HParser *
p_stream_data__m(HAllocator *mm__, const Dict *dict)
/* interpret known stream types */
if (bytes_eq(v->bytes, "XRef"))
return p_xrefdata__m(mm__, dict);
- // XXX
- //if (bytes_eq(v->bytes, "ObjStm"))
- // return p_objstm__m(mm__, dict);
+#ifndef NOOBJSTM
+ if (bytes_eq(v->bytes, "ObjStm"))
+ return p_objstm__m(mm__, dict);
+#endif
return NULL; /* unrecognized type */
}
/* decode and parse the stream data */
res = decode_stream(spec->dict, bytes, spec->parser);
- assert(res != NULL); // XXX parse failure!
+ if (res == NULL) {
+ HBytes b = {NULL, 0};
+ const HParsedToken *v = dictentry(spec->dict, "Type");
+ if (v != NULL && v->token_type == TT_BYTES)
+ b = v->bytes;
+ if (b.len > INT_MAX)
+ b.len = INT_MAX;
+ fprintf(stderr, "parse error in stream (%*s)\n",
+ (int)b.len, b.token);
+ // XXX return the undecoded stream (p->ast)?
+ }
return H_MAKE(HParseResult, res);
}
//fprintf(stderr, "parsing stream object, length %zu.\n", sz); // XXX debug
- dict_p = p_return__m(mm__, dict_t);
+ dict_p = p_return__m(mm__, dict_t);
bytes_p = p_take__m(mm__, sz, aux);
spec = h_alloc(mm__, sizeof(struct streamspec));
return h_sequence__ma(mm__, (void **)p_subs);
}
+HParser *
+p_objstm__m(HAllocator *mm__, const Dict *dict)
+{
+ const HParsedToken *v;
+ size_t N;
+
+ v = dictentry(dict, "N");
+ if (v == NULL || v->token_type != TT_SINT || v->sint < 0 ||
+ (uint64_t)v->sint > SIZE_MAX) {
+ fprintf(stderr, "missing /N on object stream\n");
+ return p_fail;
+ }
+ N = v->sint;
+
+ HParser *wel_ws = h_sequence__m(mm__, p_wel, p_ws, NULL);
+ HParser *idx = p_sepBy_n__m(mm__, p_npair, wel_ws, N);
+
+ return h_sequence__m(mm__, p_ws, idx, p_elemr, p_ws, NULL);
+ // XXX leading and trailing ws OK?
+
+ // XXX consistency-check against /First, idx, /N
+}
+
/*
* This continuation is very similar to kstream, except that it does not
* rely on /Length to consume the right amount of input. If /Length is
int fd;
/* command line handling */
- if (argc != 2) {
+ if (argc > 3) {
fprintf(stderr, "usage: %s file\n", argv[0]);
return 1;
}
+ if (argc == 3) {
+ H_RULE(nat, h_action(h_many1(h_action(h_ch_range('0', '9'), act_digit, NULL)),
+ act_nat, NULL));
+ strictness = h_parse(nat, (uint8_t *)argv[2], strlen(argv[2]))->ast->uint;
+ }
infile = argv[1];
/* mmap the input file */
blob - /dev/null
blob + 65d4a2e4b96304208852290e3d3bf6ee7ce3dde8 (mode 644)
--- /dev/null
+++ lzw-ab-license.txt
+ Copyright (c) David Bryant
+ All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Conifer Software nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
blob - /dev/null
blob + 8283dfc52f1df905e2c0cce028eee9969684864b (mode 644)
--- /dev/null
+++ t/hello_ascii85.pdf
+%PDF-1.4
+1 0 obj
+<< /Type /Catalog
+/Outlines 2 0 R
+/Pages 3 0 R
+>>
+endobj
+2 0 obj
+<< /Type /Outlines
+/Count 0
+>>
+endobj
+3 0 obj
+<< /Type /Pages
+/Kids [ 4 0 R ]
+/Count 1
+>>
+endobj
+4 0 obj
+<< /Type /Page
+/Parent 3 0 R
+/MediaBox [ 0 0 612 792 ]
+/Contents 5 0 R
+/Resources << /ProcSet 6 0 R
+/Font << /F1 7 0 R >>
+>>
+>>
+endobj
+5 0 obj
+<< /Length 59 /Filter /ASCII85Decode>>
+stream
+<!k9J0d&5.+B2q[0eb:(0eb:(<+H=a+@p'[Ci:FtDfTZ)+=SF4C'mC/$3~>
+endstream
+endobj
+6 0 obj
+[ /PDF /Text ]
+endobj
+7 0 obj
+<< /Type /Font
+/Subtype /Type1
+/Name /F1
+/BaseFont /Helvetica
+/Encoding /MacRomanEncoding
+>>
+endobj
+xref
+0 8
+0000000000 65535 f
+0000000009 00000 n
+0000000074 00000 n
+0000000120 00000 n
+0000000179 00000 n
+0000000322 00000 n
+0000000453 00000 n
+0000000483 00000 n
+trailer
+<< /Size 8
+/Root 1 0 R
+>>
+startxref
+592
+%%EOF
blob - /dev/null
blob + 4f54996cce84b86df745dc4912ca558e1760b557 (mode 644)
--- /dev/null
+++ t/hello_ascii85_lf.pdf
+%PDF-1.4
+1 0 obj
+<< /Type /Catalog
+/Outlines 2 0 R
+/Pages 3 0 R
+>>
+endobj
+2 0 obj
+<< /Type /Outlines
+/Count 0
+>>
+endobj
+3 0 obj
+<< /Type /Pages
+/Kids [ 4 0 R ]
+/Count 1
+>>
+endobj
+4 0 obj
+<< /Type /Page
+/Parent 3 0 R
+/MediaBox [ 0 0 612 792 ]
+/Contents 5 0 R
+/Resources << /ProcSet 6 0 R
+/Font << /F1 7 0 R >>
+>>
+>>
+endobj
+5 0 obj
+<< /Length 60 /Filter /ASCII85Decode>>
+stream
+<!k9J0d&5.+B2q[0eb:(0eb:(<+H=
+a+@p'[Ci:FtDfTZ)+=SF4C'mC/$3~>
+endstream
+endobj
+6 0 obj
+[ /PDF /Text ]
+endobj
+7 0 obj
+<< /Type /Font
+/Subtype /Type1
+/Name /F1
+/BaseFont /Helvetica
+/Encoding /MacRomanEncoding
+>>
+endobj
+xref
+0 8
+0000000000 65535 f
+0000000009 00000 n
+0000000074 00000 n
+0000000120 00000 n
+0000000179 00000 n
+0000000322 00000 n
+0000000454 00000 n
+0000000484 00000 n
+trailer
+<< /Size 8
+/Root 1 0 R
+>>
+startxref
+593
+%%EOF
blob - /dev/null
blob + 7496f59864d8d8b22ac06557f7ce3d0e3ed2e2c3 (mode 644)
--- /dev/null
+++ t/hello_asciihex.pdf
+%PDF-1.4
+1 0 obj
+<< /Type /Catalog
+/Outlines 2 0 R
+/Pages 3 0 R
+>>
+endobj
+2 0 obj
+<< /Type /Outlines
+/Count 0
+>>
+endobj
+3 0 obj
+<< /Type /Pages
+/Kids [ 4 0 R ]
+/Count 1
+>>
+endobj
+4 0 obj
+<< /Type /Page
+/Parent 3 0 R
+/MediaBox [ 0 0 612 792 ]
+/Contents 5 0 R
+/Resources << /ProcSet 6 0 R
+/Font << /F1 7 0 R >>
+>>
+>>
+endobj
+5 0 obj
+<< /Length 91 /Filter /ASCIIHexDecode>>
+stream
+540a2f46312032342054660a313030203130302054640a282048656c6c6f20576f726c64202920546a0a45540a>
+endstream
+endobj
+6 0 obj
+[ /PDF /Text ]
+endobj
+7 0 obj
+<< /Type /Font
+/Subtype /Type1
+/Name /F1
+/BaseFont /Helvetica
+/Encoding /MacRomanEncoding
+>>
+endobj
+xref
+0 8
+0000000000 65535 f
+0000000009 00000 n
+0000000074 00000 n
+0000000120 00000 n
+0000000179 00000 n
+0000000322 00000 n
+0000000486 00000 n
+0000000516 00000 n
+trailer
+<< /Size 8
+/Root 1 0 R
+>>
+startxref
+624
+%%EOF
blob - /dev/null
blob + b077906ba43c89935d5d6fbe226777cc84afb0bb (mode 644)
--- /dev/null
+++ t/hello_asciihex_lf.pdf
+%PDF-1.4
+1 0 obj
+<< /Type /Catalog
+/Outlines 2 0 R
+/Pages 3 0 R
+>>
+endobj
+2 0 obj
+<< /Type /Outlines
+/Count 0
+>>
+endobj
+3 0 obj
+<< /Type /Pages
+/Kids [ 4 0 R ]
+/Count 1
+>>
+endobj
+4 0 obj
+<< /Type /Page
+/Parent 3 0 R
+/MediaBox [ 0 0 612 792 ]
+/Contents 5 0 R
+/Resources << /ProcSet 6 0 R
+/Font << /F1 7 0 R >>
+>>
+>>
+endobj
+5 0 obj
+<< /Length 92 /Filter /ASCIIHexDecode>>
+stream
+540a2f46312032342054660a3130302031303
+02054640a282048656c6c6f20576f726c64202920546a0a45540a>
+endstream
+endobj
+6 0 obj
+[ /PDF /Text ]
+endobj
+7 0 obj
+<< /Type /Font
+/Subtype /Type1
+/Name /F1
+/BaseFont /Helvetica
+/Encoding /MacRomanEncoding
+>>
+endobj
+xref
+0 8
+0000000000 65535 f
+0000000009 00000 n
+0000000074 00000 n
+0000000120 00000 n
+0000000179 00000 n
+0000000322 00000 n
+0000000487 00000 n
+0000000517 00000 n
+trailer
+<< /Size 8
+/Root 1 0 R
+>>
+startxref
+625
+%%EOF
blob - /dev/null
blob + 7f887ccce2c1c995ea65f1524e28685713f435d3 (mode 644)
Binary files /dev/null and t/hello_lzwdecode_march.pdf differ
blob - /dev/null
blob + 8f14de953d0952359834e9c7e67874246996a6ec (mode 644)
--- /dev/null
+++ t/hello_runlength.pdf
+%PDF-1.4
+1 0 obj
+<< /Type /Catalog
+/Outlines 2 0 R
+/Pages 3 0 R
+>>
+endobj
+2 0 obj
+<< /Type /Outlines
+/Count 0
+>>
+endobj
+3 0 obj
+<< /Type /Pages
+/Kids [ 4 0 R ]
+/Count 1
+>>
+endobj
+4 0 obj
+<< /Type /Page
+/Parent 3 0 R
+/MediaBox [ 0 0 612 792 ]
+/Contents 5 0 R
+/Resources << /ProcSet 6 0 R
+/Font << /F1 7 0 R >>
+>>
+>>
+endobj
+5 0 obj
+<< /Length 23 /Filter /RunLengthDecode>>
+stream
+ AAAAAAAAAAAAAAAAAAAA@B
+endstream
+endobj
+6 0 obj
+[ /PDF /Text ]
+endobj
+7 0 obj
+<< /Type /Font
+/Subtype /Type1
+/Name /F1
+/BaseFont /Helvetica
+/Encoding /MacRomanEncoding
+>>
+endobj
+xref
+0 8
+0000000000 65535 f
+0000000009 00000 n
+0000000074 00000 n
+0000000120 00000 n
+0000000179 00000 n
+0000000322 00000 n
+0000000419 00000 n
+0000000450 00000 n
+trailer
+<< /Size 8
+/Root 1 0 R
+>>
+startxref
+557
+%%EOF
blob - /dev/null
blob + a7919358495066d05aba0366b1c55e4b38f109d2 (mode 644)
Binary files /dev/null and t/lzw.pdf differ
blob - /dev/null
blob + 38ecd30a3ed38605d2cb752efb66dc8ecb4861c6 (mode 644)
--- /dev/null
+++ lzw-lib.c
+////////////////////////////////////////////////////////////////////////////
+// **** LZW-AB **** //
+// Adjusted Binary LZW Compressor/Decompressor //
+// Copyright (c) 2016 David Bryant //
+// All Rights Reserved //
+// Distributed under the BSD Software License (see license.txt) //
+////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "lzw-lib.h"
+
+/* This library implements the LZW general-purpose data compression algorithm.
+ * The algorithm was originally described as a hardware implementation by
+ * Terry Welsh here:
+ *
+ * Welch, T.A. “A Technique for High-Performance Data Compression.”
+ * IEEE Computer 17,6 (June 1984), pp. 8-19.
+ *
+ * Since then there have been enumerable refinements and variations on the
+ * basic technique, and this implementation is no different. The target of
+ * the present implementation is embedded systems, and so emphasis was placed
+ * on simplicity, fast execution, and minimal RAM usage.
+ *
+ * The symbols are stored in adjusted binary, which provides considerably
+ * better compression performance with virtually no speed penalty compared to
+ * the fixed sizes normally used. To ensure good performance on data with
+ * varying characteristics (like executable images) the encoder resets as
+ * soon as the dictionary is full. Also, worst-case performance is limited
+ * to about 8% inflation by catching poor performance and forcing an early
+ * reset before longer symbols are sent.
+ *
+ * The maximum symbol size is configurable on the encode side (from 9 bits
+ * to 12 bits) and determines the RAM footprint required by both sides and,
+ * to a large extent, the compression performance. This information is
+ * communicated to the decoder in the first stream byte so that it can
+ * allocate accordingly. The RAM requirements are as follows:
+ *
+ * maximum encoder RAM decoder RAM
+ * symbol size requirement requirement
+ * -----------------------------------------
+ * 9-bit 1792 bytes 1024 bytes
+ * 10-bit 4352 bytes 3072 bytes
+ * 11-bit 9472 bytes 7168 bytes
+ * 12-bit 19712 bytes 15360 bytes
+ *
+ * This implementation uses malloc(), but obviously an embedded version could
+ * use static arrays instead if desired (assuming that the maxbits was
+ * controlled outside).
+ */
+
+#define NULL_CODE -1 // indicates a NULL prefix
+#define CLEAR_CODE 256 // code to flush dictionary and restart decoder
+#define EOD_CODE 257 // used in PDF's LZWDecode to signal end of data
+#define FIRST_STRING 258 // code of first dictionary string, PDF edition
+
+/* This macro writes the adjusted-binary symbol "code" given the maximum
+ * symbol "maxcode". A macro is used here just to avoid the duplication in
+ * the lzw_compress() function. The idea is that if "maxcode" is not one
+ * less than a power of two (which it rarely will be) then this code can
+ * often send fewer bits that would be required with a fixed-sized code.
+ *
+ * For example, the first code we send will have a "maxcode" of 257, so
+ * every "code" would normally consume 9 bits. But with adjusted binary we
+ * can actually represent any code from 0 to 253 with just 8 bits -- only
+ * the 4 codes from 254 to 257 take 9 bits.
+ */
+
+#define WRITE_CODE(code,maxcode) do { \
+ int code_bits = (maxcode) < 1024 ? \
+ ((maxcode) < 512 ? 8 : 9) : \
+ ((maxcode) < 2048 ? 10 : 11); \
+ int extras = (1 << (code_bits + 1)) - (maxcode) - 1; \
+ if ((code) < extras) { \
+ shifter |= ((long)(code) << bits); \
+ bits += code_bits; \
+ } \
+ else { \
+ shifter |= ((long)(((code) + extras) >> 1) << bits); \
+ bits += code_bits; \
+ shifter |= ((long)(((code) + extras) & 1) << bits++); \
+ } \
+ do { (*dst)(shifter); shifter >>= 8; output_bytes++; \
+ } while ((bits -= 8) >= 8); \
+} while (0)
+
+/* LZW compression function. Bytes (8-bit) are read and written through callbacks and the
+ * "maxbits" parameter specifies the maximum symbol size (9-12), which in turn determines
+ * the RAM requirement and, to a large extent, the level of compression achievable. A return
+ * value of EOF from the "src" callback terminates the compression process. A non-zero return
+ * value indicates one of the two possible errors -- bad "maxbits" param or failed malloc().
+ */
+
+int lzw_compress (void (*dst)(int), int (*src)(void), int maxbits)
+{
+ int next = FIRST_STRING, prefix = NULL_CODE, bits = 0, total_codes, c;
+ unsigned long input_bytes = 0, output_bytes = 0;
+ short *first_references, *next_references;
+ unsigned char *terminators;
+ unsigned long shifter = 0;
+
+ if (maxbits < 9 || maxbits > 12) // check for valid "maxbits" setting
+ return 1;
+
+ // based on the "maxbits" parameter, compute total codes and allocate dictionary storage
+
+ total_codes = 1 << maxbits;
+ first_references = malloc (total_codes * sizeof (first_references [0]));
+ next_references = malloc ((total_codes - 256) * sizeof (next_references [0]));
+ terminators = malloc ((total_codes - 256) * sizeof (terminators [0]));
+
+ if (!first_references || !next_references || !terminators)
+ return 1; // failed malloc()
+
+ // clear the dictionary
+
+ memset (first_references, 0, total_codes * sizeof (first_references [0]));
+ memset (next_references, 0, (total_codes - 256) * sizeof (next_references [0]));
+ memset (terminators, 0, (total_codes - 256) * sizeof (terminators [0]));
+
+ (*dst)(maxbits - 9); // first byte in output stream indicates the maximum symbol bits
+
+ // This is the main loop where we read input bytes and compress them. We always keep track of the
+ // "prefix", which represents a pending byte (if < 256) or string entry (if >= FIRST_STRING) that
+ // has not been sent to the decoder yet. The output symbols are kept in the "shifter" and "bits"
+ // variables and are sent to the output every time 8 bits are available (done in the macro).
+
+ while ((c = (*src)()) != EOF) {
+ int cti; // coding table index
+
+ input_bytes++;
+
+ if (prefix == NULL_CODE) { // this only happens the very first byte when we don't yet have a prefix
+ prefix = c;
+ continue;
+ }
+
+ if ((cti = first_references [prefix])) { // if any longer strings are built on the current prefix...
+ while (1)
+ if (terminators [cti - 256] == c) { // we found a matching string, so we just update the prefix
+ prefix = cti; // to that string and continue without sending anything
+ break;
+ }
+ else if (!next_references [cti - 256]) { // this string did not match the new character and
+ next_references [cti - 256] = next; // there aren't any more, so we'll add a new string
+ cti = 0; // and point to it with "next_reference"
+ break;
+ }
+ else
+ cti = next_references [cti - 256]; // there are more possible matches to check, so loop back
+ }
+ else // no longer strings are based on the current prefix, so now
+ first_references [prefix] = next; // the current prefix plus the new byte will be the next string
+
+ // If "cti" is zero, we could not simply extend our "prefix" to a longer string because we did not find a
+ // dictionary match, so we send the symbol representing the current "prefix" and add the new string to the
+ // dictionary. Since the current byte "c" was not included in the prefix, that now becomes our new prefix.
+
+ if (!cti) {
+ WRITE_CODE (prefix, next); // send symbol for current prefix (0 to next-1)
+ terminators [next - 256] = c; // newly created string has current byte as the terminator
+ prefix = c; // current byte also becomes new prefix for next string
+
+ // This is where we bump the next string index and decide whether to clear the dictionary and start over.
+ // The triggers for that are either the dictionary is full or we've been outputting too many bytes and
+ // decide to cut our losses before the symbols get any larger. Note that for the dictionary full case we
+ // do NOT send the CLEAR_CODE because the decoder knows about this and we don't want to be redundant.
+
+ if (++next == total_codes || output_bytes > 8 + input_bytes + (input_bytes >> 4)) {
+ if (next < total_codes)
+ WRITE_CODE (CLEAR_CODE, next);
+
+ // clear the dictionary and reset the byte counters -- basically everything starts over
+ // except that we keep the last pending "prefix" (which, of course, was never sent)
+
+ memset (first_references, 0, total_codes * sizeof (first_references [0]));
+ memset (next_references, 0, (total_codes - 256) * sizeof (next_references [0]));
+ memset (terminators, 0, (total_codes - 256) * sizeof (terminators [0]));
+ input_bytes = output_bytes = 0;
+ next = FIRST_STRING;
+ }
+ }
+ }
+
+ // we're done with input, so if we've received anything we still need to send that pesky pending prefix...
+
+ if (prefix != NULL_CODE) {
+ WRITE_CODE (prefix, next);
+
+ if (++next == total_codes) // watch for clearing to the first string to stay in step with the decoder!
+ next = FIRST_STRING; // (this was actually a corner-case bug that did not trigger often)
+ }
+
+ WRITE_CODE (next, next); // the maximum possible code is always reserved for our END_CODE
+
+ if (bits) // finally, flush any pending bits from the shifter
+ (*dst)(shifter);
+
+ free (terminators); free (next_references); free (first_references);
+ return 0;
+}
+
+/* LZW decompression function. Bytes (8-bit) are read and written through callbacks.
+ * A return value of EOF from the "src" callback terminates the compression process
+ * (although this should not normally occur). A non-zero return value
+ * indicates an error, which in this case can be a
+ * failed malloc(), or if an EOF is read from the input stream before the compression
+ * terminates naturally with END_CODE.
+ */
+
+int lzw_decompress (void (*dst)(int), int (*src)(void))
+{
+ int read_byte, next = FIRST_STRING, prefix = CLEAR_CODE, bits = 0, total_codes;
+ unsigned char *terminators, *reverse_buffer;
+ unsigned long shifter = 0;
+ short *prefixes;
+
+ // PDF specific change: maxbits is not in the input stream
+ // we'll just be pessimistic and allocate the maximal size buffer
+
+ total_codes = 4096;
+ reverse_buffer = malloc ((total_codes - 256) * sizeof (reverse_buffer [0]));
+ prefixes = malloc ((total_codes - 256) * sizeof (prefixes [0]));
+ terminators = malloc ((total_codes - 256) * sizeof (terminators [0]));
+
+ if (!reverse_buffer || !prefixes || !terminators) // check for mallco() failure
+ return 1;
+
+ // This is the main loop where we read input symbols. The values range from 0 to the code value
+ // of the "next" string in the dictionary. Note that receiving an EOF from the input
+ // stream is actually an error because we should have gotten the END_CODE first.
+
+ while (1) {
+ int code_bits = next < 512 ? 9 : (next < 1024 ? 10 : (next < 2048 ? 11 : 12) ), code;
+
+ #define TOP_BITMASK (((1 << code_bits) - 1) << (bits - code_bits) )
+ #define BOTTOM_BITMASK ((1 << (bits - code_bits)) - 1)
+
+ do {
+ if ((read_byte = ((*src)())) == EOF) {
+ free (terminators); free (prefixes); free (reverse_buffer);
+ return 1;
+ }
+
+ /* shifter reworked: everything shifted left by a byte,
+ * and the byte we just read becomes the least significant
+ * byte */
+
+ // prepare to shift in next byte
+ shifter <<= 8;
+ /* the bitstrings forming the symbols are stored MSB first,
+ * so we can just OR in the next */
+ shifter |= (unsigned long) read_byte;
+ } while ((bits += 8) < code_bits);
+
+
+ /* for a 12-bit code, the shifter's bits now look like
+ * from MSB to LSB: 00...0cccccccccn...n
+ * where c are the bits of our code
+ * and n are the bits we're not yet interested in
+ * the number of times n is repeated is bits - code_bits
+ * ie. the number of bits read in minus the bits we're interested in */
+
+ // shift our code bits into thier proper place, and save it as the final code
+ code = (int) shifter >> (bits - code_bits);
+ /* we can now clear the shifter's top bits. the result looks like:
+ * 00...0n...n
+ * number of n is bits-code_bits
+ * */
+ shifter &= BOTTOM_BITMASK;
+ // update the count of bytes in the shifter
+ bits -= code_bits;
+
+ if (code == EOD_CODE) // In PDF, EOD is signalled by 257, rather than the max code
+ break;
+ else if (code == CLEAR_CODE) // otherwise check for a CLEAR_CODE to start over early
+ next = FIRST_STRING;
+ else if (prefix == CLEAR_CODE) { // this only happens at the first symbol which is always sent
+ (*dst)(code); // literally and becomes our initial prefix
+ next++;
+ }
+ // Otherwise we have a valid prefix so we step through the string from end to beginning storing the
+ // bytes in the "reverse_buffer", and then we send them out in the proper order. One corner-case
+ // we have to handle here is that the string might be the same one that is actually being defined
+ // now (code == next-1). Also, the first 256 entries of "terminators" and "prefixes" are fixed and
+ // not allocated, so that messes things up a bit.
+ else {
+ int cti = (code == next-1) ? prefix : code;
+ unsigned char *rbp = reverse_buffer, c;
+
+ do *rbp++ = cti < 256 ? cti : terminators [cti - 256]; // step backward through string...
+ while ((cti = (cti < 256) ? NULL_CODE : prefixes [cti - 256]) != NULL_CODE);
+
+ c = *--rbp; // the first byte in this string is the terminator for the last string, which is
+ // the one that we'll create a new dictionary entry for this time
+
+ do (*dst)(*rbp); // send string in corrected order (except for the terminator
+ while (rbp-- != reverse_buffer); // which we don't know yet)
+
+ if (code == next-1)
+ (*dst)(c);
+
+ prefixes [next - 1 - 256] = prefix; // now update the next dictionary entry with the new string
+ terminators [next - 1 - 256] = c; // (but we're always one behind, so it's not the string just sent)
+
+ if (++next == total_codes) // check for full dictionary, which forces a reset (and, BTW,
+ next = FIRST_STRING; // means we'll never use the dictionary entry we just wrote)
+ }
+
+ prefix = code; // the code we just received becomes the prefix for the next dictionary string entry
+ // (which we'll create once we find out the terminator)
+ }
+
+ free (terminators); free (prefixes); free (reverse_buffer);
+ return 0;
+}
blob - /dev/null
blob + 81fdeb15e6ade7ef0fd6089a4fdc3d3f2d593578 (mode 644)
--- /dev/null
+++ lzw-lib.h
+////////////////////////////////////////////////////////////////////////////
+// **** LZW-AB **** //
+// Adjusted Binary LZW Compressor/Decompressor //
+// Copyright (c) 2016 David Bryant //
+// All Rights Reserved //
+// Distributed under the BSD Software License (see license.txt) //
+////////////////////////////////////////////////////////////////////////////
+
+#ifndef LZWLIB_H_
+#define LZWLIB_H_
+
+int lzw_compress (void (*dst)(int), int (*src)(void), int maxbits);
+int lzw_decompress (void (*dst)(int), int (*src)(void));
+
+#endif /* LZWLIB_H_ */