commit f31903c8acf3479533aa88556b5632ef54159fe3 from: pompolic date: Mon May 4 15:06:04 2020 UTC Merge branch '2020-04-27_RELEASE' into 'master' Bring master up to date See merge request pesco/pdf!12 commit - 1191d61e7323c479a891b8810b0d56996ab0075f commit + f31903c8acf3479533aa88556b5632ef54159fe3 blob - d9374c2925bf1ddb2966d2d99dba2cbc29f8989c blob + f39a06d3076d76b1399fdc6a8823d602bfbd097a --- LICENSE +++ LICENSE @@ -1,4 +1,6 @@ Copyright (c) 2019, 2020 Sven M. Hallberg +Copyright (c) 2020 pompolic +Copyright (c) 2020 Paul Vines Permission to use, copy, modify, and distribute this software for any purpose with or without fee is hereby granted, provided that the above blob - 9ca933445c2668987c6aa7bc34117e0f70f47fbd blob + 6154e1d8363343e12153e52a27e9826581afdc3a --- Makefile +++ Makefile @@ -1,4 +1,4 @@ -CFLAGS += -std=c99 -Wall -Werror +CFLAGS += -std=c99 -Wall -Werror -DLOG # find our hammer build - adjust this to your needs # i have, for instance, symlinks: @@ -8,6 +8,7 @@ HAMMER_INCLUDE = . HAMMER_LIB = ./lib CFLAGS += -I$(HAMMER_INCLUDE) LDFLAGS += -L$(HAMMER_LIB) +SOURCES = pdf.c lzw-lib.c .PHONY: all test clean all: pdf @@ -17,8 +18,8 @@ test: pdf 'for x in t/*.pdf; do ./pdf "$$x" >/dev/null && echo OK: "$$x"; done' @true -pdf: pdf.c - $(CC) -o $@ $(CFLAGS) $(LDFLAGS) $< -lhammer -lz +pdf: $(SOURCES) + $(CC) -o $@ $(CFLAGS) $(LDFLAGS) $(SOURCES) -lhammer -lz clean: rm -f pdf blob - 0b2cc8912c3730af5961ac3d76eaf5497e51a5f1 blob + a2da3272b5216acdead625a852d6cd6fdf91a017 --- README +++ README @@ -5,15 +5,61 @@ Beginnings of a PDF parser in Hammer https://gitlab.special-circumstanc.es/pesco/hammer/tree/pdf + For detailed build instructions, see README.md in that repository. + - Help the default Makefile find Hammer - $ ln -s ../hammer/src hammer - $ ln -s ../hammer/build/opt/src lib + $ ln -s ../hammer/src hammer # needed for building pdf, include files + $ ln -s ../hammer/build/opt/src lib # needed for running pdf, to locate libhammer.so - - Build/Usage: + - Notes for 2020-04-27 release: + The release branch has been tested to build with the 2020-04-27_RELEASE` branch located at https://gitlab.special-circumstanc.es/pesco/hammer/tree/2020-04-27_RELEASE + + - Build: + + $ pushd ../hammer; scons; popd # build Hammer $ make pdf - $ ./pdf test.pdf + - Usage: + + $ export LD_LIBRARY_PATH=./lib # see Troubleshooting section below to see if this is needed + $ ldd ./pdf | grep libhammer # verify that libhammer.so was found + $ ./pdf + # place some test files in the t/ directory... $ make test + + - Troubleshooting: + + libhammer.so not found: + + If Hammer is not installed as a system library, ld may fail to locate libhammer.so. The quick fix for this is altering LD_LIBRARY_PATH before running pdf: + + $ export LD_LIBRARY_PATH=./lib + $ make test + + The second solution is executing "scons install" when building Hammer, which will install it in ld's usual search path: + + $ pushd ../hammer; scons install; popd + # ... Update ldconfig cache if needed + $ make pdf + $ make test + + - Evaluating test results: + + For every file in the t/ directory, the pdf parser is executed. On successful parse, a message of the following form is displayed: + + OK: t/ + + In case of a non-fatal parse error, error messages may be displayed, but presence of the "OK" indicates pdf exited successfully. On a failed test run, only parse error messages are displayed. + + - Copyright: + + - pesco 2019,2020 + - pompolic 2020 + - Paul Vines 2020 + - David Bryant (modified lzw-ab code) + + See LICENSE and lzw-ab-license.txt for full copyright and licensing notice. + blob - f363e541d5613ebce67d335fc00c4288801a1982 blob + d649711646645c3e6fb1a86dac043752e2e4f12a --- pdf.c +++ pdf.c @@ -1,8 +1,11 @@ /* beginnings of a PDF parser in hammer * pesco 2019,2020 + * pompolic 2020 + * Paul Vines 2020 */ -#include /* strncmp(), memset() */ +#include /* strncmp(), memset(), memcpy() */ +#include /* exit() */ #include #include @@ -17,7 +20,14 @@ #define IN(STR) h_in((const uint8_t *)(STR), sizeof(STR) - 1) #define NOT_IN(STR) h_not_in((const uint8_t *)(STR), sizeof(STR) - 1) +#ifdef LOG +#define VIOL(P,VIOL) h_action(h_sequence(P, h_tell(), NULL), act_viol, VIOL) +#else +#define VIOL(P,VIOL) P +#endif + + /* * some helpers */ @@ -26,6 +36,7 @@ HParser *p_fail; HParser *p_epsilon; HParser *p_return_0; HParser *p_return_1; +uint8_t strictness = 0; /* a combinator to parse a given character but return a different value */ @@ -62,6 +73,20 @@ p_return_uint__m(HAllocator *mm__, uint64_t x) return h_action__m(mm__, p_epsilon, act_return_uint, (void *)x); } +/* like h_sepBy but parses a fixed number of elements */ +HParser * +p_sepBy_n__m(HAllocator *mm__, HParser *p, HParser *sep, size_t n) +{ + if (n == 0) + return p_epsilon; + + HParser *sep_p = h_sequence__m(mm__, sep, p, NULL); + HParser *tail = h_repeat_n__m(mm__, sep_p, n - 1); + HParser *seq = h_sequence__m(mm__, p, tail, NULL); + + return h_action__m(mm__, seq, h_act_flatten, NULL); +} + /* a helper to compare an HBytes to a string */ bool bytes_eq(HBytes b, const char *s) @@ -169,7 +194,10 @@ pp_parseresult(FILE *stream, const HParsedToken *tok, { HParseResult *res = H_CAST(HParseResult, tok); - h_pprint(stream, res->ast, indent, delta); + if (res == NULL) + fprintf(stream, "null"); + else + h_pprint(stream, res->ast, indent, delta); } void @@ -213,16 +241,290 @@ act_digit(const HParseResult *p, void *u) HParsedToken * act_hlower(const HParseResult *p, void *u) { - return H_MAKE_UINT(H_CAST_UINT(p->ast) - 'a'); + return H_MAKE_UINT(10 + H_CAST_UINT(p->ast) - 'a'); } HParsedToken * act_hupper(const HParseResult *p, void *u) { - return H_MAKE_UINT(H_CAST_UINT(p->ast) - 'A'); + return H_MAKE_UINT(10 + H_CAST_UINT(p->ast) - 'A'); } +HParsedToken* +act_hdigitpair(const HParseResult *p, void *u) +{ + uint8_t b = 0; + HCountedArray *seq = H_CAST_SEQ(p->ast); + size_t digits_processed = 0; + uint8_t digits[2]; + for(size_t i = 0; i < seq->used; ++i) + { + switch(seq->elements[i]->token_type) + { + case TT_UINT: + digits[digits_processed] = H_CAST_UINT(seq->elements[i]); + digits_processed++; + break; + default: + break; + } + assert(digits_processed == 2); + } + + b = (digits[0] << 4) + digits[1]; + return H_MAKE_UINT(b); +} + HParsedToken * +act_ahextruncated(const HParseResult *p, void *u) +{ + uint8_t b = 0; + HCountedArray *seq = H_CAST_SEQ(p->ast); + + /* Assumption: At this point seq->elements[0] is a hex digit + * and seq->elements[1] holds '>' (EOD) + */ + // XXX figure out how to compare to '>' + assert(seq->used == 2); + b = H_CAST_UINT(seq->elements[0]) << 4; + return H_MAKE_UINT(b); +} + +HParsedToken * +act_hs_end(const HParseResult *p, void *u) +{ + HParsedToken *res; + HCountedArray *seq = H_CAST_SEQ(p->ast); + assert(seq->used >= 1); + + res = H_MAKE_UINT(H_CAST_UINT(seq->elements[0])); + return res; +} + +HParsedToken * +act_ahexstream(const HParseResult *p, void *u) +{ + uint8_t *result_bytes; + size_t required_bytes; + HCountedArray *seq = H_CAST_SEQ(p->ast); + HParsedToken *res; + + /* Ignore the the last element, which is EOD */ + required_bytes = (seq->used - 1); + + result_bytes = h_arena_malloc(p->arena, sizeof(uint8_t) * required_bytes); + + /* memcpy all but the last group's bytes into a single array */ + for (size_t i = 0; i < seq->used-1; ++i) + { + assert(i < required_bytes); + result_bytes[i] = H_CAST_UINT(seq->elements[i]); + } + + + res = H_MAKE_BYTES(result_bytes, required_bytes); + return res; +} + + +HParsedToken * +act_a85zero(const HParseResult *p, void *u) +{ + uint32_t b = 0; + return H_MAKE_UINT(b); +} + +HParsedToken * +act_a85digit(const HParseResult *p, void *u) +{ + uint8_t b = H_CAST_UINT(p->ast); + b -= '!'; + + /* At this point we have the base85 value of one byte out of 5 */ + return H_MAKE_UINT(b); +} + +HParsedToken * +act_a85fivedigits(const HParseResult *p, void *u) +{ + uint64_t fourbytes = 0; + HCountedArray *seq = H_CAST_SEQ(p->ast); + HParsedToken **digits = h_seq_elements(p->ast); + + /* 2^32-1, the max value the group can hold as per spec */ + #define A85GRPMAX 4294967295 + + /* Only for groups that do not need to padded to 5 */ + assert(seq->used == 5); + fourbytes = H_CAST_UINT(digits[0]); + fourbytes = fourbytes * 85 + H_CAST_UINT(digits[1]); + fourbytes = fourbytes * 85 + H_CAST_UINT(digits[2]); + fourbytes = fourbytes * 85 + H_CAST_UINT(digits[3]); + fourbytes = fourbytes * 85 + H_CAST_UINT(digits[4]); + + assert(fourbytes <= A85GRPMAX); + return H_MAKE_UINT(fourbytes); +} + +/* Checking the following condition in the spec: + * The value represented by a group of 5 characters is greater than 2^32 - 1. + */ +bool +validate_a85fivedigits(HParseResult *p, void *u) +{ + /* "s8W-!" should be the highest accepted value */ + return H_CAST_UINT(p->ast) <= A85GRPMAX; +} + +HParsedToken * +act_a85group(const HParseResult *p, void *u) +{ + uint8_t *bytes = h_arena_malloc(p->arena, sizeof(uint8_t) * 4); + uint32_t fourbytes = H_CAST_UINT(p->ast); + + bytes[0] = (fourbytes & 0xFF000000) >> 24; + bytes[1] = (fourbytes & 0x00FF0000) >> 16; + bytes[2] = (fourbytes & 0x0000FF00) >> 8; + bytes[3] = (fourbytes & 0x000000FF); + + HParsedToken *b = H_MAKE_BYTES(bytes, 4); + return b; +} + +HParsedToken * +act_a85partial2group(const HParseResult *p, void *u) +{ + uint64_t fourbytes = 0; + HCountedArray *seq = H_CAST_SEQ(p->ast); + HParsedToken **digits = h_seq_elements(p->ast); + + assert(seq->used == 2); + fourbytes = H_CAST_UINT(digits[0]); + fourbytes = fourbytes * 85 + H_CAST_UINT(digits[1]); + fourbytes *= 85 * 85 * 85; + + assert(fourbytes <= A85GRPMAX); + return H_MAKE_UINT(fourbytes); +} + +bool +validate_a85partial2group(HParseResult *p, void *u) +{ + return H_CAST_UINT(p->ast) <= A85GRPMAX; +} + +HParsedToken * +act_a85partial3group(const HParseResult *p, void *u) +{ + uint64_t fourbytes = 0; + HCountedArray *seq = H_CAST_SEQ(p->ast); + HParsedToken **digits = h_seq_elements(p->ast); + + assert(seq->used == 3); + fourbytes = H_CAST_UINT(digits[0]); + fourbytes = fourbytes * 85 + H_CAST_UINT(digits[1]); + fourbytes = fourbytes * 85 + H_CAST_UINT(digits[2]); + fourbytes *= 85 * 85; + + assert(fourbytes <= A85GRPMAX); + return H_MAKE_UINT(fourbytes); +} + +bool +validate_a85partial3group(HParseResult *p, void *u) +{ + return H_CAST_UINT(p->ast) <= A85GRPMAX; +} + +HParsedToken * +act_a85partial4group(const HParseResult *p, void *u) +{ + uint64_t fourbytes = 0; + HCountedArray *seq = H_CAST_SEQ(p->ast); + HParsedToken **digits = h_seq_elements(p->ast); + + assert(seq->used == 4); + fourbytes = H_CAST_UINT(digits[0]); + fourbytes = fourbytes * 85 + H_CAST_UINT(digits[1]); + fourbytes = fourbytes * 85 + H_CAST_UINT(digits[2]); + fourbytes = fourbytes * 85 + H_CAST_UINT(digits[3]); + fourbytes *= 85; + + assert(fourbytes <= A85GRPMAX); + return H_MAKE_UINT(fourbytes); +} + +bool +validate_a85partial4group(HParseResult *p, void *u) +{ + return H_CAST_UINT(p->ast) <= A85GRPMAX; +} + +HParsedToken * +act_a85partialgroup(const HParseResult *p, void *u) +{ + uint8_t bytes_helper[4]; + size_t bytes_used = 0; + uint8_t *bytes; + + uint32_t fourbytes = H_CAST_UINT(p->ast); + + /* Scan the uint backwards to find the first non-zero byte */ + for (size_t i = 3; i > 0; --i) + { + /* Shift by 0, 8, 16 and 24 to get the correct byte */ + bytes_helper[i] = (fourbytes >> ((3-i) * 8)) & 0xFF; + /* If we haven't set bytes_used yet, and the particular byte is nonzero */ + if (!bytes_used && bytes_helper[i]) + bytes_used = i; + } + assert(bytes_used > 0); + + bytes = h_arena_malloc(p->arena, bytes_used); + return H_MAKE_BYTES(bytes, bytes_used); +} + +HParsedToken * +act_a85string(const HParseResult *p, void *u) +{ + uint8_t *result_bytes; + size_t chunk_number; + size_t required_bytes; + size_t out_pos = 0; + HCountedArray *seq = H_CAST_SEQ(p->ast); + HParsedToken *res; + + /* Number of 4-byte chunks, minus the potential last partial group and EOD */ + chunk_number = seq->used - 2; + + /* Special-case: last chunk before EOD may be 4, 3, 2 or 1 bytes + * The latter two happening if the group was parsed from a partial + * group consisting less than 5 chars */ + HBytes *last_chunk = &H_CAST_BYTES(seq->elements[seq->used-1]); + required_bytes = (chunk_number * 4 + last_chunk->len); + + result_bytes = h_arena_malloc(p->arena, sizeof(uint8_t) * required_bytes); + + /* memcpy all but the last group's bytes into a single array */ + for (size_t i = 0; i < seq->used-1; ++i) + { + HBytes *chunk = &H_CAST_BYTES(seq->elements[i]); + assert(out_pos < required_bytes); + memcpy(&(result_bytes[out_pos]), chunk->token, 4); + out_pos += 4; + assert(out_pos < required_bytes); + } + + memcpy(&(result_bytes[out_pos]), last_chunk->token, last_chunk->len); + out_pos += last_chunk->len; + /* We should have filled the array exactly by this point */ + assert(out_pos == required_bytes-1); + + res = H_MAKE_BYTES(result_bytes, required_bytes); + return res; +} + +HParsedToken * act_nat(const HParseResult *p, void *u) { uint64_t x = 0; @@ -237,6 +539,34 @@ act_nat(const HParseResult *p, void *u) #define act_xroff act_nat #define act_xrgen act_nat +HParser *p_violsev; +HParsedToken * +act_viol(const HParseResult *p, void *viol) +{ + uint32_t severity; + uint32_t offset; + HParseResult *severity_parse; + viol = (uint8_t *) viol; + severity_parse = h_parse(p_violsev, viol, strlen((char *)viol)); + if (!severity_parse) { + fprintf(stderr, "Severity for violaiton %s not assigned!\n", (char *)viol); + severity = 99999; + } + else { + severity = severity_parse->ast->seq->elements[0]->uint; + } + offset = p->ast->seq->elements[p->ast->seq->used-1]->uint / 8; + fprintf(stderr, "VIOLATION[%d]@%d (0x%x): %s\n", severity, offset, offset, (char *) viol); + if (strictness && severity > strictness) { + exit(1); + } + /* Just return the parse AST, drop the h_tell */ + if (p->ast->seq->used == 1) { + return (HParsedToken *) NULL; + } + return (HParsedToken *) p->ast->seq->elements[0]; +} + bool validate_pnat(HParseResult *p, void *u) { @@ -327,7 +657,7 @@ act_nesc(const HParseResult *p, void *u) return H_MAKE_UINT(H_FIELD_UINT(1)*16 + H_FIELD_UINT(2)); } -#define act_str_ h_act_flatten +#define act_schars h_act_flatten #define act_string act_token HParsedToken * @@ -341,9 +671,6 @@ act_octal(const HParseResult *p, void *u) return H_MAKE_UINT(x); } -#define act_oct3 act_octal -#define act_oct2 act_octal -#define act_oct1 act_octal HParsedToken * act_xrent(const HParseResult *p, void *u) @@ -458,7 +785,61 @@ act_dict_(const HParseResult *p, void *env) #define act_array_ h_act_flatten +HParsedToken * +act_shortlength(const HParseResult *p, void *u) +{ + uint8_t length = H_CAST_UINT(p->ast); + /* Length can range from 0-127, corresponding to the range 1-128, inclusive */ + uint8_t finallength = length+1; + return H_MAKE_UINT(finallength); +} + +HParsedToken * +act_longlength(const HParseResult *p, void *u) +{ + uint8_t length = H_CAST_UINT(p->ast); + uint8_t finallength = 257-length; + + return H_MAKE_UINT(finallength); +} + +HParsedToken * +act_longrun(const HParseResult *p, void *u) +{ + HParsedToken **elements = h_seq_elements(p->ast); + HParsedToken *res = H_MAKE_SEQ(); + + uint8_t length = H_CAST_UINT(elements[0]); + uint8_t data = H_CAST_UINT(elements[1]); + + for (size_t len = 0; len < length; ++len) + { + h_seq_snoc(res, H_MAKE_UINT(data)); + } + + return res; +} + +HParsedToken * +act_rldstring(const HParseResult *p, void *u) +{ + const HParsedToken *flattened = h_seq_flatten(p->arena, p->ast); + HCountedArray *flattened_seq = H_CAST_SEQ(flattened); + uint8_t bytes_required; + uint8_t *result_bytes; + + bytes_required = flattened_seq->used - 1; + result_bytes = h_arena_malloc(p->arena, sizeof(uint8_t) * bytes_required); + + for (size_t i = 0; i < flattened_seq->used-1; ++i) + { + result_bytes[i] = H_CAST_UINT(flattened_seq->elements[i]); + } + + return H_MAKE_BYTES(result_bytes, bytes_required); +} + /* * input grammar */ @@ -468,12 +849,37 @@ HParser *p_pdfdbg; HParser *p_startxref; HParser *p_xref; HParser *p_objdef; +HParser *p_a85string; +HParser *p_ahexstream; +HParser *p_rldstring; +HParser *p_ws; +HParser *p_wel; +HParser *p_elemr; +HParser *p_npair; /* continuations for h_bind() */ HParser *kstream(HAllocator *, const HParsedToken *, void *); HParser *kxstream(HAllocator *, const HParsedToken *, void *); void +init_runlengthdecode_parser(struct Env *aux) +{ + H_RULE(rldeod, h_ch(0x80)); + H_ARULE(longlength, h_ch_range(0x81, 0xFF)); + H_ARULE(shortlength, h_ch_range(0x0, 0x7F)); + + H_RULE(shortdata, h_uint8()); + H_RULE(longdata, h_uint8()); + + H_RULE(shortrun, h_length_value(shortlength, shortdata)); + H_ARULE(longrun, SEQ(longlength, longdata)); + + H_ARULE(rldstring, SEQ(h_many(CHX(shortrun, longrun)), IGN(rldeod))); + + p_rldstring = rldstring; +} + +void init_parser(struct Env *aux) { TT_HParseResult = h_allocate_token_new("HParseResult", NULL, pp_parseresult); @@ -497,6 +903,7 @@ init_parser(struct Env *aux) //H_RULE(dchar, IN(DCHARS)); /* delimiter */ H_RULE(rchar, NOT_IN(WCHARS DCHARS)); /* regular */ H_RULE(nchar, NOT_IN(WCHARS DCHARS "#")); /* name */ + H_RULE(schar, NOT_IN("()\n\r\\")); /* string literal */ H_ARULE(digit, h_ch_range('0', '9')); H_ARULE(pdigit, h_ch_range('1', '9')); H_ARULE(hlower, h_ch_range('a', 'f')); @@ -551,16 +958,10 @@ init_parser(struct Env *aux) /* numbers */ H_ARULE(sign, CHX(minus, IGN(plus))); H_VRULE(intnn, nat); - #if 1 H_ARULE(realnn, CHX(SEQ(digits, period, digits), /* 12.3 */ SEQ(digits, period, empty), /* 123. */ SEQ(empty, period, digits))); /* .123 */ // XXX ^ we _could_ move the "123." case into intnn... - #else - // XXX the .123 case above somehow leads to a conflict with litstr... - H_ARULE(realnn, CHX(SEQ(digits, period, digits), /* 12.3 */ - SEQ(digits, period, empty))); /* 123. */ - #endif H_RULE(numbnn, CHX(realnn, intnn)); H_RULE(snumb, SEQ(sign, numbnn)); H_ARULE(numb, CHX(snumb, numbnn)); @@ -570,59 +971,24 @@ init_parser(struct Env *aux) H_ARULE(nstr, h_many(CHX(nchar, nesc))); /* '/' is valid */ H_RULE(name, h_right(slash, nstr)); - /* strings - * - * this is so convoluted in order to make it LALR including the - * precedence rules for octal escapes ("\123" vs "\12 3" vs "\1 23") - * and end-of-line ("CRLF" vs "CR LF"). - * - * we have to split the base rule 'str' into variants 'str_o' and - * 'str_l' depending on whether they may start with an octal digit or - * linefeed, respectively. - */ - H_RULE(str_ol, h_indirect()); - H_RULE(str_o, h_indirect()); - H_RULE(str_l, h_indirect()); - H_RULE(str, h_indirect()); + /* strings */ + H_RULE(snest, h_indirect()); H_RULE(bsn, p_mapch('n', 0x0a)); /* LF */ H_RULE(bsr, p_mapch('r', 0x0d)); /* CR */ H_RULE(bst, p_mapch('t', 0x09)); /* HT */ H_RULE(bsb, p_mapch('b', 0x08)); /* BS (backspace) */ H_RULE(bsf, p_mapch('f', 0x0c)); /* FF */ H_RULE(escape, CHX(bsn, bsr, bst, bsb, bsf, lparen, rparen, bslash)); - H_ARULE(oct3, REP(odigit,3)); - H_ARULE(oct2, REP(odigit,2)); - H_ARULE(oct1, REP(odigit,1)); - H_RULE(octesc, CHX(SEQ(oct3, str), - SEQ(oct2, str_o), - SEQ(oct1, str_o))); - H_RULE(eolesc, CHX(SEQ(IGN(crlf), str), - SEQ(IGN(cr), str_l), - SEQ(IGN(lf), str))); - H_RULE(schar_o, NOT_IN("()\n\r\\" "01234567")); - H_RULE(schar_e, NOT_IN("()\n\r\\" "01234567" "nrtbf")); - H_RULE(str_o_, CHX(SEQ(lf, str), str_ol)); /* str "but not" odigit */ - H_RULE(str_l_, CHX(SEQ(odigit, str), str_ol)); /* str "but not" lf */ - H_RULE(str_ol_, CHX(SEQ(cr, str_l), /* str "but neither" */ - SEQ(crlf, str), - SEQ(schar_o, str), - SEQ(lparen, str, rparen, str), - SEQ(IGN(bslash), escape, str), - SEQ(IGN(bslash), schar_e, str), /* "lone" bs */ - /* NB: ^ lone backslashes are to be ignored per spec, but we - * let them "escape" with the following character. this works - * because they are never truly alone. */ - SEQ(IGN(bslash), octesc), - SEQ(IGN(bslash), eolesc), /* line split */ - epsilon)); - H_ARULE(str_, CHX(SEQ(lf, str), SEQ(odigit, str), str_ol)); - H_RULE(litstr, h_middle(lparen, str, rparen)); + H_ARULE(octal, CHX(REP(odigit,3), REP(odigit,2), REP(odigit,1))); + H_RULE(wrap, IGN(eol)); + H_RULE(sesc, h_right(bslash, CHX(escape, octal, wrap, epsilon))); + /* NB: lone backslashes and escaped newlines are ignored */ + H_ARULE(schars, h_many(CHX(schar, snest, sesc, eol))); + H_RULE(snest_, SEQ(lparen, schars, rparen)); + H_RULE(litstr, h_middle(lparen, schars, rparen)); H_RULE(hexstr, h_middle(langle, MANY_WS(hdigit), rangle)); H_ARULE(string, CHX(litstr, hexstr)); - h_bind_indirect(str_ol, str_ol_); - h_bind_indirect(str_o, str_o_); - h_bind_indirect(str_l, str_l_); - h_bind_indirect(str, str_); + h_bind_indirect(snest, snest_); H_RULE(array, h_indirect()); H_RULE(dict, h_indirect()); @@ -635,8 +1001,9 @@ init_parser(struct Env *aux) /* dictionaries */ H_RULE(dopen, LIT("<<")); H_RULE(dclose, LIT(">>")); - H_RULE(k_v, CHX(SEQ(name, wel,ws, obj), - SEQ(name, CHX(name,dobj)))); + H_RULE(k_v, CHX(CHX(SEQ(name, wel,ws, obj), + SEQ(name, CHX(name,dobj))), + VIOL(SEQ(name, wel,ws), "Key with no value (severity=2)"))); H_ARULE(dict_, h_middle(dopen, MANY_WS(k_v), dclose)); // XXX this allows, for instance, "<<<<" to be parsed as "<< <<". ok? // XXX validate: dict keys must be unique @@ -659,34 +1026,58 @@ init_parser(struct Env *aux) h_bind_indirect(array, array_); /* streams */ - H_RULE(stmbeg, SEQ(dict, ws, LIT("stream"), OPT(cr), lf)); - H_RULE(stmend, SEQ(OPT(eol), LIT("endstream"))); + H_RULE(stmbeg, SEQ(dict, OPT(ws), LIT("stream"), OPT(cr), lf)); + H_RULE(stmend, CHX(SEQ(eol, LIT("endstream")), + VIOL(LIT("ndstream"), "Stream length >1-too-long (severity=10)"), + VIOL(SEQ(h_many1(wchar), LIT("endstream")), + "No newline before endstream (severity=7)"), + VIOL(LIT("endstream"), "Stream length 1-too-long (severity=9)"), + VIOL(SEQ(OPT(h_ch_range(0, 255)), OPT(eol), LIT("endstream")), + "Stream length 1-too-short (severity=4)"), + VIOL(SEQ(h_many1(h_butnot(h_ch_range(0, 255), CHX(KW("endobj"), + SEQ(npair, wel, KW("obj")), + KW("xref"), + LIT("endstream")))), LIT("endstream")), + "Stream length >1-too-short (severity=5)"), + VIOL(h_many1(h_butnot(h_ch_range(0, 255), CHX(KW("endobj"), + SEQ(npair, wel, KW("obj")), + KW("xref")))), + "Missing endstream token (severity=7)"))); + H_RULE(stream, h_left(h_bind(stmbeg, kstream, aux), stmend)); // XXX is whitespace allowed between the eol and "endstream"? + // peter wyatt says no. (2020-03-25) /* * file structure */ /* header */ - H_RULE(version, SEQ(pdigit, IGN(period), pdigit)); + H_RULE(version, SEQ(pdigit, IGN(period), digit)); H_RULE(header, h_middle(LIT("%PDF-"), version, nl)); /* body */ H_RULE(indobj, CHX(stream, obj)); - H_RULE(objdef, SEQ(ws, npair, wel, KW("obj"), ws, indobj, KW("endobj"))); + H_RULE(objdef, SEQ(ws, npair, wel, KW("obj"), ws, indobj, + CHX(VIOL(SEQ(OPT(ws), OPT(lws), KW("endobj"), h_many(CHX(wel, eol)), h_many1(KW("endobj"))), + "More than 1 endobj token (severity=1)"), + VIOL(SEQ(OPT(ws), OPT(lws), KW("endobj"), h_many(CHX(wel, eol)), h_many1(SEQ(dclose, h_many1(CHX(wchar, eol)), KW("endobj")))), + "More than 1 >> and endobj token (severity=2)"), + SEQ(OPT(ws), OPT(lws), KW("endobj")), + VIOL(h_optional(KW("endobj")), "Missing endobj token (severity=1)")))); H_RULE(body, h_many(objdef)); - /* for object streams */ - //H_RULE(osidx, h_sepBy(npair, SEQ(wel,ws))); - /* cross-reference section */ H_RULE(xreol, CHX(SEQ(sp, cr), SEQ(sp, lf), crlf)); // ^ XXX does the real world follow this rule?! cf. loop.pdf H_RULE(xrtyp, CHX(h_ch('n'), h_ch('f'))); H_ARULE(xroff, REP(digit, 10)); H_ARULE(xrgen, REP(digit, 5)); - H_ARULE(xrent, SEQ(xroff, IGN(sp), xrgen, IGN(sp), xrtyp, IGN(xreol))); + H_ARULE(xrent, SEQ(xroff, IGN(CHX(VIOL(SEQ(lwchar, h_many1(lwchar)), "Multi-WS in xref offset_gen entry (severity=1)"), sp)), + xrgen, IGN(CHX(VIOL(SEQ(lwchar, h_many1(lwchar)), "Multi-WS in xref gen_use entry (severity=1)"), sp)), + xrtyp, IGN(CHX(VIOL(SEQ(wchar, wchar, h_many1(wchar)), "Greater-than-2-byte WS at end of xref entry (severity=1)"), + xreol, + VIOL(SEQ(h_many1(wchar)), "Nonconformant WS at end of xref entry (severity=1)"))))); H_RULE(xrhead, SEQ(nat, IGN(sp), nat, nl)); H_RULE(xrsub, SEQ(xrhead, h_many(xrent))); H_ARULE(xrefs, SEQ(KW("xref"), nl, h_many(xrsub))); @@ -697,40 +1088,108 @@ init_parser(struct Env *aux) // XXX skip however much we consumed and check for "endstream endobj"? /* trailer */ - H_RULE(startxr, SEQ(nl, KW("startxref"), nl, + H_RULE(startxr, SEQ(nl, KW("startxref"), nl, lws, nat, nl, - LIT("%%EOF"), CHX(nl, end))); + LIT("%%EOF"), OPT(nl))); + + /* used for the backwards search */ + H_RULE(lasteof, SEQ(nl, KW("startxref"), nl, + lws, nat, nl, // XXX the real world sometimes omits nl after %%EOF inside the file. // the next 'tail' would be appended right after the 'F', // presumably because the previous version of the file // ended without a trailing newline. m) - // this is invalid per spec, because it creates a run-on + // this is invalid per spec, because it creates a run-on // comment, but we should probably accept-and-warn. // XXX should lws be allowed before EOF marker? // NB: lws before xref offset is allowed, cf. p.48 (example 4) + LIT("%%EOF"), + CHX(VIOL(SEQ(nl, h_many1(nl), end), + "(offset FROM END) Multiple newlines after final %%EOF (severity=4)"), + SEQ(h_many(nl), end), + VIOL(SEQ(h_butnot(h_ch_range(0, 255), LIT("%%EOF"))), + "(offset FROM END) Data after final %%EOF (severity=7)")))); + H_RULE(xr_td, SEQ(xrefs, KW("trailer"), ws, dict)); - H_RULE(tail, SEQ(body, h_optional(xr_td), startxr)); - // XXX the real world likes to omit 'startxr' from all but the - // last trailer. we should accept-and-warn in that case. - H_RULE(pdf, SEQ(header, h_many1(tail), end)); + H_RULE(hdr_junk, CHX(comment, + VIOL(h_many1(h_butnot(h_ch_range(0, 255), SEQ(npair, wel, KW("obj")))), + "Uncommented junk after header (severity=1)"))); + H_RULE(tail, SEQ(body, CHX(SEQ(h_optional(xr_td), startxr), + VIOL(SEQ(xr_td, OPT(SEQ(nl, KW("startxref"), nl, lws, nat, nl)), + OPT(nl), OPT(LIT("%%EOF")), OPT(nl)), + "Improper end of trailer - missing startxref and/or %%EOF (severity=5)")))); + H_RULE(final_eof_junk, CHX(VIOL(SEQ(h_many1(nl), end), "Multiple newlines after final %%EOF (severity=4)"), + VIOL(h_many1(h_butnot(h_ch_range(0, 255), LIT("%%EOF"))), + "Data after final %%EOF (severity=7)"), + end)); + H_RULE(pdf, SEQ(header, OPT(hdr_junk), h_many1(tail), final_eof_junk)); /* debug parser to consume as much as possible */ - H_RULE(pdfdbg, SEQ(header, h_many(tail), body, OPT(xr_td), OPT(startxr))); + H_RULE(pdfdbg, SEQ(header, OPT(hdr_junk), h_many(tail), body, OPT(xr_td), OPT(SEQ(startxr, final_eof_junk)))); + /* + * filters + */ + /* Ascii85Decode */ + H_RULE(a85eod, SEQ(h_ch('~'), OPT(h_many(lwchar)), h_ch('>'))); + H_ARULE(a85zero, h_ch('z')); + H_ARULE(a85digit, h_ch_range('!', 'u')); + + /* Line whitespace can occur between any digit and has to be ignored, */ + /* Comments are not allowed inside streams, and % character should cause + * a parse error. */ + #define MANY_LWS(X) h_many(CHX(lws, X)) + + /* This encoding of zero is not allowed */ + H_RULE(a85fiveexcl, h_repeat_n(MANY_LWS(h_ch('!')), 5)); + H_VARULE(a85fivedigits, SEQ(h_and(h_not(a85fiveexcl)), h_repeat_n(MANY_LWS(a85digit), 5))); + H_ARULE(a85group, CHX(a85zero, a85fivedigits)); + + H_VARULE(a85partial2group, h_repeat_n(MANY_LWS(a85digit), 2)); + H_VARULE(a85partial3group, h_repeat_n(MANY_LWS(a85digit), 3)); + H_VARULE(a85partial4group, h_repeat_n(MANY_LWS(a85digit), 4)); + H_ARULE(a85partialgroup, CHX(a85partial4group, a85partial3group, a85partial2group)); + + H_ARULE(a85string, SEQ(h_many(a85group), OPT(a85partialgroup), IGN(a85eod))); + + /* AsciiHexDecode */ + H_RULE(ahexeod, h_ch('>')); + H_ARULE(hdigitpair, SEQ(IGN(OPT(h_many(lwchar))), hdigit, IGN(OPT(h_many(lwchar))), hdigit)); + H_ARULE(ahextruncated, SEQ(IGN(OPT(h_many(lwchar))), hdigit, IGN(OPT(h_many(lwchar))))); + + H_ARULE(hs_end, SEQ(CHX(hdigitpair, ahextruncated), ahexeod)); + H_ARULE(ahexstream, SEQ(h_many(hdigitpair), hs_end)); + + init_runlengthdecode_parser(aux); + + /* global parser variables */ p_pdf = pdf; p_pdfdbg = pdfdbg; - p_startxref = startxr; + p_startxref = lasteof; //startxr; p_xref = CHX(xr_td, xrstm); p_objdef = objdef; + p_a85string = a85string; + p_ahexstream = ahexstream; + p_ws = ws; + p_wel = wel; + p_elemr = h_action(elemr, h_act_flatten, NULL); + p_npair = npair; p_fail = h_nothing_p(); p_epsilon = epsilon; p_return_0 = h_action(epsilon, act_return_uint, (void *)0); p_return_1 = h_action(epsilon, act_return_uint, (void *)1); + /* Parsing of severity messages */ + H_RULE(viol_preamble, SEQ(h_many(NOT_IN("=")), LIT("="))); + H_RULE(severity_num, h_action(h_many1(h_action(h_ch_range('0', '9'), act_digit, NULL)), + act_nat, NULL)); + H_RULE(violsev, SEQ(IGN(viol_preamble), severity_num)); + p_violsev = violsev; + #if 0 // XXX testing int r; @@ -910,12 +1369,25 @@ struct predictor { uint8_t *buf; /* previous row of input */ uint8_t c; /* byte 'c' (upper left) */ int x; /* current position */ + +#ifndef ITERATIVE // XXX + uint8_t *out; + size_t nout; +#endif }; int depred_none(struct predictor *pred, uint8_t *inp, size_t sz) { +#ifdef ITERATIVE // XXX return h_parse_chunk(pred->sp, inp, sz); +#else + pred->out = realloc(pred->out, pred->nout + sz); + assert(pred->out != NULL); + memcpy(pred->out + pred->nout, inp, sz); + pred->nout += sz; + return false; +#endif } uint8_t pp_none(int a, int b, int c) { return 0; } @@ -982,7 +1454,14 @@ depred_png(struct predictor *pred, uint8_t *inp, size_ /* when row complete, pass it to parser and start a new row */ if (x == pred->rowsz) { +#ifdef ITERATIVE // XXX done = h_parse_chunk(pred->sp, pred->buf, pred->rowsz); +#else + pred->out = realloc(pred->out, pred->nout + pred->rowsz); + assert(pred->out != NULL); + memcpy(pred->out + pred->nout, pred->buf, pred->rowsz); + pred->nout += pred->rowsz; +#endif pred->c = pred->x = 0; if (pred->num != 2) /* support for 8-bpc TIFF */ pred->predfun = NULL; @@ -997,7 +1476,9 @@ FlateDecode(const Dict *parms, HBytes b, HParser *p) { size_t const BUFSIZE = 8 * 1024; uint8_t *buf; +#ifdef ITERATIVE // XXX HSuspendedParser *sp; +#endif HParseResult *res; const HParsedToken *v; size_t sz; @@ -1063,10 +1544,12 @@ FlateDecode(const Dict *parms, HBytes b, HParser *p) if (buf == NULL) err(1, "FlateDecode"); +#ifdef ITERATIVE // XXX /* initialize target parser */ sp = h_parse_start(p); assert(sp != NULL); pred.sp = sp; +#endif done = 0; strm.avail_in = b.len; @@ -1085,8 +1568,13 @@ FlateDecode(const Dict *parms, HBytes b, HParser *p) done = depredict(&pred, buf, sz); } while (done == 0 && ret == Z_OK); +#ifdef ITERATIVE // XXX res = h_parse_finish(sp); // XXX always return NULL on error? +#else + res = h_parse(p, pred.out, pred.nout); + free(pred.out); +#endif inflateEnd(&strm); free(pred.buf); free(buf); @@ -1096,7 +1584,271 @@ FlateDecode(const Dict *parms, HBytes b, HParser *p) return res; } +/* LZW helpers */ + +typedef struct +{ + uint8_t *lzw_buf; + size_t total_buf_size; + size_t write_head; + size_t write_tail; + uint8_t write_checksum; + size_t eof_loc; + + HBytes *input_stream; + size_t read_head; + size_t read_tail; + uint8_t read_checksum; +} lzwspec; + +lzwspec *cur_lzw_spec; + +/* used by write_lzw_buffer to get more space for decoding if needed */ +void +grow_lzw_buffer(size_t amount) +{ + uint8_t *ret_buf = realloc(cur_lzw_spec->lzw_buf, (cur_lzw_spec->total_buf_size+amount) * sizeof(uint8_t)); + if(ret_buf != NULL) + { + cur_lzw_spec->total_buf_size += amount; + cur_lzw_spec->lzw_buf = ret_buf; + } + else + { + fprintf(stderr, "LZWDecode: h_arena_realloc() failed"); + return; + } +} + +lzwspec * +new_lzw_spec(HBytes *bytes) +{ + size_t const BUFSIZE = sizeof(uint8_t) * 1024; + lzwspec *ret = malloc(sizeof(lzwspec)); + ret->input_stream = bytes; + ret->lzw_buf = malloc(BUFSIZE); + ret->total_buf_size = BUFSIZE; + return ret; +} + +void +delete_lzw_spec(lzwspec *spec) +{ + free(spec->lzw_buf); + free(spec); +} + +void +bind_lzw_spec(lzwspec *spec) +{ + cur_lzw_spec = spec; +} + + +#include "lzw-lib.h" + +/* Buffer writer function for the lzw-ab implementation, with a fixed signature. + * Although the type is defined as int, it is expected to write one byte at a time. + * Modifies cur_lzw_spec. Set up the lzw spec to use with bind_lzw_spec() */ + +void +write_lzw_buffer(int value) +{ + size_t const BUFSIZE = sizeof(uint8_t) * 1024; + + if(!cur_lzw_spec->lzw_buf) + { + fprintf(stderr, "LZWDecode: lzw_buf is null!"); + assert(cur_lzw_spec->lzw_buf != NULL); + } + + assert(cur_lzw_spec->write_head <= cur_lzw_spec->total_buf_size); + + if (value == EOF) { + cur_lzw_spec->lzw_buf[cur_lzw_spec->write_head] = (uint8_t) value; + cur_lzw_spec->eof_loc = cur_lzw_spec->write_head; + cur_lzw_spec->write_head++; + return; + } + + /* We can get away with this cast due to writing single bytes. */ + cur_lzw_spec->lzw_buf[cur_lzw_spec->write_head++] = (uint8_t) value; + + /* If you looked at lzw-ab's code, the write head is reset here + * This function uses write_head as the offset of the last written item */ + if (cur_lzw_spec->write_head >= cur_lzw_spec->total_buf_size) + { + grow_lzw_buffer(BUFSIZE); + } + + cur_lzw_spec->write_checksum = cur_lzw_spec->write_checksum * 3 + (uint8_t) value; +} + + +/* Fixed signature function for reading bytes. Modifies cur_lzw_spec. Set cur_lzw_spec + * with bind_lzw_spec() */ +int read_lzw_buffer(void) +{ + uint8_t byte_read; + int ret_value; + + /* Input data is already waiting in the buffer */ + if (cur_lzw_spec->read_head == cur_lzw_spec->read_tail) + cur_lzw_spec->read_tail = cur_lzw_spec->input_stream->len; + + if (cur_lzw_spec->read_head < cur_lzw_spec->read_tail) + { + byte_read = cur_lzw_spec->input_stream->token[cur_lzw_spec->read_head++]; + cur_lzw_spec->read_checksum = cur_lzw_spec->read_checksum * 3 + byte_read; + ret_value = byte_read; + } + else + ret_value = EOF; + + return ret_value; +} + + +HParseResult * +LZWDecode(const Dict *parms, HBytes b, HParser *p) +{ + struct predictor pred = {1, 1, 8, 1}; + int (*depredict)(struct predictor *, uint8_t *, size_t); + HParseResult *res; + int done; + int ret; + const HParsedToken *v; + + /* set up the predictor (if any) */ + #define SETPARM(VAR,STR) do { \ + v = dictentry(parms, (STR)); \ + if (v != NULL) { \ + if (v->token_type != TT_SINT || v->sint < 0) \ + return NULL; \ + VAR = v->sint; \ + } } while(0) + SETPARM(pred.num, "Predictor"); + SETPARM(pred.colors, "Colors"); + SETPARM(pred.bpc, "BitsPerComponent"); + SETPARM(pred.columns, "Columns"); + #undef SETPARM + if (pred.num == 1) + depredict = depred_none; + else { + if (pred.num >= 10 && pred.num <= 15) + depredict = depred_png; + else if (pred.num == 2) { + /* for 8-bpc TIFF pred. 2, we can reuse PNG Sub */ + if (pred.bpc == 8) { + pred.predfun = pp_sub; /* predict left */ + depredict = depred_png; + } else { + // XXX add general TIFF predictor (bpc != 8) + fprintf(stderr, "LZWDecode: /Predictor %d " + "not supported for /BitsPerComponent %d\n", + pred.num, pred.bpc); + return NULL; + } + } else { + fprintf(stderr, "LZWDecode: /Predictor %d" + " not supported\n", pred.num); + return NULL; + } + + /* allocate row buffer */ + if (pred.columns > (INT_MAX - 7) / pred.colors / pred.bpc) { + fprintf(stderr, "LZWDecode: overflow\n"); + return NULL; + } + pred.rowsz = (pred.colors * pred.bpc * pred.columns + 7) / 8; + pred.buf = calloc(1, pred.rowsz); + if (pred.buf == NULL) + err(1, "LZWDecode"); + } + + lzwspec *lzw_spec = new_lzw_spec(&b); + bind_lzw_spec(lzw_spec); + + ret = lzw_decompress(write_lzw_buffer, read_lzw_buffer); + if (ret) { + fprintf(stderr, "lzw_decompress: error (%d)\n", ret); + assert(!"LZWDecode: failed to decompress\n"); + } + done = depredict(&pred, cur_lzw_spec->lzw_buf, cur_lzw_spec->write_head-1); + assert(!done); // XXX ITERATIVE + + res = h_parse(p, pred.out, pred.nout); + free(pred.out); + + bind_lzw_spec(NULL); + delete_lzw_spec(lzw_spec); + + return res; +} + +HParseResult * +RunLengthDecode(const Dict *parms, HBytes b, HParser *p) +{ + HParseResult *res; + + res = h_parse(p_rldstring, b.token, b.len); + if(!res) + { + fprintf(stderr, "parse error in RunLengthDecode filter\n"); + return NULL; + } + + assert(res->ast && res->ast->token_type == TT_BYTES); + res = h_parse(p, res->ast->bytes.token, res->ast->bytes.len); + + return res; +} + /* + * Decodes ASCII hexadecimal data into binary data. + * parms should be empty, because the filter has no parameters + */ +HParseResult * +ASCIIHexDecode(const Dict *parms, HBytes b, HParser *p) +{ + HParseResult *res; + + res = h_parse(p_ahexstream, b.token, b.len); + if(!res) + { + fprintf(stderr, "parse error in ASCIIHexDecode filter\n"); + return NULL; + } + + assert(res->ast && res->ast->token_type == TT_BYTES); + res = h_parse(p, res->ast->bytes.token, res->ast->bytes.len); + + return res; +} + +/* + * Decodes ASCII base-85 encoded data and produces binary data. + * parms should be empty, because the filter has no parameters + */ +HParseResult* +ASCII85Decode(const Dict *parms, HBytes b, HParser *p) +{ + HParseResult *res; + + res = h_parse(p_a85string, b.token, b.len); + if(!res) + { + fprintf(stderr, "parse error in ASCII85Decode filter\n"); + return NULL; + } + + assert(res->ast && res->ast->token_type == TT_BYTES); + res = h_parse(p, res->ast->bytes.token, res->ast->bytes.len); + + return res; +} + +/* * decode the bytes in 'b' according to metadata in the stream dictionary 'd' * and parse the result with 'p'. */ @@ -1111,15 +1863,25 @@ decode_stream(const Dict *d, HBytes b, HParser *p) if (v == NULL) return h_parse(p, b.token, b.len); +#ifdef ITERATIVE // XXX /* compile to a CF backend to enable incremental parsing */ if (h_compile(p, PB_LLk, NULL) == -1) errx(1, "stream data parser: LL(1) compile failed"); +#endif if (v->token_type == TT_SEQUENCE) return NULL; // XXX filter chains not supported, yet assert(v->token_type == TT_BYTES); if (bytes_eq(v->bytes, "FlateDecode")) filter = FlateDecode; + else if (bytes_eq(v->bytes, "ASCIIHexDecode")) + filter = ASCIIHexDecode; + else if (bytes_eq(v->bytes, "ASCII85Decode")) + filter = ASCII85Decode; + else if (bytes_eq(v->bytes, "RunLengthDecode")) + filter = RunLengthDecode; + else if (bytes_eq(v->bytes, "LZWDecode")) + filter = LZWDecode; else return NULL; /* filter not supported */ @@ -1177,8 +1939,8 @@ p_take__m(HAllocator *mm__, size_t n, struct Env *aux) return h_left__m(mm__, bytes, skip); } -HParser * -p_xrefdata__m(HAllocator *mm__, const Dict *dict); +HParser *p_xrefdata__m(HAllocator *, const Dict *); +HParser *p_objstm__m(HAllocator *, const Dict *); HParser * p_stream_data__m(HAllocator *mm__, const Dict *dict) @@ -1192,9 +1954,10 @@ p_stream_data__m(HAllocator *mm__, const Dict *dict) /* interpret known stream types */ if (bytes_eq(v->bytes, "XRef")) return p_xrefdata__m(mm__, dict); - // XXX - //if (bytes_eq(v->bytes, "ObjStm")) - // return p_objstm__m(mm__, dict); +#ifndef NOOBJSTM + if (bytes_eq(v->bytes, "ObjStm")) + return p_objstm__m(mm__, dict); +#endif return NULL; /* unrecognized type */ } @@ -1213,7 +1976,17 @@ act_ks_value(const HParseResult *p, void *u) /* decode and parse the stream data */ res = decode_stream(spec->dict, bytes, spec->parser); - assert(res != NULL); // XXX parse failure! + if (res == NULL) { + HBytes b = {NULL, 0}; + const HParsedToken *v = dictentry(spec->dict, "Type"); + if (v != NULL && v->token_type == TT_BYTES) + b = v->bytes; + if (b.len > INT_MAX) + b.len = INT_MAX; + fprintf(stderr, "parse error in stream (%*s)\n", + (int)b.len, b.token); + // XXX return the undecoded stream (p->ast)? + } return H_MAKE(HParseResult, res); } @@ -1245,7 +2018,7 @@ kstream(HAllocator *mm__, const HParsedToken *x, void //fprintf(stderr, "parsing stream object, length %zu.\n", sz); // XXX debug - dict_p = p_return__m(mm__, dict_t); + dict_p = p_return__m(mm__, dict_t); bytes_p = p_take__m(mm__, sz, aux); spec = h_alloc(mm__, sizeof(struct streamspec)); @@ -1436,6 +2209,29 @@ p_xrefdata__m(HAllocator *mm__, const Dict *dict) return h_sequence__ma(mm__, (void **)p_subs); } +HParser * +p_objstm__m(HAllocator *mm__, const Dict *dict) +{ + const HParsedToken *v; + size_t N; + + v = dictentry(dict, "N"); + if (v == NULL || v->token_type != TT_SINT || v->sint < 0 || + (uint64_t)v->sint > SIZE_MAX) { + fprintf(stderr, "missing /N on object stream\n"); + return p_fail; + } + N = v->sint; + + HParser *wel_ws = h_sequence__m(mm__, p_wel, p_ws, NULL); + HParser *idx = p_sepBy_n__m(mm__, p_npair, wel_ws, N); + + return h_sequence__m(mm__, p_ws, idx, p_elemr, p_ws, NULL); + // XXX leading and trailing ws OK? + + // XXX consistency-check against /First, idx, /N +} + /* * This continuation is very similar to kstream, except that it does not * rely on /Length to consume the right amount of input. If /Length is @@ -1589,10 +2385,15 @@ main(int argc, char *argv[]) int fd; /* command line handling */ - if (argc != 2) { + if (argc > 3) { fprintf(stderr, "usage: %s file\n", argv[0]); return 1; } + if (argc == 3) { + H_RULE(nat, h_action(h_many1(h_action(h_ch_range('0', '9'), act_digit, NULL)), + act_nat, NULL)); + strictness = h_parse(nat, (uint8_t *)argv[2], strlen(argv[2]))->ast->uint; + } infile = argv[1]; /* mmap the input file */ blob - /dev/null blob + 65d4a2e4b96304208852290e3d3bf6ee7ce3dde8 (mode 644) --- /dev/null +++ lzw-ab-license.txt @@ -0,0 +1,25 @@ + Copyright (c) David Bryant + All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Conifer Software nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. blob - /dev/null blob + 8283dfc52f1df905e2c0cce028eee9969684864b (mode 644) --- /dev/null +++ t/hello_ascii85.pdf @@ -0,0 +1,62 @@ +%PDF-1.4 +1 0 obj +<< /Type /Catalog +/Outlines 2 0 R +/Pages 3 0 R +>> +endobj +2 0 obj +<< /Type /Outlines +/Count 0 +>> +endobj +3 0 obj +<< /Type /Pages +/Kids [ 4 0 R ] +/Count 1 +>> +endobj +4 0 obj +<< /Type /Page +/Parent 3 0 R +/MediaBox [ 0 0 612 792 ] +/Contents 5 0 R +/Resources << /ProcSet 6 0 R +/Font << /F1 7 0 R >> +>> +>> +endobj +5 0 obj +<< /Length 59 /Filter /ASCII85Decode>> +stream + +endstream +endobj +6 0 obj +[ /PDF /Text ] +endobj +7 0 obj +<< /Type /Font +/Subtype /Type1 +/Name /F1 +/BaseFont /Helvetica +/Encoding /MacRomanEncoding +>> +endobj +xref +0 8 +0000000000 65535 f +0000000009 00000 n +0000000074 00000 n +0000000120 00000 n +0000000179 00000 n +0000000322 00000 n +0000000453 00000 n +0000000483 00000 n +trailer +<< /Size 8 +/Root 1 0 R +>> +startxref +592 +%%EOF blob - /dev/null blob + 4f54996cce84b86df745dc4912ca558e1760b557 (mode 644) --- /dev/null +++ t/hello_ascii85_lf.pdf @@ -0,0 +1,63 @@ +%PDF-1.4 +1 0 obj +<< /Type /Catalog +/Outlines 2 0 R +/Pages 3 0 R +>> +endobj +2 0 obj +<< /Type /Outlines +/Count 0 +>> +endobj +3 0 obj +<< /Type /Pages +/Kids [ 4 0 R ] +/Count 1 +>> +endobj +4 0 obj +<< /Type /Page +/Parent 3 0 R +/MediaBox [ 0 0 612 792 ] +/Contents 5 0 R +/Resources << /ProcSet 6 0 R +/Font << /F1 7 0 R >> +>> +>> +endobj +5 0 obj +<< /Length 60 /Filter /ASCII85Decode>> +stream + +endstream +endobj +6 0 obj +[ /PDF /Text ] +endobj +7 0 obj +<< /Type /Font +/Subtype /Type1 +/Name /F1 +/BaseFont /Helvetica +/Encoding /MacRomanEncoding +>> +endobj +xref +0 8 +0000000000 65535 f +0000000009 00000 n +0000000074 00000 n +0000000120 00000 n +0000000179 00000 n +0000000322 00000 n +0000000454 00000 n +0000000484 00000 n +trailer +<< /Size 8 +/Root 1 0 R +>> +startxref +593 +%%EOF blob - /dev/null blob + 7496f59864d8d8b22ac06557f7ce3d0e3ed2e2c3 (mode 644) --- /dev/null +++ t/hello_asciihex.pdf @@ -0,0 +1,62 @@ +%PDF-1.4 +1 0 obj +<< /Type /Catalog +/Outlines 2 0 R +/Pages 3 0 R +>> +endobj +2 0 obj +<< /Type /Outlines +/Count 0 +>> +endobj +3 0 obj +<< /Type /Pages +/Kids [ 4 0 R ] +/Count 1 +>> +endobj +4 0 obj +<< /Type /Page +/Parent 3 0 R +/MediaBox [ 0 0 612 792 ] +/Contents 5 0 R +/Resources << /ProcSet 6 0 R +/Font << /F1 7 0 R >> +>> +>> +endobj +5 0 obj +<< /Length 91 /Filter /ASCIIHexDecode>> +stream +540a2f46312032342054660a313030203130302054640a282048656c6c6f20576f726c64202920546a0a45540a> +endstream +endobj +6 0 obj +[ /PDF /Text ] +endobj +7 0 obj +<< /Type /Font +/Subtype /Type1 +/Name /F1 +/BaseFont /Helvetica +/Encoding /MacRomanEncoding +>> +endobj +xref +0 8 +0000000000 65535 f +0000000009 00000 n +0000000074 00000 n +0000000120 00000 n +0000000179 00000 n +0000000322 00000 n +0000000486 00000 n +0000000516 00000 n +trailer +<< /Size 8 +/Root 1 0 R +>> +startxref +624 +%%EOF blob - /dev/null blob + b077906ba43c89935d5d6fbe226777cc84afb0bb (mode 644) --- /dev/null +++ t/hello_asciihex_lf.pdf @@ -0,0 +1,63 @@ +%PDF-1.4 +1 0 obj +<< /Type /Catalog +/Outlines 2 0 R +/Pages 3 0 R +>> +endobj +2 0 obj +<< /Type /Outlines +/Count 0 +>> +endobj +3 0 obj +<< /Type /Pages +/Kids [ 4 0 R ] +/Count 1 +>> +endobj +4 0 obj +<< /Type /Page +/Parent 3 0 R +/MediaBox [ 0 0 612 792 ] +/Contents 5 0 R +/Resources << /ProcSet 6 0 R +/Font << /F1 7 0 R >> +>> +>> +endobj +5 0 obj +<< /Length 92 /Filter /ASCIIHexDecode>> +stream +540a2f46312032342054660a3130302031303 +02054640a282048656c6c6f20576f726c64202920546a0a45540a> +endstream +endobj +6 0 obj +[ /PDF /Text ] +endobj +7 0 obj +<< /Type /Font +/Subtype /Type1 +/Name /F1 +/BaseFont /Helvetica +/Encoding /MacRomanEncoding +>> +endobj +xref +0 8 +0000000000 65535 f +0000000009 00000 n +0000000074 00000 n +0000000120 00000 n +0000000179 00000 n +0000000322 00000 n +0000000487 00000 n +0000000517 00000 n +trailer +<< /Size 8 +/Root 1 0 R +>> +startxref +625 +%%EOF blob - /dev/null blob + 7f887ccce2c1c995ea65f1524e28685713f435d3 (mode 644) Binary files /dev/null and t/hello_lzwdecode_march.pdf differ blob - /dev/null blob + 8f14de953d0952359834e9c7e67874246996a6ec (mode 644) --- /dev/null +++ t/hello_runlength.pdf @@ -0,0 +1,62 @@ +%PDF-1.4 +1 0 obj +<< /Type /Catalog +/Outlines 2 0 R +/Pages 3 0 R +>> +endobj +2 0 obj +<< /Type /Outlines +/Count 0 +>> +endobj +3 0 obj +<< /Type /Pages +/Kids [ 4 0 R ] +/Count 1 +>> +endobj +4 0 obj +<< /Type /Page +/Parent 3 0 R +/MediaBox [ 0 0 612 792 ] +/Contents 5 0 R +/Resources << /ProcSet 6 0 R +/Font << /F1 7 0 R >> +>> +>> +endobj +5 0 obj +<< /Length 23 /Filter /RunLengthDecode>> +stream + AAAAAAAAAAAAAAAAAAAA@B +endstream +endobj +6 0 obj +[ /PDF /Text ] +endobj +7 0 obj +<< /Type /Font +/Subtype /Type1 +/Name /F1 +/BaseFont /Helvetica +/Encoding /MacRomanEncoding +>> +endobj +xref +0 8 +0000000000 65535 f +0000000009 00000 n +0000000074 00000 n +0000000120 00000 n +0000000179 00000 n +0000000322 00000 n +0000000419 00000 n +0000000450 00000 n +trailer +<< /Size 8 +/Root 1 0 R +>> +startxref +557 +%%EOF blob - /dev/null blob + a7919358495066d05aba0366b1c55e4b38f109d2 (mode 644) Binary files /dev/null and t/lzw.pdf differ blob - /dev/null blob + 38ecd30a3ed38605d2cb752efb66dc8ecb4861c6 (mode 644) --- /dev/null +++ lzw-lib.c @@ -0,0 +1,318 @@ +//////////////////////////////////////////////////////////////////////////// +// **** LZW-AB **** // +// Adjusted Binary LZW Compressor/Decompressor // +// Copyright (c) 2016 David Bryant // +// All Rights Reserved // +// Distributed under the BSD Software License (see license.txt) // +//////////////////////////////////////////////////////////////////////////// + +#include +#include +#include + +#include "lzw-lib.h" + +/* This library implements the LZW general-purpose data compression algorithm. + * The algorithm was originally described as a hardware implementation by + * Terry Welsh here: + * + * Welch, T.A. “A Technique for High-Performance Data Compression.” + * IEEE Computer 17,6 (June 1984), pp. 8-19. + * + * Since then there have been enumerable refinements and variations on the + * basic technique, and this implementation is no different. The target of + * the present implementation is embedded systems, and so emphasis was placed + * on simplicity, fast execution, and minimal RAM usage. + * + * The symbols are stored in adjusted binary, which provides considerably + * better compression performance with virtually no speed penalty compared to + * the fixed sizes normally used. To ensure good performance on data with + * varying characteristics (like executable images) the encoder resets as + * soon as the dictionary is full. Also, worst-case performance is limited + * to about 8% inflation by catching poor performance and forcing an early + * reset before longer symbols are sent. + * + * The maximum symbol size is configurable on the encode side (from 9 bits + * to 12 bits) and determines the RAM footprint required by both sides and, + * to a large extent, the compression performance. This information is + * communicated to the decoder in the first stream byte so that it can + * allocate accordingly. The RAM requirements are as follows: + * + * maximum encoder RAM decoder RAM + * symbol size requirement requirement + * ----------------------------------------- + * 9-bit 1792 bytes 1024 bytes + * 10-bit 4352 bytes 3072 bytes + * 11-bit 9472 bytes 7168 bytes + * 12-bit 19712 bytes 15360 bytes + * + * This implementation uses malloc(), but obviously an embedded version could + * use static arrays instead if desired (assuming that the maxbits was + * controlled outside). + */ + +#define NULL_CODE -1 // indicates a NULL prefix +#define CLEAR_CODE 256 // code to flush dictionary and restart decoder +#define EOD_CODE 257 // used in PDF's LZWDecode to signal end of data +#define FIRST_STRING 258 // code of first dictionary string, PDF edition + +/* This macro writes the adjusted-binary symbol "code" given the maximum + * symbol "maxcode". A macro is used here just to avoid the duplication in + * the lzw_compress() function. The idea is that if "maxcode" is not one + * less than a power of two (which it rarely will be) then this code can + * often send fewer bits that would be required with a fixed-sized code. + * + * For example, the first code we send will have a "maxcode" of 257, so + * every "code" would normally consume 9 bits. But with adjusted binary we + * can actually represent any code from 0 to 253 with just 8 bits -- only + * the 4 codes from 254 to 257 take 9 bits. + */ + +#define WRITE_CODE(code,maxcode) do { \ + int code_bits = (maxcode) < 1024 ? \ + ((maxcode) < 512 ? 8 : 9) : \ + ((maxcode) < 2048 ? 10 : 11); \ + int extras = (1 << (code_bits + 1)) - (maxcode) - 1; \ + if ((code) < extras) { \ + shifter |= ((long)(code) << bits); \ + bits += code_bits; \ + } \ + else { \ + shifter |= ((long)(((code) + extras) >> 1) << bits); \ + bits += code_bits; \ + shifter |= ((long)(((code) + extras) & 1) << bits++); \ + } \ + do { (*dst)(shifter); shifter >>= 8; output_bytes++; \ + } while ((bits -= 8) >= 8); \ +} while (0) + +/* LZW compression function. Bytes (8-bit) are read and written through callbacks and the + * "maxbits" parameter specifies the maximum symbol size (9-12), which in turn determines + * the RAM requirement and, to a large extent, the level of compression achievable. A return + * value of EOF from the "src" callback terminates the compression process. A non-zero return + * value indicates one of the two possible errors -- bad "maxbits" param or failed malloc(). + */ + +int lzw_compress (void (*dst)(int), int (*src)(void), int maxbits) +{ + int next = FIRST_STRING, prefix = NULL_CODE, bits = 0, total_codes, c; + unsigned long input_bytes = 0, output_bytes = 0; + short *first_references, *next_references; + unsigned char *terminators; + unsigned long shifter = 0; + + if (maxbits < 9 || maxbits > 12) // check for valid "maxbits" setting + return 1; + + // based on the "maxbits" parameter, compute total codes and allocate dictionary storage + + total_codes = 1 << maxbits; + first_references = malloc (total_codes * sizeof (first_references [0])); + next_references = malloc ((total_codes - 256) * sizeof (next_references [0])); + terminators = malloc ((total_codes - 256) * sizeof (terminators [0])); + + if (!first_references || !next_references || !terminators) + return 1; // failed malloc() + + // clear the dictionary + + memset (first_references, 0, total_codes * sizeof (first_references [0])); + memset (next_references, 0, (total_codes - 256) * sizeof (next_references [0])); + memset (terminators, 0, (total_codes - 256) * sizeof (terminators [0])); + + (*dst)(maxbits - 9); // first byte in output stream indicates the maximum symbol bits + + // This is the main loop where we read input bytes and compress them. We always keep track of the + // "prefix", which represents a pending byte (if < 256) or string entry (if >= FIRST_STRING) that + // has not been sent to the decoder yet. The output symbols are kept in the "shifter" and "bits" + // variables and are sent to the output every time 8 bits are available (done in the macro). + + while ((c = (*src)()) != EOF) { + int cti; // coding table index + + input_bytes++; + + if (prefix == NULL_CODE) { // this only happens the very first byte when we don't yet have a prefix + prefix = c; + continue; + } + + if ((cti = first_references [prefix])) { // if any longer strings are built on the current prefix... + while (1) + if (terminators [cti - 256] == c) { // we found a matching string, so we just update the prefix + prefix = cti; // to that string and continue without sending anything + break; + } + else if (!next_references [cti - 256]) { // this string did not match the new character and + next_references [cti - 256] = next; // there aren't any more, so we'll add a new string + cti = 0; // and point to it with "next_reference" + break; + } + else + cti = next_references [cti - 256]; // there are more possible matches to check, so loop back + } + else // no longer strings are based on the current prefix, so now + first_references [prefix] = next; // the current prefix plus the new byte will be the next string + + // If "cti" is zero, we could not simply extend our "prefix" to a longer string because we did not find a + // dictionary match, so we send the symbol representing the current "prefix" and add the new string to the + // dictionary. Since the current byte "c" was not included in the prefix, that now becomes our new prefix. + + if (!cti) { + WRITE_CODE (prefix, next); // send symbol for current prefix (0 to next-1) + terminators [next - 256] = c; // newly created string has current byte as the terminator + prefix = c; // current byte also becomes new prefix for next string + + // This is where we bump the next string index and decide whether to clear the dictionary and start over. + // The triggers for that are either the dictionary is full or we've been outputting too many bytes and + // decide to cut our losses before the symbols get any larger. Note that for the dictionary full case we + // do NOT send the CLEAR_CODE because the decoder knows about this and we don't want to be redundant. + + if (++next == total_codes || output_bytes > 8 + input_bytes + (input_bytes >> 4)) { + if (next < total_codes) + WRITE_CODE (CLEAR_CODE, next); + + // clear the dictionary and reset the byte counters -- basically everything starts over + // except that we keep the last pending "prefix" (which, of course, was never sent) + + memset (first_references, 0, total_codes * sizeof (first_references [0])); + memset (next_references, 0, (total_codes - 256) * sizeof (next_references [0])); + memset (terminators, 0, (total_codes - 256) * sizeof (terminators [0])); + input_bytes = output_bytes = 0; + next = FIRST_STRING; + } + } + } + + // we're done with input, so if we've received anything we still need to send that pesky pending prefix... + + if (prefix != NULL_CODE) { + WRITE_CODE (prefix, next); + + if (++next == total_codes) // watch for clearing to the first string to stay in step with the decoder! + next = FIRST_STRING; // (this was actually a corner-case bug that did not trigger often) + } + + WRITE_CODE (next, next); // the maximum possible code is always reserved for our END_CODE + + if (bits) // finally, flush any pending bits from the shifter + (*dst)(shifter); + + free (terminators); free (next_references); free (first_references); + return 0; +} + +/* LZW decompression function. Bytes (8-bit) are read and written through callbacks. + * A return value of EOF from the "src" callback terminates the compression process + * (although this should not normally occur). A non-zero return value + * indicates an error, which in this case can be a + * failed malloc(), or if an EOF is read from the input stream before the compression + * terminates naturally with END_CODE. + */ + +int lzw_decompress (void (*dst)(int), int (*src)(void)) +{ + int read_byte, next = FIRST_STRING, prefix = CLEAR_CODE, bits = 0, total_codes; + unsigned char *terminators, *reverse_buffer; + unsigned long shifter = 0; + short *prefixes; + + // PDF specific change: maxbits is not in the input stream + // we'll just be pessimistic and allocate the maximal size buffer + + total_codes = 4096; + reverse_buffer = malloc ((total_codes - 256) * sizeof (reverse_buffer [0])); + prefixes = malloc ((total_codes - 256) * sizeof (prefixes [0])); + terminators = malloc ((total_codes - 256) * sizeof (terminators [0])); + + if (!reverse_buffer || !prefixes || !terminators) // check for mallco() failure + return 1; + + // This is the main loop where we read input symbols. The values range from 0 to the code value + // of the "next" string in the dictionary. Note that receiving an EOF from the input + // stream is actually an error because we should have gotten the END_CODE first. + + while (1) { + int code_bits = next < 512 ? 9 : (next < 1024 ? 10 : (next < 2048 ? 11 : 12) ), code; + + #define TOP_BITMASK (((1 << code_bits) - 1) << (bits - code_bits) ) + #define BOTTOM_BITMASK ((1 << (bits - code_bits)) - 1) + + do { + if ((read_byte = ((*src)())) == EOF) { + free (terminators); free (prefixes); free (reverse_buffer); + return 1; + } + + /* shifter reworked: everything shifted left by a byte, + * and the byte we just read becomes the least significant + * byte */ + + // prepare to shift in next byte + shifter <<= 8; + /* the bitstrings forming the symbols are stored MSB first, + * so we can just OR in the next */ + shifter |= (unsigned long) read_byte; + } while ((bits += 8) < code_bits); + + + /* for a 12-bit code, the shifter's bits now look like + * from MSB to LSB: 00...0cccccccccn...n + * where c are the bits of our code + * and n are the bits we're not yet interested in + * the number of times n is repeated is bits - code_bits + * ie. the number of bits read in minus the bits we're interested in */ + + // shift our code bits into thier proper place, and save it as the final code + code = (int) shifter >> (bits - code_bits); + /* we can now clear the shifter's top bits. the result looks like: + * 00...0n...n + * number of n is bits-code_bits + * */ + shifter &= BOTTOM_BITMASK; + // update the count of bytes in the shifter + bits -= code_bits; + + if (code == EOD_CODE) // In PDF, EOD is signalled by 257, rather than the max code + break; + else if (code == CLEAR_CODE) // otherwise check for a CLEAR_CODE to start over early + next = FIRST_STRING; + else if (prefix == CLEAR_CODE) { // this only happens at the first symbol which is always sent + (*dst)(code); // literally and becomes our initial prefix + next++; + } + // Otherwise we have a valid prefix so we step through the string from end to beginning storing the + // bytes in the "reverse_buffer", and then we send them out in the proper order. One corner-case + // we have to handle here is that the string might be the same one that is actually being defined + // now (code == next-1). Also, the first 256 entries of "terminators" and "prefixes" are fixed and + // not allocated, so that messes things up a bit. + else { + int cti = (code == next-1) ? prefix : code; + unsigned char *rbp = reverse_buffer, c; + + do *rbp++ = cti < 256 ? cti : terminators [cti - 256]; // step backward through string... + while ((cti = (cti < 256) ? NULL_CODE : prefixes [cti - 256]) != NULL_CODE); + + c = *--rbp; // the first byte in this string is the terminator for the last string, which is + // the one that we'll create a new dictionary entry for this time + + do (*dst)(*rbp); // send string in corrected order (except for the terminator + while (rbp-- != reverse_buffer); // which we don't know yet) + + if (code == next-1) + (*dst)(c); + + prefixes [next - 1 - 256] = prefix; // now update the next dictionary entry with the new string + terminators [next - 1 - 256] = c; // (but we're always one behind, so it's not the string just sent) + + if (++next == total_codes) // check for full dictionary, which forces a reset (and, BTW, + next = FIRST_STRING; // means we'll never use the dictionary entry we just wrote) + } + + prefix = code; // the code we just received becomes the prefix for the next dictionary string entry + // (which we'll create once we find out the terminator) + } + + free (terminators); free (prefixes); free (reverse_buffer); + return 0; +} blob - /dev/null blob + 81fdeb15e6ade7ef0fd6089a4fdc3d3f2d593578 (mode 644) --- /dev/null +++ lzw-lib.h @@ -0,0 +1,15 @@ +//////////////////////////////////////////////////////////////////////////// +// **** LZW-AB **** // +// Adjusted Binary LZW Compressor/Decompressor // +// Copyright (c) 2016 David Bryant // +// All Rights Reserved // +// Distributed under the BSD Software License (see license.txt) // +//////////////////////////////////////////////////////////////////////////// + +#ifndef LZWLIB_H_ +#define LZWLIB_H_ + +int lzw_compress (void (*dst)(int), int (*src)(void), int maxbits); +int lzw_decompress (void (*dst)(int), int (*src)(void)); + +#endif /* LZWLIB_H_ */