commit 53d31efb90ba3586f0da6ef2d0cee999ba562b90 from: Sven M. Hallberg date: Sun Jul 02 12:47:20 2023 UTC add dumpcstream (-d c) commit - 4e73234b1dbaf15f9776d9c17cfdb6b343108c75 commit + 53d31efb90ba3586f0da6ef2d0cee999ba562b90 blob - 7c59c8704db10ae825fbbacf4672e7a63a4304aa blob + bc1057b35cbcba3ce10c8beacb89bebb0fcd0c5f --- pdf.1.mdoc +++ pdf.1.mdoc @@ -32,6 +32,11 @@ The generation number may be omitted to select the lat .Pp The options are as follows: .Bl -tag -width Ds +.It Fl d Cm c +Decode and parse the given stream object(s) as a single content stream. +At least one +.Ar oid +argument is required. .It Fl d Cm s Dump the body data, after filter decoding, of a given stream object. An blob - 4962b08bcf3907d45ae0db57127e5cf76c7e88d1 blob + 3bfea7f72603d25746377db94e34368405e44507 --- pdf.1.txt +++ pdf.1.txt @@ -4,19 +4,22 @@ NAME pdf - validation and inspection of PDF files SYNOPSIS - pdf [-qsv] [-d what] [-x txtfile] input.pdf [oid] + pdf [-qsv] [-d type] [-x txtfile] input.pdf [oid ...] DESCRIPTION The pdf utility attempts to parse and validate the given PDF file. It prints the resulting AST to standard output using a JSON format. - The optional oid argument selects a specific object to be printed instead - of the whole document. It is expected to be of the form "n.g" where n + The optional oid arguments select specific objects to be printed instead + of the whole document. Each is expected to be of the form "n.g" where n and g are object and generation numbers, respectively. The generation number may be omitted to select the latest object matching n. The options are as follows: + -d c Decode and parse the given stream object(s) as a single content + stream. At least one oid argument is required. + -d s Dump the body data, after filter decoding, of a given stream object. An oid argument is required. @@ -47,4 +50,4 @@ STANDARDS Document management -- Portable document format -- Part 2: PDF 2.0, ISO 32000-2, 2020. - January 6, 2023 + July 2, 2023 blob - 047745e413075672abecbc5108fe1f15345288b6 blob + 90652212d83170dd293b8e3c8df517e6e34af290 --- pdf.c +++ pdf.c @@ -6,6 +6,7 @@ #include #include #include "pdf.h" +#include "content.h" #ifdef LOG #define VIOL(P,VIOL) h_action(h_sequence(P, h_tell(), NULL), act_viol, VIOL) @@ -2518,6 +2519,8 @@ init_parser(struct Env *aux) p_violsev = violsev; + init_content_parser(); + #if 0 // XXX testing int r; @@ -4772,25 +4775,67 @@ usage(void) exit(2); } -void -dumpstream(FILE *f, const HParsedToken *obj) +/* helper to extract the actual data payload out of a stream object */ +HBytes +streamdata(const HParsedToken *obj) { HParseResult *res; - HBytes data; // XXX properly verify that obj is a stream (needs custom token type) + /* verify that obj is a stream */ if (obj->token_type != TT_SEQUENCE || obj->seq->used != 2 || obj->seq->elements[1]->token_type != TT_HParseResult) errx(2, "%s: requested object is not a stream", infile); + /* extract stream data */ res = H_INDEX(HParseResult, obj, 1); assert(res != NULL); assert(res->ast != NULL); - data = H_CAST_BYTES(res->ast); + return H_CAST_BYTES(res->ast); +} - fwrite(data.token, 1, data.len, f); +void +dumpstream(FILE *f, const HParsedToken *obj) +{ + HBytes b; + + b = streamdata(obj); + fwrite(b.token, 1, b.len, f); } +void +dumpcstream(FILE *f, const HParsedToken **obj, size_t n) +{ + HSuspendedParser *p; + HParseResult *res; + HBytes b; + int i; + + /* start the parse */ + p = h_parse_start(p_cstream); + + /* feed the concatenation of the streams into the parser */ + for (i = 0; i < n; i++) { + b = streamdata(obj[i]); + + if (i > 0) + h_parse_chunk(p, "", 1); /* separator '\0' */ + h_parse_chunk(p, b.token, b.len); + } + + /* finish the parse and print the result */ + res = h_parse_finish(p); + if (res == NULL) { + if (!qflag) { + fprintf(stderr, "%s: parse error in content stream\n", + infile); + } + exit(1); + } + if (!qflag) + h_pprintln(f, res->ast); +} + /* * This helper implements the standard backwards parsing strategy to read all * cross-reference sections and trailer dictionaries, starting from the @@ -5054,7 +5099,9 @@ main(int argc, char *argv[]) /* print desired output */ if (!qflag) { - if (dflag == 's') + if (dflag == 'c') + dumpcstream(stdout, obj, argc); + else if (dflag == 's') for (i = 0; i < argc; i++) dumpstream(stdout, obj[i]); else if (obj != NULL)