Commit Diff


commit - 4e73234b1dbaf15f9776d9c17cfdb6b343108c75
commit + 53d31efb90ba3586f0da6ef2d0cee999ba562b90
blob - 7c59c8704db10ae825fbbacf4672e7a63a4304aa
blob + bc1057b35cbcba3ce10c8beacb89bebb0fcd0c5f
--- pdf.1.mdoc
+++ pdf.1.mdoc
@@ -32,6 +32,11 @@ The generation number may be omitted to select the lat
 .Pp
 The options are as follows:
 .Bl -tag -width Ds
+.It Fl d Cm c
+Decode and parse the given stream object(s) as a single content stream.
+At least one
+.Ar oid
+argument is required.
 .It Fl d Cm s
 Dump the body data, after filter decoding, of a given stream object.
 An
blob - 4962b08bcf3907d45ae0db57127e5cf76c7e88d1
blob + 3bfea7f72603d25746377db94e34368405e44507
--- pdf.1.txt
+++ pdf.1.txt
@@ -4,19 +4,22 @@ NAME
      pdf - validation and inspection of PDF files
 
 SYNOPSIS
-     pdf [-qsv] [-d what] [-x txtfile] input.pdf [oid]
+     pdf [-qsv] [-d type] [-x txtfile] input.pdf [oid ...]
 
 DESCRIPTION
      The pdf utility attempts to parse and validate the given PDF file.	 It
      prints the resulting AST to standard output using a JSON format.
 
-     The optional oid argument selects a specific object to be printed instead
-     of the whole document.  It is expected to be of the form "n.g" where n
+     The optional oid arguments select specific objects to be printed instead
+     of the whole document.  Each is expected to be of the form "n.g" where n
      and g are object and generation numbers, respectively.  The generation
      number may be omitted to select the latest object matching n.
 
      The options are as follows:
 
+     -d c    Decode and parse the given stream object(s) as a single content
+	     stream.  At least one oid argument is required.
+
      -d s    Dump the body data, after filter decoding, of a given stream
 	     object.  An oid argument is required.
 
@@ -47,4 +50,4 @@ STANDARDS
      Document management -- Portable document format -- Part 2: PDF 2.0, ISO
      32000-2, 2020.
 
-				January 6, 2023
+				 July 2, 2023
blob - 047745e413075672abecbc5108fe1f15345288b6
blob + 90652212d83170dd293b8e3c8df517e6e34af290
--- pdf.c
+++ pdf.c
@@ -6,6 +6,7 @@
 #include <hammer/glue.h>
 #include <math.h>
 #include "pdf.h"
+#include "content.h"
 
 #ifdef LOG
 #define VIOL(P,VIOL)	h_action(h_sequence(P, h_tell(), NULL), act_viol, VIOL)
@@ -2518,6 +2519,8 @@ init_parser(struct Env *aux)
 	p_violsev = violsev;
 
 
+	init_content_parser();
+
 #if 0
 	// XXX testing
 	int r;
@@ -4772,25 +4775,67 @@ usage(void)
 	exit(2);
 }
 
-void
-dumpstream(FILE *f, const HParsedToken *obj)
+/* helper: return the body bytes held in a stream object; errx(2) otherwise */
+HBytes
+streamdata(const HParsedToken *obj)
 {
 	HParseResult *res;
-	HBytes data;
 
 	// XXX properly verify that obj is a stream (needs custom token type)
+	/* heuristic: 2-element sequence whose 2nd element is a parse result */
 	if (obj->token_type != TT_SEQUENCE || obj->seq->used != 2 ||
 	    obj->seq->elements[1]->token_type != TT_HParseResult)
 		errx(2, "%s: requested object is not a stream", infile);
 
+	/* the AST of that parse result is cast to bytes: the stream data */
 	res = H_INDEX(HParseResult, obj, 1);
 	assert(res != NULL);
 	assert(res->ast != NULL);
-	data = H_CAST_BYTES(res->ast);
+	return H_CAST_BYTES(res->ast);
+}
 
-	fwrite(data.token, 1, data.len, f);
+/* write the body data of stream object obj to f, byte for byte */
+void
+dumpstream(FILE *f, const HParsedToken *obj)
+{
+	const HBytes body = streamdata(obj);
+
+	fwrite(body.token, 1, body.len, f);
 }
 
+/* parse the NUL-joined bodies of obj[0..n-1] as one content stream */
+void
+dumpcstream(FILE *f, const HParsedToken **obj, size_t n)
+{
+	HSuspendedParser *p;
+	HParseResult *res;
+	HBytes b;
+	size_t i;	/* same type as n: no signed/unsigned comparison */
+
+	/* start the parse; NOTE(review): NULL presumably = no chunk support */
+	p = h_parse_start(p_cstream);
+	if (p == NULL)
+		errx(2, "%s: cannot start content stream parser", infile);
+
+	/* feed the concatenation of the streams into the parser */
+	for (i = 0; i < n; i++) {
+		b = streamdata(obj[i]);
+		if (i > 0)	/* separator '\0' between streams */
+			h_parse_chunk(p, (const uint8_t *)"", 1);
+		h_parse_chunk(p, b.token, b.len);
+	}
+
+	/* finish the parse; on failure complain (unless qflag) and exit 1 */
+	res = h_parse_finish(p);
+	if (res == NULL && !qflag)
+		fprintf(stderr, "%s: parse error in content stream\n", infile);
+	if (res == NULL)
+		exit(1);
+
+	if (!qflag)
+		h_pprintln(f, res->ast);
+}
+
 /*
  * This helper implements the standard backwards parsing strategy to read all
  * cross-reference sections and trailer dictionaries, starting from the
@@ -5054,7 +5099,9 @@ main(int argc, char *argv[])
 		
 	/* print desired output */
 	if (!qflag) {
-		if (dflag == 's')
+		if (dflag == 'c')
+			dumpcstream(stdout, obj, argc);
+		else if (dflag == 's')
 			for (i = 0; i < argc; i++)
 				dumpstream(stdout, obj[i]);
 		else if (obj != NULL)