commit ea6ac09d253bcf098c41163d460bbff0d9b17de6
from: Sven M. Hallberg <pesco@khjk.org>
date: Sat Jul  1 17:42:21 2023 UTC

unify loops in act_txtobj

Instead of counting the space that will be needed for the concatenated
string, use a dynamically grown buffer. We use malloc/realloc for the
growing buffer, because h_arena_realloc() is, at this time, very
inefficient. After the loop, we copy the whole string into the arena at
once and free the buffer.

commit - aded37ad86d0f32120068643147415c158458857
commit + ea6ac09d253bcf098c41163d460bbff0d9b17de6
blob - 17787b8e05991c59a17705620cb85583734743e5
blob + 0ca0fffc4a8c6e26b04f6196b65b35d3f0c1e12c
--- pdf.c
+++ pdf.c
@@ -1856,12 +1856,8 @@ act_txtobj(const HParseResult *p, void *u)
 	const HParsedToken *opstream = H_INDEX_TOKEN(p->ast, 1);
 	const HParsedToken *tt_text=NULL;
 	uint8_t            *tstr=NULL;
-	int                 textlen=0;
 	struct textmat      tm;
 	PtNode_T           *node = aux->curr_node;
-	double              cs = node->ts.char_spacing;
-	double              ws = node->ts.word_spacing;
-	double              ls = node->ts.line_spacing;
 	double             *px=&tm.cell[4];
 	double             *py=&tm.cell[5];
 
@@ -1879,18 +1875,26 @@ act_txtobj(const HParseResult *p, void *u)
 
 	// TODO:: Handle non-horizontal text
 
+	char *buf;
+	size_t len = 0, cap = 64;	/* characters used/allocated */
+	const char *app;		/* characters to append */
+	size_t napp;			/* number of characters to append */
+	int newline;			/* append a newline first? */
 
-	// Walk through the tokens to determine how much space to allocate
-	// Count the number of characters in the stream
-	// Concatenate the text into the allocated space
+	buf = malloc(cap);
+	assert(buf != NULL);		// XXX
+
+	// Walk through the tokens and concatenate the pieces
 	for (int i =0; i < opstream->seq->used; i++) {
-
 		txte = H_CAST(TextEntry, opstream->seq->elements[i]);
-
-		// make sure we are working on the same node as the current node
 		assert(txte != NULL);
 		assert(txte->node == node);
 
+		app = NULL;
+		napp = 0;
+		newline = 0;
+
+		// Process the text operators
 		switch (txte->type) {
 		// text state operators
 		case TS_Tc:
@@ -1910,7 +1914,7 @@ act_txtobj(const HParseResult *p, void *u)
 			break;
 
 		case TS_Tf:
-			node->ts.font = txte;
+			node->ts.font      = txte;
 			node->ts.font_size = txte->fref.fontsize;
 			break;
 
@@ -1918,22 +1922,18 @@ act_txtobj(const HParseResult *p, void *u)
 		// text positioning and showing operators
 		case TP_TD:
 			node->ts.line_spacing = txte->pos.ty;
+			/* fall through! */
 		case TP_Td:
 			if ( (*px == 0.0) && (*py == 0.0) ) { // initialize
 				*px = txte->pos.tx;
 				*py = txte->pos.ty;
-				// check to see if we are starting a new line
-				if ( (node->ts.curr_pos.ty != 0.0) &&
-				    (node->ts.curr_pos.ty != *py) ) {
-					textlen += 1; // add a newline
-				}
+				if ((node->ts.curr_pos.ty != 0.0) &&
+				    (node->ts.curr_pos.ty != *py) )
+					newline = 1;
 			} else {
 				if (txte->pos.ty != 0.0) {
-					// we are not rendering -- we just know
-					// it is not in the same line if y not
-					// equal
-					textlen += 1; // add a newline
-					*py -= txte->pos.ty; // should this be a +=??
+					*py -= txte->pos.ty;
+					newline = 1;
 				}
 				if (txte->pos.tx) { // handle x -- when should we add a space
 					// TODO:: handle x -- not sure .. for now, ignore
@@ -1941,26 +1941,32 @@ act_txtobj(const HParseResult *p, void *u)
 				}
 			}
 			break;
+
 		case TP_Tstar:
 			*py -= node->ts.line_spacing;
-			textlen += 1;
+			newline = 1;
 			break;
 
 
 		case TW_Tqq:
 			node->ts.word_spacing = txte->aw;
 			node->ts.char_spacing = txte->ac;
+			/* fall through! */
 		case TW_Tq:
 			*py -= node->ts.line_spacing;
-			textlen += 1;
+			newline = 1;
+			/* fall through! */
 		case TW_Tj:
-			textlen += txte->tstr.nchars;
+			app = txte->tstr.text;
+			napp = txte->tstr.nchars;
 			if (node->ts.font != NULL)
 				*px += txte->tstr.nchars * node->ts.font->fref.fontsize; // TODO:: handle character width from font description
 			break;
 
+
 		case TW_TJ:
-			textlen += txte->tarray.flattened.nchars;
+			app = txte->tarray.flattened.text;
+			napp = txte->tarray.flattened.nchars;
 			if (node->ts.font != NULL)
 				*px += txte->tarray.flattened.nchars * node->ts.font->fref.fontsize; // TODO:: handle character width from font description
 			break;
@@ -1968,8 +1974,30 @@ act_txtobj(const HParseResult *p, void *u)
 		default:
 			; // ignore
 		}
+
+		/* grow buf as required, always leave room for a newline */
+		assert(len <= SIZE_MAX - napp - 1);	// XXX
+		while (cap < len + napp + 1) {
+			assert(cap < SIZE_MAX / 3);	// XXX
+			cap = cap * 3 / 2;
+			buf = realloc(buf, cap);
+			assert(buf != NULL);		// XXX
+		}
+
+		/* append text to buf */
+		if (newline)
+			buf[len++] = '\n';
+		if (app != NULL) {
+			memcpy(buf + len, app, napp);
+			len += napp;
+		}
 	}
 
+	/* copy buf into arena-allocated tstr and free buf */
+	tstr = h_arena_malloc(p->arena, len);
+	memcpy(tstr, buf, len);
+	free(buf);
+
 	// Are we within the page bounds? If not, generate a warning
 	if ( (*px < 0) || (*px > aux->curr_node->mediaBox.tx) ) {
 		fprintf (stdout, "Final position of the text string is outside media box bounds.\n"
@@ -1982,109 +2010,14 @@ act_txtobj(const HParseResult *p, void *u)
 		    aux->curr_node->mediaBox.tx, *px);
 	}
 
-	// reset text state
-	*px = *py = 0.0;
-	node->ts.char_spacing = cs;
-	node->ts.word_spacing = ws;
-	node->ts.line_spacing = ls;
-
-	tstr = h_arena_malloc(p->arena, sizeof(uint8_t) * textlen);
-	int idx=0;
-	// Now concatenate the pieces
-	for (int i =0; i < opstream->seq->used; i++) {
-		txte = H_CAST(TextEntry, opstream->seq->elements[i]);
-		assert(txte != NULL);
-		assert(txte->node == node);
-
-		// Process the text operators
-		switch (txte->type) {
-		// text state operators
-		case TS_Tc:
-			node->ts.char_spacing = txte->value;
-			break;
-
-		case TS_Tw:
-			node->ts.word_spacing = txte->value;
-			break;
-
-		case TS_Tz:
-			node->ts.horiz_scaling = txte->value;
-			break;
-
-		case TS_TL:
-			node->ts.line_spacing = txte->value;
-			break;
-
-		case TS_Tf:
-			node->ts.font      = txte;
-			node->ts.font_size = txte->fref.fontsize;
-			break;
-
-
-		// text positioning and showing operators
-		case TP_TD:
-			node->ts.line_spacing = txte->pos.ty;
-		case TP_Td:
-			if ( (*px == 0.0) && (*py == 0.0) ) { // initialize
-				*px = txte->pos.tx;
-				*py = txte->pos.ty;
-				if ( (node->ts.curr_pos.ty != 0.0) &&
-				    (node->ts.curr_pos.ty != *py) ) {
-					tstr[idx] = '\n'; idx += 1;
-				}
-			} else {
-				if (txte->pos.ty != 0.0) {
-					tstr[idx] = '\n'; idx += 1;
-					*py -= txte->pos.ty; // should this be a +=??
-				}
-				if (txte->pos.tx) { // handle x -- when should we add a space
-					// TODO:: handle x -- not sure .. for now, ignore
-					*px += txte->pos.tx;
-				}
-			}
-			break;
-
-		case TP_Tstar:
-			tstr[idx] = '\n'; idx += 1;
-			*py -= node->ts.line_spacing;
-			break;
-
-
-		case TW_Tqq:
-			node->ts.word_spacing = txte->aw;
-			node->ts.char_spacing = txte->ac;
-		case TW_Tq:
-			*py -= node->ts.line_spacing;
-			tstr[idx] = '\n'; idx += 1;
-		case TW_Tj:
-			memcpy(&tstr[idx], txte->tstr.text, txte->tstr.nchars);
-			idx += txte->tstr.nchars;
-			if (node->ts.font != NULL)
-				*px += txte->tstr.nchars * node->ts.font->fref.fontsize; // TODO:: handle character width from font description
-			break;
-
-
-		case TW_TJ:
-			memcpy(&tstr[idx], txte->tarray.flattened.text, txte->tarray.flattened.nchars);
-			idx += txte->tarray.flattened.nchars;
-			if (node->ts.font != NULL)
-				*px += txte->tarray.flattened.nchars * node->ts.font->fref.fontsize; // TODO:: handle character width from font description
-			break;
-
-		default:
-			; // ignore
-		}
-	}
-	assert(idx == textlen);
-
 	// update the position on the page
 	node->ts.curr_pos.tx  = *px;
 	node->ts.curr_pos.ty  = *py;
 
 	txtobj->type         = TW_Tj;
 	txtobj->tstr.text    = tstr;
-	txtobj->tstr.nchars  = textlen;
-	if (textlen)
+	txtobj->tstr.nchars  = len;
+	if (len)
 		txtobj->node = node;
 	else
 		txtobj->node = NULL;