commit ea6ac09d253bcf098c41163d460bbff0d9b17de6 from: Sven M. Hallberg date: Sat Jul 01 17:42:21 2023 UTC unify loops in act_txtobj Instead of counting the space that will be needed for the concatenated string, use a dynamically-grown buffer. We use malloc/realloc for the growing buffer, because h_arena_realloc() is, at this time, very inefficient. After the loop, we copy the whole string into the arena at once and free the buffer. commit - aded37ad86d0f32120068643147415c158458857 commit + ea6ac09d253bcf098c41163d460bbff0d9b17de6 blob - 17787b8e05991c59a17705620cb85583734743e5 blob + 0ca0fffc4a8c6e26b04f6196b65b35d3f0c1e12c --- pdf.c +++ pdf.c @@ -1856,12 +1856,8 @@ act_txtobj(const HParseResult *p, void *u) const HParsedToken *opstream = H_INDEX_TOKEN(p->ast, 1); const HParsedToken *tt_text=NULL; uint8_t *tstr=NULL; - int textlen=0; struct textmat tm; PtNode_T *node = aux->curr_node; - double cs = node->ts.char_spacing; - double ws = node->ts.word_spacing; - double ls = node->ts.line_spacing; double *px=&tm.cell[4]; double *py=&tm.cell[5]; @@ -1879,18 +1875,26 @@ act_txtobj(const HParseResult *p, void *u) // TODO:: Handle non-horizontal text + char *buf; + size_t len = 0, cap = 64; /* characters used/allocated */ + const char *app; /* characters to append */ + size_t napp; /* number of characters to append */ + int newline; /* append a newline first? */ - // Walk through the tokens to determine how much space to allocate - // Count the number of characters in the stream - // Concatenate the text into the allocated space - for (int i =0; i < opstream->seq->used; i++) { + buf = malloc(cap); + assert(buf != NULL); // XXX + // Walk through the tokens and concatenate the pieces + for (int i =0; i < opstream->seq->used; i++) { txte = H_CAST(TextEntry, opstream->seq->elements[i]); - - // make sure we are working on the same node as the current node assert(txte != NULL); assert(txte->node == node); + app = NULL; + napp = 0; + newline = 0; + + // Process the text operators switch (txte->type) { // text state operators case TS_Tc: @@ -1910,7 +1914,7 @@ act_txtobj(const HParseResult *p, void *u) break; case TS_Tf: - node->ts.font = txte; + node->ts.font = txte; node->ts.font_size = txte->fref.fontsize; break; @@ -1918,22 +1922,18 @@ act_txtobj(const HParseResult *p, void *u) // text positioning and showing operators case TP_TD: node->ts.line_spacing = txte->pos.ty; + /* fall through! */ case TP_Td: if ( (*px == 0.0) && (*py == 0.0) ) { // initialize *px = txte->pos.tx; *py = txte->pos.ty; - // check to see if we are starting a new line - if ( (node->ts.curr_pos.ty != 0.0) && - (node->ts.curr_pos.ty != *py) ) { - textlen += 1; // add a newline - } + if ((node->ts.curr_pos.ty != 0.0) && + (node->ts.curr_pos.ty != *py) ) + newline = 1; } else { if (txte->pos.ty != 0.0) { - // we are not rendering -- we just know - // it is not in the same line if y not - // equal - textlen += 1; // add a newline - *py -= txte->pos.ty; // should this be a +=?? + *py -= txte->pos.ty; + newline = 1; } if (txte->pos.tx) { // handle x -- when should we add a space // TODO:: handle x -- not sure .. for now, ignore @@ -1941,26 +1941,32 @@ act_txtobj(const HParseResult *p, void *u) } } break; + case TP_Tstar: *py -= node->ts.line_spacing; - textlen += 1; + newline = 1; break; case TW_Tqq: node->ts.word_spacing = txte->aw; node->ts.char_spacing = txte->ac; + /* fall through! */ case TW_Tq: *py -= node->ts.line_spacing; - textlen += 1; + newline = 1; + /* fall through! */ case TW_Tj: - textlen += txte->tstr.nchars; + app = txte->tstr.text; + napp = txte->tstr.nchars; if (node->ts.font != NULL) *px += txte->tstr.nchars * node->ts.font->fref.fontsize; // TODO:: handle character width from font description break; + case TW_TJ: - textlen += txte->tarray.flattened.nchars; + app = txte->tarray.flattened.text; + napp = txte->tarray.flattened.nchars; if (node->ts.font != NULL) *px += txte->tarray.flattened.nchars * node->ts.font->fref.fontsize; // TODO:: handle character width from font description break; @@ -1968,8 +1974,30 @@ act_txtobj(const HParseResult *p, void *u) default: ; // ignore } + + /* grow buf as required, always leave room for a newline */ + assert(len <= SIZE_MAX - napp - 1); // XXX + while (cap < len + napp + 1) { + assert(cap < SIZE_MAX / 3); // XXX + cap = cap * 3 / 2; + buf = realloc(buf, cap); + assert(buf != NULL); // XXX + } + + /* append text to buf */ + if (newline) + buf[len++] = '\n'; + if (app != NULL) { + memcpy(buf + len, app, napp); + len += napp; + } } + /* copy buf into arena-allocated tstr and free buf */ + tstr = h_arena_malloc(p->arena, len); + memcpy(tstr, buf, len); + free(buf); + // Are we within the page bounds? If not, generate a warning if ( (*px < 0) || (*px > aux->curr_node->mediaBox.tx) ) { fprintf (stdout, "Final position of the text string is outside media box bounds.\n" @@ -1982,109 +2010,14 @@ act_txtobj(const HParseResult *p, void *u) aux->curr_node->mediaBox.tx, *px); } - // reset text state - *px = *py = 0.0; - node->ts.char_spacing = cs; - node->ts.word_spacing = ws; - node->ts.line_spacing = ls; - - tstr = h_arena_malloc(p->arena, sizeof(uint8_t) * textlen); - int idx=0; - // Now concatenate the pieces - for (int i =0; i < opstream->seq->used; i++) { - txte = H_CAST(TextEntry, opstream->seq->elements[i]); - assert(txte != NULL); - assert(txte->node == node); - - // Process the text operators - switch (txte->type) { - // text state operators - case TS_Tc: - node->ts.char_spacing = txte->value; - break; - - case TS_Tw: - node->ts.word_spacing = txte->value; - break; - - case TS_Tz: - node->ts.horiz_scaling = txte->value; - break; - - case TS_TL: - node->ts.line_spacing = txte->value; - break; - - case TS_Tf: - node->ts.font = txte; - node->ts.font_size = txte->fref.fontsize; - break; - - - // text positioning and showing operators - case TP_TD: - node->ts.line_spacing = txte->pos.ty; - case TP_Td: - if ( (*px == 0.0) && (*py == 0.0) ) { // initialize - *px = txte->pos.tx; - *py = txte->pos.ty; - if ( (node->ts.curr_pos.ty != 0.0) && - (node->ts.curr_pos.ty != *py) ) { - tstr[idx] = '\n'; idx += 1; - } - } else { - if (txte->pos.ty != 0.0) { - tstr[idx] = '\n'; idx += 1; - *py -= txte->pos.ty; // should this be a +=?? - } - if (txte->pos.tx) { // handle x -- when should we add a space - // TODO:: handle x -- not sure .. for now, ignore - *px += txte->pos.tx; - } - } - break; - - case TP_Tstar: - tstr[idx] = '\n'; idx += 1; - *py -= node->ts.line_spacing; - break; - - - case TW_Tqq: - node->ts.word_spacing = txte->aw; - node->ts.char_spacing = txte->ac; - case TW_Tq: - *py -= node->ts.line_spacing; - tstr[idx] = '\n'; idx += 1; - case TW_Tj: - memcpy(&tstr[idx], txte->tstr.text, txte->tstr.nchars); - idx += txte->tstr.nchars; - if (node->ts.font != NULL) - *px += txte->tstr.nchars * node->ts.font->fref.fontsize; // TODO:: handle character width from font description - break; - - - case TW_TJ: - memcpy(&tstr[idx], txte->tarray.flattened.text, txte->tarray.flattened.nchars); - idx += txte->tarray.flattened.nchars; - if (node->ts.font != NULL) - *px += txte->tarray.flattened.nchars * node->ts.font->fref.fontsize; // TODO:: handle character width from font description - break; - - default: - ; // ignore - } - } - assert(idx == textlen); - // update the position on the page node->ts.curr_pos.tx = *px; node->ts.curr_pos.ty = *py; txtobj->type = TW_Tj; txtobj->tstr.text = tstr; - txtobj->tstr.nchars = textlen; - if (textlen) + txtobj->tstr.nchars = len; + if (len) txtobj->node = node; else txtobj->node = NULL;