Commit Diff


commit - 3b3bcd173d9adec4f238ec015acf9cd88c05845d
commit + 1d031337e6b891688dbcdacf72bb17e51487f5f6
blob - 23cf121c93cbf2f07d6ddd74150de47f6180baf8
blob + 3c3a30c92198cfb34695c84edb65d1da7da676ea
--- pdf.c
+++ pdf.c
@@ -327,7 +327,7 @@ act_nesc(const HParseResult *p, void *u)
 	return H_MAKE_UINT(H_FIELD_UINT(1)*16 + H_FIELD_UINT(2));
 }
 
-#define act_schars h_act_flatten
+#define act_str_ h_act_flatten
 #define act_string act_token
 
 HParsedToken *
@@ -341,6 +341,9 @@ act_octal(const HParseResult *p, void *u)
 
 	return H_MAKE_UINT(x);
 }
+#define act_oct3 act_octal
+#define act_oct2 act_octal
+#define act_oct1 act_octal
 
 HParsedToken *
 act_xrent(const HParseResult *p, void *u)
@@ -494,7 +497,6 @@ init_parser(struct Env *aux)
 	//H_RULE(dchar,	IN(DCHARS));			/* delimiter */
 	H_RULE(rchar,	NOT_IN(WCHARS DCHARS));		/* regular */
 	H_RULE(nchar,	NOT_IN(WCHARS DCHARS "#"));	/* name */
-	H_RULE(schar,	NOT_IN("()\n\r\\"));		/* string literal */
 	H_ARULE(digit,	h_ch_range('0', '9'));
 	H_ARULE(pdigit,	h_ch_range('1', '9'));
 	H_ARULE(hlower,	h_ch_range('a', 'f'));
@@ -549,10 +551,16 @@ init_parser(struct Env *aux)
 	/* numbers */
 	H_ARULE(sign,	CHX(minus, IGN(plus)));
 	H_VRULE(intnn,	nat);
+	#if 1
 	H_ARULE(realnn,	CHX(SEQ(digits, period, digits),	/* 12.3 */
 			    SEQ(digits, period, empty),		/* 123. */
 			    SEQ(empty, period, digits)));	/* .123 */
 		// XXX ^ we _could_ move the "123." case into intnn...
+	#else
+	// XXX the .123 case above somehow leads to a conflict with litstr...
+	H_ARULE(realnn,	CHX(SEQ(digits, period, digits),	/* 12.3 */
+			    SEQ(digits, period, empty)));	/* 123. */
+	#endif
 	H_RULE(numbnn,	CHX(realnn, intnn));
 	H_RULE(snumb,	SEQ(sign, numbnn));
 	H_ARULE(numb,	CHX(snumb, numbnn));
@@ -562,24 +570,59 @@ init_parser(struct Env *aux)
 	H_ARULE(nstr,	h_many(CHX(nchar, nesc)));	/* '/' is valid */
 	H_RULE(name,	h_right(slash, nstr));
 
-	/* strings */
-	H_RULE(snest,	h_indirect());
+	/* strings
+	 *
+	 * this is so convoluted in order to make it LALR including the
+	 * precedence rules for octal escapes ("\123" vs "\12 3" vs "\1 23")
+	 * and end-of-line ("CRLF" vs "CR LF").
+	 *
+	 * we have to split the base rule 'str' into variants that must not
+	 * start with an octal digit ('str_o'), with a linefeed ('str_l'), or
+	 * with either of the two ('str_ol').
+	 */
+	H_RULE(str_ol,	h_indirect());
+	H_RULE(str_o,	h_indirect());
+	H_RULE(str_l,	h_indirect());
+	H_RULE(str,	h_indirect());
 	H_RULE(bsn,	p_mapch('n', 0x0a));	/* LF */
 	H_RULE(bsr,	p_mapch('r', 0x0d));	/* CR */
 	H_RULE(bst,	p_mapch('t', 0x09));	/* HT */
 	H_RULE(bsb,	p_mapch('b', 0x08));	/* BS (backspace) */
 	H_RULE(bsf,	p_mapch('f', 0x0c));	/* FF */
 	H_RULE(escape,	CHX(bsn, bsr, bst, bsb, bsf, lparen, rparen, bslash));
-	H_ARULE(octal,	CHX(REP(odigit,3), REP(odigit,2), REP(odigit,1)));
-	H_RULE(wrap,	IGN(eol));
-	H_RULE(sesc,	h_right(bslash, CHX(escape, octal, wrap, epsilon)));
-		/* NB: lone backslashes and escaped newlines are ignored */
-	H_ARULE(schars,	h_many(CHX(schar, snest, sesc, eol)));
-	H_RULE(snest_,	SEQ(lparen, schars, rparen));
-	H_RULE(litstr,	h_middle(lparen, schars, rparen));
+	H_ARULE(oct3,	REP(odigit,3));
+	H_ARULE(oct2,	REP(odigit,2));
+	H_ARULE(oct1,	REP(odigit,1));
+	H_RULE(octesc,	CHX(SEQ(oct3, str),
+			    SEQ(oct2, str_o),
+			    SEQ(oct1, str_o)));
+	H_RULE(eolesc,	CHX(SEQ(IGN(crlf), str),
+			    SEQ(IGN(cr), str_l),
+			    SEQ(IGN(lf), str)));
+	H_RULE(schar_o,	NOT_IN("()\n\r\\" "01234567"));
+	H_RULE(schar_e,	NOT_IN("()\n\r\\" "01234567" "nrtbf"));
+	H_RULE(str_o_,	CHX(SEQ(lf, str), str_ol));	/* str "but not" odigit */
+	H_RULE(str_l_,	CHX(SEQ(odigit, str), str_ol));	/* str "but not" lf */
+	H_RULE(str_ol_,	CHX(SEQ(cr, str_l),		/* str "but neither" */
+			    SEQ(crlf, str),
+			    SEQ(schar_o, str),
+			    SEQ(lparen, str, rparen, str),
+			    SEQ(IGN(bslash), escape, str),
+			    SEQ(IGN(bslash), schar_e, str),	/* "lone" bs */
+		/* NB: ^ lone backslashes are to be ignored per spec. we get
+		 * the same result by letting them "escape" the character that
+		 * follows, which works because they are never truly alone. */
+			    SEQ(IGN(bslash), octesc),
+			    SEQ(IGN(bslash), eolesc),		/* line split */
+			    epsilon));
+	H_ARULE(str_,	CHX(SEQ(lf, str), SEQ(odigit, str), str_ol));
+	H_RULE(litstr,	h_middle(lparen, str, rparen));
 	H_RULE(hexstr,	h_middle(langle, MANY_WS(hdigit), rangle));
 	H_ARULE(string,	CHX(litstr, hexstr));
-	h_bind_indirect(snest, snest_);
+	h_bind_indirect(str_ol,	str_ol_);
+	h_bind_indirect(str_o,	str_o_);
+	h_bind_indirect(str_l,	str_l_);
+	h_bind_indirect(str,	str_);
 
 	H_RULE(array,	h_indirect());
 	H_RULE(dict,	h_indirect());
@@ -687,6 +730,16 @@ init_parser(struct Env *aux)
 	p_epsilon = epsilon;
 	p_return_0 = h_action(epsilon, act_return_uint, (void *)0);
 	p_return_1 = h_action(epsilon, act_return_uint, (void *)1);
+
+#if 0
+	// XXX testing
+	int r;
+	void errx(int, const char *, ...);
+	HParser *p = obj;
+	if ((r = h_compile(p, PB_LALR, NULL)) != 0)
+		errx(1, "h_compile() failed: %d", r);
+	errx(0, "OK");
+#endif
 }