Make the lexer pass whitespace through (as OTHER tokens) for text lines.

With this change, we can recreate the original text-line input exactly. Previously we were inserting a space between every pair of tokens, so our output had a lot more whitespace than our input. Now we can drop the "-w" option to diff and match the input exactly.
2010-05-25 15:04:32 -07:00
parent 808401fd79
commit 9fb8b7a495
3 changed files with 84 additions and 42 deletions
@@ -32,6 +32,21 @@
 %option reentrant noyywrap
 %option extra-type="glcpp_parser_t *"

+	/* This lexer has two states:
+	 *
+	 * The CONTROL state is for control lines (directives).
+	 * It lexes exactly as specified in the C99 specification.
+	 *
+	 * The INITIAL state is for text lines. In this state, we
+	 * make the OTHER token much broader in that it now
+	 * includes tokens consisting entirely of whitespace. This
+	 * allows us to pass text through verbatim. It avoids the
+	 * "inadvertent token pasting" problem that would occur if we
+	 * just printed tokens, while also avoiding excess whitespace
+	 * insertion in the output. */
+
+%x CONTROL
+
 SPACE		[[:space:]]
 NONSPACE	[^[:space:]]
 NEWLINE		[\n]
@@ -48,75 +63,104 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 %%

 {HASH}define{HSPACE}+/{IDENTIFIER}"(" {
+	BEGIN CONTROL;
 	return HASH_DEFINE_FUNC;
 }

 {HASH}define {
+	BEGIN CONTROL;
 	return HASH_DEFINE_OBJ;
 }

 {HASH}undef {
+	BEGIN CONTROL;
 	return HASH_UNDEF;
 }

 {HASH} {
+	BEGIN CONTROL;
 	return HASH;
 }

+<CONTROL>{IDENTIFIER} {
+	yylval.str = xtalloc_strdup (yyextra, yytext);
+	return IDENTIFIER;
+}
+
+<CONTROL>"<<"  {
+	return LEFT_SHIFT;
+}
+
+<CONTROL>">>" {
+	return RIGHT_SHIFT;
+}
+
+<CONTROL>"<=" {
+	return LESS_OR_EQUAL;
+}
+
+<CONTROL>">=" {
+	return GREATER_OR_EQUAL;
+}
+
+<CONTROL>"==" {
+	return EQUAL;
+}
+
+<CONTROL>"!=" {
+	return NOT_EQUAL;
+}
+
+<CONTROL>"&&" {
+	return AND;
+}
+
+<CONTROL>"||" {
+	return OR;
+}
+
+<CONTROL>"##" {
+	return PASTE;
+}
+
+<CONTROL>{PUNCTUATION} {
+	return yytext[0];
+}
+
+<CONTROL>{OTHER} {
+	yylval.str = xtalloc_strdup (yyextra, yytext);
+	return OTHER;
+}
+
+<CONTROL>{HSPACE}+
+
+<CONTROL>\n {
+	BEGIN INITIAL;
+	return NEWLINE;
+}
+
 {IDENTIFIER} {
 	yylval.str = xtalloc_strdup (yyextra, yytext);
 	return IDENTIFIER;
 }

-"<<"  {
-	return LEFT_SHIFT;
+{OTHER}+ {
+	yylval.str = xtalloc_strdup (yyextra, yytext);
+	return OTHER;
 }

-">>" {
-	return RIGHT_SHIFT;
-}
-
-"<=" {
-	return LESS_OR_EQUAL;
-}
-
-">=" {
-	return GREATER_OR_EQUAL;
-}
-
-"==" {
-	return EQUAL;
-}
-
-"!=" {
-	return NOT_EQUAL;
-}
-
-"&&" {
-	return AND;
-}
-
-"||" {
-	return OR;
-}
-
-"##" {
-	return PASTE;
-}
-
-{PUNCTUATION} {
-	return yytext[0];
+{HSPACE}+ {
+	yylval.str = xtalloc_strdup (yyextra, yytext);
+	return OTHER;
 }

 \n {
 	return NEWLINE;
 }

-{OTHER} {
+. {
 	yylval.str = xtalloc_strdup (yyextra, yytext);
 	return OTHER;
 }

-{HSPACE}+
-
 %%
@@ -517,8 +517,6 @@ _token_list_print (token_list_t *list)

 	for (node = list->head; node; node = node->next) {
 		_token_print (node->token);
-		if (node->next)
-			printf (" ");
 	}
 }

@@ -9,5 +9,5 @@ for test in *.c; do
    gcc -E $test -o $test.gcc
 #    grep -v '^#' < $test.gcc > $test.expected
    grep -v '^[ 	]*#' < $test > $test.expected
-    diff -w -u $test.expected $test.out
+    diff -u $test.expected $test.out
 done