From 9fb8b7a495c9dc6f9a62cf82300fae5925af92fc Mon Sep 17 00:00:00 2001 From: Carl Worth Date: Tue, 25 May 2010 15:04:32 -0700 Subject: [PATCH] Make the lexer pass whitespace through (as OTHER tokens) for text lines. With this change, we can recreate the original text-line input exactly. Previously we were inserting a space between every pair of tokens so our output had a lot more whitespace than our input. With this change, we can drop the "-b" option to diff and match the input exactly. --- glcpp-lex.l | 122 ++++++++++++++++++++++++++++++++--------------- glcpp-parse.y | 2 - tests/glcpp-test | 2 +- 3 files changed, 84 insertions(+), 42 deletions(-) diff --git a/glcpp-lex.l b/glcpp-lex.l index f1dd11ea9bd..7b5cdd57a0f 100644 --- a/glcpp-lex.l +++ b/glcpp-lex.l @@ -32,6 +32,21 @@ %option reentrant noyywrap %option extra-type="glcpp_parser_t *" + /* This lexer has two states: + * + * The CONTROL state is for control lines (directives) + * It lexes exactly as specified in the C99 specification. + * + * The INITIAL state is for input lines. In this state, we + * make the OTHER token much more broad in that it now + * includes tokens consisting entirely of whitespace. This + * allows us to pass text through verbatim. It avoids the + * "inadvertent token pasting" problem that would occur if we + * just printed tokens, while also avoiding excess whitespace + * insertion in the output.*/ + +%x CONTROL + SPACE [[:space:]] NONSPACE [^[:space:]] NEWLINE [\n] @@ -48,75 +63,104 @@ HEXADECIMAL_INTEGER 0[xX][0-9a-fA-F]+[uU]? %% {HASH}define{HSPACE}+/{IDENTIFIER}"(" { + BEGIN CONTROL; return HASH_DEFINE_FUNC; } {HASH}define { + BEGIN CONTROL; return HASH_DEFINE_OBJ; } {HASH}undef { + BEGIN CONTROL; return HASH_UNDEF; } {HASH} { + BEGIN CONTROL; return HASH; } +{IDENTIFIER} { + yylval.str = xtalloc_strdup (yyextra, yytext); + return IDENTIFIER; +} + +"<<" { + return LEFT_SHIFT; +} + +">>" { + return RIGHT_SHIFT; +} + +"<=" { + return LESS_OR_EQUAL; +} + +">=" { + return GREATER_OR_EQUAL; +} + +"==" { + return EQUAL; +} + +"!=" { + return NOT_EQUAL; +} + +"&&" { + return AND; +} + +"||" { + return OR; +} + +"##" { + return PASTE; +} + +{PUNCTUATION} { + return yytext[0]; +} + +{OTHER} { + yylval.str = xtalloc_strdup (yyextra, yytext); + return OTHER; +} + +{HSPACE}+ + +\n { + BEGIN INITIAL; + return NEWLINE; +} + {IDENTIFIER} { yylval.str = xtalloc_strdup (yyextra, yytext); return IDENTIFIER; } -"<<" { - return LEFT_SHIFT; +{OTHER}+ { + yylval.str = xtalloc_strdup (yyextra, yytext); + return OTHER; } -">>" { - return RIGHT_SHIFT; -} - -"<=" { - return LESS_OR_EQUAL; -} - -">=" { - return GREATER_OR_EQUAL; -} - -"==" { - return EQUAL; -} - -"!=" { - return NOT_EQUAL; -} - -"&&" { - return AND; -} - -"||" { - return OR; -} - -"##" { - return PASTE; -} - -{PUNCTUATION} { - return yytext[0]; +{HSPACE}+ { + yylval.str = xtalloc_strdup (yyextra, yytext); + return OTHER; } \n { return NEWLINE; } -{OTHER} { +. { yylval.str = xtalloc_strdup (yyextra, yytext); return OTHER; } -{HSPACE}+ - %% diff --git a/glcpp-parse.y b/glcpp-parse.y index 991b8a0b856..957421b864e 100644 --- a/glcpp-parse.y +++ b/glcpp-parse.y @@ -517,8 +517,6 @@ _token_list_print (token_list_t *list) for (node = list->head; node; node = node->next) { _token_print (node->token); - if (node->next) - printf (" "); } } diff --git a/tests/glcpp-test b/tests/glcpp-test index 34cca883301..8074e471197 100755 --- a/tests/glcpp-test +++ b/tests/glcpp-test @@ -9,5 +9,5 @@ for test in *.c; do gcc -E $test -o $test.gcc # grep -v '^#' < $test.gcc > $test.expected grep -v '^[ ]*#' < $test > $test.expected - diff -w -u $test.expected $test.out + diff -u $test.expected $test.out done