diff --git a/src/Lexer.cpp b/src/Lexer.cpp
index 992eb7a1c..338f12b5e 100644
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -294,6 +294,27 @@ bool Lexer::isBoundary (int left, int right)
   return false;
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Returns true when a word must end between 'left' and 'right' even though no
+// whitespace separates them:
+//   - 'right' is the terminating NUL (end of string), or
+//   - either character is a parenthesis.
+bool Lexer::isHardBoundary (int left, int right)
+{
+  // EOS
+  if (right == '\0')
+    return true;
+
+  // FILTER operators that don't need to be surrounded by whitespace.
+  if (left == '(' ||
+      left == ')' ||
+      right == '(' ||
+      right == ')')
+    return true;
+
+  return false;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 bool Lexer::isPunctuation (int c)
 {
@@ -1230,6 +1251,11 @@ bool Lexer::readWord (
 //   abcU+0020def
 //   abc\u0020def
 //   a\tb
+//
+// Ends at:
+//   Lexer::isEOS
+//   Lexer::isWhitespace
+//   Lexer::isHardBoundary
 bool Lexer::readWord (
   const std::string& text,
   std::string::size_type& cursor,
@@ -1239,12 +1265,17 @@ bool Lexer::readWord (
 
   word = "";
   int c;
-  while ((c = text[cursor]))
+  int prev = 0;               // Value of 'c' from the previous pass; 0 on the first.
+  while ((c = text[cursor]))  // Handles EOS.
   {
     // Unquoted word ends on white space.
     if (Lexer::isWhitespace (c))
       break;
 
+    // Hard boundary (parentheses mostly); not tested before the first character.
+    else if (prev && Lexer::isHardBoundary (prev, c))
+      break;
+
     // Unicode U+XXXX or \uXXXX codepoint.
     else if (eos - cursor >= 6 &&
              ((text[cursor + 0] == 'U' && text[cursor + 1] == '+') ||
@@ -1290,6 +1321,8 @@ bool Lexer::readWord (
     // Ordinary character.
     else
       word += utf8_character (utf8_next_char (text, cursor));
+
+    prev = c;
   }
 
   return word.length () > 0 ? true : false;
diff --git a/src/Lexer.h b/src/Lexer.h
index d2f04b435..7f96bda00 100644
--- a/src/Lexer.h
+++ b/src/Lexer.h
@@ -70,6 +70,7 @@ public:
   static bool isDoubleCharOperator (int, int, int);
   static bool isTripleCharOperator (int, int, int, int);
   static bool isBoundary (int, int);
+  static bool isHardBoundary (int, int);
   static bool isPunctuation (int);
   static bool isAllDigits (const std::string&);
   static bool isOneWord (const std::string&);