diff --git a/src/Lexer.cpp b/src/Lexer.cpp index 9365f2344..9b399fec7 100644 --- a/src/Lexer.cpp +++ b/src/Lexer.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include static const std::string uuid_pattern = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"; @@ -59,7 +60,7 @@ Lexer::~Lexer () bool Lexer::token (std::string& token, Lexer::Type& type) { // Eat white space. - while (isWhitespace (_text[_cursor])) + while (unicodeWhitespace (_text[_cursor])) utf8_next_char (_text, _cursor); // Terminate at EOS. @@ -142,48 +143,6 @@ const std::string Lexer::typeName (const Lexer::Type& type) return "unknown"; } -//////////////////////////////////////////////////////////////////////////////// -// Complete Unicode whitespace list. -// -// http://en.wikipedia.org/wiki/Whitespace_character -// Updated 2015-09-13 -// Static -// -// TODO This list should be derived from the Unicode database. -bool Lexer::isWhitespace (int c) -{ - return (c == 0x0020 || // space Common Separator, space - c == 0x0009 || // Common Other, control HT, Horizontal Tab - c == 0x000A || // Common Other, control LF, Line feed - c == 0x000B || // Common Other, control VT, Vertical Tab - c == 0x000C || // Common Other, control FF, Form feed - c == 0x000D || // Common Other, control CR, Carriage return - c == 0x0085 || // Common Other, control NEL, Next line - c == 0x00A0 || // no-break space Common Separator, space - c == 0x1680 || // ogham space mark Ogham Separator, space - c == 0x180E || // mongolian vowel separator Mongolian Separator, space - c == 0x2000 || // en quad Common Separator, space - c == 0x2001 || // em quad Common Separator, space - c == 0x2002 || // en space Common Separator, space - c == 0x2003 || // em space Common Separator, space - c == 0x2004 || // three-per-em space Common Separator, space - c == 0x2005 || // four-per-em space Common Separator, space - c == 0x2006 || // six-per-em space Common Separator, space - c == 0x2007 || // figure space Common Separator, space - c == 0x2008 || // punctuation space Common Separator, space - c == 0x2009 || // thin space Common Separator, space - c == 0x200A || // hair space Common Separator, space - c == 0x200B || // zero width space - c == 0x200C || // zero width non-joiner - c == 0x200D || // zero width joiner - c == 0x2028 || // line separator Common Separator, line - c == 0x2029 || // paragraph separator Common Separator, paragraph - c == 0x202F || // narrow no-break space Common Separator, space - c == 0x205F || // medium mathematical space Common Separator, space - c == 0x2060 || // word joiner - c == 0x3000); // ideographic space Common Separator, space -} - //////////////////////////////////////////////////////////////////////////////// bool Lexer::isAlpha (int c) { @@ -213,7 +172,7 @@ bool Lexer::isHexDigit (int c) bool Lexer::isIdentifierStart (int c) { return c && // Include null character check. - ! isWhitespace (c) && + ! unicodeWhitespace (c) && ! isDigit (c) && ! isSingleCharOperator (c) && ! isPunctuation (c); @@ -225,7 +184,7 @@ bool Lexer::isIdentifierNext (int c) return c && // Include null character check. c != ':' && // Used in isPair. c != '=' && // Used in isPair. - ! isWhitespace (c) && + ! unicodeWhitespace (c) && ! isSingleCharOperator (c); } @@ -272,15 +231,15 @@ bool Lexer::isTripleCharOperator (int c0, int c1, int c2, int c3) bool Lexer::isBoundary (int left, int right) { // EOS - if (right == '\0') return true; + if (right == '\0') return true; // XOR - if (isAlpha (left) != isAlpha (right)) return true; - if (isDigit (left) != isDigit (right)) return true; - if (isWhitespace (left) != isWhitespace (right)) return true; + if (isAlpha (left) != isAlpha (right)) return true; + if (isDigit (left) != isDigit (right)) return true; + if (unicodeWhitespace (left) != unicodeWhitespace (right)) return true; // OR - if (isPunctuation (left) || isPunctuation (right)) return true; + if (isPunctuation (left) || isPunctuation (right)) return true; return false; } @@ -289,7 +248,8 @@ bool Lexer::isBoundary (int left, int right) bool Lexer::isHardBoundary (int left, int right) { // EOS - if (right == '\0') return true; + if (right == '\0') + return true; // FILTER operators that don't need to be surrounded by whitespace. if (left == '(' || @@ -628,10 +588,10 @@ bool Lexer::isUUID (std::string& token, Lexer::Type& type, bool endBoundary) break; } - if (i >= uuid_min_length && - (! endBoundary || - ! _text[marker + i] || - isWhitespace (_text[marker + i]) || + if (i >= uuid_min_length && + (! endBoundary || + ! _text[marker + i] || + unicodeWhitespace (_text[marker + i]) || isSingleCharOperator (_text[marker + i]))) { token = _text.substr (_cursor, i); @@ -726,10 +686,10 @@ bool Lexer::isNumber (std::string& token, Lexer::Type& type) } } - // Lookahead: ! | ! + // Lookahead: ! | ! // If there is an immediately consecutive character, that is not an operator, fail. if (_eos > marker && - ! isWhitespace (_text[marker]) && + ! unicodeWhitespace (_text[marker]) && ! isSingleCharOperator (_text[marker])) return false; @@ -806,7 +766,7 @@ bool Lexer::isURL (std::string& token, Lexer::Type& type) marker += 3; while (marker < _eos && - ! isWhitespace (_text[marker])) + ! unicodeWhitespace (_text[marker])) utf8_next_char (_text, marker); token = _text.substr (_cursor, marker - _cursor); @@ -847,7 +807,7 @@ bool Lexer::isPair (std::string& token, Lexer::Type& type) if (readWord (_text, "'\"", _cursor, ignoredToken) || readWord (_text, _cursor, ignoredToken) || isEOS () || - isWhitespace (_text[_cursor])) + unicodeWhitespace (_text[_cursor])) { token = _text.substr (marker, _cursor - marker); type = Lexer::Type::pair; @@ -901,7 +861,7 @@ bool Lexer::isSet (std::string& token, Lexer::Type& type) // Success is multiple numbers, matching the pattern. if (count > 1 && (isEOS () || - isWhitespace (_text[_cursor]) || + unicodeWhitespace (_text[_cursor]) || isHardBoundary (_text[_cursor], _text[_cursor + 1]))) { token = _text.substr (marker, _cursor - marker); @@ -916,7 +876,7 @@ bool Lexer::isSet (std::string& token, Lexer::Type& type) //////////////////////////////////////////////////////////////////////////////// // Lexer::Type::tag -// ^ | '(' | ')' | +// ^ | '(' | ')' | // [ +|- ] [ ]* bool Lexer::isTag (std::string& token, Lexer::Type& type) { @@ -924,7 +884,7 @@ bool Lexer::isTag (std::string& token, Lexer::Type& type) // Lookbehind: Assert ^ or preceded by whitespace, (, or ). if (marker > 0 && - ! isWhitespace (_text[marker - 1]) && + ! unicodeWhitespace (_text[marker - 1]) && _text[marker - 1] != '(' && _text[marker - 1] != ')') return false; @@ -970,12 +930,12 @@ bool Lexer::isPath (std::string& token, Lexer::Type& type) break; if (_text[marker] && - ! isWhitespace (_text[marker]) && + ! unicodeWhitespace (_text[marker]) && _text[marker] != '/') { utf8_next_char (_text, marker); while (_text[marker] && - ! isWhitespace (_text[marker]) && + ! unicodeWhitespace (_text[marker]) && _text[marker] != '/') utf8_next_char (_text, marker); } @@ -997,7 +957,7 @@ bool Lexer::isPath (std::string& token, Lexer::Type& type) //////////////////////////////////////////////////////////////////////////////// // Lexer::Type::substitution -// / / / [g] | +// / / / [g] | bool Lexer::isSubstitution (std::string& token, Lexer::Type& type) { std::size_t marker = _cursor; @@ -1012,9 +972,9 @@ bool Lexer::isSubstitution (std::string& token, Lexer::Type& type) if (_text[_cursor] == 'g') ++_cursor; - // Lookahread: | + // Lookahread: | if (_text[_cursor] == '\0' || - isWhitespace (_text[_cursor])) + unicodeWhitespace (_text[_cursor])) { token = _text.substr (marker, _cursor - marker); type = Lexer::Type::substitution; @@ -1029,7 +989,7 @@ bool Lexer::isSubstitution (std::string& token, Lexer::Type& type) //////////////////////////////////////////////////////////////////////////////// // Lexer::Type::pattern -// / / | +// / / | bool Lexer::isPattern (std::string& token, Lexer::Type& type) { std::size_t marker = _cursor; @@ -1037,7 +997,7 @@ bool Lexer::isPattern (std::string& token, Lexer::Type& type) std::string word; if (readWord (_text, "/", _cursor, word) && (isEOS () || - isWhitespace (_text[_cursor]))) + unicodeWhitespace (_text[_cursor]))) { token = _text.substr (marker, _cursor - marker); type = Lexer::Type::pattern; @@ -1357,7 +1317,7 @@ bool Lexer::isWord (std::string& token, Lexer::Type& type) std::size_t marker = _cursor; while (_text[marker] && - ! isWhitespace (_text[marker]) && + ! unicodeWhitespace (_text[marker]) && ! isSingleCharOperator (_text[marker])) utf8_next_char (_text, marker); @@ -1393,7 +1353,7 @@ bool Lexer::isLiteral ( // End boundary conditions must be met. if (endBoundary && _text[_cursor + common] && - ! Lexer::isWhitespace (_text[_cursor + common]) && + ! unicodeWhitespace (_text[_cursor + common]) && ! Lexer::isSingleCharOperator (_text[_cursor + common])) return false; @@ -1567,7 +1527,7 @@ bool Lexer::readWord ( // // Ends at: // Lexer::isEOS -// Lexer::isWhitespace +// unicodeWhitespace // Lexer::isHardBoundary bool Lexer::readWord ( const std::string& text, @@ -1582,7 +1542,7 @@ bool Lexer::readWord ( while ((c = text[cursor])) // Handles EOS. { // Unquoted word ends on white space. - if (Lexer::isWhitespace (c)) + if (unicodeWhitespace (c)) break; // Parentheses mostly. diff --git a/src/Lexer.h b/src/Lexer.h index 410843335..fe8b80e59 100644 --- a/src/Lexer.h +++ b/src/Lexer.h @@ -61,7 +61,6 @@ public: // Static helpers. static const std::string typeName (const Lexer::Type&); - static bool isWhitespace (int); static bool isAlpha (int); static bool isDigit (int); static bool isHexDigit (int); diff --git a/src/util.cpp b/src/util.cpp index b5aa15873..1f9222cb8 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include @@ -268,7 +269,7 @@ bool nontrivial (const std::string& input) std::string::size_type i = 0; int character; while ((character = utf8_next_char (input, i))) - if (! Lexer::isWhitespace (character)) + if (! unicodeWhitespace (character)) return true; return false; diff --git a/test/lexer.t.cpp b/test/lexer.t.cpp index cc0c2747d..e16d12976 100644 --- a/test/lexer.t.cpp +++ b/test/lexer.t.cpp @@ -37,9 +37,9 @@ int main (int, char**) { #ifdef PRODUCT_TASKWARRIOR - UnitTest t (1280); + UnitTest t (1253); #else - UnitTest t (1262); + UnitTest t (1235); #endif // Use same Datetime/Duraiton configuration as Context∴:staticInitialization. @@ -58,35 +58,6 @@ int main (int, char**) Lexer::attributes["tags"] = "string"; Lexer::attributes["description"] = "string"; - // White space detection. - t.notok (Lexer::isWhitespace (0x0041), "U+0041 (A) ! isWhitespace"); - t.ok (Lexer::isWhitespace (0x0020), "U+0020 isWhitespace"); - t.ok (Lexer::isWhitespace (0x0009), "U+0009 isWhitespace"); - t.ok (Lexer::isWhitespace (0x000A), "U+000A isWhitespace"); - t.ok (Lexer::isWhitespace (0x000B), "U+000B isWhitespace"); - t.ok (Lexer::isWhitespace (0x000C), "U+000C isWhitespace"); - t.ok (Lexer::isWhitespace (0x000D), "U+000D isWhitespace"); - t.ok (Lexer::isWhitespace (0x0085), "U+0085 isWhitespace"); - t.ok (Lexer::isWhitespace (0x00A0), "U+00A0 isWhitespace"); - t.ok (Lexer::isWhitespace (0x1680), "U+1680 isWhitespace"); // 10 - t.ok (Lexer::isWhitespace (0x180E), "U+180E isWhitespace"); - t.ok (Lexer::isWhitespace (0x2000), "U+2000 isWhitespace"); - t.ok (Lexer::isWhitespace (0x2001), "U+2001 isWhitespace"); - t.ok (Lexer::isWhitespace (0x2002), "U+2002 isWhitespace"); - t.ok (Lexer::isWhitespace (0x2003), "U+2003 isWhitespace"); - t.ok (Lexer::isWhitespace (0x2004), "U+2004 isWhitespace"); - t.ok (Lexer::isWhitespace (0x2005), "U+2005 isWhitespace"); - t.ok (Lexer::isWhitespace (0x2006), "U+2006 isWhitespace"); - t.ok (Lexer::isWhitespace (0x2007), "U+2007 isWhitespace"); - t.ok (Lexer::isWhitespace (0x2008), "U+2008 isWhitespace"); // 20 - t.ok (Lexer::isWhitespace (0x2009), "U+2009 isWhitespace"); - t.ok (Lexer::isWhitespace (0x200A), "U+200A isWhitespace"); - t.ok (Lexer::isWhitespace (0x2028), "U+2028 isWhitespace"); - t.ok (Lexer::isWhitespace (0x2029), "U+2029 isWhitespace"); - t.ok (Lexer::isWhitespace (0x202F), "U+202F isWhitespace"); - t.ok (Lexer::isWhitespace (0x205F), "U+205F isWhitespace"); - t.ok (Lexer::isWhitespace (0x3000), "U+3000 isWhitespace"); - // static bool Lexer::isBoundary (int, int); t.ok (Lexer::isBoundary (' ', 'a'), "' ' --> 'a' = isBoundary"); t.ok (Lexer::isBoundary ('a', ' '), "'a' --> ' ' = isBoundary");