Lexer: Migrated to unicodeWhitespace

This commit is contained in:
Paul Beckingham 2018-01-25 00:47:23 -05:00
parent 2c89688b46
commit 49dedfbc86
4 changed files with 37 additions and 106 deletions

View file

@ -30,6 +30,7 @@
#include <ctype.h> #include <ctype.h>
#include <Datetime.h> #include <Datetime.h>
#include <Duration.h> #include <Duration.h>
#include <unicode.h>
#include <utf8.h> #include <utf8.h>
static const std::string uuid_pattern = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"; static const std::string uuid_pattern = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx";
@ -59,7 +60,7 @@ Lexer::~Lexer ()
bool Lexer::token (std::string& token, Lexer::Type& type) bool Lexer::token (std::string& token, Lexer::Type& type)
{ {
// Eat white space. // Eat white space.
while (isWhitespace (_text[_cursor])) while (unicodeWhitespace (_text[_cursor]))
utf8_next_char (_text, _cursor); utf8_next_char (_text, _cursor);
// Terminate at EOS. // Terminate at EOS.
@ -142,48 +143,6 @@ const std::string Lexer::typeName (const Lexer::Type& type)
return "unknown"; return "unknown";
} }
////////////////////////////////////////////////////////////////////////////////
// Complete Unicode whitespace list.
//
// http://en.wikipedia.org/wiki/Whitespace_character
// Updated 2015-09-13
// Static
//
// TODO This list should be derived from the Unicode database.
bool Lexer::isWhitespace (int c)
{
return (c == 0x0020 || // space Common Separator, space
c == 0x0009 || // Common Other, control HT, Horizontal Tab
c == 0x000A || // Common Other, control LF, Line feed
c == 0x000B || // Common Other, control VT, Vertical Tab
c == 0x000C || // Common Other, control FF, Form feed
c == 0x000D || // Common Other, control CR, Carriage return
c == 0x0085 || // Common Other, control NEL, Next line
c == 0x00A0 || // no-break space Common Separator, space
c == 0x1680 || // ogham space mark Ogham Separator, space
c == 0x180E || // mongolian vowel separator Mongolian Separator, space
c == 0x2000 || // en quad Common Separator, space
c == 0x2001 || // em quad Common Separator, space
c == 0x2002 || // en space Common Separator, space
c == 0x2003 || // em space Common Separator, space
c == 0x2004 || // three-per-em space Common Separator, space
c == 0x2005 || // four-per-em space Common Separator, space
c == 0x2006 || // six-per-em space Common Separator, space
c == 0x2007 || // figure space Common Separator, space
c == 0x2008 || // punctuation space Common Separator, space
c == 0x2009 || // thin space Common Separator, space
c == 0x200A || // hair space Common Separator, space
c == 0x200B || // zero width space
c == 0x200C || // zero width non-joiner
c == 0x200D || // zero width joiner
c == 0x2028 || // line separator Common Separator, line
c == 0x2029 || // paragraph separator Common Separator, paragraph
c == 0x202F || // narrow no-break space Common Separator, space
c == 0x205F || // medium mathematical space Common Separator, space
c == 0x2060 || // word joiner
c == 0x3000); // ideographic space Common Separator, space
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
bool Lexer::isAlpha (int c) bool Lexer::isAlpha (int c)
{ {
@ -213,7 +172,7 @@ bool Lexer::isHexDigit (int c)
bool Lexer::isIdentifierStart (int c) bool Lexer::isIdentifierStart (int c)
{ {
return c && // Include null character check. return c && // Include null character check.
! isWhitespace (c) && ! unicodeWhitespace (c) &&
! isDigit (c) && ! isDigit (c) &&
! isSingleCharOperator (c) && ! isSingleCharOperator (c) &&
! isPunctuation (c); ! isPunctuation (c);
@ -225,7 +184,7 @@ bool Lexer::isIdentifierNext (int c)
return c && // Include null character check. return c && // Include null character check.
c != ':' && // Used in isPair. c != ':' && // Used in isPair.
c != '=' && // Used in isPair. c != '=' && // Used in isPair.
! isWhitespace (c) && ! unicodeWhitespace (c) &&
! isSingleCharOperator (c); ! isSingleCharOperator (c);
} }
@ -272,15 +231,15 @@ bool Lexer::isTripleCharOperator (int c0, int c1, int c2, int c3)
bool Lexer::isBoundary (int left, int right) bool Lexer::isBoundary (int left, int right)
{ {
// EOS // EOS
if (right == '\0') return true; if (right == '\0') return true;
// XOR // XOR
if (isAlpha (left) != isAlpha (right)) return true; if (isAlpha (left) != isAlpha (right)) return true;
if (isDigit (left) != isDigit (right)) return true; if (isDigit (left) != isDigit (right)) return true;
if (isWhitespace (left) != isWhitespace (right)) return true; if (unicodeWhitespace (left) != unicodeWhitespace (right)) return true;
// OR // OR
if (isPunctuation (left) || isPunctuation (right)) return true; if (isPunctuation (left) || isPunctuation (right)) return true;
return false; return false;
} }
@ -289,7 +248,8 @@ bool Lexer::isBoundary (int left, int right)
bool Lexer::isHardBoundary (int left, int right) bool Lexer::isHardBoundary (int left, int right)
{ {
// EOS // EOS
if (right == '\0') return true; if (right == '\0')
return true;
// FILTER operators that don't need to be surrounded by whitespace. // FILTER operators that don't need to be surrounded by whitespace.
if (left == '(' || if (left == '(' ||
@ -628,10 +588,10 @@ bool Lexer::isUUID (std::string& token, Lexer::Type& type, bool endBoundary)
break; break;
} }
if (i >= uuid_min_length && if (i >= uuid_min_length &&
(! endBoundary || (! endBoundary ||
! _text[marker + i] || ! _text[marker + i] ||
isWhitespace (_text[marker + i]) || unicodeWhitespace (_text[marker + i]) ||
isSingleCharOperator (_text[marker + i]))) isSingleCharOperator (_text[marker + i])))
{ {
token = _text.substr (_cursor, i); token = _text.substr (_cursor, i);
@ -726,10 +686,10 @@ bool Lexer::isNumber (std::string& token, Lexer::Type& type)
} }
} }
// Lookahead: !<isWhitespace> | !<isSingleCharOperator> // Lookahead: !<unicodeWhitespace> | !<isSingleCharOperator>
// If there is an immediately consecutive character, that is not an operator, fail. // If there is an immediately consecutive character, that is not an operator, fail.
if (_eos > marker && if (_eos > marker &&
! isWhitespace (_text[marker]) && ! unicodeWhitespace (_text[marker]) &&
! isSingleCharOperator (_text[marker])) ! isSingleCharOperator (_text[marker]))
return false; return false;
@ -806,7 +766,7 @@ bool Lexer::isURL (std::string& token, Lexer::Type& type)
marker += 3; marker += 3;
while (marker < _eos && while (marker < _eos &&
! isWhitespace (_text[marker])) ! unicodeWhitespace (_text[marker]))
utf8_next_char (_text, marker); utf8_next_char (_text, marker);
token = _text.substr (_cursor, marker - _cursor); token = _text.substr (_cursor, marker - _cursor);
@ -847,7 +807,7 @@ bool Lexer::isPair (std::string& token, Lexer::Type& type)
if (readWord (_text, "'\"", _cursor, ignoredToken) || if (readWord (_text, "'\"", _cursor, ignoredToken) ||
readWord (_text, _cursor, ignoredToken) || readWord (_text, _cursor, ignoredToken) ||
isEOS () || isEOS () ||
isWhitespace (_text[_cursor])) unicodeWhitespace (_text[_cursor]))
{ {
token = _text.substr (marker, _cursor - marker); token = _text.substr (marker, _cursor - marker);
type = Lexer::Type::pair; type = Lexer::Type::pair;
@ -901,7 +861,7 @@ bool Lexer::isSet (std::string& token, Lexer::Type& type)
// Success is multiple numbers, matching the pattern. // Success is multiple numbers, matching the pattern.
if (count > 1 && if (count > 1 &&
(isEOS () || (isEOS () ||
isWhitespace (_text[_cursor]) || unicodeWhitespace (_text[_cursor]) ||
isHardBoundary (_text[_cursor], _text[_cursor + 1]))) isHardBoundary (_text[_cursor], _text[_cursor + 1])))
{ {
token = _text.substr (marker, _cursor - marker); token = _text.substr (marker, _cursor - marker);
@ -916,7 +876,7 @@ bool Lexer::isSet (std::string& token, Lexer::Type& type)
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Lexer::Type::tag // Lexer::Type::tag
// ^ | '(' | ')' | <isWhitespace> // ^ | '(' | ')' | <unicodeWhitespace>
// [ +|- ] <isIdentifierStart> [ <isIdentifierNext> ]* // [ +|- ] <isIdentifierStart> [ <isIdentifierNext> ]*
bool Lexer::isTag (std::string& token, Lexer::Type& type) bool Lexer::isTag (std::string& token, Lexer::Type& type)
{ {
@ -924,7 +884,7 @@ bool Lexer::isTag (std::string& token, Lexer::Type& type)
// Lookbehind: Assert ^ or preceded by whitespace, (, or ). // Lookbehind: Assert ^ or preceded by whitespace, (, or ).
if (marker > 0 && if (marker > 0 &&
! isWhitespace (_text[marker - 1]) && ! unicodeWhitespace (_text[marker - 1]) &&
_text[marker - 1] != '(' && _text[marker - 1] != '(' &&
_text[marker - 1] != ')') _text[marker - 1] != ')')
return false; return false;
@ -970,12 +930,12 @@ bool Lexer::isPath (std::string& token, Lexer::Type& type)
break; break;
if (_text[marker] && if (_text[marker] &&
! isWhitespace (_text[marker]) && ! unicodeWhitespace (_text[marker]) &&
_text[marker] != '/') _text[marker] != '/')
{ {
utf8_next_char (_text, marker); utf8_next_char (_text, marker);
while (_text[marker] && while (_text[marker] &&
! isWhitespace (_text[marker]) && ! unicodeWhitespace (_text[marker]) &&
_text[marker] != '/') _text[marker] != '/')
utf8_next_char (_text, marker); utf8_next_char (_text, marker);
} }
@ -997,7 +957,7 @@ bool Lexer::isPath (std::string& token, Lexer::Type& type)
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Lexer::Type::substitution // Lexer::Type::substitution
// / <unquoted-string> / <unquoted-string> / [g] <EOS> | <isWhitespace> // / <unquoted-string> / <unquoted-string> / [g] <EOS> | <unicodeWhitespace>
bool Lexer::isSubstitution (std::string& token, Lexer::Type& type) bool Lexer::isSubstitution (std::string& token, Lexer::Type& type)
{ {
std::size_t marker = _cursor; std::size_t marker = _cursor;
@ -1012,9 +972,9 @@ bool Lexer::isSubstitution (std::string& token, Lexer::Type& type)
if (_text[_cursor] == 'g') if (_text[_cursor] == 'g')
++_cursor; ++_cursor;
// Lookahead: <EOS> | <isWhitespace> // Lookahead: <EOS> | <unicodeWhitespace>
if (_text[_cursor] == '\0' || if (_text[_cursor] == '\0' ||
isWhitespace (_text[_cursor])) unicodeWhitespace (_text[_cursor]))
{ {
token = _text.substr (marker, _cursor - marker); token = _text.substr (marker, _cursor - marker);
type = Lexer::Type::substitution; type = Lexer::Type::substitution;
@ -1029,7 +989,7 @@ bool Lexer::isSubstitution (std::string& token, Lexer::Type& type)
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Lexer::Type::pattern // Lexer::Type::pattern
// / <unquoted-string> / <EOS> | <isWhitespace> // / <unquoted-string> / <EOS> | <unicodeWhitespace>
bool Lexer::isPattern (std::string& token, Lexer::Type& type) bool Lexer::isPattern (std::string& token, Lexer::Type& type)
{ {
std::size_t marker = _cursor; std::size_t marker = _cursor;
@ -1037,7 +997,7 @@ bool Lexer::isPattern (std::string& token, Lexer::Type& type)
std::string word; std::string word;
if (readWord (_text, "/", _cursor, word) && if (readWord (_text, "/", _cursor, word) &&
(isEOS () || (isEOS () ||
isWhitespace (_text[_cursor]))) unicodeWhitespace (_text[_cursor])))
{ {
token = _text.substr (marker, _cursor - marker); token = _text.substr (marker, _cursor - marker);
type = Lexer::Type::pattern; type = Lexer::Type::pattern;
@ -1357,7 +1317,7 @@ bool Lexer::isWord (std::string& token, Lexer::Type& type)
std::size_t marker = _cursor; std::size_t marker = _cursor;
while (_text[marker] && while (_text[marker] &&
! isWhitespace (_text[marker]) && ! unicodeWhitespace (_text[marker]) &&
! isSingleCharOperator (_text[marker])) ! isSingleCharOperator (_text[marker]))
utf8_next_char (_text, marker); utf8_next_char (_text, marker);
@ -1393,7 +1353,7 @@ bool Lexer::isLiteral (
// End boundary conditions must be met. // End boundary conditions must be met.
if (endBoundary && if (endBoundary &&
_text[_cursor + common] && _text[_cursor + common] &&
! Lexer::isWhitespace (_text[_cursor + common]) && ! unicodeWhitespace (_text[_cursor + common]) &&
! Lexer::isSingleCharOperator (_text[_cursor + common])) ! Lexer::isSingleCharOperator (_text[_cursor + common]))
return false; return false;
@ -1567,7 +1527,7 @@ bool Lexer::readWord (
// //
// Ends at: // Ends at:
// Lexer::isEOS // Lexer::isEOS
// Lexer::isWhitespace // unicodeWhitespace
// Lexer::isHardBoundary // Lexer::isHardBoundary
bool Lexer::readWord ( bool Lexer::readWord (
const std::string& text, const std::string& text,
@ -1582,7 +1542,7 @@ bool Lexer::readWord (
while ((c = text[cursor])) // Handles EOS. while ((c = text[cursor])) // Handles EOS.
{ {
// Unquoted word ends on white space. // Unquoted word ends on white space.
if (Lexer::isWhitespace (c)) if (unicodeWhitespace (c))
break; break;
// Parentheses mostly. // Parentheses mostly.

View file

@ -61,7 +61,6 @@ public:
// Static helpers. // Static helpers.
static const std::string typeName (const Lexer::Type&); static const std::string typeName (const Lexer::Type&);
static bool isWhitespace (int);
static bool isAlpha (int); static bool isAlpha (int);
static bool isDigit (int); static bool isDigit (int);
static bool isHexDigit (int); static bool isHexDigit (int);

View file

@ -51,6 +51,7 @@
#include <signal.h> #include <signal.h>
#include <sys/select.h> #include <sys/select.h>
#include <Lexer.h> #include <Lexer.h>
#include <unicode.h>
#include <utf8.h> #include <utf8.h>
#include <util.h> #include <util.h>
#include <main.h> #include <main.h>
@ -268,7 +269,7 @@ bool nontrivial (const std::string& input)
std::string::size_type i = 0; std::string::size_type i = 0;
int character; int character;
while ((character = utf8_next_char (input, i))) while ((character = utf8_next_char (input, i)))
if (! Lexer::isWhitespace (character)) if (! unicodeWhitespace (character))
return true; return true;
return false; return false;

View file

@ -37,9 +37,9 @@
int main (int, char**) int main (int, char**)
{ {
#ifdef PRODUCT_TASKWARRIOR #ifdef PRODUCT_TASKWARRIOR
UnitTest t (1280); UnitTest t (1253);
#else #else
UnitTest t (1262); UnitTest t (1235);
#endif #endif
// Use same Datetime/Duration configuration as Context::staticInitialization.
@ -58,35 +58,6 @@ int main (int, char**)
Lexer::attributes["tags"] = "string"; Lexer::attributes["tags"] = "string";
Lexer::attributes["description"] = "string"; Lexer::attributes["description"] = "string";
// White space detection.
t.notok (Lexer::isWhitespace (0x0041), "U+0041 (A) ! isWhitespace");
t.ok (Lexer::isWhitespace (0x0020), "U+0020 isWhitespace");
t.ok (Lexer::isWhitespace (0x0009), "U+0009 isWhitespace");
t.ok (Lexer::isWhitespace (0x000A), "U+000A isWhitespace");
t.ok (Lexer::isWhitespace (0x000B), "U+000B isWhitespace");
t.ok (Lexer::isWhitespace (0x000C), "U+000C isWhitespace");
t.ok (Lexer::isWhitespace (0x000D), "U+000D isWhitespace");
t.ok (Lexer::isWhitespace (0x0085), "U+0085 isWhitespace");
t.ok (Lexer::isWhitespace (0x00A0), "U+00A0 isWhitespace");
t.ok (Lexer::isWhitespace (0x1680), "U+1680 isWhitespace"); // 10
t.ok (Lexer::isWhitespace (0x180E), "U+180E isWhitespace");
t.ok (Lexer::isWhitespace (0x2000), "U+2000 isWhitespace");
t.ok (Lexer::isWhitespace (0x2001), "U+2001 isWhitespace");
t.ok (Lexer::isWhitespace (0x2002), "U+2002 isWhitespace");
t.ok (Lexer::isWhitespace (0x2003), "U+2003 isWhitespace");
t.ok (Lexer::isWhitespace (0x2004), "U+2004 isWhitespace");
t.ok (Lexer::isWhitespace (0x2005), "U+2005 isWhitespace");
t.ok (Lexer::isWhitespace (0x2006), "U+2006 isWhitespace");
t.ok (Lexer::isWhitespace (0x2007), "U+2007 isWhitespace");
t.ok (Lexer::isWhitespace (0x2008), "U+2008 isWhitespace"); // 20
t.ok (Lexer::isWhitespace (0x2009), "U+2009 isWhitespace");
t.ok (Lexer::isWhitespace (0x200A), "U+200A isWhitespace");
t.ok (Lexer::isWhitespace (0x2028), "U+2028 isWhitespace");
t.ok (Lexer::isWhitespace (0x2029), "U+2029 isWhitespace");
t.ok (Lexer::isWhitespace (0x202F), "U+202F isWhitespace");
t.ok (Lexer::isWhitespace (0x205F), "U+205F isWhitespace");
t.ok (Lexer::isWhitespace (0x3000), "U+3000 isWhitespace");
// static bool Lexer::isBoundary (int, int); // static bool Lexer::isBoundary (int, int);
t.ok (Lexer::isBoundary (' ', 'a'), "' ' --> 'a' = isBoundary"); t.ok (Lexer::isBoundary (' ', 'a'), "' ' --> 'a' = isBoundary");
t.ok (Lexer::isBoundary ('a', ' '), "'a' --> ' ' = isBoundary"); t.ok (Lexer::isBoundary ('a', ' '), "'a' --> ' ' = isBoundary");