Lexer: Migrated to unicodeWhitespace

This commit is contained in:
Paul Beckingham 2018-01-25 00:47:23 -05:00
parent 2c89688b46
commit 49dedfbc86
4 changed files with 37 additions and 106 deletions

View file

@ -30,6 +30,7 @@
#include <ctype.h>
#include <Datetime.h>
#include <Duration.h>
#include <unicode.h>
#include <utf8.h>
static const std::string uuid_pattern = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx";
@ -59,7 +60,7 @@ Lexer::~Lexer ()
bool Lexer::token (std::string& token, Lexer::Type& type)
{
// Eat white space.
while (isWhitespace (_text[_cursor]))
while (unicodeWhitespace (_text[_cursor]))
utf8_next_char (_text, _cursor);
// Terminate at EOS.
@ -142,48 +143,6 @@ const std::string Lexer::typeName (const Lexer::Type& type)
return "unknown";
}
////////////////////////////////////////////////////////////////////////////////
// Complete Unicode whitespace list.
//
// http://en.wikipedia.org/wiki/Whitespace_character
// Updated 2015-09-13
// Static
//
// TODO This list should be derived from the Unicode database.
bool Lexer::isWhitespace (int c)
{
return (c == 0x0020 || // space Common Separator, space
c == 0x0009 || // Common Other, control HT, Horizontal Tab
c == 0x000A || // Common Other, control LF, Line feed
c == 0x000B || // Common Other, control VT, Vertical Tab
c == 0x000C || // Common Other, control FF, Form feed
c == 0x000D || // Common Other, control CR, Carriage return
c == 0x0085 || // Common Other, control NEL, Next line
c == 0x00A0 || // no-break space Common Separator, space
c == 0x1680 || // ogham space mark Ogham Separator, space
c == 0x180E || // mongolian vowel separator Mongolian Separator, space
c == 0x2000 || // en quad Common Separator, space
c == 0x2001 || // em quad Common Separator, space
c == 0x2002 || // en space Common Separator, space
c == 0x2003 || // em space Common Separator, space
c == 0x2004 || // three-per-em space Common Separator, space
c == 0x2005 || // four-per-em space Common Separator, space
c == 0x2006 || // six-per-em space Common Separator, space
c == 0x2007 || // figure space Common Separator, space
c == 0x2008 || // punctuation space Common Separator, space
c == 0x2009 || // thin space Common Separator, space
c == 0x200A || // hair space Common Separator, space
c == 0x200B || // zero width space
c == 0x200C || // zero width non-joiner
c == 0x200D || // zero width joiner
c == 0x2028 || // line separator Common Separator, line
c == 0x2029 || // paragraph separator Common Separator, paragraph
c == 0x202F || // narrow no-break space Common Separator, space
c == 0x205F || // medium mathematical space Common Separator, space
c == 0x2060 || // word joiner
c == 0x3000); // ideographic space Common Separator, space
}
////////////////////////////////////////////////////////////////////////////////
bool Lexer::isAlpha (int c)
{
@ -213,7 +172,7 @@ bool Lexer::isHexDigit (int c)
bool Lexer::isIdentifierStart (int c)
{
return c && // Include null character check.
! isWhitespace (c) &&
! unicodeWhitespace (c) &&
! isDigit (c) &&
! isSingleCharOperator (c) &&
! isPunctuation (c);
@ -225,7 +184,7 @@ bool Lexer::isIdentifierNext (int c)
return c && // Include null character check.
c != ':' && // Used in isPair.
c != '=' && // Used in isPair.
! isWhitespace (c) &&
! unicodeWhitespace (c) &&
! isSingleCharOperator (c);
}
@ -272,15 +231,15 @@ bool Lexer::isTripleCharOperator (int c0, int c1, int c2, int c3)
bool Lexer::isBoundary (int left, int right)
{
// EOS
if (right == '\0') return true;
if (right == '\0') return true;
// XOR
if (isAlpha (left) != isAlpha (right)) return true;
if (isDigit (left) != isDigit (right)) return true;
if (isWhitespace (left) != isWhitespace (right)) return true;
if (isAlpha (left) != isAlpha (right)) return true;
if (isDigit (left) != isDigit (right)) return true;
if (unicodeWhitespace (left) != unicodeWhitespace (right)) return true;
// OR
if (isPunctuation (left) || isPunctuation (right)) return true;
if (isPunctuation (left) || isPunctuation (right)) return true;
return false;
}
@ -289,7 +248,8 @@ bool Lexer::isBoundary (int left, int right)
bool Lexer::isHardBoundary (int left, int right)
{
// EOS
if (right == '\0') return true;
if (right == '\0')
return true;
// FILTER operators that don't need to be surrounded by whitespace.
if (left == '(' ||
@ -628,10 +588,10 @@ bool Lexer::isUUID (std::string& token, Lexer::Type& type, bool endBoundary)
break;
}
if (i >= uuid_min_length &&
(! endBoundary ||
! _text[marker + i] ||
isWhitespace (_text[marker + i]) ||
if (i >= uuid_min_length &&
(! endBoundary ||
! _text[marker + i] ||
unicodeWhitespace (_text[marker + i]) ||
isSingleCharOperator (_text[marker + i])))
{
token = _text.substr (_cursor, i);
@ -726,10 +686,10 @@ bool Lexer::isNumber (std::string& token, Lexer::Type& type)
}
}
// Lookahead: !<isWhitespace> | !<isSingleCharOperator>
// Lookahead: !<unicodeWhitespace> | !<isSingleCharOperator>
// If there is an immediately consecutive character, that is not an operator, fail.
if (_eos > marker &&
! isWhitespace (_text[marker]) &&
! unicodeWhitespace (_text[marker]) &&
! isSingleCharOperator (_text[marker]))
return false;
@ -806,7 +766,7 @@ bool Lexer::isURL (std::string& token, Lexer::Type& type)
marker += 3;
while (marker < _eos &&
! isWhitespace (_text[marker]))
! unicodeWhitespace (_text[marker]))
utf8_next_char (_text, marker);
token = _text.substr (_cursor, marker - _cursor);
@ -847,7 +807,7 @@ bool Lexer::isPair (std::string& token, Lexer::Type& type)
if (readWord (_text, "'\"", _cursor, ignoredToken) ||
readWord (_text, _cursor, ignoredToken) ||
isEOS () ||
isWhitespace (_text[_cursor]))
unicodeWhitespace (_text[_cursor]))
{
token = _text.substr (marker, _cursor - marker);
type = Lexer::Type::pair;
@ -901,7 +861,7 @@ bool Lexer::isSet (std::string& token, Lexer::Type& type)
// Success is multiple numbers, matching the pattern.
if (count > 1 &&
(isEOS () ||
isWhitespace (_text[_cursor]) ||
unicodeWhitespace (_text[_cursor]) ||
isHardBoundary (_text[_cursor], _text[_cursor + 1])))
{
token = _text.substr (marker, _cursor - marker);
@ -916,7 +876,7 @@ bool Lexer::isSet (std::string& token, Lexer::Type& type)
////////////////////////////////////////////////////////////////////////////////
// Lexer::Type::tag
// ^ | '(' | ')' | <isWhitespace>
// ^ | '(' | ')' | <unicodeWhitespace>
// [ +|- ] <isIdentifierStart> [ <isIdentifierNext> ]*
bool Lexer::isTag (std::string& token, Lexer::Type& type)
{
@ -924,7 +884,7 @@ bool Lexer::isTag (std::string& token, Lexer::Type& type)
// Lookbehind: Assert ^ or preceded by whitespace, (, or ).
if (marker > 0 &&
! isWhitespace (_text[marker - 1]) &&
! unicodeWhitespace (_text[marker - 1]) &&
_text[marker - 1] != '(' &&
_text[marker - 1] != ')')
return false;
@ -970,12 +930,12 @@ bool Lexer::isPath (std::string& token, Lexer::Type& type)
break;
if (_text[marker] &&
! isWhitespace (_text[marker]) &&
! unicodeWhitespace (_text[marker]) &&
_text[marker] != '/')
{
utf8_next_char (_text, marker);
while (_text[marker] &&
! isWhitespace (_text[marker]) &&
! unicodeWhitespace (_text[marker]) &&
_text[marker] != '/')
utf8_next_char (_text, marker);
}
@ -997,7 +957,7 @@ bool Lexer::isPath (std::string& token, Lexer::Type& type)
////////////////////////////////////////////////////////////////////////////////
// Lexer::Type::substitution
// / <unquoted-string> / <unquoted-string> / [g] <EOS> | <isWhitespace>
// / <unquoted-string> / <unquoted-string> / [g] <EOS> | <unicodeWhitespace>
bool Lexer::isSubstitution (std::string& token, Lexer::Type& type)
{
std::size_t marker = _cursor;
@ -1012,9 +972,9 @@ bool Lexer::isSubstitution (std::string& token, Lexer::Type& type)
if (_text[_cursor] == 'g')
++_cursor;
// Lookahread: <EOS> | <isWhitespace>
// Lookahread: <EOS> | <unicodeWhitespace>
if (_text[_cursor] == '\0' ||
isWhitespace (_text[_cursor]))
unicodeWhitespace (_text[_cursor]))
{
token = _text.substr (marker, _cursor - marker);
type = Lexer::Type::substitution;
@ -1029,7 +989,7 @@ bool Lexer::isSubstitution (std::string& token, Lexer::Type& type)
////////////////////////////////////////////////////////////////////////////////
// Lexer::Type::pattern
// / <unquoted-string> / <EOS> | <isWhitespace>
// / <unquoted-string> / <EOS> | <unicodeWhitespace>
bool Lexer::isPattern (std::string& token, Lexer::Type& type)
{
std::size_t marker = _cursor;
@ -1037,7 +997,7 @@ bool Lexer::isPattern (std::string& token, Lexer::Type& type)
std::string word;
if (readWord (_text, "/", _cursor, word) &&
(isEOS () ||
isWhitespace (_text[_cursor])))
unicodeWhitespace (_text[_cursor])))
{
token = _text.substr (marker, _cursor - marker);
type = Lexer::Type::pattern;
@ -1357,7 +1317,7 @@ bool Lexer::isWord (std::string& token, Lexer::Type& type)
std::size_t marker = _cursor;
while (_text[marker] &&
! isWhitespace (_text[marker]) &&
! unicodeWhitespace (_text[marker]) &&
! isSingleCharOperator (_text[marker]))
utf8_next_char (_text, marker);
@ -1393,7 +1353,7 @@ bool Lexer::isLiteral (
// End boundary conditions must be met.
if (endBoundary &&
_text[_cursor + common] &&
! Lexer::isWhitespace (_text[_cursor + common]) &&
! unicodeWhitespace (_text[_cursor + common]) &&
! Lexer::isSingleCharOperator (_text[_cursor + common]))
return false;
@ -1567,7 +1527,7 @@ bool Lexer::readWord (
//
// Ends at:
// Lexer::isEOS
// Lexer::isWhitespace
// unicodeWhitespace
// Lexer::isHardBoundary
bool Lexer::readWord (
const std::string& text,
@ -1582,7 +1542,7 @@ bool Lexer::readWord (
while ((c = text[cursor])) // Handles EOS.
{
// Unquoted word ends on white space.
if (Lexer::isWhitespace (c))
if (unicodeWhitespace (c))
break;
// Parentheses mostly.

View file

@ -61,7 +61,6 @@ public:
// Static helpers.
static const std::string typeName (const Lexer::Type&);
static bool isWhitespace (int);
static bool isAlpha (int);
static bool isDigit (int);
static bool isHexDigit (int);

View file

@ -51,6 +51,7 @@
#include <signal.h>
#include <sys/select.h>
#include <Lexer.h>
#include <unicode.h>
#include <utf8.h>
#include <util.h>
#include <main.h>
@ -268,7 +269,7 @@ bool nontrivial (const std::string& input)
std::string::size_type i = 0;
int character;
while ((character = utf8_next_char (input, i)))
if (! Lexer::isWhitespace (character))
if (! unicodeWhitespace (character))
return true;
return false;

View file

@ -37,9 +37,9 @@
int main (int, char**)
{
#ifdef PRODUCT_TASKWARRIOR
UnitTest t (1280);
UnitTest t (1253);
#else
UnitTest t (1262);
UnitTest t (1235);
#endif
// Use same Datetime/Duraiton configuration as Context∴:staticInitialization.
@ -58,35 +58,6 @@ int main (int, char**)
Lexer::attributes["tags"] = "string";
Lexer::attributes["description"] = "string";
// White space detection.
t.notok (Lexer::isWhitespace (0x0041), "U+0041 (A) ! isWhitespace");
t.ok (Lexer::isWhitespace (0x0020), "U+0020 isWhitespace");
t.ok (Lexer::isWhitespace (0x0009), "U+0009 isWhitespace");
t.ok (Lexer::isWhitespace (0x000A), "U+000A isWhitespace");
t.ok (Lexer::isWhitespace (0x000B), "U+000B isWhitespace");
t.ok (Lexer::isWhitespace (0x000C), "U+000C isWhitespace");
t.ok (Lexer::isWhitespace (0x000D), "U+000D isWhitespace");
t.ok (Lexer::isWhitespace (0x0085), "U+0085 isWhitespace");
t.ok (Lexer::isWhitespace (0x00A0), "U+00A0 isWhitespace");
t.ok (Lexer::isWhitespace (0x1680), "U+1680 isWhitespace"); // 10
t.ok (Lexer::isWhitespace (0x180E), "U+180E isWhitespace");
t.ok (Lexer::isWhitespace (0x2000), "U+2000 isWhitespace");
t.ok (Lexer::isWhitespace (0x2001), "U+2001 isWhitespace");
t.ok (Lexer::isWhitespace (0x2002), "U+2002 isWhitespace");
t.ok (Lexer::isWhitespace (0x2003), "U+2003 isWhitespace");
t.ok (Lexer::isWhitespace (0x2004), "U+2004 isWhitespace");
t.ok (Lexer::isWhitespace (0x2005), "U+2005 isWhitespace");
t.ok (Lexer::isWhitespace (0x2006), "U+2006 isWhitespace");
t.ok (Lexer::isWhitespace (0x2007), "U+2007 isWhitespace");
t.ok (Lexer::isWhitespace (0x2008), "U+2008 isWhitespace"); // 20
t.ok (Lexer::isWhitespace (0x2009), "U+2009 isWhitespace");
t.ok (Lexer::isWhitespace (0x200A), "U+200A isWhitespace");
t.ok (Lexer::isWhitespace (0x2028), "U+2028 isWhitespace");
t.ok (Lexer::isWhitespace (0x2029), "U+2029 isWhitespace");
t.ok (Lexer::isWhitespace (0x202F), "U+202F isWhitespace");
t.ok (Lexer::isWhitespace (0x205F), "U+205F isWhitespace");
t.ok (Lexer::isWhitespace (0x3000), "U+3000 isWhitespace");
// static bool Lexer::isBoundary (int, int);
t.ok (Lexer::isBoundary (' ', 'a'), "' ' --> 'a' = isBoundary");
t.ok (Lexer::isBoundary ('a', ' '), "'a' --> ' ' = isBoundary");