mirror of
https://github.com/GothenburgBitFactory/taskwarrior.git
synced 2025-06-26 10:54:26 +02:00
Lexer: Migrated to unicodeWhitespace
This commit is contained in:
parent
2c89688b46
commit
49dedfbc86
4 changed files with 37 additions and 106 deletions
106
src/Lexer.cpp
106
src/Lexer.cpp
|
@ -30,6 +30,7 @@
|
|||
#include <ctype.h>
|
||||
#include <Datetime.h>
|
||||
#include <Duration.h>
|
||||
#include <unicode.h>
|
||||
#include <utf8.h>
|
||||
|
||||
static const std::string uuid_pattern = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx";
|
||||
|
@ -59,7 +60,7 @@ Lexer::~Lexer ()
|
|||
bool Lexer::token (std::string& token, Lexer::Type& type)
|
||||
{
|
||||
// Eat white space.
|
||||
while (isWhitespace (_text[_cursor]))
|
||||
while (unicodeWhitespace (_text[_cursor]))
|
||||
utf8_next_char (_text, _cursor);
|
||||
|
||||
// Terminate at EOS.
|
||||
|
@ -142,48 +143,6 @@ const std::string Lexer::typeName (const Lexer::Type& type)
|
|||
return "unknown";
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Complete Unicode whitespace list.
|
||||
//
|
||||
// http://en.wikipedia.org/wiki/Whitespace_character
|
||||
// Updated 2015-09-13
|
||||
// Static
|
||||
//
|
||||
// TODO This list should be derived from the Unicode database.
|
||||
bool Lexer::isWhitespace (int c)
|
||||
{
|
||||
return (c == 0x0020 || // space Common Separator, space
|
||||
c == 0x0009 || // Common Other, control HT, Horizontal Tab
|
||||
c == 0x000A || // Common Other, control LF, Line feed
|
||||
c == 0x000B || // Common Other, control VT, Vertical Tab
|
||||
c == 0x000C || // Common Other, control FF, Form feed
|
||||
c == 0x000D || // Common Other, control CR, Carriage return
|
||||
c == 0x0085 || // Common Other, control NEL, Next line
|
||||
c == 0x00A0 || // no-break space Common Separator, space
|
||||
c == 0x1680 || // ogham space mark Ogham Separator, space
|
||||
c == 0x180E || // mongolian vowel separator Mongolian Separator, space
|
||||
c == 0x2000 || // en quad Common Separator, space
|
||||
c == 0x2001 || // em quad Common Separator, space
|
||||
c == 0x2002 || // en space Common Separator, space
|
||||
c == 0x2003 || // em space Common Separator, space
|
||||
c == 0x2004 || // three-per-em space Common Separator, space
|
||||
c == 0x2005 || // four-per-em space Common Separator, space
|
||||
c == 0x2006 || // six-per-em space Common Separator, space
|
||||
c == 0x2007 || // figure space Common Separator, space
|
||||
c == 0x2008 || // punctuation space Common Separator, space
|
||||
c == 0x2009 || // thin space Common Separator, space
|
||||
c == 0x200A || // hair space Common Separator, space
|
||||
c == 0x200B || // zero width space
|
||||
c == 0x200C || // zero width non-joiner
|
||||
c == 0x200D || // zero width joiner
|
||||
c == 0x2028 || // line separator Common Separator, line
|
||||
c == 0x2029 || // paragraph separator Common Separator, paragraph
|
||||
c == 0x202F || // narrow no-break space Common Separator, space
|
||||
c == 0x205F || // medium mathematical space Common Separator, space
|
||||
c == 0x2060 || // word joiner
|
||||
c == 0x3000); // ideographic space Common Separator, space
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
bool Lexer::isAlpha (int c)
|
||||
{
|
||||
|
@ -213,7 +172,7 @@ bool Lexer::isHexDigit (int c)
|
|||
bool Lexer::isIdentifierStart (int c)
|
||||
{
|
||||
return c && // Include null character check.
|
||||
! isWhitespace (c) &&
|
||||
! unicodeWhitespace (c) &&
|
||||
! isDigit (c) &&
|
||||
! isSingleCharOperator (c) &&
|
||||
! isPunctuation (c);
|
||||
|
@ -225,7 +184,7 @@ bool Lexer::isIdentifierNext (int c)
|
|||
return c && // Include null character check.
|
||||
c != ':' && // Used in isPair.
|
||||
c != '=' && // Used in isPair.
|
||||
! isWhitespace (c) &&
|
||||
! unicodeWhitespace (c) &&
|
||||
! isSingleCharOperator (c);
|
||||
}
|
||||
|
||||
|
@ -272,15 +231,15 @@ bool Lexer::isTripleCharOperator (int c0, int c1, int c2, int c3)
|
|||
bool Lexer::isBoundary (int left, int right)
|
||||
{
|
||||
// EOS
|
||||
if (right == '\0') return true;
|
||||
if (right == '\0') return true;
|
||||
|
||||
// XOR
|
||||
if (isAlpha (left) != isAlpha (right)) return true;
|
||||
if (isDigit (left) != isDigit (right)) return true;
|
||||
if (isWhitespace (left) != isWhitespace (right)) return true;
|
||||
if (isAlpha (left) != isAlpha (right)) return true;
|
||||
if (isDigit (left) != isDigit (right)) return true;
|
||||
if (unicodeWhitespace (left) != unicodeWhitespace (right)) return true;
|
||||
|
||||
// OR
|
||||
if (isPunctuation (left) || isPunctuation (right)) return true;
|
||||
if (isPunctuation (left) || isPunctuation (right)) return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
@ -289,7 +248,8 @@ bool Lexer::isBoundary (int left, int right)
|
|||
bool Lexer::isHardBoundary (int left, int right)
|
||||
{
|
||||
// EOS
|
||||
if (right == '\0') return true;
|
||||
if (right == '\0')
|
||||
return true;
|
||||
|
||||
// FILTER operators that don't need to be surrounded by whitespace.
|
||||
if (left == '(' ||
|
||||
|
@ -628,10 +588,10 @@ bool Lexer::isUUID (std::string& token, Lexer::Type& type, bool endBoundary)
|
|||
break;
|
||||
}
|
||||
|
||||
if (i >= uuid_min_length &&
|
||||
(! endBoundary ||
|
||||
! _text[marker + i] ||
|
||||
isWhitespace (_text[marker + i]) ||
|
||||
if (i >= uuid_min_length &&
|
||||
(! endBoundary ||
|
||||
! _text[marker + i] ||
|
||||
unicodeWhitespace (_text[marker + i]) ||
|
||||
isSingleCharOperator (_text[marker + i])))
|
||||
{
|
||||
token = _text.substr (_cursor, i);
|
||||
|
@ -726,10 +686,10 @@ bool Lexer::isNumber (std::string& token, Lexer::Type& type)
|
|||
}
|
||||
}
|
||||
|
||||
// Lookahead: !<isWhitespace> | !<isSingleCharOperator>
|
||||
// Lookahead: !<unicodeWhitespace> | !<isSingleCharOperator>
|
||||
// If there is an immediately consecutive character, that is not an operator, fail.
|
||||
if (_eos > marker &&
|
||||
! isWhitespace (_text[marker]) &&
|
||||
! unicodeWhitespace (_text[marker]) &&
|
||||
! isSingleCharOperator (_text[marker]))
|
||||
return false;
|
||||
|
||||
|
@ -806,7 +766,7 @@ bool Lexer::isURL (std::string& token, Lexer::Type& type)
|
|||
marker += 3;
|
||||
|
||||
while (marker < _eos &&
|
||||
! isWhitespace (_text[marker]))
|
||||
! unicodeWhitespace (_text[marker]))
|
||||
utf8_next_char (_text, marker);
|
||||
|
||||
token = _text.substr (_cursor, marker - _cursor);
|
||||
|
@ -847,7 +807,7 @@ bool Lexer::isPair (std::string& token, Lexer::Type& type)
|
|||
if (readWord (_text, "'\"", _cursor, ignoredToken) ||
|
||||
readWord (_text, _cursor, ignoredToken) ||
|
||||
isEOS () ||
|
||||
isWhitespace (_text[_cursor]))
|
||||
unicodeWhitespace (_text[_cursor]))
|
||||
{
|
||||
token = _text.substr (marker, _cursor - marker);
|
||||
type = Lexer::Type::pair;
|
||||
|
@ -901,7 +861,7 @@ bool Lexer::isSet (std::string& token, Lexer::Type& type)
|
|||
// Success is multiple numbers, matching the pattern.
|
||||
if (count > 1 &&
|
||||
(isEOS () ||
|
||||
isWhitespace (_text[_cursor]) ||
|
||||
unicodeWhitespace (_text[_cursor]) ||
|
||||
isHardBoundary (_text[_cursor], _text[_cursor + 1])))
|
||||
{
|
||||
token = _text.substr (marker, _cursor - marker);
|
||||
|
@ -916,7 +876,7 @@ bool Lexer::isSet (std::string& token, Lexer::Type& type)
|
|||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Lexer::Type::tag
|
||||
// ^ | '(' | ')' | <isWhitespace>
|
||||
// ^ | '(' | ')' | <unicodeWhitespace>
|
||||
// [ +|- ] <isIdentifierStart> [ <isIdentifierNext> ]*
|
||||
bool Lexer::isTag (std::string& token, Lexer::Type& type)
|
||||
{
|
||||
|
@ -924,7 +884,7 @@ bool Lexer::isTag (std::string& token, Lexer::Type& type)
|
|||
|
||||
// Lookbehind: Assert ^ or preceded by whitespace, (, or ).
|
||||
if (marker > 0 &&
|
||||
! isWhitespace (_text[marker - 1]) &&
|
||||
! unicodeWhitespace (_text[marker - 1]) &&
|
||||
_text[marker - 1] != '(' &&
|
||||
_text[marker - 1] != ')')
|
||||
return false;
|
||||
|
@ -970,12 +930,12 @@ bool Lexer::isPath (std::string& token, Lexer::Type& type)
|
|||
break;
|
||||
|
||||
if (_text[marker] &&
|
||||
! isWhitespace (_text[marker]) &&
|
||||
! unicodeWhitespace (_text[marker]) &&
|
||||
_text[marker] != '/')
|
||||
{
|
||||
utf8_next_char (_text, marker);
|
||||
while (_text[marker] &&
|
||||
! isWhitespace (_text[marker]) &&
|
||||
! unicodeWhitespace (_text[marker]) &&
|
||||
_text[marker] != '/')
|
||||
utf8_next_char (_text, marker);
|
||||
}
|
||||
|
@ -997,7 +957,7 @@ bool Lexer::isPath (std::string& token, Lexer::Type& type)
|
|||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Lexer::Type::substitution
|
||||
// / <unquoted-string> / <unquoted-string> / [g] <EOS> | <isWhitespace>
|
||||
// / <unquoted-string> / <unquoted-string> / [g] <EOS> | <unicodeWhitespace>
|
||||
bool Lexer::isSubstitution (std::string& token, Lexer::Type& type)
|
||||
{
|
||||
std::size_t marker = _cursor;
|
||||
|
@ -1012,9 +972,9 @@ bool Lexer::isSubstitution (std::string& token, Lexer::Type& type)
|
|||
if (_text[_cursor] == 'g')
|
||||
++_cursor;
|
||||
|
||||
// Lookahead: <EOS> | <isWhitespace>
|
||||
// Lookahead: <EOS> | <unicodeWhitespace>
|
||||
if (_text[_cursor] == '\0' ||
|
||||
isWhitespace (_text[_cursor]))
|
||||
unicodeWhitespace (_text[_cursor]))
|
||||
{
|
||||
token = _text.substr (marker, _cursor - marker);
|
||||
type = Lexer::Type::substitution;
|
||||
|
@ -1029,7 +989,7 @@ bool Lexer::isSubstitution (std::string& token, Lexer::Type& type)
|
|||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Lexer::Type::pattern
|
||||
// / <unquoted-string> / <EOS> | <isWhitespace>
|
||||
// / <unquoted-string> / <EOS> | <unicodeWhitespace>
|
||||
bool Lexer::isPattern (std::string& token, Lexer::Type& type)
|
||||
{
|
||||
std::size_t marker = _cursor;
|
||||
|
@ -1037,7 +997,7 @@ bool Lexer::isPattern (std::string& token, Lexer::Type& type)
|
|||
std::string word;
|
||||
if (readWord (_text, "/", _cursor, word) &&
|
||||
(isEOS () ||
|
||||
isWhitespace (_text[_cursor])))
|
||||
unicodeWhitespace (_text[_cursor])))
|
||||
{
|
||||
token = _text.substr (marker, _cursor - marker);
|
||||
type = Lexer::Type::pattern;
|
||||
|
@ -1357,7 +1317,7 @@ bool Lexer::isWord (std::string& token, Lexer::Type& type)
|
|||
std::size_t marker = _cursor;
|
||||
|
||||
while (_text[marker] &&
|
||||
! isWhitespace (_text[marker]) &&
|
||||
! unicodeWhitespace (_text[marker]) &&
|
||||
! isSingleCharOperator (_text[marker]))
|
||||
utf8_next_char (_text, marker);
|
||||
|
||||
|
@ -1393,7 +1353,7 @@ bool Lexer::isLiteral (
|
|||
// End boundary conditions must be met.
|
||||
if (endBoundary &&
|
||||
_text[_cursor + common] &&
|
||||
! Lexer::isWhitespace (_text[_cursor + common]) &&
|
||||
! unicodeWhitespace (_text[_cursor + common]) &&
|
||||
! Lexer::isSingleCharOperator (_text[_cursor + common]))
|
||||
return false;
|
||||
|
||||
|
@ -1567,7 +1527,7 @@ bool Lexer::readWord (
|
|||
//
|
||||
// Ends at:
|
||||
// Lexer::isEOS
|
||||
// Lexer::isWhitespace
|
||||
// unicodeWhitespace
|
||||
// Lexer::isHardBoundary
|
||||
bool Lexer::readWord (
|
||||
const std::string& text,
|
||||
|
@ -1582,7 +1542,7 @@ bool Lexer::readWord (
|
|||
while ((c = text[cursor])) // Handles EOS.
|
||||
{
|
||||
// Unquoted word ends on white space.
|
||||
if (Lexer::isWhitespace (c))
|
||||
if (unicodeWhitespace (c))
|
||||
break;
|
||||
|
||||
// Parentheses mostly.
|
||||
|
|
|
@ -61,7 +61,6 @@ public:
|
|||
|
||||
// Static helpers.
|
||||
static const std::string typeName (const Lexer::Type&);
|
||||
static bool isWhitespace (int);
|
||||
static bool isAlpha (int);
|
||||
static bool isDigit (int);
|
||||
static bool isHexDigit (int);
|
||||
|
|
|
@ -51,6 +51,7 @@
|
|||
#include <signal.h>
|
||||
#include <sys/select.h>
|
||||
#include <Lexer.h>
|
||||
#include <unicode.h>
|
||||
#include <utf8.h>
|
||||
#include <util.h>
|
||||
#include <main.h>
|
||||
|
@ -268,7 +269,7 @@ bool nontrivial (const std::string& input)
|
|||
std::string::size_type i = 0;
|
||||
int character;
|
||||
while ((character = utf8_next_char (input, i)))
|
||||
if (! Lexer::isWhitespace (character))
|
||||
if (! unicodeWhitespace (character))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
|
|
|
@ -37,9 +37,9 @@
|
|||
int main (int, char**)
|
||||
{
|
||||
#ifdef PRODUCT_TASKWARRIOR
|
||||
UnitTest t (1280);
|
||||
UnitTest t (1253);
|
||||
#else
|
||||
UnitTest t (1262);
|
||||
UnitTest t (1235);
|
||||
#endif
|
||||
|
||||
// Use same Datetime/Duration configuration as Context::staticInitialization.
|
||||
|
@ -58,35 +58,6 @@ int main (int, char**)
|
|||
Lexer::attributes["tags"] = "string";
|
||||
Lexer::attributes["description"] = "string";
|
||||
|
||||
// White space detection.
|
||||
t.notok (Lexer::isWhitespace (0x0041), "U+0041 (A) ! isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x0020), "U+0020 isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x0009), "U+0009 isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x000A), "U+000A isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x000B), "U+000B isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x000C), "U+000C isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x000D), "U+000D isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x0085), "U+0085 isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x00A0), "U+00A0 isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x1680), "U+1680 isWhitespace"); // 10
|
||||
t.ok (Lexer::isWhitespace (0x180E), "U+180E isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x2000), "U+2000 isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x2001), "U+2001 isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x2002), "U+2002 isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x2003), "U+2003 isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x2004), "U+2004 isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x2005), "U+2005 isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x2006), "U+2006 isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x2007), "U+2007 isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x2008), "U+2008 isWhitespace"); // 20
|
||||
t.ok (Lexer::isWhitespace (0x2009), "U+2009 isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x200A), "U+200A isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x2028), "U+2028 isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x2029), "U+2029 isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x202F), "U+202F isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x205F), "U+205F isWhitespace");
|
||||
t.ok (Lexer::isWhitespace (0x3000), "U+3000 isWhitespace");
|
||||
|
||||
// static bool Lexer::isBoundary (int, int);
|
||||
t.ok (Lexer::isBoundary (' ', 'a'), "' ' --> 'a' = isBoundary");
|
||||
t.ok (Lexer::isBoundary ('a', ' '), "'a' --> ' ' = isBoundary");
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue