mirror of
https://github.com/GothenburgBitFactory/timewarrior.git
synced 2025-07-07 20:06:39 +02:00
Lexer: Added operator support
This commit is contained in:
parent
476551c563
commit
a164a72156
3 changed files with 175 additions and 1 deletions
138
src/Lexer.cpp
138
src/Lexer.cpp
|
@ -57,6 +57,7 @@ bool Lexer::token (std::string& token, Lexer::Type& type)
|
||||||
isNumber (token, type) ||
|
isNumber (token, type) ||
|
||||||
isPath (token, type) ||
|
isPath (token, type) ||
|
||||||
isPattern (token, type) ||
|
isPattern (token, type) ||
|
||||||
|
isOperator (token, type) ||
|
||||||
isWord (token, type))
|
isWord (token, type))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
|
@ -75,6 +76,7 @@ const std::string Lexer::typeName (const Lexer::Type& type)
|
||||||
case Lexer::Type::url: return "url";
|
case Lexer::Type::url: return "url";
|
||||||
case Lexer::Type::path: return "path";
|
case Lexer::Type::path: return "path";
|
||||||
case Lexer::Type::pattern: return "pattern";
|
case Lexer::Type::pattern: return "pattern";
|
||||||
|
case Lexer::Type::op: return "op";
|
||||||
case Lexer::Type::word: return "word";
|
case Lexer::Type::word: return "word";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -123,6 +125,13 @@ bool Lexer::isWhitespace (int c)
|
||||||
c == 0x3000); // ideographic space Common Separator, space
|
c == 0x3000); // ideographic space Common Separator, space
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// ASCII letters only: a-z and A-Z.
//
// Returns true when 'c' falls in either range, false for everything else.
bool Lexer::isAlpha (int c)
{
  if ('a' <= c && c <= 'z')
    return true;

  return 'A' <= c && c <= 'Z';
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Digits 0-9.
|
// Digits 0-9.
|
||||||
//
|
//
|
||||||
|
@ -252,6 +261,44 @@ bool Lexer::isSingleCharOperator (int c)
|
||||||
c == '~'; // Pattern match
|
c == '~'; // Pattern match
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
bool Lexer::isDoubleCharOperator (int c0, int c1, int c2)
|
||||||
|
{
|
||||||
|
return (c0 == '=' && c1 == '=') ||
|
||||||
|
(c0 == '!' && c1 == '=') ||
|
||||||
|
(c0 == '<' && c1 == '=') ||
|
||||||
|
(c0 == '>' && c1 == '=') ||
|
||||||
|
(c0 == 'o' && c1 == 'r' && isBoundary (c1, c2)) ||
|
||||||
|
(c0 == '|' && c1 == '|') ||
|
||||||
|
(c0 == '&' && c1 == '&') ||
|
||||||
|
(c0 == '!' && c1 == '~');
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
bool Lexer::isTripleCharOperator (int c0, int c1, int c2, int c3)
|
||||||
|
{
|
||||||
|
return (c0 == 'a' && c1 == 'n' && c2 == 'd' && isBoundary (c2, c3)) ||
|
||||||
|
(c0 == 'x' && c1 == 'o' && c2 == 'r' && isBoundary (c2, c3)) ||
|
||||||
|
(c0 == '!' && c1 == '=' && c2 == '=');
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
bool Lexer::isBoundary (int left, int right)
|
||||||
|
{
|
||||||
|
// EOS
|
||||||
|
if (right == '\0') return true;
|
||||||
|
|
||||||
|
// XOR
|
||||||
|
if (isAlpha (left) != isAlpha (right)) return true;
|
||||||
|
if (isDigit (left) != isDigit (right)) return true;
|
||||||
|
if (isWhitespace (left) != isWhitespace (right)) return true;
|
||||||
|
|
||||||
|
// OR
|
||||||
|
if (isPunctuation (left) || isPunctuation (right)) return true;
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
bool Lexer::isHardBoundary (int left, int right)
|
bool Lexer::isHardBoundary (int left, int right)
|
||||||
{
|
{
|
||||||
|
@ -269,6 +316,19 @@ bool Lexer::isHardBoundary (int left, int right)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
bool Lexer::isPunctuation (int c)
|
||||||
|
{
|
||||||
|
return isprint (c) &&
|
||||||
|
c != ' ' &&
|
||||||
|
c != '@' &&
|
||||||
|
c != '#' &&
|
||||||
|
c != '$' &&
|
||||||
|
c != '_' &&
|
||||||
|
! isDigit (c) &&
|
||||||
|
! isAlpha (c);
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
bool Lexer::isEOS () const
|
bool Lexer::isEOS () const
|
||||||
{
|
{
|
||||||
|
@ -499,6 +559,83 @@ bool Lexer::isPattern (std::string& token, Lexer::Type& type)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Lexer::Type::op
|
||||||
|
// _hastag_ | _notag | _neg_ | _pos_ |
|
||||||
|
// <isTripleCharOperator> |
|
||||||
|
// <isDoubleCharOperator> |
|
||||||
|
// <isSingleCharOperator> |
|
||||||
|
bool Lexer::isOperator (std::string& token, Lexer::Type& type)
|
||||||
|
{
|
||||||
|
std::size_t marker = _cursor;
|
||||||
|
|
||||||
|
if (_eos - marker >= 8 && _text.substr (marker, 8) == "_hastag_")
|
||||||
|
{
|
||||||
|
marker += 8;
|
||||||
|
type = Lexer::Type::op;
|
||||||
|
token = _text.substr (_cursor, marker - _cursor);
|
||||||
|
_cursor = marker;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
else if (_eos - marker >= 7 && _text.substr (marker, 7) == "_notag_")
|
||||||
|
{
|
||||||
|
marker += 7;
|
||||||
|
type = Lexer::Type::op;
|
||||||
|
token = _text.substr (_cursor, marker - _cursor);
|
||||||
|
_cursor = marker;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
else if (_eos - marker >= 5 && _text.substr (marker, 5) == "_neg_")
|
||||||
|
{
|
||||||
|
marker += 5;
|
||||||
|
type = Lexer::Type::op;
|
||||||
|
token = _text.substr (_cursor, marker - _cursor);
|
||||||
|
_cursor = marker;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
else if (_eos - marker >= 5 && _text.substr (marker, 5) == "_pos_")
|
||||||
|
{
|
||||||
|
marker += 5;
|
||||||
|
type = Lexer::Type::op;
|
||||||
|
token = _text.substr (_cursor, marker - _cursor);
|
||||||
|
_cursor = marker;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
else if (_eos - marker >= 3 &&
|
||||||
|
isTripleCharOperator (_text[marker], _text[marker + 1], _text[marker + 2], _text[marker + 3]))
|
||||||
|
{
|
||||||
|
marker += 3;
|
||||||
|
type = Lexer::Type::op;
|
||||||
|
token = _text.substr (_cursor, marker - _cursor);
|
||||||
|
_cursor = marker;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
else if (_eos - marker >= 2 &&
|
||||||
|
isDoubleCharOperator (_text[marker], _text[marker + 1], _text[marker + 2]))
|
||||||
|
{
|
||||||
|
marker += 2;
|
||||||
|
type = Lexer::Type::op;
|
||||||
|
token = _text.substr (_cursor, marker - _cursor);
|
||||||
|
_cursor = marker;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
else if (isSingleCharOperator (_text[marker]))
|
||||||
|
{
|
||||||
|
token = _text[marker];
|
||||||
|
type = Lexer::Type::op;
|
||||||
|
_cursor = ++marker;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Static
|
// Static
|
||||||
std::string Lexer::typeToString (Lexer::Type type)
|
std::string Lexer::typeToString (Lexer::Type type)
|
||||||
|
@ -509,6 +646,7 @@ std::string Lexer::typeToString (Lexer::Type type)
|
||||||
else if (type == Lexer::Type::url) return std::string ("\033[38;5;7m\033[48;5;4m") + "url" + "\033[0m";
|
else if (type == Lexer::Type::url) return std::string ("\033[38;5;7m\033[48;5;4m") + "url" + "\033[0m";
|
||||||
else if (type == Lexer::Type::path) return std::string ("\033[37;102m") + "path" + "\033[0m";
|
else if (type == Lexer::Type::path) return std::string ("\033[37;102m") + "path" + "\033[0m";
|
||||||
else if (type == Lexer::Type::pattern) return std::string ("\033[37;42m") + "pattern" + "\033[0m";
|
else if (type == Lexer::Type::pattern) return std::string ("\033[37;42m") + "pattern" + "\033[0m";
|
||||||
|
else if (type == Lexer::Type::op) return std::string ("\033[38;5;7m\033[48;5;203m") + "op" + "\033[0m";
|
||||||
else if (type == Lexer::Type::word) return std::string ("\033[38;5;15m\033[48;5;236m") + "word" + "\033[0m";
|
else if (type == Lexer::Type::word) return std::string ("\033[38;5;15m\033[48;5;236m") + "word" + "\033[0m";
|
||||||
else return std::string ("\033[37;41m") + "unknown" + "\033[0m";
|
else return std::string ("\033[37;41m") + "unknown" + "\033[0m";
|
||||||
}
|
}
|
||||||
|
|
|
@ -40,6 +40,7 @@ public:
|
||||||
url,
|
url,
|
||||||
path,
|
path,
|
||||||
pattern,
|
pattern,
|
||||||
|
op,
|
||||||
word };
|
word };
|
||||||
|
|
||||||
Lexer (const std::string&);
|
Lexer (const std::string&);
|
||||||
|
@ -49,10 +50,15 @@ public:
|
||||||
// Static helpers.
|
// Static helpers.
|
||||||
static const std::string typeName (const Lexer::Type&);
|
static const std::string typeName (const Lexer::Type&);
|
||||||
static bool isWhitespace (int);
|
static bool isWhitespace (int);
|
||||||
|
static bool isAlpha (int);
|
||||||
static bool isDigit (int);
|
static bool isDigit (int);
|
||||||
static bool isHexDigit (int);
|
static bool isHexDigit (int);
|
||||||
static bool isSingleCharOperator (int);
|
static bool isSingleCharOperator (int);
|
||||||
|
static bool isDoubleCharOperator (int, int, int);
|
||||||
|
static bool isTripleCharOperator (int, int, int, int);
|
||||||
|
static bool isBoundary (int, int);
|
||||||
static bool isHardBoundary (int, int);
|
static bool isHardBoundary (int, int);
|
||||||
|
static bool isPunctuation (int);
|
||||||
static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
|
static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
|
||||||
static bool readWord (const std::string&, std::string::size_type&, std::string&);
|
static bool readWord (const std::string&, std::string::size_type&, std::string&);
|
||||||
static int hexToInt (int);
|
static int hexToInt (int);
|
||||||
|
@ -71,6 +77,7 @@ public:
|
||||||
bool isURL (std::string&, Lexer::Type&);
|
bool isURL (std::string&, Lexer::Type&);
|
||||||
bool isPath (std::string&, Lexer::Type&);
|
bool isPath (std::string&, Lexer::Type&);
|
||||||
bool isPattern (std::string&, Lexer::Type&);
|
bool isPattern (std::string&, Lexer::Type&);
|
||||||
|
bool isOperator (std::string&, Lexer::Type&);
|
||||||
bool isWord (std::string&, Lexer::Type&);
|
bool isWord (std::string&, Lexer::Type&);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
int main (int, char**)
|
int main (int, char**)
|
||||||
{
|
{
|
||||||
UnitTest t (210);
|
UnitTest t (367);
|
||||||
|
|
||||||
std::vector <std::pair <std::string, Lexer::Type>> tokens;
|
std::vector <std::pair <std::string, Lexer::Type>> tokens;
|
||||||
std::string token;
|
std::string token;
|
||||||
|
@ -193,6 +193,34 @@ int main (int, char**)
|
||||||
{ "1.2e-3.4", { { "1.2e-3.4", Lexer::Type::number }, NO, NO, NO, NO }, },
|
{ "1.2e-3.4", { { "1.2e-3.4", Lexer::Type::number }, NO, NO, NO, NO }, },
|
||||||
{ "0x2f", { { "0x2f", Lexer::Type::hex }, NO, NO, NO, NO }, },
|
{ "0x2f", { { "0x2f", Lexer::Type::hex }, NO, NO, NO, NO }, },
|
||||||
|
|
||||||
|
// Operator - complete set
|
||||||
|
{ "^", { { "^", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "!", { { "!", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "_neg_", { { "_neg_", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "_pos_", { { "_pos_", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "_hastag_", { { "_hastag_", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "_notag_", { { "_notag_", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "*", { { "*", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "/", { { "/", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "%", { { "%", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "+", { { "+", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "-", { { "-", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "<=", { { "<=", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ ">=", { { ">=", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ ">", { { ">", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "<", { { "<", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "=", { { "=", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "==", { { "==", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "!=", { { "!=", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "!==", { { "!==", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "~", { { "~", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "!~", { { "!~", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "and", { { "and", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "or", { { "or", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "xor", { { "xor", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ "(", { { "(", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
{ ")", { { ")", Lexer::Type::op }, NO, NO, NO, NO }, },
|
||||||
|
|
||||||
};
|
};
|
||||||
#define NUM_TESTS (sizeof (lexerTests) / sizeof (lexerTests[0]))
|
#define NUM_TESTS (sizeof (lexerTests) / sizeof (lexerTests[0]))
|
||||||
|
|
||||||
|
@ -233,6 +261,7 @@ int main (int, char**)
|
||||||
t.is (Lexer::typeName (Lexer::Type::url), "url", "Lexer::typeName (Lexer::Type::url)");
|
t.is (Lexer::typeName (Lexer::Type::url), "url", "Lexer::typeName (Lexer::Type::url)");
|
||||||
t.is (Lexer::typeName (Lexer::Type::path), "path", "Lexer::typeName (Lexer::Type::path)");
|
t.is (Lexer::typeName (Lexer::Type::path), "path", "Lexer::typeName (Lexer::Type::path)");
|
||||||
t.is (Lexer::typeName (Lexer::Type::pattern), "pattern", "Lexer::typeName (Lexer::Type::pattern)");
|
t.is (Lexer::typeName (Lexer::Type::pattern), "pattern", "Lexer::typeName (Lexer::Type::pattern)");
|
||||||
|
t.is (Lexer::typeName (Lexer::Type::op), "op", "Lexer::typeName (Lexer::Type::op)");
|
||||||
t.is (Lexer::typeName (Lexer::Type::word), "word", "Lexer::typeName (Lexer::Type::word)");
|
t.is (Lexer::typeName (Lexer::Type::word), "word", "Lexer::typeName (Lexer::Type::word)");
|
||||||
|
|
||||||
// std::string Lexer::trimLeft (const std::string& in, const std::string&)
|
// std::string Lexer::trimLeft (const std::string& in, const std::string&)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue