Lexer: Added operator support

This commit is contained in:
Paul Beckingham 2015-12-25 16:44:21 -05:00
parent 476551c563
commit a164a72156
3 changed files with 175 additions and 1 deletions

View file

@ -57,6 +57,7 @@ bool Lexer::token (std::string& token, Lexer::Type& type)
isNumber (token, type) || isNumber (token, type) ||
isPath (token, type) || isPath (token, type) ||
isPattern (token, type) || isPattern (token, type) ||
isOperator (token, type) ||
isWord (token, type)) isWord (token, type))
return true; return true;
@ -75,6 +76,7 @@ const std::string Lexer::typeName (const Lexer::Type& type)
case Lexer::Type::url: return "url"; case Lexer::Type::url: return "url";
case Lexer::Type::path: return "path"; case Lexer::Type::path: return "path";
case Lexer::Type::pattern: return "pattern"; case Lexer::Type::pattern: return "pattern";
case Lexer::Type::op: return "op";
case Lexer::Type::word: return "word"; case Lexer::Type::word: return "word";
} }
@ -123,6 +125,13 @@ bool Lexer::isWhitespace (int c)
c == 0x3000); // ideographic space Common Separator, space c == 0x3000); // ideographic space Common Separator, space
} }
////////////////////////////////////////////////////////////////////////////////
// ASCII letters only: A-Z and a-z. Every other value, including any code point
// outside the ASCII range, is reported as non-alphabetic.
bool Lexer::isAlpha (int c)
{
  const bool upperCase = ('A' <= c && c <= 'Z');
  const bool lowerCase = ('a' <= c && c <= 'z');

  return upperCase || lowerCase;
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Digits 0-9. // Digits 0-9.
// //
@ -252,6 +261,44 @@ bool Lexer::isSingleCharOperator (int c)
c == '~'; // Pattern match c == '~'; // Pattern match
} }
////////////////////////////////////////////////////////////////////////////////
// Two-character operators:  ==  !=  <=  >=  or  ||  &&  !~
//
// c0 and c1 are the candidate operator characters. c2 is the character that
// follows them, and is consulted only for the word operator 'or', which must
// end on a boundary (see isBoundary) to count as an operator.
bool Lexer::isDoubleCharOperator (int c0, int c1, int c2)
{
  switch (c0)
  {
  case '=':
  case '<':
  case '>':
    return c1 == '=';

  case '!':
    return c1 == '=' || c1 == '~';

  case '|':
    return c1 == '|';

  case '&':
    return c1 == '&';

  case 'o':
    return c1 == 'r' && isBoundary (c1, c2);

  default:
    return false;
  }
}
////////////////////////////////////////////////////////////////////////////////
// Three-character operators:  and  xor  !==
//
// c0..c2 are the candidate operator characters. c3 is the character that
// follows them, and is consulted only for the word operators 'and' and 'xor',
// which must end on a boundary (see isBoundary) to count as operators.
bool Lexer::isTripleCharOperator (int c0, int c1, int c2, int c3)
{
  // The purely symbolic operator needs no trailing boundary.
  if (c0 == '!')
    return c1 == '=' && c2 == '=';

  const bool spellsAnd = (c0 == 'a' && c1 == 'n' && c2 == 'd');
  const bool spellsXor = (c0 == 'x' && c1 == 'o' && c2 == 'r');

  return (spellsAnd || spellsXor) && isBoundary (c2, c3);
}
////////////////////////////////////////////////////////////////////////////////
// Reports whether a token boundary lies between two adjacent characters.
//
// A boundary exists when any of the following holds, tested in this order:
//   - 'right' is the end-of-string terminator
//   - exactly one of the two characters is a letter
//   - exactly one of the two characters is a digit
//   - exactly one of the two characters is whitespace
//   - either character is punctuation
bool Lexer::isBoundary (int left, int right)
{
  return right == '\0'                                   ||
         isAlpha (left)      != isAlpha (right)          ||
         isDigit (left)      != isDigit (right)          ||
         isWhitespace (left) != isWhitespace (right)     ||
         isPunctuation (left)                            ||
         isPunctuation (right);
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
bool Lexer::isHardBoundary (int left, int right) bool Lexer::isHardBoundary (int left, int right)
{ {
@ -269,6 +316,19 @@ bool Lexer::isHardBoundary (int left, int right)
return false; return false;
} }
////////////////////////////////////////////////////////////////////////////////
// Punctuation is any printable single-byte character that is not a space, a
// digit, a letter, or one of '@', '#', '$', '_'.
//
// isprint () has undefined behavior when its argument is neither EOF nor
// representable as unsigned char. Such values can arrive here, for example a
// negative value produced from a signed char holding a non-ASCII byte, or a
// code point above 0xFF. They are rejected before isprint () is consulted and
// are never punctuation. Results for values 0..0xFF are unchanged.
bool Lexer::isPunctuation (int c)
{
  // Outside the unsigned char range: not safe to classify, not punctuation.
  if (c < 0 || c > 0xFF)
    return false;

  return isprint (c)   &&
         c != ' '      &&
         c != '@'      &&
         c != '#'      &&
         c != '$'      &&
         c != '_'      &&
         ! isDigit (c) &&
         ! isAlpha (c);
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
bool Lexer::isEOS () const bool Lexer::isEOS () const
{ {
@ -499,6 +559,83 @@ bool Lexer::isPattern (std::string& token, Lexer::Type& type)
return false; return false;
} }
////////////////////////////////////////////////////////////////////////////////
// Lexer::Type::op
//   _hastag_ | _notag_ | _neg_ | _pos_ |
//   <isTripleCharOperator> |
//   <isDoubleCharOperator> |
//   <isSingleCharOperator>
//
// Tries to read one operator starting at the cursor. Candidates are tried
// longest-first within each family: the named operators in the order listed
// above, then three-, two- and one-character operators. On success the operator
// text is stored in 'token', 'type' is set to Lexer::Type::op, the cursor is
// advanced past the operator, and true is returned. On failure nothing is
// modified and false is returned.
bool Lexer::isOperator (std::string& token, Lexer::Type& type)
{
  // Named operators, in the order they are attempted.
  static const std::string named[] = {"_hastag_", "_notag_", "_neg_", "_pos_"};

  // Number of characters between the cursor and the end of the input.
  const std::size_t remaining = _eos - _cursor;

  // Length of the multi-character operator at the cursor; zero means none.
  std::size_t length = 0;

  for (const std::string& name : named)
  {
    if (remaining >= name.size () &&
        _text.compare (_cursor, name.size (), name) == 0)
    {
      length = name.size ();
      break;
    }
  }

  if (length == 0)
  {
    // The extra trailing character handed to each predicate is the one that
    // follows the candidate; at the very end of the text it is the terminator.
    if (remaining >= 3 &&
        isTripleCharOperator (_text[_cursor], _text[_cursor + 1], _text[_cursor + 2], _text[_cursor + 3]))
      length = 3;

    else if (remaining >= 2 &&
             isDoubleCharOperator (_text[_cursor], _text[_cursor + 1], _text[_cursor + 2]))
      length = 2;
  }

  if (length != 0)
  {
    type = Lexer::Type::op;
    token = _text.substr (_cursor, length);
    _cursor += length;
    return true;
  }

  if (isSingleCharOperator (_text[_cursor]))
  {
    token = _text[_cursor];
    type = Lexer::Type::op;
    ++_cursor;
    return true;
  }

  return false;
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Static // Static
std::string Lexer::typeToString (Lexer::Type type) std::string Lexer::typeToString (Lexer::Type type)
@ -509,6 +646,7 @@ std::string Lexer::typeToString (Lexer::Type type)
else if (type == Lexer::Type::url) return std::string ("\033[38;5;7m\033[48;5;4m") + "url" + "\033[0m"; else if (type == Lexer::Type::url) return std::string ("\033[38;5;7m\033[48;5;4m") + "url" + "\033[0m";
else if (type == Lexer::Type::path) return std::string ("\033[37;102m") + "path" + "\033[0m"; else if (type == Lexer::Type::path) return std::string ("\033[37;102m") + "path" + "\033[0m";
else if (type == Lexer::Type::pattern) return std::string ("\033[37;42m") + "pattern" + "\033[0m"; else if (type == Lexer::Type::pattern) return std::string ("\033[37;42m") + "pattern" + "\033[0m";
else if (type == Lexer::Type::op) return std::string ("\033[38;5;7m\033[48;5;203m") + "op" + "\033[0m";
else if (type == Lexer::Type::word) return std::string ("\033[38;5;15m\033[48;5;236m") + "word" + "\033[0m"; else if (type == Lexer::Type::word) return std::string ("\033[38;5;15m\033[48;5;236m") + "word" + "\033[0m";
else return std::string ("\033[37;41m") + "unknown" + "\033[0m"; else return std::string ("\033[37;41m") + "unknown" + "\033[0m";
} }

View file

@ -40,6 +40,7 @@ public:
url, url,
path, path,
pattern, pattern,
op,
word }; word };
Lexer (const std::string&); Lexer (const std::string&);
@ -49,10 +50,15 @@ public:
// Static helpers. // Static helpers.
static const std::string typeName (const Lexer::Type&); static const std::string typeName (const Lexer::Type&);
static bool isWhitespace (int); static bool isWhitespace (int);
static bool isAlpha (int);
static bool isDigit (int); static bool isDigit (int);
static bool isHexDigit (int); static bool isHexDigit (int);
static bool isSingleCharOperator (int); static bool isSingleCharOperator (int);
static bool isDoubleCharOperator (int, int, int);
static bool isTripleCharOperator (int, int, int, int);
static bool isBoundary (int, int);
static bool isHardBoundary (int, int); static bool isHardBoundary (int, int);
static bool isPunctuation (int);
static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&); static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
static bool readWord (const std::string&, std::string::size_type&, std::string&); static bool readWord (const std::string&, std::string::size_type&, std::string&);
static int hexToInt (int); static int hexToInt (int);
@ -71,6 +77,7 @@ public:
bool isURL (std::string&, Lexer::Type&); bool isURL (std::string&, Lexer::Type&);
bool isPath (std::string&, Lexer::Type&); bool isPath (std::string&, Lexer::Type&);
bool isPattern (std::string&, Lexer::Type&); bool isPattern (std::string&, Lexer::Type&);
bool isOperator (std::string&, Lexer::Type&);
bool isWord (std::string&, Lexer::Type&); bool isWord (std::string&, Lexer::Type&);
private: private:

View file

@ -34,7 +34,7 @@
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main (int, char**) int main (int, char**)
{ {
UnitTest t (210); UnitTest t (367);
std::vector <std::pair <std::string, Lexer::Type>> tokens; std::vector <std::pair <std::string, Lexer::Type>> tokens;
std::string token; std::string token;
@ -193,6 +193,34 @@ int main (int, char**)
{ "1.2e-3.4", { { "1.2e-3.4", Lexer::Type::number }, NO, NO, NO, NO }, }, { "1.2e-3.4", { { "1.2e-3.4", Lexer::Type::number }, NO, NO, NO, NO }, },
{ "0x2f", { { "0x2f", Lexer::Type::hex }, NO, NO, NO, NO }, }, { "0x2f", { { "0x2f", Lexer::Type::hex }, NO, NO, NO, NO }, },
// Operator - complete set
{ "^", { { "^", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "!", { { "!", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "_neg_", { { "_neg_", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "_pos_", { { "_pos_", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "_hastag_", { { "_hastag_", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "_notag_", { { "_notag_", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "*", { { "*", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "/", { { "/", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "%", { { "%", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "+", { { "+", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "-", { { "-", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "<=", { { "<=", Lexer::Type::op }, NO, NO, NO, NO }, },
{ ">=", { { ">=", Lexer::Type::op }, NO, NO, NO, NO }, },
{ ">", { { ">", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "<", { { "<", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "=", { { "=", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "==", { { "==", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "!=", { { "!=", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "!==", { { "!==", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "~", { { "~", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "!~", { { "!~", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "and", { { "and", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "or", { { "or", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "xor", { { "xor", Lexer::Type::op }, NO, NO, NO, NO }, },
{ "(", { { "(", Lexer::Type::op }, NO, NO, NO, NO }, },
{ ")", { { ")", Lexer::Type::op }, NO, NO, NO, NO }, },
}; };
#define NUM_TESTS (sizeof (lexerTests) / sizeof (lexerTests[0])) #define NUM_TESTS (sizeof (lexerTests) / sizeof (lexerTests[0]))
@ -233,6 +261,7 @@ int main (int, char**)
t.is (Lexer::typeName (Lexer::Type::url), "url", "Lexer::typeName (Lexer::Type::url)"); t.is (Lexer::typeName (Lexer::Type::url), "url", "Lexer::typeName (Lexer::Type::url)");
t.is (Lexer::typeName (Lexer::Type::path), "path", "Lexer::typeName (Lexer::Type::path)"); t.is (Lexer::typeName (Lexer::Type::path), "path", "Lexer::typeName (Lexer::Type::path)");
t.is (Lexer::typeName (Lexer::Type::pattern), "pattern", "Lexer::typeName (Lexer::Type::pattern)"); t.is (Lexer::typeName (Lexer::Type::pattern), "pattern", "Lexer::typeName (Lexer::Type::pattern)");
t.is (Lexer::typeName (Lexer::Type::op), "op", "Lexer::typeName (Lexer::Type::op)");
t.is (Lexer::typeName (Lexer::Type::word), "word", "Lexer::typeName (Lexer::Type::word)"); t.is (Lexer::typeName (Lexer::Type::word), "word", "Lexer::typeName (Lexer::Type::word)");
// std::string Lexer::trimLeft (const std::string& in, const std::string&) // std::string Lexer::trimLeft (const std::string& in, const std::string&)