mirror of
https://github.com/GothenburgBitFactory/taskwarrior.git
synced 2025-08-29 17:07:19 +02:00
Lexer2
- Migrated new ::isURL and ::isPath methods. - Migrated new ::tokens method to access all tokens at once.
This commit is contained in:
parent
aab93b2cda
commit
66d5a8ba3d
2 changed files with 113 additions and 5 deletions
109
src/Lexer2.cpp
109
src/Lexer2.cpp
|
@ -46,6 +46,8 @@ Lexer2::~Lexer2 ()
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// When a Lexer2 object is constructed with a string, this method walks through
|
||||||
|
// the stream of low-level tokens.
|
||||||
bool Lexer2::token (std::string& token, Lexer2::Type& type)
|
bool Lexer2::token (std::string& token, Lexer2::Type& type)
|
||||||
{
|
{
|
||||||
// Eat white space.
|
// Eat white space.
|
||||||
|
@ -58,11 +60,11 @@ bool Lexer2::token (std::string& token, Lexer2::Type& type)
|
||||||
|
|
||||||
// The sequence is specific, and must follow these rules:
|
// The sequence is specific, and must follow these rules:
|
||||||
// - date < uuid < identifier
|
// - date < uuid < identifier
|
||||||
// - duraiton < identifier
|
// - duration < identifier
|
||||||
// - pair < identifier
|
// - url < pair < identifier
|
||||||
// - hex < number
|
// - hex < number
|
||||||
// - separator < tag < operator
|
// - separator < tag < operator
|
||||||
// - substitution < pattern
|
// - path < substitution < pattern
|
||||||
// - word last
|
// - word last
|
||||||
if (isString (token, type, '\'') ||
|
if (isString (token, type, '\'') ||
|
||||||
isString (token, type, '"') ||
|
isString (token, type, '"') ||
|
||||||
|
@ -72,8 +74,10 @@ bool Lexer2::token (std::string& token, Lexer2::Type& type)
|
||||||
isNumber (token, type) ||
|
isNumber (token, type) ||
|
||||||
isSeparator (token, type) ||
|
isSeparator (token, type) ||
|
||||||
isList (token, type) ||
|
isList (token, type) ||
|
||||||
|
isURL (token, type) ||
|
||||||
isPair (token, type) ||
|
isPair (token, type) ||
|
||||||
isTag (token, type) ||
|
isTag (token, type) ||
|
||||||
|
isPath (token, type) ||
|
||||||
isSubstitution (token, type) ||
|
isSubstitution (token, type) ||
|
||||||
isPattern (token, type) ||
|
isPattern (token, type) ||
|
||||||
isOperator (token, type) ||
|
isOperator (token, type) ||
|
||||||
|
@ -84,6 +88,22 @@ bool Lexer2::token (std::string& token, Lexer2::Type& type)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// This static method tokenizes the input and provides a vector of token/type
|
||||||
|
// results from a high-level lex.
|
||||||
|
std::vector <std::pair <std::string, Lexer2::Type>> Lexer2::tokens (
|
||||||
|
const std::string& text)
|
||||||
|
{
|
||||||
|
std::vector <std::pair <std::string, Lexer2::Type>> all;
|
||||||
|
std::string token;
|
||||||
|
Lexer2::Type type;
|
||||||
|
Lexer2 l (text);
|
||||||
|
while (l.token (token, type))
|
||||||
|
all.push_back (std::pair <std::string, Lexer2::Type> (token, type));
|
||||||
|
|
||||||
|
return all;
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// No L10N - these are for internal purposes.
|
// No L10N - these are for internal purposes.
|
||||||
const std::string Lexer2::typeName (const Lexer2::Type& type)
|
const std::string Lexer2::typeName (const Lexer2::Type& type)
|
||||||
|
@ -95,9 +115,11 @@ const std::string Lexer2::typeName (const Lexer2::Type& type)
|
||||||
case Lexer2::Type::hex: return "hex";
|
case Lexer2::Type::hex: return "hex";
|
||||||
case Lexer2::Type::string: return "string";
|
case Lexer2::Type::string: return "string";
|
||||||
case Lexer2::Type::list: return "list";
|
case Lexer2::Type::list: return "list";
|
||||||
|
case Lexer2::Type::url: return "url";
|
||||||
case Lexer2::Type::pair: return "pair";
|
case Lexer2::Type::pair: return "pair";
|
||||||
case Lexer2::Type::separator: return "separator";
|
case Lexer2::Type::separator: return "separator";
|
||||||
case Lexer2::Type::tag: return "tag";
|
case Lexer2::Type::tag: return "tag";
|
||||||
|
case Lexer2::Type::path: return "path";
|
||||||
case Lexer2::Type::substitution: return "substitution";
|
case Lexer2::Type::substitution: return "substitution";
|
||||||
case Lexer2::Type::pattern: return "pattern";
|
case Lexer2::Type::pattern: return "pattern";
|
||||||
case Lexer2::Type::op: return "op";
|
case Lexer2::Type::op: return "op";
|
||||||
|
@ -564,6 +586,43 @@ bool Lexer2::isList (std::string& token, Lexer2::Type& type)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Lexer2::Type::url
|
||||||
|
// http [s] :// ...
|
||||||
|
bool Lexer2::isURL (std::string& token, Lexer2::Type& type)
|
||||||
|
{
|
||||||
|
std::size_t marker = _cursor;
|
||||||
|
|
||||||
|
if (_eos - _cursor > 9 && // length 'https://*'
|
||||||
|
(_text[marker + 0] == 'h' || _text[marker + 0] == 'H') &&
|
||||||
|
(_text[marker + 1] == 't' || _text[marker + 1] == 'T') &&
|
||||||
|
(_text[marker + 2] == 't' || _text[marker + 2] == 'T') &&
|
||||||
|
(_text[marker + 3] == 'p' || _text[marker + 3] == 'P'))
|
||||||
|
{
|
||||||
|
marker += 4;
|
||||||
|
if (_text[marker + 0] == 's' || _text[marker + 0] == 'S')
|
||||||
|
++marker;
|
||||||
|
|
||||||
|
if (_text[marker + 0] == ':' &&
|
||||||
|
_text[marker + 1] == '/' &&
|
||||||
|
_text[marker + 2] == '/')
|
||||||
|
{
|
||||||
|
marker += 3;
|
||||||
|
|
||||||
|
while (marker < _eos &&
|
||||||
|
! isWhitespace (_text[marker]))
|
||||||
|
utf8_next_char (_text, marker);
|
||||||
|
|
||||||
|
token = _text.substr (_cursor, marker - _cursor);
|
||||||
|
type = Lexer2::Type::url;
|
||||||
|
_cursor = marker;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Lexer2::Type::pair
|
// Lexer2::Type::pair
|
||||||
// <identifier> : [ <string> | <word> ]
|
// <identifier> : [ <string> | <word> ]
|
||||||
|
@ -624,6 +683,48 @@ bool Lexer2::isTag (std::string& token, Lexer2::Type& type)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Lexer2::Type::path
|
||||||
|
// ( / <non-slash, non-whitespace> )+
|
||||||
|
bool Lexer2::isPath (std::string& token, Lexer2::Type& type)
|
||||||
|
{
|
||||||
|
std::size_t marker = _cursor;
|
||||||
|
int slashCount = 0;
|
||||||
|
|
||||||
|
while (1)
|
||||||
|
{
|
||||||
|
if (_text[marker] == '/')
|
||||||
|
{
|
||||||
|
++marker;
|
||||||
|
++slashCount;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (! isWhitespace (_text[marker]) &&
|
||||||
|
_text[marker] != '/')
|
||||||
|
{
|
||||||
|
utf8_next_char (_text, marker);
|
||||||
|
while (! isWhitespace (_text[marker]) &&
|
||||||
|
_text[marker] != '/')
|
||||||
|
utf8_next_char (_text, marker);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (marker > _cursor &&
|
||||||
|
slashCount > 3)
|
||||||
|
{
|
||||||
|
type = Lexer2::Type::path;
|
||||||
|
token = _text.substr (_cursor, marker - _cursor);
|
||||||
|
_cursor = marker;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Lexer2::Type::substitution
|
// Lexer2::Type::substitution
|
||||||
// / <unquoted-string> / <unquoted-string> / [g]
|
// / <unquoted-string> / <unquoted-string> / [g]
|
||||||
|
@ -807,8 +908,10 @@ std::string Lexer2::typeToString (Lexer2::Type type)
|
||||||
else if (type == Lexer2::Type::number) return std::string ("\033[38;5;7m\033[48;5;6m") + "number" + "\033[0m";
|
else if (type == Lexer2::Type::number) return std::string ("\033[38;5;7m\033[48;5;6m") + "number" + "\033[0m";
|
||||||
else if (type == Lexer2::Type::separator) return std::string ("\033[38;5;7m\033[48;5;4m") + "separator" + "\033[0m";
|
else if (type == Lexer2::Type::separator) return std::string ("\033[38;5;7m\033[48;5;4m") + "separator" + "\033[0m";
|
||||||
else if (type == Lexer2::Type::list) return std::string ("\033[38;5;7m\033[48;5;4m") + "list" + "\033[0m";
|
else if (type == Lexer2::Type::list) return std::string ("\033[38;5;7m\033[48;5;4m") + "list" + "\033[0m";
|
||||||
|
else if (type == Lexer2::Type::url) return std::string ("\033[38;5;7m\033[48;5;4m") + "url" + "\033[0m";
|
||||||
else if (type == Lexer2::Type::pair) return std::string ("\033[38;5;7m\033[48;5;1m") + "pair" + "\033[0m";
|
else if (type == Lexer2::Type::pair) return std::string ("\033[38;5;7m\033[48;5;1m") + "pair" + "\033[0m";
|
||||||
else if (type == Lexer2::Type::tag) return std::string ("\033[37;45m") + "tag" + "\033[0m";
|
else if (type == Lexer2::Type::tag) return std::string ("\033[37;45m") + "tag" + "\033[0m";
|
||||||
|
else if (type == Lexer2::Type::path) return std::string ("\033[37;102m") + "path" + "\033[0m";
|
||||||
else if (type == Lexer2::Type::substitution) return std::string ("\033[37;102m") + "substitution" + "\033[0m";
|
else if (type == Lexer2::Type::substitution) return std::string ("\033[37;102m") + "substitution" + "\033[0m";
|
||||||
else if (type == Lexer2::Type::pattern) return std::string ("\033[37;42m") + "pattern" + "\033[0m";
|
else if (type == Lexer2::Type::pattern) return std::string ("\033[37;42m") + "pattern" + "\033[0m";
|
||||||
else if (type == Lexer2::Type::op) return std::string ("\033[38;5;7m\033[48;5;203m") + "op" + "\033[0m";
|
else if (type == Lexer2::Type::op) return std::string ("\033[38;5;7m\033[48;5;203m") + "op" + "\033[0m";
|
||||||
|
|
|
@ -28,6 +28,7 @@
|
||||||
#define INCLUDED_LEXER2
|
#define INCLUDED_LEXER2
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
|
|
||||||
// Lexer2: A UTF8 lexical analyzer for every construct used on the Taskwarrior
|
// Lexer2: A UTF8 lexical analyzer for every construct used on the Taskwarrior
|
||||||
|
@ -38,9 +39,10 @@ class Lexer2
|
||||||
public:
|
public:
|
||||||
enum class Type { uuid, number, hex,
|
enum class Type { uuid, number, hex,
|
||||||
string,
|
string,
|
||||||
list, pair, separator,
|
list, url, pair, separator,
|
||||||
substitution, pattern,
|
|
||||||
tag,
|
tag,
|
||||||
|
path,
|
||||||
|
substitution, pattern,
|
||||||
op,
|
op,
|
||||||
identifier, word,
|
identifier, word,
|
||||||
/*date,*/ /*duration,*/ };
|
/*date,*/ /*duration,*/ };
|
||||||
|
@ -48,6 +50,7 @@ public:
|
||||||
Lexer2 (const std::string&);
|
Lexer2 (const std::string&);
|
||||||
~Lexer2 ();
|
~Lexer2 ();
|
||||||
bool token (std::string&, Lexer2::Type&);
|
bool token (std::string&, Lexer2::Type&);
|
||||||
|
static std::vector <std::pair <std::string, Lexer2::Type>> tokens (const std::string&);
|
||||||
static std::string typeToString (Lexer2::Type);
|
static std::string typeToString (Lexer2::Type);
|
||||||
|
|
||||||
// Static helpers.
|
// Static helpers.
|
||||||
|
@ -78,8 +81,10 @@ public:
|
||||||
bool isHexNumber (std::string&, Lexer2::Type&);
|
bool isHexNumber (std::string&, Lexer2::Type&);
|
||||||
bool isSeparator (std::string&, Lexer2::Type&);
|
bool isSeparator (std::string&, Lexer2::Type&);
|
||||||
bool isList (std::string&, Lexer2::Type&);
|
bool isList (std::string&, Lexer2::Type&);
|
||||||
|
bool isURL (std::string&, Lexer2::Type&);
|
||||||
bool isPair (std::string&, Lexer2::Type&);
|
bool isPair (std::string&, Lexer2::Type&);
|
||||||
bool isTag (std::string&, Lexer2::Type&);
|
bool isTag (std::string&, Lexer2::Type&);
|
||||||
|
bool isPath (std::string&, Lexer2::Type&);
|
||||||
bool isSubstitution (std::string&, Lexer2::Type&);
|
bool isSubstitution (std::string&, Lexer2::Type&);
|
||||||
bool isPattern (std::string&, Lexer2::Type&);
|
bool isPattern (std::string&, Lexer2::Type&);
|
||||||
bool isOperator (std::string&, Lexer2::Type&);
|
bool isOperator (std::string&, Lexer2::Type&);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue