mirror of
https://github.com/GothenburgBitFactory/timewarrior.git
synced 2025-06-26 10:54:28 +02:00
Lexer: Added number support
This commit is contained in:
parent
53bb3952b8
commit
fa0c0e5fa7
3 changed files with 140 additions and 2 deletions
105
src/Lexer.cpp
105
src/Lexer.cpp
|
@ -53,6 +53,7 @@ bool Lexer::token (std::string& token, Lexer::Type& type)
|
||||||
|
|
||||||
if (isString (token, type, "'\"") ||
|
if (isString (token, type, "'\"") ||
|
||||||
isHexNumber (token, type) ||
|
isHexNumber (token, type) ||
|
||||||
|
isNumber (token, type) ||
|
||||||
isWord (token, type))
|
isWord (token, type))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
|
@ -65,6 +66,7 @@ const std::string Lexer::typeName (const Lexer::Type& type)
|
||||||
{
|
{
|
||||||
switch (type)
|
switch (type)
|
||||||
{
|
{
|
||||||
|
case Lexer::Type::number: return "number";
|
||||||
case Lexer::Type::hex: return "hex";
|
case Lexer::Type::hex: return "hex";
|
||||||
case Lexer::Type::string: return "string";
|
case Lexer::Type::string: return "string";
|
||||||
case Lexer::Type::word: return "word";
|
case Lexer::Type::word: return "word";
|
||||||
|
@ -115,6 +117,15 @@ bool Lexer::isWhitespace (int c)
|
||||||
c == 0x3000); // ideographic space Common Separator, space
|
c == 0x3000); // ideographic space Common Separator, space
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Digits 0-9.
|
||||||
|
//
|
||||||
|
// TODO This list should be derived from the Unicode database.
|
||||||
|
bool Lexer::isDigit (int c)
|
||||||
|
{
|
||||||
|
return c >= 0x30 && c <= 0x39;
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Digits 0-9 a-f A-F.
|
// Digits 0-9 a-f A-F.
|
||||||
bool Lexer::isHexDigit (int c)
|
bool Lexer::isHexDigit (int c)
|
||||||
|
@ -124,6 +135,99 @@ bool Lexer::isHexDigit (int c)
|
||||||
(c >= 'A' && c <= 'F');
|
(c >= 'A' && c <= 'F');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Lexer::Type::number
|
||||||
|
// \d+
|
||||||
|
// [ . \d+ ]
|
||||||
|
// [ e|E [ +|- ] \d+ [ . \d+ ] ]
|
||||||
|
// not followed by non-operator.
|
||||||
|
bool Lexer::isNumber (std::string& token, Lexer::Type& type)
|
||||||
|
{
|
||||||
|
std::size_t marker = _cursor;
|
||||||
|
|
||||||
|
if (isDigit (_text[marker]))
|
||||||
|
{
|
||||||
|
++marker;
|
||||||
|
while (isDigit (_text[marker]))
|
||||||
|
utf8_next_char (_text, marker);
|
||||||
|
|
||||||
|
if (_text[marker] == '.')
|
||||||
|
{
|
||||||
|
++marker;
|
||||||
|
if (isDigit (_text[marker]))
|
||||||
|
{
|
||||||
|
++marker;
|
||||||
|
while (isDigit (_text[marker]))
|
||||||
|
utf8_next_char (_text, marker);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (_text[marker] == 'e' ||
|
||||||
|
_text[marker] == 'E')
|
||||||
|
{
|
||||||
|
++marker;
|
||||||
|
|
||||||
|
if (_text[marker] == '+' ||
|
||||||
|
_text[marker] == '-')
|
||||||
|
++marker;
|
||||||
|
|
||||||
|
if (isDigit (_text[marker]))
|
||||||
|
{
|
||||||
|
++marker;
|
||||||
|
while (isDigit (_text[marker]))
|
||||||
|
utf8_next_char (_text, marker);
|
||||||
|
|
||||||
|
if (_text[marker] == '.')
|
||||||
|
{
|
||||||
|
++marker;
|
||||||
|
if (isDigit (_text[marker]))
|
||||||
|
{
|
||||||
|
++marker;
|
||||||
|
while (isDigit (_text[marker]))
|
||||||
|
utf8_next_char (_text, marker);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Lookahread: !<isWhitespace> | !<isSingleCharOperator>
|
||||||
|
// If there is an immediately consecutive character, that is not an operator, fail.
|
||||||
|
if (_eos > marker &&
|
||||||
|
! isWhitespace (_text[marker]) &&
|
||||||
|
! isSingleCharOperator (_text[marker]))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
token = _text.substr (_cursor, marker - _cursor);
|
||||||
|
type = Lexer::Type::number;
|
||||||
|
_cursor = marker;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Lexer::Type::number
|
||||||
|
// \d+
|
||||||
|
bool Lexer::isInteger (std::string& token, Lexer::Type& type)
|
||||||
|
{
|
||||||
|
std::size_t marker = _cursor;
|
||||||
|
|
||||||
|
if (isDigit (_text[marker]))
|
||||||
|
{
|
||||||
|
++marker;
|
||||||
|
while (isDigit (_text[marker]))
|
||||||
|
utf8_next_char (_text, marker);
|
||||||
|
|
||||||
|
token = _text.substr (_cursor, marker - _cursor);
|
||||||
|
type = Lexer::Type::number;
|
||||||
|
_cursor = marker;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
bool Lexer::isSingleCharOperator (int c)
|
bool Lexer::isSingleCharOperator (int c)
|
||||||
{
|
{
|
||||||
|
@ -293,6 +397,7 @@ std::string Lexer::typeToString (Lexer::Type type)
|
||||||
{
|
{
|
||||||
if (type == Lexer::Type::string) return std::string ("\033[38;5;7m\033[48;5;3m") + "string" + "\033[0m";
|
if (type == Lexer::Type::string) return std::string ("\033[38;5;7m\033[48;5;3m") + "string" + "\033[0m";
|
||||||
else if (type == Lexer::Type::hex) return std::string ("\033[38;5;7m\033[48;5;14m") + "hex" + "\033[0m";
|
else if (type == Lexer::Type::hex) return std::string ("\033[38;5;7m\033[48;5;14m") + "hex" + "\033[0m";
|
||||||
|
else if (type == Lexer::Type::number) return std::string ("\033[38;5;7m\033[48;5;6m") + "number" + "\033[0m";
|
||||||
else if (type == Lexer::Type::word) return std::string ("\033[38;5;15m\033[48;5;236m") + "word" + "\033[0m";
|
else if (type == Lexer::Type::word) return std::string ("\033[38;5;15m\033[48;5;236m") + "word" + "\033[0m";
|
||||||
else return std::string ("\033[37;41m") + "unknown" + "\033[0m";
|
else return std::string ("\033[37;41m") + "unknown" + "\033[0m";
|
||||||
}
|
}
|
||||||
|
|
|
@ -35,7 +35,7 @@
|
||||||
class Lexer
|
class Lexer
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
enum class Type { hex,
|
enum class Type { number, hex,
|
||||||
string,
|
string,
|
||||||
word };
|
word };
|
||||||
|
|
||||||
|
@ -46,6 +46,7 @@ public:
|
||||||
// Static helpers.
|
// Static helpers.
|
||||||
static const std::string typeName (const Lexer::Type&);
|
static const std::string typeName (const Lexer::Type&);
|
||||||
static bool isWhitespace (int);
|
static bool isWhitespace (int);
|
||||||
|
static bool isDigit (int);
|
||||||
static bool isHexDigit (int);
|
static bool isHexDigit (int);
|
||||||
static bool isSingleCharOperator (int);
|
static bool isSingleCharOperator (int);
|
||||||
static bool isHardBoundary (int, int);
|
static bool isHardBoundary (int, int);
|
||||||
|
@ -61,6 +62,8 @@ public:
|
||||||
// Stream Classifiers.
|
// Stream Classifiers.
|
||||||
bool isEOS () const;
|
bool isEOS () const;
|
||||||
bool isString (std::string&, Lexer::Type&, const std::string&);
|
bool isString (std::string&, Lexer::Type&, const std::string&);
|
||||||
|
bool isNumber (std::string&, Lexer::Type&);
|
||||||
|
bool isInteger (std::string&, Lexer::Type&);
|
||||||
bool isHexNumber (std::string&, Lexer::Type&);
|
bool isHexNumber (std::string&, Lexer::Type&);
|
||||||
bool isWord (std::string&, Lexer::Type&);
|
bool isWord (std::string&, Lexer::Type&);
|
||||||
|
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
int main (int, char**)
|
int main (int, char**)
|
||||||
{
|
{
|
||||||
UnitTest t (131);
|
UnitTest t (171);
|
||||||
|
|
||||||
std::vector <std::pair <std::string, Lexer::Type>> tokens;
|
std::vector <std::pair <std::string, Lexer::Type>> tokens;
|
||||||
std::string token;
|
std::string token;
|
||||||
|
@ -77,6 +77,31 @@ int main (int, char**)
|
||||||
Lexer l1 (" \t ");
|
Lexer l1 (" \t ");
|
||||||
t.notok (l1.token (token, type), "' \\t ' --> no tokens");
|
t.notok (l1.token (token, type), "' \\t ' --> no tokens");
|
||||||
|
|
||||||
|
// Test for numbers that are no longer ISO-8601 dates.
|
||||||
|
Lexer l3 ("1 12 123 1234 12345 123456 1234567");
|
||||||
|
tokens.clear ();
|
||||||
|
while (l3.token (token, type))
|
||||||
|
{
|
||||||
|
std::cout << "# «" << token << "» " << Lexer::typeName (type) << "\n";
|
||||||
|
tokens.push_back (std::pair <std::string, Lexer::Type> (token, type));
|
||||||
|
}
|
||||||
|
|
||||||
|
t.is ((int)tokens.size (), 7, "7 tokens");
|
||||||
|
t.is (tokens[0].first, "1", "tokens[0] == '1'");
|
||||||
|
t.is ((int) tokens[0].second, (int) Lexer::Type::number, "tokens[0] == Type::number");
|
||||||
|
t.is (tokens[1].first, "12", "tokens[1] == '12'");
|
||||||
|
t.is ((int) tokens[1].second, (int) Lexer::Type::number, "tokens[1] == Type::date");
|
||||||
|
t.is (tokens[2].first, "123", "tokens[2] == '123'");
|
||||||
|
t.is ((int) tokens[2].second, (int) Lexer::Type::number, "tokens[2] == Type::number"); // 70
|
||||||
|
t.is (tokens[3].first, "1234", "tokens[3] == '1234'");
|
||||||
|
t.is ((int) tokens[3].second, (int) Lexer::Type::number, "tokens[3] == Type::date");
|
||||||
|
t.is (tokens[4].first, "12345", "tokens[4] == '12345'");
|
||||||
|
t.is ((int) tokens[4].second, (int) Lexer::Type::number, "tokens[4] == Type::number");
|
||||||
|
t.is (tokens[5].first, "123456", "tokens[5] == '123456'");
|
||||||
|
t.is ((int) tokens[5].second, (int) Lexer::Type::number, "tokens[5] == Type::date");
|
||||||
|
t.is (tokens[6].first, "1234567", "tokens[6] == '1234567'");
|
||||||
|
t.is ((int) tokens[6].second, (int) Lexer::Type::number, "tokens[6] == Type::number");
|
||||||
|
|
||||||
// static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
|
// static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
|
||||||
std::string::size_type cursor = 0;
|
std::string::size_type cursor = 0;
|
||||||
std::string word;
|
std::string word;
|
||||||
|
@ -150,6 +175,10 @@ int main (int, char**)
|
||||||
{ "\"U+20AC4\"", { { "\"€4\"", Lexer::Type::string }, NO, NO, NO, NO }, },
|
{ "\"U+20AC4\"", { { "\"€4\"", Lexer::Type::string }, NO, NO, NO, NO }, },
|
||||||
|
|
||||||
// Number
|
// Number
|
||||||
|
{ "1", { { "1", Lexer::Type::number }, NO, NO, NO, NO }, },
|
||||||
|
{ "3.14", { { "3.14", Lexer::Type::number }, NO, NO, NO, NO }, },
|
||||||
|
{ "6.02217e23", { { "6.02217e23", Lexer::Type::number }, NO, NO, NO, NO }, },
|
||||||
|
{ "1.2e-3.4", { { "1.2e-3.4", Lexer::Type::number }, NO, NO, NO, NO }, },
|
||||||
{ "0x2f", { { "0x2f", Lexer::Type::hex }, NO, NO, NO, NO }, },
|
{ "0x2f", { { "0x2f", Lexer::Type::hex }, NO, NO, NO, NO }, },
|
||||||
|
|
||||||
};
|
};
|
||||||
|
@ -186,6 +215,7 @@ int main (int, char**)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
t.is (Lexer::typeName (Lexer::Type::number), "number", "Lexer::typeName (Lexer::Type::number)");
|
||||||
t.is (Lexer::typeName (Lexer::Type::hex), "hex", "Lexer::typeName (Lexer::Type::hex)");
|
t.is (Lexer::typeName (Lexer::Type::hex), "hex", "Lexer::typeName (Lexer::Type::hex)");
|
||||||
t.is (Lexer::typeName (Lexer::Type::string), "string", "Lexer::typeName (Lexer::Type::string)");
|
t.is (Lexer::typeName (Lexer::Type::string), "string", "Lexer::typeName (Lexer::Type::string)");
|
||||||
t.is (Lexer::typeName (Lexer::Type::word), "word", "Lexer::typeName (Lexer::Type::word)");
|
t.is (Lexer::typeName (Lexer::Type::word), "word", "Lexer::typeName (Lexer::Type::word)");
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue