mirror of
https://github.com/GothenburgBitFactory/timewarrior.git
synced 2025-06-26 10:54:28 +02:00
Lexer: Added string support
This commit is contained in:
parent
d236315450
commit
31c145ef9e
3 changed files with 300 additions and 3 deletions
241
src/Lexer.cpp
241
src/Lexer.cpp
|
@ -51,7 +51,8 @@ bool Lexer::token (std::string& token, Lexer::Type& type)
|
||||||
if (isEOS ())
|
if (isEOS ())
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (isWord (token, type))
|
if (isString (token, type, "'\"") ||
|
||||||
|
isWord (token, type))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
|
@ -99,6 +100,15 @@ bool Lexer::isWhitespace (int c)
|
||||||
c == 0x3000); // ideographic space Common Separator, space
|
c == 0x3000); // ideographic space Common Separator, space
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Digits 0-9 a-f A-F.
|
||||||
|
bool Lexer::isHexDigit (int c)
|
||||||
|
{
|
||||||
|
return (c >= '0' && c <= '9') ||
|
||||||
|
(c >= 'a' && c <= 'f') ||
|
||||||
|
(c >= 'A' && c <= 'F');
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
bool Lexer::isSingleCharOperator (int c)
|
bool Lexer::isSingleCharOperator (int c)
|
||||||
{
|
{
|
||||||
|
@ -117,12 +127,56 @@ bool Lexer::isSingleCharOperator (int c)
|
||||||
c == '~'; // Pattern match
|
c == '~'; // Pattern match
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Reports whether a word must end between the adjacent characters 'left' and
// 'right', even though no whitespace separates them.
bool Lexer::isHardBoundary (int left, int right)
{
  // EOS always ends a word.
  const bool atEnd = (right == '\0');

  // FILTER operators that don't need to be surrounded by whitespace.
  const bool parenOnLeft  = (left  == '(' || left  == ')');
  const bool parenOnRight = (right == '(' || right == ')');

  return atEnd || parenOnLeft || parenOnRight;
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
bool Lexer::isEOS () const
|
bool Lexer::isEOS () const
|
||||||
{
|
{
|
||||||
return _cursor >= _eos;
|
return _cursor >= _eos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Converts '0' -> 0
|
||||||
|
// '9' -> 9
|
||||||
|
// 'a'/'A' -> 10
|
||||||
|
// 'f'/'F' -> 15
|
||||||
|
int Lexer::hexToInt (int c)
|
||||||
|
{
|
||||||
|
if (c >= '0' && c <= '9') return (c - '0');
|
||||||
|
else if (c >= 'a' && c <= 'f') return (c - 'a' + 10);
|
||||||
|
else return (c - 'A' + 10);
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
int Lexer::hexToInt (int c0, int c1)
|
||||||
|
{
|
||||||
|
return (hexToInt (c0) << 4) + hexToInt (c1);
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
int Lexer::hexToInt (int c0, int c1, int c2, int c3)
|
||||||
|
{
|
||||||
|
return (hexToInt (c0) << 12) +
|
||||||
|
(hexToInt (c1) << 8) +
|
||||||
|
(hexToInt (c2) << 4) +
|
||||||
|
hexToInt (c3);
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
std::string Lexer::trimLeft (const std::string& in, const std::string& t /*= " "*/)
|
std::string Lexer::trimLeft (const std::string& in, const std::string& t /*= " "*/)
|
||||||
{
|
{
|
||||||
|
@ -149,6 +203,24 @@ std::string Lexer::trim (const std::string& in, const std::string& t /*= " "*/)
|
||||||
return trimLeft (trimRight (in, t), t);
|
return trimLeft (trimRight (in, t), t);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Lexer::Type::string
|
||||||
|
// '|"
|
||||||
|
// [ U+XXXX | \uXXXX | \" | \' | \\ | \/ | \b | \f | \n | \r | \t | . ]
|
||||||
|
// '|"
|
||||||
|
bool Lexer::isString (std::string& token, Lexer::Type& type, const std::string& quotes)
|
||||||
|
{
|
||||||
|
std::size_t marker = _cursor;
|
||||||
|
if (readWord (_text, quotes, marker, token))
|
||||||
|
{
|
||||||
|
type = Lexer::Type::string;
|
||||||
|
_cursor = marker;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Lexer::Type::word
|
// Lexer::Type::word
|
||||||
// [^\s]+
|
// [^\s]+
|
||||||
|
@ -173,3 +245,170 @@ bool Lexer::isWord (std::string& token, Lexer::Type& type)
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Full implementation of a quoted word. Includes:
|
||||||
|
// '\''
|
||||||
|
// '"'
|
||||||
|
// "'"
|
||||||
|
// "\""
|
||||||
|
// 'one two'
|
||||||
|
// Result includes the quotes.
|
||||||
|
bool Lexer::readWord (
|
||||||
|
const std::string& text,
|
||||||
|
const std::string& quotes,
|
||||||
|
std::string::size_type& cursor,
|
||||||
|
std::string& word)
|
||||||
|
{
|
||||||
|
if (quotes.find (text[cursor]) == std::string::npos)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
std::string::size_type eos = text.length ();
|
||||||
|
int quote = text[cursor++];
|
||||||
|
word = quote;
|
||||||
|
|
||||||
|
int c;
|
||||||
|
while ((c = text[cursor]))
|
||||||
|
{
|
||||||
|
// Quoted word ends on a quote.
|
||||||
|
if (quote && quote == c)
|
||||||
|
{
|
||||||
|
word += utf8_character (utf8_next_char (text, cursor));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unicode U+XXXX or \uXXXX codepoint.
|
||||||
|
else if (eos - cursor >= 6 &&
|
||||||
|
((text[cursor + 0] == 'U' && text[cursor + 1] == '+') ||
|
||||||
|
(text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) &&
|
||||||
|
isHexDigit (text[cursor + 2]) &&
|
||||||
|
isHexDigit (text[cursor + 3]) &&
|
||||||
|
isHexDigit (text[cursor + 4]) &&
|
||||||
|
isHexDigit (text[cursor + 5]))
|
||||||
|
{
|
||||||
|
word += utf8_character (
|
||||||
|
hexToInt (
|
||||||
|
text[cursor + 2],
|
||||||
|
text[cursor + 3],
|
||||||
|
text[cursor + 4],
|
||||||
|
text[cursor + 5]));
|
||||||
|
cursor += 6;
|
||||||
|
}
|
||||||
|
|
||||||
|
// An escaped thing.
|
||||||
|
else if (c == '\\')
|
||||||
|
{
|
||||||
|
c = text[++cursor];
|
||||||
|
|
||||||
|
switch (c)
|
||||||
|
{
|
||||||
|
case '"': word += (char) 0x22; ++cursor; break;
|
||||||
|
case '\'': word += (char) 0x27; ++cursor; break;
|
||||||
|
case '\\': word += (char) 0x5C; ++cursor; break;
|
||||||
|
case 'b': word += (char) 0x08; ++cursor; break;
|
||||||
|
case 'f': word += (char) 0x0C; ++cursor; break;
|
||||||
|
case 'n': word += (char) 0x0A; ++cursor; break;
|
||||||
|
case 'r': word += (char) 0x0D; ++cursor; break;
|
||||||
|
case 't': word += (char) 0x09; ++cursor; break;
|
||||||
|
case 'v': word += (char) 0x0B; ++cursor; break;
|
||||||
|
|
||||||
|
// This pass-through default case means that anything can be escaped
|
||||||
|
// harmlessly. In particular 'quote' is included, if it not one of the
|
||||||
|
// above characters.
|
||||||
|
default: word += (char) c; ++cursor; break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ordinary character.
|
||||||
|
else
|
||||||
|
word += utf8_character (utf8_next_char (text, cursor));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify termination.
|
||||||
|
return word[0] == quote &&
|
||||||
|
word[word.length () - 1] == quote &&
|
||||||
|
word.length () >= 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Full implementation of an unquoted word. Includes:
|
||||||
|
// one\ two
|
||||||
|
// abcU+0020def
|
||||||
|
// abc\u0020def
|
||||||
|
// a\tb
|
||||||
|
//
|
||||||
|
// Ends at:
|
||||||
|
// Lexer::isEOS
|
||||||
|
// Lexer::isWhitespace
|
||||||
|
// Lexer::isHardBoundary
|
||||||
|
bool Lexer::readWord (
|
||||||
|
const std::string& text,
|
||||||
|
std::string::size_type& cursor,
|
||||||
|
std::string& word)
|
||||||
|
{
|
||||||
|
std::string::size_type eos = text.length ();
|
||||||
|
|
||||||
|
word = "";
|
||||||
|
int c;
|
||||||
|
int prev = 0;
|
||||||
|
while ((c = text[cursor])) // Handles EOS.
|
||||||
|
{
|
||||||
|
// Unquoted word ends on white space.
|
||||||
|
if (Lexer::isWhitespace (c))
|
||||||
|
break;
|
||||||
|
|
||||||
|
// Parentheses mostly.
|
||||||
|
if (prev && Lexer::isHardBoundary (prev, c))
|
||||||
|
break;
|
||||||
|
|
||||||
|
// Unicode U+XXXX or \uXXXX codepoint.
|
||||||
|
else if (eos - cursor >= 6 &&
|
||||||
|
((text[cursor + 0] == 'U' && text[cursor + 1] == '+') ||
|
||||||
|
(text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) &&
|
||||||
|
isHexDigit (text[cursor + 2]) &&
|
||||||
|
isHexDigit (text[cursor + 3]) &&
|
||||||
|
isHexDigit (text[cursor + 4]) &&
|
||||||
|
isHexDigit (text[cursor + 5]))
|
||||||
|
{
|
||||||
|
word += utf8_character (
|
||||||
|
hexToInt (
|
||||||
|
text[cursor + 2],
|
||||||
|
text[cursor + 3],
|
||||||
|
text[cursor + 4],
|
||||||
|
text[cursor + 5]));
|
||||||
|
cursor += 6;
|
||||||
|
}
|
||||||
|
|
||||||
|
// An escaped thing.
|
||||||
|
else if (c == '\\')
|
||||||
|
{
|
||||||
|
c = text[++cursor];
|
||||||
|
|
||||||
|
switch (c)
|
||||||
|
{
|
||||||
|
case '"': word += (char) 0x22; ++cursor; break;
|
||||||
|
case '\'': word += (char) 0x27; ++cursor; break;
|
||||||
|
case '\\': word += (char) 0x5C; ++cursor; break;
|
||||||
|
case 'b': word += (char) 0x08; ++cursor; break;
|
||||||
|
case 'f': word += (char) 0x0C; ++cursor; break;
|
||||||
|
case 'n': word += (char) 0x0A; ++cursor; break;
|
||||||
|
case 'r': word += (char) 0x0D; ++cursor; break;
|
||||||
|
case 't': word += (char) 0x09; ++cursor; break;
|
||||||
|
case 'v': word += (char) 0x0B; ++cursor; break;
|
||||||
|
|
||||||
|
// This pass-through default case means that anything can be escaped
|
||||||
|
// harmlessly. In particular 'quote' is included, if it not one of the
|
||||||
|
// above characters.
|
||||||
|
default: word += (char) c; ++cursor; break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ordinary character.
|
||||||
|
else
|
||||||
|
word += utf8_character (utf8_next_char (text, cursor));
|
||||||
|
|
||||||
|
prev = c;
|
||||||
|
}
|
||||||
|
|
||||||
|
return word.length () > 0 ? true : false;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
11
src/Lexer.h
11
src/Lexer.h
|
@ -35,20 +35,29 @@
|
||||||
class Lexer
|
class Lexer
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
enum class Type { word };
|
enum class Type { string,
|
||||||
|
word };
|
||||||
|
|
||||||
Lexer (const std::string&);
|
Lexer (const std::string&);
|
||||||
bool token (std::string&, Lexer::Type&);
|
bool token (std::string&, Lexer::Type&);
|
||||||
|
|
||||||
// Static helpers.
|
// Static helpers.
|
||||||
static bool isWhitespace (int);
|
static bool isWhitespace (int);
|
||||||
|
static bool isHexDigit (int);
|
||||||
static bool isSingleCharOperator (int);
|
static bool isSingleCharOperator (int);
|
||||||
|
static bool isHardBoundary (int, int);
|
||||||
|
static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
|
||||||
|
static bool readWord (const std::string&, std::string::size_type&, std::string&);
|
||||||
|
static int hexToInt (int);
|
||||||
|
static int hexToInt (int, int);
|
||||||
|
static int hexToInt (int, int, int, int);
|
||||||
static std::string trimLeft (const std::string& in, const std::string& t = " ");
|
static std::string trimLeft (const std::string& in, const std::string& t = " ");
|
||||||
static std::string trimRight (const std::string& in, const std::string& t = " ");
|
static std::string trimRight (const std::string& in, const std::string& t = " ");
|
||||||
static std::string trim (const std::string& in, const std::string& t = " ");
|
static std::string trim (const std::string& in, const std::string& t = " ");
|
||||||
|
|
||||||
// Stream Classifiers.
|
// Stream Classifiers.
|
||||||
bool isEOS () const;
|
bool isEOS () const;
|
||||||
|
bool isString (std::string&, Lexer::Type&, const std::string&);
|
||||||
bool isWord (std::string&, Lexer::Type&);
|
bool isWord (std::string&, Lexer::Type&);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
int main (int, char**)
|
int main (int, char**)
|
||||||
{
|
{
|
||||||
UnitTest t (50);
|
UnitTest t (74);
|
||||||
|
|
||||||
std::vector <std::pair <std::string, Lexer::Type>> tokens;
|
std::vector <std::pair <std::string, Lexer::Type>> tokens;
|
||||||
std::string token;
|
std::string token;
|
||||||
|
@ -77,6 +77,55 @@ int main (int, char**)
|
||||||
Lexer l1 (" \t ");
|
Lexer l1 (" \t ");
|
||||||
t.notok (l1.token (token, type), "' \\t ' --> no tokens");
|
t.notok (l1.token (token, type), "' \\t ' --> no tokens");
|
||||||
|
|
||||||
|
|
||||||
|
// static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
|
||||||
|
std::string::size_type cursor = 0;
|
||||||
|
std::string word;
|
||||||
|
t.ok (Lexer::readWord ("'one two'", "'\"", cursor, word), "readWord ''one two'' --> true");
|
||||||
|
t.is (word, "'one two'", " word '" + word + "'");
|
||||||
|
t.is ((int)cursor, 9, " cursor");
|
||||||
|
|
||||||
|
// Unterminated quoted string is invalid.
|
||||||
|
cursor = 0;
|
||||||
|
t.notok (Lexer::readWord ("'one", "'\"", cursor, word), "readWord ''one' --> false");
|
||||||
|
|
||||||
|
// static bool readWord (const std::string&, std::string::size_type&, std::string&);
|
||||||
|
cursor = 0;
|
||||||
|
t.ok (Lexer::readWord ("input", cursor, word), "readWord 'input' --> true");
|
||||||
|
t.is (word, "input", " word '" + word + "'");
|
||||||
|
t.is ((int)cursor, 5, " cursor");
|
||||||
|
|
||||||
|
cursor = 0;
|
||||||
|
t.ok (Lexer::readWord ("one\\ two", cursor, word), "readWord 'one\\ two' --> true");
|
||||||
|
t.is (word, "one two", " word '" + word + "'");
|
||||||
|
t.is ((int)cursor, 8, " cursor");
|
||||||
|
|
||||||
|
cursor = 0;
|
||||||
|
t.ok (Lexer::readWord ("\\u20A43", cursor, word), "readWord '\\u20A43' --> true");
|
||||||
|
t.is (word, "₤3", " word '" + word + "'");
|
||||||
|
t.is ((int)cursor, 7, " cursor");
|
||||||
|
|
||||||
|
cursor = 0;
|
||||||
|
t.ok (Lexer::readWord ("U+20AC4", cursor, word), "readWord '\\u20AC4' --> true");
|
||||||
|
t.is (word, "€4", " word '" + word + "'");
|
||||||
|
t.is ((int)cursor, 7, " cursor");
|
||||||
|
|
||||||
|
std::string text = "one 'two' three\\ four";
|
||||||
|
cursor = 0;
|
||||||
|
t.ok (Lexer::readWord (text, cursor, word), "readWord \"one 'two' three\\ four\" --> true");
|
||||||
|
t.is (word, "one", " word '" + word + "'");
|
||||||
|
cursor++;
|
||||||
|
t.ok (Lexer::readWord (text, cursor, word), "readWord \"one 'two' three\\ four\" --> true");
|
||||||
|
t.is (word, "'two'", " word '" + word + "'");
|
||||||
|
cursor++;
|
||||||
|
t.ok (Lexer::readWord (text, cursor, word), "readWord \"one 'two' three\\ four\" --> true");
|
||||||
|
t.is (word, "three four", " word '" + word + "'");
|
||||||
|
|
||||||
|
text = "one ";
|
||||||
|
cursor = 0;
|
||||||
|
t.ok (Lexer::readWord (text, cursor, word), "readWord \"one \" --> true");
|
||||||
|
t.is (word, "one", " word '" + word + "'");
|
||||||
|
|
||||||
// std::string Lexer::trimLeft (const std::string& in, const std::string&)
|
// std::string Lexer::trimLeft (const std::string& in, const std::string&)
|
||||||
t.is (Lexer::trimLeft (""), "", "Lexer::trimLeft '' -> ''");
|
t.is (Lexer::trimLeft (""), "", "Lexer::trimLeft '' -> ''");
|
||||||
t.is (Lexer::trimLeft (" "), "", "Lexer::trimLeft ' ' -> ''");
|
t.is (Lexer::trimLeft (" "), "", "Lexer::trimLeft ' ' -> ''");
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue