mirror of
https://github.com/GothenburgBitFactory/taskwarrior.git
synced 2025-07-07 20:06:36 +02:00
Lexer: Implemented ::readWord
- Lexer::readWord is a general-purpose text parser, for finding plain words and quoted strings. It supports \uNNNN and U+NNNN unicode sequences, and general escapes, \t, \', \" etc.
This commit is contained in:
parent
26e6dd2131
commit
d82da280cb
2 changed files with 100 additions and 8 deletions
|
@ -342,7 +342,7 @@ bool Lexer::isEOS () const
|
|||
// '9' -> 9
|
||||
// 'a'/'A' -> 10
|
||||
// 'f'/'F' -> 15
|
||||
int Lexer::hexToInt (int c) const
|
||||
int Lexer::hexToInt (int c)
|
||||
{
|
||||
if (c >= '0' && c <= '9') return (c - '0');
|
||||
else if (c >= 'a' && c <= 'f') return (c - 'a' + 10);
|
||||
|
@ -350,13 +350,13 @@ int Lexer::hexToInt (int c) const
|
|||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
int Lexer::hexToInt (int c0, int c1) const
|
||||
int Lexer::hexToInt (int c0, int c1)
|
||||
{
|
||||
return (hexToInt (c0) << 4) + hexToInt (c1);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
int Lexer::hexToInt (int c0, int c1, int c2, int c3) const
|
||||
int Lexer::hexToInt (int c0, int c1, int c2, int c3)
|
||||
{
|
||||
return (hexToInt (c0) << 12) +
|
||||
(hexToInt (c1) << 8) +
|
||||
|
@ -1209,6 +1209,7 @@ bool Lexer::isAllDigits (const std::string& text)
|
|||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Not escape-proof.
|
||||
bool Lexer::isOneWord (const std::string& text)
|
||||
{
|
||||
std::string::size_type i = 0;
|
||||
|
@ -1220,6 +1221,97 @@ bool Lexer::isOneWord (const std::string& text)
|
|||
return true;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Full implementation of a word. Includes:
|
||||
// one\ two
|
||||
// '\''
|
||||
// '"'
|
||||
// "'"
|
||||
// "\""
|
||||
// 'one two'
|
||||
// abcU+0020def
|
||||
// abc\u0020def
|
||||
// a\tb
|
||||
bool Lexer::readWord (
|
||||
const std::string& text,
|
||||
const std::string& quotes,
|
||||
std::string::size_type& cursor,
|
||||
std::string& word)
|
||||
{
|
||||
std::string::size_type eos = text.length ();
|
||||
|
||||
int quote = 0;
|
||||
if (quotes.find (text[cursor]) != std::string::npos)
|
||||
quote = text[cursor++];
|
||||
|
||||
word = "";
|
||||
int c;
|
||||
while ((c = text[cursor]))
|
||||
{
|
||||
// Quoted word ends on a quote.
|
||||
if (quote && quote == c)
|
||||
{
|
||||
++cursor;
|
||||
break;
|
||||
}
|
||||
|
||||
// Unquoted word ends on white space.
|
||||
if (! quote && Lexer::isWhitespace (c))
|
||||
{
|
||||
++cursor;
|
||||
break;
|
||||
}
|
||||
|
||||
// Unicode U+XXXX or \uXXXX codepoint.
|
||||
else if (eos - cursor >= 6 &&
|
||||
((text[cursor + 0] == 'U' && text[cursor + 1] == '+') ||
|
||||
(text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) &&
|
||||
isHexDigit (text[cursor + 2]) &&
|
||||
isHexDigit (text[cursor + 3]) &&
|
||||
isHexDigit (text[cursor + 4]) &&
|
||||
isHexDigit (text[cursor + 5]))
|
||||
{
|
||||
word += utf8_character (
|
||||
hexToInt (
|
||||
text[cursor + 2],
|
||||
text[cursor + 3],
|
||||
text[cursor + 4],
|
||||
text[cursor + 5]));
|
||||
cursor += 6;
|
||||
}
|
||||
|
||||
// An escaped thing.
|
||||
else if (c == '\\')
|
||||
{
|
||||
c = text[++cursor];
|
||||
|
||||
switch (c)
|
||||
{
|
||||
case '"': word += (char) 0x22; ++cursor; break;
|
||||
case '\'': word += (char) 0x27; ++cursor; break;
|
||||
case '\\': word += (char) 0x5C; ++cursor; break;
|
||||
case 'b': word += (char) 0x08; ++cursor; break;
|
||||
case 'f': word += (char) 0x0C; ++cursor; break;
|
||||
case 'n': word += (char) 0x0A; ++cursor; break;
|
||||
case 'r': word += (char) 0x0D; ++cursor; break;
|
||||
case 't': word += (char) 0x09; ++cursor; break;
|
||||
case 'v': word += (char) 0x0B; ++cursor; break;
|
||||
|
||||
// This pass-through default case means that anything can be escaped
|
||||
// harmlessly. In particular 'quote' is included, if it not one of the
|
||||
// above characters.
|
||||
default: word += (char) c; ++cursor; break;
|
||||
}
|
||||
}
|
||||
|
||||
// Ordinary character.
|
||||
else
|
||||
word += utf8_character (utf8_next_char (text, cursor));
|
||||
}
|
||||
|
||||
return word.length () > 0 ? true : false;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// <name> [. <modifier>] <: | = | :: | :=> [<value>]
|
||||
bool Lexer::decomposePair (
|
||||
|
|
10
src/Lexer.h
10
src/Lexer.h
|
@ -75,13 +75,13 @@ public:
|
|||
static bool isOneWord (const std::string&);
|
||||
static void dequote (std::string&);
|
||||
static bool wasQuoted (const std::string&);
|
||||
static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
|
||||
static bool decomposePair (const std::string&, std::string&, std::string&, std::string&, std::string&);
|
||||
static int hexToInt (int);
|
||||
static int hexToInt (int, int);
|
||||
static int hexToInt (int, int, int, int);
|
||||
|
||||
// Helpers.
|
||||
bool isEOS () const;
|
||||
int hexToInt (int) const;
|
||||
int hexToInt (int, int) const;
|
||||
int hexToInt (int, int, int, int) const;
|
||||
bool isEOS () const;
|
||||
|
||||
// Stream Classifiers.
|
||||
bool isString (std::string&, Lexer::Type&, int quote);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue