mirror of
https://github.com/GothenburgBitFactory/taskwarrior.git
synced 2025-06-26 10:54:26 +02:00
Lexer: Integrated ::commonLength
- Uses std::string::size_type for all string lengths and offsets.
- Rewrote ::isLiteral to be simpler.
- Added support for abbreviated DOM refs.
- Obeys rc.abbreviation.minimum, indirectly.
- Added tests.
This commit is contained in:
parent
a9b701ae6d
commit
0c7e731b0d
3 changed files with 116 additions and 83 deletions
100
src/Lexer.cpp
100
src/Lexer.cpp
|
@ -37,7 +37,7 @@ static const unsigned int uuid_min_length = 8;
|
|||
|
||||
std::string Lexer::dateFormat = "";
|
||||
bool Lexer::isoEnabled = true;
|
||||
int Lexer::minimumMatchLength = 3;
|
||||
std::string::size_type Lexer::minimumMatchLength = 3;
|
||||
std::map <std::string, std::string> Lexer::attributes;
|
||||
|
||||
|
||||
|
@ -373,7 +373,9 @@ int Lexer::hexToInt (int c0, int c1, int c2, int c3)
|
|||
// left: wonderful
|
||||
// right: wonderbread
|
||||
// returns: ^ 6
|
||||
int Lexer::commonLength (const std::string& left, const std::string& right)
|
||||
std::string::size_type Lexer::commonLength (
|
||||
const std::string& left,
|
||||
const std::string& right)
|
||||
{
|
||||
std::string::size_type l = 0;
|
||||
std::string::size_type r = 0;
|
||||
|
@ -382,7 +384,7 @@ int Lexer::commonLength (const std::string& left, const std::string& right)
|
|||
utf8_next_char (right, r))
|
||||
;
|
||||
|
||||
return (int) l;
|
||||
return l;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -393,7 +395,7 @@ int Lexer::commonLength (const std::string& left, const std::string& right)
|
|||
// right: prowonderbread
|
||||
// r: ^
|
||||
// returns: ^ 6
|
||||
int Lexer::commonLength (
|
||||
std::string::size_type Lexer::commonLength (
|
||||
const std::string& left,
|
||||
std::string::size_type l,
|
||||
const std::string& right,
|
||||
|
@ -404,7 +406,7 @@ int Lexer::commonLength (
|
|||
utf8_next_char (right, r))
|
||||
;
|
||||
|
||||
return (int) l;
|
||||
return l;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -1077,7 +1079,7 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
|
|||
|
||||
std::string partialToken;
|
||||
Lexer::Type partialType;
|
||||
if (isLiteral ("rc.", false) &&
|
||||
if (isLiteral ("rc.", false, false) &&
|
||||
isWord (partialToken, partialType))
|
||||
{
|
||||
token = _text.substr (marker, _cursor - marker);
|
||||
|
@ -1090,7 +1092,7 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
|
|||
"context.width",
|
||||
"context.height",
|
||||
"system.version",
|
||||
"system.os"}, true))
|
||||
"system.os"}, false, true))
|
||||
{
|
||||
token = _text.substr (marker, _cursor - marker);
|
||||
type = Lexer::Type::dom;
|
||||
|
@ -1105,7 +1107,7 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
|
|||
if (isUUID (extractedToken, extractedType, false) ||
|
||||
isInteger (extractedToken, extractedType))
|
||||
{
|
||||
if (! isLiteral (".", false))
|
||||
if (! isLiteral (".", false, false))
|
||||
{
|
||||
_cursor = marker;
|
||||
return false;
|
||||
|
@ -1116,8 +1118,9 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
|
|||
std::size_t checkpoint = _cursor;
|
||||
|
||||
// [prefix]tags.<word>
|
||||
if (isLiteral ("tags.", false) &&
|
||||
isWord (partialToken, partialType))
|
||||
if (isLiteral ("tags", true, false) &&
|
||||
isLiteral (".", false, false) &&
|
||||
isWord (partialToken, partialType))
|
||||
{
|
||||
token = _text.substr (marker, _cursor - marker);
|
||||
type = Lexer::Type::dom;
|
||||
|
@ -1127,28 +1130,26 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
|
|||
_cursor = checkpoint;
|
||||
|
||||
// [prefix]attribute
|
||||
if (isOneOf (attributes, true))
|
||||
if (isOneOf (attributes, true, true))
|
||||
{
|
||||
token = _text.substr (marker, _cursor - marker);
|
||||
type = Lexer::Type::dom;
|
||||
return true;
|
||||
}
|
||||
else
|
||||
_cursor = checkpoint;
|
||||
|
||||
// [prefix]attribute
|
||||
if (isOneOf (attributes, false))
|
||||
if (isOneOf (attributes, true, false))
|
||||
{
|
||||
if (isLiteral (".", false))
|
||||
if (isLiteral (".", false, false))
|
||||
{
|
||||
std::string attribute = _text.substr (checkpoint, _cursor - checkpoint - 1);
|
||||
|
||||
// if attribute type is 'date'
|
||||
// if attribute type is 'date', then it has sub-elements.
|
||||
if (attributes[attribute] == "date" &&
|
||||
isOneOf ({"year", "month", "day",
|
||||
"week", "weekday",
|
||||
"julian",
|
||||
"hour", "minute", "second"}, true))
|
||||
"hour", "minute", "second"}, true, true))
|
||||
{
|
||||
token = _text.substr (marker, _cursor - marker);
|
||||
type = Lexer::Type::dom;
|
||||
|
@ -1162,35 +1163,35 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
|
|||
return true;
|
||||
}
|
||||
}
|
||||
else
|
||||
_cursor = checkpoint;
|
||||
|
||||
// [prefix]annotations.
|
||||
if (isLiteral ("annotations.", false))
|
||||
if (isLiteral ("annotations", true, false) &&
|
||||
isLiteral (".", false, true))
|
||||
{
|
||||
std::string extractedToken;
|
||||
Lexer::Type extractedType;
|
||||
if (isInteger (extractedToken, extractedType))
|
||||
{
|
||||
if (isLiteral (".", false))
|
||||
if (isLiteral (".", false, false))
|
||||
{
|
||||
if (isLiteral ("description", true))
|
||||
if (isLiteral ("description", true, true))
|
||||
{
|
||||
token = _text.substr (marker, _cursor - marker);
|
||||
type = Lexer::Type::dom;
|
||||
return true;
|
||||
}
|
||||
else if (isLiteral ("entry", true))
|
||||
else if (isLiteral ("entry", true, true))
|
||||
{
|
||||
token = _text.substr (marker, _cursor - marker);
|
||||
type = Lexer::Type::dom;
|
||||
return true;
|
||||
}
|
||||
else if (isLiteral ("entry.", false) &&
|
||||
else if (isLiteral ("entry", true, false) &&
|
||||
isLiteral (".", false, true) &&
|
||||
isOneOf ({"year", "month", "day",
|
||||
"week", "weekday",
|
||||
"julian",
|
||||
"hour", "minute", "second"}, true))
|
||||
"hour", "minute", "second"}, true, true))
|
||||
{
|
||||
token = _text.substr (marker, _cursor - marker);
|
||||
type = Lexer::Type::dom;
|
||||
|
@ -1251,37 +1252,54 @@ bool Lexer::isWord (std::string& token, Lexer::Type& type)
|
|||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
bool Lexer::isLiteral (const std::string& literal, bool endBoundary)
|
||||
bool Lexer::isLiteral (
|
||||
const std::string& literal,
|
||||
bool allowAbbreviations,
|
||||
bool endBoundary)
|
||||
{
|
||||
auto len = literal.length ();
|
||||
if (_text.find (literal, _cursor) == _cursor &&
|
||||
(! endBoundary ||
|
||||
_text.length () == _cursor + len ||
|
||||
Lexer::isWhitespace (_text[_cursor + len]) ||
|
||||
Lexer::isSingleCharOperator (_text[_cursor + len])))
|
||||
{
|
||||
_cursor += len;
|
||||
return true;
|
||||
}
|
||||
auto common = commonLength (literal, 0, _text, _cursor);
|
||||
|
||||
return false;
|
||||
// Without abbreviations, common must equal literal length.
|
||||
if (! allowAbbreviations &&
|
||||
common < literal.length ())
|
||||
return false;
|
||||
|
||||
// Abbreviations must meet the minimum size.
|
||||
if (allowAbbreviations &&
|
||||
common < minimumMatchLength)
|
||||
return false;
|
||||
|
||||
// End boundary conditions must be met.
|
||||
if (endBoundary &&
|
||||
! Lexer::isWhitespace (_text[_cursor + common]) &&
|
||||
! Lexer::isSingleCharOperator (_text[_cursor + common]))
|
||||
return false;
|
||||
|
||||
_cursor += common;
|
||||
return true;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
bool Lexer::isOneOf (const std::vector <std::string>& options, bool endBoundary)
|
||||
bool Lexer::isOneOf (
|
||||
const std::vector <std::string>& options,
|
||||
bool allowAbbreviations,
|
||||
bool endBoundary)
|
||||
{
|
||||
for (auto& item : options)
|
||||
if (isLiteral (item, endBoundary))
|
||||
if (isLiteral (item, allowAbbreviations, endBoundary))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
bool Lexer::isOneOf (const std::map <std::string, std::string>& options, bool endBoundary)
|
||||
bool Lexer::isOneOf (
|
||||
const std::map <std::string, std::string>& options,
|
||||
bool allowAbbreviations,
|
||||
bool endBoundary)
|
||||
{
|
||||
for (auto& item : options)
|
||||
if (isLiteral (item.first, endBoundary))
|
||||
if (isLiteral (item.first, allowAbbreviations, endBoundary))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
|
|
63
src/Lexer.h
63
src/Lexer.h
|
@ -41,7 +41,7 @@ public:
|
|||
// These are overridable.
|
||||
static std::string dateFormat;
|
||||
static bool isoEnabled;
|
||||
static int minimumMatchLength;
|
||||
static std::string::size_type minimumMatchLength;
|
||||
static std::map <std::string, std::string> attributes;
|
||||
|
||||
enum class Type { uuid, number, hex,
|
||||
|
@ -61,36 +61,35 @@ public:
|
|||
static std::string typeToString (Lexer::Type);
|
||||
|
||||
// Static helpers.
|
||||
static const std::string typeName (const Lexer::Type&);
|
||||
static bool isWhitespace (int);
|
||||
static bool isAlpha (int);
|
||||
static bool isDigit (int);
|
||||
static bool isHexDigit (int);
|
||||
static bool isIdentifierStart (int);
|
||||
static bool isIdentifierNext (int);
|
||||
static bool isSingleCharOperator (int);
|
||||
static bool isDoubleCharOperator (int, int, int);
|
||||
static bool isTripleCharOperator (int, int, int, int);
|
||||
static bool isBoundary (int, int);
|
||||
static bool isHardBoundary (int, int);
|
||||
static bool isPunctuation (int);
|
||||
static bool isAllDigits (const std::string&);
|
||||
static void dequote (std::string&, const std::string& quotes = "'\"");
|
||||
static bool wasQuoted (const std::string&);
|
||||
static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
|
||||
static bool readWord (const std::string&, std::string::size_type&, std::string&);
|
||||
static bool decomposePair (const std::string&, std::string&, std::string&, std::string&, std::string&);
|
||||
static bool decomposeSubstitution (const std::string&, std::string&, std::string&, std::string&);
|
||||
static bool decomposePattern (const std::string&, std::string&, std::string&);
|
||||
static int hexToInt (int);
|
||||
static int hexToInt (int, int);
|
||||
static int hexToInt (int, int, int, int);
|
||||
static int commonLength (const std::string&, const std::string&);
|
||||
static int commonLength (const std::string&, std::string::size_type, const std::string&, std::string::size_type);
|
||||
|
||||
bool isEOS () const;
|
||||
static const std::string typeName (const Lexer::Type&);
|
||||
static bool isWhitespace (int);
|
||||
static bool isAlpha (int);
|
||||
static bool isDigit (int);
|
||||
static bool isHexDigit (int);
|
||||
static bool isIdentifierStart (int);
|
||||
static bool isIdentifierNext (int);
|
||||
static bool isSingleCharOperator (int);
|
||||
static bool isDoubleCharOperator (int, int, int);
|
||||
static bool isTripleCharOperator (int, int, int, int);
|
||||
static bool isBoundary (int, int);
|
||||
static bool isHardBoundary (int, int);
|
||||
static bool isPunctuation (int);
|
||||
static bool isAllDigits (const std::string&);
|
||||
static void dequote (std::string&, const std::string& quotes = "'\"");
|
||||
static bool wasQuoted (const std::string&);
|
||||
static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
|
||||
static bool readWord (const std::string&, std::string::size_type&, std::string&);
|
||||
static bool decomposePair (const std::string&, std::string&, std::string&, std::string&, std::string&);
|
||||
static bool decomposeSubstitution (const std::string&, std::string&, std::string&, std::string&);
|
||||
static bool decomposePattern (const std::string&, std::string&, std::string&);
|
||||
static int hexToInt (int);
|
||||
static int hexToInt (int, int);
|
||||
static int hexToInt (int, int, int, int);
|
||||
static std::string::size_type commonLength (const std::string&, const std::string&);
|
||||
static std::string::size_type commonLength (const std::string&, std::string::size_type, const std::string&, std::string::size_type);
|
||||
|
||||
// Stream Classifiers.
|
||||
bool isEOS () const;
|
||||
bool isString (std::string&, Lexer::Type&, const std::string&);
|
||||
bool isDate (std::string&, Lexer::Type&);
|
||||
bool isDuration (std::string&, Lexer::Type&);
|
||||
|
@ -110,9 +109,9 @@ public:
|
|||
bool isDOM (std::string&, Lexer::Type&);
|
||||
bool isIdentifier (std::string&, Lexer::Type&);
|
||||
bool isWord (std::string&, Lexer::Type&);
|
||||
bool isLiteral (const std::string&, bool);
|
||||
bool isOneOf (const std::vector <std::string>&, bool);
|
||||
bool isOneOf (const std::map <std::string, std::string>&, bool);
|
||||
bool isLiteral (const std::string&, bool, bool);
|
||||
bool isOneOf (const std::vector <std::string>&, bool, bool);
|
||||
bool isOneOf (const std::map <std::string, std::string>&, bool, bool);
|
||||
|
||||
private:
|
||||
std::string _text;
|
||||
|
|
|
@ -37,7 +37,7 @@ Context context;
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
int main (int argc, char** argv)
|
||||
{
|
||||
UnitTest t (1160);
|
||||
UnitTest t (1170);
|
||||
|
||||
std::vector <std::pair <std::string, Lexer::Type>> tokens;
|
||||
std::string token;
|
||||
|
@ -265,18 +265,34 @@ int main (int argc, char** argv)
|
|||
t.ok (Lexer::readWord (text, cursor, word), "readWord \"one \" --> true");
|
||||
t.is (word, "one", " word '" + word + "'");
|
||||
|
||||
// bool isLiteral (const std::string&, bool);
|
||||
// bool isLiteral (const std::string&, bool, bool);
|
||||
Lexer l4 ("one.two");
|
||||
t.notok (l4.isLiteral("zero", false), "isLiteral 'one.two' --> false");
|
||||
t.ok (l4.isLiteral("one", false), "isLiteral 'one.two' --> 'one'");
|
||||
t.ok (l4.isLiteral(".", false), "isLiteral 'one.two' --> '.'");
|
||||
t.ok (l4.isLiteral("two", true), "isLiteral 'one.two' --> 'two'");
|
||||
t.notok (l4.isLiteral("zero", false, false), "isLiteral 'one.two' --> false");
|
||||
t.ok (l4.isLiteral("one", false, false), "isLiteral 'one.two' --> 'one'");
|
||||
t.ok (l4.isLiteral(".", false, false), "isLiteral 'one.two' --> '.'");
|
||||
t.ok (l4.isLiteral("two", false, true), "isLiteral 'one.two' --> 'two'");
|
||||
|
||||
// bool isOneOf (const std::string&, bool);
|
||||
Lexer l5 ("Grumpy.");
|
||||
Lexer l5 ("wonderful");
|
||||
t.notok (l5.isLiteral ("wonder", false, false), "isLiteral 'wonder' != 'wonderful' without abbreviation");
|
||||
t.ok (l5.isLiteral ("wonder", true, false), "isLiteral 'wonder' == 'wonderful' with abbreviation");
|
||||
|
||||
// bool isOneOf (const std::string&, bool, bool);
|
||||
Lexer l6 ("Grumpy.");
|
||||
std::vector <std::string> dwarves = {"Sneezy", "Doc", "Bashful", "Grumpy", "Happy", "Sleepy", "Dopey"};
|
||||
t.notok (l5.isOneOf (dwarves, true), "isOneof ('Grumpy', true) --> false");
|
||||
t.ok (l5.isOneOf (dwarves, false), "isOneOf ('Grumpy', false) --> true");
|
||||
t.notok (l6.isOneOf (dwarves, false, true), "isOneof ('Grumpy', true) --> false");
|
||||
t.ok (l6.isOneOf (dwarves, false, false), "isOneOf ('Grumpy', false) --> true");
|
||||
|
||||
// static std::string::size_type commonLength (const std::string&, const std::string&);
|
||||
t.is ((int)Lexer::commonLength ("", ""), 0, "commonLength '' : '' --> 0");
|
||||
t.is ((int)Lexer::commonLength ("a", "a"), 1, "commonLength 'a' : 'a' --> 1");
|
||||
t.is ((int)Lexer::commonLength ("abcde", "abcde"), 5, "commonLength 'abcde' : 'abcde' --> 5");
|
||||
t.is ((int)Lexer::commonLength ("abc", ""), 0, "commonLength 'abc' : '' --> 0");
|
||||
t.is ((int)Lexer::commonLength ("abc", "def"), 0, "commonLength 'abc' : 'def' --> 0");
|
||||
t.is ((int)Lexer::commonLength ("foobar", "foo"), 3, "commonLength 'foobar' : 'foo' --> 3");
|
||||
t.is ((int)Lexer::commonLength ("foo", "foobar"), 3, "commonLength 'foo' : 'foobar' --> 3");
|
||||
|
||||
// static std::string::size_type commonLength (const std::string&, std::string::size_type, const std::string&, std::string::size_type);
|
||||
t.is ((int)Lexer::commonLength ("wonder", 0, "prowonderbread", 3), 6, "'wonder'+0 : 'prowonderbread'+3 --> 6");
|
||||
|
||||
// Test all Lexer types.
|
||||
#define NO {"",Lexer::Type::word}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue