Lexer: Integrated ::commonLength

- Uses std::string::size_type for all string lengths and offsets.
- Rewrote ::isLiteral to be simpler.
- Added support for abbreviated DOM refs.
- Obeys rc.abbreviation.minimum, indirectly.
- Added tests.
This commit is contained in:
Paul Beckingham 2015-07-27 00:31:15 -04:00
parent a9b701ae6d
commit 0c7e731b0d
3 changed files with 116 additions and 83 deletions

View file

@ -37,7 +37,7 @@ static const unsigned int uuid_min_length = 8;
std::string Lexer::dateFormat = "";
bool Lexer::isoEnabled = true;
int Lexer::minimumMatchLength = 3;
std::string::size_type Lexer::minimumMatchLength = 3;
std::map <std::string, std::string> Lexer::attributes;
@ -373,7 +373,9 @@ int Lexer::hexToInt (int c0, int c1, int c2, int c3)
// left: wonderful
// right: wonderbread
// returns: ^ 6
int Lexer::commonLength (const std::string& left, const std::string& right)
std::string::size_type Lexer::commonLength (
const std::string& left,
const std::string& right)
{
std::string::size_type l = 0;
std::string::size_type r = 0;
@ -382,7 +384,7 @@ int Lexer::commonLength (const std::string& left, const std::string& right)
utf8_next_char (right, r))
;
return (int) l;
return l;
}
////////////////////////////////////////////////////////////////////////////////
@ -393,7 +395,7 @@ int Lexer::commonLength (const std::string& left, const std::string& right)
// right: prowonderbread
// r: ^
// returns: ^ 6
int Lexer::commonLength (
std::string::size_type Lexer::commonLength (
const std::string& left,
std::string::size_type l,
const std::string& right,
@ -404,7 +406,7 @@ int Lexer::commonLength (
utf8_next_char (right, r))
;
return (int) l;
return l;
}
////////////////////////////////////////////////////////////////////////////////
@ -1077,7 +1079,7 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
std::string partialToken;
Lexer::Type partialType;
if (isLiteral ("rc.", false) &&
if (isLiteral ("rc.", false, false) &&
isWord (partialToken, partialType))
{
token = _text.substr (marker, _cursor - marker);
@ -1090,7 +1092,7 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
"context.width",
"context.height",
"system.version",
"system.os"}, true))
"system.os"}, false, true))
{
token = _text.substr (marker, _cursor - marker);
type = Lexer::Type::dom;
@ -1105,7 +1107,7 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
if (isUUID (extractedToken, extractedType, false) ||
isInteger (extractedToken, extractedType))
{
if (! isLiteral (".", false))
if (! isLiteral (".", false, false))
{
_cursor = marker;
return false;
@ -1116,8 +1118,9 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
std::size_t checkpoint = _cursor;
// [prefix]tags.<word>
if (isLiteral ("tags.", false) &&
isWord (partialToken, partialType))
if (isLiteral ("tags", true, false) &&
isLiteral (".", false, false) &&
isWord (partialToken, partialType))
{
token = _text.substr (marker, _cursor - marker);
type = Lexer::Type::dom;
@ -1127,28 +1130,26 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
_cursor = checkpoint;
// [prefix]attribute
if (isOneOf (attributes, true))
if (isOneOf (attributes, true, true))
{
token = _text.substr (marker, _cursor - marker);
type = Lexer::Type::dom;
return true;
}
else
_cursor = checkpoint;
// [prefix]attribute.<sub-element>
if (isOneOf (attributes, false))
if (isOneOf (attributes, true, false))
{
if (isLiteral (".", false))
if (isLiteral (".", false, false))
{
std::string attribute = _text.substr (checkpoint, _cursor - checkpoint - 1);
// if attribute type is 'date'
// if attribute type is 'date', then it has sub-elements.
if (attributes[attribute] == "date" &&
isOneOf ({"year", "month", "day",
"week", "weekday",
"julian",
"hour", "minute", "second"}, true))
"hour", "minute", "second"}, true, true))
{
token = _text.substr (marker, _cursor - marker);
type = Lexer::Type::dom;
@ -1162,35 +1163,35 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
return true;
}
}
else
_cursor = checkpoint;
// [prefix]annotations.
if (isLiteral ("annotations.", false))
if (isLiteral ("annotations", true, false) &&
isLiteral (".", false, true))
{
std::string extractedToken;
Lexer::Type extractedType;
if (isInteger (extractedToken, extractedType))
{
if (isLiteral (".", false))
if (isLiteral (".", false, false))
{
if (isLiteral ("description", true))
if (isLiteral ("description", true, true))
{
token = _text.substr (marker, _cursor - marker);
type = Lexer::Type::dom;
return true;
}
else if (isLiteral ("entry", true))
else if (isLiteral ("entry", true, true))
{
token = _text.substr (marker, _cursor - marker);
type = Lexer::Type::dom;
return true;
}
else if (isLiteral ("entry.", false) &&
else if (isLiteral ("entry", true, false) &&
isLiteral (".", false, true) &&
isOneOf ({"year", "month", "day",
"week", "weekday",
"julian",
"hour", "minute", "second"}, true))
"hour", "minute", "second"}, true, true))
{
token = _text.substr (marker, _cursor - marker);
type = Lexer::Type::dom;
@ -1251,37 +1252,54 @@ bool Lexer::isWord (std::string& token, Lexer::Type& type)
}
////////////////////////////////////////////////////////////////////////////////
bool Lexer::isLiteral (const std::string& literal, bool endBoundary)
bool Lexer::isLiteral (
const std::string& literal,
bool allowAbbreviations,
bool endBoundary)
{
auto len = literal.length ();
if (_text.find (literal, _cursor) == _cursor &&
(! endBoundary ||
_text.length () == _cursor + len ||
Lexer::isWhitespace (_text[_cursor + len]) ||
Lexer::isSingleCharOperator (_text[_cursor + len])))
{
_cursor += len;
return true;
}
auto common = commonLength (literal, 0, _text, _cursor);
return false;
// Without abbreviations, common must equal literal length.
if (! allowAbbreviations &&
common < literal.length ())
return false;
// Abbreviations must meet the minimum size.
if (allowAbbreviations &&
common < minimumMatchLength)
return false;
// End boundary conditions must be met.
if (endBoundary &&
! Lexer::isWhitespace (_text[_cursor + common]) &&
! Lexer::isSingleCharOperator (_text[_cursor + common]))
return false;
_cursor += common;
return true;
}
////////////////////////////////////////////////////////////////////////////////
bool Lexer::isOneOf (const std::vector <std::string>& options, bool endBoundary)
bool Lexer::isOneOf (
const std::vector <std::string>& options,
bool allowAbbreviations,
bool endBoundary)
{
for (auto& item : options)
if (isLiteral (item, endBoundary))
if (isLiteral (item, allowAbbreviations, endBoundary))
return true;
return false;
}
////////////////////////////////////////////////////////////////////////////////
bool Lexer::isOneOf (const std::map <std::string, std::string>& options, bool endBoundary)
bool Lexer::isOneOf (
const std::map <std::string, std::string>& options,
bool allowAbbreviations,
bool endBoundary)
{
for (auto& item : options)
if (isLiteral (item.first, endBoundary))
if (isLiteral (item.first, allowAbbreviations, endBoundary))
return true;
return false;

View file

@ -41,7 +41,7 @@ public:
// These are overridable.
static std::string dateFormat;
static bool isoEnabled;
static int minimumMatchLength;
static std::string::size_type minimumMatchLength;
static std::map <std::string, std::string> attributes;
enum class Type { uuid, number, hex,
@ -61,36 +61,35 @@ public:
static std::string typeToString (Lexer::Type);
// Static helpers.
static const std::string typeName (const Lexer::Type&);
static bool isWhitespace (int);
static bool isAlpha (int);
static bool isDigit (int);
static bool isHexDigit (int);
static bool isIdentifierStart (int);
static bool isIdentifierNext (int);
static bool isSingleCharOperator (int);
static bool isDoubleCharOperator (int, int, int);
static bool isTripleCharOperator (int, int, int, int);
static bool isBoundary (int, int);
static bool isHardBoundary (int, int);
static bool isPunctuation (int);
static bool isAllDigits (const std::string&);
static void dequote (std::string&, const std::string& quotes = "'\"");
static bool wasQuoted (const std::string&);
static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
static bool readWord (const std::string&, std::string::size_type&, std::string&);
static bool decomposePair (const std::string&, std::string&, std::string&, std::string&, std::string&);
static bool decomposeSubstitution (const std::string&, std::string&, std::string&, std::string&);
static bool decomposePattern (const std::string&, std::string&, std::string&);
static int hexToInt (int);
static int hexToInt (int, int);
static int hexToInt (int, int, int, int);
static int commonLength (const std::string&, const std::string&);
static int commonLength (const std::string&, std::string::size_type, const std::string&, std::string::size_type);
bool isEOS () const;
static const std::string typeName (const Lexer::Type&);
static bool isWhitespace (int);
static bool isAlpha (int);
static bool isDigit (int);
static bool isHexDigit (int);
static bool isIdentifierStart (int);
static bool isIdentifierNext (int);
static bool isSingleCharOperator (int);
static bool isDoubleCharOperator (int, int, int);
static bool isTripleCharOperator (int, int, int, int);
static bool isBoundary (int, int);
static bool isHardBoundary (int, int);
static bool isPunctuation (int);
static bool isAllDigits (const std::string&);
static void dequote (std::string&, const std::string& quotes = "'\"");
static bool wasQuoted (const std::string&);
static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
static bool readWord (const std::string&, std::string::size_type&, std::string&);
static bool decomposePair (const std::string&, std::string&, std::string&, std::string&, std::string&);
static bool decomposeSubstitution (const std::string&, std::string&, std::string&, std::string&);
static bool decomposePattern (const std::string&, std::string&, std::string&);
static int hexToInt (int);
static int hexToInt (int, int);
static int hexToInt (int, int, int, int);
static std::string::size_type commonLength (const std::string&, const std::string&);
static std::string::size_type commonLength (const std::string&, std::string::size_type, const std::string&, std::string::size_type);
// Stream Classifiers.
bool isEOS () const;
bool isString (std::string&, Lexer::Type&, const std::string&);
bool isDate (std::string&, Lexer::Type&);
bool isDuration (std::string&, Lexer::Type&);
@ -110,9 +109,9 @@ public:
bool isDOM (std::string&, Lexer::Type&);
bool isIdentifier (std::string&, Lexer::Type&);
bool isWord (std::string&, Lexer::Type&);
bool isLiteral (const std::string&, bool);
bool isOneOf (const std::vector <std::string>&, bool);
bool isOneOf (const std::map <std::string, std::string>&, bool);
bool isLiteral (const std::string&, bool, bool);
bool isOneOf (const std::vector <std::string>&, bool, bool);
bool isOneOf (const std::map <std::string, std::string>&, bool, bool);
private:
std::string _text;

View file

@ -37,7 +37,7 @@ Context context;
////////////////////////////////////////////////////////////////////////////////
int main (int argc, char** argv)
{
UnitTest t (1160);
UnitTest t (1170);
std::vector <std::pair <std::string, Lexer::Type>> tokens;
std::string token;
@ -265,18 +265,34 @@ int main (int argc, char** argv)
t.ok (Lexer::readWord (text, cursor, word), "readWord \"one \" --> true");
t.is (word, "one", " word '" + word + "'");
// bool isLiteral (const std::string&, bool);
// bool isLiteral (const std::string&, bool, bool);
Lexer l4 ("one.two");
t.notok (l4.isLiteral("zero", false), "isLiteral 'one.two' --> false");
t.ok (l4.isLiteral("one", false), "isLiteral 'one.two' --> 'one'");
t.ok (l4.isLiteral(".", false), "isLiteral 'one.two' --> '.'");
t.ok (l4.isLiteral("two", true), "isLiteral 'one.two' --> 'two'");
t.notok (l4.isLiteral("zero", false, false), "isLiteral 'one.two' --> false");
t.ok (l4.isLiteral("one", false, false), "isLiteral 'one.two' --> 'one'");
t.ok (l4.isLiteral(".", false, false), "isLiteral 'one.two' --> '.'");
t.ok (l4.isLiteral("two", false, true), "isLiteral 'one.two' --> 'two'");
// bool isOneOf (const std::string&, bool);
Lexer l5 ("Grumpy.");
Lexer l5 ("wonderful");
t.notok (l5.isLiteral ("wonder", false, false), "isLiteral 'wonder' != 'wonderful' without abbreviation");
t.ok (l5.isLiteral ("wonder", true, false), "isLiteral 'wonder' == 'wonderful' with abbreviation");
// bool isOneOf (const std::vector <std::string>&, bool, bool);
Lexer l6 ("Grumpy.");
std::vector <std::string> dwarves = {"Sneezy", "Doc", "Bashful", "Grumpy", "Happy", "Sleepy", "Dopey"};
t.notok (l5.isOneOf (dwarves, true), "isOneof ('Grumpy', true) --> false");
t.ok (l5.isOneOf (dwarves, false), "isOneOf ('Grumpy', false) --> true");
t.notok (l6.isOneOf (dwarves, false, true), "isOneof ('Grumpy', true) --> false");
t.ok (l6.isOneOf (dwarves, false, false), "isOneOf ('Grumpy', false) --> true");
// static std::string::size_type commonLength (const std::string&, const std::string&);
t.is ((int)Lexer::commonLength ("", ""), 0, "commonLength '' : '' --> 0");
t.is ((int)Lexer::commonLength ("a", "a"), 1, "commonLength 'a' : 'a' --> 1");
t.is ((int)Lexer::commonLength ("abcde", "abcde"), 5, "commonLength 'abcde' : 'abcde' --> 5");
t.is ((int)Lexer::commonLength ("abc", ""), 0, "commonLength 'abc' : '' --> 0");
t.is ((int)Lexer::commonLength ("abc", "def"), 0, "commonLength 'abc' : 'def' --> 0");
t.is ((int)Lexer::commonLength ("foobar", "foo"), 3, "commonLength 'foobar' : 'foo' --> 3");
t.is ((int)Lexer::commonLength ("foo", "foobar"), 3, "commonLength 'foo' : 'foobar' --> 3");
// static std::string::size_type commonLength (const std::string&, std::string::size_type, const std::string&, std::string::size_type);
t.is ((int)Lexer::commonLength ("wonder", 0, "prowonderbread", 3), 6, "'wonder'+0 : 'prowonderbread'+3 --> 6");
// Test all Lexer types.
#define NO {"",Lexer::Type::word}