Lexer: Integrated ::commonLength

- Uses std::string::size_type for all string lengths and offsets. - Rewrote ::isLiteral to be simpler. - Added support for abbreviated DOM references. - Obeys rc.abbreviation.minimum, indirectly. - Added tests.
2025-08-19 19:03:07 +02:00 · 2015-07-27 00:31:15 -04:00 · 2015-07-27 00:31:15 -04:00 · 0c7e731b0d
commit 0c7e731b0d
parent a9b701ae6d
3 changed files with 116 additions and 83 deletions
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@ -37,7 +37,7 @@ static const unsigned int uuid_min_length = 8;

 std::string Lexer::dateFormat = "";
 bool Lexer::isoEnabled = true;
-int Lexer::minimumMatchLength = 3;
+std::string::size_type Lexer::minimumMatchLength = 3;
 std::map <std::string, std::string> Lexer::attributes;


@ -373,7 +373,9 @@ int Lexer::hexToInt (int c0, int c1, int c2, int c3)
 // left:   wonderful
 // right:  wonderbread
 // returns:     ^ 6
-int Lexer::commonLength (const std::string& left, const std::string& right)
+std::string::size_type Lexer::commonLength (
+  const std::string& left,
+  const std::string& right)
 {
  std::string::size_type l = 0;
  std::string::size_type r = 0;
@ -382,7 +384,7 @@ int Lexer::commonLength (const std::string& left, const std::string& right)
         utf8_next_char (right, r))
    ;

-  return (int) l;
+  return l;
 }

 ////////////////////////////////////////////////////////////////////////////////
@ -393,7 +395,7 @@ int Lexer::commonLength (const std::string& left, const std::string& right)
 // right:  prowonderbread
 // r:         ^
 // returns:        ^ 6
-int Lexer::commonLength (
+std::string::size_type Lexer::commonLength (
  const std::string& left,
  std::string::size_type l,
  const std::string& right,
@ -404,7 +406,7 @@ int Lexer::commonLength (
         utf8_next_char (right, r))
    ;

-  return (int) l;
+  return l;
 }

 ////////////////////////////////////////////////////////////////////////////////
@ -1077,7 +1079,7 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)

  std::string partialToken;
  Lexer::Type partialType;
-  if (isLiteral ("rc.", false) &&
+  if (isLiteral ("rc.", false, false) &&
      isWord (partialToken, partialType))
  {
    token = _text.substr (marker, _cursor - marker);
@ -1090,7 +1092,7 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
                "context.width",
                "context.height",
                "system.version",
-                "system.os"}, true))
+                "system.os"}, false, true))
  {
    token = _text.substr (marker, _cursor - marker);
    type = Lexer::Type::dom;
@ -1105,7 +1107,7 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
  if (isUUID (extractedToken, extractedType, false) ||
      isInteger (extractedToken, extractedType))
  {
-    if (! isLiteral (".", false))
+    if (! isLiteral (".", false, false))
    {
      _cursor = marker;
      return false;
@ -1116,8 +1118,9 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
  std::size_t checkpoint = _cursor;

  // [prefix]tags.<word>
-  if (isLiteral ("tags.", false) &&
-      isWord (partialToken, partialType))
+  if (isLiteral ("tags", true,  false) &&
+      isLiteral (".",    false, false) &&
+      isWord    (partialToken, partialType))
  {
    token = _text.substr (marker, _cursor - marker);
    type = Lexer::Type::dom;
@ -1127,28 +1130,26 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
    _cursor = checkpoint;

  // [prefix]attribute
-  if (isOneOf (attributes, true))
+  if (isOneOf (attributes, true, true))
  {
    token = _text.substr (marker, _cursor - marker);
    type = Lexer::Type::dom;
    return true;
  }
-  else
-    _cursor = checkpoint;

  // [prefix]attribute
-  if (isOneOf (attributes, false))
+  if (isOneOf (attributes, true, false))
  {
-    if (isLiteral (".", false))
+    if (isLiteral (".", false, false))
    {
      std::string attribute = _text.substr (checkpoint, _cursor - checkpoint - 1);

-      // if attribute type is 'date'
+      // if attribute type is 'date', then it has sub-elements.
      if (attributes[attribute] == "date" &&
          isOneOf ({"year", "month", "day",
                    "week", "weekday",
                    "julian",
-                    "hour", "minute", "second"}, true))
+                    "hour", "minute", "second"}, true, true))
      {
        token = _text.substr (marker, _cursor - marker);
        type = Lexer::Type::dom;
@ -1162,35 +1163,35 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
      return true;
    }
  }
-  else
-    _cursor = checkpoint;

  // [prefix]annotations.
-  if (isLiteral ("annotations.", false))
+  if (isLiteral ("annotations", true,  false) &&
+      isLiteral (".",           false, true))
  {
    std::string extractedToken;
    Lexer::Type extractedType;
    if (isInteger (extractedToken, extractedType))
    {
-      if (isLiteral (".", false))
+      if (isLiteral (".", false, false))
      {
-        if (isLiteral ("description", true))
+        if (isLiteral ("description", true, true))
        {
          token = _text.substr (marker, _cursor - marker);
          type = Lexer::Type::dom;
          return true;
        }
-        else if (isLiteral ("entry", true))
+        else if (isLiteral ("entry", true, true))
        {
          token = _text.substr (marker, _cursor - marker);
          type = Lexer::Type::dom;
          return true;
        }
-        else if (isLiteral ("entry.", false) &&
+        else if (isLiteral ("entry", true,  false) &&
+                 isLiteral (".",     false, true) &&
                 isOneOf ({"year", "month", "day",
                           "week", "weekday",
                           "julian",
-                           "hour", "minute", "second"}, true))
+                           "hour", "minute", "second"}, true, true))
        {
          token = _text.substr (marker, _cursor - marker);
          type = Lexer::Type::dom;
@ -1251,37 +1252,54 @@ bool Lexer::isWord (std::string& token, Lexer::Type& type)
 }

 ////////////////////////////////////////////////////////////////////////////////
-bool Lexer::isLiteral (const std::string& literal, bool endBoundary)
+// Consumes literal (or a long-enough abbreviation of it) at _cursor.
+bool Lexer::isLiteral (
+  const std::string& literal,
+  bool allowAbbreviations,
+  bool endBoundary)
 {
-  auto len = literal.length ();
-  if (_text.find (literal, _cursor) == _cursor &&
-      (! endBoundary                              ||
-       _text.length () == _cursor + len           ||
-       Lexer::isWhitespace (_text[_cursor + len]) ||
-       Lexer::isSingleCharOperator (_text[_cursor + len])))
-  {
-    _cursor += len;
-    return true;
-  }
+  auto common = commonLength (literal, 0, _text, _cursor);

-  return false;
+  // A complete match is always long enough; a partial match needs
+  // abbreviations enabled and at least minimumMatchLength characters.
+  if (common < literal.length () &&
+      (! allowAbbreviations || common < minimumMatchLength))
+    return false;
+
+  // End boundary: end of text, whitespace, or a single-char operator.
+  // The end-of-text case must be explicit: _text[length] is the terminator,
+  // which the character classifiers are not expected to accept.
+  if (endBoundary &&
+      _cursor + common < _text.length () &&
+      ! Lexer::isWhitespace (_text[_cursor + common]) &&
+      ! Lexer::isSingleCharOperator (_text[_cursor + common]))
+    return false;
+
+  _cursor += common;
+  return true;
 }

 ////////////////////////////////////////////////////////////////////////////////
-bool Lexer::isOneOf (const std::vector <std::string>& options, bool endBoundary)
+bool Lexer::isOneOf (
+  const std::vector <std::string>& options,
+  bool allowAbbreviations,
+  bool endBoundary)
 {
  for (auto& item : options)
-    if (isLiteral (item, endBoundary))
+    if (isLiteral (item, allowAbbreviations, endBoundary))
      return true;

  return false;
 }

 ////////////////////////////////////////////////////////////////////////////////
-bool Lexer::isOneOf (const std::map <std::string, std::string>& options, bool endBoundary)
+bool Lexer::isOneOf (
+  const std::map <std::string, std::string>& options,
+  bool allowAbbreviations,
+  bool endBoundary)
 {
  for (auto& item : options)
-    if (isLiteral (item.first, endBoundary))
+    if (isLiteral (item.first, allowAbbreviations, endBoundary))
      return true;

  return false;
--- a/src/Lexer.h
+++ b/src/Lexer.h
@ -41,7 +41,7 @@ public:
  // These are overridable.
  static std::string dateFormat;
  static bool isoEnabled;
-  static int minimumMatchLength;
+  static std::string::size_type minimumMatchLength;
  static std::map <std::string, std::string> attributes;

  enum class Type { uuid, number, hex,
@ -61,36 +61,35 @@ public:
  static std::string typeToString (Lexer::Type);

  // Static helpers.
-  static const std::string typeName (const Lexer::Type&);
-  static bool isWhitespace          (int);
-  static bool isAlpha               (int);
-  static bool isDigit               (int);
-  static bool isHexDigit            (int);
-  static bool isIdentifierStart     (int);
-  static bool isIdentifierNext      (int);
-  static bool isSingleCharOperator  (int);
-  static bool isDoubleCharOperator  (int, int, int);
-  static bool isTripleCharOperator  (int, int, int, int);
-  static bool isBoundary            (int, int);
-  static bool isHardBoundary        (int, int);
-  static bool isPunctuation         (int);
-  static bool isAllDigits           (const std::string&);
-  static void dequote               (std::string&, const std::string& quotes = "'\"");
-  static bool wasQuoted             (const std::string&);
-  static bool readWord              (const std::string&, const std::string&, std::string::size_type&, std::string&);
-  static bool readWord              (const std::string&, std::string::size_type&, std::string&);
-  static bool decomposePair         (const std::string&, std::string&, std::string&, std::string&, std::string&);
-  static bool decomposeSubstitution (const std::string&, std::string&, std::string&, std::string&);
-  static bool decomposePattern      (const std::string&, std::string&, std::string&);
-  static int hexToInt               (int);
-  static int hexToInt               (int, int);
-  static int hexToInt               (int, int, int, int);
-  static int commonLength           (const std::string&, const std::string&);
-  static int commonLength           (const std::string&, std::string::size_type, const std::string&, std::string::size_type);
-
-  bool isEOS                        () const;
+  static const std::string typeName          (const Lexer::Type&);
+  static bool isWhitespace                   (int);
+  static bool isAlpha                        (int);
+  static bool isDigit                        (int);
+  static bool isHexDigit                     (int);
+  static bool isIdentifierStart              (int);
+  static bool isIdentifierNext               (int);
+  static bool isSingleCharOperator           (int);
+  static bool isDoubleCharOperator           (int, int, int);
+  static bool isTripleCharOperator           (int, int, int, int);
+  static bool isBoundary                     (int, int);
+  static bool isHardBoundary                 (int, int);
+  static bool isPunctuation                  (int);
+  static bool isAllDigits                    (const std::string&);
+  static void dequote                        (std::string&, const std::string& quotes = "'\"");
+  static bool wasQuoted                      (const std::string&);
+  static bool readWord                       (const std::string&, const std::string&, std::string::size_type&, std::string&);
+  static bool readWord                       (const std::string&, std::string::size_type&, std::string&);
+  static bool decomposePair                  (const std::string&, std::string&, std::string&, std::string&, std::string&);
+  static bool decomposeSubstitution          (const std::string&, std::string&, std::string&, std::string&);
+  static bool decomposePattern               (const std::string&, std::string&, std::string&);
+  static int hexToInt                        (int);
+  static int hexToInt                        (int, int);
+  static int hexToInt                        (int, int, int, int);
+  static std::string::size_type commonLength (const std::string&, const std::string&);
+  static std::string::size_type commonLength (const std::string&, std::string::size_type, const std::string&, std::string::size_type);

  // Stream Classifiers.
+  bool isEOS          () const;
  bool isString       (std::string&, Lexer::Type&, const std::string&);
  bool isDate         (std::string&, Lexer::Type&);
  bool isDuration     (std::string&, Lexer::Type&);
@ -110,9 +109,9 @@ public:
  bool isDOM          (std::string&, Lexer::Type&);
  bool isIdentifier   (std::string&, Lexer::Type&);
  bool isWord         (std::string&, Lexer::Type&);
-  bool isLiteral      (const std::string&, bool);
-  bool isOneOf        (const std::vector <std::string>&, bool);
-  bool isOneOf        (const std::map <std::string, std::string>&, bool);
+  bool isLiteral      (const std::string&, bool, bool);
+  bool isOneOf        (const std::vector <std::string>&, bool, bool);
+  bool isOneOf        (const std::map <std::string, std::string>&, bool, bool);

 private:
  std::string _text;
--- a/test/lexer.t.cpp
+++ b/test/lexer.t.cpp
@ -37,7 +37,7 @@ Context context;
 ////////////////////////////////////////////////////////////////////////////////
 int main (int argc, char** argv)
 {
-  UnitTest t (1160);
+  UnitTest t (1170);

  std::vector <std::pair <std::string, Lexer::Type>> tokens;
  std::string token;
@ -265,18 +265,34 @@ int main (int argc, char** argv)
  t.ok (Lexer::readWord (text, cursor, word),               "readWord \"one     \" --> true");
  t.is (word, "one",                                        "  word '" + word + "'");

-  // bool isLiteral (const std::string&, bool);
+  // bool isLiteral (const std::string&, bool, bool);
  Lexer l4 ("one.two");
-  t.notok (l4.isLiteral("zero", false),                     "isLiteral 'one.two' --> false");
-  t.ok    (l4.isLiteral("one", false),                      "isLiteral 'one.two' --> 'one'");
-  t.ok    (l4.isLiteral(".", false),                        "isLiteral 'one.two' --> '.'");
-  t.ok    (l4.isLiteral("two", true),                       "isLiteral 'one.two' --> 'two'");
+  t.notok (l4.isLiteral("zero", false, false),              "isLiteral 'one.two' --> false");
+  t.ok    (l4.isLiteral("one",  false, false),              "isLiteral 'one.two' --> 'one'");
+  t.ok    (l4.isLiteral(".",    false, false),              "isLiteral 'one.two' --> '.'");
+  t.ok    (l4.isLiteral("two",  false, true),               "isLiteral 'one.two' --> 'two'");

-  // bool isOneOf (const std::string&, bool);
-  Lexer l5 ("Grumpy.");
+  Lexer l5 ("wonderful");
+  t.notok (l5.isLiteral ("wonder", false, false),           "isLiteral 'wonder' != 'wonderful' without abbreviation");
+  t.ok    (l5.isLiteral ("wonder", true,  false),           "isLiteral 'wonder' == 'wonderful' with abbreviation");
+
+  // bool isOneOf (const std::string&, bool, bool);
+  Lexer l6 ("Grumpy.");
  std::vector <std::string> dwarves = {"Sneezy", "Doc", "Bashful", "Grumpy", "Happy", "Sleepy", "Dopey"};
-  t.notok (l5.isOneOf (dwarves, true),                      "isOneof ('Grumpy', true) --> false");
-  t.ok    (l5.isOneOf (dwarves, false),                     "isOneOf ('Grumpy', false) --> true");
+  t.notok (l6.isOneOf (dwarves, false, true),               "isOneof ('Grumpy', true) --> false");
+  t.ok    (l6.isOneOf (dwarves, false, false),              "isOneOf ('Grumpy', false) --> true");
+
+  // static std::string::size_type commonLength (const std::string&, const std::string&);
+  t.is ((int)Lexer::commonLength ("", ""),           0, "commonLength '' : '' --> 0");
+  t.is ((int)Lexer::commonLength ("a", "a"),         1, "commonLength 'a' : 'a' --> 1");
+  t.is ((int)Lexer::commonLength ("abcde", "abcde"), 5, "commonLength 'abcde' : 'abcde' --> 5");
+  t.is ((int)Lexer::commonLength ("abc", ""),        0, "commonLength 'abc' : '' --> 0");
+  t.is ((int)Lexer::commonLength ("abc", "def"),     0, "commonLength 'abc' : 'def' --> 0");
+  t.is ((int)Lexer::commonLength ("foobar", "foo"),  3, "commonLength 'foobar' : 'foo' --> 3");
+  t.is ((int)Lexer::commonLength ("foo", "foobar"),  3, "commonLength 'foo' : 'foobar' --> 3");
+
+  // static std::string::size_type commonLength (const std::string&, std::string::size_type, const std::string&, std::string::size_type);
+  t.is ((int)Lexer::commonLength ("wonder", 0, "prowonderbread", 3), 6, "'wonder'+0 : 'prowonderbread'+3 --> 6");

  // Test all Lexer types.
  #define NO {"",Lexer::Type::word}