From 31c145ef9ea8b783ec951c8d98fb6277fa5c2375 Mon Sep 17 00:00:00 2001
From: Paul Beckingham <paul@beckingham.net>
Date: Sun, 20 Dec 2015 15:02:53 -0500
Subject: [PATCH] Lexer: Added string support

---
 src/Lexer.cpp    | 241 ++++++++++++++++++++++++++++++++++++++++++++++-
 src/Lexer.h      |  11 ++-
 test/lexer.t.cpp |  51 +++++++++-
 3 files changed, 300 insertions(+), 3 deletions(-)

diff --git a/src/Lexer.cpp b/src/Lexer.cpp
index 4a6d4015..75d52ca8 100644
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -51,7 +51,8 @@ bool Lexer::token (std::string& token, Lexer::Type& type)
   if (isEOS ())
     return false;
 
-  if (isWord (token, type))
+  if (isString (token, type, "'\"") ||
+      isWord   (token, type))
     return true;
 
   return false;
@@ -99,6 +100,15 @@ bool Lexer::isWhitespace (int c)
           c == 0x3000);    // ideographic space Common  Separator, space
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// True for the ASCII hex digits 0-9, a-f and A-F.
+bool Lexer::isHexDigit (int c)
+{
+  if (c >= '0' && c <= '9') return true;
+  if (c >= 'a' && c <= 'f') return true;
+  return c >= 'A' && c <= 'F';
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 bool Lexer::isSingleCharOperator (int c)
 {
@@ -117,12 +127,56 @@ bool Lexer::isSingleCharOperator (int c)
          c == '~';    // Pattern match
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Tests whether the junction between 'left' and 'right' ends a token.
+bool Lexer::isHardBoundary (int left, int right)
+{
+  // A NUL on the right is the end of the string (EOS): always a boundary.
+  if (right == '\0')
+    return true;
+
+  // Parentheses are FILTER operators that don't need surrounding whitespace,
+  // so one on either side of the junction is a boundary.
+  const bool leftIsParen  = (left  == '(' || left  == ')');
+  const bool rightIsParen = (right == '(' || right == ')');
+
+  // Every other pair of characters is not a hard boundary.
+  return leftIsParen || rightIsParen;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 bool Lexer::isEOS () const
 {
   return _cursor >= _eos;
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Maps a hex digit to its value: '0'-'9' -> 0-9, 'a'-'f' and 'A'-'F' -> 10-15.
+// No validation: any other character is treated as if it were in 'A'-'F'.
+int Lexer::hexToInt (int c)
+{
+  if (c >= '0' && c <= '9')
+    return c - '0';
+  if (c >= 'a' && c <= 'f')
+    return c - 'a' + 10;
+  return c - 'A' + 10;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+int Lexer::hexToInt (int c0, int c1)
+{
+  return 16 * hexToInt (c0) + hexToInt (c1);  // c0 is the high nibble.
+}
+
+////////////////////////////////////////////////////////////////////////////////
+int Lexer::hexToInt (int c0, int c1, int c2, int c3)
+{
+  // Fold the digits pairwise: c0,c1 give the high byte and c2,c3 the low one.
+  const int high = hexToInt (c0, c1);
+  const int low  = hexToInt (c2, c3);
+  return (high << 8) + low;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 std::string Lexer::trimLeft (const std::string& in, const std::string& t /*= " "*/)
 {
@@ -149,6 +203,24 @@ std::string Lexer::trim (const std::string& in, const std::string& t /*= " "*/)
   return trimLeft (trimRight (in, t), t);
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Lexer::Type::string
+//   '|"
+//   [ U+XXXX | \uXXXX | \" | \' | \\ | \/ | \b | \f | \n | \r | \t | \v | . ]
+//   '|"
+bool Lexer::isString (std::string& token, Lexer::Type& type, const std::string& quotes)
+{
+  // Scan from a copy of the cursor so a failed match leaves _cursor alone.
+  std::size_t end = _cursor;
+  if (! readWord (_text, quotes, end, token))
+    return false;
+
+  // Matched: report the type and move _cursor to where the scan stopped.
+  type = Lexer::Type::string;
+  _cursor = end;
+  return true;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Lexer::Type::word
 //   [^\s]+
@@ -173,3 +245,170 @@ bool Lexer::isWord (std::string& token, Lexer::Type& type)
 }
 
 ////////////////////////////////////////////////////////////////////////////////
+// Full implementation of a quoted word.  Includes:
+//   '\''
+//   '"'
+//   "'"
+//   "\""
+//   'one two'
+// Result includes the quotes.  Returns false if the closing quote is missing.
+bool Lexer::readWord (
+  const std::string& text,
+  const std::string& quotes,
+  std::string::size_type& cursor,
+  std::string& word)
+{
+  std::string::size_type eos = text.length ();
+  if (cursor >= eos || quotes.find (text[cursor]) == std::string::npos)
+    return false;
+
+  int quote = text[cursor++];
+  word = quote;
+
+  bool terminated = false;  // Set only by an unescaped closing quote.
+  int c;
+  while (cursor < eos && (c = text[cursor]))
+  {
+    // Quoted word ends on a quote.
+    if (c == quote)
+    {
+      word += utf8_character (utf8_next_char (text, cursor));
+      terminated = true;
+      break;
+    }
+
+    // Unicode U+XXXX or \uXXXX codepoint.
+    else if (eos - cursor >= 6 &&
+             ((text[cursor + 0] == 'U'  && text[cursor + 1] == '+') ||
+              (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) &&
+             isHexDigit (text[cursor + 2]) &&
+             isHexDigit (text[cursor + 3]) &&
+             isHexDigit (text[cursor + 4]) &&
+             isHexDigit (text[cursor + 5]))
+    {
+      word += utf8_character (
+                hexToInt (
+                  text[cursor + 2],
+                  text[cursor + 3],
+                  text[cursor + 4],
+                  text[cursor + 5]));
+      cursor += 6;
+    }
+
+    // An escaped thing.
+    else if (c == '\\')
+    {
+      // A trailing backslash escapes nothing; stop before reading past 'text'.
+      if (++cursor >= eos) break;
+      c = text[cursor];
+      switch (c)
+      {
+      case '"':  word += (char) 0x22; ++cursor; break;
+      case '\'': word += (char) 0x27; ++cursor; break;
+      case '\\': word += (char) 0x5C; ++cursor; break;
+      case 'b':  word += (char) 0x08; ++cursor; break;
+      case 'f':  word += (char) 0x0C; ++cursor; break;
+      case 'n':  word += (char) 0x0A; ++cursor; break;
+      case 'r':  word += (char) 0x0D; ++cursor; break;
+      case 't':  word += (char) 0x09; ++cursor; break;
+      case 'v':  word += (char) 0x0B; ++cursor; break;
+
+      // This pass-through default case means that anything can be escaped
+      // harmlessly. In particular 'quote' is included, if it is not one of the
+      // above characters.
+      default:   word += (char) c;    ++cursor; break;
+      }
+    }
+
+    // Ordinary character.
+    else
+      word += utf8_character (utf8_next_char (text, cursor));
+  }
+
+  return terminated;  // An escaped or U+XXXX-encoded quote does not count.
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Full implementation of an unquoted word.  Includes:
+//   one\ two
+//   abcU+0020def
+//   abc\u0020def
+//   a\tb
+//
+// Returns true if 'word' is non-empty.  Ends at:
+//   Lexer::isEOS
+//   Lexer::isWhitespace
+//   Lexer::isHardBoundary
+bool Lexer::readWord (
+  const std::string& text,
+  std::string::size_type& cursor,
+  std::string& word)
+{
+  std::string::size_type eos = text.length ();
+
+  word = "";
+  int c;
+  int prev = 0;
+  while (cursor < eos && (c = text[cursor]))  // Handles EOS.
+  {
+    // Unquoted word ends on white space.
+    if (Lexer::isWhitespace (c))
+      break;
+
+    // Parentheses mostly.
+    if (prev && Lexer::isHardBoundary (prev, c))
+      break;
+
+    // Unicode U+XXXX or \uXXXX codepoint.
+    else if (eos - cursor >= 6 &&
+             ((text[cursor + 0] == 'U'  && text[cursor + 1] == '+') ||
+              (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) &&
+             isHexDigit (text[cursor + 2]) &&
+             isHexDigit (text[cursor + 3]) &&
+             isHexDigit (text[cursor + 4]) &&
+             isHexDigit (text[cursor + 5]))
+    {
+      word += utf8_character (
+                hexToInt (
+                  text[cursor + 2],
+                  text[cursor + 3],
+                  text[cursor + 4],
+                  text[cursor + 5]));
+      cursor += 6;
+    }
+
+    // An escaped thing.
+    else if (c == '\\')
+    {
+      // A trailing backslash escapes nothing; keep it and stop at end of text.
+      if (++cursor >= eos) { word += '\\'; break; }
+      c = text[cursor];
+
+      switch (c)
+      {
+      case '"':  word += (char) 0x22; ++cursor; break;
+      case '\'': word += (char) 0x27; ++cursor; break;
+      case '\\': word += (char) 0x5C; ++cursor; break;
+      case 'b':  word += (char) 0x08; ++cursor; break;
+      case 'f':  word += (char) 0x0C; ++cursor; break;
+      case 'n':  word += (char) 0x0A; ++cursor; break;
+      case 'r':  word += (char) 0x0D; ++cursor; break;
+      case 't':  word += (char) 0x09; ++cursor; break;
+      case 'v':  word += (char) 0x0B; ++cursor; break;
+
+      // Pass-through: any other character can be escaped harmlessly.
+      default:   word += (char) c;    ++cursor; break;
+      }
+    }
+
+    // Ordinary character.
+    else
+      word += utf8_character (utf8_next_char (text, cursor));
+
+    prev = c;
+  }
+
+  return ! word.empty ();
+}
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/src/Lexer.h b/src/Lexer.h
index e3358fbd..a0a0428d 100644
--- a/src/Lexer.h
+++ b/src/Lexer.h
@@ -35,20 +35,29 @@
 class Lexer
 {
 public:
-  enum class Type { word };
+  enum class Type { string,
+                    word };
 
   Lexer (const std::string&);
   bool token (std::string&, Lexer::Type&);
 
   // Static helpers.
   static bool isWhitespace                   (int);
+  static bool isHexDigit                     (int);
   static bool isSingleCharOperator           (int);
+  static bool isHardBoundary                 (int, int);
+  static bool readWord                       (const std::string&, const std::string&, std::string::size_type&, std::string&);
+  static bool readWord                       (const std::string&, std::string::size_type&, std::string&);
+  static int hexToInt                        (int);
+  static int hexToInt                        (int, int);
+  static int hexToInt                        (int, int, int, int);
   static std::string trimLeft                (const std::string& in, const std::string& t = " ");
   static std::string trimRight               (const std::string& in, const std::string& t = " ");
   static std::string trim                    (const std::string& in, const std::string& t = " ");
 
   // Stream Classifiers.
   bool isEOS          () const;
+  bool isString       (std::string&, Lexer::Type&, const std::string&);
   bool isWord         (std::string&, Lexer::Type&);
 
 private:
diff --git a/test/lexer.t.cpp b/test/lexer.t.cpp
index f5da0af2..7bc9ffee 100644
--- a/test/lexer.t.cpp
+++ b/test/lexer.t.cpp
@@ -34,7 +34,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 int main (int, char**)
 {
-  UnitTest t (50);
+  UnitTest t (74);
 
   std::vector <std::pair <std::string, Lexer::Type>> tokens;
   std::string token;
@@ -77,6 +77,55 @@ int main (int, char**)
   Lexer l1 ("       \t ");
   t.notok (l1.token (token, type), "'       \\t ' --> no tokens");
 
+
+  // static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
+  std::string::size_type cursor = 0;
+  std::string word;
+  t.ok (Lexer::readWord ("'one two'", "'\"", cursor, word), "readWord ''one two'' --> true");
+  t.is (word, "'one two'",                                  "  word '" + word + "'");
+  t.is ((int)cursor, 9,                                     "  cursor");
+
+  // Unterminated quoted string is invalid.
+  cursor = 0;
+  t.notok (Lexer::readWord ("'one", "'\"", cursor, word),   "readWord ''one' --> false");
+
+  // static bool readWord (const std::string&, std::string::size_type&, std::string&);
+  cursor = 0;
+  t.ok (Lexer::readWord ("input", cursor, word),            "readWord 'input' --> true");
+  t.is (word, "input",                                      "  word '" + word + "'");
+  t.is ((int)cursor, 5,                                     "  cursor");
+
+  cursor = 0;
+  t.ok (Lexer::readWord ("one\\ two", cursor, word),        "readWord 'one\\ two' --> true");
+  t.is (word, "one two",                                    "  word '" + word + "'");
+  t.is ((int)cursor, 8,                                     "  cursor");
+
+  cursor = 0;
+  t.ok (Lexer::readWord ("\\u20A43", cursor, word),         "readWord '\\u20A43' --> true");
+  t.is (word, "₤3",                                         "  word '" + word + "'");
+  t.is ((int)cursor, 7,                                     "  cursor");
+
+  cursor = 0;
+  t.ok (Lexer::readWord ("U+20AC4", cursor, word),          "readWord 'U+20AC4' --> true");
+  t.is (word, "€4",                                         "  word '" + word + "'");
+  t.is ((int)cursor, 7,                                     "  cursor");
+
+  std::string text = "one 'two' three\\ four";
+  cursor = 0;
+  t.ok (Lexer::readWord (text, cursor, word),               "readWord \"one 'two' three\\ four\" --> true");
+  t.is (word, "one",                                        "  word '" + word + "'");
+  cursor++;
+  t.ok (Lexer::readWord (text, cursor, word),               "readWord \"one 'two' three\\ four\" --> true");
+  t.is (word, "'two'",                                      "  word '" + word + "'");
+  cursor++;
+  t.ok (Lexer::readWord (text, cursor, word),               "readWord \"one 'two' three\\ four\" --> true");
+  t.is (word, "three four",                                 "  word '" + word + "'");
+
+  text = "one     ";
+  cursor = 0;
+  t.ok (Lexer::readWord (text, cursor, word),               "readWord \"one     \" --> true");
+  t.is (word, "one",                                        "  word '" + word + "'");
+
   // std::string Lexer::trimLeft (const std::string& in, const std::string&)
   t.is (Lexer::trimLeft (""),                     "",            "Lexer::trimLeft '' -> ''");
   t.is (Lexer::trimLeft ("   "),                  "",            "Lexer::trimLeft '   ' -> ''");