From 611812007a574b2d9fb19063eac0ea2f0bade1aa Mon Sep 17 00:00:00 2001
From: Paul Beckingham <paul@beckingham.net>
Date: Wed, 23 Apr 2014 23:19:41 -0400
Subject: [PATCH] Lexer

- Implemented Lexer::word, which is just like ::token, but does not
  understand dates, durations or operators.
- Implemented Lexer::split, which uses Lexer::word.
- Added unit tests.
---
 src/Lexer.cpp    | 302 ++++++++++++++++++++++++++++++++++++++++++++++-
 src/Lexer.h      |   2 +
 test/lexer.t.cpp |  13 +-
 3 files changed, 314 insertions(+), 3 deletions(-)

diff --git a/src/Lexer.cpp b/src/Lexer.cpp
index 506208984..81df8510c 100644
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -393,6 +393,324 @@ bool Lexer::token (std::string& token, Type& type)
   return false;
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Like Lexer::token, but without operator recognition.
+//
+// Skips leading whitespace, then collects one token from the input into
+// 'token' and stores its classification in 'type'.  Quoted strings are
+// returned without the enclosing quotes and with escape sequences decoded.
+//
+// Returns true when a token was collected, false when the input runs out
+// before anything was collected.  Throws std::string ("Unexpected error 1")
+// for a character that cannot start a token, and ("Unexpected error 2") for
+// an unhandled state.
+//
+// NOTE(review): a leading digit still goes through the speculative date and
+// duration parse below, so typeDate/typeDuration can be returned.  Confirm
+// whether that is wanted for plain words.
+bool Lexer::word (std::string& token, Type& type)
+{
+  // Start with nothing.
+  token = "";
+
+  // Different types of matching quote:  ', ".
+  int quote = 0;
+
+  type = typeNone;
+  while (_n0)
+  {
+    switch (type)
+    {
+    case typeNone:
+      if (is_ws (_n0))
+        shift ();
+      else if (_n0 == '"' || _n0 == '\'')
+      {
+        type = typeString;
+        quote = _n0;
+        shift ();
+      }
+      else if (_n0 == '0' &&
+               _n1 == 'x' &&
+               is_hex_digit (_n2))
+      {
+        type = typeHex;
+        token += utf8_character (_n0);
+        shift ();
+        token += utf8_character (_n0);
+        shift ();
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (is_dec_digit (_n0))
+      {
+        // Speculatively try a date and duration parse.  Longest wins.
+        // NOTE(review): _i - 4 presumably backs up over the four lookahead
+        // characters (_n0.._n3) so the parse starts at _n0 - confirm.
+        std::string::size_type iso_i = 0;
+        std::string iso_token;
+        ISO8601d iso;
+        iso.ambiguity (_ambiguity);
+        if (iso.parse (_input.substr (_i < 4 ? 0 : _i - 4), iso_i))
+          iso_token = _input.substr ((_i < 4 ? 0 : _i - 4), iso_i);
+
+        std::string::size_type dur_i = 0;
+        std::string dur_token;
+        Duration dur;
+        if (dur.parse (_input.substr (_i < 4 ? 0 : _i - 4), dur_i))
+          dur_token = _input.substr ((_i < 4 ? 0 : _i - 4), dur_i);
+
+        if (iso_token.length () > dur_token.length ())
+        {
+          while (iso_i--) shift ();
+          token = iso_token;
+          type = typeDate;
+          return true;
+        }
+        else if (dur_token.length () > iso_token.length ())
+        {
+          while (dur_i--) shift ();
+          token = dur_token;
+          type = typeDuration;
+          return true;
+        }
+
+        type = typeNumber;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == '.' && is_dec_digit (_n1))
+      {
+        type = typeDecimal;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == '\\')
+      {
+        type = typeIdentifierEscape;
+        shift ();
+      }
+      else if (is_ident_start (_n0))
+      {
+        type = typeIdentifier;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+        throw std::string ("Unexpected error 1");
+      break;
+
+    case typeString:
+      if (_n0 == quote)
+      {
+        shift ();
+        quote = 0;
+        return true;
+      }
+      else if (_n0 == '\\')
+      {
+        type = typeEscape;
+        shift ();
+      }
+      else
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      break;
+
+    case typeIdentifier:
+      if (is_ident (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        return true;
+      }
+      break;
+
+    case typeIdentifierEscape:
+      if (_n0 == 'u')
+      {
+        type = typeEscapeUnicode;
+        shift ();
+      }
+      else
+      {
+        // Any other escaped character is taken literally.  Consuming it here
+        // guarantees the loop makes progress.
+        type = typeIdentifier;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      break;
+
+    case typeEscape:
+      if (_n0 == 'x')
+      {
+        type = typeEscapeHex;
+        shift ();
+      }
+      else if (_n0 == 'u')
+      {
+        type = typeEscapeUnicode;
+        shift ();
+      }
+      else
+      {
+        token += decode_escape (_n0);
+        type = quote ? typeString : typeIdentifier;
+        shift ();
+      }
+      break;
+
+    case typeEscapeHex:
+      if (is_hex_digit (_n0) && is_hex_digit (_n1))
+      {
+        token += utf8_character (hex_to_int (_n0, _n1));
+        type = quote ? typeString : typeIdentifier;
+        shift ();
+        shift ();
+      }
+      else
+      {
+        type = quote ? typeString : typeIdentifier;
+        shift ();
+        quote = 0;
+        return true;
+      }
+      break;
+
+    case typeEscapeUnicode:
+      if (is_hex_digit (_n0) &&
+          is_hex_digit (_n1) &&
+          is_hex_digit (_n2) &&
+          is_hex_digit (_n3))
+      {
+        token += utf8_character (hex_to_int (_n0, _n1, _n2, _n3));
+        shift ();
+        shift ();
+        shift ();
+        shift ();
+        type = quote ? typeString : typeIdentifier;
+      }
+      else
+      {
+        // Malformed \u escape: drop it and resume the enclosing string or
+        // identifier, which handles the current character (including a
+        // closing quote) on the next pass.
+        type = quote ? typeString : typeIdentifier;
+      }
+      break;
+
+    case typeNumber:
+      if (is_dec_digit (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == '.')
+      {
+        type = typeDecimal;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == 'e' || _n0 == 'E')
+      {
+        type = typeExponentIndicator;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        return true;
+      }
+      break;
+
+    case typeDecimal:
+      if (is_dec_digit (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == 'e' || _n0 == 'E')
+      {
+        type = typeExponentIndicator;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        return true;
+      }
+      break;
+
+    case typeExponentIndicator:
+      if (_n0 == '+' || _n0 == '-')
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (is_dec_digit (_n0))
+      {
+        type = typeExponent;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        // Not an exponent after all (e.g. "1ex").  Hand over to the
+        // identifier state, which either consumes the character or ends the
+        // token, so the loop always makes progress.
+        type = typeIdentifier;
+      }
+      break;
+
+    case typeExponent:
+      if (is_dec_digit (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == '.')
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        type = typeDecimal;
+        return true;
+      }
+      break;
+
+    case typeHex:
+      if (is_hex_digit (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        return true;
+      }
+      break;
+
+    default:
+      throw std::string ("Unexpected error 2");
+      break;
+    }
+
+    // Fence post.
+    if (!_n0 && token != "")
+      return true;
+  }
+
+  return false;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 void Lexer::ambiguity (bool value)
 {
@@ -457,6 +744,19 @@ bool Lexer::is_ws (int c)
           c == 0x3000);    // ideographic space Common  Separator, space
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Refill 'words' with each token Lexer::word extracts from 'input', in order;
+// extraction stops the first time Lexer::word returns false.
+void Lexer::split (std::vector <std::string>& words, const std::string& input)
+{
+  words.clear ();
+
+  Lexer lexer (input);
+  Lexer::Type kind;
+  for (std::string piece; lexer.word (piece, kind); )
+    words.push_back (piece);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 bool Lexer::is_punct (int c) const
 {
@@ -593,8 +893,6 @@ void Lexer::shift ()
   _n1 = _n2;
   _n2 = _n3;
   _n3 = utf8_next_char (_input, _i);
-
-  //std::cout << "# shift [" << (char) _n0 << (char) _n1 << (char) _n2 << (char) _n3 << "]\n";
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/src/Lexer.h b/src/Lexer.h
index 9010b6ea8..6d6a76043 100644
--- a/src/Lexer.h
+++ b/src/Lexer.h
@@ -58,10 +58,12 @@ public:
   Lexer& operator= (const Lexer&); // Not implemented.
   bool operator== (const Lexer&);  // Not implemented.
   bool token (std::string&, Type&);
+  bool word (std::string&, Type&);
   void ambiguity (bool);
 
   static const std::string type_name (const Type&);
   static bool is_ws (int);
+  static void split (std::vector <std::string>&, const std::string&);
 
 private:
   bool is_punct (int) const;
diff --git a/test/lexer.t.cpp b/test/lexer.t.cpp
index f850650fd..bff4f2e87 100644
--- a/test/lexer.t.cpp
+++ b/test/lexer.t.cpp
@@ -36,7 +36,7 @@ Context context;
 ////////////////////////////////////////////////////////////////////////////////
 int main (int argc, char** argv)
 {
-  UnitTest t (170);
+  UnitTest t (176);
 
   std::vector <std::pair <std::string, Lexer::Type> > tokens;
   std::string token;
@@ -299,6 +299,17 @@ int main (int argc, char** argv)
   t.is (tokens[20].first,                     ")",                    "tokens[20] == ')'");
   t.is (tokens[20].second,                    Lexer::typeOperator,    "tokens[20] == typeOperator"); // 170
 
+  // static void Lexer::split (std::vector <std::string>&, const std::string&);
+  std::string unsplit = " ( A or B ) ";
+  std::vector <std::string> items;
+  Lexer::split (items, unsplit);
+  t.is (items.size (), (size_t) 5, "split ' ( A or B ) '");
+  t.is (items[0], "(",             "split ' ( A or B ) ' -> [0] '('");
+  t.is (items[1], "A",             "split ' ( A or B ) ' -> [1] 'A'");
+  t.is (items[2], "or",            "split ' ( A or B ) ' -> [2] 'or'");
+  t.is (items[3], "B",             "split ' ( A or B ) ' -> [3] 'B'");
+  t.is (items[4], ")",             "split ' ( A or B ) ' -> [4] ')'");
+
   return 0;
 }