From 592a3bb60ffa8f6ddf37ad4aa7f89686f79d4938 Mon Sep 17 00:00:00 2001
From: Paul Beckingham <paul@beckingham.net>
Date: Thu, 29 May 2014 18:09:11 -0400
Subject: [PATCH] Lexer

- The Lexer now speculatively attempts a legacy rc.dateformat parse whenever
  it encounters a decimal digit and neither the ISO-8601 date parse nor the
  duration parse yields a longer match.  This assumes that rc.dateformat
  begins with a numeric date element, which is a minor restriction.
---
 src/Context.cpp |   1 +
 src/Lexer.cpp   | 123 ++++++++++++++++++++++++++++--------------------
 src/Lexer.h     |   2 +
 3 files changed, 75 insertions(+), 51 deletions(-)

diff --git a/src/Context.cpp b/src/Context.cpp
index 1112b8705..68a04ebe2 100644
--- a/src/Context.cpp
+++ b/src/Context.cpp
@@ -641,6 +641,7 @@ void Context::staticInitialization ()
       Task::coefficients[*var] = config.getReal (*var);
   }
 
+  Lexer::dateFormat   = config.get ("dateformat");
   Variant::dateFormat = config.get ("dateformat");
 }
 
diff --git a/src/Lexer.cpp b/src/Lexer.cpp
index 7e0e0ef7d..82e9f0a83 100644
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -26,9 +26,12 @@
 
 #include <utf8.h>
 #include <ISO8601.h>
+#include <Date.h>
 #include <Duration.h>
 #include <Lexer.h>
 
+std::string Lexer::dateFormat = "";
+
 ////////////////////////////////////////////////////////////////////////////////
 Lexer::Lexer (const std::string& input)
 : _input (input)
@@ -53,10 +56,10 @@ Lexer::~Lexer ()
 
 ////////////////////////////////////////////////////////////////////////////////
 // Walk the input string, looking for transitions.
-bool Lexer::token (std::string& token, Type& type)
+bool Lexer::token (std::string& result, Type& type)
 {
   // Start with nothing.
-  token = "";
+  result = "";
 
   // Different types of matching quote:  ', ".
   int quote = 0;
@@ -80,80 +83,98 @@ bool Lexer::token (std::string& token, Type& type)
                is_hex_digit (_n2))
       {
         type = typeHex;
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
       }
       else if (is_dec_digit (_n0))
       {
         // Speculatively try a date and duration parse.  Longest wins.
         std::string::size_type iso_i = 0;
-        std::string iso_token;
+        std::string iso_result;
         ISO8601d iso;
         iso.ambiguity (_ambiguity);
         if (iso.parse (_input.substr (_i < 4 ? 0 : _i - 4), iso_i))
-          iso_token = _input.substr ((_i < 4 ? 0 : _i - 4), iso_i);
+          iso_result = _input.substr ((_i < 4 ? 0 : _i - 4), iso_i);
 
         std::string::size_type dur_i = 0;
-        std::string dur_token;
+        std::string dur_result;
         Duration dur;
         if (dur.parse (_input.substr (_i < 4 ? 0 : _i - 4), dur_i))
-          dur_token = _input.substr ((_i < 4 ? 0 : _i - 4), dur_i);
+          dur_result = _input.substr ((_i < 4 ? 0 : _i - 4), dur_i);
 
-        if (iso_token.length () > dur_token.length ())
+        if (iso_result.length () > dur_result.length ())
         {
           while (iso_i--) shift ();
-          token = iso_token;
+          result = iso_result;
           type = typeDate;
           return true;
         }
-        else if (dur_token.length () > iso_token.length ())
+        else if (dur_result.length () > iso_result.length ())
         {
           while (dur_i--) shift ();
-          token = dur_token;
+          result = dur_result;
           type = typeDuration;
           return true;
         }
 
-        // TODO Try an rc.dateformat parse here.
+        // Try a legacy rc.dateformat parse here.
+        try
+        {
+          std::string::size_type start = _i < 4 ? 0 : _i - 4;
+          std::string::size_type space = _input.find (' ', _i);
+          if (space == std::string::npos)
+            space = _input.length ();
+
+          std::string legacy = _input.substr (start, space - start);
+          Date legacyDate (legacy, Lexer::dateFormat, false, false);
+
+          space -= start;
+          while (space--) shift ();
+          result = legacy;
+          type = typeDate;
+          return true;
+        }
+
+        catch (...) { /* Never mind. */ }
 
         type = typeNumber;
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
       }
       else if (_n0 == '.' && is_dec_digit (_n1))
       {
         type = typeDecimal;
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
       }
       else if (is_triple_op (_n0, _n1, _n2))
       {
         type = typeOperator;
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
         return true;
       }
       else if (is_double_op (_n0, _n1))
       {
         type = typeOperator;
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
         return true;
       }
       else if (is_single_op (_n0))
       {
         type = typeOperator;
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
         return true;
       }
@@ -166,34 +187,34 @@ bool Lexer::token (std::string& token, Type& type)
       {
         // Speculatively try a date and duration parse.  Longest wins.
         std::string::size_type iso_i = 0;
-        std::string iso_token;
+        std::string iso_result;
         ISO8601p iso;
         if (iso.parse (_input.substr (_i < 4 ? 0 : _i - 4), iso_i))
-          iso_token = _input.substr ((_i < 4 ? 0 : _i - 4), iso_i);
+          iso_result = _input.substr ((_i < 4 ? 0 : _i - 4), iso_i);
 
         std::string::size_type dur_i = 0;
-        std::string dur_token;
+        std::string dur_result;
         Duration dur;
         if (dur.parse (_input.substr (_i < 4 ? 0 : _i - 4), dur_i))
-          dur_token = _input.substr ((_i < 4 ? 0 : _i - 4), dur_i);
+          dur_result = _input.substr ((_i < 4 ? 0 : _i - 4), dur_i);
 
-        if (iso_token.length () > dur_token.length ())
+        if (iso_result.length () > dur_result.length ())
         {
           while (iso_i--) shift ();
-          token = iso_token;
+          result = iso_result;
           type = typeDuration;
           return true;
         }
-        else if (dur_token.length () > iso_token.length ())
+        else if (dur_result.length () > iso_result.length ())
         {
           while (dur_i--) shift ();
-          token = dur_token;
+          result = dur_result;
           type = typeDuration;
           return true;
         }
 
         type = typeIdentifier;
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
       }
       else
@@ -214,7 +235,7 @@ bool Lexer::token (std::string& token, Type& type)
       }
       else
       {
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
       }
       break;
@@ -222,7 +243,7 @@ bool Lexer::token (std::string& token, Type& type)
     case typeIdentifier:
       if (is_ident (_n0))
       {
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
       }
       else
@@ -230,10 +251,10 @@ bool Lexer::token (std::string& token, Type& type)
         // typeIdentifier is a catch-all type. Anything word-like becomes an
         // identifier. At this point in the processing, an identifier is found,
         // and can be matched against a list of potential upgrades.
-        if (token == "_hastag_" ||
-            token == "_notag_"  ||
-            token == "_neg_"    ||
-            token == "_pos_")
+        if (result == "_hastag_" ||
+            result == "_notag_"  ||
+            result == "_neg_"    ||
+            result == "_pos_")
           type = typeOperator;
 
         return true;
@@ -261,7 +282,7 @@ bool Lexer::token (std::string& token, Type& type)
       }
       else
       {
-        token += decode_escape (_n0);
+        result += decode_escape (_n0);
         type = quote ? typeString : typeIdentifier;
         shift ();
       }
@@ -270,7 +291,7 @@ bool Lexer::token (std::string& token, Type& type)
     case typeEscapeHex:
       if (is_hex_digit (_n0) && is_hex_digit (_n1))
       {
-        token += utf8_character (hex_to_int (_n0, _n1));
+        result += utf8_character (hex_to_int (_n0, _n1));
         type = quote ? typeString : typeIdentifier;
         shift ();
         shift ();
@@ -290,7 +311,7 @@ bool Lexer::token (std::string& token, Type& type)
           is_hex_digit (_n2) &&
           is_hex_digit (_n3))
       {
-        token += utf8_character (hex_to_int (_n0, _n1, _n2, _n3));
+        result += utf8_character (hex_to_int (_n0, _n1, _n2, _n3));
         shift ();
         shift ();
         shift ();
@@ -309,19 +330,19 @@ bool Lexer::token (std::string& token, Type& type)
     case typeNumber:
       if (is_dec_digit (_n0))
       {
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
       }
       else if (_n0 == '.')
       {
         type = typeDecimal;
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
       }
       else if (_n0 == 'e' || _n0 == 'E')
       {
         type = typeExponentIndicator;
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
       }
       else
@@ -333,13 +354,13 @@ bool Lexer::token (std::string& token, Type& type)
     case typeDecimal:
       if (is_dec_digit (_n0))
       {
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
       }
       else if (_n0 == 'e' || _n0 == 'E')
       {
         type = typeExponentIndicator;
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
       }
       else
@@ -351,13 +372,13 @@ bool Lexer::token (std::string& token, Type& type)
     case typeExponentIndicator:
       if (_n0 == '+' || _n0 == '-')
       {
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
       }
       else if (is_dec_digit (_n0))
       {
         type = typeExponent;
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
       }
       break;
@@ -365,12 +386,12 @@ bool Lexer::token (std::string& token, Type& type)
     case typeExponent:
       if (is_dec_digit (_n0))
       {
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
       }
       else if (_n0 == '.')
       {
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
       }
       else
@@ -383,7 +404,7 @@ bool Lexer::token (std::string& token, Type& type)
     case typeHex:
       if (is_hex_digit (_n0))
       {
-        token += utf8_character (_n0);
+        result += utf8_character (_n0);
         shift ();
       }
       else
@@ -398,7 +419,7 @@ bool Lexer::token (std::string& token, Type& type)
     }
 
     // Fence post.
-    if (!_n0 && token != "")
+    if (!_n0 && result != "")
       return true;
   }
 
diff --git a/src/Lexer.h b/src/Lexer.h
index 6d6a76043..5ce76bc3e 100644
--- a/src/Lexer.h
+++ b/src/Lexer.h
@@ -33,6 +33,8 @@
 class Lexer
 {
 public:
+  static std::string dateFormat;
+
   enum Type
   {
     typeNone = 0,