From 06319711f1980342209486bbb40387996f891b84 Mon Sep 17 00:00:00 2001
From: Paul Beckingham <paul@beckingham.net>
Date: Tue, 18 Nov 2014 00:55:53 -0500
Subject: [PATCH] Quoting

- Removed automatic dequoting by the Lexer.
- Implemented Lexer::dequote for manual control.
- Variant dequotes string values when appropriate.
- Fixed some unit tests that became wrong.
---
 ChangeLog        |  2 ++
 src/CLI.cpp      |  4 ++-
 src/Lexer.cpp    | 24 ++++++++++++--
 src/Lexer.h      |  1 +
 src/Variant.cpp  | 85 ++++++++++++++++++++++++++++++++++++++++++++++--
 test/filter.t    |  4 +--
 test/lexer.t.cpp |  6 ++--
 7 files changed, 114 insertions(+), 12 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 124775d29..790069a6e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -184,6 +184,8 @@
 - TW-1441 task import continues happily if filename doesn't exist.
 - TW-1444 Tag ordering is preserved, but should be sorted in reports.
 - TW-1460 Empty due dates lead to endless loop.
+- TW-1463 A few more problems with special characters in filters, pluses,
+          question marks, and braces (thanks to Ralph Bean).
 - Added new holidays.xy-XY.rc definition files
 - Removed deprecated 'echo.command' setting, in favor of the 'header' and
   'affected' verbosity tokens.
diff --git a/src/CLI.cpp b/src/CLI.cpp
index c9d24d5fe..8c6fffc2a 100644
--- a/src/CLI.cpp
+++ b/src/CLI.cpp
@@ -1637,7 +1637,9 @@ void CLI::desugarFilterPlainArgs ()
       op.tag ("FILTER");
       reconstructed.push_back (op);
 
-      A rhs ("argPattern", "'" + a->attribute ("raw") + "'");
+      std::string pattern = a->attribute ("raw");
+      Lexer::dequote (pattern);
+      A rhs ("argPattern", "'" + pattern + "'");
       rhs.tag ("LITERAL");
       rhs.tag ("FILTER");
       reconstructed.push_back (rhs);
diff --git a/src/Lexer.cpp b/src/Lexer.cpp
index 2729c9415..19acd0a69 100644
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -87,6 +87,7 @@ bool Lexer::token (std::string& result, Type& type)
       {
         type = typeString;
         quote = _n0;
+        result += utf8_character (_n0);
         shift ();
       }
       else if (_n0 == '0' &&
@@ -189,6 +190,7 @@ bool Lexer::token (std::string& result, Type& type)
     case typeString:
       if (_n0 == quote)
       {
+        result += utf8_character (_n0);
         shift ();
         quote = 0;
         return true;
@@ -247,6 +249,7 @@ bool Lexer::token (std::string& result, Type& type)
       else
       {
         type = quote ? typeString : typeIdentifier;
+        result += utf8_character (quote);
         result += utf8_character (_n0);
         shift ();
       }
@@ -265,7 +268,8 @@ bool Lexer::token (std::string& result, Type& type)
       }
       else
       {
-        result += decode_escape (_n0);
+        result += '\\';
+        result += utf8_character (_n0);
         type = quote ? typeString : typeIdentifier;
         shift ();
       }
@@ -444,6 +448,7 @@ bool Lexer::word (std::string& token, Type& type)
       {
         type = typeString;
         quote = _n0;
+        token += utf8_character (_n0);
         shift ();
       }
       else
@@ -457,6 +462,7 @@ bool Lexer::word (std::string& token, Type& type)
     case typeString:
       if (_n0 == quote)
       {
+        token += utf8_character (_n0);
         shift ();
         quote = 0;
         return true;
@@ -491,7 +497,8 @@ bool Lexer::word (std::string& token, Type& type)
       }
       else
       {
-        token += decode_escape (_n0);
+        token += '\\';
+        token += utf8_character (_n0);
         type = typeString;
         shift ();
       }
@@ -709,6 +716,18 @@ void Lexer::token_split (std::vector <std::pair <std::string, Lexer::Type> >& le
     lexemes.push_back (std::pair <std::string, Lexer::Type>(word, type));
 }
 
+////////////////////////////////////////////////////////////////////////////////
+void Lexer::dequote (std::string& input)
+{
+  // Strip one pair of matching enclosing quotes (' or ") in place. A lone
+  // quote or an empty string is not a quoted value and is left untouched.
+  const size_t len = input.length ();
+  if (len >= 2 &&
+      (input[0] == '\'' || input[0] == '"') &&
+      input[0] == input[len - 1])
+    input = input.substr (1, len - 2);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 bool Lexer::is_date (std::string& result)
 {
@@ -830,7 +849,6 @@ int Lexer::decode_escape (int c) const
   case 'v':  return 0x0B;
   case '\'': return 0x27;
   case '"':  return 0x22;
-  case '\\': return 0x5C;
   default:   return c;
   }
 }
diff --git a/src/Lexer.h b/src/Lexer.h
index a7ba74bc6..ff0c56554 100644
--- a/src/Lexer.h
+++ b/src/Lexer.h
@@ -84,6 +84,7 @@ public:
   static void word_split (std::vector <std::string>&, const std::string&);
   static void token_split (std::vector <std::string>&, const std::string&);
   static void token_split (std::vector <std::pair <std::string, Lexer::Type> >&, const std::string&);
+  static void dequote (std::string&);
 
 private:
   bool is_date (std::string&);
diff --git a/src/Variant.cpp b/src/Variant.cpp
index 39afff8ac..a08bf6e10 100644
--- a/src/Variant.cpp
+++ b/src/Variant.cpp
@@ -31,6 +31,7 @@
 #include <stdlib.h>
 #include <Variant.h>
 #include <ISO8601.h>
+#include <Lexer.h>
 #include <Date.h>
 #include <Duration.h>
 #include <RX.h>
@@ -193,6 +194,12 @@ bool Variant::operator&& (const Variant& other) const
   Variant left (*this);
   Variant right (other);
 
+  if (left._type == type_string)
+    Lexer::dequote (left._string);
+
+  if (right._type == type_string)
+    Lexer::dequote (right._string);
+
   left.cast (type_boolean);
   right.cast (type_boolean);
 
@@ -205,6 +212,12 @@ bool Variant::operator|| (const Variant& other) const
   Variant left (*this);
   Variant right (other);
 
+  if (left._type == type_string)
+    Lexer::dequote (left._string);
+
+  if (right._type == type_string)
+    Lexer::dequote (right._string);
+
   left.cast (type_boolean);
   right.cast (type_boolean);
 
@@ -217,6 +230,12 @@ bool Variant::operator_xor (const Variant& other) const
   Variant left (*this);
   Variant right (other);
 
+  if (left._type == type_string)
+    Lexer::dequote (left._string);
+
+  if (right._type == type_string)
+    Lexer::dequote (right._string);
+
   left.cast (type_boolean);
   right.cast (type_boolean);
 
@@ -230,6 +249,12 @@ bool Variant::operator< (const Variant& other) const
   Variant left (*this);
   Variant right (other);
 
+  if (left._type == type_string)
+    Lexer::dequote (left._string);
+
+  if (right._type == type_string)
+    Lexer::dequote (right._string);
+
   switch (left._type)
   {
   case type_unknown:
@@ -369,6 +394,12 @@ bool Variant::operator<= (const Variant& other) const
   Variant left (*this);
   Variant right (other);
 
+  if (left._type == type_string)
+    Lexer::dequote (left._string);
+
+  if (right._type == type_string)
+    Lexer::dequote (right._string);
+
   switch (left._type)
   {
   case type_unknown:
@@ -509,6 +540,12 @@ bool Variant::operator> (const Variant& other) const
   Variant left (*this);
   Variant right (other);
 
+  if (left._type == type_string)
+    Lexer::dequote (left._string);
+
+  if (right._type == type_string)
+    Lexer::dequote (right._string);
+
   switch (left._type)
   {
   case type_unknown:
@@ -647,6 +684,12 @@ bool Variant::operator>= (const Variant& other) const
   Variant left (*this);
   Variant right (other);
 
+  if (left._type == type_string)
+    Lexer::dequote (left._string);
+
+  if (right._type == type_string)
+    Lexer::dequote (right._string);
+
   switch (left._type)
   {
   case type_unknown:
@@ -787,6 +830,12 @@ bool Variant::operator== (const Variant& other) const
   Variant left (*this);
   Variant right (other);
 
+  if (left._type == type_string)
+    Lexer::dequote (left._string);
+
+  if (right._type == type_string)
+    Lexer::dequote (right._string);
+
   switch (left._type)
   {
   case type_unknown:
@@ -911,12 +960,21 @@ bool Variant::operator_match (const Variant& other, const Task& task) const
   Variant left (*this);
   Variant right (other);
 
+  if (left._type == type_string)
+    Lexer::dequote (left._string);
+
+  if (right._type == type_string)
+    Lexer::dequote (right._string);
+
   left.cast (type_string);
   right.cast (type_string);
 
+  std::string pattern = right._string;
+  Lexer::dequote (pattern);
+
   if (searchUsingRegex)
   {
-    RX r (right._string, searchCaseSensitive);
+    RX r (pattern, searchCaseSensitive);
     if (r.match (left._string))
       return true;
 
@@ -935,7 +993,7 @@ bool Variant::operator_match (const Variant& other, const Task& task) const
   }
   else
   {
-    if (find (left._string, right._string, searchCaseSensitive) != std::string::npos)
+    if (find (left._string, pattern, searchCaseSensitive) != std::string::npos)
       return true;
 
     // If the above did not match, and the left source is "description", then
@@ -947,7 +1005,7 @@ bool Variant::operator_match (const Variant& other, const Task& task) const
 
       std::map <std::string, std::string>::iterator a;
       for (a = annotations.begin (); a != annotations.end (); ++a)
-        if (find (a->second, right._string, searchCaseSensitive) != std::string::npos)
+        if (find (a->second, pattern, searchCaseSensitive) != std::string::npos)
           return true;
     }
   }
@@ -972,6 +1030,12 @@ bool Variant::operator_partial (const Variant& other) const
   Variant left (*this);
   Variant right (other);
 
+  if (left._type == type_string)
+    Lexer::dequote (left._string);
+
+  if (right._type == type_string)
+    Lexer::dequote (right._string);
+
   switch (left._type)
   {
   case type_unknown:
@@ -1155,6 +1219,7 @@ bool Variant::operator_hastag (const Variant& other, const Task& task) const
 {
   Variant right (other);
   right.cast (type_string);
+  Lexer::dequote (right._string);
   return task.hasTag (right._string);
 }
 
@@ -1168,6 +1233,10 @@ bool Variant::operator_notag (const Variant& other, const Task& task) const
 bool Variant::operator! () const
 {
   Variant left (*this);
+  // Remove one pair of enclosing quotes from a string operand before the cast.
+  if (left._type == type_string)
+    Lexer::dequote (left._string);
+
   left.cast (type_boolean);
   return ! left._bool;
 }
@@ -1330,6 +1399,9 @@ Variant& Variant::operator+= (const Variant& other)
 {
   Variant right (other);
 
+  if (right._type == type_string)
+    Lexer::dequote (right._string);
+
   switch (_type)
   {
   case type_unknown:
@@ -1439,6 +1511,9 @@ Variant& Variant::operator*= (const Variant& other)
 {
   Variant right (other);
 
+  if (right._type == type_string)
+    Lexer::dequote (right._string);
+
   switch (_type)
   {
   case type_unknown:
@@ -1893,6 +1968,9 @@ Variant::operator std::string () const
 ////////////////////////////////////////////////////////////////////////////////
 void Variant::sqrt ()
 {
+  if (_type == type_string)
+    Lexer::dequote (_string);
+
   cast (type_real);
   if (_real < 0.0)
     throw std::string (STRING_VARIANT_SQRT_NEG);
@@ -1967,6 +2045,7 @@ void Variant::cast (const enum type new_type)
     break;
 
   case type_string:
+    Lexer::dequote (_string);
     switch (new_type)
     {
     case type_unknown:                                                        break;
diff --git a/test/filter.t b/test/filter.t
index 28ba837ae..509cfacc6 100755
--- a/test/filter.t
+++ b/test/filter.t
@@ -248,7 +248,7 @@ unlike ($output, qr/five/,  'v5');
 unlike ($output, qr/six/,   'v6');
 like   ($output, qr/seven/, 'v7');
 
-$output = qx{../src/task rc:filter.rc rc.regex:on list /^s/ 2>&1};
+$output = qx{../src/task rc:filter.rc rc.regex:on list /\\^s/ 2>&1};
 unlike ($output, qr/one/,   'w1');
 unlike ($output, qr/two/,   'w2');
 unlike ($output, qr/three/, 'w3');
@@ -257,7 +257,7 @@ unlike ($output, qr/five/,  'w5');
 like   ($output, qr/six/,   'w6');
 like   ($output, qr/seven/, 'w7');
 
-$output = qx{../src/task rc:filter.rc rc.regex:on list /^.i/ 2>&1};
+$output = qx{../src/task rc:filter.rc rc.regex:on list /\\^.i/ 2>&1};
 unlike ($output, qr/one/,   'x1');
 unlike ($output, qr/two/,   'x2');
 unlike ($output, qr/three/, 'x3');
diff --git a/test/lexer.t.cpp b/test/lexer.t.cpp
index 25afa413f..d34d41974 100644
--- a/test/lexer.t.cpp
+++ b/test/lexer.t.cpp
@@ -101,7 +101,7 @@ int main (int argc, char** argv)
   t.is (tokens[0].first,                      "one",        "tokens[0] = 'left'"); // 30
   t.is (Lexer::type_name (tokens[0].second),  "Identifier", "tokens[0] = Identifier");
 
-  t.is (tokens[1].first,                      "two 'three'", "tokens[1] = 'two \\'three\\''");
+  t.is (tokens[1].first,                      "'two \\'three\\''", "tokens[1] = 'two \\'three\\''");
   t.is (Lexer::type_name (tokens[1].second),  "String",     "tokens[1] = String");
 
   t.is (tokens[2].first,                      "+",          "tokens[2] = '+'");
@@ -146,7 +146,7 @@ int main (int argc, char** argv)
   t.is (tokens[15].first,                     "and",        "tokens[15] = 'and'"); // 60
   t.is (Lexer::type_name (tokens[15].second), "Operator",   "tokens[15] = Operator");
 
-  t.is (tokens[16].first,                     "€",          "tokens[16] = \\u20ac --> '€'");
+  t.is (tokens[16].first,                     "'€'",        "tokens[16] = \\u20ac --> '€'");
   t.is (Lexer::type_name (tokens[16].second), "String",     "tokens[16] = String");
 
   // Test for ISO-8601 dates (favoring dates in ambiguous cases).
@@ -366,7 +366,7 @@ int main (int argc, char** argv)
   t.is (items[0], "+-*",           "word_split '  +-* a+b 12.3e4 'c d'' -> [0] '+-*'");
   t.is (items[1], "a+b",           "word_split '  +-* a+b 12.3e4 'c d'' -> [1] 'a+b'");
   t.is (items[2], "12.3e4",        "word_split '  +-* a+b 12.3e4 'c d'' -> [2] '12.3e4'");
-  t.is (items[3], "c d",           "word_split '  +-* a+b 12.3e4 'c d'' -> [3] 'c d'");
+  t.is (items[3], "'c d'",         "word_split '  +-* a+b 12.3e4 'c d'' -> [3] 'c d'");
 
   // Test common expression element.
   unsplit = "name=value";