From fa0c0e5fa76682c256443e9dc77f816ebf223d85 Mon Sep 17 00:00:00 2001
From: Paul Beckingham <paul@beckingham.net>
Date: Sun, 20 Dec 2015 22:00:02 -0500
Subject: [PATCH] Lexer: Added number support

---
 src/Lexer.cpp    | 105 +++++++++++++++++++++++++++++++++++++++++++++++
 src/Lexer.h      |   5 ++-
 test/lexer.t.cpp |  32 ++++++++++++++-
 3 files changed, 140 insertions(+), 2 deletions(-)
diff --git a/src/Lexer.cpp b/src/Lexer.cpp
index e5052db2..bfe28f65 100644
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -53,6 +53,7 @@ bool Lexer::token (std::string& token, Lexer::Type& type)
 
   if (isString    (token, type, "'\"") ||
       isHexNumber (token, type)        ||
+      isNumber    (token, type)        ||
       isWord      (token, type))
     return true;
 
@@ -65,6 +66,7 @@ const std::string Lexer::typeName (const Lexer::Type& type)
 {
   switch (type)
   {
+  case Lexer::Type::number:       return "number";
   case Lexer::Type::hex:          return "hex";
   case Lexer::Type::string:       return "string";
   case Lexer::Type::word:         return "word";
@@ -115,6 +117,15 @@ bool Lexer::isWhitespace (int c)
           c == 0x3000);    // ideographic space Common  Separator, space
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Returns true for the ASCII decimal digits 0-9 (code points 0x30-0x39).
+//
+// TODO This list should be derived from the Unicode database.
+bool Lexer::isDigit (int c)
+{
+  return c >= 0x30 && c <= 0x39;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Digits 0-9 a-f A-F.
 bool Lexer::isHexDigit (int c)
@@ -124,6 +135,99 @@ bool Lexer::isHexDigit (int c)
          (c >= 'A' && c <= 'F');
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Lexer::Type::number. Note digits after '.' and after e|E are optional:
+//   \d+
+//   [ . \d* ]
+//   [ e|E [ +|- ] [ \d+ [ . \d* ] ] ]
+//   then, if before _eos: whitespace or a single-char operator must follow.
+bool Lexer::isNumber (std::string& token, Lexer::Type& type)
+{
+  std::size_t marker = _cursor;
+
+  if (isDigit (_text[marker]))
+  {
+    ++marker;
+    while (isDigit (_text[marker]))
+      utf8_next_char (_text, marker);
+
+    if (_text[marker] == '.')
+    {
+      ++marker;
+      if (isDigit (_text[marker]))
+      {
+        ++marker;
+        while (isDigit (_text[marker]))
+          utf8_next_char (_text, marker);
+      }
+    }
+
+    if (_text[marker] == 'e' ||
+        _text[marker] == 'E')
+    {
+      ++marker;
+
+      if (_text[marker] == '+' ||
+          _text[marker] == '-')
+        ++marker;
+
+      if (isDigit (_text[marker]))
+      {
+        ++marker;
+        while (isDigit (_text[marker]))
+          utf8_next_char (_text, marker);
+
+        if (_text[marker] == '.')
+        {
+          ++marker;
+          if (isDigit (_text[marker]))
+          {
+            ++marker;
+            while (isDigit (_text[marker]))
+              utf8_next_char (_text, marker);
+          }
+        }
+      }
+    }
+
+    // Lookahead: marker >= _eos | <isWhitespace> | <isSingleCharOperator>
+    // If a character follows that is neither whitespace nor an operator, fail.
+    if (_eos > marker &&
+        ! isWhitespace (_text[marker]) &&
+        ! isSingleCharOperator (_text[marker]))
+      return false;
+
+    token = _text.substr (_cursor, marker - _cursor);
+    type = Lexer::Type::number;
+    _cursor = marker;
+    return true;
+  }
+
+  return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Lexer::Type::number (integers are reported with the number type)
+//   \d+   with no lookahead at the character that follows.
+bool Lexer::isInteger (std::string& token, Lexer::Type& type)
+{
+  std::size_t marker = _cursor;
+
+  if (isDigit (_text[marker]))
+  {
+    ++marker;
+    while (isDigit (_text[marker]))
+      utf8_next_char (_text, marker);
+
+    token = _text.substr (_cursor, marker - _cursor);
+    type = Lexer::Type::number;
+    _cursor = marker;
+    return true;
+  }
+
+  return false;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 bool Lexer::isSingleCharOperator (int c)
 {
@@ -293,6 +397,7 @@ std::string Lexer::typeToString (Lexer::Type type)
 {
        if (type == Lexer::Type::string)       return std::string ("\033[38;5;7m\033[48;5;3m")    + "string"       + "\033[0m";
   else if (type == Lexer::Type::hex)          return std::string ("\033[38;5;7m\033[48;5;14m")   + "hex"          + "\033[0m";
+  else if (type == Lexer::Type::number)       return std::string ("\033[38;5;7m\033[48;5;6m")    + "number"       + "\033[0m";
   else if (type == Lexer::Type::word)         return std::string ("\033[38;5;15m\033[48;5;236m") + "word"         + "\033[0m";
   else                                        return std::string ("\033[37;41m")                 + "unknown"      + "\033[0m";
 }
diff --git a/src/Lexer.h b/src/Lexer.h
index ebd5655c..562fe83c 100644
--- a/src/Lexer.h
+++ b/src/Lexer.h
@@ -35,7 +35,7 @@
 class Lexer
 {
 public:
-  enum class Type { hex,
+  enum class Type { number, hex,
                     string,
                     word };
 
@@ -46,6 +46,7 @@ public:
   // Static helpers.
   static const std::string typeName          (const Lexer::Type&);
   static bool isWhitespace                   (int);
+  static bool isDigit                        (int);
   static bool isHexDigit                     (int);
   static bool isSingleCharOperator           (int);
   static bool isHardBoundary                 (int, int);
@@ -61,6 +62,8 @@ public:
   // Stream Classifiers.
   bool isEOS          () const;
   bool isString       (std::string&, Lexer::Type&, const std::string&);
+  bool isNumber       (std::string&, Lexer::Type&);
+  bool isInteger      (std::string&, Lexer::Type&);
   bool isHexNumber    (std::string&, Lexer::Type&);
   bool isWord         (std::string&, Lexer::Type&);
 
diff --git a/test/lexer.t.cpp b/test/lexer.t.cpp
index 5f483bb2..fe747be3 100644
--- a/test/lexer.t.cpp
+++ b/test/lexer.t.cpp
@@ -34,7 +34,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 int main (int, char**)
 {
-  UnitTest t (131);
+  UnitTest t (171);
 
   std::vector <std::pair <std::string, Lexer::Type>> tokens;
   std::string token;
@@ -77,6 +77,31 @@ int main (int, char**)
   Lexer l1 ("       \t ");
   t.notok (l1.token (token, type), "'       \\t ' --> no tokens");
 
+  // Test for numbers that are no longer ISO-8601 dates.
+  Lexer l3 ("1 12 123 1234 12345 123456 1234567");
+  tokens.clear ();
+  while (l3.token (token, type))
+  {
+    std::cout << "# «" << token << "» " << Lexer::typeName (type) << "\n";
+    tokens.push_back (std::pair <std::string, Lexer::Type> (token, type));
+  }
+
+  t.is ((int)tokens.size (),     7,                         "7 tokens");
+  t.is (tokens[0].first,         "1",                       "tokens[0] == '1'");
+  t.is ((int) tokens[0].second,  (int) Lexer::Type::number, "tokens[0] == Type::number");
+  t.is (tokens[1].first,         "12",                      "tokens[1] == '12'");
+  t.is ((int) tokens[1].second,  (int) Lexer::Type::number, "tokens[1] == Type::date");
+  t.is (tokens[2].first,         "123",                     "tokens[2] == '123'");
+  t.is ((int) tokens[2].second,  (int) Lexer::Type::number, "tokens[2] == Type::number"); // 70
+  t.is (tokens[3].first,         "1234",                    "tokens[3] == '1234'");
+  t.is ((int) tokens[3].second,  (int) Lexer::Type::number, "tokens[3] == Type::date");
+  t.is (tokens[4].first,         "12345",                   "tokens[4] == '12345'");
+  t.is ((int) tokens[4].second,  (int) Lexer::Type::number, "tokens[4] == Type::number");
+  t.is (tokens[5].first,         "123456",                  "tokens[5] == '123456'");
+  t.is ((int) tokens[5].second,  (int) Lexer::Type::number, "tokens[5] == Type::date");
+  t.is (tokens[6].first,         "1234567",                 "tokens[6] == '1234567'");
+  t.is ((int) tokens[6].second,  (int) Lexer::Type::number, "tokens[6] == Type::number");
+
   // static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
   std::string::size_type cursor = 0;
   std::string word;
@@ -150,6 +175,10 @@ int main (int, char**)
     { "\"U+20AC4\"",                                  { { "\"€4\"",                                       Lexer::Type::string       }, NO, NO, NO, NO }, },
 
     // Number
+    { "1",                                            { { "1",                                            Lexer::Type::number       }, NO, NO, NO, NO }, },
+    { "3.14",                                         { { "3.14",                                         Lexer::Type::number       }, NO, NO, NO, NO }, },
+    { "6.02217e23",                                   { { "6.02217e23",                                   Lexer::Type::number       }, NO, NO, NO, NO }, },
+    { "1.2e-3.4",                                     { { "1.2e-3.4",                                     Lexer::Type::number       }, NO, NO, NO, NO }, },
     { "0x2f",                                         { { "0x2f",                                         Lexer::Type::hex          }, NO, NO, NO, NO }, },
 
   };
@@ -186,6 +215,7 @@ int main (int, char**)
     }
   }
 
+  t.is (Lexer::typeName (Lexer::Type::number),       "number",       "Lexer::typeName (Lexer::Type::number)");
   t.is (Lexer::typeName (Lexer::Type::hex),          "hex",          "Lexer::typeName (Lexer::Type::hex)");
   t.is (Lexer::typeName (Lexer::Type::string),       "string",       "Lexer::typeName (Lexer::Type::string)");
   t.is (Lexer::typeName (Lexer::Type::word),         "word",         "Lexer::typeName (Lexer::Type::word)");