Lexer: Added number support

2025-06-26 10:54:28 +02:00 · 2015-12-20 22:00:02 -05:00 · 2015-12-20 22:00:02 -05:00 · fa0c0e5fa7
commit fa0c0e5fa7
parent 53bb3952b8
3 changed files with 140 additions and 2 deletions
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@ -53,6 +53,7 @@ bool Lexer::token (std::string& token, Lexer::Type& type)

  if (isString    (token, type, "'\"") ||
      isHexNumber (token, type)        ||
+      isNumber    (token, type)        ||
      isWord      (token, type))
    return true;

@ -65,6 +66,7 @@ const std::string Lexer::typeName (const Lexer::Type& type)
 {
  switch (type)
  {
+  case Lexer::Type::number:       return "number";
  case Lexer::Type::hex:          return "hex";
  case Lexer::Type::string:       return "string";
  case Lexer::Type::word:         return "word";
@ -115,6 +117,15 @@ bool Lexer::isWhitespace (int c)
          c == 0x3000);    // ideographic space Common  Separator, space
 }

+////////////////////////////////////////////////////////////////////////////////
+// True only for the ten code points 0x30 ('0') through 0x39 ('9').
+//
+// TODO This list should be derived from the Unicode database.
+bool Lexer::isDigit (int c)
+{
+  return ! (c < 0x30 || c > 0x39);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Digits 0-9 a-f A-F.
 bool Lexer::isHexDigit (int c)
@ -124,6 +135,99 @@ bool Lexer::isHexDigit (int c)
         (c >= 'A' && c <= 'F');
 }

+////////////////////////////////////////////////////////////////////////////////
+// Lexer::Type::number
+//   \d+
+//   [ . \d+ ]
+//   [ e|E [ +|- ] \d+ [ . \d+ ] ]
+//   not followed by non-operator.
+//
+// Scans from _cursor using a local marker. On success the matched text is
+// stored in 'token', 'type' is set to Lexer::Type::number, _cursor is moved to
+// the end of the match, and true is returned. On failure false is returned and
+// 'token', 'type' and _cursor are left unmodified.
+//
+// NOTE(review): the implementation is more permissive than the grammar above.
+// A '.' is consumed even when no digit follows it, and so are 'e'/'E' and an
+// optional sign, so inputs such as "1." or "1e" can be accepted when the
+// lookahead passes. Confirm whether this is intended.
+bool Lexer::isNumber (std::string& token, Lexer::Type& type)
+{
+  std::size_t marker = _cursor;
+
+  // A number must begin with a digit; a leading sign or '.' is not accepted.
+  if (isDigit (_text[marker]))
+  {
+    // Integer part.
+    ++marker;
+    while (isDigit (_text[marker]))
+      utf8_next_char (_text, marker);  // presumably steps marker one character forward - confirm
+
+    // Optional fraction. The '.' is consumed whether or not digits follow.
+    if (_text[marker] == '.')
+    {
+      ++marker;
+      if (isDigit (_text[marker]))
+      {
+        ++marker;
+        while (isDigit (_text[marker]))
+          utf8_next_char (_text, marker);
+      }
+    }
+
+    // Optional exponent. The marker letter and the sign are consumed whether or
+    // not exponent digits follow.
+    if (_text[marker] == 'e' ||
+        _text[marker] == 'E')
+    {
+      ++marker;
+
+      if (_text[marker] == '+' ||
+          _text[marker] == '-')
+        ++marker;
+
+      if (isDigit (_text[marker]))
+      {
+        ++marker;
+        while (isDigit (_text[marker]))
+          utf8_next_char (_text, marker);
+
+        // The exponent may itself carry a fraction, e.g. "1.2e-3.4".
+        if (_text[marker] == '.')
+        {
+          ++marker;
+          if (isDigit (_text[marker]))
+          {
+            ++marker;
+            while (isDigit (_text[marker]))
+              utf8_next_char (_text, marker);
+          }
+        }
+      }
+    }
+
+    // Lookahead: <end of text> | <isWhitespace> | <isSingleCharOperator>
+    // If text remains and the next character is neither whitespace nor a
+    // single-character operator, this is not a number, so fail.
+    if (_eos > marker &&
+        ! isWhitespace (_text[marker]) &&
+        ! isSingleCharOperator (_text[marker]))
+      return false;
+
+    // Commit the match.
+    token = _text.substr (_cursor, marker - _cursor);
+    type = Lexer::Type::number;
+    _cursor = marker;
+    return true;
+  }
+
+  return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Lexer::Type::number
+//   \d+
+//
+// Matches a run of digits beginning at _cursor. On a match the digits are
+// stored in 'token', 'type' is set to Lexer::Type::number and _cursor is moved
+// past them. The character following the digits is not inspected.
+bool Lexer::isInteger (std::string& token, Lexer::Type& type)
+{
+  // No leading digit means no match; nothing is modified.
+  if (! isDigit (_text[_cursor]))
+    return false;
+
+  // Step over the first digit, then the rest of the run.
+  std::size_t end = _cursor + 1;
+  while (isDigit (_text[end]))
+    utf8_next_char (_text, end);
+
+  token   = _text.substr (_cursor, end - _cursor);
+  type    = Lexer::Type::number;
+  _cursor = end;
+  return true;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 bool Lexer::isSingleCharOperator (int c)
 {
@ -293,6 +397,7 @@ std::string Lexer::typeToString (Lexer::Type type)
 {
       if (type == Lexer::Type::string)       return std::string ("\033[38;5;7m\033[48;5;3m")    + "string"       + "\033[0m";
  else if (type == Lexer::Type::hex)          return std::string ("\033[38;5;7m\033[48;5;14m")   + "hex"          + "\033[0m";
+  else if (type == Lexer::Type::number)       return std::string ("\033[38;5;7m\033[48;5;6m")    + "number"       + "\033[0m";
  else if (type == Lexer::Type::word)         return std::string ("\033[38;5;15m\033[48;5;236m") + "word"         + "\033[0m";
  else                                        return std::string ("\033[37;41m")                 + "unknown"      + "\033[0m";
 }
--- a/src/Lexer.h
+++ b/src/Lexer.h
@ -35,7 +35,7 @@
 class Lexer
 {
 public:
-  enum class Type { hex,
+  enum class Type { number, hex,
                    string,
                    word };

@ -46,6 +46,7 @@ public:
  // Static helpers.
  static const std::string typeName          (const Lexer::Type&);
  static bool isWhitespace                   (int);
+  static bool isDigit                        (int);
  static bool isHexDigit                     (int);
  static bool isSingleCharOperator           (int);
  static bool isHardBoundary                 (int, int);
@ -61,6 +62,8 @@ public:
  // Stream Classifiers.
  bool isEOS          () const;
  bool isString       (std::string&, Lexer::Type&, const std::string&);
+  bool isNumber       (std::string&, Lexer::Type&);
+  bool isInteger      (std::string&, Lexer::Type&);
  bool isHexNumber    (std::string&, Lexer::Type&);
  bool isWord         (std::string&, Lexer::Type&);

--- a/test/lexer.t.cpp
+++ b/test/lexer.t.cpp
@ -34,7 +34,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 int main (int, char**)
 {
-  UnitTest t (131);
+  UnitTest t (171);

  std::vector <std::pair <std::string, Lexer::Type>> tokens;
  std::string token;
@ -77,6 +77,31 @@ int main (int, char**)
  Lexer l1 ("       \t ");
  t.notok (l1.token (token, type), "'       \\t ' --> no tokens");

+  // Test for numbers that are no longer ISO-8601 dates.
+  // Every whitespace-separated digit run must lex as Lexer::Type::number.
+  Lexer l3 ("1 12 123 1234 12345 123456 1234567");
+  tokens.clear ();
+  while (l3.token (token, type))
+  {
+    std::cout << "# «" << token << "» " << Lexer::typeName (type) << "\n";
+    tokens.push_back (std::pair <std::string, Lexer::Type> (token, type));
+  }
+
+  t.is ((int)tokens.size (),     7,                         "7 tokens");
+  t.is (tokens[0].first,         "1",                       "tokens[0] == '1'");
+  t.is ((int) tokens[0].second,  (int) Lexer::Type::number, "tokens[0] == Type::number");
+  t.is (tokens[1].first,         "12",                      "tokens[1] == '12'");
+  t.is ((int) tokens[1].second,  (int) Lexer::Type::number, "tokens[1] == Type::number");
+  t.is (tokens[2].first,         "123",                     "tokens[2] == '123'");
+  t.is ((int) tokens[2].second,  (int) Lexer::Type::number, "tokens[2] == Type::number"); // 70
+  t.is (tokens[3].first,         "1234",                    "tokens[3] == '1234'");
+  t.is ((int) tokens[3].second,  (int) Lexer::Type::number, "tokens[3] == Type::number");
+  t.is (tokens[4].first,         "12345",                   "tokens[4] == '12345'");
+  t.is ((int) tokens[4].second,  (int) Lexer::Type::number, "tokens[4] == Type::number");
+  t.is (tokens[5].first,         "123456",                  "tokens[5] == '123456'");
+  t.is ((int) tokens[5].second,  (int) Lexer::Type::number, "tokens[5] == Type::number");
+  t.is (tokens[6].first,         "1234567",                 "tokens[6] == '1234567'");
+  t.is ((int) tokens[6].second,  (int) Lexer::Type::number, "tokens[6] == Type::number");
+
  // static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
  std::string::size_type cursor = 0;
  std::string word;
@ -150,6 +175,10 @@ int main (int, char**)
    { "\"U+20AC4\"",                                  { { "\"€4\"",                                       Lexer::Type::string       }, NO, NO, NO, NO }, },

    // Number
+    { "1",                                            { { "1",                                            Lexer::Type::number       }, NO, NO, NO, NO }, },
+    { "3.14",                                         { { "3.14",                                         Lexer::Type::number       }, NO, NO, NO, NO }, },
+    { "6.02217e23",                                   { { "6.02217e23",                                   Lexer::Type::number       }, NO, NO, NO, NO }, },
+    { "1.2e-3.4",                                     { { "1.2e-3.4",                                     Lexer::Type::number       }, NO, NO, NO, NO }, },
    { "0x2f",                                         { { "0x2f",                                         Lexer::Type::hex          }, NO, NO, NO, NO }, },

  };
@ -186,6 +215,7 @@ int main (int, char**)
    }
  }

+  t.is (Lexer::typeName (Lexer::Type::number),       "number",       "Lexer::typeName (Lexer::Type::number)");
  t.is (Lexer::typeName (Lexer::Type::hex),          "hex",          "Lexer::typeName (Lexer::Type::hex)");
  t.is (Lexer::typeName (Lexer::Type::string),       "string",       "Lexer::typeName (Lexer::Type::string)");
  t.is (Lexer::typeName (Lexer::Type::word),         "word",         "Lexer::typeName (Lexer::Type::word)");