Lexer: Added number support

2025-06-26 10:54:28 +02:00 · 2015-12-20 22:00:02 -05:00 · 2015-12-20 22:00:02 -05:00 · fa0c0e5fa7
commit fa0c0e5fa7
parent 53bb3952b8
3 changed files with 140 additions and 2 deletions
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@ -53,6 +53,7 @@ bool Lexer::token (std::string& token, Lexer::Type& type)
  if (isString    (token, type, "'\"") ||
      isHexNumber (token, type)        ||
      isNumber    (token, type)        ||
      isWord      (token, type))
    return true;
@ -65,6 +66,7 @@ const std::string Lexer::typeName (const Lexer::Type& type)
 {
  switch (type)
  {
  case Lexer::Type::number:       return "number";
  case Lexer::Type::hex:          return "hex";
  case Lexer::Type::string:       return "string";
  case Lexer::Type::word:         return "word";
@ -115,6 +117,15 @@ bool Lexer::isWhitespace (int c)
          c == 0x3000);    // ideographic space Common  Separator, space
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Digits 0-9.
 //
 // TODO This list should be derived from the Unicode database.
 bool Lexer::isDigit (int c)
 {
  // Accept only code points in the closed range 0x30..0x39; reject anything
  // that falls below the lower bound or above the upper bound.
  const int lowest  = 0x30;
  const int highest = 0x39;
  return ! (c < lowest || c > highest);
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Digits 0-9 a-f A-F.
 bool Lexer::isHexDigit (int c)
@ -124,6 +135,99 @@ bool Lexer::isHexDigit (int c)
         (c >= 'A' && c <= 'F');
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Lexer::Type::number
 //   \d+
 //   [ . \d+ ]
 //   [ e|E [ +|- ] \d+ [ . \d+ ] ]
 //   followed by end of input, whitespace, or a single-character operator.
 bool Lexer::isNumber (std::string& token, Lexer::Type& type)
 {
  // Tries to read a decimal number starting at _cursor.
  //
  // On success: token receives the matched text, type is set to
  // Lexer::Type::number, _cursor moves past the match, and true is returned.
  // On failure: token, type and _cursor are left untouched and false is
  // returned.
  //
  // Note: a '.' with no digits after it, and an 'e'/'E' (plus optional sign)
  // with no digits after it, are still consumed into the match.

  // A number has to begin with a digit.
  if (! isDigit (_text[_cursor]))
    return false;

  std::size_t pos = _cursor;

  // Consumes a run of digits at pos. Reports whether any digit was consumed.
  auto digits = [&] () -> bool
  {
    if (! isDigit (_text[pos]))
      return false;

    ++pos;
    while (isDigit (_text[pos]))
      utf8_next_char (_text, pos);

    return true;
  };

  // Consumes a '.' at pos together with any digits that follow it. Reports
  // whether a '.' was present.
  auto fraction = [&] () -> bool
  {
    if (_text[pos] != '.')
      return false;

    ++pos;
    digits ();
    return true;
  };

  // Mantissa: integer part, then an optional fractional part.
  digits ();
  fraction ();

  // Optional exponent: marker, optional sign, digits, optional fraction.
  const bool hasExponent = (_text[pos] == 'e') || (_text[pos] == 'E');
  if (hasExponent)
  {
    ++pos;

    const bool hasSign = (_text[pos] == '+') || (_text[pos] == '-');
    if (hasSign)
      ++pos;

    if (digits ())
      fraction ();
  }

  // Lookahead: when text remains after the match, the very next character
  // must be whitespace or a single-character operator, otherwise this is not
  // a number.
  if (_eos > pos)
  {
    const int next = _text[pos];
    if (! (isWhitespace (next) || isSingleCharOperator (next)))
      return false;
  }

  token = _text.substr (_cursor, pos - _cursor);
  type = Lexer::Type::number;
  _cursor = pos;
  return true;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Lexer::Type::number
 //   \d+
 bool Lexer::isInteger (std::string& token, Lexer::Type& type)
 {
  // Tries to read an unbroken run of digits starting at _cursor. No check is
  // made on the character that follows the run.
  //
  // On success: token receives the digits, type is set to
  // Lexer::Type::number, _cursor moves past them, and true is returned.
  // On failure: nothing is modified and false is returned.

  // Guard: there is nothing to read unless the cursor sits on a digit.
  if (! isDigit (_text[_cursor]))
    return false;

  // The first digit is already known; scan forward over the remainder.
  std::size_t end = _cursor;
  ++end;
  while (isDigit (_text[end]))
    utf8_next_char (_text, end);

  token = _text.substr (_cursor, end - _cursor);
  type = Lexer::Type::number;
  _cursor = end;
  return true;
 }
 ////////////////////////////////////////////////////////////////////////////////
 bool Lexer::isSingleCharOperator (int c)
 {
@ -293,6 +397,7 @@ std::string Lexer::typeToString (Lexer::Type type)
 {
       if (type == Lexer::Type::string)       return std::string ("\033[38;5;7m\033[48;5;3m")    + "string"       + "\033[0m";
  else if (type == Lexer::Type::hex)          return std::string ("\033[38;5;7m\033[48;5;14m")   + "hex"          + "\033[0m";
  else if (type == Lexer::Type::number)       return std::string ("\033[38;5;7m\033[48;5;6m")    + "number"       + "\033[0m";
  else if (type == Lexer::Type::word)         return std::string ("\033[38;5;15m\033[48;5;236m") + "word"         + "\033[0m";
  else                                        return std::string ("\033[37;41m")                 + "unknown"      + "\033[0m";
 }
--- a/src/Lexer.h
+++ b/src/Lexer.h
@ -35,7 +35,7 @@
 class Lexer
 {
 public:
-  enum class Type { hex,
+  enum class Type { number, hex,
                    string,
                    word };
@ -46,6 +46,7 @@ public:
  // Static helpers.
  static const std::string typeName          (const Lexer::Type&);
  static bool isWhitespace                   (int);
  static bool isDigit                        (int);
  static bool isHexDigit                     (int);
  static bool isSingleCharOperator           (int);
  static bool isHardBoundary                 (int, int);
@ -61,6 +62,8 @@ public:
  // Stream Classifiers.
  bool isEOS          () const;
  bool isString       (std::string&, Lexer::Type&, const std::string&);
  bool isNumber       (std::string&, Lexer::Type&);
  bool isInteger      (std::string&, Lexer::Type&);
  bool isHexNumber    (std::string&, Lexer::Type&);
  bool isWord         (std::string&, Lexer::Type&);
--- a/test/lexer.t.cpp
+++ b/test/lexer.t.cpp
@ -34,7 +34,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 int main (int, char**)
 {
-  UnitTest t (131);
+  UnitTest t (171);
  std::vector <std::pair <std::string, Lexer::Type>> tokens;
  std::string token;
@ -77,6 +77,31 @@ int main (int, char**)
  Lexer l1 ("       \t ");
  t.notok (l1.token (token, type), "'       \\t ' --> no tokens");
  // Test for numbers that are no longer ISO-8601 dates.
  Lexer l3 ("1 12 123 1234 12345 123456 1234567");
  tokens.clear ();
  while (l3.token (token, type))
  {
    std::cout << "# «" << token << "» " << Lexer::typeName (type) << "\n";
    tokens.push_back (std::pair <std::string, Lexer::Type> (token, type));
  }
  t.is ((int)tokens.size (),     7,                         "7 tokens");
  t.is (tokens[0].first,         "1",                       "tokens[0] == '1'");
  t.is ((int) tokens[0].second,  (int) Lexer::Type::number, "tokens[0] == Type::number");
  t.is (tokens[1].first,         "12",                      "tokens[1] == '12'");
  t.is ((int) tokens[1].second,  (int) Lexer::Type::number, "tokens[1] == Type::date");
  t.is (tokens[2].first,         "123",                     "tokens[2] == '123'");
  t.is ((int) tokens[2].second,  (int) Lexer::Type::number, "tokens[2] == Type::number"); // 70
  t.is (tokens[3].first,         "1234",                    "tokens[3] == '1234'");
  t.is ((int) tokens[3].second,  (int) Lexer::Type::number, "tokens[3] == Type::date");
  t.is (tokens[4].first,         "12345",                   "tokens[4] == '12345'");
  t.is ((int) tokens[4].second,  (int) Lexer::Type::number, "tokens[4] == Type::number");
  t.is (tokens[5].first,         "123456",                  "tokens[5] == '123456'");
  t.is ((int) tokens[5].second,  (int) Lexer::Type::number, "tokens[5] == Type::date");
  t.is (tokens[6].first,         "1234567",                 "tokens[6] == '1234567'");
  t.is ((int) tokens[6].second,  (int) Lexer::Type::number, "tokens[6] == Type::number");
  // static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
  std::string::size_type cursor = 0;
  std::string word;
@ -150,6 +175,10 @@ int main (int, char**)
    { "\"U+20AC4\"",                                  { { "\"€4\"",                                       Lexer::Type::string       }, NO, NO, NO, NO }, },
    // Number
    { "1",                                            { { "1",                                            Lexer::Type::number       }, NO, NO, NO, NO }, },
    { "3.14",                                         { { "3.14",                                         Lexer::Type::number       }, NO, NO, NO, NO }, },
    { "6.02217e23",                                   { { "6.02217e23",                                   Lexer::Type::number       }, NO, NO, NO, NO }, },
    { "1.2e-3.4",                                     { { "1.2e-3.4",                                     Lexer::Type::number       }, NO, NO, NO, NO }, },
    { "0x2f",                                         { { "0x2f",                                         Lexer::Type::hex          }, NO, NO, NO, NO }, },
  };
@ -186,6 +215,7 @@ int main (int, char**)
    }
  }
  t.is (Lexer::typeName (Lexer::Type::number),       "number",       "Lexer::typeName (Lexer::Type::number)");
  t.is (Lexer::typeName (Lexer::Type::hex),          "hex",          "Lexer::typeName (Lexer::Type::hex)");
  t.is (Lexer::typeName (Lexer::Type::string),       "string",       "Lexer::typeName (Lexer::Type::string)");
  t.is (Lexer::typeName (Lexer::Type::word),         "word",         "Lexer::typeName (Lexer::Type::word)");