Lexer

- Implemented Lexer::word, which is just like ::token, but does not understand dates, durations or operators. - Implemented Lexer::split, which uses Lexer::word. - Added unit tests.
2025-08-28 13:37:20 +02:00 · 2014-04-23 23:19:41 -04:00 · 2014-04-23 23:19:41 -04:00 · 611812007a
commit 611812007a
parent d099a4edfd
3 changed files with 314 additions and 3 deletions
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@ -393,6 +393,293 @@ bool Lexer::token (std::string& token, Type& type)
  return false;
 }

+////////////////////////////////////////////////////////////////////////////////
+// Just like Lexer::token, but no operators.  Extracts the next word from the
+// input into 'token' and stores its classification in 'type'.
+//
+// Returns true when a word was extracted, false when the input is exhausted
+// without producing one.  Throws std::string when a word starts with a
+// character that no state below accepts, or when an unknown state is reached.
+//
+// NOTE(review): this method is described as not understanding dates or
+// durations, yet input that starts with a decimal digit is still speculatively
+// parsed as a date and as a duration below.  Confirm which is intended.
+bool Lexer::word (std::string& token, Type& type)
+{
+  // Start with nothing.
+  token = "";
+
+  // Different types of matching quote:  ', ".
+  // Zero means "not inside a quoted string".
+  int quote = 0;
+
+  type = typeNone;
+  while (_n0)
+  {
+    switch (type)
+    {
+    case typeNone:
+      if (is_ws (_n0))
+        shift ();
+      else if (_n0 == '"' || _n0 == '\'')
+      {
+        type = typeString;
+        quote = _n0;
+        shift ();
+      }
+      else if (_n0 == '0' &&
+               _n1 == 'x' &&
+               is_hex_digit (_n2))
+      {
+        type = typeHex;
+        token += utf8_character (_n0);
+        shift ();
+        token += utf8_character (_n0);
+        shift ();
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (is_dec_digit (_n0))
+      {
+        // Speculatively try a date and duration parse.  Longest wins.
+        std::string::size_type iso_i = 0;
+        std::string iso_token;
+        ISO8601d iso;
+        iso.ambiguity (_ambiguity);
+        if (iso.parse (_input.substr (_i < 4 ? 0 : _i - 4), iso_i))
+          iso_token = _input.substr ((_i < 4 ? 0 : _i - 4), iso_i);
+
+        std::string::size_type dur_i = 0;
+        std::string dur_token;
+        Duration dur;
+        if (dur.parse (_input.substr (_i < 4 ? 0 : _i - 4), dur_i))
+          dur_token = _input.substr ((_i < 4 ? 0 : _i - 4), dur_i);
+
+        if (iso_token.length () > dur_token.length ())
+        {
+          while (iso_i--) shift ();
+          token = iso_token;
+          type = typeDate;
+          return true;
+        }
+        else if (dur_token.length () > iso_token.length ())
+        {
+          while (dur_i--) shift ();
+          token = dur_token;
+          type = typeDuration;
+          return true;
+        }
+
+        // Neither parse won (both failed, or a tie): treat as a plain number.
+        type = typeNumber;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == '.' && is_dec_digit (_n1))
+      {
+        type = typeDecimal;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == '\\')
+      {
+        type = typeIdentifierEscape;
+        shift ();
+      }
+      else if (is_ident_start (_n0))
+      {
+        type = typeIdentifier;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+        throw std::string ("Unexpected error 1");
+      break;
+
+    case typeString:
+      if (_n0 == quote)
+      {
+        // Closing quote: consume it and finish the word.
+        shift ();
+        quote = 0;
+        return true;
+      }
+      else if (_n0 == '\\')
+      {
+        type = typeEscape;
+        shift ();
+      }
+      else
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      break;
+
+    case typeIdentifier:
+      if (is_ident (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        return true;
+      }
+      break;
+
+    case typeIdentifierEscape:
+      if (_n0 == 'u')
+      {
+        type = typeEscapeUnicode;
+        shift ();
+      }
+      else
+      {
+        // Any other escaped character: decode it the way typeEscape does and
+        // carry on as an identifier.  Without this branch nothing is consumed
+        // and the loop never terminates.
+        token += decode_escape (_n0);
+        type = typeIdentifier;
+        shift ();
+      }
+      break;
+
+    case typeEscape:
+      if (_n0 == 'x')
+      {
+        type = typeEscapeHex;
+        shift ();
+      }
+      else if (_n0 == 'u')
+      {
+        type = typeEscapeUnicode;
+        shift ();
+      }
+      else
+      {
+        token += decode_escape (_n0);
+        type = quote ? typeString : typeIdentifier;
+        shift ();
+      }
+      break;
+
+    case typeEscapeHex:
+      if (is_hex_digit (_n0) && is_hex_digit (_n1))
+      {
+        token += utf8_character (hex_to_int (_n0, _n1));
+        type = quote ? typeString : typeIdentifier;
+        shift ();
+        shift ();
+      }
+      else
+      {
+        type = quote ? typeString : typeIdentifier;
+        shift ();
+        quote = 0;
+        return true;
+      }
+      break;
+
+    case typeEscapeUnicode:
+      if (is_hex_digit (_n0) &&
+          is_hex_digit (_n1) &&
+          is_hex_digit (_n2) &&
+          is_hex_digit (_n3))
+      {
+        token += utf8_character (hex_to_int (_n0, _n1, _n2, _n3));
+        shift ();
+        shift ();
+        shift ();
+        shift ();
+        type = quote ? typeString : typeIdentifier;
+      }
+      else if (_n0 == quote)
+      {
+        type = typeString;
+        shift ();
+        quote = 0;
+        return true;
+      }
+      else
+      {
+        // Malformed \u escape: drop it and resume the enclosing state without
+        // consuming anything, so the next character is lexed normally.  Both
+        // typeString and typeIdentifier always consume or return, so progress
+        // is guaranteed.
+        type = quote ? typeString : typeIdentifier;
+      }
+      // This break was missing, which let control fall into typeNumber.
+      break;
+
+    case typeNumber:
+      if (is_dec_digit (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == '.')
+      {
+        type = typeDecimal;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == 'e' || _n0 == 'E')
+      {
+        type = typeExponentIndicator;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        return true;
+      }
+      break;
+
+    case typeDecimal:
+      if (is_dec_digit (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == 'e' || _n0 == 'E')
+      {
+        type = typeExponentIndicator;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        return true;
+      }
+      break;
+
+    case typeExponentIndicator:
+      if (_n0 == '+' || _n0 == '-')
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (is_dec_digit (_n0))
+      {
+        type = typeExponent;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        // No exponent digits follow: end the word here, reporting the same
+        // type that typeExponent reports, instead of looping forever on a
+        // character that is never consumed.
+        type = typeDecimal;
+        return true;
+      }
+      break;
+
+    case typeExponent:
+      if (is_dec_digit (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == '.')
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        type = typeDecimal;
+        return true;
+      }
+      break;
+
+    case typeHex:
+      if (is_hex_digit (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        return true;
+      }
+      break;
+
+    default:
+      throw std::string ("Unexpected error 2");
+      break;
+    }
+
+    // Fence post.  Input ran out while a word was being accumulated.
+    if (!_n0 && token != "")
+      return true;
+  }
+
+  return false;
+}
+
+
 ////////////////////////////////////////////////////////////////////////////////
 void Lexer::ambiguity (bool value)
 {
@ -457,6 +744,19 @@ bool Lexer::is_ws (int c)
          c == 0x3000);    // ideographic space Common  Separator, space
 }

+////////////////////////////////////////////////////////////////////////////////
+// Replaces the contents of 'words' with the text of every word that
+// Lexer::word extracts from 'input', in the order they are found.
+void Lexer::split (std::vector <std::string>& words, const std::string& input)
+{
+  words.clear ();
+
+  Lexer lexer (input);
+  std::string text;
+  Lexer::Type kind;
+  for (;;)
+  {
+    // Lexer::word reports false once no further word can be extracted.
+    if (! lexer.word (text, kind))
+      break;
+
+    words.push_back (text);
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 bool Lexer::is_punct (int c) const
 {
@ -593,8 +893,6 @@ void Lexer::shift ()
  _n1 = _n2;
  _n2 = _n3;
  _n3 = utf8_next_char (_input, _i);
-
-  //std::cout << "# shift [" << (char) _n0 << (char) _n1 << (char) _n2 << (char) _n3 << "]\n";
 }

 ////////////////////////////////////////////////////////////////////////////////
--- a/src/Lexer.h
+++ b/src/Lexer.h
@ -58,10 +58,12 @@ public:
  Lexer& operator= (const Lexer&); // Not implemented.
  bool operator== (const Lexer&);  // Not implemented.
  bool token (std::string&, Type&);
+  bool word (std::string&, Type&);
  void ambiguity (bool);

  static const std::string type_name (const Type&);
  static bool is_ws (int);
+  static void split (std::vector <std::string>&, const std::string&);

 private:
  bool is_punct (int) const;
--- a/test/lexer.t.cpp
+++ b/test/lexer.t.cpp
@ -36,7 +36,7 @@ Context context;
 ////////////////////////////////////////////////////////////////////////////////
 int main (int argc, char** argv)
 {
-  UnitTest t (170);
+  UnitTest t (176);

  std::vector <std::pair <std::string, Lexer::Type> > tokens;
  std::string token;
@ -299,6 +299,17 @@ int main (int argc, char** argv)
  t.is (tokens[20].first,                     ")",                    "tokens[20] == ')'");
  t.is (tokens[20].second,                    Lexer::typeOperator,    "tokens[20] == typeOperator"); // 170

+  // static void Lexer::split (std::vector <std::string>&, const std::string&);
+  std::string unsplit = " ( A or B ) ";
+  std::vector <std::string> items;
+  Lexer::split (items, unsplit);
+  t.is (items.size (), (size_t) 5, "split ' ( A or B ) '");
+  t.is (items[0], "(",             "split ' ( A or B ) ' -> [0] '('");
+  t.is (items[1], "A",             "split ' ( A or B ) ' -> [1] 'A'");
+  t.is (items[2], "or",            "split ' ( A or B ) ' -> [2] 'or'");
+  t.is (items[3], "B",             "split ' ( A or B ) ' -> [3] 'B'");
+  t.is (items[4], ")",             "split ' ( A or B ) ' -> [4] ')'");
+
  return 0;
 }