Lexer

- Implemented Lexer::word, which is just like ::token, but does not understand dates, durations or operators. - Implemented Lexer::split, which uses Lexer::word. - Added unit tests.
2025-06-26 10:54:26 +02:00 · 2014-04-23 23:19:41 -04:00 · 2014-04-23 23:19:41 -04:00 · 611812007a
commit 611812007a
parent d099a4edfd
3 changed files with 314 additions and 3 deletions
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@ -393,6 +393,293 @@ bool Lexer::token (std::string& token, Type& type)
  return false;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Just like Lexer::token, but no operators, dates or durations.
 bool Lexer::word (std::string& token, Type& type)
 {
  // Start with nothing.
  token = "";
  // Different types of matching quote:  ', ".
  int quote = 0;
  type = typeNone;
  while (_n0)
  {
    switch (type)
    {
    case typeNone:
      if (is_ws (_n0))
        shift ();
      else if (_n0 == '"' || _n0 == '\'')
      {
        type = typeString;
        quote = _n0;
        shift ();
      }
      else if (_n0 == '0' &&
               _n1 == 'x' &&
               is_hex_digit (_n2))
      {
        type = typeHex;
        token += utf8_character (_n0);
        shift ();
        token += utf8_character (_n0);
        shift ();
        token += utf8_character (_n0);
        shift ();
      }
      else if (is_dec_digit (_n0))
      {
        // Speculatively try a date and duration parse.  Longest wins.
        std::string::size_type iso_i = 0;
        std::string iso_token;
        ISO8601d iso;
        iso.ambiguity (_ambiguity);
        if (iso.parse (_input.substr (_i < 4 ? 0 : _i - 4), iso_i))
          iso_token = _input.substr ((_i < 4 ? 0 : _i - 4), iso_i);
        std::string::size_type dur_i = 0;
        std::string dur_token;
        Duration dur;
        if (dur.parse (_input.substr (_i < 4 ? 0 : _i - 4), dur_i))
          dur_token = _input.substr ((_i < 4 ? 0 : _i - 4), dur_i);
        if (iso_token.length () > dur_token.length ())
        {
          while (iso_i--) shift ();
          token = iso_token;
          type = typeDate;
          return true;
        }
        else if (dur_token.length () > iso_token.length ())
        {
          while (dur_i--) shift ();
          token = dur_token;
          type = typeDuration;
          return true;
        }
        type = typeNumber;
        token += utf8_character (_n0);
        shift ();
      }
      else if (_n0 == '.' && is_dec_digit (_n1))
      {
        type = typeDecimal;
        token += utf8_character (_n0);
        shift ();
      }
      else if (_n0 == '\\')
      {
        type = typeIdentifierEscape;
        shift ();
      }
      else if (is_ident_start (_n0))
      {
        type = typeIdentifier;
        token += utf8_character (_n0);
        shift ();
      }
      else
        throw std::string ("Unexpected error 1");
      break;
    case typeString:
      if (_n0 == quote)
      {
        shift ();
        quote = 0;
        return true;
      }
      else if (_n0 == '\\')
      {
        type = typeEscape;
        shift ();
      }
      else
      {
        token += utf8_character (_n0);
        shift ();
      }
      break;
    case typeIdentifier:
      if (is_ident (_n0))
      {
        token += utf8_character (_n0);
        shift ();
      }
      else
      {
        return true;
      }
      break;
    case typeIdentifierEscape:
      if (_n0 == 'u')
      {
        type = typeEscapeUnicode;
        shift ();
      }
      break;
    case typeEscape:
      if (_n0 == 'x')
      {
        type = typeEscapeHex;
        shift ();
      }
      else if (_n0 == 'u')
      {
        type = typeEscapeUnicode;
        shift ();
      }
      else
      {
        token += decode_escape (_n0);
        type = quote ? typeString : typeIdentifier;
        shift ();
      }
      break;
    case typeEscapeHex:
      if (is_hex_digit (_n0) && is_hex_digit (_n1))
      {
        token += utf8_character (hex_to_int (_n0, _n1));
        type = quote ? typeString : typeIdentifier;
        shift ();
        shift ();
      }
      else
      {
        type = quote ? typeString : typeIdentifier;
        shift ();
        quote = 0;
        return true;
      }
      break;
    case typeEscapeUnicode:
      if (is_hex_digit (_n0) &&
          is_hex_digit (_n1) &&
          is_hex_digit (_n2) &&
          is_hex_digit (_n3))
      {
        token += utf8_character (hex_to_int (_n0, _n1, _n2, _n3));
        shift ();
        shift ();
        shift ();
        shift ();
        type = quote ? typeString : typeIdentifier;
      }
      else if (_n0 == quote)
      {
        type = typeString;
        shift ();
        quote = 0;
        return true;
      }
    case typeNumber:
      if (is_dec_digit (_n0))
      {
        token += utf8_character (_n0);
        shift ();
      }
      else if (_n0 == '.')
      {
        type = typeDecimal;
        token += utf8_character (_n0);
        shift ();
      }
      else if (_n0 == 'e' || _n0 == 'E')
      {
        type = typeExponentIndicator;
        token += utf8_character (_n0);
        shift ();
      }
      else
      {
        return true;
      }
      break;
    case typeDecimal:
      if (is_dec_digit (_n0))
      {
        token += utf8_character (_n0);
        shift ();
      }
      else if (_n0 == 'e' || _n0 == 'E')
      {
        type = typeExponentIndicator;
        token += utf8_character (_n0);
        shift ();
      }
      else
      {
        return true;
      }
      break;
    case typeExponentIndicator:
      if (_n0 == '+' || _n0 == '-')
      {
        token += utf8_character (_n0);
        shift ();
      }
      else if (is_dec_digit (_n0))
      {
        type = typeExponent;
        token += utf8_character (_n0);
        shift ();
      }
      break;
    case typeExponent:
      if (is_dec_digit (_n0))
      {
        token += utf8_character (_n0);
        shift ();
      }
      else if (_n0 == '.')
      {
        token += utf8_character (_n0);
        shift ();
      }
      else
      {
        type = typeDecimal;
        return true;
      }
      break;
    case typeHex:
      if (is_hex_digit (_n0))
      {
        token += utf8_character (_n0);
        shift ();
      }
      else
      {
        return true;
      }
      break;
    default:
      throw std::string ("Unexpected error 2");
      break;
    }
    // Fence post.
    if (!_n0 && token != "")
      return true;
  }
  return false;
 }
 ////////////////////////////////////////////////////////////////////////////////
 void Lexer::ambiguity (bool value)
 {
@ -457,6 +744,19 @@ bool Lexer::is_ws (int c)
          c == 0x3000);    // ideographic space Common  Separator, space
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Split 'input' into 'words' on Lexer::is_ws boundaries, observing quotes.
 void Lexer::split (std::vector <std::string>& words, const std::string& input)
 {
  words.clear ();
  std::string word;
  Lexer::Type type;
  Lexer lex (input);
  while (lex.word (word, type))
    words.push_back (word);
 }
 ////////////////////////////////////////////////////////////////////////////////
 bool Lexer::is_punct (int c) const
 {
@ -593,8 +893,6 @@ void Lexer::shift ()
  _n1 = _n2;
  _n2 = _n3;
  _n3 = utf8_next_char (_input, _i);
  //std::cout << "# shift [" << (char) _n0 << (char) _n1 << (char) _n2 << (char) _n3 << "]\n";
 }
 ////////////////////////////////////////////////////////////////////////////////
--- a/src/Lexer.h
+++ b/src/Lexer.h
@ -58,10 +58,12 @@ public:
  Lexer& operator= (const Lexer&); // Not implemented.
  bool operator== (const Lexer&);  // Not implemented.
  bool token (std::string&, Type&);
  // Reads the next word into the first argument and its type into the second.
  // Returns false when no further word can be read.
  bool word (std::string&, Type&);
  void ambiguity (bool);
  static const std::string type_name (const Type&);
  static bool is_ws (int);
  // Clears the vector, then fills it with every word that Lexer::word yields
  // from the given string.
  static void split (std::vector <std::string>&, const std::string&);
 private:
  bool is_punct (int) const;
--- a/test/lexer.t.cpp
+++ b/test/lexer.t.cpp
@ -36,7 +36,7 @@ Context context;
 ////////////////////////////////////////////////////////////////////////////////
 int main (int argc, char** argv)
 {
-  UnitTest t (170);
+  UnitTest t (176);
  std::vector <std::pair <std::string, Lexer::Type> > tokens;
  std::string token;
@ -299,6 +299,17 @@ int main (int argc, char** argv)
  t.is (tokens[20].first,                     ")",                    "tokens[20] == ')'");
  t.is (tokens[20].second,                    Lexer::typeOperator,    "tokens[20] == typeOperator"); // 170
  // static void Lexer::split (std::vector <std::string>&, const std::string&);
  std::string unsplit = " ( A or B ) ";
  std::vector <std::string> items;
  Lexer::split (items, unsplit);
  t.is (items.size (), (size_t) 5, "split ' ( A or B ) '");
  t.is (items[0], "(",             "split ' ( A or B ) ' -> [0] '('");
  t.is (items[1], "A",             "split ' ( A or B ) ' -> [1] 'A'");
  t.is (items[2], "or",            "split ' ( A or B ) ' -> [2] 'or'");
  t.is (items[3], "B",             "split ' ( A or B ) ' -> [3] 'B'");
  t.is (items[4], ")",             "split ' ( A or B ) ' -> [4] ')'");
  return 0;
 }