Lexer: Added string support

2025-06-26 10:54:28 +02:00 · 2015-12-20 15:02:53 -05:00 · 2015-12-20 15:02:53 -05:00 · 31c145ef9e
commit 31c145ef9e
parent d236315450
3 changed files with 300 additions and 3 deletions
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@ -51,7 +51,8 @@ bool Lexer::token (std::string& token, Lexer::Type& type)
  if (isEOS ())
    return false;
-  if (isWord (token, type))
+  if (isString (token, type, "'\"") ||
      isWord   (token, type))
    return true;
  return false;
@ -99,6 +100,15 @@ bool Lexer::isWhitespace (int c)
          c == 0x3000);    // ideographic space Common  Separator, space
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Reports whether c is one of the ASCII hexadecimal digits 0-9, a-f or A-F.
 bool Lexer::isHexDigit (int c)
 {
  // Decimal digits.
  if (c >= '0' && c <= '9')
    return true;
  // Lower-case letters.
  if (c >= 'a' && c <= 'f')
    return true;
  // Upper-case letters are the only remaining possibility.
  return c >= 'A' && c <= 'F';
 }
 ////////////////////////////////////////////////////////////////////////////////
 bool Lexer::isSingleCharOperator (int c)
 {
@ -117,12 +127,56 @@ bool Lexer::isSingleCharOperator (int c)
         c == '~';    // Pattern match
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Reports whether a word has to end between the adjacent characters 'left'
 // and 'right' even though no whitespace separates them.
 bool Lexer::isHardBoundary (int left, int right)
 {
  // EOS: the terminating NUL on the right always closes the word.
  const bool atEnd = (right == '\0');
  // FILTER operators that don't need to be surrounded by whitespace.
  const bool besideParen = (left  == '(' || left  == ')' ||
                            right == '(' || right == ')');
  return atEnd || besideParen;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // True once the cursor has reached (or moved past) the end-of-string offset,
 // i.e. there is nothing left to tokenize.
 bool Lexer::isEOS () const
 {
  const bool inputRemains = _cursor < _eos;
  return ! inputRemains;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Maps a single hexadecimal digit character to its numeric value:
 //   '0'..'9'            -> 0..9
 //   'a'..'f' / 'A'..'F' -> 10..15
 // The argument is not validated; anything that is neither a decimal digit
 // nor a lower-case a-f is treated as if it were an upper-case letter, so
 // callers are expected to check with isHexDigit first.
 int Lexer::hexToInt (int c)
 {
  if (c >= '0' && c <= '9')
    return c - '0';
  // Letters: pick the base of the matching case, then offset by ten.
  const int base = (c >= 'a' && c <= 'f') ? 'a' : 'A';
  return c - base + 10;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Combines two hexadecimal digit characters into one value; c0 supplies the
 // high nibble and c1 the low nibble.
 int Lexer::hexToInt (int c0, int c1)
 {
  const int high = hexToInt (c0);
  const int low  = hexToInt (c1);
  return (high << 4) + low;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Combines four hexadecimal digit characters into one value; c0 is the most
 // significant digit and c3 the least significant.
 int Lexer::hexToInt (int c0, int c1, int c2, int c3)
 {
  const int digit3 = hexToInt (c0) << 12;
  const int digit2 = hexToInt (c1) << 8;
  const int digit1 = hexToInt (c2) << 4;
  const int digit0 = hexToInt (c3);
  return digit3 + digit2 + digit1 + digit0;
 }
 ////////////////////////////////////////////////////////////////////////////////
 std::string Lexer::trimLeft (const std::string& in, const std::string& t /*= " "*/)
 {
@ -149,6 +203,24 @@ std::string Lexer::trim (const std::string& in, const std::string& t /*= " "*/)
  return trimLeft (trimRight (in, t), t);
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Lexer::Type::string
 //   '|"
 //   [ U+XXXX | \uXXXX | \" | \' | \\ | \/ | \b | \f | \n | \r | \t | . ]
 //   '|"
 //
 // Tries to read a quoted word starting at the current cursor, accepting any
 // character in 'quotes' as the delimiter. On success 'token' holds the word,
 // 'type' is set to Lexer::Type::string and the cursor moves past the word.
 // On failure the cursor is left where it was.
 bool Lexer::isString (std::string& token, Lexer::Type& type, const std::string& quotes)
 {
  // Scan with a scratch position so a failed read cannot move the cursor.
  std::size_t position = _cursor;
  if (! readWord (_text, quotes, position, token))
    return false;
  type = Lexer::Type::string;
  _cursor = position;
  return true;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Lexer::Type::word
 //   [^\s]+
@ -173,3 +245,170 @@ bool Lexer::isWord (std::string& token, Lexer::Type& type)
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Full implementation of a quoted word.  Includes:
 //   '\''
 //   '"'
 //   "'"
 //   "\""
 //   'one two'
 // Result includes the quotes.
 //
 //   text    Input being scanned.
 //   quotes  Characters accepted as the opening quote. The same character
 //           must close the word.
 //   cursor  In: offset of the opening quote. Out: on success, the offset just
 //           past the closing quote. On failure it is left wherever scanning
 //           stopped, so callers that need to retry should pass a scratch copy.
 //   word    Receives the decoded word, including both quotes.
 //
 // Returns true only if an unescaped closing quote was found. A quote produced
 // by an escape sequence (\' or U+0027) is content and never terminates the
 // word, and a backslash as the very last character leaves it unterminated.
 bool Lexer::readWord (
  const std::string& text,
  const std::string& quotes,
  std::string::size_type& cursor,
  std::string& word)
 {
  const std::string::size_type eos = text.length ();
  // Nothing to read at or beyond the end; this also keeps text[cursor] in
  // bounds for the quote test below.
  if (cursor >= eos ||
      quotes.find (text[cursor]) == std::string::npos)
    return false;
  const int quote = text[cursor++];
  word = static_cast <char> (quote);
  // Set only by a real, unescaped closing quote. Inspecting the last character
  // of 'word' is not sufficient, because an escaped quote looks identical
  // once decoded.
  bool terminated = false;
  int c = 0;
  while (cursor < eos && (c = text[cursor]) != 0)
  {
    // Quoted word ends on a quote.
    if (c == quote)
    {
      word += utf8_character (utf8_next_char (text, cursor));
      terminated = true;
      break;
    }
    // Unicode U+XXXX or \uXXXX codepoint.
    else if (eos - cursor >= 6 &&
             ((text[cursor + 0] == 'U'  && text[cursor + 1] == '+') ||
              (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) &&
             isHexDigit (text[cursor + 2]) &&
             isHexDigit (text[cursor + 3]) &&
             isHexDigit (text[cursor + 4]) &&
             isHexDigit (text[cursor + 5]))
    {
      word += utf8_character (
                hexToInt (
                  text[cursor + 2],
                  text[cursor + 3],
                  text[cursor + 4],
                  text[cursor + 5]));
      cursor += 6;
    }
    // An escaped thing.
    else if (c == '\\')
    {
      // A backslash with nothing after it cannot be followed by a closing
      // quote. Stop here rather than stepping the cursor past the end.
      if (++cursor >= eos)
        break;
      c = text[cursor];
      char decoded;
      switch (c)
      {
      case 'b':  decoded = '\b'; break;
      case 'f':  decoded = '\f'; break;
      case 'n':  decoded = '\n'; break;
      case 'r':  decoded = '\r'; break;
      case 't':  decoded = '\t'; break;
      case 'v':  decoded = '\v'; break;
      // This pass-through default case means that anything can be escaped
      // harmlessly. In particular 'quote' is included, as are '"', '\'' and
      // '\\', which simply stand for themselves.
      default:   decoded = static_cast <char> (c); break;
      }
      word += decoded;
      ++cursor;
    }
    // Ordinary character.
    else
      word += utf8_character (utf8_next_char (text, cursor));
  }
  // Verify termination.
  return terminated;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Full implementation of an unquoted word.  Includes:
 //   one\ two
 //   abcU+0020def
 //   abc\u0020def
 //   a\tb
 //
 // Ends at:
 //   Lexer::isEOS
 //   Lexer::isWhitespace
 //   Lexer::isHardBoundary
 //
 //   text    Input being scanned.
 //   cursor  In: offset at which to start reading. Out: offset of the first
 //           character that is not part of the word, never beyond the end of
 //           'text'.
 //   word    Receives the decoded word. It is cleared first.
 //
 // Returns true if at least one character was read. A backslash that is the
 // very last character of the input has nothing to escape and is kept as a
 // literal backslash.
 bool Lexer::readWord (
  const std::string& text,
  std::string::size_type& cursor,
  std::string& word)
 {
  const std::string::size_type eos = text.length ();
  word.clear ();
  int c = 0;
  int prev = 0;
  // The explicit bound keeps text[cursor] valid even if the caller passes a
  // cursor beyond the end. The NUL test additionally stops at an embedded NUL.
  while (cursor < eos && (c = text[cursor]) != 0)
  {
    // Unquoted word ends on white space.
    if (Lexer::isWhitespace (c))
      break;
    // Parentheses mostly.
    if (prev && Lexer::isHardBoundary (prev, c))
      break;
    // Unicode U+XXXX or \uXXXX codepoint.
    if (eos - cursor >= 6 &&
        ((text[cursor + 0] == 'U'  && text[cursor + 1] == '+') ||
         (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) &&
        isHexDigit (text[cursor + 2]) &&
        isHexDigit (text[cursor + 3]) &&
        isHexDigit (text[cursor + 4]) &&
        isHexDigit (text[cursor + 5]))
    {
      word += utf8_character (
                hexToInt (
                  text[cursor + 2],
                  text[cursor + 3],
                  text[cursor + 4],
                  text[cursor + 5]));
      cursor += 6;
    }
    // An escaped thing.
    else if (c == '\\')
    {
      // Trailing backslash: keep it literally and stop at the end of input,
      // instead of appending a NUL and stepping the cursor out of bounds.
      if (++cursor >= eos)
      {
        word += '\\';
        break;
      }
      c = text[cursor];
      char decoded;
      switch (c)
      {
      case 'b':  decoded = '\b'; break;
      case 'f':  decoded = '\f'; break;
      case 'n':  decoded = '\n'; break;
      case 'r':  decoded = '\r'; break;
      case 't':  decoded = '\t'; break;
      case 'v':  decoded = '\v'; break;
      // This pass-through default case means that anything can be escaped
      // harmlessly. In particular quotes, the backslash itself and a space
      // simply stand for themselves.
      default:   decoded = static_cast <char> (c); break;
      }
      word += decoded;
      ++cursor;
    }
    // Ordinary character.
    else
      word += utf8_character (utf8_next_char (text, cursor));
    // Remembered so the next iteration can test for a hard boundary.
    prev = c;
  }
  return ! word.empty ();
 }
 ////////////////////////////////////////////////////////////////////////////////
--- a/src/Lexer.h
+++ b/src/Lexer.h
@ -35,20 +35,29 @@
 class Lexer
 {
 public:
-  enum class Type { word };
+  enum class Type { string,
                    word };
  Lexer (const std::string&);
  bool token (std::string&, Lexer::Type&);
  // Static helpers.
  static bool isWhitespace                   (int);
  static bool isHexDigit                     (int);
  static bool isSingleCharOperator           (int);
  static bool isHardBoundary                 (int, int);
  static bool readWord                       (const std::string&, const std::string&, std::string::size_type&, std::string&);
  static bool readWord                       (const std::string&, std::string::size_type&, std::string&);
  static int hexToInt                        (int);
  static int hexToInt                        (int, int);
  static int hexToInt                        (int, int, int, int);
  static std::string trimLeft                (const std::string& in, const std::string& t = " ");
  static std::string trimRight               (const std::string& in, const std::string& t = " ");
  static std::string trim                    (const std::string& in, const std::string& t = " ");
  // Stream Classifiers.
  bool isEOS          () const;
  bool isString       (std::string&, Lexer::Type&, const std::string&);
  bool isWord         (std::string&, Lexer::Type&);
 private:
--- a/test/lexer.t.cpp
+++ b/test/lexer.t.cpp
@ -34,7 +34,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 int main (int, char**)
 {
-  UnitTest t (50);
+  UnitTest t (74);
  std::vector <std::pair <std::string, Lexer::Type>> tokens;
  std::string token;
@ -77,6 +77,55 @@ int main (int, char**)
  Lexer l1 ("       \t ");
  t.notok (l1.token (token, type), "'       \\t ' --> no tokens");
  // static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
  std::string::size_type cursor = 0;
  std::string word;
  t.ok (Lexer::readWord ("'one two'", "'\"", cursor, word), "readWord ''one two'' --> true");
  t.is (word, "'one two'",                                  "  word '" + word + "'");
  t.is ((int)cursor, 9,                                     "  cursor");
  // Unterminated quoted string is invalid.
  cursor = 0;
  t.notok (Lexer::readWord ("'one", "'\"", cursor, word),   "readWord ''one' --> false");
  // static bool readWord (const std::string&, std::string::size_type&, std::string&);
  cursor = 0;
  t.ok (Lexer::readWord ("input", cursor, word),            "readWord 'input' --> true");
  t.is (word, "input",                                      "  word '" + word + "'");
  t.is ((int)cursor, 5,                                     "  cursor");
  cursor = 0;
  t.ok (Lexer::readWord ("one\\ two", cursor, word),        "readWord 'one\\ two' --> true");
  t.is (word, "one two",                                    "  word '" + word + "'");
  t.is ((int)cursor, 8,                                     "  cursor");
  cursor = 0;
  t.ok (Lexer::readWord ("\\u20A43", cursor, word),         "readWord '\\u20A43' --> true");
  t.is (word, "₤3",                                         "  word '" + word + "'");
  t.is ((int)cursor, 7,                                     "  cursor");
  cursor = 0;
  t.ok (Lexer::readWord ("U+20AC4", cursor, word),          "readWord '\\u20AC4' --> true");
  t.is (word, "€4",                                         "  word '" + word + "'");
  t.is ((int)cursor, 7,                                     "  cursor");
  std::string text = "one 'two' three\\ four";
  cursor = 0;
  t.ok (Lexer::readWord (text, cursor, word),               "readWord \"one 'two' three\\ four\" --> true");
  t.is (word, "one",                                        "  word '" + word + "'");
  cursor++;
  t.ok (Lexer::readWord (text, cursor, word),               "readWord \"one 'two' three\\ four\" --> true");
  t.is (word, "'two'",                                      "  word '" + word + "'");
  cursor++;
  t.ok (Lexer::readWord (text, cursor, word),               "readWord \"one 'two' three\\ four\" --> true");
  t.is (word, "three four",                                 "  word '" + word + "'");
  text = "one     ";
  cursor = 0;
  t.ok (Lexer::readWord (text, cursor, word),               "readWord \"one     \" --> true");
  t.is (word, "one",                                        "  word '" + word + "'");
  // std::string Lexer::trimLeft (const std::string& in, const std::string&)
  t.is (Lexer::trimLeft (""),                     "",            "Lexer::trimLeft '' -> ''");
  t.is (Lexer::trimLeft ("   "),                  "",            "Lexer::trimLeft '   ' -> ''");