Lexer:: Added polymorphic ::readWord for quoted and unquoted strings

2025-08-25 12:17:20 +02:00 · 2015-07-06 16:37:03 -04:00 · 2015-07-06 16:37:03 -04:00 · 7a6d546a0d
commit 7a6d546a0d
parent abaf326855
3 changed files with 80 additions and 15 deletions
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@ -1222,16 +1222,12 @@ bool Lexer::isOneWord (const std::string& text)
 }
 ////////////////////////////////////////////////////////////////////////////////
-// Full implementation of a word.  Includes:
+// Full implementation of a quoted word.  Includes:
 //   one\ two
 //   '\''
 //   '"'
 //   "'"
 //   "\""
 //   'one two'
 //   abcU+0020def
 //   abc\u0020def
 //   a\tb
 bool Lexer::readWord (
  const std::string& text,
  const std::string& quotes,
@ -1255,8 +1251,75 @@ bool Lexer::readWord (
      break;
    }
    // Unicode U+XXXX or \uXXXX codepoint.
    else if (eos - cursor >= 6 &&
             ((text[cursor + 0] == 'U'  && text[cursor + 1] == '+') ||
              (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) &&
             isHexDigit (text[cursor + 2]) &&
             isHexDigit (text[cursor + 3]) &&
             isHexDigit (text[cursor + 4]) &&
             isHexDigit (text[cursor + 5]))
    {
      word += utf8_character (
                hexToInt (
                  text[cursor + 2],
                  text[cursor + 3],
                  text[cursor + 4],
                  text[cursor + 5]));
      cursor += 6;
    }
    // An escaped thing.
    else if (c == '\\')
    {
      c = text[++cursor];
      switch (c)
      {
      case '"':  word += (char) 0x22; ++cursor; break;
      case '\'': word += (char) 0x27; ++cursor; break;
      case '\\': word += (char) 0x5C; ++cursor; break;
      case 'b':  word += (char) 0x08; ++cursor; break;
      case 'f':  word += (char) 0x0C; ++cursor; break;
      case 'n':  word += (char) 0x0A; ++cursor; break;
      case 'r':  word += (char) 0x0D; ++cursor; break;
      case 't':  word += (char) 0x09; ++cursor; break;
      case 'v':  word += (char) 0x0B; ++cursor; break;
      // This pass-through default case means that anything can be escaped
      // harmlessly. In particular 'quote' is included, if it is not one of the
      // above characters.
      default:   word += (char) c;    ++cursor; break;
      }
    }
    // Ordinary character.
    else
      word += utf8_character (utf8_next_char (text, cursor));
  }
  return word.length () > 0 ? true : false;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Full implementation of an unquoted word.  Includes:
 //   one\ two
 //   abcU+0020def
 //   abc\u0020def
 //   a\tb
 bool Lexer::readWord (
  const std::string& text,
  std::string::size_type& cursor,
  std::string& word)
 {
  std::string::size_type eos = text.length ();
  word = "";
  int c;
  while ((c = text[cursor]))
  {
    // Unquoted word ends on white space.
-    if (! quote && Lexer::isWhitespace (c))
+    if (Lexer::isWhitespace (c))
    {
      ++cursor;
      break;
--- a/src/Lexer.h
+++ b/src/Lexer.h
@ -76,6 +76,7 @@ public:
  static void dequote               (std::string&);
  static bool wasQuoted             (const std::string&);
  static bool readWord              (const std::string&, const std::string&, std::string::size_type&, std::string&);
  static bool readWord              (const std::string&, std::string::size_type&, std::string&);
  static bool decomposePair         (const std::string&, std::string&, std::string&, std::string&, std::string&);
  static int hexToInt               (int);
  static int hexToInt               (int, int);
--- a/test/lexer.t.cpp
+++ b/test/lexer.t.cpp
@ -214,33 +214,34 @@ int main (int argc, char** argv)
  // static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
  std::string::size_type cursor = 0;
  std::string word;
  t.ok (Lexer::readWord ("input", "'\"", cursor, word),     "readWord 'input' --> true");
  t.is (word, "input",                                      "  word '" + word + "'");
  t.is ((int)cursor, 5,                                     "  cursor");
  cursor = 0;
  t.ok (Lexer::readWord ("'one two'", "'\"", cursor, word), "readWord ''one two'' --> true");
  t.is (word, "one two",                                    "  word '" + word + "'");
  t.is ((int)cursor, 9,                                     "  cursor");
  // static bool readWord (const std::string&, std::string::size_type&, std::string&);
  cursor = 0;
-  t.ok (Lexer::readWord ("one\\ two", "'\"", cursor, word), "readWord 'one\\ two' --> true");
+  t.ok (Lexer::readWord ("input", cursor, word),            "readWord 'input' --> true");
  t.is (word, "input",                                      "  word '" + word + "'");
  t.is ((int)cursor, 5,                                     "  cursor");
  cursor = 0;
  t.ok (Lexer::readWord ("one\\ two", cursor, word),        "readWord 'one\\ two' --> true");
  t.is (word, "one two",                                    "  word '" + word + "'");
  t.is ((int)cursor, 8,                                     "  cursor");
  cursor = 0;
-  t.ok (Lexer::readWord ("\\u20A43", "'\"", cursor, word),  "readWord '\\u20A43' --> true");
+  t.ok (Lexer::readWord ("\\u20A43", cursor, word),         "readWord '\\u20A43' --> true");
  t.is (word, "₤3",                                         "  word '" + word + "'");
  t.is ((int)cursor, 7,                                     "  cursor");
  cursor = 0;
-  t.ok (Lexer::readWord ("U+20AC4", "'\"", cursor, word),   "readWord '\\u20AC4' --> true");
+  t.ok (Lexer::readWord ("U+20AC4", cursor, word),          "readWord '\\u20AC4' --> true");
  t.is (word, "€4",                                         "  word '" + word + "'");
  t.is ((int)cursor, 7,                                     "  cursor");
  std::string text = "one 'two' three\\ four";
  cursor = 0;
-  while (Lexer::readWord (text, "'\"", cursor, word))
+  while (Lexer::readWord (text, cursor, word))
  {
    t.diag ("'" + word + "'");
    while (Lexer::isWhitespace(text[cursor]))