Lexer:: Added polymorphic ::readWord for quotes and unquoted strings

2025-07-07 20:06:36 +02:00 · 2015-07-06 16:37:03 -04:00 · 2015-07-06 16:37:03 -04:00 · 7a6d546a0d
commit 7a6d546a0d
parent abaf326855
3 changed files with 80 additions and 15 deletions
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@ -1222,16 +1222,12 @@ bool Lexer::isOneWord (const std::string& text)
 }

 ////////////////////////////////////////////////////////////////////////////////
-// Full implementation of a word.  Includes:
-//   one\ two
+// Full implementation of a quoted word.  Includes:
 //   '\''
 //   '"'
 //   "'"
 //   "\""
 //   'one two'
-//   abcU+0020def
-//   abc\u0020def
-//   a\tb
 bool Lexer::readWord (
  const std::string& text,
  const std::string& quotes,
@ -1255,8 +1251,75 @@ bool Lexer::readWord (
      break;
    }

+    // Unicode U+XXXX or \uXXXX codepoint.
+    else if (eos - cursor >= 6 &&
+             ((text[cursor + 0] == 'U'  && text[cursor + 1] == '+') ||
+              (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) &&
+             isHexDigit (text[cursor + 2]) &&
+             isHexDigit (text[cursor + 3]) &&
+             isHexDigit (text[cursor + 4]) &&
+             isHexDigit (text[cursor + 5]))
+    {
+      word += utf8_character (
+                hexToInt (
+                  text[cursor + 2],
+                  text[cursor + 3],
+                  text[cursor + 4],
+                  text[cursor + 5]));
+      cursor += 6;
+    }
+
+    // An escaped thing.
+    else if (c == '\\')
+    {
+      c = text[++cursor];
+
+      switch (c)
+      {
+      case '"':  word += (char) 0x22; ++cursor; break;
+      case '\'': word += (char) 0x27; ++cursor; break;
+      case '\\': word += (char) 0x5C; ++cursor; break;
+      case 'b':  word += (char) 0x08; ++cursor; break;
+      case 'f':  word += (char) 0x0C; ++cursor; break;
+      case 'n':  word += (char) 0x0A; ++cursor; break;
+      case 'r':  word += (char) 0x0D; ++cursor; break;
+      case 't':  word += (char) 0x09; ++cursor; break;
+      case 'v':  word += (char) 0x0B; ++cursor; break;
+
+      // This pass-through default case means that anything can be escaped
+      // harmlessly. In particular 'quote' is included, if it is not one of the
+      // above characters.
+      default:   word += (char) c;    ++cursor; break;
+      }
+    }
+
+    // Ordinary character.
+    else
+      word += utf8_character (utf8_next_char (text, cursor));
+  }
+
+  return word.length () > 0 ? true : false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Full implementation of an unquoted word.  Includes:
+//   one\ two
+//   abcU+0020def
+//   abc\u0020def
+//   a\tb
+bool Lexer::readWord (
+  const std::string& text,
+  std::string::size_type& cursor,
+  std::string& word)
+{
+  std::string::size_type eos = text.length ();
+
+  word = "";
+  int c;
+  while ((c = text[cursor]))
+  {
    // Unquoted word ends on white space.
-    if (! quote && Lexer::isWhitespace (c))
+    if (Lexer::isWhitespace (c))
    {
      ++cursor;
      break;
--- a/src/Lexer.h
+++ b/src/Lexer.h
@ -76,6 +76,7 @@ public:
  static void dequote               (std::string&);
  static bool wasQuoted             (const std::string&);
  static bool readWord              (const std::string&, const std::string&, std::string::size_type&, std::string&);
+  static bool readWord              (const std::string&, std::string::size_type&, std::string&);
  static bool decomposePair         (const std::string&, std::string&, std::string&, std::string&, std::string&);
  static int hexToInt               (int);
  static int hexToInt               (int, int);