diff --git a/src/Lexer.cpp b/src/Lexer.cpp index bfdc0ae5a..89265cd7e 100644 --- a/src/Lexer.cpp +++ b/src/Lexer.cpp @@ -1222,16 +1222,12 @@ bool Lexer::isOneWord (const std::string& text) } //////////////////////////////////////////////////////////////////////////////// -// Full implementation of a word. Includes: -// one\ two +// Full implementation of a quoted word. Includes: // '\'' // '"' // "'" // "\"" // 'one two' -// abcU+0020def -// abc\u0020def -// a\tb bool Lexer::readWord ( const std::string& text, const std::string& quotes, @@ -1255,8 +1251,75 @@ bool Lexer::readWord ( break; } + // Unicode U+XXXX or \uXXXX codepoint. + else if (eos - cursor >= 6 && + ((text[cursor + 0] == 'U' && text[cursor + 1] == '+') || + (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) && + isHexDigit (text[cursor + 2]) && + isHexDigit (text[cursor + 3]) && + isHexDigit (text[cursor + 4]) && + isHexDigit (text[cursor + 5])) + { + word += utf8_character ( + hexToInt ( + text[cursor + 2], + text[cursor + 3], + text[cursor + 4], + text[cursor + 5])); + cursor += 6; + } + + // An escaped thing. + else if (c == '\\') + { + c = text[++cursor]; + + switch (c) + { + case '"': word += (char) 0x22; ++cursor; break; + case '\'': word += (char) 0x27; ++cursor; break; + case '\\': word += (char) 0x5C; ++cursor; break; + case 'b': word += (char) 0x08; ++cursor; break; + case 'f': word += (char) 0x0C; ++cursor; break; + case 'n': word += (char) 0x0A; ++cursor; break; + case 'r': word += (char) 0x0D; ++cursor; break; + case 't': word += (char) 0x09; ++cursor; break; + case 'v': word += (char) 0x0B; ++cursor; break; + + // This pass-through default case means that anything can be escaped + // harmlessly. In particular 'quote' is included, if it is not one of the + // above characters. + default: word += (char) c; ++cursor; break; + } + } + + // Ordinary character. + else + word += utf8_character (utf8_next_char (text, cursor)); + } + + return word.length () > 0 ? 
true : false; +} + +//////////////////////////////////////////////////////////////////////////////// +// Full implementation of an unquoted word. Includes: +// one\ two +// abcU+0020def +// abc\u0020def +// a\tb +bool Lexer::readWord ( + const std::string& text, + std::string::size_type& cursor, + std::string& word) +{ + std::string::size_type eos = text.length (); + + word = ""; + int c; + while ((c = text[cursor])) + { // Unquoted word ends on white space. - if (! quote && Lexer::isWhitespace (c)) + if (Lexer::isWhitespace (c)) { ++cursor; break; diff --git a/src/Lexer.h b/src/Lexer.h index 6cc84654e..6fa7ee625 100644 --- a/src/Lexer.h +++ b/src/Lexer.h @@ -76,6 +76,7 @@ public: static void dequote (std::string&); static bool wasQuoted (const std::string&); static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&); + static bool readWord (const std::string&, std::string::size_type&, std::string&); static bool decomposePair (const std::string&, std::string&, std::string&, std::string&, std::string&); static int hexToInt (int); static int hexToInt (int, int); diff --git a/test/lexer.t.cpp b/test/lexer.t.cpp index bc9cae7db..c645aacb7 100644 --- a/test/lexer.t.cpp +++ b/test/lexer.t.cpp @@ -214,33 +214,34 @@ int main (int argc, char** argv) // static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&); std::string::size_type cursor = 0; std::string word; - t.ok (Lexer::readWord ("input", "'\"", cursor, word), "readWord 'input' --> true"); - t.is (word, "input", " word '" + word + "'"); - t.is ((int)cursor, 5, " cursor"); - - cursor = 0; t.ok (Lexer::readWord ("'one two'", "'\"", cursor, word), "readWord ''one two'' --> true"); t.is (word, "one two", " word '" + word + "'"); t.is ((int)cursor, 9, " cursor"); + // static bool readWord (const std::string&, std::string::size_type&, std::string&); cursor = 0; - t.ok (Lexer::readWord ("one\\ two", "'\"", cursor, word), "readWord 'one\\ 
two' --> true"); + t.ok (Lexer::readWord ("input", cursor, word), "readWord 'input' --> true"); + t.is (word, "input", " word '" + word + "'"); + t.is ((int)cursor, 5, " cursor"); + + cursor = 0; + t.ok (Lexer::readWord ("one\\ two", cursor, word), "readWord 'one\\ two' --> true"); t.is (word, "one two", " word '" + word + "'"); t.is ((int)cursor, 8, " cursor"); cursor = 0; - t.ok (Lexer::readWord ("\\u20A43", "'\"", cursor, word), "readWord '\\u20A43' --> true"); + t.ok (Lexer::readWord ("\\u20A43", cursor, word), "readWord '\\u20A43' --> true"); t.is (word, "₤3", " word '" + word + "'"); t.is ((int)cursor, 7, " cursor"); cursor = 0; - t.ok (Lexer::readWord ("U+20AC4", "'\"", cursor, word), "readWord '\\u20AC4' --> true"); + t.ok (Lexer::readWord ("U+20AC4", cursor, word), "readWord '\\u20AC4' --> true"); t.is (word, "€4", " word '" + word + "'"); t.is ((int)cursor, 7, " cursor"); std::string text = "one 'two' three\\ four"; cursor = 0; - while (Lexer::readWord (text, "'\"", cursor, word)) + while (Lexer::readWord (text, cursor, word)) { t.diag ("'" + word + "'"); while (Lexer::isWhitespace(text[cursor]))