From 31c145ef9ea8b783ec951c8d98fb6277fa5c2375 Mon Sep 17 00:00:00 2001
From: Paul Beckingham
Date: Sun, 20 Dec 2015 15:02:53 -0500
Subject: [PATCH] Lexer: Added string support

---
 src/Lexer.cpp    | 286 ++++++++++++++++++++++++++++++++++++++++++++++-
 src/Lexer.h      |  11 ++-
 test/lexer.t.cpp |  61 +++++++++-
 3 files changed, 355 insertions(+), 3 deletions(-)

diff --git a/src/Lexer.cpp b/src/Lexer.cpp
index 4a6d4015..75d52ca8 100644
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -51,7 +51,8 @@ bool Lexer::token (std::string& token, Lexer::Type& type)
   if (isEOS ())
     return false;
 
-  if (isWord (token, type))
+  if (isString (token, type, "'\"") ||
+      isWord (token, type))
     return true;
 
   return false;
@@ -99,6 +100,15 @@ bool Lexer::isWhitespace (int c)
           c == 0x3000);   // ideographic space Common Separator, space
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Digits 0-9 a-f A-F.
+bool Lexer::isHexDigit (int c)
+{
+  return (c >= '0' && c <= '9') ||
+         (c >= 'a' && c <= 'f') ||
+         (c >= 'A' && c <= 'F');
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 bool Lexer::isSingleCharOperator (int c)
 {
@@ -117,12 +127,60 @@ bool Lexer::isSingleCharOperator (int c)
          c == '~';    // Pattern match
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// True if right is EOS, or if either character is a parenthesis.
+bool Lexer::isHardBoundary (int left, int right)
+{
+  // EOS
+  if (right == '\0')
+    return true;
+
+  // FILTER operators that don't need to be surrounded by whitespace.
+  if (left  == '(' ||
+      left  == ')' ||
+      right == '(' ||
+      right == ')')
+    return true;
+
+  return false;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 bool Lexer::isEOS () const
 {
   return _cursor >= _eos;
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Converts '0'     -> 0
+//          '9'     -> 9
+//          'a'/'A' -> 10
+//          'f'/'F' -> 15
+// Assumes isHexDigit (c); any other value yields a meaningless result.
+int Lexer::hexToInt (int c)
+{
+       if (c >= '0' && c <= '9') return (c - '0');
+  else if (c >= 'a' && c <= 'f') return (c - 'a' + 10);
+  else                           return (c - 'A' + 10);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Combines two hex digits, most significant first.
+int Lexer::hexToInt (int c0, int c1)
+{
+  return (hexToInt (c0) << 4) + hexToInt (c1);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Combines four hex digits, most significant first.
+int Lexer::hexToInt (int c0, int c1, int c2, int c3)
+{
+  return (hexToInt (c0) << 12) +
+         (hexToInt (c1) << 8)  +
+         (hexToInt (c2) << 4)  +
+          hexToInt (c3);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 std::string Lexer::trimLeft (const std::string& in, const std::string& t /*= " "*/)
 {
@@ -149,6 +207,26 @@ std::string Lexer::trim (const std::string& in, const std::string& t /*= " "*/)
   return trimLeft (trimRight (in, t), t);
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Lexer::Type::string
+//   '|"
+//   [ U+XXXX | \uXXXX | \" | \' | \\ | \/ | \b | \f | \n | \r | \t | . ]
+//   '|"
+// On success sets type and advances _cursor past the string; on failure
+// _cursor is unchanged.
+bool Lexer::isString (std::string& token, Lexer::Type& type, const std::string& quotes)
+{
+  std::size_t marker = _cursor;
+  if (readWord (_text, quotes, marker, token))
+  {
+    type = Lexer::Type::string;
+    _cursor = marker;
+    return true;
+  }
+
+  return false;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Lexer::Type::word
 //   [^\s]+
@@ -173,3 +251,209 @@ bool Lexer::isWord (std::string& token, Lexer::Type& type)
 }
 
 ////////////////////////////////////////////////////////////////////////////////
+// Full implementation of a quoted word.  Includes:
+//   '\''
+//   '"'
+//   "'"
+//   "\""
+//   'one two'
+// Result includes the quotes.
+//
+// text    Input to read from.
+// quotes  Characters accepted as the opening quote.  The same character must
+//         also close the word.
+// cursor  Offset into text of the opening quote.  Advanced past what was read.
+// word    Receives the decoded word, with escapes and codepoints expanded.
+//
+// Returns true only if an unescaped closing quote was found.
+bool Lexer::readWord (
+  const std::string& text,
+  const std::string& quotes,
+  std::string::size_type& cursor,
+  std::string& word)
+{
+  std::string::size_type eos = text.length ();
+
+  // A quoted word must begin, before EOS, with one of the quote characters.
+  if (cursor >= eos ||
+      quotes.find (text[cursor]) == std::string::npos)
+    return false;
+
+  int quote = text[cursor++];
+  word = quote;
+
+  // Set only when a real, unescaped closing quote is consumed.
+  bool terminated = false;
+
+  int c = 0;
+  while (cursor < eos && (c = text[cursor]))
+  {
+    // Quoted word ends on a quote.
+    if (quote && quote == c)
+    {
+      word += utf8_character (utf8_next_char (text, cursor));
+      terminated = true;
+      break;
+    }
+
+    // Unicode U+XXXX or \uXXXX codepoint.
+    else if (eos - cursor >= 6 &&
+             ((text[cursor + 0] == 'U'  && text[cursor + 1] == '+') ||
+              (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) &&
+             isHexDigit (text[cursor + 2]) &&
+             isHexDigit (text[cursor + 3]) &&
+             isHexDigit (text[cursor + 4]) &&
+             isHexDigit (text[cursor + 5]))
+    {
+      word += utf8_character (
+                hexToInt (
+                  text[cursor + 2],
+                  text[cursor + 3],
+                  text[cursor + 4],
+                  text[cursor + 5]));
+      cursor += 6;
+    }
+
+    // An escaped thing.
+    else if (c == '\\')
+    {
+      // A trailing backslash has nothing to escape.  Keep it literally and
+      // stop, instead of reading beyond EOS.
+      if (cursor + 1 >= eos)
+      {
+        word += '\\';
+        ++cursor;
+        break;
+      }
+
+      c = text[++cursor];
+
+      switch (c)
+      {
+      case '"':  word += (char) 0x22; ++cursor; break;
+      case '\'': word += (char) 0x27; ++cursor; break;
+      case '\\': word += (char) 0x5C; ++cursor; break;
+      case 'b':  word += (char) 0x08; ++cursor; break;
+      case 'f':  word += (char) 0x0C; ++cursor; break;
+      case 'n':  word += (char) 0x0A; ++cursor; break;
+      case 'r':  word += (char) 0x0D; ++cursor; break;
+      case 't':  word += (char) 0x09; ++cursor; break;
+      case 'v':  word += (char) 0x0B; ++cursor; break;
+
+      // This pass-through default case means that anything can be escaped
+      // harmlessly. In particular 'quote' is included, if it is not one of the
+      // above characters.
+      default:   word += (char) c;    ++cursor; break;
+      }
+    }
+
+    // Ordinary character.
+    else
+      word += utf8_character (utf8_next_char (text, cursor));
+  }
+
+  // Verify termination.  Inspecting the last character of 'word' is not
+  // sufficient, because an escaped quote (\' or U+0027) may be the last
+  // character read before EOS.
+  return terminated;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Full implementation of an unquoted word.  Includes:
+//   one\ two
+//   abcU+0020def
+//   abc\u0020def
+//   a\tb
+//
+// Ends at:
+//   Lexer::isEOS
+//   Lexer::isWhitespace
+//   Lexer::isHardBoundary
+//
+// text    Input to read from.
+// cursor  Offset into text at which to start.  Advanced past what was read.
+// word    Receives the decoded word, with escapes and codepoints expanded.
+//
+// Returns true if word is non-empty.
+bool Lexer::readWord (
+  const std::string& text,
+  std::string::size_type& cursor,
+  std::string& word)
+{
+  std::string::size_type eos = text.length ();
+
+  word = "";
+  int c = 0;
+  int prev = 0;
+  while (cursor < eos && (c = text[cursor]))  // Handles EOS.
+  {
+    // Unquoted word ends on white space.
+    if (Lexer::isWhitespace (c))
+      break;
+
+    // Parentheses mostly.
+    if (prev && Lexer::isHardBoundary (prev, c))
+      break;
+
+    // Unicode U+XXXX or \uXXXX codepoint.
+    else if (eos - cursor >= 6 &&
+             ((text[cursor + 0] == 'U'  && text[cursor + 1] == '+') ||
+              (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) &&
+             isHexDigit (text[cursor + 2]) &&
+             isHexDigit (text[cursor + 3]) &&
+             isHexDigit (text[cursor + 4]) &&
+             isHexDigit (text[cursor + 5]))
+    {
+      word += utf8_character (
+                hexToInt (
+                  text[cursor + 2],
+                  text[cursor + 3],
+                  text[cursor + 4],
+                  text[cursor + 5]));
+      cursor += 6;
+    }
+
+    // An escaped thing.
+    else if (c == '\\')
+    {
+      // A trailing backslash has nothing to escape.  Keep it literally and
+      // stop, instead of reading beyond EOS.
+      if (cursor + 1 >= eos)
+      {
+        word += '\\';
+        ++cursor;
+        break;
+      }
+
+      c = text[++cursor];
+
+      switch (c)
+      {
+      case '"':  word += (char) 0x22; ++cursor; break;
+      case '\'': word += (char) 0x27; ++cursor; break;
+      case '\\': word += (char) 0x5C; ++cursor; break;
+      case 'b':  word += (char) 0x08; ++cursor; break;
+      case 'f':  word += (char) 0x0C; ++cursor; break;
+      case 'n':  word += (char) 0x0A; ++cursor; break;
+      case 'r':  word += (char) 0x0D; ++cursor; break;
+      case 't':  word += (char) 0x09; ++cursor; break;
+      case 'v':  word += (char) 0x0B; ++cursor; break;
+
+      // This pass-through default case means that anything can be escaped
+      // harmlessly. In particular 'quote' is included, if it is not one of the
+      // above characters.
+      default:   word += (char) c;    ++cursor; break;
+      }
+    }
+
+    // Ordinary character.
+    else
+      word += utf8_character (utf8_next_char (text, cursor));
+
+    prev = c;
+  }
+
+  return word.length () > 0;
+}
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/src/Lexer.h b/src/Lexer.h
index e3358fbd..a0a0428d 100644
--- a/src/Lexer.h
+++ b/src/Lexer.h
@@ -35,20 +35,29 @@
 class Lexer
 {
 public:
-  enum class Type { word };
+  enum class Type { string,
+                    word };
 
   Lexer (const std::string&);
   bool token (std::string&, Lexer::Type&);
 
   // Static helpers.
   static bool isWhitespace (int);
+  static bool isHexDigit (int);
   static bool isSingleCharOperator (int);
+  static bool isHardBoundary (int, int);
+  static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
+  static bool readWord (const std::string&, std::string::size_type&, std::string&);
+  static int hexToInt (int);
+  static int hexToInt (int, int);
+  static int hexToInt (int, int, int, int);
   static std::string trimLeft (const std::string& in, const std::string& t = " ");
   static std::string trimRight (const std::string& in, const std::string& t = " ");
   static std::string trim (const std::string& in, const std::string& t = " ");
 
   // Stream Classifiers.
   bool isEOS () const;
+  bool isString (std::string&, Lexer::Type&, const std::string&);
   bool isWord (std::string&, Lexer::Type&);
 
 private:
diff --git a/test/lexer.t.cpp b/test/lexer.t.cpp
index f5da0af2..7bc9ffee 100644
--- a/test/lexer.t.cpp
+++ b/test/lexer.t.cpp
@@ -34,7 +34,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 int main (int, char**)
 {
-  UnitTest t (50);
+  UnitTest t (78);
 
   std::vector <std::pair <std::string, Lexer::Type>> tokens;
   std::string token;
@@ -77,6 +77,65 @@ int main (int, char**)
   Lexer l1 (" \t ");
   t.notok (l1.token (token, type), "' \\t ' --> no tokens");
 
+
+  // static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
+  std::string::size_type cursor = 0;
+  std::string word;
+  t.ok (Lexer::readWord ("'one two'", "'\"", cursor, word), "readWord ''one two'' --> true");
+  t.is (word, "'one two'", " word '" + word + "'");
+  t.is ((int)cursor, 9, " cursor");
+
+  // Unterminated quoted string is invalid.
+  cursor = 0;
+  t.notok (Lexer::readWord ("'one", "'\"", cursor, word), "readWord ''one' --> false");
+
+  // An escaped quote does not terminate a quoted string.
+  cursor = 0;
+  t.notok (Lexer::readWord ("'one\\'", "'\"", cursor, word), "readWord ''one\\'' --> false");
+
+  // static bool readWord (const std::string&, std::string::size_type&, std::string&);
+  cursor = 0;
+  t.ok (Lexer::readWord ("input", cursor, word), "readWord 'input' --> true");
+  t.is (word, "input", " word '" + word + "'");
+  t.is ((int)cursor, 5, " cursor");
+
+  cursor = 0;
+  t.ok (Lexer::readWord ("one\\ two", cursor, word), "readWord 'one\\ two' --> true");
+  t.is (word, "one two", " word '" + word + "'");
+  t.is ((int)cursor, 8, " cursor");
+
+  cursor = 0;
+  t.ok (Lexer::readWord ("\\u20A43", cursor, word), "readWord '\\u20A43' --> true");
+  t.is (word, "₤3", " word '" + word + "'");
+  t.is ((int)cursor, 7, " cursor");
+
+  cursor = 0;
+  t.ok (Lexer::readWord ("U+20AC4", cursor, word), "readWord 'U+20AC4' --> true");
+  t.is (word, "€4", " word '" + word + "'");
+  t.is ((int)cursor, 7, " cursor");
+
+  std::string text = "one 'two' three\\ four";
+  cursor = 0;
+  t.ok (Lexer::readWord (text, cursor, word), "readWord \"one 'two' three\\ four\" --> true");
+  t.is (word, "one", " word '" + word + "'");
+  cursor++;
+  t.ok (Lexer::readWord (text, cursor, word), "readWord \"one 'two' three\\ four\" --> true");
+  t.is (word, "'two'", " word '" + word + "'");
+  cursor++;
+  t.ok (Lexer::readWord (text, cursor, word), "readWord \"one 'two' three\\ four\" --> true");
+  t.is (word, "three four", " word '" + word + "'");
+
+  text = "one ";
+  cursor = 0;
+  t.ok (Lexer::readWord (text, cursor, word), "readWord \"one \" --> true");
+  t.is (word, "one", " word '" + word + "'");
+
+  // A trailing backslash is kept literally, without reading beyond EOS.
+  cursor = 0;
+  t.ok (Lexer::readWord ("one\\", cursor, word), "readWord 'one\\' --> true");
+  t.is (word, "one\\", " word '" + word + "'");
+  t.is ((int)cursor, 4, " cursor");
+
   // std::string Lexer::trimLeft (const std::string& in, const std::string&)
   t.is (Lexer::trimLeft (""), "", "Lexer::trimLeft '' -> ''");
   t.is (Lexer::trimLeft (" "), "", "Lexer::trimLeft ' ' -> ''");