diff --git a/src/Lexer.cpp b/src/Lexer.cpp
index da298726..7cb0e5e4 100644
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -52,6 +52,7 @@ bool Lexer::token (std::string& token, Lexer::Type& type)
     return false;
 
   if (isString (token, type, "'\"") ||
+      isURL (token, type) ||
       isHexNumber (token, type) ||
       isNumber (token, type) ||
       isPattern (token, type) ||
@@ -70,6 +71,7 @@ const std::string Lexer::typeName (const Lexer::Type& type)
   case Lexer::Type::number: return "number";
   case Lexer::Type::hex: return "hex";
   case Lexer::Type::string: return "string";
+  case Lexer::Type::url: return "url";
   case Lexer::Type::pattern: return "pattern";
   case Lexer::Type::word: return "word";
   }
@@ -393,6 +395,57 @@ bool Lexer::isWord (std::string& token, Lexer::Type& type)
   return false;
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Lexer::Type::url
+//   http [s] :// ...
+//
+// Recognizes a case-insensitive "http" or "https" scheme followed by "://" and
+// at least one non-whitespace character. The token then runs up to the next
+// whitespace character or _eos, whichever comes first.
+//
+// On a match, token receives the matched text, type is set to
+// Lexer::Type::url, _cursor is moved past the token and true is returned.
+// Otherwise token, type and _cursor are left untouched and false is returned.
+bool Lexer::isURL (std::string& token, Lexer::Type& type)
+{
+  std::size_t marker = _cursor;
+
+  if (_eos - _cursor >= 8 &&    // length 'http://*'
+      (_text[marker + 0] == 'h' || _text[marker + 0] == 'H') &&
+      (_text[marker + 1] == 't' || _text[marker + 1] == 'T') &&
+      (_text[marker + 2] == 't' || _text[marker + 2] == 'T') &&
+      (_text[marker + 3] == 'p' || _text[marker + 3] == 'P'))
+  {
+    marker += 4;
+    if (_text[marker + 0] == 's' || _text[marker + 0] == 'S')
+      ++marker;
+
+    // The optional 's' may have used up one character, so re-check that "://"
+    // and one more character still fit before indexing. A scheme followed by
+    // nothing, or only by whitespace, is not treated as a URL.
+    if (_eos - marker > 3 &&
+        _text[marker + 0] == ':' &&
+        _text[marker + 1] == '/' &&
+        _text[marker + 2] == '/' &&
+        ! isWhitespace (_text[marker + 3]))
+    {
+      marker += 3;
+
+      // Presumably utf8_next_char advances marker by one whole UTF-8 character.
+      while (marker < _eos &&
+             ! isWhitespace (_text[marker]))
+        utf8_next_char (_text, marker);
+
+      token = _text.substr (_cursor, marker - _cursor);
+      type = Lexer::Type::url;
+      _cursor = marker;
+      return true;
+    }
+  }
+
+  return false;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Lexer::Type::pattern
 // / / |
@@ -421,6 +474,7 @@ std::string Lexer::typeToString (Lexer::Type type)
   if (type == Lexer::Type::string) return std::string ("\033[38;5;7m\033[48;5;3m") + "string" + "\033[0m";
   else if (type == Lexer::Type::hex) return std::string ("\033[38;5;7m\033[48;5;14m") + "hex" + "\033[0m";
   else if (type == Lexer::Type::number) return std::string ("\033[38;5;7m\033[48;5;6m") + "number" + "\033[0m";
+  else if (type == Lexer::Type::url) return std::string ("\033[38;5;7m\033[48;5;4m") + "url" + "\033[0m";
   else if (type == Lexer::Type::pattern) return std::string ("\033[37;42m") + "pattern" + "\033[0m";
   else if (type == Lexer::Type::word) return std::string ("\033[38;5;15m\033[48;5;236m") + "word" + "\033[0m";
   else return std::string ("\033[37;41m") + "unknown" + "\033[0m";
diff --git a/src/Lexer.h b/src/Lexer.h
index ee4cc2ba..66157bb2 100644
--- a/src/Lexer.h
+++ b/src/Lexer.h
@@ -37,6 +37,7 @@ class Lexer
 public:
   enum class Type { number, hex,
                     string,
+                    url,
                     pattern,
                     word };
 
@@ -66,6 +67,7 @@ public:
   bool isNumber (std::string&, Lexer::Type&);
   bool isInteger (std::string&, Lexer::Type&);
   bool isHexNumber (std::string&, Lexer::Type&);
+  bool isURL (std::string&, Lexer::Type&);
   bool isPattern (std::string&, Lexer::Type&);
   bool isWord (std::string&, Lexer::Type&);
 
diff --git a/test/lexer.t.cpp b/test/lexer.t.cpp
index 3c0b5b13..7ac29dfa 100644
--- a/test/lexer.t.cpp
+++ b/test/lexer.t.cpp
@@ -34,7 +34,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 int main (int, char**)
 {
-  UnitTest t (190);
+  UnitTest t (203);
 
   std::vector > tokens;
   std::string token;
@@ -170,6 +170,10 @@ int main (int, char**)
     // Word
     { "1.foo.bar", { { "1.foo.bar", Lexer::Type::word }, NO, NO, NO, NO }, },
 
+    // URL
+    { "http://tasktools.org", { { "http://tasktools.org", Lexer::Type::url }, NO, NO, NO, NO }, },
+    { "https://bug.tasktools.org", { { "https://bug.tasktools.org", Lexer::Type::url }, NO, NO, NO, NO }, },
+
     // String
     { "'one two'", { { "'one two'", Lexer::Type::string }, NO, NO, NO, NO }, },
     { "\"three\"", { { "\"three\"", Lexer::Type::string }, NO, NO, NO, NO }, },
@@ -223,6 +227,7 @@ int main (int, char**)
   t.is (Lexer::typeName (Lexer::Type::number), "number", "Lexer::typeName (Lexer::Type::number)");
   t.is (Lexer::typeName (Lexer::Type::hex), "hex", "Lexer::typeName (Lexer::Type::hex)");
   t.is (Lexer::typeName (Lexer::Type::string), "string", "Lexer::typeName (Lexer::Type::string)");
+  t.is (Lexer::typeName (Lexer::Type::url), "url", "Lexer::typeName (Lexer::Type::url)");
   t.is (Lexer::typeName (Lexer::Type::pattern), "pattern", "Lexer::typeName (Lexer::Type::pattern)");
   t.is (Lexer::typeName (Lexer::Type::word), "word", "Lexer::typeName (Lexer::Type::word)");
 