//////////////////////////////////////////////////////////////////////////////// // // Copyright 2013 - 2015, Paul Beckingham, Federico Hernandez. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included // in all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. // // http://www.opensource.org/licenses/mit-license.php // //////////////////////////////////////////////////////////////////////////////// #include #include #include #include #include //////////////////////////////////////////////////////////////////////////////// Lexer::Lexer (const std::string& text) : _text (text) , _cursor (0) , _eos (text.size ()) { } //////////////////////////////////////////////////////////////////////////////// // When a Lexer object is constructed with a string, this method walks through // the stream of low-level tokens. bool Lexer::token (std::string& token, Lexer::Type& type) { // Eat white space. while (isWhitespace (_text[_cursor])) utf8_next_char (_text, _cursor); // Terminate at EOS. if (isEOS ()) return false; if (isString (token, type, "'\"") || isHexNumber (token, type) || isNumber (token, type) || isWord (token, type)) return true; return false; } //////////////////////////////////////////////////////////////////////////////// // No L10N - these are for internal purposes. const std::string Lexer::typeName (const Lexer::Type& type) { switch (type) { case Lexer::Type::number: return "number"; case Lexer::Type::hex: return "hex"; case Lexer::Type::string: return "string"; case Lexer::Type::word: return "word"; } return "unknown"; } //////////////////////////////////////////////////////////////////////////////// // Complete Unicode whitespace list. // // http://en.wikipedia.org/wiki/Whitespace_character // Updated 2015-09-13 // Static // // TODO This list should be derived from the Unicode database. bool Lexer::isWhitespace (int c) { return (c == 0x0020 || // space Common Separator, space c == 0x0009 || // Common Other, control HT, Horizontal Tab c == 0x000A || // Common Other, control LF, Line feed c == 0x000B || // Common Other, control VT, Vertical Tab c == 0x000C || // Common Other, control FF, Form feed c == 0x000D || // Common Other, control CR, Carriage return c == 0x0085 || // Common Other, control NEL, Next line c == 0x00A0 || // no-break space Common Separator, space c == 0x1680 || // ogham space mark Ogham Separator, space c == 0x180E || // mongolian vowel separator Mongolian Separator, space c == 0x2000 || // en quad Common Separator, space c == 0x2001 || // em quad Common Separator, space c == 0x2002 || // en space Common Separator, space c == 0x2003 || // em space Common Separator, space c == 0x2004 || // three-per-em space Common Separator, space c == 0x2005 || // four-per-em space Common Separator, space c == 0x2006 || // six-per-em space Common Separator, space c == 0x2007 || // figure space Common Separator, space c == 0x2008 || // punctuation space Common Separator, space c == 0x2009 || // thin space Common Separator, space c == 0x200A || // hair space Common Separator, space c == 0x200B || // zero width space c == 0x200C || // zero width non-joiner c == 0x200D || // zero width joiner c == 0x2028 || // line separator Common Separator, line c == 0x2029 || // paragraph separator Common Separator, paragraph c == 0x202F || // narrow no-break space Common Separator, space c == 0x205F || // medium mathematical space Common Separator, space c == 0x2060 || // word joiner c == 0x3000); // ideographic space Common Separator, space } //////////////////////////////////////////////////////////////////////////////// // Digits 0-9. // // TODO This list should be derived from the Unicode database. bool Lexer::isDigit (int c) { return c >= 0x30 && c <= 0x39; } //////////////////////////////////////////////////////////////////////////////// // Digits 0-9 a-f A-F. bool Lexer::isHexDigit (int c) { return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); } //////////////////////////////////////////////////////////////////////////////// // Lexer::Type::number // \d+ // [ . \d+ ] // [ e|E [ +|- ] \d+ [ . \d+ ] ] // not followed by non-operator. bool Lexer::isNumber (std::string& token, Lexer::Type& type) { std::size_t marker = _cursor; if (isDigit (_text[marker])) { ++marker; while (isDigit (_text[marker])) utf8_next_char (_text, marker); if (_text[marker] == '.') { ++marker; if (isDigit (_text[marker])) { ++marker; while (isDigit (_text[marker])) utf8_next_char (_text, marker); } } if (_text[marker] == 'e' || _text[marker] == 'E') { ++marker; if (_text[marker] == '+' || _text[marker] == '-') ++marker; if (isDigit (_text[marker])) { ++marker; while (isDigit (_text[marker])) utf8_next_char (_text, marker); if (_text[marker] == '.') { ++marker; if (isDigit (_text[marker])) { ++marker; while (isDigit (_text[marker])) utf8_next_char (_text, marker); } } } } // Lookahread: ! | ! // If there is an immediately consecutive character, that is not an operator, fail. if (_eos > marker && ! isWhitespace (_text[marker]) && ! isSingleCharOperator (_text[marker])) return false; token = _text.substr (_cursor, marker - _cursor); type = Lexer::Type::number; _cursor = marker; return true; } return false; } //////////////////////////////////////////////////////////////////////////////// // Lexer::Type::number // \d+ bool Lexer::isInteger (std::string& token, Lexer::Type& type) { std::size_t marker = _cursor; if (isDigit (_text[marker])) { ++marker; while (isDigit (_text[marker])) utf8_next_char (_text, marker); token = _text.substr (_cursor, marker - _cursor); type = Lexer::Type::number; _cursor = marker; return true; } return false; } //////////////////////////////////////////////////////////////////////////////// bool Lexer::isSingleCharOperator (int c) { return c == '+' || // Addition c == '-' || // Subtraction or unary minus = ambiguous c == '*' || // Multiplication c == '/' || // Diviѕion c == '(' || // Precedence open parenthesis c == ')' || // Precedence close parenthesis c == '<' || // Less than c == '>' || // Greater than c == '^' || // Exponent c == '!' || // Unary not c == '%' || // Modulus c == '=' || // Partial match c == '~'; // Pattern match } //////////////////////////////////////////////////////////////////////////////// bool Lexer::isHardBoundary (int left, int right) { // EOS if (right == '\0') return true; // FILTER operators that don't need to be surrounded by whitespace. if (left == '(' || left == ')' || right == '(' || right == ')') return true; return false; } //////////////////////////////////////////////////////////////////////////////// bool Lexer::isEOS () const { return _cursor >= _eos; } //////////////////////////////////////////////////////////////////////////////// // Converts '0' -> 0 // '9' -> 9 // 'a'/'A' -> 10 // 'f'/'F' -> 15 int Lexer::hexToInt (int c) { if (c >= '0' && c <= '9') return (c - '0'); else if (c >= 'a' && c <= 'f') return (c - 'a' + 10); else return (c - 'A' + 10); } //////////////////////////////////////////////////////////////////////////////// int Lexer::hexToInt (int c0, int c1) { return (hexToInt (c0) << 4) + hexToInt (c1); } //////////////////////////////////////////////////////////////////////////////// int Lexer::hexToInt (int c0, int c1, int c2, int c3) { return (hexToInt (c0) << 12) + (hexToInt (c1) << 8) + (hexToInt (c2) << 4) + hexToInt (c3); } //////////////////////////////////////////////////////////////////////////////// std::string Lexer::trimLeft (const std::string& in, const std::string& t /*= " "*/) { std::string::size_type ws = in.find_first_not_of (t); if (ws > 0) { std::string out {in}; return out.erase (0, ws); } return in; } //////////////////////////////////////////////////////////////////////////////// std::string Lexer::trimRight (const std::string& in, const std::string& t /*= " "*/) { std::string out {in}; return out.erase (in.find_last_not_of (t) + 1); } //////////////////////////////////////////////////////////////////////////////// std::string Lexer::trim (const std::string& in, const std::string& t /*= " "*/) { return trimLeft (trimRight (in, t), t); } //////////////////////////////////////////////////////////////////////////////// // Lexer::Type::string // '|" // [ U+XXXX | \uXXXX | \" | \' | \\ | \/ | \b | \f | \n | \r | \t | . ] // '|" bool Lexer::isString (std::string& token, Lexer::Type& type, const std::string& quotes) { std::size_t marker = _cursor; if (readWord (_text, quotes, marker, token)) { type = Lexer::Type::string; _cursor = marker; return true; } return false; } //////////////////////////////////////////////////////////////////////////////// // Lexer::Type::hex // 0xX+ bool Lexer::isHexNumber (std::string& token, Lexer::Type& type) { std::size_t marker = _cursor; if (_eos - marker >= 3 && _text[marker + 0] == '0' && _text[marker + 1] == 'x') { marker += 2; while (isHexDigit (_text[marker])) ++marker; if (marker - _cursor > 2) { token = _text.substr (_cursor, marker - _cursor); type = Lexer::Type::hex; _cursor = marker; return true; } } return false; } //////////////////////////////////////////////////////////////////////////////// // Lexer::Type::word // [^\s]+ bool Lexer::isWord (std::string& token, Lexer::Type& type) { std::size_t marker = _cursor; while (_text[marker] && ! isWhitespace (_text[marker]) && ! isSingleCharOperator (_text[marker])) utf8_next_char (_text, marker); if (marker > _cursor) { token = _text.substr (_cursor, marker - _cursor); type = Lexer::Type::word; _cursor = marker; return true; } return false; } //////////////////////////////////////////////////////////////////////////////// // Static std::string Lexer::typeToString (Lexer::Type type) { if (type == Lexer::Type::string) return std::string ("\033[38;5;7m\033[48;5;3m") + "string" + "\033[0m"; else if (type == Lexer::Type::hex) return std::string ("\033[38;5;7m\033[48;5;14m") + "hex" + "\033[0m"; else if (type == Lexer::Type::number) return std::string ("\033[38;5;7m\033[48;5;6m") + "number" + "\033[0m"; else if (type == Lexer::Type::word) return std::string ("\033[38;5;15m\033[48;5;236m") + "word" + "\033[0m"; else return std::string ("\033[37;41m") + "unknown" + "\033[0m"; } //////////////////////////////////////////////////////////////////////////////// // Full implementation of a quoted word. Includes: // '\'' // '"' // "'" // "\"" // 'one two' // Result includes the quotes. bool Lexer::readWord ( const std::string& text, const std::string& quotes, std::string::size_type& cursor, std::string& word) { if (quotes.find (text[cursor]) == std::string::npos) return false; std::string::size_type eos = text.length (); int quote = text[cursor++]; word = quote; int c; while ((c = text[cursor])) { // Quoted word ends on a quote. if (quote && quote == c) { word += utf8_character (utf8_next_char (text, cursor)); break; } // Unicode U+XXXX or \uXXXX codepoint. else if (eos - cursor >= 6 && ((text[cursor + 0] == 'U' && text[cursor + 1] == '+') || (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) && isHexDigit (text[cursor + 2]) && isHexDigit (text[cursor + 3]) && isHexDigit (text[cursor + 4]) && isHexDigit (text[cursor + 5])) { word += utf8_character ( hexToInt ( text[cursor + 2], text[cursor + 3], text[cursor + 4], text[cursor + 5])); cursor += 6; } // An escaped thing. else if (c == '\\') { c = text[++cursor]; switch (c) { case '"': word += (char) 0x22; ++cursor; break; case '\'': word += (char) 0x27; ++cursor; break; case '\\': word += (char) 0x5C; ++cursor; break; case 'b': word += (char) 0x08; ++cursor; break; case 'f': word += (char) 0x0C; ++cursor; break; case 'n': word += (char) 0x0A; ++cursor; break; case 'r': word += (char) 0x0D; ++cursor; break; case 't': word += (char) 0x09; ++cursor; break; case 'v': word += (char) 0x0B; ++cursor; break; // This pass-through default case means that anything can be escaped // harmlessly. In particular 'quote' is included, if it not one of the // above characters. default: word += (char) c; ++cursor; break; } } // Ordinary character. else word += utf8_character (utf8_next_char (text, cursor)); } // Verify termination. return word[0] == quote && word[word.length () - 1] == quote && word.length () >= 2; } //////////////////////////////////////////////////////////////////////////////// // Full implementation of an unquoted word. Includes: // one\ two // abcU+0020def // abc\u0020def // a\tb // // Ends at: // Lexer::isEOS // Lexer::isWhitespace // Lexer::isHardBoundary bool Lexer::readWord ( const std::string& text, std::string::size_type& cursor, std::string& word) { std::string::size_type eos = text.length (); word = ""; int c; int prev = 0; while ((c = text[cursor])) // Handles EOS. { // Unquoted word ends on white space. if (Lexer::isWhitespace (c)) break; // Parentheses mostly. if (prev && Lexer::isHardBoundary (prev, c)) break; // Unicode U+XXXX or \uXXXX codepoint. else if (eos - cursor >= 6 && ((text[cursor + 0] == 'U' && text[cursor + 1] == '+') || (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) && isHexDigit (text[cursor + 2]) && isHexDigit (text[cursor + 3]) && isHexDigit (text[cursor + 4]) && isHexDigit (text[cursor + 5])) { word += utf8_character ( hexToInt ( text[cursor + 2], text[cursor + 3], text[cursor + 4], text[cursor + 5])); cursor += 6; } // An escaped thing. else if (c == '\\') { c = text[++cursor]; switch (c) { case '"': word += (char) 0x22; ++cursor; break; case '\'': word += (char) 0x27; ++cursor; break; case '\\': word += (char) 0x5C; ++cursor; break; case 'b': word += (char) 0x08; ++cursor; break; case 'f': word += (char) 0x0C; ++cursor; break; case 'n': word += (char) 0x0A; ++cursor; break; case 'r': word += (char) 0x0D; ++cursor; break; case 't': word += (char) 0x09; ++cursor; break; case 'v': word += (char) 0x0B; ++cursor; break; // This pass-through default case means that anything can be escaped // harmlessly. In particular 'quote' is included, if it not one of the // above characters. default: word += (char) c; ++cursor; break; } } // Ordinary character. else word += utf8_character (utf8_next_char (text, cursor)); prev = c; } return word.length () > 0 ? true : false; } ////////////////////////////////////////////////////////////////////////////////