diff --git a/src/CLI.cpp b/src/CLI.cpp index eb59cbd44..b8af8092f 100644 --- a/src/CLI.cpp +++ b/src/CLI.cpp @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -662,13 +661,13 @@ void CLI::addArg (const std::string& arg) // that cause the lexemes to be ignored, and the original arugment used // intact. std::string lexeme; - Lexer::Type type; - Lexer lex (raw); + Lexer2::Type type; + Lexer2 lex (raw); lex.ambiguity (false); - std::vector > lexemes; + std::vector > lexemes; while (lex.token (lexeme, type)) - lexemes.push_back (std::pair (lexeme, type)); + lexemes.push_back (std::pair (lexeme, type)); if (disqualifyInsufficientTerms (lexemes) || disqualifyNoOps (lexemes) || @@ -682,7 +681,7 @@ void CLI::addArg (const std::string& arg) { // How often have I said to you that when you have eliminated the // impossible, whatever remains, however improbable, must be the truth? - std::vector >::iterator l; + std::vector >::iterator l; for (l = lexemes.begin (); l != lexemes.end (); ++l) _original_args.push_back (l->first); } @@ -714,9 +713,7 @@ void CLI::aliasExpansion () { if (_aliases.find (raw) != _aliases.end ()) { - std::vector lexed; - Lexer::token_split (lexed, _aliases[raw]); - + std::vector lexed = Lexer2::split (_aliases[raw]); std::vector ::iterator l; for (l = lexed.begin (); l != lexed.end (); ++l) { @@ -1815,8 +1812,7 @@ void CLI::injectDefaults () if (defaultCommand != "") { // Split the defaultCommand into separate args. - std::vector tokens; - Lexer::token_split (tokens, defaultCommand); + std::vector tokens = Lexer2::split (defaultCommand); // Modify _args to be: [ ...] [...] std::vector reconstructed; @@ -2306,9 +2302,9 @@ bool CLI::isName (const std::string& raw) const { for (int i = 0; i < raw.length (); ++i) { - if (i == 0 && ! Lexer::is_ident_start (raw[i])) + if (i == 0 && ! Lexer2::isIdentifierStart (raw[i])) return false; - else if (! Lexer::is_ident (raw[i])) + else if (! Lexer2::isIdentifierNext (raw[i])) return false; } @@ -2320,19 +2316,19 @@ bool CLI::isName (const std::string& raw) const //////////////////////////////////////////////////////////////////////////////// bool CLI::disqualifyInsufficientTerms ( - const std::vector >& lexemes) const + const std::vector >& lexemes) const { return lexemes.size () < 3 ? true : false; } //////////////////////////////////////////////////////////////////////////////// bool CLI::disqualifyNoOps ( - const std::vector >& lexemes) const + const std::vector >& lexemes) const { bool foundOP = false; - std::vector >::const_iterator l; + std::vector >::const_iterator l; for (l = lexemes.begin (); l != lexemes.end (); ++l) - if (l->second == Lexer::typeOperator) + if (l->second == Lexer2::Type::op) foundOP = true; return ! foundOP; @@ -2340,16 +2336,16 @@ bool CLI::disqualifyNoOps ( //////////////////////////////////////////////////////////////////////////////// bool CLI::disqualifyOnlyParenOps ( - const std::vector >& lexemes) const + const std::vector >& lexemes) const { int opCount = 0; int opSugarCount = 0; int opParenCount = 0; - std::vector >::const_iterator l; + std::vector >::const_iterator l; for (l = lexemes.begin (); l != lexemes.end (); ++l) { - if (l->second == Lexer::typeOperator) + if (l->second == Lexer2::Type::op) { ++opCount; @@ -2376,7 +2372,7 @@ bool CLI::disqualifyOnlyParenOps ( // as there are no operators in between, which includes syntactic sugar that // hides operators. 
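////////////////////////////////////////////////////////////////////////////////
// Illustrative sketch, not taken from the patch: the tokenize-then-inspect
// pattern CLI::addArg uses above, pulled out as a free function. It assumes
// only the Lexer2 interface introduced by this patch (the constructor,
// ambiguity () and token ()); the helper name and the "Lexer2.h" include are
// placeholders, and the disqualify* heuristics stay in CLI.
#include <string>
#include <utility>
#include <vector>
#include "Lexer2.h"

std::vector <std::pair <std::string, Lexer2::Type>>
tokenizeArgument (const std::string& raw)
{
  std::vector <std::pair <std::string, Lexer2::Type>> lexemes;

  std::string lexeme;
  Lexer2::Type type;
  Lexer2 lex (raw);
  lex.ambiguity (false);

  while (lex.token (lexeme, type))
    lexemes.push_back (std::pair <std::string, Lexer2::Type> (lexeme, type));

  // CLI::addArg then applies disqualifyInsufficientTerms, disqualifyNoOps,
  // disqualifyOnlyParenOps, disqualifyFirstLastBinary and disqualifySugarFree
  // to these pairs; if any heuristic objects, the lexemes are discarded and
  // the original argument is used intact.
  return lexemes;
}
////////////////////////////////////////////////////////////////////////////////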
bool CLI::disqualifyFirstLastBinary ( - const std::vector >& lexemes) const + const std::vector >& lexemes) const { bool firstBinary = false; bool lastBinary = false; @@ -2395,7 +2391,7 @@ bool CLI::disqualifyFirstLastBinary ( //////////////////////////////////////////////////////////////////////////////// // Disqualify terms when there operators hidden by syntactic sugar. bool CLI::disqualifySugarFree ( - const std::vector >& lexemes) const + const std::vector >& lexemes) const { bool sugared = true; for (unsigned int i = 1; i < lexemes.size () - 1; ++i) diff --git a/src/CLI.h b/src/CLI.h index e855db697..0a3c28221 100644 --- a/src/CLI.h +++ b/src/CLI.h @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include @@ -126,11 +126,11 @@ private: bool isOperator (const std::string&) const; bool isName (const std::string&) const; - bool disqualifyInsufficientTerms (const std::vector >&) const; - bool disqualifyNoOps (const std::vector >&) const; - bool disqualifyOnlyParenOps (const std::vector >&) const; - bool disqualifyFirstLastBinary (const std::vector >&) const; - bool disqualifySugarFree (const std::vector >&) const; + bool disqualifyInsufficientTerms (const std::vector >&) const; + bool disqualifyNoOps (const std::vector >&) const; + bool disqualifyOnlyParenOps (const std::vector >&) const; + bool disqualifyFirstLastBinary (const std::vector >&) const; + bool disqualifySugarFree (const std::vector >&) const; public: std::multimap _entities; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f46db80a8..51fec6e28 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -20,7 +20,6 @@ set (task_SRCS CLI.cpp CLI.h Hooks.cpp Hooks.h ISO8601.cpp ISO8601.h JSON.cpp JSON.h - Lexer.cpp Lexer.h Lexer2.cpp Lexer2.h Msg.cpp Msg.h Nibbler.cpp Nibbler.h diff --git a/src/Context.cpp b/src/Context.cpp index 4d0e6bc0b..33852e8a1 100644 --- a/src/Context.cpp +++ b/src/Context.cpp @@ -657,8 +657,8 @@ void Context::staticInitialization () Task::searchCaseSensitive = Variant::searchCaseSensitive = config.getBoolean ("search.case.sensitive"); Task::regex = Variant::searchUsingRegex = config.getBoolean ("regex"); - Lexer::dateFormat = Variant::dateFormat = config.get ("dateformat"); - Lexer::isoEnabled = Variant::isoEnabled = config.getBoolean ("date.iso"); + Lexer2::dateFormat = Variant::dateFormat = config.get ("dateformat"); + Lexer2::isoEnabled = Variant::isoEnabled = config.getBoolean ("date.iso"); std::map ::iterator i; for (i = columns.begin (); i != columns.end (); ++i) diff --git a/src/Eval.cpp b/src/Eval.cpp index dbe13e916..df00acc50 100644 --- a/src/Eval.cpp +++ b/src/Eval.cpp @@ -125,13 +125,13 @@ void Eval::addSource (bool (*source)(const std::string&, Variant&)) void Eval::evaluateInfixExpression (const std::string& e, Variant& v) const { // Reduce e to a vector of tokens. - Lexer l (e); + Lexer2 l (e); l.ambiguity (_ambiguity); - std::vector > tokens; + std::vector > tokens; std::string token; - Lexer::Type type; + Lexer2::Type type; while (l.token (token, type)) - tokens.push_back (std::pair (token, type)); + tokens.push_back (std::pair (token, type)); // Parse for syntax checking and operator replacement. if (_debug) @@ -153,13 +153,13 @@ void Eval::evaluateInfixExpression (const std::string& e, Variant& v) const void Eval::evaluatePostfixExpression (const std::string& e, Variant& v) const { // Reduce e to a vector of tokens. 
- Lexer l (e); + Lexer2 l (e); l.ambiguity (_ambiguity); - std::vector > tokens; + std::vector > tokens; std::string token; - Lexer::Type type; + Lexer2::Type type; while (l.token (token, type)) - tokens.push_back (std::pair (token, type)); + tokens.push_back (std::pair (token, type)); if (_debug) context.debug ("FILTER Postfix " + dump (tokens)); @@ -172,15 +172,15 @@ void Eval::evaluatePostfixExpression (const std::string& e, Variant& v) const void Eval::compileExpression (const std::string& e) { // Reduce e to a vector of tokens. - Lexer l (e); + Lexer2 l (e); l.ambiguity (_ambiguity); std::string token; - Lexer::Type type; + Lexer2::Type type; while (l.token (token, type)) { if (_debug) - context.debug ("Lexer '" + token + "' " + Lexer::type_name (type)); - _compiled.push_back (std::pair (token, type)); + context.debug ("Lexer '" + token + "' " + Lexer2::typeToString (type)); + _compiled.push_back (std::pair (token, type)); } // Parse for syntax checking and operator replacement. @@ -236,7 +236,7 @@ void Eval::getBinaryOperators (std::vector & all) //////////////////////////////////////////////////////////////////////////////// void Eval::evaluatePostfixStack ( - const std::vector >& tokens, + const std::vector >& tokens, Variant& result) const { if (tokens.size () == 0) @@ -245,11 +245,11 @@ void Eval::evaluatePostfixStack ( // This is stack used by the postfix evaluator. std::vector values; - std::vector >::const_iterator token; + std::vector >::const_iterator token; for (token = tokens.begin (); token != tokens.end (); ++token) { // Unary operators. - if (token->second == Lexer::typeOperator && + if (token->second == Lexer2::Type::op && token->first == "!") { if (values.size () < 1) @@ -262,7 +262,7 @@ void Eval::evaluatePostfixStack ( if (_debug) context.debug (format ("Eval {1} ↓'{2}' → ↑'{3}'", token->first, (std::string) right, (std::string) result)); } - else if (token->second == Lexer::typeOperator && + else if (token->second == Lexer2::Type::op && token->first == "_neg_") { if (values.size () < 1) @@ -278,7 +278,7 @@ void Eval::evaluatePostfixStack ( if (_debug) context.debug (format ("Eval {1} ↓'{2}' → ↑'{3}'", token->first, (std::string) right, (std::string) result)); } - else if (token->second == Lexer::typeOperator && + else if (token->second == Lexer2::Type::op && token->first == "_pos_") { // The _pos_ operator is a NOP. @@ -287,7 +287,7 @@ void Eval::evaluatePostfixStack ( } // Binary operators. 
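////////////////////////////////////////////////////////////////////////////////
// Illustrative sketch, not taken from the patch: the stack discipline behind
// Eval::evaluatePostfixStack, reduced to doubles and four binary operators.
// The real code pops Variant operands and dispatches on Lexer2::Type::op, but
// the control flow is the same: operands are pushed, operators pop their
// arguments and push one result, and a well-formed expression leaves exactly
// one value on the stack.
#include <stdexcept>
#include <string>
#include <vector>

double evaluatePostfix (const std::vector <std::string>& tokens)
{
  std::vector <double> values;
  for (const auto& token : tokens)
  {
    if (token == "+" || token == "-" || token == "*" || token == "/")
    {
      if (values.size () < 2)
        throw std::runtime_error ("Insufficient operands");

      double right = values.back (); values.pop_back ();
      double left  = values.back (); values.pop_back ();

      if      (token == "+") values.push_back (left + right);
      else if (token == "-") values.push_back (left - right);
      else if (token == "*") values.push_back (left * right);
      else                   values.push_back (left / right);
    }
    else
      values.push_back (std::stod (token));   // Operand: push a literal.
  }

  if (values.size () != 1)
    throw std::runtime_error ("Malformed postfix expression");

  return values.back ();                      // "3 4 + 2 *" --> 14.
}
////////////////////////////////////////////////////////////////////////////////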
- else if (token->second == Lexer::typeOperator) + else if (token->second == Lexer2::Type::op) { if (values.size () < 2) throw std::string (STRING_EVAL_NO_EVAL); @@ -338,24 +338,27 @@ void Eval::evaluatePostfixStack ( Variant v (token->first); switch (token->second) { - case Lexer::typeNumber: - case Lexer::typeHex: - v.cast (Variant::type_integer); - if (_debug) - context.debug (format ("Eval literal number ↑'{1}'", (std::string) v)); + case Lexer2::Type::number: + if (Lexer2::isAllDigits (token->first)) + { + v.cast (Variant::type_integer); + if (_debug) + context.debug (format ("Eval literal number ↑'{1}'", (std::string) v)); + } + else + { + v.cast (Variant::type_real); + if (_debug) + context.debug (format ("Eval literal decimal ↑'{1}'", (std::string) v)); + } break; - case Lexer::typeDecimal: - v.cast (Variant::type_real); - if (_debug) - context.debug (format ("Eval literal decimal ↑'{1}'", (std::string) v)); - break; - case Lexer::typeOperator: + case Lexer2::Type::op: throw std::string (STRING_EVAL_OP_EXPECTED); break; - case Lexer::typeIdentifier: + case Lexer2::Type::identifier: { bool found = false; std::vector ::const_iterator source; @@ -380,20 +383,33 @@ void Eval::evaluatePostfixStack ( } break; - case Lexer::typeDate: + case Lexer2::Type::date: v.cast (Variant::type_date); if (_debug) context.debug (format ("Eval literal date ↑'{1}'", (std::string) v)); break; - case Lexer::typeDuration: + case Lexer2::Type::duration: v.cast (Variant::type_duration); if (_debug) context.debug (format ("Eval literal duration ↑'{1}'", (std::string) v)); break; // Nothing to do. - case Lexer::typeString: +/* + case Lexer2::Type::uuid: + case Lexer2::Type::hex: + case Lexer2::Type::list: + case Lexer2::Type::url: + case Lexer2::Type::pair: + case Lexer2::Type::separator: + case Lexer2::Type::tag: + case Lexer2::Type::path: + case Lexer2::Type::substitution: + case Lexer2::Type::pattern: + case Lexer2::Type::word: +*/ + case Lexer2::Type::string: default: if (_debug) context.debug (format ("Eval literal string ↑'{1}'", (std::string) v)); @@ -427,7 +443,7 @@ void Eval::evaluatePostfixStack ( // Primitive --> "(" Logical ")" | Variant // void Eval::infixParse ( - std::vector >& infix) const + std::vector >& infix) const { int i = 0; parseLogical (infix, i); @@ -436,17 +452,17 @@ void Eval::infixParse ( //////////////////////////////////////////////////////////////////////////////// // Logical --> Regex {( "and" | "or" | "xor" ) Regex} bool Eval::parseLogical ( - std::vector >& infix, + std::vector >& infix, int &i) const { if (i < infix.size () && parseRegex (infix, i)) { while (i < infix.size () && + infix[i].second == Lexer2::Type::op && (infix[i].first == "and" || infix[i].first == "or" || - infix[i].first == "xor") && - infix[i].second == Lexer::typeOperator) + infix[i].first == "xor")) { ++i; if (! parseRegex (infix, i)) @@ -462,16 +478,16 @@ bool Eval::parseLogical ( //////////////////////////////////////////////////////////////////////////////// // Regex --> Equality {( "~" | "!~" ) Equality} bool Eval::parseRegex ( - std::vector >& infix, + std::vector >& infix, int &i) const { if (i < infix.size () && parseEquality (infix, i)) { while (i < infix.size () && + infix[i].second == Lexer2::Type::op && (infix[i].first == "~" || - infix[i].first == "!~") && - infix[i].second == Lexer::typeOperator) + infix[i].first == "!~")) { ++i; if (! 
parseEquality (infix, i)) @@ -487,18 +503,18 @@ bool Eval::parseRegex ( //////////////////////////////////////////////////////////////////////////////// // Equality --> Comparative {( "==" | "=" | "!==" | "!=" ) Comparative} bool Eval::parseEquality ( - std::vector >& infix, + std::vector >& infix, int &i) const { if (i < infix.size () && parseComparative (infix, i)) { while (i < infix.size () && + infix[i].second == Lexer2::Type::op && (infix[i].first == "==" || infix[i].first == "=" || infix[i].first == "!==" || - infix[i].first == "!=") && - infix[i].second == Lexer::typeOperator) + infix[i].first == "!=")) { ++i; if (! parseComparative (infix, i)) @@ -514,18 +530,18 @@ bool Eval::parseEquality ( //////////////////////////////////////////////////////////////////////////////// // Comparative --> Arithmetic {( "<=" | "<" | ">=" | ">" ) Arithmetic} bool Eval::parseComparative ( - std::vector >& infix, + std::vector >& infix, int &i) const { if (i < infix.size () && parseArithmetic (infix, i)) { while (i < infix.size () && + infix[i].second == Lexer2::Type::op && (infix[i].first == "<=" || infix[i].first == "<" || infix[i].first == ">=" || - infix[i].first == ">") && - infix[i].second == Lexer::typeOperator) + infix[i].first == ">")) { ++i; if (! parseArithmetic (infix, i)) @@ -541,16 +557,16 @@ bool Eval::parseComparative ( //////////////////////////////////////////////////////////////////////////////// // Arithmetic --> Geometric {( "+" | "-" ) Geometric} bool Eval::parseArithmetic ( - std::vector >& infix, + std::vector >& infix, int &i) const { if (i < infix.size () && parseGeometric (infix, i)) { while (i < infix.size () && + infix[i].second == Lexer2::Type::op && (infix[i].first == "+" || - infix[i].first == "-") && - infix[i].second == Lexer::typeOperator) + infix[i].first == "-")) { ++i; if (! parseGeometric (infix, i)) @@ -566,17 +582,17 @@ bool Eval::parseArithmetic ( //////////////////////////////////////////////////////////////////////////////// // Geometric --> Tag {( "*" | "/" | "%" ) Tag} bool Eval::parseGeometric ( - std::vector >& infix, + std::vector >& infix, int &i) const { if (i < infix.size () && parseTag (infix, i)) { while (i < infix.size () && + infix[i].second == Lexer2::Type::op && (infix[i].first == "*" || infix[i].first == "/" || - infix[i].first == "%") && - infix[i].second == Lexer::typeOperator) + infix[i].first == "%")) { ++i; if (! parseTag (infix, i)) @@ -592,16 +608,16 @@ bool Eval::parseGeometric ( //////////////////////////////////////////////////////////////////////////////// // Tag --> Unary {( "_hastag_" | "_notag_" ) Unary} bool Eval::parseTag ( - std::vector >& infix, + std::vector >& infix, int &i) const { if (i < infix.size () && parseUnary (infix, i)) { while (i < infix.size () && + infix[i].second == Lexer2::Type::op && (infix[i].first == "_hastag_" || - infix[i].first == "_notag_") && - infix[i].second == Lexer::typeOperator) + infix[i].first == "_notag_")) { ++i; if (! parseUnary (infix, i)) @@ -617,7 +633,7 @@ bool Eval::parseTag ( //////////////////////////////////////////////////////////////////////////////// // Unary --> [( "-" | "+" | "!" 
)] Exponent bool Eval::parseUnary ( - std::vector >& infix, + std::vector >& infix, int &i) const { if (i < infix.size ()) @@ -644,15 +660,15 @@ bool Eval::parseUnary ( //////////////////////////////////////////////////////////////////////////////// // Exponent --> Primitive ["^" Primitive] bool Eval::parseExponent ( - std::vector >& infix, + std::vector >& infix, int &i) const { if (i < infix.size () && parsePrimitive (infix, i)) { while (i < infix.size () && - infix[i].first == "^" && - infix[i].second == Lexer::typeOperator) + infix[i].second == Lexer2::Type::op && + infix[i].first == "^") { ++i; if (! parsePrimitive (infix, i)) @@ -668,7 +684,7 @@ bool Eval::parseExponent ( //////////////////////////////////////////////////////////////////////////////// // Primitive --> "(" Logical ")" | Variant bool Eval::parsePrimitive ( - std::vector >& infix, + std::vector >& infix, int &i) const { if (i < infix.size ()) @@ -706,7 +722,7 @@ bool Eval::parsePrimitive ( ++i; return true; } - else if (infix[i].second != Lexer::typeOperator) + else if (infix[i].second != Lexer2::Type::op) { ++i; return true; @@ -750,32 +766,32 @@ bool Eval::parsePrimitive ( // Exit. // void Eval::infixToPostfix ( - std::vector >& infix) const + std::vector >& infix) const { // Short circuit. if (infix.size () == 1) return; // Result. - std::vector > postfix; + std::vector > postfix; // Shunting yard. - std::vector > op_stack; + std::vector > op_stack; // Operator characteristics. char type; int precedence; char associativity; - std::vector >::iterator token; + std::vector >::iterator token; for (token = infix.begin (); token != infix.end (); ++token) { - if (token->second == Lexer::typeOperator && + if (token->second == Lexer2::Type::op && token->first == "(") { op_stack.push_back (*token); } - else if (token->second == Lexer::typeOperator && + else if (token->second == Lexer2::Type::op && token->first == ")") { while (op_stack.size () && @@ -790,7 +806,7 @@ void Eval::infixToPostfix ( else throw std::string ("Mismatched parentheses in expression"); } - else if (token->second == Lexer::typeOperator && + else if (token->second == Lexer2::Type::op && identifyOperator (token->first, type, precedence, associativity)) { char type2; @@ -849,22 +865,20 @@ bool Eval::identifyOperator ( //////////////////////////////////////////////////////////////////////////////// std::string Eval::dump ( - std::vector >& tokens) const + std::vector >& tokens) const { // Set up a color mapping. 
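////////////////////////////////////////////////////////////////////////////////
// Illustrative sketch, not taken from the patch: the shunting-yard conversion
// performed by Eval::infixToPostfix above, shrunk to string tokens, two
// precedence levels and left associativity. The production code additionally
// consults identifyOperator for precedence/associativity and carries a
// Lexer2::Type alongside each token.
#include <map>
#include <string>
#include <vector>

std::vector <std::string> toPostfix (const std::vector <std::string>& infix)
{
  std::map <std::string, int> precedence {{"+", 1}, {"-", 1}, {"*", 2}, {"/", 2}};

  std::vector <std::string> postfix;
  std::vector <std::string> op_stack;

  for (const auto& token : infix)
  {
    if (token == "(")
      op_stack.push_back (token);
    else if (token == ")")
    {
      // Pop operators until the matching "(" is found, then discard it.
      while (op_stack.size () && op_stack.back () != "(")
      {
        postfix.push_back (op_stack.back ());
        op_stack.pop_back ();
      }
      if (op_stack.size ())
        op_stack.pop_back ();
    }
    else if (precedence.count (token))
    {
      // Left-associative: pop operators of greater or equal precedence.
      while (op_stack.size () &&
             precedence.count (op_stack.back ()) &&
             precedence[op_stack.back ()] >= precedence[token])
      {
        postfix.push_back (op_stack.back ());
        op_stack.pop_back ();
      }
      op_stack.push_back (token);
    }
    else
      postfix.push_back (token);              // Operand goes straight through.
  }

  // Drain the remaining operators.
  while (op_stack.size ())
  {
    postfix.push_back (op_stack.back ());
    op_stack.pop_back ();
  }

  return postfix;                             // "1 + 2 * 3" --> "1 2 3 * +".
}
////////////////////////////////////////////////////////////////////////////////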
- std::map color_map; - color_map[Lexer::typeNone] = Color ("rgb000 on gray6"); - color_map[Lexer::typeOperator] = Color ("gray14 on gray6"); - color_map[Lexer::typeNumber] = Color ("rgb530 on gray6"); - color_map[Lexer::typeHex] = Color ("rgb303 on gray6"); - color_map[Lexer::typeDecimal] = Color ("rgb530 on gray6"); - color_map[Lexer::typeString] = Color ("rgb550 on gray6"); - color_map[Lexer::typeIdentifier] = Color ("rgb035 on gray6"); - color_map[Lexer::typeDate] = Color ("rgb150 on gray6"); - color_map[Lexer::typeDuration] = Color ("rgb531 on gray6"); + std::map color_map; + color_map[Lexer2::Type::op] = Color ("gray14 on gray6"); + color_map[Lexer2::Type::number] = Color ("rgb530 on gray6"); + color_map[Lexer2::Type::hex] = Color ("rgb303 on gray6"); + color_map[Lexer2::Type::string] = Color ("rgb550 on gray6"); + color_map[Lexer2::Type::identifier] = Color ("rgb035 on gray6"); + color_map[Lexer2::Type::date] = Color ("rgb150 on gray6"); + color_map[Lexer2::Type::duration] = Color ("rgb531 on gray6"); std::string output; - std::vector >::const_iterator i; + std::vector >::const_iterator i; for (i = tokens.begin (); i != tokens.end (); ++i) { if (i != tokens.begin ()) @@ -874,7 +888,7 @@ std::string Eval::dump ( if (color_map[i->second].nontrivial ()) c = color_map[i->second]; else - c = color_map[Lexer::typeNone]; + c = Color ("rgb000 on gray6"); output += c.colorize (i->first); } diff --git a/src/Eval.h b/src/Eval.h index b0062814b..cfeb03721 100644 --- a/src/Eval.h +++ b/src/Eval.h @@ -29,7 +29,7 @@ #include #include -#include +#include #include class Eval @@ -53,28 +53,28 @@ public: static void getBinaryOperators (std::vector &); private: - void evaluatePostfixStack (const std::vector >&, Variant&) const; - void infixToPostfix (std::vector >&) const; - void infixParse (std::vector >&) const; - bool parseLogical (std::vector >&, int &) const; - bool parseRegex (std::vector >&, int &) const; - bool parseEquality (std::vector >&, int &) const; - bool parseComparative (std::vector >&, int &) const; - bool parseArithmetic (std::vector >&, int &) const; - bool parseGeometric (std::vector >&, int &) const; - bool parseTag (std::vector >&, int &) const; - bool parseUnary (std::vector >&, int &) const; - bool parseExponent (std::vector >&, int &) const; - bool parsePrimitive (std::vector >&, int &) const; + void evaluatePostfixStack (const std::vector >&, Variant&) const; + void infixToPostfix (std::vector >&) const; + void infixParse (std::vector >&) const; + bool parseLogical (std::vector >&, int &) const; + bool parseRegex (std::vector >&, int &) const; + bool parseEquality (std::vector >&, int &) const; + bool parseComparative (std::vector >&, int &) const; + bool parseArithmetic (std::vector >&, int &) const; + bool parseGeometric (std::vector >&, int &) const; + bool parseTag (std::vector >&, int &) const; + bool parseUnary (std::vector >&, int &) const; + bool parseExponent (std::vector >&, int &) const; + bool parsePrimitive (std::vector >&, int &) const; bool identifyOperator (const std::string&, char&, int&, char&) const; - std::string dump (std::vector >&) const; + std::string dump (std::vector >&) const; private: std::vector _sources; bool _ambiguity; bool _debug; - std::vector > _compiled; + std::vector > _compiled; }; diff --git a/src/Lexer.cpp b/src/Lexer.cpp deleted file mode 100644 index 371a3a4c8..000000000 --- a/src/Lexer.cpp +++ /dev/null @@ -1,898 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// -// Copyright 2013 - 2015, 
Paul Beckingham, Federico Hernandez. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included -// in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -// -// http://www.opensource.org/licenses/mit-license.php -// -//////////////////////////////////////////////////////////////////////////////// - -#include -#include -#include -#include -#include -#include -#include -#include - -std::string Lexer::dateFormat = ""; -bool Lexer::isoEnabled = true; - -//////////////////////////////////////////////////////////////////////////////// -Lexer::Lexer (const std::string& input) -: _input (input) -, _i (0) -, _shift_counter (0) -, _n0 (32) -, _n1 (32) -, _n2 (32) -, _n3 (32) -, _boundary01 (false) -, _boundary12 (false) -, _boundary23 (false) -, _ambiguity (true) -{ - // Read 4 chars in preparation. Even if there are < 4. Take a deep breath. - shift (); - shift (); - shift (); - shift (); - - // Reset because the four shifts above do not represent advancement into the - // _input. All subsequents shiftѕ do though. - _shift_counter = 0; -} - -//////////////////////////////////////////////////////////////////////////////// -Lexer::~Lexer () -{ -} - -//////////////////////////////////////////////////////////////////////////////// -// Walk the input string, looking for transitions. -bool Lexer::token (std::string& result, Type& type) -{ - // Start with nothing. - result = ""; - - // Different types of matching quote: ', ". - int quote = 0; - - type = typeNone; - while (_n0) - { - switch (type) - { - case typeNone: - if (is_ws (_n0)) - shift (); - else if (_n0 == '"' || _n0 == '\'') - { - type = typeString; - quote = _n0; - result += utf8_character (_n0); - shift (); - } - else if (_n0 == '0' && - _n1 == 'x' && - is_hex_digit (_n2)) - { - type = typeHex; - result += utf8_character (_n0); - shift (); - result += utf8_character (_n0); - shift (); - result += utf8_character (_n0); - shift (); - } - else if (is_dec_digit (_n0)) - { - // Speculatively try a date and duration parse. Longest wins. - if (is_date (result)) - { - type = typeDate; - return true; - } - - if (is_duration (result)) - { - type = typeDuration; - return true; - } - - type = typeNumber; - result += utf8_character (_n0); - shift (); - } - else if (_n0 == '.' 
&& is_dec_digit (_n1)) - { - type = typeDecimal; - result += utf8_character (_n0); - shift (); - } - else if ((_n0 == '+' || _n0 == '-') && is_ident_start (_n1)) - { - type = typeTag; - result += utf8_character (_n0); - shift (); - } - else if (is_triple_op (_n0, _n1, _n2)) - { - type = typeOperator; - result += utf8_character (_n0); - shift (); - result += utf8_character (_n0); - shift (); - result += utf8_character (_n0); - shift (); - return true; - } - else if (is_double_op (_n0, _n1, _n2)) - { - type = typeOperator; - result += utf8_character (_n0); - shift (); - result += utf8_character (_n0); - shift (); - return true; - } - else if (is_single_op (_n0)) - { - type = typeOperator; - result += utf8_character (_n0); - shift (); - return true; - } - else if (_n0 == '\\') - { - type = typeIdentifierEscape; - shift (); - } - else if (is_ident_start (_n0)) - { - if (is_date (result)) - { - type = typeDate; - return true; - } - - if (is_duration (result)) - { - type = typeDuration; - return true; - } - - type = typeIdentifier; - result += utf8_character (_n0); - shift (); - } - else - throw std::string (STRING_LEX_IMMEDIATE_UNK); - break; - - case typeString: - if (_n0 == quote) - { - result += utf8_character (_n0); - shift (); - quote = 0; - return true; - } - else if (_n0 == '\\') - { - type = typeEscape; - shift (); - } - else - { - result += utf8_character (_n0); - shift (); - } - break; - - case typeTag: - if (is_ident_start (_n0)) - { - result += utf8_character (_n0); - shift (); - } - else - { - return true; - } - break; - - case typeIdentifier: - if (is_ident (_n0)) - { - result += utf8_character (_n0); - shift (); - } - else - { - // typeIdentifier is a catch-all type. Anything word-like becomes an - // identifier. At this point in the processing, an identifier is found, - // and can be matched against a list of potential upgrades. - if (result == "_hastag_" || - result == "_notag_" || - result == "_neg_" || - result == "_pos_") - type = typeOperator; - - return true; - } - break; - - case typeIdentifierEscape: - if (_n0 == 'u') - { - type = typeEscapeUnicode; - shift (); - } - else - { - type = quote ? typeString : typeIdentifier; - result += utf8_character (quote); - result += utf8_character (_n0); - shift (); - } - break; - - case typeEscape: - if (_n0 == 'x') - { - type = typeEscapeHex; - shift (); - } - else if (_n0 == 'u') - { - type = typeEscapeUnicode; - shift (); - } - else - { - result += '\\'; - result += utf8_character (_n0); - type = quote ? typeString : typeIdentifier; - shift (); - } - break; - - case typeEscapeHex: - if (is_hex_digit (_n0) && is_hex_digit (_n1)) - { - result += utf8_character (hex_to_int (_n0, _n1)); - type = quote ? typeString : typeIdentifier; - shift (); - shift (); - } - else - { - type = quote ? typeString : typeIdentifier; - shift (); - quote = 0; - return true; - } - break; - - case typeEscapeUnicode: - if (is_hex_digit (_n0) && - is_hex_digit (_n1) && - is_hex_digit (_n2) && - is_hex_digit (_n3)) - { - result += utf8_character (hex_to_int (_n0, _n1, _n2, _n3)); - shift (); - shift (); - shift (); - shift (); - type = quote ? 
typeString : typeIdentifier; - } - else if (_n0 == quote) - { - type = typeString; - shift (); - quote = 0; - return true; - } - break; - - case typeNumber: - if (is_dec_digit (_n0)) - { - result += utf8_character (_n0); - shift (); - } - else if (_n0 == '.') - { - type = typeDecimal; - result += utf8_character (_n0); - shift (); - } - else if (_n0 == 'e' || _n0 == 'E') - { - type = typeExponentIndicator; - result += utf8_character (_n0); - shift (); - } - else if (is_ident_start (_n0)) - { - type = typeIdentifier; - result += utf8_character (_n0); - shift (); - } - else - { - return true; - } - break; - - case typeDecimal: - if (is_dec_digit (_n0)) - { - result += utf8_character (_n0); - shift (); - } - else if (_n0 == 'e' || _n0 == 'E') - { - type = typeExponentIndicator; - result += utf8_character (_n0); - shift (); - } - else if (is_ident_start (_n0)) - { - type = typeIdentifier; - result += utf8_character (_n0); - shift (); - } - else - { - return true; - } - break; - - case typeExponentIndicator: - if (_n0 == '+' || _n0 == '-') - { - result += utf8_character (_n0); - shift (); - } - else if (is_dec_digit (_n0)) - { - type = typeExponent; - result += utf8_character (_n0); - shift (); - } - else if (is_ident_start (_n0)) - { - type = typeIdentifier; - result += utf8_character (_n0); - shift (); - } - break; - - case typeExponent: - if (is_dec_digit (_n0) || _n0 == '.') - { - result += utf8_character (_n0); - shift (); - } - else - { - type = typeDecimal; - return true; - } - break; - - case typeHex: - if (is_hex_digit (_n0)) - { - result += utf8_character (_n0); - shift (); - } - else - { - return true; - } - break; - - default: - throw std::string (STRING_LEX_TYPE_UNK); - break; - } - - // Fence post. - if (!_n0 && result != "") - return true; - } - - return false; -} - -//////////////////////////////////////////////////////////////////////////////// -// Just like Lexer::token, but no operators, numbers, dates or durations. -bool Lexer::word (std::string& token, Type& type) -{ - // Start with nothing. - token = ""; - - // Different types of matching quote: ', ". - int quote = 0; - - type = typeNone; - while (_n0) - { - switch (type) - { - case typeNone: - if (is_ws (_n0)) - shift (); - else if (_n0 == '"' || _n0 == '\'') - { - type = typeString; - quote = _n0; - token += utf8_character (_n0); - shift (); - } - else - { - type = typeString; - token += utf8_character (_n0); - shift (); - } - break; - - case typeString: - if (_n0 == quote) - { - token += utf8_character (_n0); - shift (); - quote = 0; - return true; - } - else if (_n0 == '\\') - { - type = typeEscape; - shift (); - } - else if (! 
quote && is_ws (_n0)) - { - shift (); - return true; - } - else - { - token += utf8_character (_n0); - shift (); - } - break; - - case typeEscape: - if (_n0 == 'x') - { - type = typeEscapeHex; - shift (); - } - else if (_n0 == 'u') - { - type = typeEscapeUnicode; - shift (); - } - else - { - token += '\\'; - token += utf8_character (_n0); - type = typeString; - shift (); - } - break; - - case typeEscapeHex: - if (is_hex_digit (_n0) && is_hex_digit (_n1)) - { - token += utf8_character (hex_to_int (_n0, _n1)); - type = typeString; - shift (); - shift (); - } - else - { - type = typeString; - shift (); - quote = 0; - return true; - } - break; - - case typeEscapeUnicode: - if (is_hex_digit (_n0) && - is_hex_digit (_n1) && - is_hex_digit (_n2) && - is_hex_digit (_n3)) - { - token += utf8_character (hex_to_int (_n0, _n1, _n2, _n3)); - shift (); - shift (); - shift (); - shift (); - type = typeString; - } - else if (_n0 == quote) - { - type = typeString; - shift (); - quote = 0; - return true; - } - break; - - default: - throw std::string (STRING_LEX_TYPE_UNK); - break; - } - - // Fence post. - if (!_n0 && token != "") - return true; - } - - return false; -} - -//////////////////////////////////////////////////////////////////////////////// -void Lexer::ambiguity (bool value) -{ - _ambiguity = value; -} - -//////////////////////////////////////////////////////////////////////////////// -// No L10N - these are for internal purposes. -const std::string Lexer::type_name (const Type& type) -{ - switch (type) - { - case Lexer::typeNone: return "None"; - case Lexer::typeString: return "String"; - case Lexer::typeIdentifier: return "Identifier"; - case Lexer::typeIdentifierEscape: return "IdentifierEscape"; - case Lexer::typeNumber: return "Number"; - case Lexer::typeDecimal: return "Decimal"; - case Lexer::typeExponentIndicator: return "ExponentIndicator"; - case Lexer::typeExponent: return "Exponent"; - case Lexer::typeHex: return "Hex"; - case Lexer::typeOperator: return "Operator"; - case Lexer::typeEscape: return "Escape"; - case Lexer::typeEscapeHex: return "EscapeHex"; - case Lexer::typeEscapeUnicode: return "EscapeUnicode"; - case Lexer::typeDate: return "Date"; - case Lexer::typeDuration: return "Duration"; - case Lexer::typeTag: return "Tag"; - } -} - -//////////////////////////////////////////////////////////////////////////////// -// Complete Unicode whitespace list. 
-// -// http://en.wikipedia.org/wiki/Whitespace_character -// Updated 2013-11-18 -bool Lexer::is_ws (int c) -{ - return (c == 0x0020 || // space Common Separator, space - c == 0x0009 || // Common Other, control HT, Horizontal Tab - c == 0x000A || // Common Other, control LF, Line feed - c == 0x000B || // Common Other, control VT, Vertical Tab - c == 0x000C || // Common Other, control FF, Form feed - c == 0x000D || // Common Other, control CR, Carriage return - c == 0x0085 || // Common Other, control NEL, Next line - c == 0x00A0 || // no-break space Common Separator, space - c == 0x1680 || // ogham space mark Ogham Separator, space - c == 0x180E || // mongolian vowel separator Mongolian Separator, space - c == 0x2000 || // en quad Common Separator, space - c == 0x2001 || // em quad Common Separator, space - c == 0x2002 || // en space Common Separator, space - c == 0x2003 || // em space Common Separator, space - c == 0x2004 || // three-per-em space Common Separator, space - c == 0x2005 || // four-per-em space Common Separator, space - c == 0x2006 || // six-per-em space Common Separator, space - c == 0x2007 || // figure space Common Separator, space - c == 0x2008 || // punctuation space Common Separator, space - c == 0x2009 || // thin space Common Separator, space - c == 0x200A || // hair space Common Separator, space - c == 0x2028 || // line separator Common Separator, line - c == 0x2029 || // paragraph separator Common Separator, paragraph - c == 0x202F || // narrow no-break space Common Separator, space - c == 0x205F || // medium mathematical space Common Separator, space - c == 0x3000); // ideographic space Common Separator, space -} - -//////////////////////////////////////////////////////////////////////////////// -bool Lexer::is_ident_start (int c) -{ - return c && // Include null character check. - ! is_ws (c) && - ! is_dec_digit (c) && - ! is_single_op (c); -} - -//////////////////////////////////////////////////////////////////////////////// -bool Lexer::is_ident (int c) -{ - return c && // Include null character check. - ! is_ws (c) && - ! is_single_op (c); -} - -//////////////////////////////////////////////////////////////////////////////// -bool Lexer::is_single_op (int c) -{ - return c == '+' || - c == '-' || - c == '*' || - c == '/' || - c == '(' || - c == ')' || - c == '<' || - c == '>' || - c == '^' || - c == '!' || - c == '%' || - c == '=' || - c == '~'; -} - -//////////////////////////////////////////////////////////////////////////////// -bool Lexer::is_dec_digit (int c) -{ - return c >= '0' && c <= '9'; -} - -//////////////////////////////////////////////////////////////////////////////// -bool Lexer::boundary (int left, int right) -{ - // XOR - if (isalpha (left) != isalpha (right)) return true; - if (isdigit (left) != isdigit (right)) return true; - if (isspace (left) != isspace (right)) return true; - - // OR - if (ispunct (left) || ispunct (right)) return true; - - return false; -} - -//////////////////////////////////////////////////////////////////////////////// -// Split 'input' into 'words' on Lexer::is_ws boundaries, observing quotes. -void Lexer::word_split (std::vector & words, const std::string& input) -{ - words.clear (); - - std::string word; - Lexer::Type type; - Lexer lex (input); - while (lex.word (word, type)) - words.push_back (word); -} - -//////////////////////////////////////////////////////////////////////////////// -// Split 'input' into 'tokens'. 
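////////////////////////////////////////////////////////////////////////////////
// Illustrative sketch, not taken from the patch: the character-class rule that
// the old Lexer::boundary (removed above) and the new Lexer2::isBoundary both
// express. A boundary exists where the alpha/digit/space classification
// changes between adjacent characters (the XOR tests), or where either
// character is punctuation.
#include <cctype>

bool isClassBoundary (int left, int right)
{
  // XOR: a transition in any single character class is a boundary.
  if ((bool) isalpha (left) != (bool) isalpha (right)) return true;
  if ((bool) isdigit (left) != (bool) isdigit (right)) return true;
  if ((bool) isspace (left) != (bool) isspace (right)) return true;

  // OR: punctuation on either side always separates tokens.
  return ispunct (left) || ispunct (right);
}

// isClassBoundary (' ', 'a') --> true      isClassBoundary ('3', '4') --> false
// isClassBoundary ('(', '(') --> true      isClassBoundary ('r', 'd') --> false
////////////////////////////////////////////////////////////////////////////////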
-void Lexer::token_split (std::vector & words, const std::string& input) -{ - words.clear (); - - std::string word; - Lexer::Type type; - Lexer lex (input); - while (lex.token (word, type)) - words.push_back (word); -} - -//////////////////////////////////////////////////////////////////////////////// -// Split 'input' into 'tokens', preserving type. -void Lexer::token_split (std::vector >& lexemes, const std::string& input) -{ - lexemes.clear (); - - std::string word; - Lexer::Type type; - Lexer lex (input); - while (lex.token (word, type)) - lexemes.push_back (std::pair (word, type)); -} - -//////////////////////////////////////////////////////////////////////////////// -void Lexer::dequote (std::string& input) -{ - int quote = input[0]; - size_t len = input.length (); - if ((quote == '\'' || quote == '"') && - quote == input[len - 1]) - { - input = input.substr (1, len - 2); - } -} - -//////////////////////////////////////////////////////////////////////////////// -bool Lexer::is_date (std::string& result) -{ - // Try an ISO date parse. - if (isoEnabled) - { - std::string::size_type iso_i = 0; - std::string iso_result; - ISO8601d iso; - iso.ambiguity (_ambiguity); - if (iso.parse (_input.substr (_shift_counter), iso_i)) - { - result = _input.substr (_shift_counter, iso_i); - while (iso_i--) shift (); - return true; - } - } - - // Try a legacy rc.dateformat parse here. - if (Lexer::dateFormat != "") - { - try - { - std::string::size_type legacy_i = 0; - Date legacyDate (_input.substr (_shift_counter), legacy_i, Lexer::dateFormat, false, false); - result = _input.substr (_shift_counter, legacy_i); - while (legacy_i--) shift (); - return true; - } - - catch (...) { /* Never mind. */ } - } - - return false; -} - -//////////////////////////////////////////////////////////////////////////////// -bool Lexer::is_duration (std::string& result) -{ - std::string::size_type iso_i = 0; - std::string iso_result; - ISO8601p iso; - if (iso.parse (_input.substr (_shift_counter), iso_i)) - { - result = _input.substr (_shift_counter, iso_i); - while (iso_i--) shift (); - return true; - } - - std::string::size_type dur_i = 0; - std::string dur_result; - Duration dur; - if (dur.parse (_input.substr (_shift_counter), dur_i)) - { - result = _input.substr (_shift_counter, dur_i); - while (dur_i--) shift (); - return true; - } - - return false; -} - -//////////////////////////////////////////////////////////////////////////////// -bool Lexer::is_punct (int c) const -{ - if (c == ',' || - c == '.') // Tab - return true; - - return false; -} - -//////////////////////////////////////////////////////////////////////////////// -bool Lexer::is_num (int c) const -{ - if ((c >= '0' && c <= '9') || - c == '.') - return true; - - return false; -} - -//////////////////////////////////////////////////////////////////////////////// -bool Lexer::is_triple_op (int c0, int c1, int c2) const -{ - return (c0 == 'a' && c1 == 'n' && c2 == 'd' && _boundary23) || - (c0 == 'x' && c1 == 'o' && c2 == 'r' && _boundary23) || - (c0 == '!' && c1 == '=' && c2 == '='); -} - -//////////////////////////////////////////////////////////////////////////////// -bool Lexer::is_double_op (int c0, int c1, int c2) const -{ - return (c0 == '=' && c1 == '=') || - (c0 == '!' && c1 == '=') || - (c0 == '<' && c1 == '=') || - (c0 == '>' && c1 == '=') || - (c0 == 'o' && c1 == 'r' && _boundary12) || - (c0 == '|' && c1 == '|') || - (c0 == '&' && c1 == '&') || - (c0 == '!' 
&& c1 == '~'); -} - -//////////////////////////////////////////////////////////////////////////////// -bool Lexer::is_hex_digit (int c) const -{ - return (c >= '0' && c <= '9') || - (c >= 'a' && c <= 'f') || - (c >= 'A' && c <= 'F'); -} - -//////////////////////////////////////////////////////////////////////////////// -int Lexer::decode_escape (int c) const -{ - switch (c) - { - case 'b': return 0x08; - case 'f': return 0x0C; - case 'n': return 0x0A; - case 'r': return 0x0D; - case 't': return 0x09; - case 'v': return 0x0B; - case '\'': return 0x27; - case '"': return 0x22; - default: return c; - } -} - -//////////////////////////////////////////////////////////////////////////////// -int Lexer::hex_to_int (int c) const -{ - if (c >= '0' && c <= '9') return (c - '0'); - else if (c >= 'a' && c <= 'f') return (c - 'a' + 10); - else return (c - 'A' + 10); -} - -//////////////////////////////////////////////////////////////////////////////// -int Lexer::hex_to_int (int c0, int c1) const -{ - return (hex_to_int (c0) << 4) + hex_to_int (c1); -} - -//////////////////////////////////////////////////////////////////////////////// -int Lexer::hex_to_int (int c0, int c1, int c2, int c3) const -{ - return (hex_to_int (c0) << 12) + - (hex_to_int (c1) << 8) + - (hex_to_int (c2) << 4) + - hex_to_int (c3); -} - -//////////////////////////////////////////////////////////////////////////////// -void Lexer::shift () -{ - _n0 = _n1; - _n1 = _n2; - _n2 = _n3; - _n3 = utf8_next_char (_input, _i); - ++_shift_counter; - - // Detect type boundaries between characters. - _boundary01 = boundary (_n0, _n1); - _boundary12 = boundary (_n1, _n2); - _boundary23 = boundary (_n2, _n3); -} - -//////////////////////////////////////////////////////////////////////////////// diff --git a/src/Lexer.h b/src/Lexer.h deleted file mode 100644 index b4a3bfa43..000000000 --- a/src/Lexer.h +++ /dev/null @@ -1,120 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// -// Copyright 2013 - 2015, Paul Beckingham, Federico Hernandez. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included -// in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
-// -// http://www.opensource.org/licenses/mit-license.php -// -//////////////////////////////////////////////////////////////////////////////// - -#ifndef INCLUDED_LEXER -#define INCLUDED_LEXER - -#include -#include - -class Lexer -{ -public: - static std::string dateFormat; - static bool isoEnabled; - - enum Type - { - typeNone = 0, - typeString, - typeIdentifier, - typeIdentifierEscape, // Intermediate - typeEscape, // Intermediate - typeEscapeHex, // Intermediate - typeEscapeUnicode, // Intermediate - typeNumber, - typeDecimal, - typeExponentIndicator, // Intermediate - typeExponent, // Intermediate - typeHex, - typeOperator, - typeDate, - typeDuration, - typeTag, -/* - Recognizing more types means that Lexer::*_split and Lexer::token approach - the ideal form, whereby the command line becomes just one string that is - lexed into tokens. Those tokens are then simply dissected by type.. - - typeUUID, - typePattern, - typeSubstitution, - typeNameValue, -*/ - }; - - Lexer (const std::string&); - virtual ~Lexer (); - Lexer (const Lexer&); // Not implemented. - Lexer& operator= (const Lexer&); // Not implemented. - bool operator== (const Lexer&); // Not implemented. - bool token (std::string&, Type&); - bool word (std::string&, Type&); - void ambiguity (bool); - - static const std::string type_name (const Type&); - static bool is_ws (int); - static bool is_ident_start (int); - static bool is_ident (int); - static bool is_single_op (int); - static bool is_dec_digit (int); - static bool boundary (int, int); - static void word_split (std::vector &, const std::string&); - static void token_split (std::vector &, const std::string&); - static void token_split (std::vector >&, const std::string&); - static void dequote (std::string&); - -private: - bool is_date (std::string&); - bool is_duration (std::string&); - bool is_punct (int) const; - bool is_num (int) const; - bool is_triple_op (int, int, int) const; - bool is_double_op (int, int, int) const; - bool is_hex_digit (int) const; - int decode_escape (int) const; - int hex_to_int (int) const; - int hex_to_int (int, int) const; - int hex_to_int (int, int, int, int) const; - void shift (); - -private: - const std::string _input; - std::string::size_type _i; - std::string::size_type _shift_counter; - int _n0; - int _n1; - int _n2; - int _n3; - bool _boundary01; - bool _boundary12; - bool _boundary23; - bool _ambiguity; -}; - -#endif - -//////////////////////////////////////////////////////////////////////////////// diff --git a/src/Lexer2.cpp b/src/Lexer2.cpp index 119a93d95..d90e3b80e 100644 --- a/src/Lexer2.cpp +++ b/src/Lexer2.cpp @@ -37,13 +37,13 @@ static const int uuid_min_length = 8; std::string Lexer2::dateFormat = ""; bool Lexer2::isoEnabled = true; -bool Lexer2::ambiguity = true; //////////////////////////////////////////////////////////////////////////////// Lexer2::Lexer2 (const std::string& text) : _text (text) , _cursor (0) , _eos (text.size ()) +, _ambiguity (false) { } @@ -52,6 +52,12 @@ Lexer2::~Lexer2 () { } +//////////////////////////////////////////////////////////////////////////////// +void Lexer2::ambiguity (bool value) +{ + _ambiguity = value; +} + //////////////////////////////////////////////////////////////////////////////// // When a Lexer2 object is constructed with a string, this method walks through // the stream of low-level tokens. 
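////////////////////////////////////////////////////////////////////////////////
// Illustrative sketch, not taken from the patch: driving the new per-instance
// ambiguity flag. The static Lexer2::ambiguity member is removed above in
// favour of _ambiguity (default false) plus the ambiguity () setter, so two
// lexers in the same process can make different date-versus-number decisions.
// The helper name and the "Lexer2.h" include are placeholders.
#include <string>
#include "Lexer2.h"

int countDateTokens (const std::string& text, bool favorDates)
{
  Lexer2 lex (text);
  lex.ambiguity (favorDates);   // Instance-local; no process-wide state.

  int dates = 0;
  std::string token;
  Lexer2::Type type;
  while (lex.token (token, type))
    if (type == Lexer2::Type::date)
      ++dates;

  // With favorDates == true, input such as "1234" counts as a date rather
  // than a number, matching the ambiguity cases exercised in lexer.t.cpp.
  return dates;
}
////////////////////////////////////////////////////////////////////////////////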
@@ -417,7 +423,7 @@ bool Lexer2::isDate (std::string& token, Lexer2::Type& type) { std::size_t iso_i = 0; ISO8601d iso; - iso.ambiguity (Lexer2::ambiguity); + iso.ambiguity (_ambiguity); if (iso.parse (_text.substr (_cursor), iso_i)) { type = Lexer2::Type::date; @@ -504,10 +510,13 @@ bool Lexer2::isUUID (std::string& token, Lexer2::Type& type) if (i >= uuid_min_length) { - token = _text.substr (_cursor, i + 1); - type = Lexer2::Type::uuid; - _cursor += i; - return true; + token = _text.substr (_cursor, i); + if (! isAllDigits (token)) + { + type = Lexer2::Type::uuid; + _cursor += i; + return true; + } } return false; @@ -545,7 +554,7 @@ bool Lexer2::isHexNumber (std::string& token, Lexer2::Type& type) // Lexer2::Type::number // \d+ // [ . \d+ ] -// [ e|E [ +|- ] \d+ ] +// [ e|E [ +|- ] \d+ [ . \d+ ] ] bool Lexer2::isNumber (std::string& token, Lexer2::Type& type) { std::size_t marker = _cursor; @@ -581,6 +590,17 @@ bool Lexer2::isNumber (std::string& token, Lexer2::Type& type) ++marker; while (isDigit (_text[marker])) utf8_next_char (_text, marker); + + if (_text[marker] == '.') + { + ++marker; + if (isDigit (_text[marker])) + { + ++marker; + while (isDigit (_text[marker])) + utf8_next_char (_text, marker); + } + } } } @@ -667,7 +687,7 @@ bool Lexer2::isURL (std::string& token, Lexer2::Type& type) //////////////////////////////////////////////////////////////////////////////// // Lexer2::Type::pair -// : [ | ] +// :|= [ | ] bool Lexer2::isPair (std::string& token, Lexer2::Type& type) { std::size_t marker = _cursor; @@ -698,11 +718,18 @@ bool Lexer2::isPair (std::string& token, Lexer2::Type& type) //////////////////////////////////////////////////////////////////////////////// // Lexer2::Type::tag -// [ +|- ] [ ]* +// ^ | [ +|- ] [ ]* bool Lexer2::isTag (std::string& token, Lexer2::Type& type) { std::size_t marker = _cursor; + // This test requires a tag to have a preceding space or start a string. + // bad: 'a+b' --> identifier tag + // good: 'a+b' --> identifier op identifier + if (marker > 0 && + ! isWhitespace (_text[marker - 1])) + return false; + if (_text[marker] == '+' || _text[marker] == '-') { @@ -926,7 +953,7 @@ bool Lexer2::isWord (std::string& token, Lexer2::Type& type) { std::size_t marker = _cursor; - while (! isWhitespace (_text[marker])) + while (_text[marker] && ! isWhitespace (_text[marker])) utf8_next_char (_text, marker); if (marker > _cursor) diff --git a/src/Lexer2.h b/src/Lexer2.h index 70cf99aa5..9ede9be7d 100644 --- a/src/Lexer2.h +++ b/src/Lexer2.h @@ -40,7 +40,6 @@ public: // These are overridable. static std::string dateFormat; static bool isoEnabled; - static bool ambiguity; enum class Type { uuid, number, hex, string, @@ -54,6 +53,7 @@ public: Lexer2 (const std::string&); ~Lexer2 (); + void ambiguity (bool); bool token (std::string&, Lexer2::Type&); static std::vector > tokens (const std::string&); static std::vector split (const std::string&); @@ -101,8 +101,9 @@ public: private: std::string _text; - std::size_t _cursor = 0; - std::size_t _eos = 0; + std::size_t _cursor; + std::size_t _eos; + bool _ambiguity; }; #endif diff --git a/src/commands/CmdCustom.cpp b/src/commands/CmdCustom.cpp index 1876dab23..77ec3b76f 100644 --- a/src/commands/CmdCustom.cpp +++ b/src/commands/CmdCustom.cpp @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include @@ -83,8 +83,8 @@ int CmdCustom::execute (std::string& output) // Prepend the argument list with those from the report filter. 
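////////////////////////////////////////////////////////////////////////////////
// Illustrative sketch, not taken from the patch: the number grammar described
// by the isNumber comment updated earlier in this patch,
//   \d+ [ . \d+ ] [ e|E [ +|- ] \d+ [ . \d+ ] ],
// written as a standalone ASCII matcher. The production code walks _text with
// utf8_next_char and a cursor; this version only reports whether an entire
// string fits the grammar, so forms like "1.2e-3.4" from the test suite are
// accepted.
#include <cctype>
#include <cstddef>
#include <string>

bool matchesNumberGrammar (const std::string& s)
{
  std::size_t i = 0;
  auto digits = [&] () -> bool                     // \d+
  {
    if (i >= s.size () || ! isdigit ((unsigned char) s[i]))
      return false;
    while (i < s.size () && isdigit ((unsigned char) s[i]))
      ++i;
    return true;
  };

  if (! digits ())                                           return false;
  if (i < s.size () && s[i] == '.') { ++i; if (! digits ())  return false; }

  if (i < s.size () && (s[i] == 'e' || s[i] == 'E'))
  {
    ++i;
    if (i < s.size () && (s[i] == '+' || s[i] == '-')) ++i;
    if (! digits ())                                         return false;
    if (i < s.size () && s[i] == '.') { ++i; if (! digits ()) return false; }
  }

  return i == s.size ();   // "1.2e-3.4" --> true, "0x12" --> false.
}
////////////////////////////////////////////////////////////////////////////////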
std::string lexeme; - Lexer::Type type; - Lexer lex (reportFilter); + Lexer2::Type type; + Lexer2 lex (reportFilter); lex.ambiguity (false); while (lex.token (lexeme, type)) context.cli.add (lexeme); diff --git a/test/lexer.t.cpp b/test/lexer.t.cpp index ad265192c..684680538 100644 --- a/test/lexer.t.cpp +++ b/test/lexer.t.cpp @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include Context context; @@ -36,360 +36,349 @@ Context context; //////////////////////////////////////////////////////////////////////////////// int main (int argc, char** argv) { - UnitTest t (212); + UnitTest t (211); - std::vector > tokens; + std::vector > tokens; std::string token; - Lexer::Type type; + Lexer2::Type type; // White space detection. - t.notok (Lexer::is_ws (0x0041), "U+0041 (A) is not ws"); - t.ok (Lexer::is_ws (0x0020), "U+0020 is_ws"); - t.ok (Lexer::is_ws (0x0009), "U+0009 is_ws"); - t.ok (Lexer::is_ws (0x000A), "U+000A is_ws"); - t.ok (Lexer::is_ws (0x000B), "U+000B is_ws"); - t.ok (Lexer::is_ws (0x000C), "U+000C is_ws"); - t.ok (Lexer::is_ws (0x000D), "U+000D is_ws"); - t.ok (Lexer::is_ws (0x0085), "U+0085 is_ws"); - t.ok (Lexer::is_ws (0x00A0), "U+00A0 is_ws"); - t.ok (Lexer::is_ws (0x1680), "U+1680 is_ws"); // 10 - t.ok (Lexer::is_ws (0x180E), "U+180E is_ws"); - t.ok (Lexer::is_ws (0x2000), "U+2000 is_ws"); - t.ok (Lexer::is_ws (0x2001), "U+2001 is_ws"); - t.ok (Lexer::is_ws (0x2002), "U+2002 is_ws"); - t.ok (Lexer::is_ws (0x2003), "U+2003 is_ws"); - t.ok (Lexer::is_ws (0x2004), "U+2004 is_ws"); - t.ok (Lexer::is_ws (0x2005), "U+2005 is_ws"); - t.ok (Lexer::is_ws (0x2006), "U+2006 is_ws"); - t.ok (Lexer::is_ws (0x2007), "U+2007 is_ws"); - t.ok (Lexer::is_ws (0x2008), "U+2008 is_ws"); // 20 - t.ok (Lexer::is_ws (0x2009), "U+2009 is_ws"); - t.ok (Lexer::is_ws (0x200A), "U+200A is_ws"); - t.ok (Lexer::is_ws (0x2028), "U+2028 is_ws"); - t.ok (Lexer::is_ws (0x2029), "U+2029 is_ws"); - t.ok (Lexer::is_ws (0x202F), "U+202F is_ws"); - t.ok (Lexer::is_ws (0x205F), "U+205F is_ws"); - t.ok (Lexer::is_ws (0x3000), "U+3000 is_ws"); + t.notok (Lexer2::isWhitespace (0x0041), "U+0041 (A) ! 
isWhitespace"); + t.ok (Lexer2::isWhitespace (0x0020), "U+0020 isWhitespace"); + t.ok (Lexer2::isWhitespace (0x0009), "U+0009 isWhitespace"); + t.ok (Lexer2::isWhitespace (0x000A), "U+000A isWhitespace"); + t.ok (Lexer2::isWhitespace (0x000B), "U+000B isWhitespace"); + t.ok (Lexer2::isWhitespace (0x000C), "U+000C isWhitespace"); + t.ok (Lexer2::isWhitespace (0x000D), "U+000D isWhitespace"); + t.ok (Lexer2::isWhitespace (0x0085), "U+0085 isWhitespace"); + t.ok (Lexer2::isWhitespace (0x00A0), "U+00A0 isWhitespace"); + t.ok (Lexer2::isWhitespace (0x1680), "U+1680 isWhitespace"); // 10 + t.ok (Lexer2::isWhitespace (0x180E), "U+180E isWhitespace"); + t.ok (Lexer2::isWhitespace (0x2000), "U+2000 isWhitespace"); + t.ok (Lexer2::isWhitespace (0x2001), "U+2001 isWhitespace"); + t.ok (Lexer2::isWhitespace (0x2002), "U+2002 isWhitespace"); + t.ok (Lexer2::isWhitespace (0x2003), "U+2003 isWhitespace"); + t.ok (Lexer2::isWhitespace (0x2004), "U+2004 isWhitespace"); + t.ok (Lexer2::isWhitespace (0x2005), "U+2005 isWhitespace"); + t.ok (Lexer2::isWhitespace (0x2006), "U+2006 isWhitespace"); + t.ok (Lexer2::isWhitespace (0x2007), "U+2007 isWhitespace"); + t.ok (Lexer2::isWhitespace (0x2008), "U+2008 isWhitespace"); // 20 + t.ok (Lexer2::isWhitespace (0x2009), "U+2009 isWhitespace"); + t.ok (Lexer2::isWhitespace (0x200A), "U+200A isWhitespace"); + t.ok (Lexer2::isWhitespace (0x2028), "U+2028 isWhitespace"); + t.ok (Lexer2::isWhitespace (0x2029), "U+2029 isWhitespace"); + t.ok (Lexer2::isWhitespace (0x202F), "U+202F isWhitespace"); + t.ok (Lexer2::isWhitespace (0x205F), "U+205F isWhitespace"); + t.ok (Lexer2::isWhitespace (0x3000), "U+3000 isWhitespace"); - // static bool Lexer::boundary(int, int); - t.ok (Lexer::boundary (' ', 'a'), "' ' --> 'a' = boundary"); - t.ok (Lexer::boundary ('a', ' '), "'a' --> ' ' = boundary"); - t.ok (Lexer::boundary (' ', '+'), "' ' --> '+' = boundary"); - t.ok (Lexer::boundary (' ', ','), "' ' --> ',' = boundary"); - t.notok (Lexer::boundary ('3', '4'), "'3' --> '4' = boundary"); - t.ok (Lexer::boundary ('(', '('), "'(' --> '(' = boundary"); - t.notok (Lexer::boundary ('r', 'd'), "'r' --> 'd' = boundary"); + // static bool Lexer2::isBoundary(int, int); + t.ok (Lexer2::isBoundary (' ', 'a'), "' ' --> 'a' = isBoundary"); + t.ok (Lexer2::isBoundary ('a', ' '), "'a' --> ' ' = isBoundary"); + t.ok (Lexer2::isBoundary (' ', '+'), "' ' --> '+' = isBoundary"); + t.ok (Lexer2::isBoundary (' ', ','), "' ' --> ',' = isBoundary"); + t.notok (Lexer2::isBoundary ('3', '4'), "'3' --> '4' = isBoundary"); + t.ok (Lexer2::isBoundary ('(', '('), "'(' --> '(' = isBoundary"); + t.notok (Lexer2::isBoundary ('r', 'd'), "'r' --> 'd' = isBoundary"); // Should result in no tokens. - Lexer l0 (""); + Lexer2 l0 (""); t.notok (l0.token (token, type), "'' --> no tokens"); // Should result in no tokens. - Lexer l1 (" \t "); + Lexer2 l1 (" \t "); t.notok (l1.token (token, type), "' \\t ' --> no tokens"); // \u20ac = Euro symbol. 
- Lexer l2 (" one 'two \\'three\\''+456-(1.3*2 - 0x12) \\u0041 1.2e-3.4 foo.bar and '\\u20ac'"); + Lexer2 l2 (" one 'two \\'three\\''+456-(1.3*2 - 0x12) 1.2e-3.4 foo.bar and '\\u20ac'"); tokens.clear (); while (l2.token (token, type)) { - std::cout << "# «" << token << "» " << type << " " << Lexer::type_name (type) << "\n"; - tokens.push_back (std::pair (token, type)); + std::cout << "# «" << token << "» " << Lexer2::typeName (type) << "\n"; + tokens.push_back (std::pair (token, type)); } t.is (tokens[0].first, "one", "tokens[0] = 'left'"); // 30 - t.is (Lexer::type_name (tokens[0].second), "Identifier", "tokens[0] = Identifier"); + t.is (Lexer2::typeName (tokens[0].second), "identifier", "tokens[0] = identifier"); - t.is (tokens[1].first, "'two \\'three\\''", "tokens[1] = 'two \\'three\\''"); - t.is (Lexer::type_name (tokens[1].second), "String", "tokens[1] = String"); + t.is (tokens[1].first, "two 'three'", "tokens[1] = 'two 'three''"); + t.is (Lexer2::typeName (tokens[1].second), "string", "tokens[1] = string"); t.is (tokens[2].first, "+", "tokens[2] = '+'"); - t.is (Lexer::type_name (tokens[2].second), "Operator", "tokens[2] = Operator"); + t.is (Lexer2::typeName (tokens[2].second), "op", "tokens[2] = op"); t.is (tokens[3].first, "456", "tokens[3] = '456'"); - t.is (Lexer::type_name (tokens[3].second), "Number", "tokens[3] = Number"); + t.is (Lexer2::typeName (tokens[3].second), "number", "tokens[3] = number"); t.is (tokens[4].first, "-", "tokens[4] = '-'"); - t.is (Lexer::type_name (tokens[4].second), "Operator", "tokens[4] = Operator"); + t.is (Lexer2::typeName (tokens[4].second), "op", "tokens[4] = op"); t.is (tokens[5].first, "(", "tokens[5] = '('"); // 40 - t.is (Lexer::type_name (tokens[5].second), "Operator", "tokens[5] = Operator"); + t.is (Lexer2::typeName (tokens[5].second), "op", "tokens[5] = op"); t.is (tokens[6].first, "1.3", "tokens[6] = '1.3'"); - t.is (Lexer::type_name (tokens[6].second), "Decimal", "tokens[6] = Decimal"); + t.is (Lexer2::typeName (tokens[6].second), "number", "tokens[6] = number"); t.is (tokens[7].first, "*", "tokens[7] = '*'"); - t.is (Lexer::type_name (tokens[7].second), "Operator", "tokens[7] = Operator"); + t.is (Lexer2::typeName (tokens[7].second), "op", "tokens[7] = op"); t.is (tokens[8].first, "2", "tokens[8] = '2'"); - t.is (Lexer::type_name (tokens[8].second), "Number", "tokens[8] = Number"); + t.is (Lexer2::typeName (tokens[8].second), "number", "tokens[8] = number"); t.is (tokens[9].first, "-", "tokens[9] = '-'"); - t.is (Lexer::type_name (tokens[9].second), "Operator", "tokens[9] = Operator"); + t.is (Lexer2::typeName (tokens[9].second), "op", "tokens[9] = op"); t.is (tokens[10].first, "0x12", "tokens[10] = '0x12'"); // 50 - t.is (Lexer::type_name (tokens[10].second), "Hex", "tokens[10] = Hex"); + t.is (Lexer2::typeName (tokens[10].second), "hex", "tokens[10] = hex"); t.is (tokens[11].first, ")", "tokens[11] = ')'"); - t.is (Lexer::type_name (tokens[11].second), "Operator", "tokens[11] = Operator"); + t.is (Lexer2::typeName (tokens[11].second), "op", "tokens[11] = op"); - t.is (tokens[12].first, "A", "tokens[12] = \\u0041 --> 'A'"); - t.is (Lexer::type_name (tokens[12].second), "Identifier", "tokens[12] = Identifier"); + t.is (tokens[12].first, "1.2e-3.4", "tokens[12] = '1.2e-3.4'"); + t.is (Lexer2::typeName (tokens[12].second), "number", "tokens[12] = number"); - t.is (tokens[13].first, "1.2e-3.4", "tokens[13] = '1.2e-3.4'"); - t.is (Lexer::type_name (tokens[13].second), "Decimal", "tokens[13] = Decimal"); + t.is (tokens[13].first, 
"foo.bar", "tokens[13] = 'foo.bar'"); + t.is (Lexer2::typeName (tokens[13].second), "identifier", "tokens[13] = identifier"); - t.is (tokens[14].first, "foo.bar", "tokens[14] = 'foo.bar'"); - t.is (Lexer::type_name (tokens[14].second), "Identifier", "tokens[14] = Identifier"); + t.is (tokens[14].first, "and", "tokens[14] = 'and'"); // 60 + t.is (Lexer2::typeName (tokens[14].second), "op", "tokens[14] = op"); - t.is (tokens[15].first, "and", "tokens[15] = 'and'"); // 60 - t.is (Lexer::type_name (tokens[15].second), "Operator", "tokens[15] = Operator"); - - t.is (tokens[16].first, "'€'", "tokens[16] = \\u20ac --> '€'"); - t.is (Lexer::type_name (tokens[16].second), "String", "tokens[16] = String"); + t.is (tokens[15].first, "€", "tokens[15] = \\u20ac --> '€'"); + t.is (Lexer2::typeName (tokens[15].second), "string", "tokens[15] = string"); // Test for ISO-8601 dates (favoring dates in ambiguous cases). - Lexer l3 ("1 12 123 1234 12345 123456 1234567 12345678 20131129T225800Z 2013-11-29T22:58:00Z"); + Lexer2 l3 ("1 12 123 1234 12345 123456 1234567 12345678 20131129T225800Z 2013-11-29T22:58:00Z"); l3.ambiguity (true); tokens.clear (); while (l3.token (token, type)) { - std::cout << "# «" << token << "» " << type << " " << Lexer::type_name (type) << "\n"; - tokens.push_back (std::pair (token, type)); + std::cout << "# «" << token << "» " << Lexer2::typeName (type) << "\n"; + tokens.push_back (std::pair (token, type)); } - t.is ((int)tokens.size (), 10, "10 tokens"); - t.is (tokens[0].first, "1", "tokens[0] == '1'"); - t.is (tokens[0].second, Lexer::typeNumber, "tokens[0] == typeNumber"); - t.is (tokens[1].first, "12", "tokens[1] == '12'"); - t.is (tokens[1].second, Lexer::typeDate, "tokens[1] == typeDate"); - t.is (tokens[2].first, "123", "tokens[2] == '123'"); - t.is (tokens[2].second, Lexer::typeNumber, "tokens[2] == typeNumber"); // 70 - t.is (tokens[3].first, "1234", "tokens[3] == '1234'"); - t.is (tokens[3].second, Lexer::typeDate, "tokens[3] == typeDate"); - t.is (tokens[4].first, "12345", "tokens[4] == '12345'"); - t.is (tokens[4].second, Lexer::typeNumber, "tokens[4] == typeNumber"); - t.is (tokens[5].first, "123456", "tokens[5] == '123456'"); - t.is (tokens[5].second, Lexer::typeDate, "tokens[5] == typeDate"); - t.is (tokens[6].first, "1234567", "tokens[6] == '1234567'"); - t.is (tokens[6].second, Lexer::typeNumber, "tokens[6] == typeNumber"); - t.is (tokens[7].first, "12345678", "tokens[7] == '12345678'"); - t.is (tokens[7].second, Lexer::typeNumber, "tokens[7] == typeNumber"); // 80 - t.is (tokens[8].first, "20131129T225800Z", "tokens[8] == '20131129T225800Z'"); - t.is (tokens[8].second, Lexer::typeDate, "tokens[8] == typeDate"); - t.is (tokens[9].first, "2013-11-29T22:58:00Z", "tokens[9] == '2013-11-29T22:58:00Z'"); - t.is (tokens[9].second, Lexer::typeDate, "tokens[9] == typeDate"); + t.is ((int)tokens.size (), 10, "10 tokens"); + t.is (tokens[0].first, "1", "tokens[0] == '1'"); + t.is ((int) tokens[0].second, (int) Lexer2::Type::number, "tokens[0] == Type::number"); + t.is (tokens[1].first, "12", "tokens[1] == '12'"); + t.is ((int) tokens[1].second, (int) Lexer2::Type::date, "tokens[1] == Type::date"); + t.is (tokens[2].first, "123", "tokens[2] == '123'"); + t.is ((int) tokens[2].second, (int) Lexer2::Type::number, "tokens[2] == Type::number"); // 70 + t.is (tokens[3].first, "1234", "tokens[3] == '1234'"); + t.is ((int) tokens[3].second, (int) Lexer2::Type::date, "tokens[3] == Type::date"); + t.is (tokens[4].first, "12345", "tokens[4] == '12345'"); + t.is ((int) tokens[4].second, 
(int) Lexer2::Type::number, "tokens[4] == Type::number"); + t.is (tokens[5].first, "123456", "tokens[5] == '123456'"); + t.is ((int) tokens[5].second, (int) Lexer2::Type::date, "tokens[5] == Type::date"); + t.is (tokens[6].first, "1234567", "tokens[6] == '1234567'"); + t.is ((int) tokens[6].second, (int) Lexer2::Type::number, "tokens[6] == Type::number"); + t.is (tokens[7].first, "12345678", "tokens[7] == '12345678'"); + t.is ((int) tokens[7].second, (int) Lexer2::Type::number, "tokens[7] == Type::number"); // 80 + t.is (tokens[8].first, "20131129T225800Z", "tokens[8] == '20131129T225800Z'"); + t.is ((int) tokens[8].second, (int) Lexer2::Type::date, "tokens[8] == Type::date"); + t.is (tokens[9].first, "2013-11-29T22:58:00Z", "tokens[9] == '2013-11-29T22:58:00Z'"); + t.is ((int) tokens[9].second, (int) Lexer2::Type::date, "tokens[9] == Type::date"); // Test for ISO-8601 dates (favoring numbers in ambiguous cases). - Lexer l4 ("1 12 123 1234 12345 123456 1234567 12345678 20131129T225800Z 2013-11-29T22:58:00Z"); + Lexer2 l4 ("1 12 123 1234 12345 123456 1234567 12345678 20131129T225800Z 2013-11-29T22:58:00Z"); l4.ambiguity (false); tokens.clear (); while (l4.token (token, type)) { - std::cout << "# «" << token << "» " << type << " " << Lexer::type_name (type) << "\n"; - tokens.push_back (std::pair (token, type)); + std::cout << "# «" << token << "» " << Lexer2::typeName (type) << "\n"; + tokens.push_back (std::pair (token, type)); } - t.is ((int)tokens.size (), 10, "10 tokens"); - t.is (tokens[0].first, "1", "tokens[0] == '1'"); - t.is (tokens[0].second, Lexer::typeNumber, "tokens[0] == typeNumber"); - t.is (tokens[1].first, "12", "tokens[1] == '12'"); - t.is (tokens[1].second, Lexer::typeNumber, "tokens[1] == typeNumber"); - t.is (tokens[2].first, "123", "tokens[2] == '123'"); // 90 - t.is (tokens[2].second, Lexer::typeNumber, "tokens[2] == typeNumber"); - t.is (tokens[3].first, "1234", "tokens[3] == '1234'"); - t.is (tokens[3].second, Lexer::typeNumber, "tokens[3] == typeNumber"); - t.is (tokens[4].first, "12345", "tokens[4] == '12345'"); - t.is (tokens[4].second, Lexer::typeNumber, "tokens[4] == typeNumber"); - t.is (tokens[5].first, "123456", "tokens[5] == '123456'"); - t.is (tokens[5].second, Lexer::typeNumber, "tokens[5] == typeNumber"); - t.is (tokens[6].first, "1234567", "tokens[6] == '1234567'"); - t.is (tokens[6].second, Lexer::typeNumber, "tokens[6] == typeNumber"); - t.is (tokens[7].first, "12345678", "tokens[7] == '12345678'"); // 100 - t.is (tokens[7].second, Lexer::typeNumber, "tokens[7] == typeNumber"); - t.is (tokens[8].first, "20131129T225800Z", "tokens[8] == '20131129T225800Z'"); - t.is (tokens[8].second, Lexer::typeDate, "tokens[8] == typeDate"); - t.is (tokens[9].first, "2013-11-29T22:58:00Z", "tokens[9] == '2013-11-29T22:58:00Z'"); - t.is (tokens[9].second, Lexer::typeDate, "tokens[9] == typeDate"); + t.is ((int)tokens.size (), 10, "10 tokens"); + t.is (tokens[0].first, "1", "tokens[0] == '1'"); + t.is ((int) tokens[0].second, (int) Lexer2::Type::number, "tokens[0] == Type::number"); + t.is (tokens[1].first, "12", "tokens[1] == '12'"); + t.is ((int) tokens[1].second, (int) Lexer2::Type::number, "tokens[1] == Type::number"); + t.is (tokens[2].first, "123", "tokens[2] == '123'"); // 90 + t.is ((int) tokens[2].second, (int) Lexer2::Type::number, "tokens[2] == Type::number"); + t.is (tokens[3].first, "1234", "tokens[3] == '1234'"); + t.is ((int) tokens[3].second, (int) Lexer2::Type::number, "tokens[3] == Type::number"); + t.is (tokens[4].first, "12345", "tokens[4] == 
'12345'"); + t.is ((int) tokens[4].second, (int) Lexer2::Type::number, "tokens[4] == Type::number"); + t.is (tokens[5].first, "123456", "tokens[5] == '123456'"); + t.is ((int) tokens[5].second, (int) Lexer2::Type::number, "tokens[5] == Type::number"); + t.is (tokens[6].first, "1234567", "tokens[6] == '1234567'"); + t.is ((int) tokens[6].second, (int) Lexer2::Type::number, "tokens[6] == Type::number"); + t.is (tokens[7].first, "12345678", "tokens[7] == '12345678'"); // 100 + t.is ((int) tokens[7].second, (int) Lexer2::Type::number, "tokens[7] == Type::number"); + t.is (tokens[8].first, "20131129T225800Z", "tokens[8] == '20131129T225800Z'"); + t.is ((int) tokens[8].second, (int) Lexer2::Type::date, "tokens[8] == Type::date"); + t.is (tokens[9].first, "2013-11-29T22:58:00Z", "tokens[9] == '2013-11-29T22:58:00Z'"); + t.is ((int) tokens[9].second, (int) Lexer2::Type::date, "tokens[9] == Type::date"); // Test for durations - Lexer l5 ("1second 1minute 2hour 3 days 4w 5mo 6 years"); + Lexer2 l5 ("1second 1minute 2hour 3 days 4w 5mo 6 years"); tokens.clear (); while (l5.token (token, type)) { - std::cout << "# «" << token << "» " << type << " " << Lexer::type_name (type) << "\n"; - tokens.push_back (std::pair (token, type)); + std::cout << "# «" << token << "» " << Lexer2::typeName (type) << "\n"; + tokens.push_back (std::pair (token, type)); } - t.is ((int)tokens.size (), 7, "7 tokens"); - t.is (tokens[0].first, "1second", "tokens[0] == '1second'"); - t.is (tokens[0].second, Lexer::typeDuration, "tokens[0] == typeDuration"); - t.is (tokens[1].first, "1minute", "tokens[1] == '1minute'"); - t.is (tokens[1].second, Lexer::typeDuration, "tokens[1] == typeDuration"); // 110 - t.is (tokens[2].first, "2hour", "tokens[2] == '2hour'"); - t.is (tokens[2].second, Lexer::typeDuration, "tokens[2] == typeDuration"); - t.is (tokens[3].first, "3 days", "tokens[3] == '3 days'"); - t.is (tokens[3].second, Lexer::typeDuration, "tokens[3] == typeDuration"); - t.is (tokens[4].first, "4w", "tokens[4] == '4w'"); - t.is (tokens[4].second, Lexer::typeDuration, "tokens[4] == typeDuration"); - t.is (tokens[5].first, "5mo", "tokens[5] == '5mo'"); - t.is (tokens[5].second, Lexer::typeDuration, "tokens[5] == typeDuration"); - t.is (tokens[6].first, "6 years", "tokens[6] == '6 years'"); - t.is (tokens[6].second, Lexer::typeDuration, "tokens[6] == typeDuration"); // 120 + t.is ((int)tokens.size (), 7, "7 tokens"); + t.is (tokens[0].first, "1second", "tokens[0] == '1second'"); + t.is ((int) tokens[0].second, (int) Lexer2::Type::duration, "tokens[0] == Type::duration"); + t.is (tokens[1].first, "1minute", "tokens[1] == '1minute'"); + t.is ((int) tokens[1].second, (int) Lexer2::Type::duration, "tokens[1] == Type::duration"); // 110 + t.is (tokens[2].first, "2hour", "tokens[2] == '2hour'"); + t.is ((int) tokens[2].second, (int) Lexer2::Type::duration, "tokens[2] == Type::duration"); + t.is (tokens[3].first, "3 days", "tokens[3] == '3 days'"); + t.is ((int) tokens[3].second, (int) Lexer2::Type::duration, "tokens[3] == Type::duration"); + t.is (tokens[4].first, "4w", "tokens[4] == '4w'"); + t.is ((int) tokens[4].second, (int) Lexer2::Type::duration, "tokens[4] == Type::duration"); + t.is (tokens[5].first, "5mo", "tokens[5] == '5mo'"); + t.is ((int) tokens[5].second, (int) Lexer2::Type::duration, "tokens[5] == Type::duration"); + t.is (tokens[6].first, "6 years", "tokens[6] == '6 years'"); + t.is ((int) tokens[6].second, (int) Lexer2::Type::duration, "tokens[6] == Type::duration"); // 120 // All the Eval operators. 
- Lexer l6 ("P1Y PT1H P1Y1M1DT1H1M1S 1s 1second"); + Lexer2 l6 ("P1Y PT1H P1Y1M1DT1H1M1S 1s 1second"); tokens.clear (); while (l6.token (token, type)) { - std::cout << "# «" << token << "» " << type << " " << Lexer::type_name (type) << "\n"; - tokens.push_back (std::pair (token, type)); + std::cout << "# «" << token << "» " << Lexer2::typeName (type) << "\n"; + tokens.push_back (std::pair (token, type)); } - t.is ((int)tokens.size (), 5, "5 ISO periods"); - t.is (tokens[0].first, "P1Y", "tokens[0] == 'P1Y'"); - t.is (tokens[0].second, Lexer::typeDuration, "tokens[0] == typeDuration"); - t.is (tokens[1].first, "PT1H", "tokens[1] == 'PT1H'"); - t.is (tokens[1].second, Lexer::typeDuration, "tokens[1] == typeDuration"); - t.is (tokens[2].first, "P1Y1M1DT1H1M1S", "tokens[2] == 'P1Y1M1DT1H1M1S'"); - t.is (tokens[2].second, Lexer::typeDuration, "tokens[2] == typeDuration"); - t.is (tokens[3].first, "1s", "tokens[3] == '1s'"); - t.is (tokens[3].second, Lexer::typeDuration, "tokens[3] == typeDuration"); - t.is (tokens[4].first, "1second", "tokens[4] == '1second'"); - t.is (tokens[4].second, Lexer::typeDuration, "tokens[4] == typeDuration"); + t.is ((int)tokens.size (), 5, "5 ISO periods"); + t.is (tokens[0].first, "P1Y", "tokens[0] == 'P1Y'"); + t.is ((int) tokens[0].second, (int) Lexer2::Type::duration, "tokens[0] == Type::duration"); + t.is (tokens[1].first, "PT1H", "tokens[1] == 'PT1H'"); + t.is ((int) tokens[1].second, (int) Lexer2::Type::duration, "tokens[1] == Type::duration"); + t.is (tokens[2].first, "P1Y1M1DT1H1M1S", "tokens[2] == 'P1Y1M1DT1H1M1S'"); + t.is ((int) tokens[2].second, (int) Lexer2::Type::duration, "tokens[2] == Type::duration"); + t.is (tokens[3].first, "1s", "tokens[3] == '1s'"); + t.is ((int) tokens[3].second, (int) Lexer2::Type::duration, "tokens[3] == Type::duration"); + t.is (tokens[4].first, "1second", "tokens[4] == '1second'"); + t.is ((int) tokens[4].second, (int) Lexer2::Type::duration, "tokens[4] == Type::duration"); - // All the Eval operators. - Lexer l7 ("and xor or <= >= !~ != == = ^ > ~ ! * / % + - < ( )"); + // All (int) the Eval operators. + Lexer2 l7 ("and xor or <= >= !~ != == = ^ > ~ ! 
* / % + - < ( )"); tokens.clear (); while (l7.token (token, type)) { - std::cout << "# «" << token << "» " << type << " " << Lexer::type_name (type) << "\n"; - tokens.push_back (std::pair (token, type)); + std::cout << "# «" << token << "» " << Lexer2::typeName (type) << "\n"; + tokens.push_back (std::pair (token, type)); } - t.is ((int)tokens.size (), 21, "21 operators"); - t.is (tokens[0].first, "and", "tokens[0] == 'and'"); - t.is (tokens[0].second, Lexer::typeOperator, "tokens[0] == typeOperator"); // 130 - t.is (tokens[1].first, "xor", "tokens[1] == 'xor'"); - t.is (tokens[1].second, Lexer::typeOperator, "tokens[1] == typeOperator"); - t.is (tokens[2].first, "or", "tokens[2] == 'or'"); - t.is (tokens[2].second, Lexer::typeOperator, "tokens[2] == typeOperator"); - t.is (tokens[3].first, "<=", "tokens[3] == '<='"); - t.is (tokens[3].second, Lexer::typeOperator, "tokens[3] == typeOperator"); - t.is (tokens[4].first, ">=", "tokens[4] == '>='"); - t.is (tokens[4].second, Lexer::typeOperator, "tokens[4] == typeOperator"); - t.is (tokens[5].first, "!~", "tokens[5] == '!~'"); - t.is (tokens[5].second, Lexer::typeOperator, "tokens[5] == typeOperator"); // 140 - t.is (tokens[6].first, "!=", "tokens[6] == '!='"); - t.is (tokens[6].second, Lexer::typeOperator, "tokens[6] == typeOperator"); - t.is (tokens[7].first, "==", "tokens[7] == '=='"); - t.is (tokens[7].second, Lexer::typeOperator, "tokens[7] == typeOperator"); - t.is (tokens[8].first, "=", "tokens[8] == '='"); - t.is (tokens[8].second, Lexer::typeOperator, "tokens[8] == typeOperator"); - t.is (tokens[9].first, "^", "tokens[9] == '^'"); - t.is (tokens[9].second, Lexer::typeOperator, "tokens[9] == typeOperator"); - t.is (tokens[10].first, ">", "tokens[10] == '>'"); - t.is (tokens[10].second, Lexer::typeOperator, "tokens[10] == typeOperator"); // 150 - t.is (tokens[11].first, "~", "tokens[11] == '~'"); - t.is (tokens[11].second, Lexer::typeOperator, "tokens[11] == typeOperator"); - t.is (tokens[12].first, "!", "tokens[12] == '!'"); - t.is (tokens[12].second, Lexer::typeOperator, "tokens[12] == typeOperator"); - t.is (tokens[13].first, "*", "tokens[13] == '*'"); - t.is (tokens[13].second, Lexer::typeOperator, "tokens[13] == typeOperator"); - t.is (tokens[14].first, "/", "tokens[14] == '/'"); - t.is (tokens[14].second, Lexer::typeOperator, "tokens[14] == typeOperator"); - t.is (tokens[15].first, "%", "tokens[15] == '%'"); - t.is (tokens[15].second, Lexer::typeOperator, "tokens[15] == typeOperator"); // 160 - t.is (tokens[16].first, "+", "tokens[16] == '+'"); - t.is (tokens[16].second, Lexer::typeOperator, "tokens[16] == typeOperator"); - t.is (tokens[17].first, "-", "tokens[17] == '-'"); - t.is (tokens[17].second, Lexer::typeOperator, "tokens[17] == typeOperator"); - t.is (tokens[18].first, "<", "tokens[18] == '<'"); - t.is (tokens[18].second, Lexer::typeOperator, "tokens[18] == typeOperator"); - t.is (tokens[19].first, "(", "tokens[19] == '('"); - t.is (tokens[19].second, Lexer::typeOperator, "tokens[19] == typeOperator"); - t.is (tokens[20].first, ")", "tokens[20] == ')'"); - t.is (tokens[20].second, Lexer::typeOperator, "tokens[20] == typeOperator"); // 170 + t.is ((int)tokens.size (), 21, "21 operators"); + t.is (tokens[0].first, "and", "tokens[0] == 'and'"); + t.is ((int) tokens[0].second, (int) Lexer2::Type::op, "tokens[0] == Type::op"); // 130 + t.is (tokens[1].first, "xor", "tokens[1] == 'xor'"); + t.is ((int) tokens[1].second, (int) Lexer2::Type::op, "tokens[1] == Type::op"); + t.is (tokens[2].first, "or", "tokens[2] == 'or'"); + t.is 
((int) tokens[2].second, (int) Lexer2::Type::op, "tokens[2] == Type::op"); + t.is (tokens[3].first, "<=", "tokens[3] == '<='"); + t.is ((int) tokens[3].second, (int) Lexer2::Type::op, "tokens[3] == Type::op"); + t.is (tokens[4].first, ">=", "tokens[4] == '>='"); + t.is ((int) tokens[4].second, (int) Lexer2::Type::op, "tokens[4] == Type::op"); + t.is (tokens[5].first, "!~", "tokens[5] == '!~'"); + t.is ((int) tokens[5].second, (int) Lexer2::Type::op, "tokens[5] == Type::op"); // 140 + t.is (tokens[6].first, "!=", "tokens[6] == '!='"); + t.is ((int) tokens[6].second, (int) Lexer2::Type::op, "tokens[6] == Type::op"); + t.is (tokens[7].first, "==", "tokens[7] == '=='"); + t.is ((int) tokens[7].second, (int) Lexer2::Type::op, "tokens[7] == Type::op"); + t.is (tokens[8].first, "=", "tokens[8] == '='"); + t.is ((int) tokens[8].second, (int) Lexer2::Type::op, "tokens[8] == Type::op"); + t.is (tokens[9].first, "^", "tokens[9] == '^'"); + t.is ((int) tokens[9].second, (int) Lexer2::Type::op, "tokens[9] == Type::op"); + t.is (tokens[10].first, ">", "tokens[10] == '>'"); + t.is ((int) tokens[10].second, (int) Lexer2::Type::op, "tokens[10] == Type::op"); // 150 + t.is (tokens[11].first, "~", "tokens[11] == '~'"); + t.is ((int) tokens[11].second, (int) Lexer2::Type::op, "tokens[11] == Type::op"); + t.is (tokens[12].first, "!", "tokens[12] == '!'"); + t.is ((int) tokens[12].second, (int) Lexer2::Type::op, "tokens[12] == Type::op"); + t.is (tokens[13].first, "*", "tokens[13] == '*'"); + t.is ((int) tokens[13].second, (int) Lexer2::Type::op, "tokens[13] == Type::op"); + t.is (tokens[14].first, "/", "tokens[14] == '/'"); + t.is ((int) tokens[14].second, (int) Lexer2::Type::op, "tokens[14] == Type::op"); + t.is (tokens[15].first, "%", "tokens[15] == '%'"); + t.is ((int) tokens[15].second, (int) Lexer2::Type::op, "tokens[15] == Type::op"); // 160 + t.is (tokens[16].first, "+", "tokens[16] == '+'"); + t.is ((int) tokens[16].second, (int) Lexer2::Type::op, "tokens[16] == Type::op"); + t.is (tokens[17].first, "-", "tokens[17] == '-'"); + t.is ((int) tokens[17].second, (int) Lexer2::Type::op, "tokens[17] == Type::op"); + t.is (tokens[18].first, "<", "tokens[18] == '<'"); + t.is ((int) tokens[18].second, (int) Lexer2::Type::op, "tokens[18] == Type::op"); + t.is (tokens[19].first, "(", "tokens[19] == '('"); + t.is ((int) tokens[19].second, (int) Lexer2::Type::op, "tokens[19] == Type::op"); + t.is (tokens[20].first, ")", "tokens[20] == ')'"); + t.is ((int) tokens[20].second, (int)Lexer2::Type::op, "tokens[20] == Type::op"); // 170 // Test ordinal dates. 
- Lexer l8 ("9th 10th"); + Lexer2 l8 ("9th 10th"); l8.ambiguity (false); tokens.clear (); while (l8.token (token, type)) { - std::cout << "# «" << token << "» " << type << " " << Lexer::type_name (type) << "\n"; - tokens.push_back (std::pair (token, type)); + std::cout << "# «" << token << "» " << Lexer2::typeName (type) << "\n"; + tokens.push_back (std::pair (token, type)); } - t.is ((int)tokens.size (), 2, "2 tokens"); - t.is (tokens[0].first, "9th", "tokens[0] == '9th'"); - t.is (tokens[0].second, Lexer::typeIdentifier, "tokens[0] == typeIdentifier"); - t.is (tokens[1].first, "10th", "tokens[1] == '10th'"); - t.is (tokens[1].second, Lexer::typeIdentifier, "tokens[1] == typeIdentifier"); + t.is ((int)tokens.size (), 2, "2 tokens"); + t.is (tokens[0].first, "9th", "tokens[0] == '9th'"); + t.is ((int) tokens[0].second, (int) Lexer2::Type::identifier, "tokens[0] == Type::identifier"); + t.is (tokens[1].first, "10th", "tokens[1] == '10th'"); + t.is ((int) tokens[1].second, (int) Lexer2::Type::identifier, "tokens[1] == Type::identifier"); // Test tag recognition. - Lexer l9 ("+with -WITHOUT + 2"); + Lexer2 l9 ("+with -WITHOUT + 2"); l9.ambiguity (false); tokens.clear (); while (l9.token (token, type)) { - std::cout << "# «" << token << "» " << type << " " << Lexer::type_name (type) << "\n"; - tokens.push_back (std::pair (token, type)); + std::cout << "# «" << token << "» " << Lexer2::typeName (type) << "\n"; + tokens.push_back (std::pair (token, type)); } - t.is ((int)tokens.size (), 4, "4 tokens"); - t.is (tokens[0].first, "+with", "tokens[0] == '+with'"); - t.is (tokens[0].second, Lexer::typeTag, "tokens[0] == typeTag"); - t.is (tokens[1].first, "-WITHOUT", "tokens[1] == '-WITHOUT'"); - t.is (tokens[1].second, Lexer::typeTag, "tokens[1] == typeTag"); - t.is (tokens[2].first, "+", "tokens[2] == '+'"); - t.is (tokens[2].second, Lexer::typeOperator, "tokens[2] == typeOperator"); - t.is (tokens[3].first, "2", "tokens[3] == '2'"); - t.is (tokens[3].second, Lexer::typeNumber, "tokens[3] == typeNumber"); + t.is ((int)tokens.size (), 4, "4 tokens"); + t.is (tokens[0].first, "+with", "tokens[0] == '+with'"); + t.is ((int) tokens[0].second, (int) Lexer2::Type::tag, "tokens[0] == Type::tag"); + t.is (tokens[1].first, "-WITHOUT", "tokens[1] == '-WITHOUT'"); + t.is ((int) tokens[1].second, (int) Lexer2::Type::tag, "tokens[1] == Type::tag"); + t.is (tokens[2].first, "+", "tokens[2] == '+'"); + t.is ((int) tokens[2].second, (int) Lexer2::Type::op, "tokens[2] == Type::op"); + t.is (tokens[3].first, "2", "tokens[3] == '2'"); + t.is ((int) tokens[3].second, (int) Lexer2::Type::number, "tokens[3] == Type::number"); - // void word_split (std::vector&, const std::string&); + // void split (std::vector&, const std::string&); std::string unsplit = " ( A or B ) "; std::vector items; - Lexer::word_split (items, unsplit); - t.is (items.size (), (size_t) 5, "word_split ' ( A or B ) '"); - t.is (items[0], "(", "word_split ' ( A or B ) ' -> [0] '('"); - t.is (items[1], "A", "word_split ' ( A or B ) ' -> [1] 'A'"); - t.is (items[2], "or", "word_split ' ( A or B ) ' -> [2] 'or'"); - t.is (items[3], "B", "word_split ' ( A or B ) ' -> [3] 'B'"); - t.is (items[4], ")", "word_split ' ( A or B ) ' -> [4] ')'"); + items = Lexer2::split (unsplit); + t.is (items.size (), (size_t) 5, "split ' ( A or B ) '"); + t.is (items[0], "(", "split ' ( A or B ) ' -> [0] '('"); + t.is (items[1], "A", "split ' ( A or B ) ' -> [1] 'A'"); + t.is (items[2], "or", "split ' ( A or B ) ' -> [2] 'or'"); + t.is (items[3], "B", "split ' ( A or B ) ' 
-> [3] 'B'"); + t.is (items[4], ")", "split ' ( A or B ) ' -> [4] ')'"); // Test simple mode with contrived tokens that ordinarily split. unsplit = " +-* a+b 12.3e4 'c d'"; - Lexer::word_split (items, unsplit); - t.is (items.size (), (size_t) 4, "word_split ' +-* a+b 12.3e4 'c d''"); - t.is (items[0], "+-*", "word_split ' +-* a+b 12.3e4 'c d'' -> [0] '+-*'"); - t.is (items[1], "a+b", "word_split ' +-* a+b 12.3e4 'c d'' -> [1] 'a+b'"); - t.is (items[2], "12.3e4", "word_split ' +-* a+b 12.3e4 'c d'' -> [2] '12.3e4'"); - t.is (items[3], "'c d'", "word_split ' +-* a+b 12.3e4 'c d'' -> [3] 'c d'"); + items = Lexer2::split (unsplit); + t.is (items.size (), (size_t) 8, "split ' +-* a+b 12.3e4 'c d''"); + t.is (items[0], "+", "split ' +-* a+b 12.3e4 'c d'' -> [0] '+'"); + t.is (items[1], "-", "split ' +-* a+b 12.3e4 'c d'' -> [1] '-'"); + t.is (items[2], "*", "split ' +-* a+b 12.3e4 'c d'' -> [2] '*'"); + t.is (items[3], "a", "split ' +-* a+b 12.3e4 'c d'' -> [3] 'a'"); + t.is (items[4], "+", "split ' +-* a+b 12.3e4 'c d'' -> [4] '+'"); + t.is (items[5], "b", "split ' +-* a+b 12.3e4 'c d'' -> [5] 'b'"); + t.is (items[6], "12.3e4", "split ' +-* a+b 12.3e4 'c d'' -> [6] '12.3e4'"); + t.is (items[7], "c d", "split ' +-* a+b 12.3e4 'c d'' -> [7] 'c d'"); // Test common expression element. unsplit = "name=value"; - Lexer::token_split (items, unsplit); - t.is (items.size (), (size_t) 3, "token_split 'name=value'"); - if (items.size () == 3) - { - t.is (items[0], "name", "token_split 'name=value' -> [0] 'name'"); - t.is (items[1], "=", "token_split 'name=value' -> [1] '='"); - t.is (items[2], "value", "token_split 'name=value' -> [2] 'value'"); - } - else - { - t.fail ("token_split 'name=value' -> [0] 'name'"); - t.fail ("token_split 'name=value' -> [1] '='"); - t.fail ("token_split 'name=value' -> [2] 'value'"); - } + items = Lexer2::split (unsplit); + t.is (items.size (), (size_t) 1, "split 'name=value'"); // Test unterminated tokens. unsplit = " ordinary "; - Lexer::token_split (items, unsplit); - t.is (items.size (), (size_t) 1, "token_split 'ordinary' --> 1 token"); - t.is (items[0], "ordinary", "token_split 'ordinary' --> 'ordinary'"); + items = Lexer2::split (unsplit); + t.is (items.size (), (size_t) 1, "split 'ordinary' --> 1 token"); + t.is (items[0], "ordinary", "split 'ordinary' --> 'ordinary'"); return 0; }