From 611812007a574b2d9fb19063eac0ea2f0bade1aa Mon Sep 17 00:00:00 2001
From: Paul Beckingham
Date: Wed, 23 Apr 2014 23:19:41 -0400
Subject: [PATCH] Lexer

- Implemented Lexer::word, which is just like ::token, but does not understand
  dates, durations or operators.
- Implemented Lexer::split, which uses Lexer::word.
- Added unit tests.
---
Review notes (not part of the commit message):
- Lexer::word: typeEscapeUnicode no longer falls through into typeNumber, and
  typeIdentifierEscape / typeExponentIndicator no longer loop forever when the
  next character is not one they recognise.
- Lexer::word still runs the speculative date/duration parse for digits, which
  contradicts the commit message above; flagged in the code, not changed.
- Line breaks and indentation were reconstructed from a collapsed copy of this
  patch.  Whitespace on context lines may need adjusting against the tree, and
  the commit/index hashes predate these edits.

 src/Lexer.cpp    | 331 ++++++++++++++++++++++++++++++++++++++++++++++-
 src/Lexer.h      |   2 +
 test/lexer.t.cpp |  13 +-
 3 files changed, 343 insertions(+), 3 deletions(-)

diff --git a/src/Lexer.cpp b/src/Lexer.cpp
index 506208984..81df8510c 100644
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -393,6 +393,322 @@ bool Lexer::token (std::string& token, Type& type)
   return false;
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Just like Lexer::token, but no operators.  Skips leading whitespace, then
+// accumulates one word into 'token' and reports its kind in 'type'.  Returns
+// true when a word was produced, false when the input is exhausted.
+//
+// NOTE(review): the digit branch still runs the speculative date/duration
+// parse, so typeDate and typeDuration can be returned here - confirm whether
+// that parse is meant to be removed from ::word.
+bool Lexer::word (std::string& token, Type& type)
+{
+  // Start with nothing.
+  token = "";
+
+  // Different types of matching quote: ', ".
+  int quote = 0;
+
+  type = typeNone;
+  while (_n0)
+  {
+    switch (type)
+    {
+    case typeNone:
+      if (is_ws (_n0))
+        shift ();
+      else if (_n0 == '"' || _n0 == '\'')
+      {
+        type = typeString;
+        quote = _n0;
+        shift ();
+      }
+      else if (_n0 == '0' &&
+               _n1 == 'x' &&
+               is_hex_digit (_n2))
+      {
+        type = typeHex;
+        token += utf8_character (_n0);
+        shift ();
+        token += utf8_character (_n0);
+        shift ();
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (is_dec_digit (_n0))
+      {
+        // Speculatively try a date and duration parse. Longest wins.
+        std::string::size_type iso_i = 0;
+        std::string iso_token;
+        ISO8601d iso;
+        iso.ambiguity (_ambiguity);
+        if (iso.parse (_input.substr (_i < 4 ? 0 : _i - 4), iso_i))
+          iso_token = _input.substr ((_i < 4 ? 0 : _i - 4), iso_i);
+
+        std::string::size_type dur_i = 0;
+        std::string dur_token;
+        Duration dur;
+        if (dur.parse (_input.substr (_i < 4 ? 0 : _i - 4), dur_i))
+          dur_token = _input.substr ((_i < 4 ? 0 : _i - 4), dur_i);
+
+        if (iso_token.length () > dur_token.length ())
+        {
+          while (iso_i--) shift ();
+          token = iso_token;
+          type = typeDate;
+          return true;
+        }
+        else if (dur_token.length () > iso_token.length ())
+        {
+          while (dur_i--) shift ();
+          token = dur_token;
+          type = typeDuration;
+          return true;
+        }
+
+        type = typeNumber;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == '.' && is_dec_digit (_n1))
+      {
+        type = typeDecimal;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == '\\')
+      {
+        type = typeIdentifierEscape;
+        shift ();
+      }
+      else if (is_ident_start (_n0))
+      {
+        type = typeIdentifier;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+        throw std::string ("Unexpected error 1");
+      break;
+
+    case typeString:
+      if (_n0 == quote)
+      {
+        shift ();
+        quote = 0;
+        return true;
+      }
+      else if (_n0 == '\\')
+      {
+        type = typeEscape;
+        shift ();
+      }
+      else
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      break;
+
+    case typeIdentifier:
+      if (is_ident (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        return true;
+      }
+      break;
+
+    case typeIdentifierEscape:
+      if (_n0 == 'u')
+      {
+        type = typeEscapeUnicode;
+        shift ();
+      }
+      else
+      {
+        // Any other escaped character is taken literally.  Without this the
+        // state consumed nothing and the loop never terminated.
+        type = typeIdentifier;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      break;
+
+    case typeEscape:
+      if (_n0 == 'x')
+      {
+        type = typeEscapeHex;
+        shift ();
+      }
+      else if (_n0 == 'u')
+      {
+        type = typeEscapeUnicode;
+        shift ();
+      }
+      else
+      {
+        token += decode_escape (_n0);
+        type = quote ? typeString : typeIdentifier;
+        shift ();
+      }
+      break;
+
+    case typeEscapeHex:
+      if (is_hex_digit (_n0) && is_hex_digit (_n1))
+      {
+        token += utf8_character (hex_to_int (_n0, _n1));
+        type = quote ? typeString : typeIdentifier;
+        shift ();
+        shift ();
+      }
+      else
+      {
+        type = quote ? typeString : typeIdentifier;
+        shift ();
+        quote = 0;
+        return true;
+      }
+      break;
+
+    case typeEscapeUnicode:
+      if (is_hex_digit (_n0) &&
+          is_hex_digit (_n1) &&
+          is_hex_digit (_n2) &&
+          is_hex_digit (_n3))
+      {
+        token += utf8_character (hex_to_int (_n0, _n1, _n2, _n3));
+        shift ();
+        shift ();
+        shift ();
+        shift ();
+        type = quote ? typeString : typeIdentifier;
+      }
+      else if (_n0 == quote)
+      {
+        type = typeString;
+        shift ();
+        quote = 0;
+        return true;
+      }
+      else
+      {
+        // Malformed \u escape: give up on it, as typeEscapeHex does.
+        type = quote ? typeString : typeIdentifier;
+        shift ();
+        quote = 0;
+        return true;
+      }
+      break;
+
+    case typeNumber:
+      if (is_dec_digit (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == '.')
+      {
+        type = typeDecimal;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == 'e' || _n0 == 'E')
+      {
+        type = typeExponentIndicator;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        return true;
+      }
+      break;
+
+    case typeDecimal:
+      if (is_dec_digit (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == 'e' || _n0 == 'E')
+      {
+        type = typeExponentIndicator;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        return true;
+      }
+      break;
+
+    case typeExponentIndicator:
+      if (_n0 == '+' || _n0 == '-')
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (is_dec_digit (_n0))
+      {
+        type = typeExponent;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        // Not a sign or digit: end the word rather than spin without
+        // consuming any input.
+        return true;
+      }
+      break;
+
+    case typeExponent:
+      if (is_dec_digit (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == '.')
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        type = typeDecimal;
+        return true;
+      }
+      break;
+
+    case typeHex:
+      if (is_hex_digit (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        return true;
+      }
+      break;
+
+    default:
+      throw std::string ("Unexpected error 2");
+      break;
+    }
+
+    // Fence post.
+    if (!_n0 && token != "")
+      return true;
+  }
+
+  return false;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 void Lexer::ambiguity (bool value)
 {
@@ -457,6 +773,19 @@ bool Lexer::is_ws (int c)
           c == 0x3000); // ideographic space Common Separator, space
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Split 'input' into 'words' on Lexer::is_ws boundaries, observing quotes.
+void Lexer::split (std::vector <std::string>& words, const std::string& input)
+{
+  words.clear ();
+
+  std::string word;
+  Lexer::Type type;
+  Lexer lex (input);
+  while (lex.word (word, type))
+    words.push_back (word);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 bool Lexer::is_punct (int c) const
 {
@@ -593,8 +922,6 @@ void Lexer::shift ()
   _n1 = _n2;
   _n2 = _n3;
   _n3 = utf8_next_char (_input, _i);
-
-  //std::cout << "# shift [" << (char) _n0 << (char) _n1 << (char) _n2 << (char) _n3 << "]\n";
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/src/Lexer.h b/src/Lexer.h
index 9010b6ea8..6d6a76043 100644
--- a/src/Lexer.h
+++ b/src/Lexer.h
@@ -58,10 +58,12 @@ public:
   Lexer& operator= (const Lexer&); // Not implemented.
   bool operator== (const Lexer&); // Not implemented.
   bool token (std::string&, Type&);
+  bool word (std::string&, Type&);
   void ambiguity (bool);
 
   static const std::string type_name (const Type&);
   static bool is_ws (int);
+  static void split (std::vector <std::string>&, const std::string&);
 
 private:
   bool is_punct (int) const;
diff --git a/test/lexer.t.cpp b/test/lexer.t.cpp
index f850650fd..bff4f2e87 100644
--- a/test/lexer.t.cpp
+++ b/test/lexer.t.cpp
@@ -36,7 +36,7 @@ Context context;
 ////////////////////////////////////////////////////////////////////////////////
 int main (int argc, char** argv)
 {
-  UnitTest t (170);
+  UnitTest t (176);
 
   std::vector <std::pair <std::string, Lexer::Type> > tokens;
   std::string token;
@@ -299,6 +299,17 @@ int main (int argc, char** argv)
   t.is (tokens[20].first, ")", "tokens[20] == ')'");
   t.is (tokens[20].second, Lexer::typeOperator, "tokens[20] == typeOperator"); // 170
 
+  // void split (std::vector<std::string>&, const std::string&);
+  std::string unsplit = " ( A or B ) ";
+  std::vector <std::string> items;
+  Lexer::split (items, unsplit);
+  t.is (items.size (), (size_t) 5, "split ' ( A or B ) '");
+  t.is (items[0], "(", "split ' ( A or B ) ' -> [0] '('");
+  t.is (items[1], "A", "split ' ( A or B ) ' -> [1] 'A'");
+  t.is (items[2], "or", "split ' ( A or B ) ' -> [2] 'or'");
+  t.is (items[3], "B", "split ' ( A or B ) ' -> [3] 'B'");
+  t.is (items[4], ")", "split ' ( A or B ) ' -> [4] ')'");
+
   return 0;
 }
 