From fa0c0e5fa76682c256443e9dc77f816ebf223d85 Mon Sep 17 00:00:00 2001 From: Paul Beckingham Date: Sun, 20 Dec 2015 22:00:02 -0500 Subject: [PATCH] Lexer: Added number support --- src/Lexer.cpp | 105 +++++++++++++++++++++++++++++++++++++++++++++++ src/Lexer.h | 5 ++- test/lexer.t.cpp | 32 ++++++++++++++- 3 files changed, 140 insertions(+), 2 deletions(-) diff --git a/src/Lexer.cpp b/src/Lexer.cpp index e5052db2..bfe28f65 100644 --- a/src/Lexer.cpp +++ b/src/Lexer.cpp @@ -53,6 +53,7 @@ bool Lexer::token (std::string& token, Lexer::Type& type) if (isString (token, type, "'\"") || isHexNumber (token, type) || + isNumber (token, type) || isWord (token, type)) return true; @@ -65,6 +66,7 @@ const std::string Lexer::typeName (const Lexer::Type& type) { switch (type) { + case Lexer::Type::number: return "number"; case Lexer::Type::hex: return "hex"; case Lexer::Type::string: return "string"; case Lexer::Type::word: return "word"; @@ -115,6 +117,15 @@ bool Lexer::isWhitespace (int c) c == 0x3000); // ideographic space Common Separator, space } +//////////////////////////////////////////////////////////////////////////////// +// Digits 0-9. +// +// TODO This list should be derived from the Unicode database. +bool Lexer::isDigit (int c) +{ + return c >= 0x30 && c <= 0x39; +} + //////////////////////////////////////////////////////////////////////////////// // Digits 0-9 a-f A-F. bool Lexer::isHexDigit (int c) @@ -124,6 +135,99 @@ bool Lexer::isHexDigit (int c) (c >= 'A' && c <= 'F'); } +//////////////////////////////////////////////////////////////////////////////// +// Lexer::Type::number +// \d+ +// [ . \d+ ] +// [ e|E [ +|- ] \d+ [ . \d+ ] ] +// not followed by non-operator. 
+bool Lexer::isNumber (std::string& token, Lexer::Type& type) +{ + std::size_t marker = _cursor; + + if (isDigit (_text[marker])) + { + ++marker; + while (isDigit (_text[marker])) + utf8_next_char (_text, marker); + + if (_text[marker] == '.') + { + ++marker; + if (isDigit (_text[marker])) + { + ++marker; + while (isDigit (_text[marker])) + utf8_next_char (_text, marker); + } + } + + if (_text[marker] == 'e' || + _text[marker] == 'E') + { + ++marker; + + if (_text[marker] == '+' || + _text[marker] == '-') + ++marker; + + if (isDigit (_text[marker])) + { + ++marker; + while (isDigit (_text[marker])) + utf8_next_char (_text, marker); + + if (_text[marker] == '.') + { + ++marker; + if (isDigit (_text[marker])) + { + ++marker; + while (isDigit (_text[marker])) + utf8_next_char (_text, marker); + } + } + } + } + + // Lookahead: !isWhitespace | !isSingleCharOperator + // If there is an immediately consecutive character, that is not an operator, fail. + if (_eos > marker && + ! isWhitespace (_text[marker]) && + ! isSingleCharOperator (_text[marker])) + return false; + + token = _text.substr (_cursor, marker - _cursor); + type = Lexer::Type::number; + _cursor = marker; + return true; + } + + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// Lexer::Type::number +// \d+ +bool Lexer::isInteger (std::string& token, Lexer::Type& type) +{ + std::size_t marker = _cursor; + + if (isDigit (_text[marker])) + { + ++marker; + while (isDigit (_text[marker])) + utf8_next_char (_text, marker); + + token = _text.substr (_cursor, marker - _cursor); + type = Lexer::Type::number; + _cursor = marker; + return true; + } + + return false; +} + //////////////////////////////////////////////////////////////////////////////// bool Lexer::isSingleCharOperator (int c) { @@ -293,6 +397,7 @@ std::string Lexer::typeToString (Lexer::Type type) { if (type == Lexer::Type::string) return std::string ("\033[38;5;7m\033[48;5;3m") + "string" + "\033[0m"; else if (type == 
Lexer::Type::hex) return std::string ("\033[38;5;7m\033[48;5;14m") + "hex" + "\033[0m"; + else if (type == Lexer::Type::number) return std::string ("\033[38;5;7m\033[48;5;6m") + "number" + "\033[0m"; else if (type == Lexer::Type::word) return std::string ("\033[38;5;15m\033[48;5;236m") + "word" + "\033[0m"; else return std::string ("\033[37;41m") + "unknown" + "\033[0m"; } diff --git a/src/Lexer.h b/src/Lexer.h index ebd5655c..562fe83c 100644 --- a/src/Lexer.h +++ b/src/Lexer.h @@ -35,7 +35,7 @@ class Lexer { public: - enum class Type { hex, + enum class Type { number, hex, string, word }; @@ -46,6 +46,7 @@ public: // Static helpers. static const std::string typeName (const Lexer::Type&); static bool isWhitespace (int); + static bool isDigit (int); static bool isHexDigit (int); static bool isSingleCharOperator (int); static bool isHardBoundary (int, int); @@ -61,6 +62,8 @@ public: // Stream Classifiers. bool isEOS () const; bool isString (std::string&, Lexer::Type&, const std::string&); + bool isNumber (std::string&, Lexer::Type&); + bool isInteger (std::string&, Lexer::Type&); bool isHexNumber (std::string&, Lexer::Type&); bool isWord (std::string&, Lexer::Type&); diff --git a/test/lexer.t.cpp b/test/lexer.t.cpp index 5f483bb2..fe747be3 100644 --- a/test/lexer.t.cpp +++ b/test/lexer.t.cpp @@ -34,7 +34,7 @@ //////////////////////////////////////////////////////////////////////////////// int main (int, char**) { - UnitTest t (131); + UnitTest t (171); std::vector > tokens; std::string token; @@ -77,6 +77,31 @@ int main (int, char**) Lexer l1 (" \t "); t.notok (l1.token (token, type), "' \\t ' --> no tokens"); + // Test for numbers that are no longer ISO-8601 dates. 
+ Lexer l3 ("1 12 123 1234 12345 123456 1234567"); + tokens.clear (); + while (l3.token (token, type)) + { + std::cout << "# «" << token << "» " << Lexer::typeName (type) << "\n"; + tokens.push_back (std::pair (token, type)); + } + + t.is ((int)tokens.size (), 7, "7 tokens"); + t.is (tokens[0].first, "1", "tokens[0] == '1'"); + t.is ((int) tokens[0].second, (int) Lexer::Type::number, "tokens[0] == Type::number"); + t.is (tokens[1].first, "12", "tokens[1] == '12'"); + t.is ((int) tokens[1].second, (int) Lexer::Type::number, "tokens[1] == Type::date"); + t.is (tokens[2].first, "123", "tokens[2] == '123'"); + t.is ((int) tokens[2].second, (int) Lexer::Type::number, "tokens[2] == Type::number"); // 70 + t.is (tokens[3].first, "1234", "tokens[3] == '1234'"); + t.is ((int) tokens[3].second, (int) Lexer::Type::number, "tokens[3] == Type::date"); + t.is (tokens[4].first, "12345", "tokens[4] == '12345'"); + t.is ((int) tokens[4].second, (int) Lexer::Type::number, "tokens[4] == Type::number"); + t.is (tokens[5].first, "123456", "tokens[5] == '123456'"); + t.is ((int) tokens[5].second, (int) Lexer::Type::number, "tokens[5] == Type::date"); + t.is (tokens[6].first, "1234567", "tokens[6] == '1234567'"); + t.is ((int) tokens[6].second, (int) Lexer::Type::number, "tokens[6] == Type::number"); + // static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&); std::string::size_type cursor = 0; std::string word; @@ -150,6 +175,10 @@ int main (int, char**) { "\"U+20AC4\"", { { "\"€4\"", Lexer::Type::string }, NO, NO, NO, NO }, }, // Number + { "1", { { "1", Lexer::Type::number }, NO, NO, NO, NO }, }, + { "3.14", { { "3.14", Lexer::Type::number }, NO, NO, NO, NO }, }, + { "6.02217e23", { { "6.02217e23", Lexer::Type::number }, NO, NO, NO, NO }, }, + { "1.2e-3.4", { { "1.2e-3.4", Lexer::Type::number }, NO, NO, NO, NO }, }, { "0x2f", { { "0x2f", Lexer::Type::hex }, NO, NO, NO, NO }, }, }; @@ -186,6 +215,7 @@ int main (int, char**) } } + t.is 
(Lexer::typeName (Lexer::Type::number), "number", "Lexer::typeName (Lexer::Type::number)"); t.is (Lexer::typeName (Lexer::Type::hex), "hex", "Lexer::typeName (Lexer::Type::hex)"); t.is (Lexer::typeName (Lexer::Type::string), "string", "Lexer::typeName (Lexer::Type::string)"); t.is (Lexer::typeName (Lexer::Type::word), "word", "Lexer::typeName (Lexer::Type::word)");