diff --git a/src/Lexer.cpp b/src/Lexer.cpp
index b31a69242..53c690026 100644
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -24,6 +24,7 @@
 //
 ////////////////////////////////////////////////////////////////////////////////
 
+#include
 #include
 #include
 #include
@@ -41,6 +42,9 @@ Lexer::Lexer (const std::string& input)
 , _n1 (32)
 , _n2 (32)
 , _n3 (32)
+, _boundary01 (false)
+, _boundary12 (false)
+, _boundary23 (false)
 , _ambiguity (true)
 {
   // Read 4 chars in preparation. Even if there are < 4. Take a deep breath.
@@ -636,6 +640,31 @@ bool Lexer::is_ws (int c)
           c == 0x3000); // ideographic space Common Separator, space
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Decides whether a token-type boundary lies between two adjacent characters.
+// A boundary exists when exactly one of the pair is a digit, a letter or
+// whitespace, or when either of them is punctuation.
+//
+// shift () feeds this with values from utf8_next_char, presumably Unicode code
+// points that can exceed UCHAR_MAX.  The <cctype> classifiers are undefined for
+// such arguments, so only values that fit an unsigned char reach them; every
+// other value is treated as none of digit/letter/whitespace/punctuation.
+bool Lexer::boundary (int left, int right)
+{
+  const bool leftSafe  = (left  == static_cast <unsigned char> (left));
+  const bool rightSafe = (right == static_cast <unsigned char> (right));
+
+  // XOR: exactly one side belongs to the class.
+  if ((leftSafe && isdigit (left)) != (rightSafe && isdigit (right))) return true;
+  if ((leftSafe && isalpha (left)) != (rightSafe && isalpha (right))) return true;
+  if ((leftSafe && isspace (left)) != (rightSafe && isspace (right))) return true;
+
+  // OR: punctuation on either side always separates.
+  if ((leftSafe && ispunct (left)) || (rightSafe && ispunct (right))) return true;
+
+  return false;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Split 'input' into 'words' on Lexer::is_ws boundaries, observing quotes.
 void Lexer::word_split (std::vector & words, const std::string& input)
@@ -814,6 +843,11 @@ void Lexer::shift ()
   _n2 = _n3;
   _n3 = utf8_next_char (_input, _i);
   ++_shift_counter;
+
+  // Detect type boundaries between characters.
+  _boundary01 = boundary (_n0, _n1);
+  _boundary12 = boundary (_n1, _n2);
+  _boundary23 = boundary (_n2, _n3);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/src/Lexer.h b/src/Lexer.h
index 1d21161db..698ac9f32 100644
--- a/src/Lexer.h
+++ b/src/Lexer.h
@@ -65,6 +65,7 @@ public:
 
   static const std::string type_name (const Type&);
   static bool is_ws (int);
+  static bool boundary (int, int);
   static void word_split (std::vector &, const std::string&);
   static void token_split (std::vector &, const std::string&);
   static void token_split (std::vector >&, const std::string&);
@@ -93,6 +94,9 @@ private:
   int _n1;
   int _n2;
   int _n3;
+  bool _boundary01;
+  bool _boundary12;
+  bool _boundary23;
   bool _ambiguity;
 };
 