- Implemented boundary detection hints.
This commit is contained in:
Paul Beckingham 2014-06-18 17:45:25 -04:00
parent cbb6decf93
commit 008ba6ecab
2 changed files with 27 additions and 0 deletions

View file

@ -24,6 +24,7 @@
// //
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
#include <ctype.h>
#include <utf8.h> #include <utf8.h>
#include <ISO8601.h> #include <ISO8601.h>
#include <Date.h> #include <Date.h>
@ -41,6 +42,9 @@ Lexer::Lexer (const std::string& input)
, _n1 (32) , _n1 (32)
, _n2 (32) , _n2 (32)
, _n3 (32) , _n3 (32)
, _boundary01 (false)
, _boundary12 (false)
, _boundary23 (false)
, _ambiguity (true) , _ambiguity (true)
{ {
// Read 4 chars in preparation. Even if there are < 4. Take a deep breath. // Read 4 chars in preparation. Even if there are < 4. Take a deep breath.
@ -636,6 +640,20 @@ bool Lexer::is_ws (int c)
c == 0x3000); // ideographic space Common Separator, space c == 0x3000); // ideographic space Common Separator, space
} }
////////////////////////////////////////////////////////////////////////////////
// Reports whether a character-class boundary lies between two adjacent
// characters.
//
// Returns true when exactly one of 'left' and 'right' is a digit, exactly one
// is a letter, or exactly one is whitespace, or when either of them is
// punctuation. Returns false otherwise.
//
// The <ctype.h> classifiers have undefined behavior for arguments that are
// neither EOF nor representable as unsigned char. Values outside 0..0xFF are
// therefore never handed to them, and are treated as belonging to none of the
// four classes.
bool Lexer::boundary (int left, int right)
{
  const bool leftNarrow  = left  >= 0 && left  <= 0xFF;
  const bool rightNarrow = right >= 0 && right <= 0xFF;

  // XOR: the two characters disagree on membership of a class.
  if ((leftNarrow && isdigit (left)) != (rightNarrow && isdigit (right))) return true;
  if ((leftNarrow && isalpha (left)) != (rightNarrow && isalpha (right))) return true;
  if ((leftNarrow && isspace (left)) != (rightNarrow && isspace (right))) return true;

  // OR: punctuation on either side always forms a boundary.
  if ((leftNarrow && ispunct (left)) || (rightNarrow && ispunct (right))) return true;

  return false;
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Split 'input' into 'words' on Lexer::is_ws boundaries, observing quotes. // Split 'input' into 'words' on Lexer::is_ws boundaries, observing quotes.
void Lexer::word_split (std::vector <std::string>& words, const std::string& input) void Lexer::word_split (std::vector <std::string>& words, const std::string& input)
@ -814,6 +832,11 @@ void Lexer::shift ()
_n2 = _n3; _n2 = _n3;
_n3 = utf8_next_char (_input, _i); _n3 = utf8_next_char (_input, _i);
++_shift_counter; ++_shift_counter;
// Detect type boundaries between characters.
_boundary01 = boundary (_n0, _n1);
_boundary12 = boundary (_n1, _n2);
_boundary23 = boundary (_n2, _n3);
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////

View file

@ -65,6 +65,7 @@ public:
static const std::string type_name (const Type&); static const std::string type_name (const Type&);
static bool is_ws (int); static bool is_ws (int);
static bool boundary (int, int);
static void word_split (std::vector <std::string>&, const std::string&); static void word_split (std::vector <std::string>&, const std::string&);
static void token_split (std::vector <std::string>&, const std::string&); static void token_split (std::vector <std::string>&, const std::string&);
static void token_split (std::vector <std::pair <std::string, Lexer::Type> >&, const std::string&); static void token_split (std::vector <std::pair <std::string, Lexer::Type> >&, const std::string&);
@ -93,6 +94,9 @@ private:
int _n1; int _n1;
int _n2; int _n2;
int _n3; int _n3;
bool _boundary01;
bool _boundary12;
bool _boundary23;
bool _ambiguity; bool _ambiguity;
}; };