Lexer

- Implemented boundary detection hints.
2025-06-26 10:54:26 +02:00 · 2014-06-18 17:45:25 -04:00 · 2014-06-18 17:45:25 -04:00 · 008ba6ecab
commit 008ba6ecab
parent cbb6decf93
2 changed files with 27 additions and 0 deletions
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@ -24,6 +24,7 @@
 //
 ////////////////////////////////////////////////////////////////////////////////
 #include <ctype.h>
 #include <utf8.h>
 #include <ISO8601.h>
 #include <Date.h>
@ -41,6 +42,9 @@ Lexer::Lexer (const std::string& input)
 , _n1 (32)
 , _n2 (32)
 , _n3 (32)
 , _boundary01 (false)
 , _boundary12 (false)
 , _boundary23 (false)
 , _ambiguity (true)
 {
  // Read 4 chars in preparation.  Even if there are < 4.  Take a deep breath.
@ -636,6 +640,20 @@ bool Lexer::is_ws (int c)
          c == 0x3000);    // ideographic space Common  Separator, space
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Returns true if there is a character-type boundary between 'left' and
 // 'right': exactly one of the two is a digit, exactly one is alphabetic,
 // exactly one is whitespace, or either of them is punctuation.
 //
 // The <ctype.h> classifiers are only defined for EOF and for values that are
 // representable as unsigned char.  The arguments here may be any int (shift ()
 // passes values obtained from utf8_next_char), so anything outside 0..255 is
 // treated as belonging to none of the four classes instead of being handed to
 // the classifiers.  Results for values inside 0..255 are unchanged.
 bool Lexer::boundary (int left, int right)
 {
  // Guard: is it safe to call the <ctype.h> functions on each value?
  const bool left_ok  = (left  >= 0 && left  <= 0xFF);
  const bool right_ok = (right >= 0 && right <= 0xFF);

  const bool left_digit  = left_ok  && isdigit (left);
  const bool right_digit = right_ok && isdigit (right);
  const bool left_alpha  = left_ok  && isalpha (left);
  const bool right_alpha = right_ok && isalpha (right);
  const bool left_space  = left_ok  && isspace (left);
  const bool right_space = right_ok && isspace (right);
  const bool left_punct  = left_ok  && ispunct (left);
  const bool right_punct = right_ok && ispunct (right);

  // XOR: a change of class from one character to the next is a boundary.
  if (left_digit != right_digit) return true;
  if (left_alpha != right_alpha) return true;
  if (left_space != right_space) return true;

  // OR: punctuation always forms a boundary with its neighbor.
  if (left_punct || right_punct) return true;

  return false;
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Split 'input' into 'words' on Lexer::is_ws boundaries, observing quotes.
 void Lexer::word_split (std::vector <std::string>& words, const std::string& input)
@ -814,6 +832,11 @@ void Lexer::shift ()
  _n2 = _n3;
  _n3 = utf8_next_char (_input, _i);
  ++_shift_counter;
  // Detect type boundaries between characters.
  _boundary01 = boundary (_n0, _n1);
  _boundary12 = boundary (_n1, _n2);
  _boundary23 = boundary (_n2, _n3);
 }
 ////////////////////////////////////////////////////////////////////////////////
--- a/src/Lexer.h
+++ b/src/Lexer.h
@ -65,6 +65,7 @@ public:
  static const std::string type_name (const Type&);
  static bool is_ws (int);
  static bool boundary (int, int);
  static void word_split (std::vector <std::string>&, const std::string&);
  static void token_split (std::vector <std::string>&, const std::string&);
  static void token_split (std::vector <std::pair <std::string, Lexer::Type> >&, const std::string&);
@ -93,6 +94,9 @@ private:
  int _n1;
  int _n2;
  int _n3;
  bool _boundary01;
  bool _boundary12;
  bool _boundary23;
  bool _ambiguity;
 };