Lexer: Migrated to unicodeWhitespace

2025-06-26 10:54:26 +02:00 · 2018-01-25 00:47:23 -05:00 · 2018-01-25 00:47:23 -05:00 · 49dedfbc86
commit 49dedfbc86
parent 2c89688b46
4 changed files with 37 additions and 106 deletions
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@ -30,6 +30,7 @@
 #include <ctype.h>
 #include <Datetime.h>
 #include <Duration.h>
+#include <unicode.h>
 #include <utf8.h>

 static const std::string uuid_pattern = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx";
@ -59,7 +60,7 @@ Lexer::~Lexer ()
 bool Lexer::token (std::string& token, Lexer::Type& type)
 {
  // Eat white space.
-  while (isWhitespace (_text[_cursor]))
+  while (unicodeWhitespace (_text[_cursor]))
    utf8_next_char (_text, _cursor);

  // Terminate at EOS.
@ -142,48 +143,6 @@ const std::string Lexer::typeName (const Lexer::Type& type)
  return "unknown";
 }

-////////////////////////////////////////////////////////////////////////////////
-// Complete Unicode whitespace list.
-//
-// http://en.wikipedia.org/wiki/Whitespace_character
-// Updated 2015-09-13
-// Static
-//
-// TODO This list should be derived from the Unicode database.
-bool Lexer::isWhitespace (int c)
-{
-  return (c == 0x0020 ||   // space Common  Separator, space
-          c == 0x0009 ||   // Common  Other, control  HT, Horizontal Tab
-          c == 0x000A ||   // Common  Other, control  LF, Line feed
-          c == 0x000B ||   // Common  Other, control  VT, Vertical Tab
-          c == 0x000C ||   // Common  Other, control  FF, Form feed
-          c == 0x000D ||   // Common  Other, control  CR, Carriage return
-          c == 0x0085 ||   // Common  Other, control  NEL, Next line
-          c == 0x00A0 ||   // no-break space  Common  Separator, space
-          c == 0x1680 ||   // ogham space mark  Ogham Separator, space
-          c == 0x180E ||   // mongolian vowel separator Mongolian Separator, space
-          c == 0x2000 ||   // en quad Common  Separator, space
-          c == 0x2001 ||   // em quad Common  Separator, space
-          c == 0x2002 ||   // en space  Common  Separator, space
-          c == 0x2003 ||   // em space  Common  Separator, space
-          c == 0x2004 ||   // three-per-em space  Common  Separator, space
-          c == 0x2005 ||   // four-per-em space Common  Separator, space
-          c == 0x2006 ||   // six-per-em space  Common  Separator, space
-          c == 0x2007 ||   // figure space  Common  Separator, space
-          c == 0x2008 ||   // punctuation space Common  Separator, space
-          c == 0x2009 ||   // thin space  Common  Separator, space
-          c == 0x200A ||   // hair space  Common  Separator, space
-          c == 0x200B ||   // zero width space
-          c == 0x200C ||   // zero width non-joiner
-          c == 0x200D ||   // zero width joiner
-          c == 0x2028 ||   // line separator  Common  Separator, line
-          c == 0x2029 ||   // paragraph separator Common  Separator, paragraph
-          c == 0x202F ||   // narrow no-break space Common  Separator, space
-          c == 0x205F ||   // medium mathematical space Common  Separator, space
-          c == 0x2060 ||   // word joiner
-          c == 0x3000);    // ideographic space Common  Separator, space
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 bool Lexer::isAlpha (int c)
 {
@ -213,7 +172,7 @@ bool Lexer::isHexDigit (int c)
 bool Lexer::isIdentifierStart (int c)
 {
  return c                          &&  // Include null character check.
-         ! isWhitespace         (c) &&
+         ! unicodeWhitespace    (c) &&
         ! isDigit              (c) &&
         ! isSingleCharOperator (c) &&
         ! isPunctuation        (c);
@ -225,7 +184,7 @@ bool Lexer::isIdentifierNext (int c)
  return c                          &&  // Include null character check.
         c != ':'                   &&  // Used in isPair.
         c != '='                   &&  // Used in isPair.
-         ! isWhitespace         (c) &&
+         ! unicodeWhitespace    (c) &&
         ! isSingleCharOperator (c);
 }

@ -272,15 +231,15 @@ bool Lexer::isTripleCharOperator (int c0, int c1, int c2, int c3)
 bool Lexer::isBoundary (int left, int right)
 {
  // EOS
-  if (right == '\0')                                 return true;
+  if (right == '\0')                                          return true;

  // XOR
-  if (isAlpha (left)       != isAlpha (right))       return true;
-  if (isDigit (left)       != isDigit (right))       return true;
-  if (isWhitespace (left)  != isWhitespace (right))  return true;
+  if (isAlpha (left)           != isAlpha (right))            return true;
+  if (isDigit (left)           != isDigit (right))            return true;
+  if (unicodeWhitespace (left) != unicodeWhitespace (right))  return true;

  // OR
-  if (isPunctuation (left) || isPunctuation (right)) return true;
+  if (isPunctuation (left) || isPunctuation (right))          return true;

  return false;
 }
@ -289,7 +248,8 @@ bool Lexer::isBoundary (int left, int right)
 bool Lexer::isHardBoundary (int left, int right)
 {
  // EOS
-  if (right == '\0')                                               return true;
+  if (right == '\0')
+    return true;

  // FILTER operators that don't need to be surrounded by whitespace.
  if (left == '(' ||
@ -628,10 +588,10 @@ bool Lexer::isUUID (std::string& token, Lexer::Type& type, bool endBoundary)
      break;
  }

-  if (i >= uuid_min_length              &&
-      (! endBoundary                    ||
-       ! _text[marker + i]              ||
-       isWhitespace (_text[marker + i]) ||
+  if (i >= uuid_min_length                   &&
+      (! endBoundary                         ||
+       ! _text[marker + i]                   ||
+       unicodeWhitespace (_text[marker + i]) ||
       isSingleCharOperator (_text[marker + i])))
  {
    token = _text.substr (_cursor, i);
@ -726,10 +686,10 @@ bool Lexer::isNumber (std::string& token, Lexer::Type& type)
      }
    }

-    // Lookahead: !<isWhitespace> | !<isSingleCharOperator>
+    // Lookahead: !<unicodeWhitespace> | !<isSingleCharOperator>
    // If there is an immediately consecutive character, that is not an operator, fail.
    if (_eos > marker &&
-        ! isWhitespace (_text[marker]) &&
+        ! unicodeWhitespace (_text[marker]) &&
        ! isSingleCharOperator (_text[marker]))
      return false;

@ -806,7 +766,7 @@ bool Lexer::isURL (std::string& token, Lexer::Type& type)
      marker += 3;

      while (marker < _eos &&
-             ! isWhitespace (_text[marker]))
+             ! unicodeWhitespace (_text[marker]))
        utf8_next_char (_text, marker);

      token = _text.substr (_cursor, marker - _cursor);
@ -847,7 +807,7 @@ bool Lexer::isPair (std::string& token, Lexer::Type& type)
    if (readWord (_text, "'\"", _cursor, ignoredToken) ||
        readWord (_text,        _cursor, ignoredToken) ||
        isEOS ()                                       ||
-        isWhitespace (_text[_cursor]))
+        unicodeWhitespace (_text[_cursor]))
    {
      token = _text.substr (marker, _cursor - marker);
      type = Lexer::Type::pair;
@ -901,7 +861,7 @@ bool Lexer::isSet (std::string& token, Lexer::Type& type)
  // Success is multiple numbers, matching the pattern.
  if (count > 1 &&
      (isEOS () ||
-       isWhitespace (_text[_cursor]) ||
+       unicodeWhitespace (_text[_cursor]) ||
       isHardBoundary (_text[_cursor], _text[_cursor + 1])))
  {
    token = _text.substr (marker, _cursor - marker);
@ -916,7 +876,7 @@ bool Lexer::isSet (std::string& token, Lexer::Type& type)

 ////////////////////////////////////////////////////////////////////////////////
 // Lexer::Type::tag
-//   ^ | '(' | ')' | <isWhitespace>
+//   ^ | '(' | ')' | <unicodeWhitespace>
 //     [ +|- ] <isIdentifierStart> [ <isIdentifierNext> ]*
 bool Lexer::isTag (std::string& token, Lexer::Type& type)
 {
@ -924,7 +884,7 @@ bool Lexer::isTag (std::string& token, Lexer::Type& type)

  // Lookbehind: Assert ^ or preceded by whitespace, (, or ).
  if (marker > 0                         &&
-      ! isWhitespace (_text[marker - 1]) &&
+      ! unicodeWhitespace (_text[marker - 1]) &&
      _text[marker - 1] != '('           &&
      _text[marker - 1] != ')')
    return false;
@ -970,12 +930,12 @@ bool Lexer::isPath (std::string& token, Lexer::Type& type)
      break;

    if (_text[marker] &&
-        ! isWhitespace (_text[marker]) &&
+        ! unicodeWhitespace (_text[marker]) &&
        _text[marker] != '/')
    {
      utf8_next_char (_text, marker);
      while (_text[marker] &&
-             ! isWhitespace (_text[marker]) &&
+             ! unicodeWhitespace (_text[marker]) &&
             _text[marker] != '/')
        utf8_next_char (_text, marker);
    }
@ -997,7 +957,7 @@ bool Lexer::isPath (std::string& token, Lexer::Type& type)

 ////////////////////////////////////////////////////////////////////////////////
 // Lexer::Type::substitution
-//   / <unquoted-string> / <unquoted-string> / [g]  <EOS> | <isWhitespace>
+//   / <unquoted-string> / <unquoted-string> / [g]  <EOS> | <unicodeWhitespace>
 bool Lexer::isSubstitution (std::string& token, Lexer::Type& type)
 {
  std::size_t marker = _cursor;
@ -1012,9 +972,9 @@ bool Lexer::isSubstitution (std::string& token, Lexer::Type& type)
      if (_text[_cursor] == 'g')
        ++_cursor;

-      // Lookahread: <EOS> | <isWhitespace>
+      // Lookahread: <EOS> | <unicodeWhitespace>
      if (_text[_cursor] == '\0' ||
-          isWhitespace (_text[_cursor]))
+          unicodeWhitespace (_text[_cursor]))
      {
        token = _text.substr (marker, _cursor - marker);
        type = Lexer::Type::substitution;
@ -1029,7 +989,7 @@ bool Lexer::isSubstitution (std::string& token, Lexer::Type& type)

 ////////////////////////////////////////////////////////////////////////////////
 // Lexer::Type::pattern
-//   / <unquoted-string> /  <EOS> | <isWhitespace>
+//   / <unquoted-string> /  <EOS> | <unicodeWhitespace>
 bool Lexer::isPattern (std::string& token, Lexer::Type& type)
 {
  std::size_t marker = _cursor;
@ -1037,7 +997,7 @@ bool Lexer::isPattern (std::string& token, Lexer::Type& type)
  std::string word;
  if (readWord (_text, "/", _cursor, word) &&
      (isEOS () ||
-       isWhitespace (_text[_cursor])))
+       unicodeWhitespace (_text[_cursor])))
  {
    token = _text.substr (marker, _cursor - marker);
    type = Lexer::Type::pattern;
@ -1357,7 +1317,7 @@ bool Lexer::isWord (std::string& token, Lexer::Type& type)
  std::size_t marker = _cursor;

  while (_text[marker]                  &&
-         ! isWhitespace (_text[marker]) &&
+         ! unicodeWhitespace (_text[marker]) &&
         ! isSingleCharOperator (_text[marker]))
    utf8_next_char (_text, marker);

@ -1393,7 +1353,7 @@ bool Lexer::isLiteral (
  // End boundary conditions must be met.
  if (endBoundary &&
      _text[_cursor + common] &&
-      ! Lexer::isWhitespace (_text[_cursor + common]) &&
+      ! unicodeWhitespace (_text[_cursor + common]) &&
      ! Lexer::isSingleCharOperator (_text[_cursor + common]))
    return false;

@ -1567,7 +1527,7 @@ bool Lexer::readWord (
 //
 // Ends at:
 //   Lexer::isEOS
-//   Lexer::isWhitespace
+//   unicodeWhitespace
 //   Lexer::isHardBoundary
 bool Lexer::readWord (
  const std::string& text,
@ -1582,7 +1542,7 @@ bool Lexer::readWord (
  while ((c = text[cursor]))  // Handles EOS.
  {
    // Unquoted word ends on white space.
-    if (Lexer::isWhitespace (c))
+    if (unicodeWhitespace (c))
      break;

    // Parentheses mostly.
--- a/src/Lexer.h
+++ b/src/Lexer.h
@ -61,7 +61,6 @@ public:

  // Static helpers.
  static const std::string typeName          (const Lexer::Type&);
-  static bool isWhitespace                   (int);
  static bool isAlpha                        (int);
  static bool isDigit                        (int);
  static bool isHexDigit                     (int);
--- a/src/util.cpp
+++ b/src/util.cpp
@ -51,6 +51,7 @@
 #include <signal.h>
 #include <sys/select.h>
 #include <Lexer.h>
+#include <unicode.h>
 #include <utf8.h>
 #include <util.h>
 #include <main.h>
@ -268,7 +269,7 @@ bool nontrivial (const std::string& input)
  std::string::size_type i = 0;
  int character;
  while ((character = utf8_next_char (input, i)))
-    if (! Lexer::isWhitespace (character))
+    if (! unicodeWhitespace (character))
      return true;

  return false;
--- a/test/lexer.t.cpp
+++ b/test/lexer.t.cpp
@ -37,9 +37,9 @@
 int main (int, char**)
 {
 #ifdef PRODUCT_TASKWARRIOR
-  UnitTest t (1280);
+  UnitTest t (1253);
 #else
-  UnitTest t (1262);
+  UnitTest t (1235);
 #endif

  // Use same Datetime/Duraiton configuration as Context∴:staticInitialization.
@ -58,35 +58,6 @@ int main (int, char**)
  Lexer::attributes["tags"]        = "string";
  Lexer::attributes["description"] = "string";

-  // White space detection.
-  t.notok (Lexer::isWhitespace (0x0041), "U+0041 (A) ! isWhitespace");
-  t.ok (Lexer::isWhitespace (0x0020), "U+0020 isWhitespace");
-  t.ok (Lexer::isWhitespace (0x0009), "U+0009 isWhitespace");
-  t.ok (Lexer::isWhitespace (0x000A), "U+000A isWhitespace");
-  t.ok (Lexer::isWhitespace (0x000B), "U+000B isWhitespace");
-  t.ok (Lexer::isWhitespace (0x000C), "U+000C isWhitespace");
-  t.ok (Lexer::isWhitespace (0x000D), "U+000D isWhitespace");
-  t.ok (Lexer::isWhitespace (0x0085), "U+0085 isWhitespace");
-  t.ok (Lexer::isWhitespace (0x00A0), "U+00A0 isWhitespace");
-  t.ok (Lexer::isWhitespace (0x1680), "U+1680 isWhitespace"); // 10
-  t.ok (Lexer::isWhitespace (0x180E), "U+180E isWhitespace");
-  t.ok (Lexer::isWhitespace (0x2000), "U+2000 isWhitespace");
-  t.ok (Lexer::isWhitespace (0x2001), "U+2001 isWhitespace");
-  t.ok (Lexer::isWhitespace (0x2002), "U+2002 isWhitespace");
-  t.ok (Lexer::isWhitespace (0x2003), "U+2003 isWhitespace");
-  t.ok (Lexer::isWhitespace (0x2004), "U+2004 isWhitespace");
-  t.ok (Lexer::isWhitespace (0x2005), "U+2005 isWhitespace");
-  t.ok (Lexer::isWhitespace (0x2006), "U+2006 isWhitespace");
-  t.ok (Lexer::isWhitespace (0x2007), "U+2007 isWhitespace");
-  t.ok (Lexer::isWhitespace (0x2008), "U+2008 isWhitespace"); // 20
-  t.ok (Lexer::isWhitespace (0x2009), "U+2009 isWhitespace");
-  t.ok (Lexer::isWhitespace (0x200A), "U+200A isWhitespace");
-  t.ok (Lexer::isWhitespace (0x2028), "U+2028 isWhitespace");
-  t.ok (Lexer::isWhitespace (0x2029), "U+2029 isWhitespace");
-  t.ok (Lexer::isWhitespace (0x202F), "U+202F isWhitespace");
-  t.ok (Lexer::isWhitespace (0x205F), "U+205F isWhitespace");
-  t.ok (Lexer::isWhitespace (0x3000), "U+3000 isWhitespace");
-
  // static bool Lexer::isBoundary (int, int);
  t.ok    (Lexer::isBoundary (' ', 'a'), "' ' --> 'a' = isBoundary");
  t.ok    (Lexer::isBoundary ('a', ' '), "'a' --> ' ' = isBoundary");