Lexer2

- Copied in the Lexer2 object.
2025-08-20 04:13:07 +02:00 · 2015-02-19 08:54:20 -08:00 · 2015-02-19 08:54:20 -08:00 · d10ad5c7af
commit d10ad5c7af
parent 1ae4ea2ea3
3 changed files with 905 additions and 0 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -21,6 +21,7 @@ set (task_SRCS CLI.cpp CLI.h
               ISO8601.cpp ISO8601.h
               JSON.cpp JSON.h
               Lexer.cpp Lexer.h
+               Lexer2.cpp Lexer2.h
               Msg.cpp Msg.h
               Nibbler.cpp Nibbler.h
               Path.cpp Path.h
--- a/src/Lexer2.cpp
+++ b/src/Lexer2.cpp
@ -0,0 +1,808 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// Copyright 2013 - 2015, Paul Beckingham, Federico Hernandez.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+// http://www.opensource.org/licenses/mit-license.php
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <cmake.h>
+#include <ctype.h>
+#include <Lexer2.h>
+#include <utf8.h>
+
+static const std::string uuid_pattern = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx";  // 'x' = any hex digit; every other character must match literally (see isPartialUUID).
+static const int uuid_min_length = 8;  // Fewest leading characters isPartialUUID must match before it accepts.
+
+////////////////////////////////////////////////////////////////////////////////
+Lexer2::Lexer2 (const std::string& text)  // Keeps a private copy of 'text' to tokenize.
+: _text (text)
+, _cursor (0)           // Scanning starts at the first byte.
+, _eos (text.size ())   // Offset one past the last byte; see isEOS.
+{
+}
+
+////////////////////////////////////////////////////////////////////////////////
+Lexer2::~Lexer2 ()  // Empty body: no explicit cleanup is performed here.
+{
+}
+
+////////////////////////////////////////////////////////////////////////////////
+bool Lexer2::token (std::string& token, Lexer2::Type& type)  // Yields the next token and its type; false at end of text or when no classifier matches.
+{
+  // Eat white space.
+  while (isWhitespace (_text[_cursor]))
+    utf8_next_char (_text, _cursor);
+
+  // Terminate at EOS.
+  if (isEOS ())
+    return false;
+
+  // The sequence is specific, and must follow these rules:
+  // - date < uuid < identifier
+  // - duration < identifier
+  // - pair < identifier
+  // - hex < number
+  // - separator < tag < operator
+  // - substitution < pattern
+  // - word last
+  if (isString       (token, type, '\'') ||
+      isString       (token, type, '"')  ||
+      isUUID         (token, type)       ||
+      isPartialUUID  (token, type)       ||
+      isHexNumber    (token, type)       ||
+      isNumber       (token, type)       ||
+      isSeparator    (token, type)       ||
+      isList         (token, type)       ||
+      isPair         (token, type)       ||
+      isTag          (token, type)       ||
+      isSubstitution (token, type)       ||
+      isPattern      (token, type)       ||
+      isOperator     (token, type)       ||
+      isIdentifier   (token, type)       ||
+      isWord         (token, type))
+    return true;
+
+  return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// No L10N - these are for internal purposes.
+//
+// Returns the plain lower-case name of a token type. The switch covers every
+// enumerator; the trailing return only exists so that a value which is not a
+// valid enumerator cannot make control flow off the end of a non-void
+// function, which would be undefined behavior.
+const std::string Lexer2::typeName (const Lexer2::Type& type)
+{
+  switch (type)
+  {
+  case Lexer2::Type::uuid:         return "uuid";
+  case Lexer2::Type::number:       return "number";
+  case Lexer2::Type::hex:          return "hex";
+  case Lexer2::Type::string:       return "string";
+  case Lexer2::Type::list:         return "list";
+  case Lexer2::Type::pair:         return "pair";
+  case Lexer2::Type::separator:    return "separator";
+  case Lexer2::Type::tag:          return "tag";
+  case Lexer2::Type::substitution: return "substitution";
+  case Lexer2::Type::pattern:      return "pattern";
+  case Lexer2::Type::op:           return "op";
+  case Lexer2::Type::identifier:   return "identifier";
+  case Lexer2::Type::word:         return "word";
+  }
+
+  // Same fallback label that typeToString uses.
+  return "unknown";
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Complete Unicode whitespace list.
+//
+// http://en.wikipedia.org/wiki/Whitespace_character
+// Updated 2013-11-18
+// Static
+//
+// Returns true when 'c' equals one of the code points listed below.
+bool Lexer2::isWhitespace (int c)
+{
+  switch (c)
+  {
+  case 0x0020:   // space
+  case 0x0009:   // HT, horizontal tab
+  case 0x000A:   // LF, line feed
+  case 0x000B:   // VT, vertical tab
+  case 0x000C:   // FF, form feed
+  case 0x000D:   // CR, carriage return
+  case 0x0085:   // NEL, next line
+  case 0x00A0:   // no-break space
+  case 0x1680:   // ogham space mark
+  case 0x180E:   // mongolian vowel separator
+  case 0x2000:   // en quad
+  case 0x2001:   // em quad
+  case 0x2002:   // en space
+  case 0x2003:   // em space
+  case 0x2004:   // three-per-em space
+  case 0x2005:   // four-per-em space
+  case 0x2006:   // six-per-em space
+  case 0x2007:   // figure space
+  case 0x2008:   // punctuation space
+  case 0x2009:   // thin space
+  case 0x200A:   // hair space
+  case 0x2028:   // line separator
+  case 0x2029:   // paragraph separator
+  case 0x202F:   // narrow no-break space
+  case 0x205F:   // medium mathematical space
+  case 0x3000:   // ideographic space
+    return true;
+  }
+
+  return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Digits 0-9 (code points 0x30 through 0x39).
+bool Lexer2::isDigit (int c)
+{
+  if (c < 0x30)
+    return false;
+
+  return c <= 0x39;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Hexadecimal digits: 0-9, a-f and A-F.
+bool Lexer2::isHexDigit (int c)
+{
+  if (c >= '0' && c <= '9') return true;
+  if (c >= 'a' && c <= 'f') return true;
+
+  return c >= 'A' && c <= 'F';
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// True when 'c' may begin an identifier: it is not NUL, whitespace, a digit,
+// a single-character operator or punctuation.
+bool Lexer2::isIdentifierStart (int c)
+{
+  if (! c)                      return false;  // Null character check.
+  if (isWhitespace (c))         return false;
+  if (isDigit (c))              return false;
+  if (isSingleCharOperator (c)) return false;
+
+  return ! isPunctuation (c);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// True when 'c' may continue an identifier: it is not NUL, ':', whitespace or
+// a single-character operator.
+bool Lexer2::isIdentifierNext (int c)
+{
+  if (! c)              return false;  // Null character check.
+  if (c == ':')         return false;  // Used in isPair.
+  if (isWhitespace (c)) return false;
+
+  return ! isSingleCharOperator (c);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// True when 'c' is one of the operators made of a single character.
+bool Lexer2::isSingleCharOperator (int c)
+{
+  switch (c)
+  {
+  case '+':   // Addition
+  case '-':   // Subtraction or unary minus = ambiguous
+  case '*':   // Multiplication
+  case '/':   // Division
+  case '(':   // Precedence open parenthesis
+  case ')':   // Precedence close parenthesis
+  case '<':   // Less than
+  case '>':   // Greater than
+  case '^':   // Exponent
+  case '!':   // Unary not
+  case '%':   // Modulus
+  case '=':   // Partial match
+  case '~':   // Pattern match
+    return true;
+  }
+
+  return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// True when c0,c1 form a two-character operator: == != <= >= or || && !~.
+// 'c2' is the character that follows; it is only consulted for "or", which
+// must end at a boundary.
+bool Lexer2::isDoubleCharOperator (int c0, int c1, int c2)
+{
+  switch (c0)
+  {
+  case '=':
+  case '<':
+  case '>': return c1 == '=';
+  case '!': return c1 == '=' || c1 == '~';
+  case 'o': return c1 == 'r' && isBoundary (c1, c2);
+  case '|': return c1 == '|';
+  case '&': return c1 == '&';
+  }
+
+  return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// True when c0,c1,c2 form a three-character operator: and xor !==.
+// 'c3' is the character that follows; the two word operators must end at a
+// boundary, "!==" has no such requirement.
+bool Lexer2::isTripleCharOperator (int c0, int c1, int c2, int c3)
+{
+  if (c0 == '!')
+    return c1 == '=' && c2 == '=';
+
+  const bool isAnd = (c0 == 'a' && c1 == 'n' && c2 == 'd');
+  const bool isXor = (c0 == 'x' && c1 == 'o' && c2 == 'r');
+
+  return (isAnd || isXor) && isBoundary (c2, c3);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// True when a token boundary lies between two adjacent characters: exactly
+// one of them is alphabetic, or exactly one is a digit, or exactly one is
+// whitespace, or either of them is punctuation.
+bool Lexer2::isBoundary (int left, int right)
+{
+  // isalpha is only defined for values representable as unsigned char (or
+  // EOF), and it reports "true" as an arbitrary non-zero value. Restrict the
+  // range and collapse the result to bool so the XOR comparison below is
+  // well defined; values outside that range count as non-alphabetic.
+  auto isAlpha = [] (int c) { return c >= 0 && c <= 0xFF && isalpha (c) != 0; };
+
+  // XOR
+  if (isAlpha (left)       != isAlpha (right))       return true;
+  if (isDigit (left)       != isDigit (right))       return true;
+  if (isWhitespace (left)  != isWhitespace (right))  return true;
+
+  // OR
+  if (isPunctuation (left) || isPunctuation (right)) return true;
+
+  return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// True when 'c' is a punctuation character other than '@'.
+//
+// ispunct is only defined for values representable as unsigned char (or EOF).
+// Callers pass plain 'char' values, which may be negative for bytes >= 0x80,
+// as well as wider code points, so anything outside 0..0xFF is rejected up
+// front instead of being handed to ispunct.
+bool Lexer2::isPunctuation (int c)
+{
+  return c != '@'  &&
+         c >= 0    &&
+         c <= 0xFF &&
+         ispunct (c) != 0;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// True once the cursor has reached (or passed) the end of the text.
+bool Lexer2::isEOS () const
+{
+  return ! (_cursor < _eos);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Converts '0'     -> 0
+//          '9'     -> 9
+//          'a'/'A' -> 10
+//          'f'/'F' -> 15
+//
+// The argument is not validated: anything that is neither 0-9 nor a-f is
+// treated as if it were an upper-case hex letter.
+int Lexer2::hexToInt (int c) const
+{
+  if (c >= '0' && c <= '9')
+    return c - '0';
+
+  const int base = (c >= 'a' && c <= 'f') ? 'a' : 'A';
+  return c - base + 10;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Converts two hex digits, most significant first, to their combined value.
+int Lexer2::hexToInt (int c0, int c1) const
+{
+  const int high = hexToInt (c0);
+  const int low  = hexToInt (c1);
+
+  return (high << 4) + low;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Converts four hex digits, most significant first, to their combined value.
+int Lexer2::hexToInt (int c0, int c1, int c2, int c3) const
+{
+  const int digits[4] = {c0, c1, c2, c3};
+
+  int value = 0;
+  int shift = 12;
+  for (int digit : digits)
+  {
+    value += hexToInt (digit) << shift;
+    shift -= 4;
+  }
+
+  return value;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Lexer2::Type::string
+//   '|"
+//   [ U+XXXX | \uXXXX | \" | \' | \\ | \/ | \b | \f | \n | \r | \t | . ]
+//   '|"
+//
+// Matches a string delimited by 'quote' that starts at the cursor. On success
+// the decoded contents (without the delimiters) are stored in 'token', 'type'
+// is set to Lexer2::Type::string, the cursor moves past the closing quote and
+// true is returned. On failure the cursor is left where it was and false is
+// returned; 'token' may already have been overwritten with partial contents.
+bool Lexer2::isString (std::string& token, Lexer2::Type& type, int quote)
+{
+  std::size_t marker = _cursor;
+
+  if (_text[marker] == quote)
+  {
+    ++marker;
+    token = "";
+
+    int c;
+    while ((c = _text[marker]))
+    {
+      // Closing quote.
+      if (c == quote)
+        break;
+
+      // Unicode U+XXXX or \uXXXX codepoint.
+      else if (_eos - marker >= 6 &&
+               ((_text[marker + 0] == 'U' && _text[marker + 1] == '+') ||
+                (_text[marker + 0] == '\\' && _text[marker + 1] == 'u')) &&
+               isHexDigit (_text[marker + 2]) &&
+               isHexDigit (_text[marker + 3]) &&
+               isHexDigit (_text[marker + 4]) &&
+               isHexDigit (_text[marker + 5]))
+      {
+        token += utf8_character (
+                   hexToInt (
+                     _text[marker + 2],
+                     _text[marker + 3],
+                     _text[marker + 4],
+                     _text[marker + 5]));
+        marker += 6;
+      }
+
+      // An escaped thing.
+      else if (c == '\\')
+      {
+        // A backslash that is the last character of the text escapes nothing
+        // and the string is unterminated. Stop here so that 'marker' never
+        // moves beyond the end of the text.
+        if (++marker >= _eos)
+          break;
+
+        c = _text[marker];
+
+        switch (c)
+        {
+        case '"':  token += static_cast <char> (0x22); ++marker; break;
+        case '\'': token += static_cast <char> (0x27); ++marker; break;
+        case '\\': token += static_cast <char> (0x5C); ++marker; break;
+        case 'b':  token += static_cast <char> (0x08); ++marker; break;
+        case 'f':  token += static_cast <char> (0x0C); ++marker; break;
+        case 'n':  token += static_cast <char> (0x0A); ++marker; break;
+        case 'r':  token += static_cast <char> (0x0D); ++marker; break;
+        case 't':  token += static_cast <char> (0x09); ++marker; break;
+        case 'v':  token += static_cast <char> (0x0B); ++marker; break;
+
+        // This pass-through default case means that anything can be escaped
+        // harmlessly. In particular 'quote' is included, if it is not one of
+        // the above characters.
+        default:   token += static_cast <char> (c);    ++marker; break;
+        }
+      }
+
+      // Ordinary character.
+      else
+        token += utf8_character (utf8_next_char (_text, marker));
+    }
+
+    // Only a properly terminated string counts.
+    if (_text[marker] == quote)
+    {
+      ++marker;
+      type = Lexer2::Type::string;
+      _cursor = marker;
+      return true;
+    }
+  }
+
+  return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Lexer2::Type::uuid
+//   XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX
+//
+// Matches a complete 36-character UUID at the cursor: hex digits with '-' at
+// offsets 8, 13, 18 and 23. On success the UUID is stored in 'token', 'type'
+// is set to Lexer2::Type::uuid and the cursor moves past it. Every one of the
+// 36 positions is validated.
+bool Lexer2::isUUID (std::string& token, Lexer2::Type& type)
+{
+  const std::size_t length = 36;
+  const std::size_t marker = _cursor;
+
+  if (_eos - marker < length)
+    return false;
+
+  for (std::size_t i = 0; i < length; ++i)
+  {
+    const char c = _text[marker + i];
+    const bool dash = (i == 8 || i == 13 || i == 18 || i == 23);
+
+    if (dash ? (c != '-') : ! isHexDigit (c))
+      return false;
+  }
+
+  token = _text.substr (marker, length);
+  type = Lexer2::Type::uuid;
+  _cursor = marker + length;
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Lexer2::Type::uuid
+//   A leading portion of XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX, at least
+//   uuid_min_length characters long.
+//
+// Greedily matches the text at the cursor against uuid_pattern. When enough
+// characters match, exactly the matched characters are stored in 'token',
+// 'type' is set to Lexer2::Type::uuid and the cursor moves past them.
+bool Lexer2::isPartialUUID (std::string& token, Lexer2::Type& type)
+{
+  std::size_t marker = _cursor;
+
+  // 'i' ends up as the number of characters that matched the pattern.
+  std::size_t i = 0;
+  for (; i < 36 && marker + i < _eos; i++)
+  {
+    if (uuid_pattern[i] == 'x')
+    {
+      if (! isHexDigit (_text[marker + i]))
+        break;
+    }
+    else if (uuid_pattern[i] != _text[marker + i])
+      break;
+  }
+
+  if (i >= static_cast <std::size_t> (uuid_min_length))
+  {
+    // The token length must equal the cursor advance, so that the token does
+    // not also contain the first character that failed to match.
+    token = _text.substr (_cursor, i);
+    type = Lexer2::Type::uuid;
+    _cursor += i;
+    return true;
+  }
+
+  return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Lexer2::Type::hex
+//   0xX+
+//
+// Matches "0x" followed by at least one hex digit. On success the whole
+// literal, prefix included, is stored in 'token' and the cursor moves past it.
+bool Lexer2::isHexNumber (std::string& token, Lexer2::Type& type)
+{
+  if (_eos - _cursor < 3    ||
+      _text[_cursor] != '0' ||
+      _text[_cursor + 1] != 'x')
+    return false;
+
+  // Skip the prefix, then consume the digits.
+  std::size_t end = _cursor + 2;
+  while (isHexDigit (_text[end]))
+    ++end;
+
+  // Nothing but the prefix was found.
+  if (end - _cursor <= 2)
+    return false;
+
+  token = _text.substr (_cursor, end - _cursor);
+  type = Lexer2::Type::hex;
+  _cursor = end;
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Lexer2::Type::number
+//   \d+
+//   [ . \d+ ]
+//   [ e|E [ +|- ] \d+ ]
+//
+// Matches a number that starts with a digit at the cursor. Note that a '.',
+// an exponent marker and an exponent sign are consumed as part of the token
+// even when no digits follow them.
+bool Lexer2::isNumber (std::string& token, Lexer2::Type& type)
+{
+  if (! isDigit (_text[_cursor]))
+    return false;
+
+  // Integer part.
+  std::size_t end = _cursor + 1;
+  while (isDigit (_text[end]))
+    utf8_next_char (_text, end);
+
+  // Optional fraction.
+  if (_text[end] == '.')
+  {
+    ++end;
+    if (isDigit (_text[end]))
+    {
+      ++end;
+      while (isDigit (_text[end]))
+        utf8_next_char (_text, end);
+    }
+  }
+
+  // Optional exponent.
+  const char exponent = _text[end];
+  if (exponent == 'e' || exponent == 'E')
+  {
+    ++end;
+
+    const char sign = _text[end];
+    if (sign == '+' || sign == '-')
+      ++end;
+
+    if (isDigit (_text[end]))
+    {
+      ++end;
+      while (isDigit (_text[end]))
+        utf8_next_char (_text, end);
+    }
+  }
+
+  token = _text.substr (_cursor, end - _cursor);
+  type = Lexer2::Type::number;
+  _cursor = end;
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Lexer2::Type::separator
+//   --
+//
+// Matches the two-character separator at the cursor and steps over it.
+bool Lexer2::isSeparator (std::string& token, Lexer2::Type& type)
+{
+  const bool matched = _eos - _cursor >= 2 &&
+                       _text[_cursor] == '-' &&
+                       _text[_cursor + 1] == '-';
+  if (! matched)
+    return false;
+
+  token = "--";
+  type = Lexer2::Type::separator;
+  _cursor += 2;
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Lexer2::Type::list
+//   ,
+//
+// Matches a ',' at the cursor, provided more than one character remains, so
+// a ',' that is the very last character of the text is not matched.
+bool Lexer2::isList (std::string& token, Lexer2::Type& type)
+{
+  const bool matched = _eos - _cursor > 1 &&
+                       _text[_cursor] == ',';
+  if (! matched)
+    return false;
+
+  token = ",";
+  type = Lexer2::Type::list;
+  ++_cursor;
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Lexer2::Type::pair
+//   <identifier> :|= [ <string> | <word> ]
+//
+// Matches an identifier, a ':' or '=' separator and a value. On success the
+// raw text of the whole pair is stored in 'token'. On failure the cursor is
+// restored to where it started.
+bool Lexer2::isPair (std::string& token, Lexer2::Type& type)
+{
+  const std::size_t start = _cursor;
+
+  std::string ignoredToken;
+  Lexer2::Type ignoredType;
+
+  // Name and separator. The name is consumed by isIdentifier.
+  bool matched = isIdentifier (ignoredToken, ignoredType) &&
+                 _eos - _cursor > 1 &&
+                 (_text[_cursor] == ':' || _text[_cursor] == '=');
+
+  // Value.
+  if (matched)
+  {
+    ++_cursor;
+    matched = isString (ignoredToken, ignoredType, '\'') ||
+              isString (ignoredToken, ignoredType, '"')  ||
+              isWord   (ignoredToken, ignoredType);
+  }
+
+  if (! matched)
+  {
+    _cursor = start;
+    return false;
+  }
+
+  token = _text.substr (start, _cursor - start);
+  type = Lexer2::Type::pair;
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Lexer2::Type::tag
+//   [ +|- ] <isIdentifierStart> [ <isIdentifierNext> ]*
+//
+// Matches a '+' or '-' immediately followed by an identifier. The sign is
+// part of the resulting token.
+bool Lexer2::isTag (std::string& token, Lexer2::Type& type)
+{
+  const char sign = _text[_cursor];
+  if (sign != '+' && sign != '-')
+    return false;
+
+  std::size_t end = _cursor + 1;
+  if (! isIdentifierStart (_text[end]))
+    return false;
+
+  utf8_next_char (_text, end);
+  while (isIdentifierNext (_text[end]))
+    utf8_next_char (_text, end);
+
+  token = _text.substr (_cursor, end - _cursor);
+  type = Lexer2::Type::tag;
+  _cursor = end;
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Lexer2::Type::substitution
+//   / <unquoted-string> / <unquoted-string> / [g]
+//
+// Matches two consecutive '/'-delimited strings sharing the middle '/',
+// optionally followed by 'g'. The match must be followed by whitespace or by
+// the end of the text. On success the raw text is stored in 'token'; on
+// failure the cursor is restored.
+bool Lexer2::isSubstitution (std::string& token, Lexer2::Type& type)
+{
+  std::size_t marker = _cursor;
+
+  std::string extractedToken;
+  Lexer2::Type extractedType;
+  if (isString (extractedToken, extractedType, '/'))
+  {
+    --_cursor;  // Step back over the '/'.
+
+    if (isString (extractedToken, extractedType, '/'))
+    {
+      if (_text[_cursor] == 'g')
+        ++_cursor;
+
+      // Lookahead: <EOS> | <whitespace>. Without the EOS case a substitution
+      // that is the last thing in the text could never match.
+      if (isEOS () || isWhitespace (_text[_cursor]))
+      {
+        token = _text.substr (marker, _cursor - marker);
+        type = Lexer2::Type::substitution;
+        return true;
+      }
+    }
+  }
+
+  _cursor = marker;
+  return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Lexer2::Type::pattern
+//   / <unquoted-string> /
+//
+// Matches a '/'-delimited string that is followed by whitespace or by the end
+// of the text. On success the raw text, delimiters included, is stored in
+// 'token'; on failure the cursor is restored.
+bool Lexer2::isPattern (std::string& token, Lexer2::Type& type)
+{
+  std::size_t marker = _cursor;
+
+  // Lookahead: <EOS> | <whitespace>. Without the EOS case a pattern that is
+  // the last thing in the text could never match.
+  std::string extractedToken;
+  Lexer2::Type extractedType;
+  if (isString (extractedToken, extractedType, '/') &&
+      (isEOS () || isWhitespace (_text[_cursor])))
+  {
+    token = _text.substr (marker, _cursor - marker);
+    type = Lexer2::Type::pattern;
+    return true;
+  }
+
+  _cursor = marker;
+  return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Lexer2::Type::op
+//   _hastag_ | _notag_ | _neg_ | _pos_ |
+//   <isTripleCharOperator> |
+//   <isDoubleCharOperator> |
+//   <isSingleCharOperator>
+//
+// Tries the alternatives in the order listed, longest first, and accepts the
+// first that matches at the cursor. On success the operator text is stored in
+// 'token', 'type' is set to Lexer2::Type::op and the cursor moves past it.
+bool Lexer2::isOperator (std::string& token, Lexer2::Type& type)
+{
+  static const std::string named[] = {"_hastag_", "_notag_", "_neg_", "_pos_"};
+
+  const std::size_t marker    = _cursor;
+  const std::size_t remaining = _eos - marker;
+
+  // Length of the operator found, or zero while nothing has matched.
+  std::size_t length = 0;
+
+  // Named operators, matched literally.
+  for (const std::string& name : named)
+  {
+    if (remaining >= name.size () &&
+        _text.compare (marker, name.size (), name) == 0)
+    {
+      length = name.size ();
+      break;
+    }
+  }
+
+  if (length == 0)
+  {
+    // The lookahead character may be the one at _eos, which reads as NUL, so
+    // only the operator itself has to fit in the remaining text. Requiring
+    // room for the lookahead as well would stop "and", "xor" and "!==" from
+    // matching at the very end of the text.
+    if (remaining >= 3 &&
+        isTripleCharOperator (_text[marker], _text[marker + 1], _text[marker + 2], _text[marker + 3]))
+      length = 3;
+
+    else if (remaining >= 2 &&
+             isDoubleCharOperator (_text[marker], _text[marker + 1], _text[marker + 2]))
+      length = 2;
+
+    else if (isSingleCharOperator (_text[marker]))
+      length = 1;
+  }
+
+  if (length == 0)
+    return false;
+
+  token = _text.substr (marker, length);
+  type = Lexer2::Type::op;
+  _cursor = marker + length;
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Lexer2::Type::identifier
+//   <isIdentifierStart> [ <isIdentifierNext> ]*
+//
+// Matches one identifier-start character followed by any number of
+// identifier-next characters, and moves the cursor past them.
+bool Lexer2::isIdentifier (std::string& token, Lexer2::Type& type)
+{
+  if (! isIdentifierStart (_text[_cursor]))
+    return false;
+
+  std::size_t end = _cursor;
+  utf8_next_char (_text, end);
+
+  while (isIdentifierNext (_text[end]))
+    utf8_next_char (_text, end);
+
+  token = _text.substr (_cursor, end - _cursor);
+  type = Lexer2::Type::identifier;
+  _cursor = end;
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Lexer2::Type::word
+//   [^\s]+
+//
+// Matches a run of one or more non-whitespace characters at the cursor. The
+// run ends at whitespace or at the end of the text, whichever comes first.
+bool Lexer2::isWord (std::string& token, Lexer2::Type& type)
+{
+  std::size_t marker = _cursor;
+
+  // The NUL found at the end of the text is not whitespace, so the end has to
+  // be tested for explicitly; otherwise the scan would run past the text.
+  while (marker < _eos  &&
+         _text[marker]  &&
+         ! isWhitespace (_text[marker]))
+    utf8_next_char (_text, marker);
+
+  if (marker > _cursor)
+  {
+    token = _text.substr (_cursor, marker - _cursor);
+    type = Lexer2::Type::word;
+    _cursor = marker;
+    return true;
+  }
+
+  return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Static
+//
+// Returns the name of a token type wrapped in ANSI color escape sequences,
+// a different color per type. A value that matches no enumerator yields
+// "unknown".
+std::string Lexer2::typeToString (Lexer2::Type type)
+{
+  const char* color = "\033[37;41m";
+  const char* label = "unknown";
+
+  switch (type)
+  {
+  case Lexer2::Type::string:       color = "\033[38;5;7m\033[48;5;3m";    label = "string";       break;
+  case Lexer2::Type::uuid:         color = "\033[38;5;7m\033[48;5;10m";   label = "uuid";         break;
+  case Lexer2::Type::hex:          color = "\033[38;5;7m\033[48;5;14m";   label = "hex";          break;
+  case Lexer2::Type::number:       color = "\033[38;5;7m\033[48;5;6m";    label = "number";       break;
+  case Lexer2::Type::separator:    color = "\033[38;5;7m\033[48;5;4m";    label = "separator";    break;
+  case Lexer2::Type::list:         color = "\033[38;5;7m\033[48;5;4m";    label = "list";         break;
+  case Lexer2::Type::pair:         color = "\033[38;5;7m\033[48;5;1m";    label = "pair";         break;
+  case Lexer2::Type::tag:          color = "\033[37;45m";                 label = "tag";          break;
+  case Lexer2::Type::substitution: color = "\033[37;102m";                label = "substitution"; break;
+  case Lexer2::Type::pattern:      color = "\033[37;42m";                 label = "pattern";      break;
+  case Lexer2::Type::op:           color = "\033[38;5;7m\033[48;5;203m";  label = "op";           break;
+  case Lexer2::Type::identifier:   color = "\033[38;5;15m\033[48;5;244m"; label = "identifier";   break;
+  case Lexer2::Type::word:         color = "\033[38;5;15m\033[48;5;236m"; label = "word";         break;
+  }
+
+  return std::string (color) + label + "\033[0m";
+}
+
+////////////////////////////////////////////////////////////////////////////////
--- a/src/Lexer2.h
+++ b/src/Lexer2.h
@ -0,0 +1,96 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// Copyright 2013 - 2015, Paul Beckingham, Federico Hernandez.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+// http://www.opensource.org/licenses/mit-license.php
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef INCLUDED_LEXER2
+#define INCLUDED_LEXER2
+
+#include <string>
+#include <cstddef>
+
+// Lexer2: A UTF8 lexical analyzer for every construct used on the Taskwarrior
+//         command line, with additional recognized types for disambiguation.
+
+class Lexer2
+{
+public:
+  enum class Type { uuid, number, hex,
+                    string,
+                    list, pair, separator,
+                    substitution, pattern,
+                    tag,
+                    op,
+                    identifier, word,
+                    /*date,*/ /*duration,*/ };
+
+  Lexer2 (const std::string&);                     // Copies the text to be tokenized.
+  ~Lexer2 ();
+  bool token (std::string&, Lexer2::Type&);        // Next token and its type; false when there is none.
+  static std::string typeToString (Lexer2::Type);  // Type name wrapped in ANSI color escapes.
+
+  // Static helpers: character classification that does not depend on lexer state.
+  static const std::string typeName (const Lexer2::Type&);  // Plain type name, not localized.
+  static bool isWhitespace         (int);
+  static bool isDigit              (int);
+  static bool isHexDigit           (int);
+  static bool isIdentifierStart    (int);
+  static bool isIdentifierNext     (int);
+  static bool isSingleCharOperator (int);
+  static bool isDoubleCharOperator (int, int, int);       // Two operator characters plus one of lookahead.
+  static bool isTripleCharOperator (int, int, int, int);  // Three operator characters plus one of lookahead.
+  static bool isBoundary           (int, int);            // Left and right neighbor characters.
+  static bool isPunctuation        (int);
+
+  // Helpers: end-of-text test and hex digit conversion (most significant digit first).
+  bool isEOS () const;
+  int hexToInt (int) const;
+  int hexToInt (int, int) const;
+  int hexToInt (int, int, int, int) const;
+
+  // Classifiers: each tests for one token type at the cursor; on a match it fills in the token and type, advances the cursor and returns true.
+  bool isString       (std::string&, Lexer2::Type&, int quote);
+  bool isUUID         (std::string&, Lexer2::Type&);
+  bool isPartialUUID  (std::string&, Lexer2::Type&);
+  bool isNumber       (std::string&, Lexer2::Type&);
+  bool isHexNumber    (std::string&, Lexer2::Type&);
+  bool isSeparator    (std::string&, Lexer2::Type&);
+  bool isList         (std::string&, Lexer2::Type&);
+  bool isPair         (std::string&, Lexer2::Type&);
+  bool isTag          (std::string&, Lexer2::Type&);
+  bool isSubstitution (std::string&, Lexer2::Type&);
+  bool isPattern      (std::string&, Lexer2::Type&);
+  bool isOperator     (std::string&, Lexer2::Type&);
+  bool isIdentifier   (std::string&, Lexer2::Type&);
+  bool isWord         (std::string&, Lexer2::Type&);
+
+private:
+  std::string _text;        // Private copy of the text being tokenized.
+  std::size_t _cursor = 0;  // Byte offset of the next unread character.
+  std::size_t _eos = 0;     // Size of _text in bytes, i.e. the offset at which scanning ends.
+};
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////