Lexer, Duration

- Merged libexpr code.
2025-06-26 10:54:26 +02:00 · 2014-01-02 00:55:53 -05:00 · 2014-01-02 00:55:53 -05:00 · 9bfe40fac7
commit 9bfe40fac7
parent 9c5adc432c
5 changed files with 913 additions and 1 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -14,15 +14,17 @@ set (task_SRCS A3.cpp A3.h
               DOM.cpp DOM.h
               Date.cpp Date.h
               Directory.cpp Directory.h
-               OldDuration.cpp OldDuration.h
+               Duration.cpp Duration.h
               E9.cpp E9.h
               File.cpp File.h
               Hooks.cpp Hooks.h
               ISO8601.cpp ISO8601.h
               JSON.cpp JSON.h
+               Lexer.cpp Lexer.h
               LRParser.cpp LRParser.h
               Msg.cpp Msg.h
               Nibbler.cpp Nibbler.h
+               OldDuration.cpp OldDuration.h
               Parser.cpp Parser.h
               Path.cpp Path.h
               RX.cpp RX.h
--- a/src/Duration.cpp
+++ b/src/Duration.cpp
@@ -0,0 +1,167 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// Copyright 2006 - 2014, Paul Beckingham, Federico Hernandez.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+// http://www.opensource.org/licenses/mit-license.php
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <cmake.h>
+#include <stdlib.h>
+#include <Nibbler.h>
+#include <Lexer.h>
+#include <Duration.h>
+
+// Lengths of the basic units, in seconds.
+#define DAY    86400
+#define HOUR    3600
+#define MINUTE    60
+#define SECOND     1
+
+// Every recognized unit name, paired with its length in seconds.  Months,
+// quarters and years use fixed day counts (30, 91 and 365 respectively).
+static struct
+{
+  std::string unit;
+  int seconds;
+} durations[] =
+{
+  // Grouped by first character, with each name listed before any shorter name
+  // that is a prefix of it, so that Nibbler::getOneOf returns a maximal match.
+  {"annual",     365 * DAY},
+  {"biannual",   730 * DAY},
+  {"bimonthly",   61 * DAY},
+  {"biweekly",    14 * DAY},
+  {"biyearly",   730 * DAY},
+  {"daily",        1 * DAY},
+  {"days",         1 * DAY},
+  {"day",          1 * DAY},
+  {"d",            1 * DAY},
+  {"fortnight",   14 * DAY},
+  {"hours",        1 * HOUR},
+  {"hour",         1 * HOUR},
+  {"hrs",          1 * HOUR},           // Deprecate
+  {"hr",           1 * HOUR},           // Deprecate
+  {"h",            1 * HOUR},
+  {"minutes",      1 * MINUTE},
+  {"minute",       1 * MINUTE},
+  {"mins",         1 * MINUTE},         // Deprecate
+  {"min",          1 * MINUTE},
+  {"monthly",     30 * DAY},
+  {"months",      30 * DAY},
+  {"month",       30 * DAY},
+  {"mnths",       30 * DAY},            // Deprecate
+  {"mths",        30 * DAY},            // Deprecate
+  {"mth",         30 * DAY},            // Deprecate
+  {"mos",         30 * DAY},            // Deprecate
+  {"mo",          30 * DAY},
+  {"quarterly",   91 * DAY},
+  {"quarters",    91 * DAY},
+  {"quarter",     91 * DAY},
+  {"qrtrs",       91 * DAY},            // Deprecate
+  {"qtrs",        91 * DAY},            // Deprecate
+  {"qtr",         91 * DAY},            // Deprecate
+  {"q",           91 * DAY},
+  {"semiannual", 183 * DAY},
+  {"sennight",     7 * DAY},            // Seven nights, i.e. one week.
+  {"seconds",      1 * SECOND},
+  {"second",       1 * SECOND},
+  {"secs",         1 * SECOND},         // Deprecate
+  {"sec",          1 * SECOND},         // Deprecate
+  {"s",            1 * SECOND},
+  {"weekdays",         DAY},
+  {"weekly",       7 * DAY},
+  {"weeks",        7 * DAY},
+  {"week",         7 * DAY},
+  {"wks",          7 * DAY},            // Deprecate
+  {"wk",           7 * DAY},            // Deprecate
+  {"w",            7 * DAY},
+  {"yearly",     365 * DAY},
+  {"years",      365 * DAY},
+  {"year",       365 * DAY},
+  {"yrs",        365 * DAY},            // Deprecate
+  {"yr",         365 * DAY},            // Deprecate
+  {"y",          365 * DAY},
+};
+
+// Number of entries in the durations table (a size_t value).
+#define NUM_DURATIONS (sizeof (durations) / sizeof (durations[0]))
+
+////////////////////////////////////////////////////////////////////////////////
+// Default constructor: a duration of zero seconds.
+Duration::Duration ()
+{
+  _secs = 0;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Destructor.  Intentionally empty.
+Duration::~Duration ()
+{
+  // Nothing to do.
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Conversion to time_t: yields the stored value, which parse () sets in
+// seconds.
+Duration::operator time_t () const
+{
+  const time_t stored = _secs;
+  return stored;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Attempts to parse a duration - an optional number followed by one of the
+// unit names in the durations table - from 'input', beginning at offset
+// 'start'.
+//
+// A match only counts when it is followed by whitespace or the end of the
+// input.  On success the duration is stored in seconds, 'start' is advanced
+// past the consumed text and true is returned.  On failure false is returned
+// and neither 'start' nor the stored value is modified.
+bool Duration::parse (const std::string& input, std::string::size_type& start)
+{
+  // std::string::substr throws std::out_of_range for an offset past the end.
+  if (start > input.length ())
+    return false;
+
+  Nibbler n (input.substr (start));
+
+  std::vector <std::string> units;
+  units.reserve (NUM_DURATIONS);
+  for (size_t i = 0; i < NUM_DURATIONS; ++i)
+    units.push_back (durations[i].unit);
+
+  // Either "<number> [whitespace] <unit>" or a bare "<unit>".  When the first
+  // alternative fails part way, 'number' keeps whatever getNumber extracted.
+  std::string number;
+  std::string unit;
+  if (! ((n.getNumber (number) && n.skipWS () && n.getOneOf (units, unit)) ||
+                                                n.getOneOf (units, unit)))
+    return false;
+
+  // The unit must not run straight into further non-whitespace text.
+  if (! n.depleted () &&
+      ! Lexer::is_ws (n.next ()))
+    return false;
+
+  // A bare unit means a quantity of one.
+  const double quantity = number.empty ()
+                            ? 1.0
+                            : strtod (number.c_str (), NULL);
+
+  // Linear lookup - should be logarithmic.
+  for (size_t i = 0; i < NUM_DURATIONS; ++i)
+  {
+    if (durations[i].unit == unit)
+    {
+      start += n.cursor ();
+
+      // Convert straight to time_t: going through int overflows for long
+      // durations (100 years is already more than INT_MAX seconds).
+      _secs = static_cast <time_t> (quantity * static_cast <double> (durations[i].seconds));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Resets the stored duration to zero.
+void Duration::clear ()
+{
+  _secs = static_cast <time_t> (0);
+}
+
+////////////////////////////////////////////////////////////////////////////////
--- a/src/Duration.h
+++ b/src/Duration.h
@@ -0,0 +1,49 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// Copyright 2006 - 2014, Paul Beckingham, Federico Hernandez.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+// http://www.opensource.org/licenses/mit-license.php
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef INCLUDED_DURATION
+#define INCLUDED_DURATION
+
+#include <string>
+#include <time.h>
+
+// A length of time, held as a count of seconds in a time_t.
+class Duration
+{
+public:
+  // Construction and destruction.
+  Duration ();
+  ~Duration ();
+
+  // Copying is declared but unimplemented.
+  Duration (const Duration& other);
+  Duration& operator= (const Duration& other);
+
+  // Yields the stored value.
+  operator time_t () const;
+
+  // Parses a duration out of 'input' starting at offset 'start'; returns
+  // whether one was found.
+  bool parse (const std::string& input, std::string::size_type& start);
+
+  // Resets the stored value to zero.
+  void clear ();
+
+protected:
+  time_t _secs;   // The duration, in seconds.
+};
+
+#endif
+////////////////////////////////////////////////////////////////////////////////
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -0,0 +1,600 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// Copyright 2013 - 2014, Paul Beckingham, Federico Hernandez.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+// http://www.opensource.org/licenses/mit-license.php
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <utf8.h>
+#include <ISO8601.h>
+#include <Duration.h>
+#include <Lexer.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// Keeps a private copy of the input and primes the four-character look-ahead
+// window (_n0.._n3).  The window slots start out as 32, a value is_ws ()
+// treats as whitespace, and ambiguity defaults to true.
+Lexer::Lexer (const std::string& input)
+: _input (input)
+, _i (0)
+, _n0 (32)
+, _n1 (32)
+, _n2 (32)
+, _n3 (32)
+, _ambiguity (true)
+{
+  // Fill the window with four shifts, even when the input is shorter than
+  // four characters.
+  for (int primed = 0; primed < 4; ++primed)
+    shift ();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Destructor.  Intentionally empty.
+Lexer::~Lexer ()
+{
+  // Nothing to do.
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Walk the input string, looking for transitions, and extract the next token.
+//
+// This is a state machine: 'type' doubles as the current state, starting at
+// typeNone, and _n0.._n3 are the look-ahead characters, with _n0 the current
+// one.
+//
+//   token - receives the token text; string quotes are dropped and escape
+//           sequences are decoded
+//   type  - receives the token type
+//
+// Returns true when a token was extracted and false when the input is
+// exhausted.  If the input ends, or turns out to be malformed, part way
+// through a token, 'type' may be left as one of the intermediate states.
+// Throws std::string if no rule matches the current character or if the state
+// is unknown.
+bool Lexer::token (std::string& token, Type& type)
+{
+  // Start with nothing.
+  token = "";
+
+  // Different types of matching quote:  ', ".  Zero means "not in a string".
+  int quote = 0;
+
+  type = typeNone;
+  while (_n0)
+  {
+    switch (type)
+    {
+    case typeNone:
+      if (is_ws (_n0))
+        shift ();
+      else if (_n0 == '"' || _n0 == '\'')
+      {
+        type = typeString;
+        quote = _n0;
+        shift ();
+      }
+      else if (_n0 == '0' &&
+               _n1 == 'x' &&
+               is_hex_digit (_n2))
+      {
+        // "0x" plus the first digit; typeHex collects the rest.
+        type = typeHex;
+        token += utf8_character (_n0);
+        shift ();
+        token += utf8_character (_n0);
+        shift ();
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (is_dec_digit (_n0))
+      {
+        // Speculatively try a date and duration parse.  Longest wins.
+        const std::string rest = _input.substr (_i < 4 ? 0 : _i - 4);
+
+        std::string::size_type iso_i = 0;
+        std::string iso_token;
+        ISO8601d iso;
+        iso.ambiguity (_ambiguity);
+        if (iso.parse (rest, iso_i))
+          iso_token = rest.substr (0, iso_i);
+
+        std::string::size_type dur_i = 0;
+        std::string dur_token;
+        Duration dur;
+        if (dur.parse (rest, dur_i))
+          dur_token = rest.substr (0, dur_i);
+
+        if (iso_token.length () > dur_token.length ())
+        {
+          while (iso_i--) shift ();
+          token = iso_token;
+          type = typeDate;
+          return true;
+        }
+        else if (dur_token.length () > iso_token.length ())
+        {
+          while (dur_i--) shift ();
+          token = dur_token;
+          type = typeDuration;
+          return true;
+        }
+
+        // Neither parse won, so this is a plain number.
+        type = typeNumber;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == '.' && is_dec_digit (_n1))
+      {
+        type = typeDecimal;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (is_triple_op (_n0, _n1, _n2))
+      {
+        type = typeOperator;
+        token += utf8_character (_n0);
+        shift ();
+        token += utf8_character (_n0);
+        shift ();
+        token += utf8_character (_n0);
+        shift ();
+        return true;
+      }
+      else if (is_double_op (_n0, _n1))
+      {
+        type = typeOperator;
+        token += utf8_character (_n0);
+        shift ();
+        token += utf8_character (_n0);
+        shift ();
+        return true;
+      }
+      else if (is_single_op (_n0))
+      {
+        type = typeOperator;
+        token += utf8_character (_n0);
+        shift ();
+        return true;
+      }
+      else if (_n0 == '\\')
+      {
+        type = typeIdentifierEscape;
+        shift ();
+      }
+      else if (is_ident_start (_n0))
+      {
+        // Speculatively try a date and duration parse.  Longest wins.
+        // Both outcomes are reported as typeDuration.
+        const std::string rest = _input.substr (_i < 4 ? 0 : _i - 4);
+
+        std::string::size_type iso_i = 0;
+        std::string iso_token;
+        ISO8601p iso;
+        if (iso.parse (rest, iso_i))
+          iso_token = rest.substr (0, iso_i);
+
+        std::string::size_type dur_i = 0;
+        std::string dur_token;
+        Duration dur;
+        if (dur.parse (rest, dur_i))
+          dur_token = rest.substr (0, dur_i);
+
+        if (iso_token.length () > dur_token.length ())
+        {
+          while (iso_i--) shift ();
+          token = iso_token;
+          type = typeDuration;
+          return true;
+        }
+        else if (dur_token.length () > iso_token.length ())
+        {
+          while (dur_i--) shift ();
+          token = dur_token;
+          type = typeDuration;
+          return true;
+        }
+
+        type = typeIdentifier;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+        throw std::string ("Unexpected error 1");
+      break;
+
+    case typeString:
+      if (_n0 == quote)
+      {
+        // Closing quote: consume it and finish.
+        shift ();
+        quote = 0;
+        return true;
+      }
+      else if (_n0 == '\\')
+      {
+        type = typeEscape;
+        shift ();
+      }
+      else
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      break;
+
+    case typeIdentifier:
+      if (is_ident (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        return true;
+      }
+      break;
+
+    case typeIdentifierEscape:
+      if (_n0 == 'u')
+      {
+        type = typeEscapeUnicode;
+        shift ();
+      }
+      else
+      {
+        // Only \u is supported outside a string.  End the token the way
+        // typeEscapeHex does, instead of looping forever on a character that
+        // nothing consumes.
+        type = quote ? typeString : typeIdentifier;
+        shift ();
+        quote = 0;
+        return true;
+      }
+      break;
+
+    case typeEscape:
+      if (_n0 == 'x')
+      {
+        type = typeEscapeHex;
+        shift ();
+      }
+      else if (_n0 == 'u')
+      {
+        type = typeEscapeUnicode;
+        shift ();
+      }
+      else
+      {
+        token += decode_escape (_n0);
+        type = quote ? typeString : typeIdentifier;
+        shift ();
+      }
+      break;
+
+    case typeEscapeHex:
+      if (is_hex_digit (_n0) && is_hex_digit (_n1))
+      {
+        token += utf8_character (hex_to_int (_n0, _n1));
+        type = quote ? typeString : typeIdentifier;
+        shift ();
+        shift ();
+      }
+      else
+      {
+        type = quote ? typeString : typeIdentifier;
+        shift ();
+        quote = 0;
+        return true;
+      }
+      break;
+
+    case typeEscapeUnicode:
+      if (is_hex_digit (_n0) &&
+          is_hex_digit (_n1) &&
+          is_hex_digit (_n2) &&
+          is_hex_digit (_n3))
+      {
+        token += utf8_character (hex_to_int (_n0, _n1, _n2, _n3));
+        shift ();
+        shift ();
+        shift ();
+        shift ();
+        type = quote ? typeString : typeIdentifier;
+      }
+      else if (_n0 == quote)
+      {
+        type = typeString;
+        shift ();
+        quote = 0;
+        return true;
+      }
+      else
+      {
+        // Malformed \u escape: end the token the way typeEscapeHex does,
+        // instead of looping forever on a character that nothing consumes.
+        type = quote ? typeString : typeIdentifier;
+        shift ();
+        quote = 0;
+        return true;
+      }
+      break;   // Must not fall through into typeNumber.
+
+    case typeNumber:
+      if (is_dec_digit (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == '.')
+      {
+        type = typeDecimal;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == 'e' || _n0 == 'E')
+      {
+        type = typeExponentIndicator;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        return true;
+      }
+      break;
+
+    case typeDecimal:
+      if (is_dec_digit (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == 'e' || _n0 == 'E')
+      {
+        type = typeExponentIndicator;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        return true;
+      }
+      break;
+
+    case typeExponentIndicator:
+      if (_n0 == '+' || _n0 == '-')
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (is_dec_digit (_n0))
+      {
+        type = typeExponent;
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        // Incomplete exponent.  End the token here, leaving the intermediate
+        // type in place (as already happens when the input simply ends),
+        // instead of looping forever on a character that nothing consumes.
+        return true;
+      }
+      break;
+
+    case typeExponent:
+      if (is_dec_digit (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else if (_n0 == '.')
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        type = typeDecimal;
+        return true;
+      }
+      break;
+
+    case typeHex:
+      if (is_hex_digit (_n0))
+      {
+        token += utf8_character (_n0);
+        shift ();
+      }
+      else
+      {
+        return true;
+      }
+      break;
+
+    default:
+      throw std::string ("Unexpected error 2");
+      break;
+    }
+
+    // Fence post: the input ran out while a token was being collected.
+    if (!_n0 && token != "")
+      return true;
+  }
+
+  return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Sets the flag that token () passes on to ISO8601d::ambiguity () before it
+// attempts a date parse.
+void Lexer::ambiguity (bool value)
+{
+  this->_ambiguity = value;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Returns a printable name for a token type.  A value that matches no
+// enumerator yields "Unknown".
+const std::string Lexer::type_name (const Type& type)
+{
+  // No 'default' label, so that compilers which warn about unhandled
+  // enumerators keep doing so when a new Type is added.
+  switch (type)
+  {
+  case Lexer::typeNone:              return "None";
+  case Lexer::typeString:            return "String";
+  case Lexer::typeIdentifier:        return "Identifier";
+  case Lexer::typeIdentifierEscape:  return "IdentifierEscape";
+  case Lexer::typeNumber:            return "Number";
+  case Lexer::typeDecimal:           return "Decimal";
+  case Lexer::typeExponentIndicator: return "ExponentIndicator";
+  case Lexer::typeExponent:          return "Exponent";
+  case Lexer::typeHex:               return "Hex";
+  case Lexer::typeOperator:          return "Operator";
+  case Lexer::typeEscape:            return "Escape";
+  case Lexer::typeEscapeHex:         return "EscapeHex";
+  case Lexer::typeEscapeUnicode:     return "EscapeUnicode";
+  case Lexer::typeDate:              return "Date";
+  case Lexer::typeDuration:          return "Duration";
+  }
+
+  // Flowing off the end of a non-void function is undefined behavior, so an
+  // out-of-range value must still produce a result.
+  return "Unknown";
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests whether a code point is Unicode whitespace.
+//
+// The list follows http://en.wikipedia.org/wiki/Whitespace_character
+// Updated 2013-11-18
+bool Lexer::is_ws (int c)
+{
+  switch (c)
+  {
+  case 0x0020:   // space
+  case 0x0009:   // HT, Horizontal Tab
+  case 0x000A:   // LF, Line feed
+  case 0x000B:   // VT, Vertical Tab
+  case 0x000C:   // FF, Form feed
+  case 0x000D:   // CR, Carriage return
+  case 0x0085:   // NEL, Next line
+  case 0x00A0:   // no-break space
+  case 0x1680:   // ogham space mark
+  case 0x180E:   // mongolian vowel separator
+  case 0x2000:   // en quad
+  case 0x2001:   // em quad
+  case 0x2002:   // en space
+  case 0x2003:   // em space
+  case 0x2004:   // three-per-em space
+  case 0x2005:   // four-per-em space
+  case 0x2006:   // six-per-em space
+  case 0x2007:   // figure space
+  case 0x2008:   // punctuation space
+  case 0x2009:   // thin space
+  case 0x200A:   // hair space
+  case 0x2028:   // line separator
+  case 0x2029:   // paragraph separator
+  case 0x202F:   // narrow no-break space
+  case 0x205F:   // medium mathematical space
+  case 0x3000:   // ideographic space
+    return true;
+
+  default:
+    return false;
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests for the two punctuation characters recognized here: comma and period.
+bool Lexer::is_punct (int c) const
+{
+  return c == ',' || c == '.';
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests for a character that may appear in a number: a decimal digit or a
+// period.
+bool Lexer::is_num (int c) const
+{
+  const bool digit = (c >= '0' && c <= '9');
+  return digit || c == '.';
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests whether a character may begin an identifier: anything except the null
+// character, whitespace and decimal digits.
+bool Lexer::is_ident_start (int c) const
+{
+  if (c == 0)            // Include null character check.
+    return false;
+
+  if (is_ws (c))
+    return false;
+
+  return ! is_dec_digit (c);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests whether a character may continue an identifier: anything except the
+// null character, whitespace and the single-character operators.
+bool Lexer::is_ident (int c) const
+{
+  if (c == 0)            // Include null character check.
+    return false;
+
+  if (is_ws (c))
+    return false;
+
+  return ! is_single_op (c);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests whether three consecutive characters spell a three-character
+// operator: "and" or "xor".
+bool Lexer::is_triple_op (int c0, int c1, int c2) const
+{
+  if (c0 == 'a')
+    return c1 == 'n' && c2 == 'd';
+
+  if (c0 == 'x')
+    return c1 == 'o' && c2 == 'r';
+
+  return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests whether two consecutive characters spell a two-character operator:
+// ==  !=  <=  >=  or  ||  &&  !~
+bool Lexer::is_double_op (int c0, int c1) const
+{
+  switch (c0)
+  {
+  case '=':
+  case '<':
+  case '>':
+    return c1 == '=';
+
+  case '!':
+    return c1 == '=' || c1 == '~';
+
+  case 'o':
+    return c1 == 'r';
+
+  case '|':
+    return c1 == '|';
+
+  case '&':
+    return c1 == '&';
+
+  default:
+    return false;
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests whether a character is a single-character operator:
+// +  -  *  /  (  )  <  >  ^  !  %  =  ~
+bool Lexer::is_single_op (int c) const
+{
+  switch (c)
+  {
+  case '+':
+  case '-':
+  case '*':
+  case '/':
+  case '(':
+  case ')':
+  case '<':
+  case '>':
+  case '^':
+  case '!':
+  case '%':
+  case '=':
+  case '~':
+    return true;
+
+  default:
+    return false;
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests for a decimal digit, '0' through '9'.
+bool Lexer::is_dec_digit (int c) const
+{
+  if (c < '0')
+    return false;
+
+  return c <= '9';
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests for a hexadecimal digit: 0-9, a-f or A-F.
+bool Lexer::is_hex_digit (int c) const
+{
+  if (c >= '0' && c <= '9')
+    return true;
+
+  if (c >= 'a' && c <= 'f')
+    return true;
+
+  return c >= 'A' && c <= 'F';
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Maps the character following a backslash to the character it denotes.
+// Characters with no special meaning are returned unchanged.
+int Lexer::decode_escape (int c) const
+{
+  if (c == 'b')  return 0x08;
+  if (c == 'f')  return 0x0C;
+  if (c == 'n')  return 0x0A;
+  if (c == 'r')  return 0x0D;
+  if (c == 't')  return 0x09;
+  if (c == 'v')  return 0x0B;
+  if (c == '\'') return 0x27;
+  if (c == '"')  return 0x22;
+  if (c == '\\') return 0x5C;
+
+  return c;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Converts one hexadecimal digit to its value.  Anything that is neither a
+// decimal digit nor in 'a'..'f' is treated as an upper-case digit, so the
+// caller is expected to have checked the character with is_hex_digit ().
+int Lexer::hex_to_int (int c) const
+{
+  if (c >= '0' && c <= '9')
+    return (c - '0');
+
+  const int base = (c >= 'a' && c <= 'f') ? 'a' : 'A';
+  return (c - base + 10);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Converts two hexadecimal digits, most significant first, to their value.
+int Lexer::hex_to_int (int c0, int c1) const
+{
+  const int high = hex_to_int (c0) << 4;
+  const int low  = hex_to_int (c1);
+  return high + low;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Converts four hexadecimal digits, most significant first, to their value.
+int Lexer::hex_to_int (int c0, int c1, int c2, int c3) const
+{
+  const int d3 = hex_to_int (c0) << 12;
+  const int d2 = hex_to_int (c1) << 8;
+  const int d1 = hex_to_int (c2) << 4;
+  const int d0 = hex_to_int (c3);
+  return d3 + d2 + d1 + d0;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Slides the look-ahead window along by one character: _n0 is dropped and the
+// next character from utf8_next_char () becomes _n3.
+void Lexer::shift ()
+{
+  const int incoming = utf8_next_char (_input, _i);
+
+  _n0 = _n1;
+  _n1 = _n2;
+  _n2 = _n3;
+  _n3 = incoming;
+
+  //std::cout << "# shift [" << (char) _n0 << (char) _n1 << (char) _n2 << (char) _n3 << "]\n";
+}
+
+////////////////////////////////////////////////////////////////////////////////
--- a/src/Lexer.h
+++ b/src/Lexer.h
@@ -0,0 +1,94 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// Copyright 2013 - 2014, Paul Beckingham, Federico Hernandez.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+// http://www.opensource.org/licenses/mit-license.php
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef INCLUDED_LEXER
+#define INCLUDED_LEXER
+
+#include <vector>
+#include <string>
+
+// Splits an input string into typed tokens, one per call to token ().
+class Lexer
+{
+public:
+  // Token types.  The entries marked "Intermediate" are also used as internal
+  // states while a token is being collected.
+  enum Type
+  {
+    typeNone = 0,
+    typeString,
+    typeIdentifier,
+    typeIdentifierEscape,    // Intermediate
+    typeEscape,              // Intermediate
+    typeEscapeHex,           // Intermediate
+    typeEscapeUnicode,       // Intermediate
+    typeNumber,
+    typeDecimal,
+    typeExponentIndicator,   // Intermediate
+    typeExponent,            // Intermediate
+    typeHex,
+    typeOperator,
+    typeDate,
+    typeDuration,
+  };
+
+  // Construction and destruction.
+  Lexer (const std::string& input);
+  virtual ~Lexer ();
+
+  // Declared only.
+  Lexer (const Lexer& other);               // Not implemented.
+  Lexer& operator= (const Lexer& other);    // Not implemented.
+  bool operator== (const Lexer& other);     // Not implemented.
+
+  // Extracts the next token and its type; returns false when there are none
+  // left.
+  bool token (std::string& token, Type& type);
+
+  // Sets the ambiguity flag used for date parsing.
+  void ambiguity (bool value);
+
+  // Helpers usable without a Lexer instance.
+  static const std::string type_name (const Type& type);
+  static bool is_ws (int c);
+
+private:
+  // Character classification.
+  bool is_punct (int c) const;
+  bool is_num (int c) const;
+  bool is_ident_start (int c) const;
+  bool is_ident (int c) const;
+  bool is_triple_op (int c0, int c1, int c2) const;
+  bool is_double_op (int c0, int c1) const;
+  bool is_single_op (int c) const;
+  bool is_dec_digit (int c) const;
+  bool is_hex_digit (int c) const;
+
+  // Escape and hexadecimal decoding.
+  int decode_escape (int c) const;
+  int hex_to_int (int c) const;
+  int hex_to_int (int c0, int c1) const;
+  int hex_to_int (int c0, int c1, int c2, int c3) const;
+
+  // Advances the look-ahead window by one character.
+  void shift ();
+
+private:
+  const std::string _input;       // Copy of the text being lexed.
+  std::string::size_type _i;      // Read position within _input.
+  int _n0;                        // Current character.
+  int _n1;                        // Look-ahead, one character on.
+  int _n2;                        // Look-ahead, two characters on.
+  int _n3;                        // Look-ahead, three characters on.
+  bool _ambiguity;                // Passed to the date parser.
+};
+
+#endif
+
+////////////////////////////////////////////////////////////////////////////////