- Lexer now makes a speculative legacy dateformat parse whenever it encounters
  a decimal digit.  This assumes that rc.dateformat begins with a numeric date
  element, which is a restriction, but not a big one.
This commit is contained in:
Paul Beckingham 2014-05-29 18:09:11 -04:00
parent 79576819c3
commit 592a3bb60f
3 changed files with 75 additions and 51 deletions

View file

@ -641,6 +641,7 @@ void Context::staticInitialization ()
Task::coefficients[*var] = config.getReal (*var);
}
Lexer::dateFormat = config.get ("dateformat");
Variant::dateFormat = config.get ("dateformat");
}

View file

@ -26,9 +26,12 @@
#include <utf8.h>
#include <ISO8601.h>
#include <Date.h>
#include <Duration.h>
#include <Lexer.h>
std::string Lexer::dateFormat = "";
////////////////////////////////////////////////////////////////////////////////
Lexer::Lexer (const std::string& input)
: _input (input)
@ -53,10 +56,10 @@ Lexer::~Lexer ()
////////////////////////////////////////////////////////////////////////////////
// Walk the input string, looking for transitions.
bool Lexer::token (std::string& token, Type& type)
bool Lexer::token (std::string& result, Type& type)
{
// Start with nothing.
token = "";
result = "";
// Different types of matching quote: ', ".
int quote = 0;
@ -80,80 +83,98 @@ bool Lexer::token (std::string& token, Type& type)
is_hex_digit (_n2))
{
type = typeHex;
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
}
else if (is_dec_digit (_n0))
{
// Speculatively try a date and duration parse. Longest wins.
std::string::size_type iso_i = 0;
std::string iso_token;
std::string iso_result;
ISO8601d iso;
iso.ambiguity (_ambiguity);
if (iso.parse (_input.substr (_i < 4 ? 0 : _i - 4), iso_i))
iso_token = _input.substr ((_i < 4 ? 0 : _i - 4), iso_i);
iso_result = _input.substr ((_i < 4 ? 0 : _i - 4), iso_i);
std::string::size_type dur_i = 0;
std::string dur_token;
std::string dur_result;
Duration dur;
if (dur.parse (_input.substr (_i < 4 ? 0 : _i - 4), dur_i))
dur_token = _input.substr ((_i < 4 ? 0 : _i - 4), dur_i);
dur_result = _input.substr ((_i < 4 ? 0 : _i - 4), dur_i);
if (iso_token.length () > dur_token.length ())
if (iso_result.length () > dur_result.length ())
{
while (iso_i--) shift ();
token = iso_token;
result = iso_result;
type = typeDate;
return true;
}
else if (dur_token.length () > iso_token.length ())
else if (dur_result.length () > iso_result.length ())
{
while (dur_i--) shift ();
token = dur_token;
result = dur_result;
type = typeDuration;
return true;
}
// TODO Try an rc.dateformat parse here.
// Try a legacy rc.dateformat parse here.
try
{
std::string::size_type start = _i < 4 ? 0 : _i - 4;
std::string::size_type space = _input.find (' ', _i);
if (space == std::string::npos)
space = _input.length ();
std::string legacy = _input.substr (start, space - start);
Date legacyDate (legacy, Lexer::dateFormat, false, false);
space -= start;
while (space--) shift ();
result = legacy;
type = typeDate;
return true;
}
catch (...) { /* Never mind. */ }
type = typeNumber;
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
}
else if (_n0 == '.' && is_dec_digit (_n1))
{
type = typeDecimal;
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
}
else if (is_triple_op (_n0, _n1, _n2))
{
type = typeOperator;
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
return true;
}
else if (is_double_op (_n0, _n1))
{
type = typeOperator;
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
return true;
}
else if (is_single_op (_n0))
{
type = typeOperator;
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
return true;
}
@ -166,34 +187,34 @@ bool Lexer::token (std::string& token, Type& type)
{
// Speculatively try a date and duration parse. Longest wins.
std::string::size_type iso_i = 0;
std::string iso_token;
std::string iso_result;
ISO8601p iso;
if (iso.parse (_input.substr (_i < 4 ? 0 : _i - 4), iso_i))
iso_token = _input.substr ((_i < 4 ? 0 : _i - 4), iso_i);
iso_result = _input.substr ((_i < 4 ? 0 : _i - 4), iso_i);
std::string::size_type dur_i = 0;
std::string dur_token;
std::string dur_result;
Duration dur;
if (dur.parse (_input.substr (_i < 4 ? 0 : _i - 4), dur_i))
dur_token = _input.substr ((_i < 4 ? 0 : _i - 4), dur_i);
dur_result = _input.substr ((_i < 4 ? 0 : _i - 4), dur_i);
if (iso_token.length () > dur_token.length ())
if (iso_result.length () > dur_result.length ())
{
while (iso_i--) shift ();
token = iso_token;
result = iso_result;
type = typeDuration;
return true;
}
else if (dur_token.length () > iso_token.length ())
else if (dur_result.length () > iso_result.length ())
{
while (dur_i--) shift ();
token = dur_token;
result = dur_result;
type = typeDuration;
return true;
}
type = typeIdentifier;
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
}
else
@ -214,7 +235,7 @@ bool Lexer::token (std::string& token, Type& type)
}
else
{
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
}
break;
@ -222,7 +243,7 @@ bool Lexer::token (std::string& token, Type& type)
case typeIdentifier:
if (is_ident (_n0))
{
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
}
else
@ -230,10 +251,10 @@ bool Lexer::token (std::string& token, Type& type)
// typeIdentifier is a catch-all type. Anything word-like becomes an
// identifier. At this point in the processing, an identifier is found,
// and can be matched against a list of potential upgrades.
if (token == "_hastag_" ||
token == "_notag_" ||
token == "_neg_" ||
token == "_pos_")
if (result == "_hastag_" ||
result == "_notag_" ||
result == "_neg_" ||
result == "_pos_")
type = typeOperator;
return true;
@ -261,7 +282,7 @@ bool Lexer::token (std::string& token, Type& type)
}
else
{
token += decode_escape (_n0);
result += decode_escape (_n0);
type = quote ? typeString : typeIdentifier;
shift ();
}
@ -270,7 +291,7 @@ bool Lexer::token (std::string& token, Type& type)
case typeEscapeHex:
if (is_hex_digit (_n0) && is_hex_digit (_n1))
{
token += utf8_character (hex_to_int (_n0, _n1));
result += utf8_character (hex_to_int (_n0, _n1));
type = quote ? typeString : typeIdentifier;
shift ();
shift ();
@ -290,7 +311,7 @@ bool Lexer::token (std::string& token, Type& type)
is_hex_digit (_n2) &&
is_hex_digit (_n3))
{
token += utf8_character (hex_to_int (_n0, _n1, _n2, _n3));
result += utf8_character (hex_to_int (_n0, _n1, _n2, _n3));
shift ();
shift ();
shift ();
@ -309,19 +330,19 @@ bool Lexer::token (std::string& token, Type& type)
case typeNumber:
if (is_dec_digit (_n0))
{
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
}
else if (_n0 == '.')
{
type = typeDecimal;
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
}
else if (_n0 == 'e' || _n0 == 'E')
{
type = typeExponentIndicator;
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
}
else
@ -333,13 +354,13 @@ bool Lexer::token (std::string& token, Type& type)
case typeDecimal:
if (is_dec_digit (_n0))
{
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
}
else if (_n0 == 'e' || _n0 == 'E')
{
type = typeExponentIndicator;
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
}
else
@ -351,13 +372,13 @@ bool Lexer::token (std::string& token, Type& type)
case typeExponentIndicator:
if (_n0 == '+' || _n0 == '-')
{
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
}
else if (is_dec_digit (_n0))
{
type = typeExponent;
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
}
break;
@ -365,12 +386,12 @@ bool Lexer::token (std::string& token, Type& type)
case typeExponent:
if (is_dec_digit (_n0))
{
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
}
else if (_n0 == '.')
{
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
}
else
@ -383,7 +404,7 @@ bool Lexer::token (std::string& token, Type& type)
case typeHex:
if (is_hex_digit (_n0))
{
token += utf8_character (_n0);
result += utf8_character (_n0);
shift ();
}
else
@ -398,7 +419,7 @@ bool Lexer::token (std::string& token, Type& type)
}
// Fence post.
if (!_n0 && token != "")
if (!_n0 && result != "")
return true;
}

View file

@ -33,6 +33,8 @@
class Lexer
{
public:
static std::string dateFormat;
enum Type
{
typeNone = 0,