taskwarrior/src/Lexer.cpp
Paul Beckingham 32566c9844 Bug
- Segfault when 'project:android' is split into 'and' and 'roid' (thanks to
  Richard Boß).
2014-07-07 21:43:09 -04:00

851 lines
22 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

////////////////////////////////////////////////////////////////////////////////
//
// Copyright 2013 - 2014, Paul Beckingham, Federico Hernandez.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
// http://www.opensource.org/licenses/mit-license.php
//
////////////////////////////////////////////////////////////////////////////////
#include <cmake.h>
#include <ctype.h>
#include <utf8.h>
#include <ISO8601.h>
#include <Date.h>
#include <Duration.h>
#include <Lexer.h>
#include <i18n.h>
std::string Lexer::dateFormat = "";
////////////////////////////////////////////////////////////////////////////////
Lexer::Lexer (const std::string& input)
: _input (input)
, _i (0)
, _shift_counter (0)
, _n0 (32)
, _n1 (32)
, _n2 (32)
, _n3 (32)
, _boundary01 (false)
, _boundary12 (false)
, _boundary23 (false)
, _ambiguity (true)
{
// Read 4 chars in preparation. Even if there are < 4. Take a deep breath.
shift ();
shift ();
shift ();
shift ();
// Reset because the four shifts above do not represent advancement into the
// _input. All subsequents shiftѕ do though.
_shift_counter = 0;
}
////////////////////////////////////////////////////////////////////////////////
Lexer::~Lexer ()
{
}
////////////////////////////////////////////////////////////////////////////////
// Walk the input string, looking for transitions.
bool Lexer::token (std::string& result, Type& type)
{
// Start with nothing.
result = "";
// Different types of matching quote: ', ".
int quote = 0;
type = typeNone;
while (_n0)
{
switch (type)
{
case typeNone:
if (is_ws (_n0))
shift ();
else if (_n0 == '"' || _n0 == '\'')
{
type = typeString;
quote = _n0;
shift ();
}
else if (_n0 == '0' &&
_n1 == 'x' &&
is_hex_digit (_n2))
{
type = typeHex;
result += utf8_character (_n0);
shift ();
result += utf8_character (_n0);
shift ();
result += utf8_character (_n0);
shift ();
}
else if (is_dec_digit (_n0))
{
// Speculatively try a date and duration parse. Longest wins.
if (is_date (result))
{
type = typeDate;
return true;
}
if (is_duration (result))
{
type = typeDuration;
return true;
}
type = typeNumber;
result += utf8_character (_n0);
shift ();
}
else if (_n0 == '.' && is_dec_digit (_n1))
{
type = typeDecimal;
result += utf8_character (_n0);
shift ();
}
else if (is_triple_op (_n0, _n1, _n2))
{
type = typeOperator;
result += utf8_character (_n0);
shift ();
result += utf8_character (_n0);
shift ();
result += utf8_character (_n0);
shift ();
return true;
}
else if (is_double_op (_n0, _n1, _n2))
{
type = typeOperator;
result += utf8_character (_n0);
shift ();
result += utf8_character (_n0);
shift ();
return true;
}
else if (is_single_op (_n0))
{
type = typeOperator;
result += utf8_character (_n0);
shift ();
return true;
}
else if (_n0 == '\\')
{
type = typeIdentifierEscape;
shift ();
}
else if (is_ident_start (_n0))
{
if (is_date (result))
{
type = typeDate;
return true;
}
if (is_duration (result))
{
type = typeDuration;
return true;
}
type = typeIdentifier;
result += utf8_character (_n0);
shift ();
}
else
throw std::string (STRING_LEX_IMMEDIATE_UNK);
break;
case typeString:
if (_n0 == quote)
{
shift ();
quote = 0;
return true;
}
else if (_n0 == '\\')
{
type = typeEscape;
shift ();
}
else
{
result += utf8_character (_n0);
shift ();
}
break;
case typeIdentifier:
if (is_ident (_n0))
{
result += utf8_character (_n0);
shift ();
}
else
{
// typeIdentifier is a catch-all type. Anything word-like becomes an
// identifier. At this point in the processing, an identifier is found,
// and can be matched against a list of potential upgrades.
if (result == "_hastag_" ||
result == "_notag_" ||
result == "_neg_" ||
result == "_pos_")
type = typeOperator;
return true;
}
break;
case typeIdentifierEscape:
if (_n0 == 'u')
{
type = typeEscapeUnicode;
shift ();
}
break;
case typeEscape:
if (_n0 == 'x')
{
type = typeEscapeHex;
shift ();
}
else if (_n0 == 'u')
{
type = typeEscapeUnicode;
shift ();
}
else
{
result += decode_escape (_n0);
type = quote ? typeString : typeIdentifier;
shift ();
}
break;
case typeEscapeHex:
if (is_hex_digit (_n0) && is_hex_digit (_n1))
{
result += utf8_character (hex_to_int (_n0, _n1));
type = quote ? typeString : typeIdentifier;
shift ();
shift ();
}
else
{
type = quote ? typeString : typeIdentifier;
shift ();
quote = 0;
return true;
}
break;
case typeEscapeUnicode:
if (is_hex_digit (_n0) &&
is_hex_digit (_n1) &&
is_hex_digit (_n2) &&
is_hex_digit (_n3))
{
result += utf8_character (hex_to_int (_n0, _n1, _n2, _n3));
shift ();
shift ();
shift ();
shift ();
type = quote ? typeString : typeIdentifier;
}
else if (_n0 == quote)
{
type = typeString;
shift ();
quote = 0;
return true;
}
break;
case typeNumber:
if (is_dec_digit (_n0))
{
result += utf8_character (_n0);
shift ();
}
else if (_n0 == '.')
{
type = typeDecimal;
result += utf8_character (_n0);
shift ();
}
else if (_n0 == 'e' || _n0 == 'E')
{
type = typeExponentIndicator;
result += utf8_character (_n0);
shift ();
}
else if (is_ident_start (_n0))
{
type = typeIdentifier;
result += utf8_character (_n0);
shift ();
}
else
{
return true;
}
break;
case typeDecimal:
if (is_dec_digit (_n0))
{
result += utf8_character (_n0);
shift ();
}
else if (_n0 == 'e' || _n0 == 'E')
{
type = typeExponentIndicator;
result += utf8_character (_n0);
shift ();
}
else if (is_ident_start (_n0))
{
type = typeIdentifier;
result += utf8_character (_n0);
shift ();
}
else
{
return true;
}
break;
case typeExponentIndicator:
if (_n0 == '+' || _n0 == '-')
{
result += utf8_character (_n0);
shift ();
}
else if (is_dec_digit (_n0))
{
type = typeExponent;
result += utf8_character (_n0);
shift ();
}
else if (is_ident_start (_n0))
{
type = typeIdentifier;
result += utf8_character (_n0);
shift ();
}
break;
case typeExponent:
if (is_dec_digit (_n0) || _n0 == '.')
{
result += utf8_character (_n0);
shift ();
}
else
{
type = typeDecimal;
return true;
}
break;
case typeHex:
if (is_hex_digit (_n0))
{
result += utf8_character (_n0);
shift ();
}
else
{
return true;
}
break;
default:
throw std::string (STRING_LEX_TYPE_UNK);
break;
}
// Fence post.
if (!_n0 && result != "")
return true;
}
return false;
}
////////////////////////////////////////////////////////////////////////////////
// Just like Lexer::token, but no operators, numbers, dates or durations.
bool Lexer::word (std::string& token, Type& type)
{
// Start with nothing.
token = "";
// Different types of matching quote: ', ".
int quote = 0;
type = typeNone;
while (_n0)
{
switch (type)
{
case typeNone:
if (is_ws (_n0))
shift ();
else if (_n0 == '"' || _n0 == '\'')
{
type = typeString;
quote = _n0;
shift ();
}
else
{
type = typeString;
token += utf8_character (_n0);
shift ();
}
break;
case typeString:
if (_n0 == quote)
{
shift ();
quote = 0;
return true;
}
else if (_n0 == '\\')
{
type = typeEscape;
shift ();
}
else if (! quote && is_ws (_n0))
{
shift ();
return true;
}
else
{
token += utf8_character (_n0);
shift ();
}
break;
case typeEscape:
if (_n0 == 'x')
{
type = typeEscapeHex;
shift ();
}
else if (_n0 == 'u')
{
type = typeEscapeUnicode;
shift ();
}
else
{
token += decode_escape (_n0);
type = typeString;
shift ();
}
break;
case typeEscapeHex:
if (is_hex_digit (_n0) && is_hex_digit (_n1))
{
token += utf8_character (hex_to_int (_n0, _n1));
type = typeString;
shift ();
shift ();
}
else
{
type = typeString;
shift ();
quote = 0;
return true;
}
break;
case typeEscapeUnicode:
if (is_hex_digit (_n0) &&
is_hex_digit (_n1) &&
is_hex_digit (_n2) &&
is_hex_digit (_n3))
{
token += utf8_character (hex_to_int (_n0, _n1, _n2, _n3));
shift ();
shift ();
shift ();
shift ();
type = typeString;
}
else if (_n0 == quote)
{
type = typeString;
shift ();
quote = 0;
return true;
}
break;
default:
throw std::string (STRING_LEX_TYPE_UNK);
break;
}
// Fence post.
if (!_n0 && token != "")
return true;
}
return false;
}
////////////////////////////////////////////////////////////////////////////////
void Lexer::ambiguity (bool value)
{
_ambiguity = value;
}
////////////////////////////////////////////////////////////////////////////////
// No L10N - these are for internal purposes.
const std::string Lexer::type_name (const Type& type)
{
switch (type)
{
case Lexer::typeNone: return "None";
case Lexer::typeString: return "String";
case Lexer::typeIdentifier: return "Identifier";
case Lexer::typeIdentifierEscape: return "IdentifierEscape";
case Lexer::typeNumber: return "Number";
case Lexer::typeDecimal: return "Decimal";
case Lexer::typeExponentIndicator: return "ExponentIndicator";
case Lexer::typeExponent: return "Exponent";
case Lexer::typeHex: return "Hex";
case Lexer::typeOperator: return "Operator";
case Lexer::typeEscape: return "Escape";
case Lexer::typeEscapeHex: return "EscapeHex";
case Lexer::typeEscapeUnicode: return "EscapeUnicode";
case Lexer::typeDate: return "Date";
case Lexer::typeDuration: return "Duration";
}
}
////////////////////////////////////////////////////////////////////////////////
// Complete Unicode whitespace list.
//
// http://en.wikipedia.org/wiki/Whitespace_character
// Updated 2013-11-18
bool Lexer::is_ws (int c)
{
return (c == 0x0020 || // space Common Separator, space
c == 0x0009 || // Common Other, control HT, Horizontal Tab
c == 0x000A || // Common Other, control LF, Line feed
c == 0x000B || // Common Other, control VT, Vertical Tab
c == 0x000C || // Common Other, control FF, Form feed
c == 0x000D || // Common Other, control CR, Carriage return
c == 0x0085 || // Common Other, control NEL, Next line
c == 0x00A0 || // no-break space Common Separator, space
c == 0x1680 || // ogham space mark Ogham Separator, space
c == 0x180E || // mongolian vowel separator Mongolian Separator, space
c == 0x2000 || // en quad Common Separator, space
c == 0x2001 || // em quad Common Separator, space
c == 0x2002 || // en space Common Separator, space
c == 0x2003 || // em space Common Separator, space
c == 0x2004 || // three-per-em space Common Separator, space
c == 0x2005 || // four-per-em space Common Separator, space
c == 0x2006 || // six-per-em space Common Separator, space
c == 0x2007 || // figure space Common Separator, space
c == 0x2008 || // punctuation space Common Separator, space
c == 0x2009 || // thin space Common Separator, space
c == 0x200A || // hair space Common Separator, space
c == 0x2028 || // line separator Common Separator, line
c == 0x2029 || // paragraph separator Common Separator, paragraph
c == 0x202F || // narrow no-break space Common Separator, space
c == 0x205F || // medium mathematical space Common Separator, space
c == 0x3000); // ideographic space Common Separator, space
}
////////////////////////////////////////////////////////////////////////////////
bool Lexer::boundary (int left, int right)
{
// XOR
if (isalpha (left) != isalpha (right)) return true;
if (isdigit (left) != isdigit (right)) return true;
if (isspace (left) != isspace (right)) return true;
// OR
if (ispunct (left) || ispunct (right)) return true;
return false;
}
////////////////////////////////////////////////////////////////////////////////
// Split 'input' into 'words' on Lexer::is_ws boundaries, observing quotes.
void Lexer::word_split (std::vector <std::string>& words, const std::string& input)
{
words.clear ();
std::string word;
Lexer::Type type;
Lexer lex (input);
while (lex.word (word, type))
words.push_back (word);
}
////////////////////////////////////////////////////////////////////////////////
// Split 'input' into 'tokens'.
void Lexer::token_split (std::vector <std::string>& words, const std::string& input)
{
words.clear ();
std::string word;
Lexer::Type type;
Lexer lex (input);
while (lex.token (word, type))
words.push_back (word);
}
////////////////////////////////////////////////////////////////////////////////
// Split 'input' into 'tokens', preserving type.
void Lexer::token_split (std::vector <std::pair <std::string, Lexer::Type> >& lexemes, const std::string& input)
{
lexemes.clear ();
std::string word;
Lexer::Type type;
Lexer lex (input);
while (lex.token (word, type))
lexemes.push_back (std::pair <std::string, Lexer::Type>(word, type));
}
////////////////////////////////////////////////////////////////////////////////
bool Lexer::is_date (std::string& result)
{
// Try an ISO date parse.
std::string::size_type iso_i = 0;
std::string iso_result;
ISO8601d iso;
iso.ambiguity (_ambiguity);
if (iso.parse (_input.substr (_shift_counter), iso_i))
{
result = _input.substr (_shift_counter, iso_i);
while (iso_i--) shift ();
return true;
}
// Try a legacy rc.dateformat parse here.
if (Lexer::dateFormat != "")
{
try
{
std::string::size_type legacy_i = 0;
Date legacyDate (_input.substr (_shift_counter), legacy_i, Lexer::dateFormat, false, false);
result = _input.substr (_shift_counter, legacy_i);
while (legacy_i--) shift ();
return true;
}
catch (...) { /* Never mind. */ }
}
return false;
}
////////////////////////////////////////////////////////////////////////////////
bool Lexer::is_duration (std::string& result)
{
std::string::size_type iso_i = 0;
std::string iso_result;
ISO8601p iso;
if (iso.parse (_input.substr (_shift_counter), iso_i))
{
result = _input.substr (_shift_counter, iso_i);
while (iso_i--) shift ();
return true;
}
std::string::size_type dur_i = 0;
std::string dur_result;
Duration dur;
if (dur.parse (_input.substr (_shift_counter), dur_i))
{
result = _input.substr (_shift_counter, dur_i);
while (dur_i--) shift ();
return true;
}
return false;
}
////////////////////////////////////////////////////////////////////////////////
bool Lexer::is_punct (int c) const
{
if (c == ',' ||
c == '.') // Tab
return true;
return false;
}
////////////////////////////////////////////////////////////////////////////////
bool Lexer::is_num (int c) const
{
if ((c >= '0' && c <= '9') ||
c == '.')
return true;
return false;
}
////////////////////////////////////////////////////////////////////////////////
bool Lexer::is_ident_start (int c) const
{
return c && // Include null character check.
! is_ws (c) &&
! is_dec_digit (c) &&
! is_single_op (c);
}
////////////////////////////////////////////////////////////////////////////////
bool Lexer::is_ident (int c) const
{
return c && // Include null character check.
! is_ws (c) &&
! is_single_op (c);
}
////////////////////////////////////////////////////////////////////////////////
bool Lexer::is_triple_op (int c0, int c1, int c2) const
{
return (c0 == 'a' && c1 == 'n' && c2 == 'd' && _boundary23) ||
(c0 == 'x' && c1 == 'o' && c2 == 'r' && _boundary23) ||
(c0 == '!' && c1 == '=' && c2 == '=');
}
////////////////////////////////////////////////////////////////////////////////
bool Lexer::is_double_op (int c0, int c1, int c2) const
{
return (c0 == '=' && c1 == '=') ||
(c0 == '!' && c1 == '=') ||
(c0 == '<' && c1 == '=') ||
(c0 == '>' && c1 == '=') ||
(c0 == 'o' && c1 == 'r' && _boundary12) ||
(c0 == '|' && c1 == '|') ||
(c0 == '&' && c1 == '&') ||
(c0 == '!' && c1 == '~');
}
////////////////////////////////////////////////////////////////////////////////
bool Lexer::is_single_op (int c) const
{
return c == '+' ||
c == '-' ||
c == '*' ||
c == '/' ||
c == '(' ||
c == ')' ||
c == '<' ||
c == '>' ||
c == '^' ||
c == '!' ||
c == '%' ||
c == '=' ||
c == '~';
}
////////////////////////////////////////////////////////////////////////////////
bool Lexer::is_dec_digit (int c) const
{
return c >= '0' && c <= '9';
}
////////////////////////////////////////////////////////////////////////////////
bool Lexer::is_hex_digit (int c) const
{
return (c >= '0' && c <= '9') ||
(c >= 'a' && c <= 'f') ||
(c >= 'A' && c <= 'F');
}
////////////////////////////////////////////////////////////////////////////////
int Lexer::decode_escape (int c) const
{
switch (c)
{
case 'b': return 0x08;
case 'f': return 0x0C;
case 'n': return 0x0A;
case 'r': return 0x0D;
case 't': return 0x09;
case 'v': return 0x0B;
case '\'': return 0x27;
case '"': return 0x22;
case '\\': return 0x5C;
default: return c;
}
}
////////////////////////////////////////////////////////////////////////////////
int Lexer::hex_to_int (int c) const
{
if (c >= '0' && c <= '9') return (c - '0');
else if (c >= 'a' && c <= 'f') return (c - 'a' + 10);
else return (c - 'A' + 10);
}
////////////////////////////////////////////////////////////////////////////////
int Lexer::hex_to_int (int c0, int c1) const
{
return (hex_to_int (c0) << 4) + hex_to_int (c1);
}
////////////////////////////////////////////////////////////////////////////////
int Lexer::hex_to_int (int c0, int c1, int c2, int c3) const
{
return (hex_to_int (c0) << 12) +
(hex_to_int (c1) << 8) +
(hex_to_int (c2) << 4) +
hex_to_int (c3);
}
////////////////////////////////////////////////////////////////////////////////
void Lexer::shift ()
{
_n0 = _n1;
_n1 = _n2;
_n2 = _n3;
_n3 = utf8_next_char (_input, _i);
++_shift_counter;
// Detect type boundaries between characters.
_boundary01 = boundary (_n0, _n1);
_boundary12 = boundary (_n1, _n2);
_boundary23 = boundary (_n2, _n3);
}
////////////////////////////////////////////////////////////////////////////////