mirror of
https://github.com/GothenburgBitFactory/taskwarrior.git
synced 2025-07-07 20:06:36 +02:00

- Implemented sequence --> infix converter. - Added new Lexer code. - Added Lexer unit tests.
374 lines
11 KiB
C++
374 lines
11 KiB
C++
////////////////////////////////////////////////////////////////////////////////
|
|
// taskwarrior - a command line task list manager.
|
|
//
|
|
// Copyright 2011, Paul Beckingham, Federico Hernandez.
|
|
// All rights reserved.
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify it under
|
|
// the terms of the GNU General Public License as published by the Free Software
|
|
// Foundation; either version 2 of the License, or (at your option) any later
|
|
// version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful, but WITHOUT
|
|
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
// FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
|
|
// details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License along with
|
|
// this program; if not, write to the
|
|
//
|
|
// Free Software Foundation, Inc.,
|
|
// 51 Franklin Street, Fifth Floor,
|
|
// Boston, MA
|
|
// 02110-1301
|
|
// USA
|
|
//
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// This lexer works by breaking the input stream into tokens. The essence of
|
|
// the algorithm lies in the distinction between adjacent tokens, such that
|
|
// between the two extremes lies a good solution.
|
|
//
|
|
// At one extreme, the entire input is considered one token. Clearly this is
|
|
// only correct for trivial input. At the other extreme, every character of the
|
|
// input is a token. This is also wrong.
|
|
//
|
|
// If the input is as follows:
|
|
//
|
|
// It is almost 11:00am.
|
|
//
|
|
// The desired tokenization is:
|
|
//
|
|
// It
|
|
// <space>
|
|
// is
|
|
// <space>
|
|
// almost
|
|
// <space>
|
|
// 11
|
|
// :
|
|
// 00
|
|
// am
|
|
// .
|
|
// \n
|
|
//
|
|
// This can be achieved by allowing transitions to denote token boundaries.
|
|
// Given the following character classes:
|
|
//
|
|
// letter: a-z A-Z
|
|
// digit: 0-9
|
|
// whitespace: <space> <tab> <newline> <cr> <form-feed>
|
|
// other: Everything else
|
|
//
|
|
// Then a token boundary is a transition between:
|
|
// letter -> !letter
|
|
// digit -> !digit
|
|
// whitespace -> any
|
|
// other -> any
|
|
//
|
|
// This has the effect of allowing groups of consecutive letters to be
|
|
// considered one token, as well as groups of digits.
|
|
//
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
#include <iostream>
|
|
#include <util.h>
|
|
#include <Lexer.h>
|
|
|
|
// Character class codes returned by Lexer::classify.  They are negative and
// distinct; only their identity matters, not their ordering.
namespace
{
  const int other = -1;   // Not a member of any configured class.
  const int alpha = -2;   // Member of the alpha set.
  const int digit = -3;   // Member of the digit set.
  const int white = -4;   // Member of the whitespace set.
  const int quote = -5;   // Member of the quote set.
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Initializes mInput from 'input' and installs the default configuration:
//   - alpha is a-z and A-Z, digit is 0-9, quote is ' and ", whitespace is
//     space, tab, newline, carriage return and form feed;
//   - runs of alpha and runs of digit characters are coalesced into one token;
//   - quoted regions and whitespace runs are not coalesced;
//   - whitespace tokens are not skipped.
Lexer::Lexer (const std::string& input)
  : mInput          (input),

    // Default character classes.
    mAlpha          ("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"),
    mDigit          ("0123456789"),
    mQuote          ("'\""),
    mWhite          (" \t\n\r\f"),

    // Default coalescing and skipping behavior.
    mAlphaCoalesce  (true),
    mDigitCoalesce  (true),
    mQuotedCoalesce (false),
    mWhiteCoalesce  (false),
    mSkipWhitespace (false)
{
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::tokenize (std::vector <std::string>& all)
|
|
{
|
|
all.clear (); // Prevent repeated accumulation.
|
|
|
|
std::string token;
|
|
bool inQuote = false;
|
|
char quoteChar = '\0';
|
|
for (unsigned int i = 0; i < mInput.length (); ++i)
|
|
{
|
|
bool specialFound = false;
|
|
for (unsigned int s = 0; s < mSpecialTokens.size (); ++s)
|
|
{
|
|
std::string potential = mInput.substr (
|
|
i, min (mSpecialTokens[s].length (), mInput.length () - i));
|
|
|
|
if (potential == mSpecialTokens[s])
|
|
{
|
|
// Capture currently assembled token, the special token, increment over
|
|
// that token, and skip all remaining code in the loop.
|
|
if (token.length ())
|
|
{
|
|
all.push_back (token);
|
|
token = "";
|
|
}
|
|
|
|
all.push_back (potential);
|
|
i += potential.length () - 1;
|
|
specialFound = true;
|
|
}
|
|
}
|
|
|
|
if (specialFound)
|
|
continue;
|
|
|
|
char c = mInput[i];
|
|
char next = '\0';
|
|
if (i < mInput.length () - 1)
|
|
next = mInput[i + 1];
|
|
|
|
// Classify current and next characters.
|
|
int thisChar = classify (c);
|
|
int nextChar = classify (next);
|
|
|
|
// Properly set inQuote, quoteChar.
|
|
if (!inQuote && thisChar == quote)
|
|
{
|
|
quoteChar = c;
|
|
inQuote = true;
|
|
}
|
|
else if (inQuote && c == quoteChar)
|
|
{
|
|
inQuote = false;
|
|
}
|
|
|
|
// Detect transitions.
|
|
bool transition = false;
|
|
if (thisChar != nextChar)
|
|
transition = true;
|
|
|
|
token += c;
|
|
|
|
// Transitions mean new token. All 'other' characters are separate tokens.
|
|
if (transition || nextChar == other)
|
|
{
|
|
if (!inQuote || !mQuotedCoalesce)
|
|
{
|
|
if (!mSkipWhitespace || thisChar != white)
|
|
all.push_back (token);
|
|
token = "";
|
|
}
|
|
}
|
|
|
|
// Non-transitions - runs.
|
|
else
|
|
{
|
|
// Runs may be optionally coalesced.
|
|
if (!(mAlphaCoalesce && nextChar == alpha) &&
|
|
!(mDigitCoalesce && nextChar == digit) &&
|
|
!(mWhiteCoalesce && nextChar == white))
|
|
{
|
|
if (!inQuote || !mQuotedCoalesce)
|
|
{
|
|
if (!mSkipWhitespace || thisChar != white)
|
|
all.push_back (token);
|
|
token = "";
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::categorizeAsAlpha (char value)
|
|
{
|
|
if (mAlpha.find (value) == std::string::npos)
|
|
mAlpha += value;
|
|
|
|
std::string::size_type pos;
|
|
if ((pos = mDigit.find (value)) != std::string::npos) mDigit.erase (pos, 1);
|
|
if ((pos = mQuote.find (value)) != std::string::npos) mQuote.erase (pos, 1);
|
|
if ((pos = mWhite.find (value)) != std::string::npos) mWhite.erase (pos, 1);
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::ignoreAsAlpha (char value)
|
|
{
|
|
std::string::size_type pos;
|
|
if ((pos = mAlpha.find (value)) != std::string::npos) mAlpha.erase (pos, 1);
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::setAlpha (const std::string& value)
|
|
{
|
|
mAlpha = value;
|
|
|
|
std::string::size_type pos;
|
|
for (unsigned int i = 0; i < mAlpha.length (); ++i)
|
|
{
|
|
if ((pos = mDigit.find (mAlpha[i])) != std::string::npos) mDigit.erase (pos, 1);
|
|
if ((pos = mQuote.find (mAlpha[i])) != std::string::npos) mQuote.erase (pos, 1);
|
|
if ((pos = mWhite.find (mAlpha[i])) != std::string::npos) mWhite.erase (pos, 1);
|
|
}
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::categorizeAsDigit (char value)
|
|
{
|
|
if (mDigit.find (value) == std::string::npos)
|
|
mDigit += value;
|
|
|
|
std::string::size_type pos;
|
|
if ((pos = mAlpha.find (value)) != std::string::npos) mAlpha.erase (pos, 1);
|
|
if ((pos = mQuote.find (value)) != std::string::npos) mQuote.erase (pos, 1);
|
|
if ((pos = mWhite.find (value)) != std::string::npos) mWhite.erase (pos, 1);
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::ignoreAsDigit (char value)
|
|
{
|
|
std::string::size_type pos;
|
|
if ((pos = mDigit.find (value)) != std::string::npos) mDigit.erase (pos, 1);
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::setDigit (const std::string& value)
|
|
{
|
|
mDigit = value;
|
|
|
|
std::string::size_type pos;
|
|
for (unsigned int i = 0; i < mDigit.length (); ++i)
|
|
{
|
|
if ((pos = mAlpha.find (mDigit[i])) != std::string::npos) mAlpha.erase (pos, 1);
|
|
if ((pos = mQuote.find (mDigit[i])) != std::string::npos) mQuote.erase (pos, 1);
|
|
if ((pos = mWhite.find (mDigit[i])) != std::string::npos) mWhite.erase (pos, 1);
|
|
}
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::categorizeAsQuote (char value)
|
|
{
|
|
if (mQuote.find (value) == std::string::npos)
|
|
mQuote += value;
|
|
|
|
std::string::size_type pos;
|
|
if ((pos = mAlpha.find (value)) != std::string::npos) mAlpha.erase (pos, 1);
|
|
if ((pos = mDigit.find (value)) != std::string::npos) mDigit.erase (pos, 1);
|
|
if ((pos = mWhite.find (value)) != std::string::npos) mWhite.erase (pos, 1);
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::ignoreAsQuote (char value)
|
|
{
|
|
std::string::size_type pos;
|
|
if ((pos = mQuote.find (value)) != std::string::npos) mQuote.erase (pos, 1);
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::setQuote (const std::string& value)
|
|
{
|
|
mQuote = value;
|
|
|
|
std::string::size_type pos;
|
|
for (unsigned int i = 0; i < mQuote.length (); ++i)
|
|
{
|
|
if ((pos = mAlpha.find (mQuote[i])) != std::string::npos) mAlpha.erase (pos, 1);
|
|
if ((pos = mDigit.find (mQuote[i])) != std::string::npos) mDigit.erase (pos, 1);
|
|
if ((pos = mWhite.find (mQuote[i])) != std::string::npos) mWhite.erase (pos, 1);
|
|
}
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::categorizeAsWhite (char value)
|
|
{
|
|
if (mWhite.find (value) == std::string::npos)
|
|
mWhite += value;
|
|
|
|
std::string::size_type pos;
|
|
if ((pos = mAlpha.find (value)) != std::string::npos) mAlpha.erase (pos, 1);
|
|
if ((pos = mDigit.find (value)) != std::string::npos) mDigit.erase (pos, 1);
|
|
if ((pos = mQuote.find (value)) != std::string::npos) mQuote.erase (pos, 1);
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::ignoreAsWhite (char value)
|
|
{
|
|
std::string::size_type pos;
|
|
if ((pos = mWhite.find (value)) != std::string::npos) mWhite.erase (pos, 1);
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::setWhite (const std::string& value)
|
|
{
|
|
mWhite = value;
|
|
|
|
std::string::size_type pos;
|
|
for (unsigned int i = 0; i < mWhite.length (); ++i)
|
|
{
|
|
if ((pos = mAlpha.find (mWhite[i])) != std::string::npos) mAlpha.erase (pos, 1);
|
|
if ((pos = mDigit.find (mWhite[i])) != std::string::npos) mDigit.erase (pos, 1);
|
|
if ((pos = mQuote.find (mWhite[i])) != std::string::npos) mQuote.erase (pos, 1);
|
|
}
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::coalesceAlpha (bool value)
|
|
{
|
|
mAlphaCoalesce = value;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::coalesceDigits (bool value)
|
|
{
|
|
mDigitCoalesce = value;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::coalesceQuoted (bool value)
|
|
{
|
|
mQuotedCoalesce = value;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::coalesceWhite (bool value)
|
|
{
|
|
mWhiteCoalesce = value;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::skipWhitespace (bool value)
|
|
{
|
|
mSkipWhitespace = value;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
void Lexer::specialToken (const std::string& special)
|
|
{
|
|
mSpecialTokens.push_back (special);
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
int Lexer::classify (char c)
|
|
{
|
|
if (mAlpha.find (c) != std::string::npos) return alpha;
|
|
if (mDigit.find (c) != std::string::npos) return digit;
|
|
if (mWhite.find (c) != std::string::npos) return white;
|
|
if (mQuote.find (c) != std::string::npos) return quote;
|
|
|
|
return other;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|