mirror of
https://github.com/GothenburgBitFactory/timewarrior.git
synced 2025-07-07 20:06:39 +02:00
Lexer: Added minimal lexer that currently only identifies words
This commit is contained in:
parent
77b1c6ad35
commit
897bc0bc14
3 changed files with 209 additions and 0 deletions
|
@ -5,6 +5,7 @@ include_directories (${CMAKE_SOURCE_DIR}
|
|||
${TIMEW_INCLUDE_DIRS})
|
||||
|
||||
set (timew_SRCS Grammar.cpp Grammar.h
|
||||
Lexer.cpp Lexer.h
|
||||
LR0.cpp LR0.h)
|
||||
|
||||
add_library (timew STATIC ${timew_SRCS})
|
||||
|
|
149
src/Lexer.cpp
Normal file
149
src/Lexer.cpp
Normal file
|
@ -0,0 +1,149 @@
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Copyright 2013 - 2015, Paul Beckingham, Federico Hernandez.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included
|
||||
// in all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
//
|
||||
// http://www.opensource.org/licenses/mit-license.php
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <cmake.h>
|
||||
#include <Lexer.h>
|
||||
#include <algorithm>
|
||||
#include <ctype.h>
|
||||
#include <utf8.h>
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
Lexer::Lexer (const std::string& text)
|
||||
: _text (text)
|
||||
, _cursor (0)
|
||||
, _eos (text.size ())
|
||||
{
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// When a Lexer object is constructed with a string, this method walks through
|
||||
// the stream of low-level tokens.
|
||||
bool Lexer::token (std::string& token, Lexer::Type& type)
|
||||
{
|
||||
// Eat white space.
|
||||
while (isWhitespace (_text[_cursor]))
|
||||
utf8_next_char (_text, _cursor);
|
||||
|
||||
// Terminate at EOS.
|
||||
if (isEOS ())
|
||||
return false;
|
||||
|
||||
if (isWord (token, type))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Complete Unicode whitespace list.
|
||||
//
|
||||
// http://en.wikipedia.org/wiki/Whitespace_character
|
||||
// Updated 2015-09-13
|
||||
// Static
|
||||
//
|
||||
// TODO This list should be derived from the Unicode database.
|
||||
bool Lexer::isWhitespace (int c)
|
||||
{
|
||||
return (c == 0x0020 || // space Common Separator, space
|
||||
c == 0x0009 || // Common Other, control HT, Horizontal Tab
|
||||
c == 0x000A || // Common Other, control LF, Line feed
|
||||
c == 0x000B || // Common Other, control VT, Vertical Tab
|
||||
c == 0x000C || // Common Other, control FF, Form feed
|
||||
c == 0x000D || // Common Other, control CR, Carriage return
|
||||
c == 0x0085 || // Common Other, control NEL, Next line
|
||||
c == 0x00A0 || // no-break space Common Separator, space
|
||||
c == 0x1680 || // ogham space mark Ogham Separator, space
|
||||
c == 0x180E || // mongolian vowel separator Mongolian Separator, space
|
||||
c == 0x2000 || // en quad Common Separator, space
|
||||
c == 0x2001 || // em quad Common Separator, space
|
||||
c == 0x2002 || // en space Common Separator, space
|
||||
c == 0x2003 || // em space Common Separator, space
|
||||
c == 0x2004 || // three-per-em space Common Separator, space
|
||||
c == 0x2005 || // four-per-em space Common Separator, space
|
||||
c == 0x2006 || // six-per-em space Common Separator, space
|
||||
c == 0x2007 || // figure space Common Separator, space
|
||||
c == 0x2008 || // punctuation space Common Separator, space
|
||||
c == 0x2009 || // thin space Common Separator, space
|
||||
c == 0x200A || // hair space Common Separator, space
|
||||
c == 0x200B || // zero width space
|
||||
c == 0x200C || // zero width non-joiner
|
||||
c == 0x200D || // zero width joiner
|
||||
c == 0x2028 || // line separator Common Separator, line
|
||||
c == 0x2029 || // paragraph separator Common Separator, paragraph
|
||||
c == 0x202F || // narrow no-break space Common Separator, space
|
||||
c == 0x205F || // medium mathematical space Common Separator, space
|
||||
c == 0x2060 || // word joiner
|
||||
c == 0x3000); // ideographic space Common Separator, space
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Static helper: reports whether 'c' is one of the operators that consist of
// exactly one character.
bool Lexer::isSingleCharOperator (int c)
{
  switch (c)
  {
  case '+':   // Addition
  case '-':   // Subtraction or unary minus = ambiguous
  case '*':   // Multiplication
  case '/':   // Division
  case '(':   // Precedence open parenthesis
  case ')':   // Precedence close parenthesis
  case '<':   // Less than
  case '>':   // Greater than
  case '^':   // Exponent
  case '!':   // Unary not
  case '%':   // Modulus
  case '=':   // Partial match
  case '~':   // Pattern match
    return true;

  default:
    return false;
  }
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
bool Lexer::isEOS () const
|
||||
{
|
||||
return _cursor >= _eos;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Lexer::Type::word
|
||||
// [^\s]+
|
||||
bool Lexer::isWord (std::string& token, Lexer::Type& type)
|
||||
{
|
||||
std::size_t marker = _cursor;
|
||||
|
||||
while (_text[marker] &&
|
||||
! isWhitespace (_text[marker]) &&
|
||||
! isSingleCharOperator (_text[marker]))
|
||||
utf8_next_char (_text, marker);
|
||||
|
||||
if (marker > _cursor)
|
||||
{
|
||||
token = _text.substr (_cursor, marker - _cursor);
|
||||
type = Lexer::Type::word;
|
||||
_cursor = marker;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
59
src/Lexer.h
Normal file
59
src/Lexer.h
Normal file
|
@ -0,0 +1,59 @@
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Copyright 2013 - 2015, Paul Beckingham, Federico Hernandez.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included
|
||||
// in all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
//
|
||||
// http://www.opensource.org/licenses/mit-license.php
|
||||
//
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef INCLUDED_LEXER
|
||||
#define INCLUDED_LEXER
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <cstddef>
|
||||
|
||||
// Minimal tokenizer: splits a string into low-level tokens, one per call to
// token ().  Words are the only token type at present.
class Lexer
{
public:
  // Classification reported alongside each token.
  enum class Type { word };

  // Copies 'text'; scanning starts at its first byte.
  Lexer (const std::string& text);

  // Fetches the next token and its type.  Returns false when no token was
  // produced.
  bool token (std::string& token, Type& type);

  // Static helpers.  Both classify a single character value.
  static bool isWhitespace (int c);
  static bool isSingleCharOperator (int c);

  // Stream Classifiers.  These look at the text at the cursor; isWord also
  // advances the cursor when it matches.
  bool isEOS () const;
  bool isWord (std::string& token, Type& type);

private:
  std::string _text;     // Private copy of the text being scanned.
  std::size_t _cursor;   // Byte offset of the next unread character.
  std::size_t _eos;      // Size of _text in bytes.
};
|
||||
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
Loading…
Add table
Add a link
Reference in a new issue