Lexer: Added minimal lexer that currently only identifies words

This commit is contained in:
Paul Beckingham 2015-12-20 12:12:07 -05:00
parent 77b1c6ad35
commit 897bc0bc14
3 changed files with 209 additions and 0 deletions

149
src/Lexer.cpp Normal file
View file

@ -0,0 +1,149 @@
////////////////////////////////////////////////////////////////////////////////
//
// Copyright 2013 - 2015, Paul Beckingham, Federico Hernandez.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
// http://www.opensource.org/licenses/mit-license.php
//
////////////////////////////////////////////////////////////////////////////////
#include <cmake.h>
#include <Lexer.h>
#include <algorithm>
#include <ctype.h>
#include <utf8.h>
////////////////////////////////////////////////////////////////////////////////
// Prepares a Lexer over the given text: _text is initialized from 'text', the
// cursor starts at offset 0, and _eos records the size of 'text' as reported
// by std::string::size ().
Lexer::Lexer (const std::string& text)
: _text (text)
, _cursor (0)
, _eos (text.size ())
{
}
////////////////////////////////////////////////////////////////////////////////
// When a Lexer object is constructed with a string, this method walks through
// the stream of low-level tokens.
//
// Each call skips leading whitespace and then tries to read one token at the
// cursor. On success the token text is stored in 'token', its kind in 'type',
// the cursor is left just past the token, and true is returned. Returns false
// at end of text, or when no known token type matches at the cursor.
bool Lexer::token (std::string& token, Lexer::Type& type)
{
  // Eat white space. The EOS test comes first so that _text is never indexed
  // beyond its end, even if the cursor was previously moved past the last byte.
  //
  // NOTE(review): isWhitespace is handed a single byte of _text here, not a
  // decoded code point, so it looks like only single-byte whitespace can be
  // skipped - confirm whether multi-byte whitespace is meant to be handled.
  while (! isEOS () &&
         isWhitespace (_text[_cursor]))
    utf8_next_char (_text, _cursor);

  // Terminate at EOS.
  if (isEOS ())
    return false;

  // Words are the only token type recognized so far.
  return isWord (token, type);
}
////////////////////////////////////////////////////////////////////////////////
// Unicode whitespace test.
//
// Returns true when 'c' equals one of the code points listed below, false for
// every other value. The list was taken from
//   http://en.wikipedia.org/wiki/Whitespace_character
// as of 2015-09-13.
//
// Static.
//
// TODO This list should be derived from the Unicode database.
bool Lexer::isWhitespace (int c)
{
  switch (c)
  {
  // Common, Other/control.
  case 0x0009:  // HT, Horizontal Tab
  case 0x000A:  // LF, Line feed
  case 0x000B:  // VT, Vertical Tab
  case 0x000C:  // FF, Form feed
  case 0x000D:  // CR, Carriage return
  case 0x0085:  // NEL, Next line

  // Separator, space.
  case 0x0020:  // space
  case 0x00A0:  // no-break space
  case 0x1680:  // ogham space mark
  case 0x180E:  // mongolian vowel separator
  case 0x2000:  // en quad
  case 0x2001:  // em quad
  case 0x2002:  // en space
  case 0x2003:  // em space
  case 0x2004:  // three-per-em space
  case 0x2005:  // four-per-em space
  case 0x2006:  // six-per-em space
  case 0x2007:  // figure space
  case 0x2008:  // punctuation space
  case 0x2009:  // thin space
  case 0x200A:  // hair space
  case 0x202F:  // narrow no-break space
  case 0x205F:  // medium mathematical space
  case 0x3000:  // ideographic space

  // Separator, line / paragraph.
  case 0x2028:  // line separator
  case 0x2029:  // paragraph separator

  // Zero-width characters and joiners.
  case 0x200B:  // zero width space
  case 0x200C:  // zero width non-joiner
  case 0x200D:  // zero width joiner
  case 0x2060:  // word joiner
    return true;

  default:
    return false;
  }
}
////////////////////////////////////////////////////////////////////////////////
// Returns true when 'c' is one of the single-character operators listed below,
// false for any other value.
bool Lexer::isSingleCharOperator (int c)
{
  switch (c)
  {
  case '+':  // Addition
  case '-':  // Subtraction or unary minus = ambiguous
  case '*':  // Multiplication
  case '/':  // Division
  case '(':  // Precedence open parenthesis
  case ')':  // Precedence close parenthesis
  case '<':  // Less than
  case '>':  // Greater than
  case '^':  // Exponent
  case '!':  // Unary not
  case '%':  // Modulus
  case '=':  // Partial match
  case '~':  // Pattern match
    return true;

  default:
    return false;
  }
}
////////////////////////////////////////////////////////////////////////////////
// End-of-stream test: true once the cursor has reached or gone beyond _eos,
// i.e. there is nothing left to read. Does not modify the Lexer.
bool Lexer::isEOS () const
{
  const bool exhausted = (_eos <= _cursor);
  return exhausted;
}
////////////////////////////////////////////////////////////////////////////////
// Lexer::Type::word
// [^\s]+
bool Lexer::isWord (std::string& token, Lexer::Type& type)
{
std::size_t marker = _cursor;
while (_text[marker] &&
! isWhitespace (_text[marker]) &&
! isSingleCharOperator (_text[marker]))
utf8_next_char (_text, marker);
if (marker > _cursor)
{
token = _text.substr (_cursor, marker - _cursor);
type = Lexer::Type::word;
_cursor = marker;
return true;
}
return false;
}
////////////////////////////////////////////////////////////////////////////////