timewarrior/src/Grammar.cpp
2015-12-22 23:40:22 -05:00

262 lines
7.4 KiB
C++

////////////////////////////////////////////////////////////////////////////////
//
// Copyright 2015, Paul Beckingham, Federico Hernandez.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
// http://www.opensource.org/licenses/mit-license.php
//
////////////////////////////////////////////////////////////////////////////////
#include <cmake.h>
#include <Grammar.h>
#include <Lexer.h>
#include <text.h>
#include <sstream>
////////////////////////////////////////////////////////////////////////////////
// Default constructor. Performs no work of its own; every member is simply
// default-initialized.
Grammar::Grammar () = default;
////////////////////////////////////////////////////////////////////////////////
// Read the whole of 'file' and hand its contents to loadFromString.
//
// Throws a formatted std::string message when the file does not exist. Any
// error raised by loadFromString propagates to the caller.
void Grammar::loadFromFile (File& file)
{
  if (file.exists ())
  {
    std::string text;
    file.read (text);
    loadFromString (text);
    return;
  }

  throw format ("Grammar file '{1}' not found.", file._data);
}
////////////////////////////////////////////////////////////////////////////////
// Load and parse BNF.
//
// Syntax:
// rule-name: alternate1-token1 alternate1-token2
// alternate2-token1
//
// - Rules are aligned at left margin only, followed by a colon.
// - Productions are indented and never at left margin.
// - Blank line between rules.
//
// Details:
// - Literals are always double-quoted.
// - "*", "+" and "?" suffixes have POSIX semantics.
// - "є" means empty set.
// - Literal modifiers:
// - :a Accept abbreviations
// - :i Accept caseless match
//
void Grammar::loadFromString (const std::string& input)
{
std::string rule_name = "";
// This is a state machine. Read each line.
for (auto& line : split (input, '\n'))
{
// Skip whole-line comments.
if (line[0] == '#')
continue;
// Eliminate inline comments.
std::string::size_type hash = line.find ('#');
if (hash != std::string::npos)
line.resize (hash);
// Skip blank lines with no semantics.
line = Lexer::trim (line);
if (line == "" and rule_name == "")
continue;
if (line != "")
{
int token_count = 0;
Lexer l (line);
Lexer::Type type;
std::string token;
while (l.token (token, type))
{
++token_count;
if (token.back () == ':')
{
// Capture the Rule_name.
rule_name = token.substr (0, token.size () - 1);
// If this is the first Rule, capture it as a starting point.
if (_start == "")
_start = rule_name;
_rules[rule_name] = Grammar::Rule ();
token_count = 0;
}
else if (token.front () == ':')
{
// Decorate the most recent token, of the most recent Production,
// of the current Rule.
_rules[rule_name].back ().back ().decorate (token);
}
else
{
// If no Production was added yet, add one.
if (token_count <= 1)
_rules[rule_name].push_back (Grammar::Production ());
// Add the new Token to the most recent Production, of the current
// Rule.
_rules[rule_name].back ().push_back (Grammar::Token (token));
}
}
}
// A blank line in the input ends the current rule definition.
else
rule_name = "";
}
// Validate the parsed grammar.
validate ();
}
////////////////////////////////////////////////////////////////////////////////
std::string Grammar::start () const
{
return _start;
}
////////////////////////////////////////////////////////////////////////////////
// Returns the name of every defined rule, in the iteration order of _rules.
std::vector <std::string> Grammar::rules () const
{
  std::vector <std::string> names;

  for (auto entry = _rules.begin (); entry != _rules.end (); ++entry)
    names.push_back (entry->first);

  return names;
}
////////////////////////////////////////////////////////////////////////////////
std::vector <std::string> Grammar::terminals () const
{
std::vector <std::string> results;
for (auto& rule : _rules)
for (auto& production : rule.second)
for (auto& token : production)
if (_rules.find (token._token) == _rules.end ())
results.push_back (token._token);
return results;
}
////////////////////////////////////////////////////////////////////////////////
std::string Grammar::dump () const
{
std::stringstream out;
for (auto& rule : _rules)
{
// Indicate the start Rule.
if (rule.first == _start)
out << "";
out << rule.first << ":\n";
for (auto& production : rule.second)
{
out << " ";
for (auto& term : production)
{
out << term._token;
if (term._decoration != "")
out << " " << term._decoration;
out << " ";
}
out << "\n";
}
out << "\n";
}
return out.str ();
}
////////////////////////////////////////////////////////////////////////////////
// Checks the loaded grammar for structural problems and throws a std::string
// message describing the first one found. Checks are made in this order:
//
//   1. No rules were defined at all (_start is empty).
//   2. A token references a definition that does not exist ("є" is exempt).
//   3. A rule is left recursive (its own name is the first token of one of
//      its productions).
//   4. A rule name is a literal (begins with '"' or '/').
//   5. A rule other than the start rule is never referenced.
//
void Grammar::validate () const
{
  if (_start == "")
    throw std::string ("There are no rules defined.");

  std::vector <std::string> allRules;
  std::vector <std::string> allTokens;
  std::vector <std::string> allLeftRecursive;

  for (auto& rule : _rules)
  {
    allRules.push_back (rule.first);

    for (auto& production : rule.second)
    {
      for (auto& token : production)
      {
        // Tokens beginning with '"' or '/' are literals, not references to
        // other definitions, so they are not collected.
        if (token._token.front () != '"' and
            token._token.front () != '/')
          allTokens.push_back (token._token);

        // Record the rule name when it is also the first token of this
        // production.
        if (token._token == production[0]._token and
            rule.first   == production[0]._token)
          allLeftRecursive.push_back (token._token);
      }
    }
  }

  std::vector <std::string> notUsed;
  std::vector <std::string> notDefined;
  listDiff (allRules, allTokens, notUsed, notDefined);

  // Undefined value - these are definitions that appear in token, but are
  // not in _rules.
  for (auto& nd : notDefined)
    if (nd != "є")
      throw format ("Definition '{1}' referenced, but not defined.", nd);

  // Circular definitions - these are names in _rules that also appear as
  // token 0 in any of the alternates for that definition.
  for (auto& lr : allLeftRecursive)
    throw format ("Definition '{1}' is left recursive.", lr);

  // Literal definitions - a rule name must not itself be a literal. The rule
  // name is passed to format so the '{1}' placeholder is filled in.
  for (auto& r : allRules)
    if (r[0] == '"' or
        r[0] == '/')
      throw format ("Definition '{1}' must not be a literal.", r);

  // Unused definitions - these are names in _rules that are never
  // referenced as token.
  for (auto& nu : notUsed)
    if (nu != _start)
      throw format ("Definition '{1}' is defined, but not referenced.", nu);
}
////////////////////////////////////////////////////////////////////////////////
// Placeholder: the body is currently empty, so calling this has no effect.
void Grammar::generateParseTable ()
{
}
////////////////////////////////////////////////////////////////////////////////