Lexer: Added minimal lexer that currently only identifies words

2025-07-07 20:06:39 +02:00 · 2015-12-20 12:12:07 -05:00 · 2015-12-20 12:12:07 -05:00 · 897bc0bc14
commit 897bc0bc14
parent 77b1c6ad35
3 changed files with 209 additions and 0 deletions
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -0,0 +1,149 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// Copyright 2013 - 2015, Paul Beckingham, Federico Hernandez.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included
+// in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+// http://www.opensource.org/licenses/mit-license.php
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <cmake.h>
+#include <Lexer.h>
+#include <algorithm>
+#include <ctype.h>
+#include <utf8.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// Constructs a Lexer over 'text'.
+//   text: the string to be tokenized; it initializes _text.
+// The cursor starts at offset zero and the end-of-stream offset is the size
+// of 'text'.
+Lexer::Lexer (const std::string& text)
+: _text (text)
+, _cursor (0)
+, _eos (text.size ())
+{
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Yields the next low-level token from the text given at construction,
+// moving the cursor past whatever was consumed.
+//
+// Returns true, with 'token' and 'type' set, when a word was recognized.
+// Returns false at end of input, or when the text at the cursor is not a word
+// (words are the only token kind recognized so far).
+//
+// NOTE(review): assuming _text is a std::string, _text[_cursor] is a single
+// byte rather than a decoded code point, so the multi-byte entries tested by
+// isWhitespace may never match here - confirm.
+bool Lexer::token (std::string& token, Lexer::Type& type)
+{
+  // Skip over any leading white space.
+  for (; isWhitespace (_text[_cursor]); utf8_next_char (_text, _cursor))
+    ;
+
+  // Nothing left to read means no token; otherwise a word is the only
+  // candidate.
+  return isEOS () ? false
+                  : isWord (token, type);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Complete Unicode whitespace list: returns true when 'c' is one of the code
+// points enumerated below, false for anything else.
+//
+// http://en.wikipedia.org/wiki/Whitespace_character
+// Updated 2015-09-13
+// Static
+//
+// TODO This list should be derived from the Unicode database.
+bool Lexer::isWhitespace (int c)
+{
+  switch (c)
+  {
+  case 0x0020:   // space (Common, Separator, space)
+  case 0x0009:   // HT, Horizontal Tab (Common, Other, control)
+  case 0x000A:   // LF, Line feed (Common, Other, control)
+  case 0x000B:   // VT, Vertical Tab (Common, Other, control)
+  case 0x000C:   // FF, Form feed (Common, Other, control)
+  case 0x000D:   // CR, Carriage return (Common, Other, control)
+  case 0x0085:   // NEL, Next line (Common, Other, control)
+  case 0x00A0:   // no-break space (Common, Separator, space)
+  case 0x1680:   // ogham space mark (Ogham, Separator, space)
+  case 0x180E:   // mongolian vowel separator (Mongolian, Separator, space)
+  case 0x2000:   // en quad (Common, Separator, space)
+  case 0x2001:   // em quad (Common, Separator, space)
+  case 0x2002:   // en space (Common, Separator, space)
+  case 0x2003:   // em space (Common, Separator, space)
+  case 0x2004:   // three-per-em space (Common, Separator, space)
+  case 0x2005:   // four-per-em space (Common, Separator, space)
+  case 0x2006:   // six-per-em space (Common, Separator, space)
+  case 0x2007:   // figure space (Common, Separator, space)
+  case 0x2008:   // punctuation space (Common, Separator, space)
+  case 0x2009:   // thin space (Common, Separator, space)
+  case 0x200A:   // hair space (Common, Separator, space)
+  case 0x200B:   // zero width space
+  case 0x200C:   // zero width non-joiner
+  case 0x200D:   // zero width joiner
+  case 0x2028:   // line separator (Common, Separator, line)
+  case 0x2029:   // paragraph separator (Common, Separator, paragraph)
+  case 0x202F:   // narrow no-break space (Common, Separator, space)
+  case 0x205F:   // medium mathematical space (Common, Separator, space)
+  case 0x2060:   // word joiner
+  case 0x3000:   // ideographic space (Common, Separator, space)
+    return true;
+
+  default:
+    return false;
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Returns true when 'c' is one of the single-character operators listed
+// below, false otherwise.
+bool Lexer::isSingleCharOperator (int c)
+{
+  switch (c)
+  {
+  case '+':   // Addition
+  case '-':   // Subtraction or unary minus = ambiguous
+  case '*':   // Multiplication
+  case '/':   // Division
+  case '(':   // Precedence open parenthesis
+  case ')':   // Precedence close parenthesis
+  case '<':   // Less than
+  case '>':   // Greater than
+  case '^':   // Exponent
+  case '!':   // Unary not
+  case '%':   // Modulus
+  case '=':   // Partial match
+  case '~':   // Pattern match
+    return true;
+
+  default:
+    return false;
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// True once the cursor has reached, or moved beyond, the end-of-stream
+// offset; false while there is still text ahead of it.
+bool Lexer::isEOS () const
+{
+  if (_cursor < _eos)
+    return false;
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Lexer::Type::word
+//   [^\s+\-*/()<>^!%=~]+
+//
+// Scans forward from the cursor over a run of characters that are neither
+// white space nor single-character operators, stopping at a NUL or at the end
+// of the text.
+//
+// On success 'token' receives the scanned text, 'type' is set to
+// Lexer::Type::word, the cursor moves to the first unconsumed position and
+// true is returned.  If nothing was scanned, false is returned and 'token',
+// 'type' and the cursor are left untouched.
+//
+// NOTE(review): assuming _text is a std::string, _text[marker] is a single
+// byte rather than a decoded code point, so the multi-byte entries tested by
+// isWhitespace may never match here - confirm.
+bool Lexer::isWord (std::string& token, Lexer::Type& type)
+{
+  std::size_t marker = _cursor;
+
+  // The scan is bounded by _eos so that _text is never indexed beyond its
+  // end, whatever position the previous step left the marker at.
+  while (marker < _eos                  &&
+         _text[marker]                  &&
+         ! isWhitespace (_text[marker]) &&
+         ! isSingleCharOperator (_text[marker]))
+    utf8_next_char (_text, marker);
+
+  // utf8_next_char is defined elsewhere; presumably it advances by the length
+  // implied by the lead byte, in which case a truncated sequence at the end of
+  // the text would carry the marker past _eos.  Clamp so that the cursor is
+  // never left outside the text.
+  if (marker > _eos)
+    marker = _eos;
+
+  if (marker > _cursor)
+  {
+    token = _text.substr (_cursor, marker - _cursor);
+    type = Lexer::Type::word;
+    _cursor = marker;
+    return true;
+  }
+
+  return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////