From d82da280cb9cb7cec610758fe12091b7571b2a2e Mon Sep 17 00:00:00 2001
From: Paul Beckingham <paul@beckingham.net>
Date: Mon, 6 Jul 2015 15:32:12 -0400
Subject: [PATCH] Lexer: Implemented ::readWord

- Lexer::readWord is a general-purpose text parser for finding plain words and
  quoted strings. It supports \uNNNN and U+NNNN Unicode sequences, and general
  escapes such as \t, \' and \".
---
 src/Lexer.cpp | 98 +++++++++++++++++++++++++++++++++++++++++++++++++--
 src/Lexer.h   | 10 +++---
 2 files changed, 100 insertions(+), 8 deletions(-)

diff --git a/src/Lexer.cpp b/src/Lexer.cpp
index 98aca3186..bfdc0ae5a 100644
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -342,7 +342,7 @@ bool Lexer::isEOS () const
 //          '9'     -> 9
 //          'a'/'A' -> 10
 //          'f'/'F' -> 15
-int Lexer::hexToInt (int c) const
+int Lexer::hexToInt (int c)
 {
        if (c >= '0' && c <= '9') return (c - '0');
   else if (c >= 'a' && c <= 'f') return (c - 'a' + 10);
@@ -350,13 +350,13 @@ int Lexer::hexToInt (int c) const
 }
 
 ////////////////////////////////////////////////////////////////////////////////
-int Lexer::hexToInt (int c0, int c1) const
+int Lexer::hexToInt (int c0, int c1)
 {
   return (hexToInt (c0) << 4) + hexToInt (c1);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
-int Lexer::hexToInt (int c0, int c1, int c2, int c3) const
+int Lexer::hexToInt (int c0, int c1, int c2, int c3)
 {
   return (hexToInt (c0) << 12) +
          (hexToInt (c1) << 8)  +
@@ -1209,6 +1209,7 @@ bool Lexer::isAllDigits (const std::string& text)
 }
 
 ////////////////////////////////////////////////////////////////////////////////
+// Not escape-proof.
 bool Lexer::isOneWord (const std::string& text)
 {
   std::string::size_type i = 0;
@@ -1220,6 +1221,97 @@ bool Lexer::isOneWord (const std::string& text)
   return true;
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Reads one word from 'text' starting at 'cursor', storing the decoded result
+// in 'word' and leaving 'cursor' just past the consumed input. If the first
+// character is one of 'quotes' the word runs to the matching quote, otherwise
+// to the next white space; the terminator is consumed but not stored. Returns
+// true if 'word' is non-empty. Handles:
+//   one\ two
+//   '\''
+//   '"'
+//   "'"
+//   "\""
+//   'one two'
+//   abcU+0020def
+//   abc\u0020def
+//   a\tb
+bool Lexer::readWord (
+  const std::string& text,
+  const std::string& quotes,
+  std::string::size_type& cursor,
+  std::string& word)
+{
+  const std::string::size_type eos = text.length ();
+  word = "";
+
+  // Nothing to read. This also keeps every text[cursor] below within bounds.
+  if (cursor >= eos)
+    return false;
+
+  int quote = 0;
+  if (quotes.find (text[cursor]) != std::string::npos)
+    quote = text[cursor++];
+
+  // Stop at the end of the text, or at an embedded NUL.
+  int c = 0;
+  while (cursor < eos && (c = text[cursor]))
+  {
+    // A quoted word ends on its quote, an unquoted word on white space.
+    if (quote ? quote == c : Lexer::isWhitespace (c))
+    {
+      ++cursor;
+      break;
+    }
+
+    // Unicode U+XXXX or \uXXXX codepoint.
+    else if (eos - cursor >= 6 &&
+             ((text[cursor + 0] == 'U'  && text[cursor + 1] == '+') ||
+              (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) &&
+             isHexDigit (text[cursor + 2]) &&
+             isHexDigit (text[cursor + 3]) &&
+             isHexDigit (text[cursor + 4]) &&
+             isHexDigit (text[cursor + 5]))
+    {
+      word += utf8_character (
+                hexToInt (text[cursor + 2], text[cursor + 3],
+                          text[cursor + 4], text[cursor + 5]));
+      cursor += 6;
+    }
+
+    // An escaped thing.
+    else if (c == '\\')
+    {
+      // A trailing backslash has nothing to escape: keep it literally and stop,
+      // rather than reading (and then stepping) past the end of the text.
+      if (++cursor >= eos)
+      {
+        word += '\\';
+        break;
+      }
+
+      c = text[cursor++];
+      switch (c)
+      {
+      case 'b':  word += static_cast <char> (0x08); break;
+      case 'f':  word += static_cast <char> (0x0C); break;
+      case 'n':  word += static_cast <char> (0x0A); break;
+      case 'r':  word += static_cast <char> (0x0D); break;
+      case 't':  word += static_cast <char> (0x09); break;
+      case 'v':  word += static_cast <char> (0x0B); break;
+      // Anything else, including quotes and backslash, escapes to itself.
+      default:   word += static_cast <char> (c);    break;
+      }
+    }
+
+    // Ordinary character.
+    else
+      word += utf8_character (utf8_next_char (text, cursor));
+  }
+
+  return ! word.empty ();
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // <name> [. <modifier>] <: | = | :: | :=> [<value>]
 bool Lexer::decomposePair (
diff --git a/src/Lexer.h b/src/Lexer.h
index e8e7e48de..6cc84654e 100644
--- a/src/Lexer.h
+++ b/src/Lexer.h
@@ -75,13 +75,13 @@ public:
   static bool isOneWord             (const std::string&);
   static void dequote               (std::string&);
   static bool wasQuoted             (const std::string&);
+  static bool readWord              (const std::string&, const std::string&, std::string::size_type&, std::string&);
   static bool decomposePair         (const std::string&, std::string&, std::string&, std::string&, std::string&);
+  static int hexToInt               (int);
+  static int hexToInt               (int, int);
+  static int hexToInt               (int, int, int, int);
 
-  // Helpers.
-  bool isEOS () const;
-  int hexToInt (int) const;
-  int hexToInt (int, int) const;
-  int hexToInt (int, int, int, int) const;
+  bool isEOS                        () const;
 
   // Stream Classifiers.
   bool isString       (std::string&, Lexer::Type&, int quote);