From d82da280cb9cb7cec610758fe12091b7571b2a2e Mon Sep 17 00:00:00 2001 From: Paul Beckingham Date: Mon, 6 Jul 2015 15:32:12 -0400 Subject: [PATCH] Lexer: Implemented ::readWord - Lexer::readWord is a general-purpose text parser, for finding plain words and quoted strings. It supports \uNNNN and U+NNNN unicode sequences, and general escapes, \t, \', \" etc. --- src/Lexer.cpp | 98 +++++++++++++++++++++++++++++++++++++++++++++++++-- src/Lexer.h | 10 +++--- 2 files changed, 100 insertions(+), 8 deletions(-) diff --git a/src/Lexer.cpp b/src/Lexer.cpp index 98aca3186..bfdc0ae5a 100644 --- a/src/Lexer.cpp +++ b/src/Lexer.cpp @@ -342,7 +342,7 @@ bool Lexer::isEOS () const // '9' -> 9 // 'a'/'A' -> 10 // 'f'/'F' -> 15 -int Lexer::hexToInt (int c) const +int Lexer::hexToInt (int c) { if (c >= '0' && c <= '9') return (c - '0'); else if (c >= 'a' && c <= 'f') return (c - 'a' + 10); @@ -350,13 +350,13 @@ int Lexer::hexToInt (int c) const } //////////////////////////////////////////////////////////////////////////////// -int Lexer::hexToInt (int c0, int c1) const +int Lexer::hexToInt (int c0, int c1) { return (hexToInt (c0) << 4) + hexToInt (c1); } //////////////////////////////////////////////////////////////////////////////// -int Lexer::hexToInt (int c0, int c1, int c2, int c3) const +int Lexer::hexToInt (int c0, int c1, int c2, int c3) { return (hexToInt (c0) << 12) + (hexToInt (c1) << 8) + @@ -1209,6 +1209,7 @@ bool Lexer::isAllDigits (const std::string& text) } //////////////////////////////////////////////////////////////////////////////// +// Not escape-proof. bool Lexer::isOneWord (const std::string& text) { std::string::size_type i = 0; @@ -1220,6 +1221,97 @@ bool Lexer::isOneWord (const std::string& text) return true; } +//////////////////////////////////////////////////////////////////////////////// +// Full implementation of a word. Includes: +// one\ two +// '\'' +// '"' +// "'" +// "\"" +// 'one two' +// abcU+0020def +// abc\u0020def +// a\tb +bool Lexer::readWord ( + const std::string& text, + const std::string& quotes, + std::string::size_type& cursor, + std::string& word) +{ + std::string::size_type eos = text.length (); + + int quote = 0; + if (quotes.find (text[cursor]) != std::string::npos) + quote = text[cursor++]; + + word = ""; + int c; + while ((c = text[cursor])) + { + // Quoted word ends on a quote. + if (quote && quote == c) + { + ++cursor; + break; + } + + // Unquoted word ends on white space. + if (! quote && Lexer::isWhitespace (c)) + { + ++cursor; + break; + } + + // Unicode U+XXXX or \uXXXX codepoint. + else if (eos - cursor >= 6 && + ((text[cursor + 0] == 'U' && text[cursor + 1] == '+') || + (text[cursor + 0] == '\\' && text[cursor + 1] == 'u')) && + isHexDigit (text[cursor + 2]) && + isHexDigit (text[cursor + 3]) && + isHexDigit (text[cursor + 4]) && + isHexDigit (text[cursor + 5])) + { + word += utf8_character ( + hexToInt ( + text[cursor + 2], + text[cursor + 3], + text[cursor + 4], + text[cursor + 5])); + cursor += 6; + } + + // An escaped thing. + else if (c == '\\') + { + c = text[++cursor]; + + switch (c) + { + case '"': word += (char) 0x22; ++cursor; break; + case '\'': word += (char) 0x27; ++cursor; break; + case '\\': word += (char) 0x5C; ++cursor; break; + case 'b': word += (char) 0x08; ++cursor; break; + case 'f': word += (char) 0x0C; ++cursor; break; + case 'n': word += (char) 0x0A; ++cursor; break; + case 'r': word += (char) 0x0D; ++cursor; break; + case 't': word += (char) 0x09; ++cursor; break; + case 'v': word += (char) 0x0B; ++cursor; break; + + // This pass-through default case means that anything can be escaped + // harmlessly. In particular 'quote' is included, if it not one of the + // above characters. + default: word += (char) c; ++cursor; break; + } + } + + // Ordinary character. + else + word += utf8_character (utf8_next_char (text, cursor)); + } + + return word.length () > 0 ? true : false; +} + //////////////////////////////////////////////////////////////////////////////// // [. ] <: | = | :: | :=> [] bool Lexer::decomposePair ( diff --git a/src/Lexer.h b/src/Lexer.h index e8e7e48de..6cc84654e 100644 --- a/src/Lexer.h +++ b/src/Lexer.h @@ -75,13 +75,13 @@ public: static bool isOneWord (const std::string&); static void dequote (std::string&); static bool wasQuoted (const std::string&); + static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&); static bool decomposePair (const std::string&, std::string&, std::string&, std::string&, std::string&); + static int hexToInt (int); + static int hexToInt (int, int); + static int hexToInt (int, int, int, int); - // Helpers. - bool isEOS () const; - int hexToInt (int) const; - int hexToInt (int, int) const; - int hexToInt (int, int, int, int) const; + bool isEOS () const; // Stream Classifiers. bool isString (std::string&, Lexer::Type&, int quote);