diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 61b91d22a..80b8da1f0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -21,6 +21,7 @@ set (task_SRCS CLI.cpp CLI.h ISO8601.cpp ISO8601.h JSON.cpp JSON.h Lexer.cpp Lexer.h + Lexer2.cpp Lexer2.h Msg.cpp Msg.h Nibbler.cpp Nibbler.h Path.cpp Path.h diff --git a/src/Lexer2.cpp b/src/Lexer2.cpp new file mode 100644 index 000000000..b4c5a392b --- /dev/null +++ b/src/Lexer2.cpp @@ -0,0 +1,808 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// Copyright 2013 - 2015, Paul Beckingham, Federico Hernandez. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included +// in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+// +// http://www.opensource.org/licenses/mit-license.php +// +//////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include + +static const std::string uuid_pattern = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"; +static const int uuid_min_length = 8; + +//////////////////////////////////////////////////////////////////////////////// +Lexer2::Lexer2 (const std::string& text) +: _text (text) +, _cursor (0) +, _eos (text.size ()) +{ +} + +//////////////////////////////////////////////////////////////////////////////// +Lexer2::~Lexer2 () +{ +} + +//////////////////////////////////////////////////////////////////////////////// +bool Lexer2::token (std::string& token, Lexer2::Type& type) +{ + // Eat white space. + while (isWhitespace (_text[_cursor])) + utf8_next_char (_text, _cursor); + + // Terminate at EOS. + if (isEOS ()) + return false; + + // The sequence is specific, and must follow these rules: + // - date < uuid < identifier + // - duraiton < identifier + // - pair < identifier + // - hex < number + // - separator < tag < operator + // - substitution < pattern + // - word last + if (isString (token, type, '\'') || + isString (token, type, '"') || + isUUID (token, type) || + isPartialUUID (token, type) || + isHexNumber (token, type) || + isNumber (token, type) || + isSeparator (token, type) || + isList (token, type) || + isPair (token, type) || + isTag (token, type) || + isSubstitution (token, type) || + isPattern (token, type) || + isOperator (token, type) || + isIdentifier (token, type) || + isWord (token, type)) + return true; + + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// No L10N - these are for internal purposes. 
+const std::string Lexer2::typeName (const Lexer2::Type& type) +{ + switch (type) + { + case Lexer2::Type::uuid: return "uuid"; + case Lexer2::Type::number: return "number"; + case Lexer2::Type::hex: return "hex"; + case Lexer2::Type::string: return "string"; + case Lexer2::Type::list: return "list"; + case Lexer2::Type::pair: return "pair"; + case Lexer2::Type::separator: return "separator"; + case Lexer2::Type::tag: return "tag"; + case Lexer2::Type::substitution: return "substitution"; + case Lexer2::Type::pattern: return "pattern"; + case Lexer2::Type::op: return "op"; + case Lexer2::Type::identifier: return "identifier"; + case Lexer2::Type::word: return "word"; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Complete Unicode whitespace list. +// +// http://en.wikipedia.org/wiki/Whitespace_character +// Updated 2013-11-18 +// Static +bool Lexer2::isWhitespace (int c) +{ + return (c == 0x0020 || // space Common Separator, space + c == 0x0009 || // Common Other, control HT, Horizontal Tab + c == 0x000A || // Common Other, control LF, Line feed + c == 0x000B || // Common Other, control VT, Vertical Tab + c == 0x000C || // Common Other, control FF, Form feed + c == 0x000D || // Common Other, control CR, Carriage return + c == 0x0085 || // Common Other, control NEL, Next line + c == 0x00A0 || // no-break space Common Separator, space + c == 0x1680 || // ogham space mark Ogham Separator, space + c == 0x180E || // mongolian vowel separator Mongolian Separator, space + c == 0x2000 || // en quad Common Separator, space + c == 0x2001 || // em quad Common Separator, space + c == 0x2002 || // en space Common Separator, space + c == 0x2003 || // em space Common Separator, space + c == 0x2004 || // three-per-em space Common Separator, space + c == 0x2005 || // four-per-em space Common Separator, space + c == 0x2006 || // six-per-em space Common Separator, space + c == 0x2007 || // figure space Common Separator, space + c == 
0x2008 || // punctuation space Common Separator, space + c == 0x2009 || // thin space Common Separator, space + c == 0x200A || // hair space Common Separator, space + c == 0x2028 || // line separator Common Separator, line + c == 0x2029 || // paragraph separator Common Separator, paragraph + c == 0x202F || // narrow no-break space Common Separator, space + c == 0x205F || // medium mathematical space Common Separator, space + c == 0x3000); // ideographic space Common Separator, space +} + +//////////////////////////////////////////////////////////////////////////////// +// Digits 0-9. +bool Lexer2::isDigit (int c) +{ + return c >= 0x30 && c <= 0x39; +} + +//////////////////////////////////////////////////////////////////////////////// +// Digits 0-9 a-f A-F. +bool Lexer2::isHexDigit (int c) +{ + return (c >= '0' && c <= '9') || + (c >= 'a' && c <= 'f') || + (c >= 'A' && c <= 'F'); +} + +//////////////////////////////////////////////////////////////////////////////// +bool Lexer2::isIdentifierStart (int c) +{ + return c && // Include null character check. + ! isWhitespace (c) && + ! isDigit (c) && + ! isSingleCharOperator (c) && + ! isPunctuation (c); +} + +//////////////////////////////////////////////////////////////////////////////// +bool Lexer2::isIdentifierNext (int c) +{ + return c && // Include null character check. + c != ':' && // Used in isPair. + ! isWhitespace (c) && + ! isSingleCharOperator (c); +} + +//////////////////////////////////////////////////////////////////////////////// +bool Lexer2::isSingleCharOperator (int c) +{ + return c == '+' || // Addition + c == '-' || // Subtraction or unary minus = ambiguous + c == '*' || // Multiplication + c == '/' || // Diviѕion + c == '(' || // Precedence open parenthesis + c == ')' || // Precedence close parenthesis + c == '<' || // Less than + c == '>' || // Greater than + c == '^' || // Exponent + c == '!' 
|| // Unary not + c == '%' || // Modulus + c == '=' || // Partial match + c == '~'; // Pattern match +} + +//////////////////////////////////////////////////////////////////////////////// +bool Lexer2::isDoubleCharOperator (int c0, int c1, int c2) +{ + return (c0 == '=' && c1 == '=') || + (c0 == '!' && c1 == '=') || + (c0 == '<' && c1 == '=') || + (c0 == '>' && c1 == '=') || + (c0 == 'o' && c1 == 'r' && isBoundary (c1, c2)) || + (c0 == '|' && c1 == '|') || + (c0 == '&' && c1 == '&') || + (c0 == '!' && c1 == '~'); +} + +//////////////////////////////////////////////////////////////////////////////// +bool Lexer2::isTripleCharOperator (int c0, int c1, int c2, int c3) +{ + return (c0 == 'a' && c1 == 'n' && c2 == 'd' && isBoundary (c2, c3)) || + (c0 == 'x' && c1 == 'o' && c2 == 'r' && isBoundary (c2, c3)) || + (c0 == '!' && c1 == '=' && c2 == '='); +} + +//////////////////////////////////////////////////////////////////////////////// +bool Lexer2::isBoundary (int left, int right) +{ + // XOR + if (isalpha (left) != isalpha (right)) return true; + if (isDigit (left) != isDigit (right)) return true; + if (isWhitespace (left) != isWhitespace (right)) return true; + + // OR + if (isPunctuation (left) || isPunctuation (right)) return true; + + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +bool Lexer2::isPunctuation (int c) +{ + return c != '@' && + ispunct (c); +} + +//////////////////////////////////////////////////////////////////////////////// +bool Lexer2::isEOS () const +{ + return _cursor >= _eos; +} + +//////////////////////////////////////////////////////////////////////////////// +// Converts '0' -> 0 +// '9' -> 9 +// 'a'/'A' -> 10 +// 'f'/'F' -> 15 +int Lexer2::hexToInt (int c) const +{ + if (c >= '0' && c <= '9') return (c - '0'); + else if (c >= 'a' && c <= 'f') return (c - 'a' + 10); + else return (c - 'A' + 10); +} + +//////////////////////////////////////////////////////////////////////////////// +int 
Lexer2::hexToInt (int c0, int c1) const +{ + return (hexToInt (c0) << 4) + hexToInt (c1); +} + +//////////////////////////////////////////////////////////////////////////////// +int Lexer2::hexToInt (int c0, int c1, int c2, int c3) const +{ + return (hexToInt (c0) << 12) + + (hexToInt (c1) << 8) + + (hexToInt (c2) << 4) + + hexToInt (c3); +} + +//////////////////////////////////////////////////////////////////////////////// +// Lexer2::Type::string +// '|" +// [ U+XXXX | \uXXXX | \" | \' | \\ | \/ | \b | \f | \n | \r | \t | . ] +// '|" +bool Lexer2::isString (std::string& token, Lexer2::Type& type, int quote) +{ + std::size_t marker = _cursor; + + if (_text[marker] == quote) + { + ++marker; + token = ""; + + int c; + while ((c = _text[marker])) + { + // EOS. + if (c == quote) + break; + + // Unicode U+XXXX or \uXXXX codepoint. + else if (_eos - marker >= 6 && + ((_text[marker + 0] == 'U' && _text[marker + 1] == '+') || + (_text[marker + 0] == '\\' && _text[marker + 1] == 'u')) && + isHexDigit (_text[marker + 2]) && + isHexDigit (_text[marker + 3]) && + isHexDigit (_text[marker + 4]) && + isHexDigit (_text[marker + 5])) + { + token += utf8_character ( + hexToInt ( + _text[marker + 2], + _text[marker + 3], + _text[marker + 4], + _text[marker + 5])); + marker += 6; + } + + // An escaped thing. + else if (c == '\\') + { + c = _text[++marker]; + + switch (c) + { + case '"': token += (char) 0x22; ++marker; break; + case '\'': token += (char) 0x27; ++marker; break; + case '\\': token += (char) 0x5C; ++marker; break; + case 'b': token += (char) 0x08; ++marker; break; + case 'f': token += (char) 0x0C; ++marker; break; + case 'n': token += (char) 0x0A; ++marker; break; + case 'r': token += (char) 0x0D; ++marker; break; + case 't': token += (char) 0x09; ++marker; break; + case 'v': token += (char) 0x0B; ++marker; break; + + // This pass-through default case means that anythign can be escaped + // harmlessly. 
In particular 'quote' is included, if it not one of the + // above characters. + default: token += (char) c; ++marker; break; + } + } + + // Ordinary character. + else + token += utf8_character (utf8_next_char (_text, marker)); + } + + if (_text[marker] == quote) + { + ++marker; + type = Lexer2::Type::string; + _cursor = marker; + return true; + } + } + + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// Lexer2::Type::uuid +// XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX +bool Lexer2::isUUID (std::string& token, Lexer2::Type& type) +{ + std::size_t marker = _cursor; + + if (_eos - marker >= 36) + { + if (isHexDigit (_text[marker + 0]) && + isHexDigit (_text[marker + 1]) && + isHexDigit (_text[marker + 2]) && + isHexDigit (_text[marker + 3]) && + isHexDigit (_text[marker + 4]) && + isHexDigit (_text[marker + 5]) && + isHexDigit (_text[marker + 6]) && + isHexDigit (_text[marker + 7]) && + _text[marker + 8] == '-' && + isHexDigit (_text[marker + 9]) && + isHexDigit (_text[marker + 10]) && + isHexDigit (_text[marker + 11]) && + isHexDigit (_text[marker + 12]) && + _text[marker + 13] == '-' && + isHexDigit (_text[marker + 14]) && + isHexDigit (_text[marker + 15]) && + isHexDigit (_text[marker + 16]) && + isHexDigit (_text[marker + 17]) && + _text[marker + 18] == '-' && + isHexDigit (_text[marker + 19]) && + isHexDigit (_text[marker + 20]) && + isHexDigit (_text[marker + 20]) && + isHexDigit (_text[marker + 20]) && + _text[marker + 23] == '-' && + isHexDigit (_text[marker + 24]) && + isHexDigit (_text[marker + 25]) && + isHexDigit (_text[marker + 26]) && + isHexDigit (_text[marker + 27]) && + isHexDigit (_text[marker + 28]) && + isHexDigit (_text[marker + 29]) && + isHexDigit (_text[marker + 30]) && + isHexDigit (_text[marker + 31]) && + isHexDigit (_text[marker + 32]) && + isHexDigit (_text[marker + 33]) && + isHexDigit (_text[marker + 34]) && + isHexDigit (_text[marker + 35])) + { + marker += 36; + token = _text.substr 
(_cursor, marker - _cursor); + type = Lexer2::Type::uuid; + _cursor = marker; + return true; + } + } + + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// Lexer2::Type::uuid +// XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX +bool Lexer2::isPartialUUID (std::string& token, Lexer2::Type& type) +{ + std::size_t marker = _cursor; + + std::size_t i = 0; + for (; i < 36 && marker + i < _eos; i++) + { + if (uuid_pattern[i] == 'x') + { + if (! isHexDigit (_text[marker + i])) + break; + } + else if (uuid_pattern[i] != _text[marker + i]) + break; + } + + if (i >= uuid_min_length) + { + token = _text.substr (_cursor, i + 1); + type = Lexer2::Type::uuid; + _cursor += i; + return true; + } + + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// Lexer2::Type::hex +// 0xX+ +bool Lexer2::isHexNumber (std::string& token, Lexer2::Type& type) +{ + std::size_t marker = _cursor; + + if (_eos - marker >= 3 && + _text[marker + 0] == '0' && + _text[marker + 1] == 'x') + { + marker += 2; + + while (isHexDigit (_text[marker])) + ++marker; + + if (marker - _cursor > 2) + { + token = _text.substr (_cursor, marker - _cursor); + type = Lexer2::Type::hex; + _cursor = marker; + return true; + } + } + + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// Lexer2::Type::number +// \d+ +// [ . 
\d+ ] +// [ e|E [ +|- ] \d+ ] +bool Lexer2::isNumber (std::string& token, Lexer2::Type& type) +{ + std::size_t marker = _cursor; + + if (isDigit (_text[marker])) + { + ++marker; + while (isDigit (_text[marker])) + utf8_next_char (_text, marker); + + if (_text[marker] == '.') + { + ++marker; + if (isDigit (_text[marker])) + { + ++marker; + while (isDigit (_text[marker])) + utf8_next_char (_text, marker); + } + } + + if (_text[marker] == 'e' || + _text[marker] == 'E') + { + ++marker; + + if (_text[marker] == '+' || + _text[marker] == '-') + ++marker; + + if (isDigit (_text[marker])) + { + ++marker; + while (isDigit (_text[marker])) + utf8_next_char (_text, marker); + } + } + + token = _text.substr (_cursor, marker - _cursor); + type = Lexer2::Type::number; + _cursor = marker; + return true; + } + + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// Lexer2::Type::separator +// -- +bool Lexer2::isSeparator (std::string& token, Lexer2::Type& type) +{ + if (_eos - _cursor >= 2 && + _text[_cursor] == '-' && + _text[_cursor + 1] == '-') + { + _cursor += 2; + type = Lexer2::Type::separator; + token = "--"; + return true; + } + + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// Lexer2::Type::list +// , +bool Lexer2::isList (std::string& token, Lexer2::Type& type) +{ + if (_eos - _cursor > 1 && + _text[_cursor] == ',') + { + ++_cursor; + type = Lexer2::Type::list; + token = ","; + return true; + } + + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// Lexer2::Type::pair +// : [ | ] +bool Lexer2::isPair (std::string& token, Lexer2::Type& type) +{ + std::size_t marker = _cursor; + + std::string ignoredToken; + Lexer2::Type ignoredType; + if (isIdentifier (ignoredToken, ignoredType)) + { + if (_eos - _cursor > 1 && + (_text[_cursor] == ':' || _text[_cursor] == '=')) + { + _cursor++; + + if (isString (ignoredToken, 
ignoredType, '\'') || + isString (ignoredToken, ignoredType, '"') || + isWord (ignoredToken, ignoredType)) + { + token = _text.substr (marker, _cursor - marker); + type = Lexer2::Type::pair; + return true; + } + } + } + + _cursor = marker; + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// Lexer2::Type::tag +// [ +|- ] [ ]* +bool Lexer2::isTag (std::string& token, Lexer2::Type& type) +{ + std::size_t marker = _cursor; + + if (_text[marker] == '+' || + _text[marker] == '-') + { + ++marker; + + if (isIdentifierStart (_text[marker])) + { + utf8_next_char (_text, marker); + + while (isIdentifierNext (_text[marker])) + utf8_next_char (_text, marker); + + token = _text.substr (_cursor, marker - _cursor); + type = Lexer2::Type::tag; + _cursor = marker; + return true; + } + } + + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// Lexer2::Type::substitution +// / / / [g] +bool Lexer2::isSubstitution (std::string& token, Lexer2::Type& type) +{ + std::size_t marker = _cursor; + + std::string extractedToken; + Lexer2::Type extractedType; + if (isString (extractedToken, extractedType, '/')) + { + --_cursor; // Step back over the '/'. 
+ + if (isString (extractedToken, extractedType, '/')) + { + if (_text[_cursor] == 'g') + ++_cursor; + + if (isWhitespace (_text[_cursor])) + { + token = _text.substr (marker, _cursor - marker); + type = Lexer2::Type::substitution; + return true; + } + } + } + + _cursor = marker; + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// Lexer2::Type::pattern +// / / +bool Lexer2::isPattern (std::string& token, Lexer2::Type& type) +{ + std::size_t marker = _cursor; + + std::string extractedToken; + Lexer2::Type extractedType; + if (isString (extractedToken, extractedType, '/') && + isWhitespace (_text[_cursor])) + { + token = _text.substr (marker, _cursor - marker); + type = Lexer2::Type::pattern; + return true; + } + + _cursor = marker; + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// Lexer2::Type::op +// _hastag_ | _notag | _neg_ | _pos_ | +// | +// | +// | +bool Lexer2::isOperator (std::string& token, Lexer2::Type& type) +{ + std::size_t marker = _cursor; + + if (_eos - marker >= 8 && _text.substr (marker, 8) == "_hastag_") + { + marker += 8; + type = Lexer2::Type::op; + token = _text.substr (_cursor, marker - _cursor); + _cursor = marker; + return true; + } + + else if (_eos - marker >= 7 && _text.substr (marker, 7) == "_notag_") + { + marker += 7; + type = Lexer2::Type::op; + token = _text.substr (_cursor, marker - _cursor); + _cursor = marker; + return true; + } + + else if (_eos - marker >= 5 && _text.substr (marker, 5) == "_neg_") + { + marker += 5; + type = Lexer2::Type::op; + token = _text.substr (_cursor, marker - _cursor); + _cursor = marker; + return true; + } + + else if (_eos - marker >= 5 && _text.substr (marker, 5) == "_pos_") + { + marker += 5; + type = Lexer2::Type::op; + token = _text.substr (_cursor, marker - _cursor); + _cursor = marker; + return true; + } + + else if (_eos - marker >= 4 && + isTripleCharOperator (_text[marker], 
_text[marker + 1], _text[marker + 2], _text[marker + 3])) + { + marker += 3; + type = Lexer2::Type::op; + token = _text.substr (_cursor, marker - _cursor); + _cursor = marker; + return true; + } + + else if (_eos - marker >= 2 && + isDoubleCharOperator (_text[marker], _text[marker + 1], _text[marker + 2])) + { + marker += 2; + type = Lexer2::Type::op; + token = _text.substr (_cursor, marker - _cursor); + _cursor = marker; + return true; + } + + else if (isSingleCharOperator (_text[marker])) + { + token = _text[marker]; + type = Lexer2::Type::op; + _cursor = ++marker; + return true; + } + + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// Lexer2::Type::identifier +// [ ]* +bool Lexer2::isIdentifier (std::string& token, Lexer2::Type& type) +{ + std::size_t marker = _cursor; + + if (isIdentifierStart (_text[marker])) + { + utf8_next_char (_text, marker); + + while (isIdentifierNext (_text[marker])) + utf8_next_char (_text, marker); + + token = _text.substr (_cursor, marker - _cursor); + type = Lexer2::Type::identifier; + _cursor = marker; + return true; + } + + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// Lexer2::Type::word +// [^\s]+ +bool Lexer2::isWord (std::string& token, Lexer2::Type& type) +{ + std::size_t marker = _cursor; + + while (! 
isWhitespace (_text[marker])) + utf8_next_char (_text, marker); + + if (marker > _cursor) + { + token = _text.substr (_cursor, marker - _cursor); + type = Lexer2::Type::word; + _cursor = marker; + return true; + } + + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// Static +std::string Lexer2::typeToString (Lexer2::Type type) +{ + if (type == Lexer2::Type::string) return std::string ("\033[38;5;7m\033[48;5;3m") + "string" + "\033[0m"; + else if (type == Lexer2::Type::uuid) return std::string ("\033[38;5;7m\033[48;5;10m") + "uuid" + "\033[0m"; + else if (type == Lexer2::Type::hex) return std::string ("\033[38;5;7m\033[48;5;14m") + "hex" + "\033[0m"; + else if (type == Lexer2::Type::number) return std::string ("\033[38;5;7m\033[48;5;6m") + "number" + "\033[0m"; + else if (type == Lexer2::Type::separator) return std::string ("\033[38;5;7m\033[48;5;4m") + "separator" + "\033[0m"; + else if (type == Lexer2::Type::list) return std::string ("\033[38;5;7m\033[48;5;4m") + "list" + "\033[0m"; + else if (type == Lexer2::Type::pair) return std::string ("\033[38;5;7m\033[48;5;1m") + "pair" + "\033[0m"; + else if (type == Lexer2::Type::tag) return std::string ("\033[37;45m") + "tag" + "\033[0m"; + else if (type == Lexer2::Type::substitution) return std::string ("\033[37;102m") + "substitution" + "\033[0m"; + else if (type == Lexer2::Type::pattern) return std::string ("\033[37;42m") + "pattern" + "\033[0m"; + else if (type == Lexer2::Type::op) return std::string ("\033[38;5;7m\033[48;5;203m") + "op" + "\033[0m"; + else if (type == Lexer2::Type::identifier) return std::string ("\033[38;5;15m\033[48;5;244m") + "identifier" + "\033[0m"; + else if (type == Lexer2::Type::word) return std::string ("\033[38;5;15m\033[48;5;236m") + "word" + "\033[0m"; + else return std::string ("\033[37;41m") + "unknown" + "\033[0m"; +} + +//////////////////////////////////////////////////////////////////////////////// diff --git a/src/Lexer2.h 
b/src/Lexer2.h new file mode 100644 index 000000000..cc12c6daf --- /dev/null +++ b/src/Lexer2.h @@ -0,0 +1,96 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// Copyright 2013 - 2015, Paul Beckingham, Federico Hernandez. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included +// in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// http://www.opensource.org/licenses/mit-license.php +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef INCLUDED_LEXER2 +#define INCLUDED_LEXER2 + +#include +#include + +// Lexer2: A UTF8 lexical analyzer for every construct used on the Taskwarrior +// command line, with additional recognized types for disambiguation. 

class Lexer2
{
public:
  // Token classifications.  The enumerator order is kept as is, because it
  // fixes the underlying values.
  enum class Type
  {
    uuid, number, hex,
    string,
    list, pair, separator,
    substitution, pattern,
    tag,
    op,
    identifier, word,
    /*date,*/ /*duration,*/
  };

  // Lexes a private copy of 'text'.
  Lexer2 (const std::string& text);
  ~Lexer2 ();

  // Extracts the next token and its type.  Returns false when none is left.
  bool token (std::string& token, Lexer2::Type& type);

  // Type name wrapped in terminal color escape sequences.
  static std::string typeToString (Lexer2::Type type);

  // Static helpers.
  static const std::string typeName (const Lexer2::Type& type);
  static bool isWhitespace         (int c);
  static bool isDigit              (int c);
  static bool isHexDigit           (int c);
  static bool isIdentifierStart    (int c);
  static bool isIdentifierNext     (int c);
  static bool isSingleCharOperator (int c);
  static bool isDoubleCharOperator (int c0, int c1, int c2);
  static bool isTripleCharOperator (int c0, int c1, int c2, int c3);
  static bool isBoundary           (int left, int right);
  static bool isPunctuation        (int c);

  // Helpers.
  bool isEOS () const;
  int hexToInt (int c) const;
  int hexToInt (int c0, int c1) const;
  int hexToInt (int c0, int c1, int c2, int c3) const;

  // Classifiers.  Each one tries to match a single kind of token at the
  // cursor.  On success it returns true, fills in the token and type, and
  // moves the cursor past the match.
  bool isString       (std::string& token, Lexer2::Type& type, int quote);
  bool isUUID         (std::string& token, Lexer2::Type& type);
  bool isPartialUUID  (std::string& token, Lexer2::Type& type);
  bool isNumber       (std::string& token, Lexer2::Type& type);
  bool isHexNumber    (std::string& token, Lexer2::Type& type);
  bool isSeparator    (std::string& token, Lexer2::Type& type);
  bool isList         (std::string& token, Lexer2::Type& type);
  bool isPair         (std::string& token, Lexer2::Type& type);
  bool isTag          (std::string& token, Lexer2::Type& type);
  bool isSubstitution (std::string& token, Lexer2::Type& type);
  bool isPattern      (std::string& token, Lexer2::Type& type);
  bool isOperator     (std::string& token, Lexer2::Type& type);
  bool isIdentifier   (std::string& token, Lexer2::Type& type);
  bool isWord         (std::string& token, Lexer2::Type& type);

private:
  std::string _text;         // The text being lexed.
  std::size_t _cursor = 0;   // Byte offset of the next unread character.
  std::size_t _eos    = 0;   // Byte length of _text.
};

#endif

////////////////////////////////////////////////////////////////////////////////