//////////////////////////////////////////////////////////////////////////////// // // Copyright 2013 - 2014, Paul Beckingham, Federico Hernandez. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included // in all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. // // http://www.opensource.org/licenses/mit-license.php // //////////////////////////////////////////////////////////////////////////////// #include #include #include #include //////////////////////////////////////////////////////////////////////////////// Lexer::Lexer (const std::string& input) : _input (input) , _i (0) , _n0 (32) , _n1 (32) , _n2 (32) , _n3 (32) , _ambiguity (true) { // Read 4 chars in preparation. Even if there are < 4. Take a deep breath. shift (); shift (); shift (); shift (); } //////////////////////////////////////////////////////////////////////////////// Lexer::~Lexer () { } //////////////////////////////////////////////////////////////////////////////// // Walk the input string, looking for transitions. bool Lexer::token (std::string& token, Type& type) { // Start with nothing. token = ""; // Different types of matching quote: ', ". int quote = 0; type = typeNone; while (_n0) { switch (type) { case typeNone: if (is_ws (_n0)) shift (); else if (_n0 == '"' || _n0 == '\'') { type = typeString; quote = _n0; shift (); } else if (_n0 == '0' && _n1 == 'x' && is_hex_digit (_n2)) { type = typeHex; token += utf8_character (_n0); shift (); token += utf8_character (_n0); shift (); token += utf8_character (_n0); shift (); } else if (is_dec_digit (_n0)) { // Speculatively try a date and duration parse. Longest wins. std::string::size_type iso_i = 0; std::string iso_token; ISO8601d iso; iso.ambiguity (_ambiguity); if (iso.parse (_input.substr (_i < 4 ? 0 : _i - 4), iso_i)) iso_token = _input.substr ((_i < 4 ? 0 : _i - 4), iso_i); std::string::size_type dur_i = 0; std::string dur_token; Duration dur; if (dur.parse (_input.substr (_i < 4 ? 0 : _i - 4), dur_i)) dur_token = _input.substr ((_i < 4 ? 0 : _i - 4), dur_i); if (iso_token.length () > dur_token.length ()) { while (iso_i--) shift (); token = iso_token; type = typeDate; return true; } else if (dur_token.length () > iso_token.length ()) { while (dur_i--) shift (); token = dur_token; type = typeDuration; return true; } type = typeNumber; token += utf8_character (_n0); shift (); } else if (_n0 == '.' && is_dec_digit (_n1)) { type = typeDecimal; token += utf8_character (_n0); shift (); } else if (is_triple_op (_n0, _n1, _n2)) { type = typeOperator; token += utf8_character (_n0); shift (); token += utf8_character (_n0); shift (); token += utf8_character (_n0); shift (); return true; } else if (is_double_op (_n0, _n1)) { type = typeOperator; token += utf8_character (_n0); shift (); token += utf8_character (_n0); shift (); return true; } else if (is_single_op (_n0)) { type = typeOperator; token += utf8_character (_n0); shift (); return true; } else if (_n0 == '\\') { type = typeIdentifierEscape; shift (); } else if (is_ident_start (_n0)) { // Speculatively try a date and duration parse. Longest wins. std::string::size_type iso_i = 0; std::string iso_token; ISO8601p iso; if (iso.parse (_input.substr (_i < 4 ? 0 : _i - 4), iso_i)) iso_token = _input.substr ((_i < 4 ? 0 : _i - 4), iso_i); std::string::size_type dur_i = 0; std::string dur_token; Duration dur; if (dur.parse (_input.substr (_i < 4 ? 0 : _i - 4), dur_i)) dur_token = _input.substr ((_i < 4 ? 0 : _i - 4), dur_i); if (iso_token.length () > dur_token.length ()) { while (iso_i--) shift (); token = iso_token; type = typeDuration; return true; } else if (dur_token.length () > iso_token.length ()) { while (dur_i--) shift (); token = dur_token; type = typeDuration; return true; } type = typeIdentifier; token += utf8_character (_n0); shift (); } else throw std::string ("Unexpected error 1"); break; case typeString: if (_n0 == quote) { shift (); quote = 0; return true; } else if (_n0 == '\\') { type = typeEscape; shift (); } else { token += utf8_character (_n0); shift (); } break; case typeIdentifier: if (is_ident (_n0)) { token += utf8_character (_n0); shift (); } else { return true; } break; case typeIdentifierEscape: if (_n0 == 'u') { type = typeEscapeUnicode; shift (); } break; case typeEscape: if (_n0 == 'x') { type = typeEscapeHex; shift (); } else if (_n0 == 'u') { type = typeEscapeUnicode; shift (); } else { token += decode_escape (_n0); type = quote ? typeString : typeIdentifier; shift (); } break; case typeEscapeHex: if (is_hex_digit (_n0) && is_hex_digit (_n1)) { token += utf8_character (hex_to_int (_n0, _n1)); type = quote ? typeString : typeIdentifier; shift (); shift (); } else { type = quote ? typeString : typeIdentifier; shift (); quote = 0; return true; } break; case typeEscapeUnicode: if (is_hex_digit (_n0) && is_hex_digit (_n1) && is_hex_digit (_n2) && is_hex_digit (_n3)) { token += utf8_character (hex_to_int (_n0, _n1, _n2, _n3)); shift (); shift (); shift (); shift (); type = quote ? typeString : typeIdentifier; } else if (_n0 == quote) { type = typeString; shift (); quote = 0; return true; } case typeNumber: if (is_dec_digit (_n0)) { token += utf8_character (_n0); shift (); } else if (_n0 == '.') { type = typeDecimal; token += utf8_character (_n0); shift (); } else if (_n0 == 'e' || _n0 == 'E') { type = typeExponentIndicator; token += utf8_character (_n0); shift (); } else { return true; } break; case typeDecimal: if (is_dec_digit (_n0)) { token += utf8_character (_n0); shift (); } else if (_n0 == 'e' || _n0 == 'E') { type = typeExponentIndicator; token += utf8_character (_n0); shift (); } else { return true; } break; case typeExponentIndicator: if (_n0 == '+' || _n0 == '-') { token += utf8_character (_n0); shift (); } else if (is_dec_digit (_n0)) { type = typeExponent; token += utf8_character (_n0); shift (); } break; case typeExponent: if (is_dec_digit (_n0)) { token += utf8_character (_n0); shift (); } else if (_n0 == '.') { token += utf8_character (_n0); shift (); } else { type = typeDecimal; return true; } break; case typeHex: if (is_hex_digit (_n0)) { token += utf8_character (_n0); shift (); } else { return true; } break; default: throw std::string ("Unexpected error 2"); break; } // Fence post. if (!_n0 && token != "") return true; } return false; } //////////////////////////////////////////////////////////////////////////////// // Just like Lexer::token, but no operators, dates or durations. bool Lexer::word (std::string& token, Type& type) { // Start with nothing. token = ""; // Different types of matching quote: ', ". int quote = 0; type = typeNone; while (_n0) { switch (type) { case typeNone: if (is_ws (_n0)) shift (); else if (_n0 == '"' || _n0 == '\'') { type = typeString; quote = _n0; shift (); } else if (_n0 == '0' && _n1 == 'x' && is_hex_digit (_n2)) { type = typeHex; token += utf8_character (_n0); shift (); token += utf8_character (_n0); shift (); token += utf8_character (_n0); shift (); } else if (is_dec_digit (_n0)) { // Speculatively try a date and duration parse. Longest wins. std::string::size_type iso_i = 0; std::string iso_token; ISO8601d iso; iso.ambiguity (_ambiguity); if (iso.parse (_input.substr (_i < 4 ? 0 : _i - 4), iso_i)) iso_token = _input.substr ((_i < 4 ? 0 : _i - 4), iso_i); std::string::size_type dur_i = 0; std::string dur_token; Duration dur; if (dur.parse (_input.substr (_i < 4 ? 0 : _i - 4), dur_i)) dur_token = _input.substr ((_i < 4 ? 0 : _i - 4), dur_i); if (iso_token.length () > dur_token.length ()) { while (iso_i--) shift (); token = iso_token; type = typeDate; return true; } else if (dur_token.length () > iso_token.length ()) { while (dur_i--) shift (); token = dur_token; type = typeDuration; return true; } type = typeNumber; token += utf8_character (_n0); shift (); } else if (_n0 == '.' && is_dec_digit (_n1)) { type = typeDecimal; token += utf8_character (_n0); shift (); } else if (_n0 == '\\') { type = typeIdentifierEscape; shift (); } else if (is_ident_start (_n0)) { type = typeIdentifier; token += utf8_character (_n0); shift (); } else throw std::string ("Unexpected error 1"); break; case typeString: if (_n0 == quote) { shift (); quote = 0; return true; } else if (_n0 == '\\') { type = typeEscape; shift (); } else { token += utf8_character (_n0); shift (); } break; case typeIdentifier: if (is_ident (_n0)) { token += utf8_character (_n0); shift (); } else { return true; } break; case typeIdentifierEscape: if (_n0 == 'u') { type = typeEscapeUnicode; shift (); } break; case typeEscape: if (_n0 == 'x') { type = typeEscapeHex; shift (); } else if (_n0 == 'u') { type = typeEscapeUnicode; shift (); } else { token += decode_escape (_n0); type = quote ? typeString : typeIdentifier; shift (); } break; case typeEscapeHex: if (is_hex_digit (_n0) && is_hex_digit (_n1)) { token += utf8_character (hex_to_int (_n0, _n1)); type = quote ? typeString : typeIdentifier; shift (); shift (); } else { type = quote ? typeString : typeIdentifier; shift (); quote = 0; return true; } break; case typeEscapeUnicode: if (is_hex_digit (_n0) && is_hex_digit (_n1) && is_hex_digit (_n2) && is_hex_digit (_n3)) { token += utf8_character (hex_to_int (_n0, _n1, _n2, _n3)); shift (); shift (); shift (); shift (); type = quote ? typeString : typeIdentifier; } else if (_n0 == quote) { type = typeString; shift (); quote = 0; return true; } case typeNumber: if (is_dec_digit (_n0)) { token += utf8_character (_n0); shift (); } else if (_n0 == '.') { type = typeDecimal; token += utf8_character (_n0); shift (); } else if (_n0 == 'e' || _n0 == 'E') { type = typeExponentIndicator; token += utf8_character (_n0); shift (); } else { return true; } break; case typeDecimal: if (is_dec_digit (_n0)) { token += utf8_character (_n0); shift (); } else if (_n0 == 'e' || _n0 == 'E') { type = typeExponentIndicator; token += utf8_character (_n0); shift (); } else { return true; } break; case typeExponentIndicator: if (_n0 == '+' || _n0 == '-') { token += utf8_character (_n0); shift (); } else if (is_dec_digit (_n0)) { type = typeExponent; token += utf8_character (_n0); shift (); } break; case typeExponent: if (is_dec_digit (_n0)) { token += utf8_character (_n0); shift (); } else if (_n0 == '.') { token += utf8_character (_n0); shift (); } else { type = typeDecimal; return true; } break; case typeHex: if (is_hex_digit (_n0)) { token += utf8_character (_n0); shift (); } else { return true; } break; default: throw std::string ("Unexpected error 2"); break; } // Fence post. if (!_n0 && token != "") return true; } return false; } //////////////////////////////////////////////////////////////////////////////// void Lexer::ambiguity (bool value) { _ambiguity = value; } //////////////////////////////////////////////////////////////////////////////// const std::string Lexer::type_name (const Type& type) { switch (type) { case Lexer::typeNone: return "None"; case Lexer::typeString: return "String"; case Lexer::typeIdentifier: return "Identifier"; case Lexer::typeIdentifierEscape: return "IdentifierEscape"; case Lexer::typeNumber: return "Number"; case Lexer::typeDecimal: return "Decimal"; case Lexer::typeExponentIndicator: return "ExponentIndicator"; case Lexer::typeExponent: return "Exponent"; case Lexer::typeHex: return "Hex"; case Lexer::typeOperator: return "Operator"; case Lexer::typeEscape: return "Escape"; case Lexer::typeEscapeHex: return "EscapeHex"; case Lexer::typeEscapeUnicode: return "EscapeUnicode"; case Lexer::typeDate: return "Date"; case Lexer::typeDuration: return "Duration"; } } //////////////////////////////////////////////////////////////////////////////// // Complete Unicode whitespace list. // // http://en.wikipedia.org/wiki/Whitespace_character // Updated 2013-11-18 bool Lexer::is_ws (int c) { return (c == 0x0020 || // space Common Separator, space c == 0x0009 || // Common Other, control HT, Horizontal Tab c == 0x000A || // Common Other, control LF, Line feed c == 0x000B || // Common Other, control VT, Vertical Tab c == 0x000C || // Common Other, control FF, Form feed c == 0x000D || // Common Other, control CR, Carriage return c == 0x0085 || // Common Other, control NEL, Next line c == 0x00A0 || // no-break space Common Separator, space c == 0x1680 || // ogham space mark Ogham Separator, space c == 0x180E || // mongolian vowel separator Mongolian Separator, space c == 0x2000 || // en quad Common Separator, space c == 0x2001 || // em quad Common Separator, space c == 0x2002 || // en space Common Separator, space c == 0x2003 || // em space Common Separator, space c == 0x2004 || // three-per-em space Common Separator, space c == 0x2005 || // four-per-em space Common Separator, space c == 0x2006 || // six-per-em space Common Separator, space c == 0x2007 || // figure space Common Separator, space c == 0x2008 || // punctuation space Common Separator, space c == 0x2009 || // thin space Common Separator, space c == 0x200A || // hair space Common Separator, space c == 0x2028 || // line separator Common Separator, line c == 0x2029 || // paragraph separator Common Separator, paragraph c == 0x202F || // narrow no-break space Common Separator, space c == 0x205F || // medium mathematical space Common Separator, space c == 0x3000); // ideographic space Common Separator, space } //////////////////////////////////////////////////////////////////////////////// // Split 'input' into 'words' on Lexer::is_ws boundaries, observing quotes. void Lexer::split (std::vector & words, const std::string& input) { words.clear (); std::string word; Lexer::Type type; Lexer lex (input); while (lex.word (word, type)) words.push_back (word); } //////////////////////////////////////////////////////////////////////////////// bool Lexer::is_punct (int c) const { if (c == ',' || c == '.') // Tab return true; return false; } //////////////////////////////////////////////////////////////////////////////// bool Lexer::is_num (int c) const { if ((c >= '0' && c <= '9') || c == '.') return true; return false; } //////////////////////////////////////////////////////////////////////////////// bool Lexer::is_ident_start (int c) const { return c && // Include null character check. ! is_ws (c) && ! is_dec_digit (c); } //////////////////////////////////////////////////////////////////////////////// bool Lexer::is_ident (int c) const { return c && // Include null character check. ! is_ws (c) && ! is_single_op (c); } //////////////////////////////////////////////////////////////////////////////// bool Lexer::is_triple_op (int c0, int c1, int c2) const { return (c0 == 'a' && c1 == 'n' && c2 == 'd') || (c0 == 'x' && c1 == 'o' && c2 == 'r'); } //////////////////////////////////////////////////////////////////////////////// bool Lexer::is_double_op (int c0, int c1) const { return (c0 == '=' && c1 == '=') || (c0 == '!' && c1 == '=') || (c0 == '<' && c1 == '=') || (c0 == '>' && c1 == '=') || (c0 == 'o' && c1 == 'r') || (c0 == '|' && c1 == '|') || (c0 == '&' && c1 == '&') || (c0 == '!' && c1 == '~'); } //////////////////////////////////////////////////////////////////////////////// bool Lexer::is_single_op (int c) const { return c == '+' || c == '-' || c == '*' || c == '/' || c == '(' || c == ')' || c == '<' || c == '>' || c == '^' || c == '!' || c == '%' || c == '=' || c == '~'; } //////////////////////////////////////////////////////////////////////////////// bool Lexer::is_dec_digit (int c) const { return c >= '0' && c <= '9'; } //////////////////////////////////////////////////////////////////////////////// bool Lexer::is_hex_digit (int c) const { return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); } //////////////////////////////////////////////////////////////////////////////// int Lexer::decode_escape (int c) const { switch (c) { case 'b': return 0x08; case 'f': return 0x0C; case 'n': return 0x0A; case 'r': return 0x0D; case 't': return 0x09; case 'v': return 0x0B; case '\'': return 0x27; case '"': return 0x22; case '\\': return 0x5C; default: return c; } } //////////////////////////////////////////////////////////////////////////////// int Lexer::hex_to_int (int c) const { if (c >= '0' && c <= '9') return (c - '0'); else if (c >= 'a' && c <= 'f') return (c - 'a' + 10); else return (c - 'A' + 10); } //////////////////////////////////////////////////////////////////////////////// int Lexer::hex_to_int (int c0, int c1) const { return (hex_to_int (c0) << 4) + hex_to_int (c1); } //////////////////////////////////////////////////////////////////////////////// int Lexer::hex_to_int (int c0, int c1, int c2, int c3) const { return (hex_to_int (c0) << 12) + (hex_to_int (c1) << 8) + (hex_to_int (c2) << 4) + hex_to_int (c3); } //////////////////////////////////////////////////////////////////////////////// void Lexer::shift () { _n0 = _n1; _n1 = _n2; _n2 = _n3; _n3 = utf8_next_char (_input, _i); } ////////////////////////////////////////////////////////////////////////////////