diff --git a/src/Lexer.cpp b/src/Lexer.cpp index 99a22cb9b..e489c0c69 100644 --- a/src/Lexer.cpp +++ b/src/Lexer.cpp @@ -37,7 +37,7 @@ static const unsigned int uuid_min_length = 8; std::string Lexer::dateFormat = ""; bool Lexer::isoEnabled = true; -int Lexer::minimumMatchLength = 3; +std::string::size_type Lexer::minimumMatchLength = 3; std::map Lexer::attributes; @@ -373,7 +373,9 @@ int Lexer::hexToInt (int c0, int c1, int c2, int c3) // left: wonderful // right: wonderbread // returns: ^ 6 -int Lexer::commonLength (const std::string& left, const std::string& right) +std::string::size_type Lexer::commonLength ( + const std::string& left, + const std::string& right) { std::string::size_type l = 0; std::string::size_type r = 0; @@ -382,7 +384,7 @@ int Lexer::commonLength (const std::string& left, const std::string& right) utf8_next_char (right, r)) ; - return (int) l; + return l; } //////////////////////////////////////////////////////////////////////////////// @@ -393,7 +395,7 @@ int Lexer::commonLength (const std::string& left, const std::string& right) // right: prowonderbread // r: ^ // returns: ^ 6 -int Lexer::commonLength ( +std::string::size_type Lexer::commonLength ( const std::string& left, std::string::size_type l, const std::string& right, @@ -404,7 +406,7 @@ int Lexer::commonLength ( utf8_next_char (right, r)) ; - return (int) l; + return l; } //////////////////////////////////////////////////////////////////////////////// @@ -1077,7 +1079,7 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type) std::string partialToken; Lexer::Type partialType; - if (isLiteral ("rc.", false) && + if (isLiteral ("rc.", false, false) && isWord (partialToken, partialType)) { token = _text.substr (marker, _cursor - marker); @@ -1090,7 +1092,7 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type) "context.width", "context.height", "system.version", - "system.os"}, true)) + "system.os"}, false, true)) { token = _text.substr (marker, _cursor - marker); type = Lexer::Type::dom; @@ -1105,7 +1107,7 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type) if (isUUID (extractedToken, extractedType, false) || isInteger (extractedToken, extractedType)) { - if (! isLiteral (".", false)) + if (! isLiteral (".", false, false)) { _cursor = marker; return false; @@ -1116,8 +1118,9 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type) std::size_t checkpoint = _cursor; // [prefix]tags. - if (isLiteral ("tags.", false) && - isWord (partialToken, partialType)) + if (isLiteral ("tags", true, false) && + isLiteral (".", false, false) && + isWord (partialToken, partialType)) { token = _text.substr (marker, _cursor - marker); type = Lexer::Type::dom; @@ -1127,28 +1130,26 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type) _cursor = checkpoint; // [prefix]attribute - if (isOneOf (attributes, true)) + if (isOneOf (attributes, true, true)) { token = _text.substr (marker, _cursor - marker); type = Lexer::Type::dom; return true; } - else - _cursor = checkpoint; // [prefix]attribute - if (isOneOf (attributes, false)) + if (isOneOf (attributes, true, false)) { - if (isLiteral (".", false)) + if (isLiteral (".", false, false)) { std::string attribute = _text.substr (checkpoint, _cursor - checkpoint - 1); - // if attribute type is 'date' + // if attribute type is 'date', then it has sub-elements. if (attributes[attribute] == "date" && isOneOf ({"year", "month", "day", "week", "weekday", "julian", - "hour", "minute", "second"}, true)) + "hour", "minute", "second"}, true, true)) { token = _text.substr (marker, _cursor - marker); type = Lexer::Type::dom; @@ -1162,35 +1163,35 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type) return true; } } - else - _cursor = checkpoint; // [prefix]annotations. - if (isLiteral ("annotations.", false)) + if (isLiteral ("annotations", true, false) && + isLiteral (".", false, true)) { std::string extractedToken; Lexer::Type extractedType; if (isInteger (extractedToken, extractedType)) { - if (isLiteral (".", false)) + if (isLiteral (".", false, false)) { - if (isLiteral ("description", true)) + if (isLiteral ("description", true, true)) { token = _text.substr (marker, _cursor - marker); type = Lexer::Type::dom; return true; } - else if (isLiteral ("entry", true)) + else if (isLiteral ("entry", true, true)) { token = _text.substr (marker, _cursor - marker); type = Lexer::Type::dom; return true; } - else if (isLiteral ("entry.", false) && + else if (isLiteral ("entry", true, false) && + isLiteral (".", false, true) && isOneOf ({"year", "month", "day", "week", "weekday", "julian", - "hour", "minute", "second"}, true)) + "hour", "minute", "second"}, true, true)) { token = _text.substr (marker, _cursor - marker); type = Lexer::Type::dom; @@ -1251,37 +1252,54 @@ bool Lexer::isWord (std::string& token, Lexer::Type& type) } //////////////////////////////////////////////////////////////////////////////// -bool Lexer::isLiteral (const std::string& literal, bool endBoundary) +bool Lexer::isLiteral ( + const std::string& literal, + bool allowAbbreviations, + bool endBoundary) { - auto len = literal.length (); - if (_text.find (literal, _cursor) == _cursor && - (! endBoundary || - _text.length () == _cursor + len || - Lexer::isWhitespace (_text[_cursor + len]) || - Lexer::isSingleCharOperator (_text[_cursor + len]))) - { - _cursor += len; - return true; - } + auto common = commonLength (literal, 0, _text, _cursor); - return false; + // Without abbreviations, common must equal literal length. + if (! allowAbbreviations && + common < literal.length ()) + return false; + + // Abbreviations must meet the minimum size. + if (allowAbbreviations && + common < minimumMatchLength) + return false; + + // End boundary conditions must be met. + if (endBoundary && + ! Lexer::isWhitespace (_text[_cursor + common]) && + ! Lexer::isSingleCharOperator (_text[_cursor + common])) + return false; + + _cursor += common; + return true; } //////////////////////////////////////////////////////////////////////////////// -bool Lexer::isOneOf (const std::vector & options, bool endBoundary) +bool Lexer::isOneOf ( + const std::vector & options, + bool allowAbbreviations, + bool endBoundary) { for (auto& item : options) - if (isLiteral (item, endBoundary)) + if (isLiteral (item, allowAbbreviations, endBoundary)) return true; return false; } //////////////////////////////////////////////////////////////////////////////// -bool Lexer::isOneOf (const std::map & options, bool endBoundary) +bool Lexer::isOneOf ( + const std::map & options, + bool allowAbbreviations, + bool endBoundary) { for (auto& item : options) - if (isLiteral (item.first, endBoundary)) + if (isLiteral (item.first, allowAbbreviations, endBoundary)) return true; return false; diff --git a/src/Lexer.h b/src/Lexer.h index 742dedd54..a244bf030 100644 --- a/src/Lexer.h +++ b/src/Lexer.h @@ -41,7 +41,7 @@ public: // These are overridable. static std::string dateFormat; static bool isoEnabled; - static int minimumMatchLength; + static std::string::size_type minimumMatchLength; static std::map attributes; enum class Type { uuid, number, hex, @@ -61,36 +61,35 @@ public: static std::string typeToString (Lexer::Type); // Static helpers. - static const std::string typeName (const Lexer::Type&); - static bool isWhitespace (int); - static bool isAlpha (int); - static bool isDigit (int); - static bool isHexDigit (int); - static bool isIdentifierStart (int); - static bool isIdentifierNext (int); - static bool isSingleCharOperator (int); - static bool isDoubleCharOperator (int, int, int); - static bool isTripleCharOperator (int, int, int, int); - static bool isBoundary (int, int); - static bool isHardBoundary (int, int); - static bool isPunctuation (int); - static bool isAllDigits (const std::string&); - static void dequote (std::string&, const std::string& quotes = "'\""); - static bool wasQuoted (const std::string&); - static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&); - static bool readWord (const std::string&, std::string::size_type&, std::string&); - static bool decomposePair (const std::string&, std::string&, std::string&, std::string&, std::string&); - static bool decomposeSubstitution (const std::string&, std::string&, std::string&, std::string&); - static bool decomposePattern (const std::string&, std::string&, std::string&); - static int hexToInt (int); - static int hexToInt (int, int); - static int hexToInt (int, int, int, int); - static int commonLength (const std::string&, const std::string&); - static int commonLength (const std::string&, std::string::size_type, const std::string&, std::string::size_type); - - bool isEOS () const; + static const std::string typeName (const Lexer::Type&); + static bool isWhitespace (int); + static bool isAlpha (int); + static bool isDigit (int); + static bool isHexDigit (int); + static bool isIdentifierStart (int); + static bool isIdentifierNext (int); + static bool isSingleCharOperator (int); + static bool isDoubleCharOperator (int, int, int); + static bool isTripleCharOperator (int, int, int, int); + static bool isBoundary (int, int); + static bool isHardBoundary (int, int); + static bool isPunctuation (int); + static bool isAllDigits (const std::string&); + static void dequote (std::string&, const std::string& quotes = "'\""); + static bool wasQuoted (const std::string&); + static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&); + static bool readWord (const std::string&, std::string::size_type&, std::string&); + static bool decomposePair (const std::string&, std::string&, std::string&, std::string&, std::string&); + static bool decomposeSubstitution (const std::string&, std::string&, std::string&, std::string&); + static bool decomposePattern (const std::string&, std::string&, std::string&); + static int hexToInt (int); + static int hexToInt (int, int); + static int hexToInt (int, int, int, int); + static std::string::size_type commonLength (const std::string&, const std::string&); + static std::string::size_type commonLength (const std::string&, std::string::size_type, const std::string&, std::string::size_type); // Stream Classifiers. + bool isEOS () const; bool isString (std::string&, Lexer::Type&, const std::string&); bool isDate (std::string&, Lexer::Type&); bool isDuration (std::string&, Lexer::Type&); @@ -110,9 +109,9 @@ public: bool isDOM (std::string&, Lexer::Type&); bool isIdentifier (std::string&, Lexer::Type&); bool isWord (std::string&, Lexer::Type&); - bool isLiteral (const std::string&, bool); - bool isOneOf (const std::vector &, bool); - bool isOneOf (const std::map &, bool); + bool isLiteral (const std::string&, bool, bool); + bool isOneOf (const std::vector &, bool, bool); + bool isOneOf (const std::map &, bool, bool); private: std::string _text; diff --git a/test/lexer.t.cpp b/test/lexer.t.cpp index d5ec2a416..6c6acda98 100644 --- a/test/lexer.t.cpp +++ b/test/lexer.t.cpp @@ -37,7 +37,7 @@ Context context; //////////////////////////////////////////////////////////////////////////////// int main (int argc, char** argv) { - UnitTest t (1160); + UnitTest t (1170); std::vector > tokens; std::string token; @@ -265,18 +265,34 @@ int main (int argc, char** argv) t.ok (Lexer::readWord (text, cursor, word), "readWord \"one \" --> true"); t.is (word, "one", " word '" + word + "'"); - // bool isLiteral (const std::string&, bool); + // bool isLiteral (const std::string&, bool, bool); Lexer l4 ("one.two"); - t.notok (l4.isLiteral("zero", false), "isLiteral 'one.two' --> false"); - t.ok (l4.isLiteral("one", false), "isLiteral 'one.two' --> 'one'"); - t.ok (l4.isLiteral(".", false), "isLiteral 'one.two' --> '.'"); - t.ok (l4.isLiteral("two", true), "isLiteral 'one.two' --> 'two'"); + t.notok (l4.isLiteral("zero", false, false), "isLiteral 'one.two' --> false"); + t.ok (l4.isLiteral("one", false, false), "isLiteral 'one.two' --> 'one'"); + t.ok (l4.isLiteral(".", false, false), "isLiteral 'one.two' --> '.'"); + t.ok (l4.isLiteral("two", false, true), "isLiteral 'one.two' --> 'two'"); - // bool isOneOf (const std::string&, bool); - Lexer l5 ("Grumpy."); + Lexer l5 ("wonderful"); + t.notok (l5.isLiteral ("wonder", false, false), "isLiteral 'wonder' != 'wonderful' without abbreviation"); + t.ok (l5.isLiteral ("wonder", true, false), "isLiteral 'wonder' == 'wonderful' with abbreviation"); + + // bool isOneOf (const std::string&, bool, bool); + Lexer l6 ("Grumpy."); std::vector dwarves = {"Sneezy", "Doc", "Bashful", "Grumpy", "Happy", "Sleepy", "Dopey"}; - t.notok (l5.isOneOf (dwarves, true), "isOneof ('Grumpy', true) --> false"); - t.ok (l5.isOneOf (dwarves, false), "isOneOf ('Grumpy', false) --> true"); + t.notok (l6.isOneOf (dwarves, false, true), "isOneof ('Grumpy', true) --> false"); + t.ok (l6.isOneOf (dwarves, false, false), "isOneOf ('Grumpy', false) --> true"); + + // static std::string::size_type commonLength (const std::string&, const std::string&); + t.is ((int)Lexer::commonLength ("", ""), 0, "commonLength '' : '' --> 0"); + t.is ((int)Lexer::commonLength ("a", "a"), 1, "commonLength 'a' : 'a' --> 1"); + t.is ((int)Lexer::commonLength ("abcde", "abcde"), 5, "commonLength 'abcde' : 'abcde' --> 5"); + t.is ((int)Lexer::commonLength ("abc", ""), 0, "commonLength 'abc' : '' --> 0"); + t.is ((int)Lexer::commonLength ("abc", "def"), 0, "commonLength 'abc' : 'def' --> 0"); + t.is ((int)Lexer::commonLength ("foobar", "foo"), 3, "commonLength 'foobar' : 'foo' --> 3"); + t.is ((int)Lexer::commonLength ("foo", "foobar"), 3, "commonLength 'foo' : 'foobar' --> 3"); + + // static std::string::size_type commonLength (const std::string&, std::string::size_type, const std::string&, std::string::size_type); + t.is ((int)Lexer::commonLength ("wonder", 0, "prowonderbread", 3), 6, "'wonder'+0 : 'prowonderbread'+3 --> 6"); // Test all Lexer types. #define NO {"",Lexer::Type::word}