Lexer: Integrated ::commonLength
- Uses std::string::size_type for all string lengths and offsets.
- Rewrote ::isLiteral to be simpler.
- Added support for abbreviated DOM refs.
- Obeys rc.abbreviation.minimum, indirectly.
- Added tests.
This commit is contained in:
100
src/Lexer.cpp
100
src/Lexer.cpp
@@ -37,7 +37,7 @@ static const unsigned int uuid_min_length = 8;
|
|||||||
|
|
||||||
std::string Lexer::dateFormat = "";
|
std::string Lexer::dateFormat = "";
|
||||||
bool Lexer::isoEnabled = true;
|
bool Lexer::isoEnabled = true;
|
||||||
int Lexer::minimumMatchLength = 3;
|
std::string::size_type Lexer::minimumMatchLength = 3;
|
||||||
std::map <std::string, std::string> Lexer::attributes;
|
std::map <std::string, std::string> Lexer::attributes;
|
||||||
|
|
||||||
|
|
||||||
@@ -373,7 +373,9 @@ int Lexer::hexToInt (int c0, int c1, int c2, int c3)
|
|||||||
// left: wonderful
|
// left: wonderful
|
||||||
// right: wonderbread
|
// right: wonderbread
|
||||||
// returns: ^ 6
|
// returns: ^ 6
|
||||||
int Lexer::commonLength (const std::string& left, const std::string& right)
|
std::string::size_type Lexer::commonLength (
|
||||||
|
const std::string& left,
|
||||||
|
const std::string& right)
|
||||||
{
|
{
|
||||||
std::string::size_type l = 0;
|
std::string::size_type l = 0;
|
||||||
std::string::size_type r = 0;
|
std::string::size_type r = 0;
|
||||||
@@ -382,7 +384,7 @@ int Lexer::commonLength (const std::string& left, const std::string& right)
|
|||||||
utf8_next_char (right, r))
|
utf8_next_char (right, r))
|
||||||
;
|
;
|
||||||
|
|
||||||
return (int) l;
|
return l;
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
@@ -393,7 +395,7 @@ int Lexer::commonLength (const std::string& left, const std::string& right)
|
|||||||
// right: prowonderbread
|
// right: prowonderbread
|
||||||
// r: ^
|
// r: ^
|
||||||
// returns: ^ 6
|
// returns: ^ 6
|
||||||
int Lexer::commonLength (
|
std::string::size_type Lexer::commonLength (
|
||||||
const std::string& left,
|
const std::string& left,
|
||||||
std::string::size_type l,
|
std::string::size_type l,
|
||||||
const std::string& right,
|
const std::string& right,
|
||||||
@@ -404,7 +406,7 @@ int Lexer::commonLength (
|
|||||||
utf8_next_char (right, r))
|
utf8_next_char (right, r))
|
||||||
;
|
;
|
||||||
|
|
||||||
return (int) l;
|
return l;
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
@@ -1077,7 +1079,7 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
|
|||||||
|
|
||||||
std::string partialToken;
|
std::string partialToken;
|
||||||
Lexer::Type partialType;
|
Lexer::Type partialType;
|
||||||
if (isLiteral ("rc.", false) &&
|
if (isLiteral ("rc.", false, false) &&
|
||||||
isWord (partialToken, partialType))
|
isWord (partialToken, partialType))
|
||||||
{
|
{
|
||||||
token = _text.substr (marker, _cursor - marker);
|
token = _text.substr (marker, _cursor - marker);
|
||||||
@@ -1090,7 +1092,7 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
|
|||||||
"context.width",
|
"context.width",
|
||||||
"context.height",
|
"context.height",
|
||||||
"system.version",
|
"system.version",
|
||||||
"system.os"}, true))
|
"system.os"}, false, true))
|
||||||
{
|
{
|
||||||
token = _text.substr (marker, _cursor - marker);
|
token = _text.substr (marker, _cursor - marker);
|
||||||
type = Lexer::Type::dom;
|
type = Lexer::Type::dom;
|
||||||
@@ -1105,7 +1107,7 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
|
|||||||
if (isUUID (extractedToken, extractedType, false) ||
|
if (isUUID (extractedToken, extractedType, false) ||
|
||||||
isInteger (extractedToken, extractedType))
|
isInteger (extractedToken, extractedType))
|
||||||
{
|
{
|
||||||
if (! isLiteral (".", false))
|
if (! isLiteral (".", false, false))
|
||||||
{
|
{
|
||||||
_cursor = marker;
|
_cursor = marker;
|
||||||
return false;
|
return false;
|
||||||
@@ -1116,8 +1118,9 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
|
|||||||
std::size_t checkpoint = _cursor;
|
std::size_t checkpoint = _cursor;
|
||||||
|
|
||||||
// [prefix]tags.<word>
|
// [prefix]tags.<word>
|
||||||
if (isLiteral ("tags.", false) &&
|
if (isLiteral ("tags", true, false) &&
|
||||||
isWord (partialToken, partialType))
|
isLiteral (".", false, false) &&
|
||||||
|
isWord (partialToken, partialType))
|
||||||
{
|
{
|
||||||
token = _text.substr (marker, _cursor - marker);
|
token = _text.substr (marker, _cursor - marker);
|
||||||
type = Lexer::Type::dom;
|
type = Lexer::Type::dom;
|
||||||
@@ -1127,28 +1130,26 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
|
|||||||
_cursor = checkpoint;
|
_cursor = checkpoint;
|
||||||
|
|
||||||
// [prefix]attribute
|
// [prefix]attribute
|
||||||
if (isOneOf (attributes, true))
|
if (isOneOf (attributes, true, true))
|
||||||
{
|
{
|
||||||
token = _text.substr (marker, _cursor - marker);
|
token = _text.substr (marker, _cursor - marker);
|
||||||
type = Lexer::Type::dom;
|
type = Lexer::Type::dom;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
else
|
|
||||||
_cursor = checkpoint;
|
|
||||||
|
|
||||||
// [prefix]attribute
|
// [prefix]attribute
|
||||||
if (isOneOf (attributes, false))
|
if (isOneOf (attributes, true, false))
|
||||||
{
|
{
|
||||||
if (isLiteral (".", false))
|
if (isLiteral (".", false, false))
|
||||||
{
|
{
|
||||||
std::string attribute = _text.substr (checkpoint, _cursor - checkpoint - 1);
|
std::string attribute = _text.substr (checkpoint, _cursor - checkpoint - 1);
|
||||||
|
|
||||||
// if attribute type is 'date'
|
// if attribute type is 'date', then it has sub-elements.
|
||||||
if (attributes[attribute] == "date" &&
|
if (attributes[attribute] == "date" &&
|
||||||
isOneOf ({"year", "month", "day",
|
isOneOf ({"year", "month", "day",
|
||||||
"week", "weekday",
|
"week", "weekday",
|
||||||
"julian",
|
"julian",
|
||||||
"hour", "minute", "second"}, true))
|
"hour", "minute", "second"}, true, true))
|
||||||
{
|
{
|
||||||
token = _text.substr (marker, _cursor - marker);
|
token = _text.substr (marker, _cursor - marker);
|
||||||
type = Lexer::Type::dom;
|
type = Lexer::Type::dom;
|
||||||
@@ -1162,35 +1163,35 @@ bool Lexer::isDOM (std::string& token, Lexer::Type& type)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
|
||||||
_cursor = checkpoint;
|
|
||||||
|
|
||||||
// [prefix]annotations.
|
// [prefix]annotations.
|
||||||
if (isLiteral ("annotations.", false))
|
if (isLiteral ("annotations", true, false) &&
|
||||||
|
isLiteral (".", false, true))
|
||||||
{
|
{
|
||||||
std::string extractedToken;
|
std::string extractedToken;
|
||||||
Lexer::Type extractedType;
|
Lexer::Type extractedType;
|
||||||
if (isInteger (extractedToken, extractedType))
|
if (isInteger (extractedToken, extractedType))
|
||||||
{
|
{
|
||||||
if (isLiteral (".", false))
|
if (isLiteral (".", false, false))
|
||||||
{
|
{
|
||||||
if (isLiteral ("description", true))
|
if (isLiteral ("description", true, true))
|
||||||
{
|
{
|
||||||
token = _text.substr (marker, _cursor - marker);
|
token = _text.substr (marker, _cursor - marker);
|
||||||
type = Lexer::Type::dom;
|
type = Lexer::Type::dom;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
else if (isLiteral ("entry", true))
|
else if (isLiteral ("entry", true, true))
|
||||||
{
|
{
|
||||||
token = _text.substr (marker, _cursor - marker);
|
token = _text.substr (marker, _cursor - marker);
|
||||||
type = Lexer::Type::dom;
|
type = Lexer::Type::dom;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
else if (isLiteral ("entry.", false) &&
|
else if (isLiteral ("entry", true, false) &&
|
||||||
|
isLiteral (".", false, true) &&
|
||||||
isOneOf ({"year", "month", "day",
|
isOneOf ({"year", "month", "day",
|
||||||
"week", "weekday",
|
"week", "weekday",
|
||||||
"julian",
|
"julian",
|
||||||
"hour", "minute", "second"}, true))
|
"hour", "minute", "second"}, true, true))
|
||||||
{
|
{
|
||||||
token = _text.substr (marker, _cursor - marker);
|
token = _text.substr (marker, _cursor - marker);
|
||||||
type = Lexer::Type::dom;
|
type = Lexer::Type::dom;
|
||||||
@@ -1251,37 +1252,54 @@ bool Lexer::isWord (std::string& token, Lexer::Type& type)
|
|||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
bool Lexer::isLiteral (const std::string& literal, bool endBoundary)
|
bool Lexer::isLiteral (
|
||||||
|
const std::string& literal,
|
||||||
|
bool allowAbbreviations,
|
||||||
|
bool endBoundary)
|
||||||
{
|
{
|
||||||
auto len = literal.length ();
|
auto common = commonLength (literal, 0, _text, _cursor);
|
||||||
if (_text.find (literal, _cursor) == _cursor &&
|
|
||||||
(! endBoundary ||
|
|
||||||
_text.length () == _cursor + len ||
|
|
||||||
Lexer::isWhitespace (_text[_cursor + len]) ||
|
|
||||||
Lexer::isSingleCharOperator (_text[_cursor + len])))
|
|
||||||
{
|
|
||||||
_cursor += len;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
// Without abbreviations, common must equal literal length.
|
||||||
|
if (! allowAbbreviations &&
|
||||||
|
common < literal.length ())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// Abbreviations must meet the minimum size.
|
||||||
|
if (allowAbbreviations &&
|
||||||
|
common < minimumMatchLength)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
// End boundary conditions must be met.
|
||||||
|
if (endBoundary &&
|
||||||
|
! Lexer::isWhitespace (_text[_cursor + common]) &&
|
||||||
|
! Lexer::isSingleCharOperator (_text[_cursor + common]))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
_cursor += common;
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
bool Lexer::isOneOf (const std::vector <std::string>& options, bool endBoundary)
|
bool Lexer::isOneOf (
|
||||||
|
const std::vector <std::string>& options,
|
||||||
|
bool allowAbbreviations,
|
||||||
|
bool endBoundary)
|
||||||
{
|
{
|
||||||
for (auto& item : options)
|
for (auto& item : options)
|
||||||
if (isLiteral (item, endBoundary))
|
if (isLiteral (item, allowAbbreviations, endBoundary))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
bool Lexer::isOneOf (const std::map <std::string, std::string>& options, bool endBoundary)
|
bool Lexer::isOneOf (
|
||||||
|
const std::map <std::string, std::string>& options,
|
||||||
|
bool allowAbbreviations,
|
||||||
|
bool endBoundary)
|
||||||
{
|
{
|
||||||
for (auto& item : options)
|
for (auto& item : options)
|
||||||
if (isLiteral (item.first, endBoundary))
|
if (isLiteral (item.first, allowAbbreviations, endBoundary))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
|
|||||||
63
src/Lexer.h
63
src/Lexer.h
@@ -41,7 +41,7 @@ public:
|
|||||||
// These are overridable.
|
// These are overridable.
|
||||||
static std::string dateFormat;
|
static std::string dateFormat;
|
||||||
static bool isoEnabled;
|
static bool isoEnabled;
|
||||||
static int minimumMatchLength;
|
static std::string::size_type minimumMatchLength;
|
||||||
static std::map <std::string, std::string> attributes;
|
static std::map <std::string, std::string> attributes;
|
||||||
|
|
||||||
enum class Type { uuid, number, hex,
|
enum class Type { uuid, number, hex,
|
||||||
@@ -61,36 +61,35 @@ public:
|
|||||||
static std::string typeToString (Lexer::Type);
|
static std::string typeToString (Lexer::Type);
|
||||||
|
|
||||||
// Static helpers.
|
// Static helpers.
|
||||||
static const std::string typeName (const Lexer::Type&);
|
static const std::string typeName (const Lexer::Type&);
|
||||||
static bool isWhitespace (int);
|
static bool isWhitespace (int);
|
||||||
static bool isAlpha (int);
|
static bool isAlpha (int);
|
||||||
static bool isDigit (int);
|
static bool isDigit (int);
|
||||||
static bool isHexDigit (int);
|
static bool isHexDigit (int);
|
||||||
static bool isIdentifierStart (int);
|
static bool isIdentifierStart (int);
|
||||||
static bool isIdentifierNext (int);
|
static bool isIdentifierNext (int);
|
||||||
static bool isSingleCharOperator (int);
|
static bool isSingleCharOperator (int);
|
||||||
static bool isDoubleCharOperator (int, int, int);
|
static bool isDoubleCharOperator (int, int, int);
|
||||||
static bool isTripleCharOperator (int, int, int, int);
|
static bool isTripleCharOperator (int, int, int, int);
|
||||||
static bool isBoundary (int, int);
|
static bool isBoundary (int, int);
|
||||||
static bool isHardBoundary (int, int);
|
static bool isHardBoundary (int, int);
|
||||||
static bool isPunctuation (int);
|
static bool isPunctuation (int);
|
||||||
static bool isAllDigits (const std::string&);
|
static bool isAllDigits (const std::string&);
|
||||||
static void dequote (std::string&, const std::string& quotes = "'\"");
|
static void dequote (std::string&, const std::string& quotes = "'\"");
|
||||||
static bool wasQuoted (const std::string&);
|
static bool wasQuoted (const std::string&);
|
||||||
static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
|
static bool readWord (const std::string&, const std::string&, std::string::size_type&, std::string&);
|
||||||
static bool readWord (const std::string&, std::string::size_type&, std::string&);
|
static bool readWord (const std::string&, std::string::size_type&, std::string&);
|
||||||
static bool decomposePair (const std::string&, std::string&, std::string&, std::string&, std::string&);
|
static bool decomposePair (const std::string&, std::string&, std::string&, std::string&, std::string&);
|
||||||
static bool decomposeSubstitution (const std::string&, std::string&, std::string&, std::string&);
|
static bool decomposeSubstitution (const std::string&, std::string&, std::string&, std::string&);
|
||||||
static bool decomposePattern (const std::string&, std::string&, std::string&);
|
static bool decomposePattern (const std::string&, std::string&, std::string&);
|
||||||
static int hexToInt (int);
|
static int hexToInt (int);
|
||||||
static int hexToInt (int, int);
|
static int hexToInt (int, int);
|
||||||
static int hexToInt (int, int, int, int);
|
static int hexToInt (int, int, int, int);
|
||||||
static int commonLength (const std::string&, const std::string&);
|
static std::string::size_type commonLength (const std::string&, const std::string&);
|
||||||
static int commonLength (const std::string&, std::string::size_type, const std::string&, std::string::size_type);
|
static std::string::size_type commonLength (const std::string&, std::string::size_type, const std::string&, std::string::size_type);
|
||||||
|
|
||||||
bool isEOS () const;
|
|
||||||
|
|
||||||
// Stream Classifiers.
|
// Stream Classifiers.
|
||||||
|
bool isEOS () const;
|
||||||
bool isString (std::string&, Lexer::Type&, const std::string&);
|
bool isString (std::string&, Lexer::Type&, const std::string&);
|
||||||
bool isDate (std::string&, Lexer::Type&);
|
bool isDate (std::string&, Lexer::Type&);
|
||||||
bool isDuration (std::string&, Lexer::Type&);
|
bool isDuration (std::string&, Lexer::Type&);
|
||||||
@@ -110,9 +109,9 @@ public:
|
|||||||
bool isDOM (std::string&, Lexer::Type&);
|
bool isDOM (std::string&, Lexer::Type&);
|
||||||
bool isIdentifier (std::string&, Lexer::Type&);
|
bool isIdentifier (std::string&, Lexer::Type&);
|
||||||
bool isWord (std::string&, Lexer::Type&);
|
bool isWord (std::string&, Lexer::Type&);
|
||||||
bool isLiteral (const std::string&, bool);
|
bool isLiteral (const std::string&, bool, bool);
|
||||||
bool isOneOf (const std::vector <std::string>&, bool);
|
bool isOneOf (const std::vector <std::string>&, bool, bool);
|
||||||
bool isOneOf (const std::map <std::string, std::string>&, bool);
|
bool isOneOf (const std::map <std::string, std::string>&, bool, bool);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::string _text;
|
std::string _text;
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ Context context;
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
int main (int argc, char** argv)
|
int main (int argc, char** argv)
|
||||||
{
|
{
|
||||||
UnitTest t (1160);
|
UnitTest t (1170);
|
||||||
|
|
||||||
std::vector <std::pair <std::string, Lexer::Type>> tokens;
|
std::vector <std::pair <std::string, Lexer::Type>> tokens;
|
||||||
std::string token;
|
std::string token;
|
||||||
@@ -265,18 +265,34 @@ int main (int argc, char** argv)
|
|||||||
t.ok (Lexer::readWord (text, cursor, word), "readWord \"one \" --> true");
|
t.ok (Lexer::readWord (text, cursor, word), "readWord \"one \" --> true");
|
||||||
t.is (word, "one", " word '" + word + "'");
|
t.is (word, "one", " word '" + word + "'");
|
||||||
|
|
||||||
// bool isLiteral (const std::string&, bool);
|
// bool isLiteral (const std::string&, bool, bool);
|
||||||
Lexer l4 ("one.two");
|
Lexer l4 ("one.two");
|
||||||
t.notok (l4.isLiteral("zero", false), "isLiteral 'one.two' --> false");
|
t.notok (l4.isLiteral("zero", false, false), "isLiteral 'one.two' --> false");
|
||||||
t.ok (l4.isLiteral("one", false), "isLiteral 'one.two' --> 'one'");
|
t.ok (l4.isLiteral("one", false, false), "isLiteral 'one.two' --> 'one'");
|
||||||
t.ok (l4.isLiteral(".", false), "isLiteral 'one.two' --> '.'");
|
t.ok (l4.isLiteral(".", false, false), "isLiteral 'one.two' --> '.'");
|
||||||
t.ok (l4.isLiteral("two", true), "isLiteral 'one.two' --> 'two'");
|
t.ok (l4.isLiteral("two", false, true), "isLiteral 'one.two' --> 'two'");
|
||||||
|
|
||||||
// bool isOneOf (const std::string&, bool);
|
Lexer l5 ("wonderful");
|
||||||
Lexer l5 ("Grumpy.");
|
t.notok (l5.isLiteral ("wonder", false, false), "isLiteral 'wonder' != 'wonderful' without abbreviation");
|
||||||
|
t.ok (l5.isLiteral ("wonder", true, false), "isLiteral 'wonder' == 'wonderful' with abbreviation");
|
||||||
|
|
||||||
|
// bool isOneOf (const std::string&, bool, bool);
|
||||||
|
Lexer l6 ("Grumpy.");
|
||||||
std::vector <std::string> dwarves = {"Sneezy", "Doc", "Bashful", "Grumpy", "Happy", "Sleepy", "Dopey"};
|
std::vector <std::string> dwarves = {"Sneezy", "Doc", "Bashful", "Grumpy", "Happy", "Sleepy", "Dopey"};
|
||||||
t.notok (l5.isOneOf (dwarves, true), "isOneof ('Grumpy', true) --> false");
|
t.notok (l6.isOneOf (dwarves, false, true), "isOneof ('Grumpy', true) --> false");
|
||||||
t.ok (l5.isOneOf (dwarves, false), "isOneOf ('Grumpy', false) --> true");
|
t.ok (l6.isOneOf (dwarves, false, false), "isOneOf ('Grumpy', false) --> true");
|
||||||
|
|
||||||
|
// static std::string::size_type commonLength (const std::string&, const std::string&);
|
||||||
|
t.is ((int)Lexer::commonLength ("", ""), 0, "commonLength '' : '' --> 0");
|
||||||
|
t.is ((int)Lexer::commonLength ("a", "a"), 1, "commonLength 'a' : 'a' --> 1");
|
||||||
|
t.is ((int)Lexer::commonLength ("abcde", "abcde"), 5, "commonLength 'abcde' : 'abcde' --> 5");
|
||||||
|
t.is ((int)Lexer::commonLength ("abc", ""), 0, "commonLength 'abc' : '' --> 0");
|
||||||
|
t.is ((int)Lexer::commonLength ("abc", "def"), 0, "commonLength 'abc' : 'def' --> 0");
|
||||||
|
t.is ((int)Lexer::commonLength ("foobar", "foo"), 3, "commonLength 'foobar' : 'foo' --> 3");
|
||||||
|
t.is ((int)Lexer::commonLength ("foo", "foobar"), 3, "commonLength 'foo' : 'foobar' --> 3");
|
||||||
|
|
||||||
|
// static std::string::size_type commonLength (const std::string&, std::string::size_type, const std::string&, std::string::size_type);
|
||||||
|
t.is ((int)Lexer::commonLength ("wonder", 0, "prowonderbread", 3), 6, "'wonder'+0 : 'prowonderbread'+3 --> 6");
|
||||||
|
|
||||||
// Test all Lexer types.
|
// Test all Lexer types.
|
||||||
#define NO {"",Lexer::Type::word}
|
#define NO {"",Lexer::Type::word}
|
||||||
|
|||||||
Reference in New Issue
Block a user