- Integrated Lexer2 in place of Lexer. Tests fail.
This commit is contained in:
Paul Beckingham
2015-02-22 13:52:14 -05:00
parent 2155bd3969
commit 0cf18f3b16
12 changed files with 416 additions and 1408 deletions

View File

@@ -29,7 +29,6 @@
#include <algorithm> #include <algorithm>
#include <Context.h> #include <Context.h>
#include <Nibbler.h> #include <Nibbler.h>
#include <Lexer.h>
#include <Lexer2.h> #include <Lexer2.h>
#include <CLI.h> #include <CLI.h>
#include <Color.h> #include <Color.h>
@@ -662,13 +661,13 @@ void CLI::addArg (const std::string& arg)
// that cause the lexemes to be ignored, and the original argument used // that cause the lexemes to be ignored, and the original argument used
// intact. // intact.
std::string lexeme; std::string lexeme;
Lexer::Type type; Lexer2::Type type;
Lexer lex (raw); Lexer2 lex (raw);
lex.ambiguity (false); lex.ambiguity (false);
std::vector <std::pair <std::string, Lexer::Type> > lexemes; std::vector <std::pair <std::string, Lexer2::Type> > lexemes;
while (lex.token (lexeme, type)) while (lex.token (lexeme, type))
lexemes.push_back (std::pair <std::string, Lexer::Type> (lexeme, type)); lexemes.push_back (std::pair <std::string, Lexer2::Type> (lexeme, type));
if (disqualifyInsufficientTerms (lexemes) || if (disqualifyInsufficientTerms (lexemes) ||
disqualifyNoOps (lexemes) || disqualifyNoOps (lexemes) ||
@@ -682,7 +681,7 @@ void CLI::addArg (const std::string& arg)
{ {
// How often have I said to you that when you have eliminated the // How often have I said to you that when you have eliminated the
// impossible, whatever remains, however improbable, must be the truth? // impossible, whatever remains, however improbable, must be the truth?
std::vector <std::pair <std::string, Lexer::Type> >::iterator l; std::vector <std::pair <std::string, Lexer2::Type> >::iterator l;
for (l = lexemes.begin (); l != lexemes.end (); ++l) for (l = lexemes.begin (); l != lexemes.end (); ++l)
_original_args.push_back (l->first); _original_args.push_back (l->first);
} }
@@ -714,9 +713,7 @@ void CLI::aliasExpansion ()
{ {
if (_aliases.find (raw) != _aliases.end ()) if (_aliases.find (raw) != _aliases.end ())
{ {
std::vector <std::string> lexed; std::vector <std::string> lexed = Lexer2::split (_aliases[raw]);
Lexer::token_split (lexed, _aliases[raw]);
std::vector <std::string>::iterator l; std::vector <std::string>::iterator l;
for (l = lexed.begin (); l != lexed.end (); ++l) for (l = lexed.begin (); l != lexed.end (); ++l)
{ {
@@ -1815,8 +1812,7 @@ void CLI::injectDefaults ()
if (defaultCommand != "") if (defaultCommand != "")
{ {
// Split the defaultCommand into separate args. // Split the defaultCommand into separate args.
std::vector <std::string> tokens; std::vector <std::string> tokens = Lexer2::split (defaultCommand);
Lexer::token_split (tokens, defaultCommand);
// Modify _args to be: <args0> [<def0> ...] <args1> [...] // Modify _args to be: <args0> [<def0> ...] <args1> [...]
std::vector <A> reconstructed; std::vector <A> reconstructed;
@@ -2306,9 +2302,9 @@ bool CLI::isName (const std::string& raw) const
{ {
for (int i = 0; i < raw.length (); ++i) for (int i = 0; i < raw.length (); ++i)
{ {
if (i == 0 && ! Lexer::is_ident_start (raw[i])) if (i == 0 && ! Lexer2::isIdentifierStart (raw[i]))
return false; return false;
else if (! Lexer::is_ident (raw[i])) else if (! Lexer2::isIdentifierNext (raw[i]))
return false; return false;
} }
@@ -2320,19 +2316,19 @@ bool CLI::isName (const std::string& raw) const
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
bool CLI::disqualifyInsufficientTerms ( bool CLI::disqualifyInsufficientTerms (
const std::vector <std::pair <std::string, Lexer::Type> >& lexemes) const const std::vector <std::pair <std::string, Lexer2::Type> >& lexemes) const
{ {
return lexemes.size () < 3 ? true : false; return lexemes.size () < 3 ? true : false;
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
bool CLI::disqualifyNoOps ( bool CLI::disqualifyNoOps (
const std::vector <std::pair <std::string, Lexer::Type> >& lexemes) const const std::vector <std::pair <std::string, Lexer2::Type> >& lexemes) const
{ {
bool foundOP = false; bool foundOP = false;
std::vector <std::pair <std::string, Lexer::Type> >::const_iterator l; std::vector <std::pair <std::string, Lexer2::Type> >::const_iterator l;
for (l = lexemes.begin (); l != lexemes.end (); ++l) for (l = lexemes.begin (); l != lexemes.end (); ++l)
if (l->second == Lexer::typeOperator) if (l->second == Lexer2::Type::op)
foundOP = true; foundOP = true;
return ! foundOP; return ! foundOP;
@@ -2340,16 +2336,16 @@ bool CLI::disqualifyNoOps (
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
bool CLI::disqualifyOnlyParenOps ( bool CLI::disqualifyOnlyParenOps (
const std::vector <std::pair <std::string, Lexer::Type> >& lexemes) const const std::vector <std::pair <std::string, Lexer2::Type> >& lexemes) const
{ {
int opCount = 0; int opCount = 0;
int opSugarCount = 0; int opSugarCount = 0;
int opParenCount = 0; int opParenCount = 0;
std::vector <std::pair <std::string, Lexer::Type> >::const_iterator l; std::vector <std::pair <std::string, Lexer2::Type> >::const_iterator l;
for (l = lexemes.begin (); l != lexemes.end (); ++l) for (l = lexemes.begin (); l != lexemes.end (); ++l)
{ {
if (l->second == Lexer::typeOperator) if (l->second == Lexer2::Type::op)
{ {
++opCount; ++opCount;
@@ -2376,7 +2372,7 @@ bool CLI::disqualifyOnlyParenOps (
// as there are no operators in between, which includes syntactic sugar that // as there are no operators in between, which includes syntactic sugar that
// hides operators. // hides operators.
bool CLI::disqualifyFirstLastBinary ( bool CLI::disqualifyFirstLastBinary (
const std::vector <std::pair <std::string, Lexer::Type> >& lexemes) const const std::vector <std::pair <std::string, Lexer2::Type> >& lexemes) const
{ {
bool firstBinary = false; bool firstBinary = false;
bool lastBinary = false; bool lastBinary = false;
@@ -2395,7 +2391,7 @@ bool CLI::disqualifyFirstLastBinary (
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Disqualify terms when there are operators hidden by syntactic sugar. // Disqualify terms when there are operators hidden by syntactic sugar.
bool CLI::disqualifySugarFree ( bool CLI::disqualifySugarFree (
const std::vector <std::pair <std::string, Lexer::Type> >& lexemes) const const std::vector <std::pair <std::string, Lexer2::Type> >& lexemes) const
{ {
bool sugared = true; bool sugared = true;
for (unsigned int i = 1; i < lexemes.size () - 1; ++i) for (unsigned int i = 1; i < lexemes.size () - 1; ++i)

View File

@@ -29,7 +29,7 @@
#include <string> #include <string>
#include <vector> #include <vector>
#include <map> #include <map>
#include <Lexer.h> #include <Lexer2.h>
#include <Path.h> #include <Path.h>
#include <File.h> #include <File.h>
@@ -126,11 +126,11 @@ private:
bool isOperator (const std::string&) const; bool isOperator (const std::string&) const;
bool isName (const std::string&) const; bool isName (const std::string&) const;
bool disqualifyInsufficientTerms (const std::vector <std::pair <std::string, Lexer::Type> >&) const; bool disqualifyInsufficientTerms (const std::vector <std::pair <std::string, Lexer2::Type> >&) const;
bool disqualifyNoOps (const std::vector <std::pair <std::string, Lexer::Type> >&) const; bool disqualifyNoOps (const std::vector <std::pair <std::string, Lexer2::Type> >&) const;
bool disqualifyOnlyParenOps (const std::vector <std::pair <std::string, Lexer::Type> >&) const; bool disqualifyOnlyParenOps (const std::vector <std::pair <std::string, Lexer2::Type> >&) const;
bool disqualifyFirstLastBinary (const std::vector <std::pair <std::string, Lexer::Type> >&) const; bool disqualifyFirstLastBinary (const std::vector <std::pair <std::string, Lexer2::Type> >&) const;
bool disqualifySugarFree (const std::vector <std::pair <std::string, Lexer::Type> >&) const; bool disqualifySugarFree (const std::vector <std::pair <std::string, Lexer2::Type> >&) const;
public: public:
std::multimap <std::string, std::string> _entities; std::multimap <std::string, std::string> _entities;

View File

@@ -20,7 +20,6 @@ set (task_SRCS CLI.cpp CLI.h
Hooks.cpp Hooks.h Hooks.cpp Hooks.h
ISO8601.cpp ISO8601.h ISO8601.cpp ISO8601.h
JSON.cpp JSON.h JSON.cpp JSON.h
Lexer.cpp Lexer.h
Lexer2.cpp Lexer2.h Lexer2.cpp Lexer2.h
Msg.cpp Msg.h Msg.cpp Msg.h
Nibbler.cpp Nibbler.h Nibbler.cpp Nibbler.h

View File

@@ -657,8 +657,8 @@ void Context::staticInitialization ()
Task::searchCaseSensitive = Variant::searchCaseSensitive = config.getBoolean ("search.case.sensitive"); Task::searchCaseSensitive = Variant::searchCaseSensitive = config.getBoolean ("search.case.sensitive");
Task::regex = Variant::searchUsingRegex = config.getBoolean ("regex"); Task::regex = Variant::searchUsingRegex = config.getBoolean ("regex");
Lexer::dateFormat = Variant::dateFormat = config.get ("dateformat"); Lexer2::dateFormat = Variant::dateFormat = config.get ("dateformat");
Lexer::isoEnabled = Variant::isoEnabled = config.getBoolean ("date.iso"); Lexer2::isoEnabled = Variant::isoEnabled = config.getBoolean ("date.iso");
std::map <std::string, Column*>::iterator i; std::map <std::string, Column*>::iterator i;
for (i = columns.begin (); i != columns.end (); ++i) for (i = columns.begin (); i != columns.end (); ++i)

View File

@@ -125,13 +125,13 @@ void Eval::addSource (bool (*source)(const std::string&, Variant&))
void Eval::evaluateInfixExpression (const std::string& e, Variant& v) const void Eval::evaluateInfixExpression (const std::string& e, Variant& v) const
{ {
// Reduce e to a vector of tokens. // Reduce e to a vector of tokens.
Lexer l (e); Lexer2 l (e);
l.ambiguity (_ambiguity); l.ambiguity (_ambiguity);
std::vector <std::pair <std::string, Lexer::Type> > tokens; std::vector <std::pair <std::string, Lexer2::Type> > tokens;
std::string token; std::string token;
Lexer::Type type; Lexer2::Type type;
while (l.token (token, type)) while (l.token (token, type))
tokens.push_back (std::pair <std::string, Lexer::Type> (token, type)); tokens.push_back (std::pair <std::string, Lexer2::Type> (token, type));
// Parse for syntax checking and operator replacement. // Parse for syntax checking and operator replacement.
if (_debug) if (_debug)
@@ -153,13 +153,13 @@ void Eval::evaluateInfixExpression (const std::string& e, Variant& v) const
void Eval::evaluatePostfixExpression (const std::string& e, Variant& v) const void Eval::evaluatePostfixExpression (const std::string& e, Variant& v) const
{ {
// Reduce e to a vector of tokens. // Reduce e to a vector of tokens.
Lexer l (e); Lexer2 l (e);
l.ambiguity (_ambiguity); l.ambiguity (_ambiguity);
std::vector <std::pair <std::string, Lexer::Type> > tokens; std::vector <std::pair <std::string, Lexer2::Type> > tokens;
std::string token; std::string token;
Lexer::Type type; Lexer2::Type type;
while (l.token (token, type)) while (l.token (token, type))
tokens.push_back (std::pair <std::string, Lexer::Type> (token, type)); tokens.push_back (std::pair <std::string, Lexer2::Type> (token, type));
if (_debug) if (_debug)
context.debug ("FILTER Postfix " + dump (tokens)); context.debug ("FILTER Postfix " + dump (tokens));
@@ -172,15 +172,15 @@ void Eval::evaluatePostfixExpression (const std::string& e, Variant& v) const
void Eval::compileExpression (const std::string& e) void Eval::compileExpression (const std::string& e)
{ {
// Reduce e to a vector of tokens. // Reduce e to a vector of tokens.
Lexer l (e); Lexer2 l (e);
l.ambiguity (_ambiguity); l.ambiguity (_ambiguity);
std::string token; std::string token;
Lexer::Type type; Lexer2::Type type;
while (l.token (token, type)) while (l.token (token, type))
{ {
if (_debug) if (_debug)
context.debug ("Lexer '" + token + "' " + Lexer::type_name (type)); context.debug ("Lexer '" + token + "' " + Lexer2::typeToString (type));
_compiled.push_back (std::pair <std::string, Lexer::Type> (token, type)); _compiled.push_back (std::pair <std::string, Lexer2::Type> (token, type));
} }
// Parse for syntax checking and operator replacement. // Parse for syntax checking and operator replacement.
@@ -236,7 +236,7 @@ void Eval::getBinaryOperators (std::vector <std::string>& all)
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void Eval::evaluatePostfixStack ( void Eval::evaluatePostfixStack (
const std::vector <std::pair <std::string, Lexer::Type> >& tokens, const std::vector <std::pair <std::string, Lexer2::Type> >& tokens,
Variant& result) const Variant& result) const
{ {
if (tokens.size () == 0) if (tokens.size () == 0)
@@ -245,11 +245,11 @@ void Eval::evaluatePostfixStack (
// This is the stack used by the postfix evaluator. // This is the stack used by the postfix evaluator.
std::vector <Variant> values; std::vector <Variant> values;
std::vector <std::pair <std::string, Lexer::Type> >::const_iterator token; std::vector <std::pair <std::string, Lexer2::Type> >::const_iterator token;
for (token = tokens.begin (); token != tokens.end (); ++token) for (token = tokens.begin (); token != tokens.end (); ++token)
{ {
// Unary operators. // Unary operators.
if (token->second == Lexer::typeOperator && if (token->second == Lexer2::Type::op &&
token->first == "!") token->first == "!")
{ {
if (values.size () < 1) if (values.size () < 1)
@@ -262,7 +262,7 @@ void Eval::evaluatePostfixStack (
if (_debug) if (_debug)
context.debug (format ("Eval {1} ↓'{2}' → ↑'{3}'", token->first, (std::string) right, (std::string) result)); context.debug (format ("Eval {1} ↓'{2}' → ↑'{3}'", token->first, (std::string) right, (std::string) result));
} }
else if (token->second == Lexer::typeOperator && else if (token->second == Lexer2::Type::op &&
token->first == "_neg_") token->first == "_neg_")
{ {
if (values.size () < 1) if (values.size () < 1)
@@ -278,7 +278,7 @@ void Eval::evaluatePostfixStack (
if (_debug) if (_debug)
context.debug (format ("Eval {1} ↓'{2}' → ↑'{3}'", token->first, (std::string) right, (std::string) result)); context.debug (format ("Eval {1} ↓'{2}' → ↑'{3}'", token->first, (std::string) right, (std::string) result));
} }
else if (token->second == Lexer::typeOperator && else if (token->second == Lexer2::Type::op &&
token->first == "_pos_") token->first == "_pos_")
{ {
// The _pos_ operator is a NOP. // The _pos_ operator is a NOP.
@@ -287,7 +287,7 @@ void Eval::evaluatePostfixStack (
} }
// Binary operators. // Binary operators.
else if (token->second == Lexer::typeOperator) else if (token->second == Lexer2::Type::op)
{ {
if (values.size () < 2) if (values.size () < 2)
throw std::string (STRING_EVAL_NO_EVAL); throw std::string (STRING_EVAL_NO_EVAL);
@@ -338,24 +338,27 @@ void Eval::evaluatePostfixStack (
Variant v (token->first); Variant v (token->first);
switch (token->second) switch (token->second)
{ {
case Lexer::typeNumber: case Lexer2::Type::number:
case Lexer::typeHex: if (Lexer2::isAllDigits (token->first))
v.cast (Variant::type_integer); {
if (_debug) v.cast (Variant::type_integer);
context.debug (format ("Eval literal number ↑'{1}'", (std::string) v)); if (_debug)
context.debug (format ("Eval literal number ↑'{1}'", (std::string) v));
}
else
{
v.cast (Variant::type_real);
if (_debug)
context.debug (format ("Eval literal decimal ↑'{1}'", (std::string) v));
}
break; break;
case Lexer::typeDecimal:
v.cast (Variant::type_real);
if (_debug)
context.debug (format ("Eval literal decimal ↑'{1}'", (std::string) v));
break;
case Lexer::typeOperator: case Lexer2::Type::op:
throw std::string (STRING_EVAL_OP_EXPECTED); throw std::string (STRING_EVAL_OP_EXPECTED);
break; break;
case Lexer::typeIdentifier: case Lexer2::Type::identifier:
{ {
bool found = false; bool found = false;
std::vector <bool (*)(const std::string&, Variant&)>::const_iterator source; std::vector <bool (*)(const std::string&, Variant&)>::const_iterator source;
@@ -380,20 +383,33 @@ void Eval::evaluatePostfixStack (
} }
break; break;
case Lexer::typeDate: case Lexer2::Type::date:
v.cast (Variant::type_date); v.cast (Variant::type_date);
if (_debug) if (_debug)
context.debug (format ("Eval literal date ↑'{1}'", (std::string) v)); context.debug (format ("Eval literal date ↑'{1}'", (std::string) v));
break; break;
case Lexer::typeDuration: case Lexer2::Type::duration:
v.cast (Variant::type_duration); v.cast (Variant::type_duration);
if (_debug) if (_debug)
context.debug (format ("Eval literal duration ↑'{1}'", (std::string) v)); context.debug (format ("Eval literal duration ↑'{1}'", (std::string) v));
break; break;
// Nothing to do. // Nothing to do.
case Lexer::typeString: /*
case Lexer2::Type::uuid:
case Lexer2::Type::hex:
case Lexer2::Type::list:
case Lexer2::Type::url:
case Lexer2::Type::pair:
case Lexer2::Type::separator:
case Lexer2::Type::tag:
case Lexer2::Type::path:
case Lexer2::Type::substitution:
case Lexer2::Type::pattern:
case Lexer2::Type::word:
*/
case Lexer2::Type::string:
default: default:
if (_debug) if (_debug)
context.debug (format ("Eval literal string ↑'{1}'", (std::string) v)); context.debug (format ("Eval literal string ↑'{1}'", (std::string) v));
@@ -427,7 +443,7 @@ void Eval::evaluatePostfixStack (
// Primitive --> "(" Logical ")" | Variant // Primitive --> "(" Logical ")" | Variant
// //
void Eval::infixParse ( void Eval::infixParse (
std::vector <std::pair <std::string, Lexer::Type> >& infix) const std::vector <std::pair <std::string, Lexer2::Type> >& infix) const
{ {
int i = 0; int i = 0;
parseLogical (infix, i); parseLogical (infix, i);
@@ -436,17 +452,17 @@ void Eval::infixParse (
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Logical --> Regex {( "and" | "or" | "xor" ) Regex} // Logical --> Regex {( "and" | "or" | "xor" ) Regex}
bool Eval::parseLogical ( bool Eval::parseLogical (
std::vector <std::pair <std::string, Lexer::Type> >& infix, std::vector <std::pair <std::string, Lexer2::Type> >& infix,
int &i) const int &i) const
{ {
if (i < infix.size () && if (i < infix.size () &&
parseRegex (infix, i)) parseRegex (infix, i))
{ {
while (i < infix.size () && while (i < infix.size () &&
infix[i].second == Lexer2::Type::op &&
(infix[i].first == "and" || (infix[i].first == "and" ||
infix[i].first == "or" || infix[i].first == "or" ||
infix[i].first == "xor") && infix[i].first == "xor"))
infix[i].second == Lexer::typeOperator)
{ {
++i; ++i;
if (! parseRegex (infix, i)) if (! parseRegex (infix, i))
@@ -462,16 +478,16 @@ bool Eval::parseLogical (
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Regex --> Equality {( "~" | "!~" ) Equality} // Regex --> Equality {( "~" | "!~" ) Equality}
bool Eval::parseRegex ( bool Eval::parseRegex (
std::vector <std::pair <std::string, Lexer::Type> >& infix, std::vector <std::pair <std::string, Lexer2::Type> >& infix,
int &i) const int &i) const
{ {
if (i < infix.size () && if (i < infix.size () &&
parseEquality (infix, i)) parseEquality (infix, i))
{ {
while (i < infix.size () && while (i < infix.size () &&
infix[i].second == Lexer2::Type::op &&
(infix[i].first == "~" || (infix[i].first == "~" ||
infix[i].first == "!~") && infix[i].first == "!~"))
infix[i].second == Lexer::typeOperator)
{ {
++i; ++i;
if (! parseEquality (infix, i)) if (! parseEquality (infix, i))
@@ -487,18 +503,18 @@ bool Eval::parseRegex (
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Equality --> Comparative {( "==" | "=" | "!==" | "!=" ) Comparative} // Equality --> Comparative {( "==" | "=" | "!==" | "!=" ) Comparative}
bool Eval::parseEquality ( bool Eval::parseEquality (
std::vector <std::pair <std::string, Lexer::Type> >& infix, std::vector <std::pair <std::string, Lexer2::Type> >& infix,
int &i) const int &i) const
{ {
if (i < infix.size () && if (i < infix.size () &&
parseComparative (infix, i)) parseComparative (infix, i))
{ {
while (i < infix.size () && while (i < infix.size () &&
infix[i].second == Lexer2::Type::op &&
(infix[i].first == "==" || (infix[i].first == "==" ||
infix[i].first == "=" || infix[i].first == "=" ||
infix[i].first == "!==" || infix[i].first == "!==" ||
infix[i].first == "!=") && infix[i].first == "!="))
infix[i].second == Lexer::typeOperator)
{ {
++i; ++i;
if (! parseComparative (infix, i)) if (! parseComparative (infix, i))
@@ -514,18 +530,18 @@ bool Eval::parseEquality (
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Comparative --> Arithmetic {( "<=" | "<" | ">=" | ">" ) Arithmetic} // Comparative --> Arithmetic {( "<=" | "<" | ">=" | ">" ) Arithmetic}
bool Eval::parseComparative ( bool Eval::parseComparative (
std::vector <std::pair <std::string, Lexer::Type> >& infix, std::vector <std::pair <std::string, Lexer2::Type> >& infix,
int &i) const int &i) const
{ {
if (i < infix.size () && if (i < infix.size () &&
parseArithmetic (infix, i)) parseArithmetic (infix, i))
{ {
while (i < infix.size () && while (i < infix.size () &&
infix[i].second == Lexer2::Type::op &&
(infix[i].first == "<=" || (infix[i].first == "<=" ||
infix[i].first == "<" || infix[i].first == "<" ||
infix[i].first == ">=" || infix[i].first == ">=" ||
infix[i].first == ">") && infix[i].first == ">"))
infix[i].second == Lexer::typeOperator)
{ {
++i; ++i;
if (! parseArithmetic (infix, i)) if (! parseArithmetic (infix, i))
@@ -541,16 +557,16 @@ bool Eval::parseComparative (
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Arithmetic --> Geometric {( "+" | "-" ) Geometric} // Arithmetic --> Geometric {( "+" | "-" ) Geometric}
bool Eval::parseArithmetic ( bool Eval::parseArithmetic (
std::vector <std::pair <std::string, Lexer::Type> >& infix, std::vector <std::pair <std::string, Lexer2::Type> >& infix,
int &i) const int &i) const
{ {
if (i < infix.size () && if (i < infix.size () &&
parseGeometric (infix, i)) parseGeometric (infix, i))
{ {
while (i < infix.size () && while (i < infix.size () &&
infix[i].second == Lexer2::Type::op &&
(infix[i].first == "+" || (infix[i].first == "+" ||
infix[i].first == "-") && infix[i].first == "-"))
infix[i].second == Lexer::typeOperator)
{ {
++i; ++i;
if (! parseGeometric (infix, i)) if (! parseGeometric (infix, i))
@@ -566,17 +582,17 @@ bool Eval::parseArithmetic (
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Geometric --> Tag {( "*" | "/" | "%" ) Tag} // Geometric --> Tag {( "*" | "/" | "%" ) Tag}
bool Eval::parseGeometric ( bool Eval::parseGeometric (
std::vector <std::pair <std::string, Lexer::Type> >& infix, std::vector <std::pair <std::string, Lexer2::Type> >& infix,
int &i) const int &i) const
{ {
if (i < infix.size () && if (i < infix.size () &&
parseTag (infix, i)) parseTag (infix, i))
{ {
while (i < infix.size () && while (i < infix.size () &&
infix[i].second == Lexer2::Type::op &&
(infix[i].first == "*" || (infix[i].first == "*" ||
infix[i].first == "/" || infix[i].first == "/" ||
infix[i].first == "%") && infix[i].first == "%"))
infix[i].second == Lexer::typeOperator)
{ {
++i; ++i;
if (! parseTag (infix, i)) if (! parseTag (infix, i))
@@ -592,16 +608,16 @@ bool Eval::parseGeometric (
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Tag --> Unary {( "_hastag_" | "_notag_" ) Unary} // Tag --> Unary {( "_hastag_" | "_notag_" ) Unary}
bool Eval::parseTag ( bool Eval::parseTag (
std::vector <std::pair <std::string, Lexer::Type> >& infix, std::vector <std::pair <std::string, Lexer2::Type> >& infix,
int &i) const int &i) const
{ {
if (i < infix.size () && if (i < infix.size () &&
parseUnary (infix, i)) parseUnary (infix, i))
{ {
while (i < infix.size () && while (i < infix.size () &&
infix[i].second == Lexer2::Type::op &&
(infix[i].first == "_hastag_" || (infix[i].first == "_hastag_" ||
infix[i].first == "_notag_") && infix[i].first == "_notag_"))
infix[i].second == Lexer::typeOperator)
{ {
++i; ++i;
if (! parseUnary (infix, i)) if (! parseUnary (infix, i))
@@ -617,7 +633,7 @@ bool Eval::parseTag (
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Unary --> [( "-" | "+" | "!" )] Exponent // Unary --> [( "-" | "+" | "!" )] Exponent
bool Eval::parseUnary ( bool Eval::parseUnary (
std::vector <std::pair <std::string, Lexer::Type> >& infix, std::vector <std::pair <std::string, Lexer2::Type> >& infix,
int &i) const int &i) const
{ {
if (i < infix.size ()) if (i < infix.size ())
@@ -644,15 +660,15 @@ bool Eval::parseUnary (
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Exponent --> Primitive ["^" Primitive] // Exponent --> Primitive ["^" Primitive]
bool Eval::parseExponent ( bool Eval::parseExponent (
std::vector <std::pair <std::string, Lexer::Type> >& infix, std::vector <std::pair <std::string, Lexer2::Type> >& infix,
int &i) const int &i) const
{ {
if (i < infix.size () && if (i < infix.size () &&
parsePrimitive (infix, i)) parsePrimitive (infix, i))
{ {
while (i < infix.size () && while (i < infix.size () &&
infix[i].first == "^" && infix[i].second == Lexer2::Type::op &&
infix[i].second == Lexer::typeOperator) infix[i].first == "^")
{ {
++i; ++i;
if (! parsePrimitive (infix, i)) if (! parsePrimitive (infix, i))
@@ -668,7 +684,7 @@ bool Eval::parseExponent (
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Primitive --> "(" Logical ")" | Variant // Primitive --> "(" Logical ")" | Variant
bool Eval::parsePrimitive ( bool Eval::parsePrimitive (
std::vector <std::pair <std::string, Lexer::Type> >& infix, std::vector <std::pair <std::string, Lexer2::Type> >& infix,
int &i) const int &i) const
{ {
if (i < infix.size ()) if (i < infix.size ())
@@ -706,7 +722,7 @@ bool Eval::parsePrimitive (
++i; ++i;
return true; return true;
} }
else if (infix[i].second != Lexer::typeOperator) else if (infix[i].second != Lexer2::Type::op)
{ {
++i; ++i;
return true; return true;
@@ -750,32 +766,32 @@ bool Eval::parsePrimitive (
// Exit. // Exit.
// //
void Eval::infixToPostfix ( void Eval::infixToPostfix (
std::vector <std::pair <std::string, Lexer::Type> >& infix) const std::vector <std::pair <std::string, Lexer2::Type> >& infix) const
{ {
// Short circuit. // Short circuit.
if (infix.size () == 1) if (infix.size () == 1)
return; return;
// Result. // Result.
std::vector <std::pair <std::string, Lexer::Type> > postfix; std::vector <std::pair <std::string, Lexer2::Type> > postfix;
// Shunting yard. // Shunting yard.
std::vector <std::pair <std::string, Lexer::Type> > op_stack; std::vector <std::pair <std::string, Lexer2::Type> > op_stack;
// Operator characteristics. // Operator characteristics.
char type; char type;
int precedence; int precedence;
char associativity; char associativity;
std::vector <std::pair <std::string, Lexer::Type> >::iterator token; std::vector <std::pair <std::string, Lexer2::Type> >::iterator token;
for (token = infix.begin (); token != infix.end (); ++token) for (token = infix.begin (); token != infix.end (); ++token)
{ {
if (token->second == Lexer::typeOperator && if (token->second == Lexer2::Type::op &&
token->first == "(") token->first == "(")
{ {
op_stack.push_back (*token); op_stack.push_back (*token);
} }
else if (token->second == Lexer::typeOperator && else if (token->second == Lexer2::Type::op &&
token->first == ")") token->first == ")")
{ {
while (op_stack.size () && while (op_stack.size () &&
@@ -790,7 +806,7 @@ void Eval::infixToPostfix (
else else
throw std::string ("Mismatched parentheses in expression"); throw std::string ("Mismatched parentheses in expression");
} }
else if (token->second == Lexer::typeOperator && else if (token->second == Lexer2::Type::op &&
identifyOperator (token->first, type, precedence, associativity)) identifyOperator (token->first, type, precedence, associativity))
{ {
char type2; char type2;
@@ -849,22 +865,20 @@ bool Eval::identifyOperator (
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
std::string Eval::dump ( std::string Eval::dump (
std::vector <std::pair <std::string, Lexer::Type> >& tokens) const std::vector <std::pair <std::string, Lexer2::Type> >& tokens) const
{ {
// Set up a color mapping. // Set up a color mapping.
std::map <Lexer::Type, Color> color_map; std::map <Lexer2::Type, Color> color_map;
color_map[Lexer::typeNone] = Color ("rgb000 on gray6"); color_map[Lexer2::Type::op] = Color ("gray14 on gray6");
color_map[Lexer::typeOperator] = Color ("gray14 on gray6"); color_map[Lexer2::Type::number] = Color ("rgb530 on gray6");
color_map[Lexer::typeNumber] = Color ("rgb530 on gray6"); color_map[Lexer2::Type::hex] = Color ("rgb303 on gray6");
color_map[Lexer::typeHex] = Color ("rgb303 on gray6"); color_map[Lexer2::Type::string] = Color ("rgb550 on gray6");
color_map[Lexer::typeDecimal] = Color ("rgb530 on gray6"); color_map[Lexer2::Type::identifier] = Color ("rgb035 on gray6");
color_map[Lexer::typeString] = Color ("rgb550 on gray6"); color_map[Lexer2::Type::date] = Color ("rgb150 on gray6");
color_map[Lexer::typeIdentifier] = Color ("rgb035 on gray6"); color_map[Lexer2::Type::duration] = Color ("rgb531 on gray6");
color_map[Lexer::typeDate] = Color ("rgb150 on gray6");
color_map[Lexer::typeDuration] = Color ("rgb531 on gray6");
std::string output; std::string output;
std::vector <std::pair <std::string, Lexer::Type> >::const_iterator i; std::vector <std::pair <std::string, Lexer2::Type> >::const_iterator i;
for (i = tokens.begin (); i != tokens.end (); ++i) for (i = tokens.begin (); i != tokens.end (); ++i)
{ {
if (i != tokens.begin ()) if (i != tokens.begin ())
@@ -874,7 +888,7 @@ std::string Eval::dump (
if (color_map[i->second].nontrivial ()) if (color_map[i->second].nontrivial ())
c = color_map[i->second]; c = color_map[i->second];
else else
c = color_map[Lexer::typeNone]; c = Color ("rgb000 on gray6");
output += c.colorize (i->first); output += c.colorize (i->first);
} }

View File

@@ -29,7 +29,7 @@
#include <vector> #include <vector>
#include <string> #include <string>
#include <Lexer.h> #include <Lexer2.h>
#include <Variant.h> #include <Variant.h>
class Eval class Eval
@@ -53,28 +53,28 @@ public:
static void getBinaryOperators (std::vector <std::string>&); static void getBinaryOperators (std::vector <std::string>&);
private: private:
void evaluatePostfixStack (const std::vector <std::pair <std::string, Lexer::Type> >&, Variant&) const; void evaluatePostfixStack (const std::vector <std::pair <std::string, Lexer2::Type> >&, Variant&) const;
void infixToPostfix (std::vector <std::pair <std::string, Lexer::Type> >&) const; void infixToPostfix (std::vector <std::pair <std::string, Lexer2::Type> >&) const;
void infixParse (std::vector <std::pair <std::string, Lexer::Type> >&) const; void infixParse (std::vector <std::pair <std::string, Lexer2::Type> >&) const;
bool parseLogical (std::vector <std::pair <std::string, Lexer::Type> >&, int &) const; bool parseLogical (std::vector <std::pair <std::string, Lexer2::Type> >&, int &) const;
bool parseRegex (std::vector <std::pair <std::string, Lexer::Type> >&, int &) const; bool parseRegex (std::vector <std::pair <std::string, Lexer2::Type> >&, int &) const;
bool parseEquality (std::vector <std::pair <std::string, Lexer::Type> >&, int &) const; bool parseEquality (std::vector <std::pair <std::string, Lexer2::Type> >&, int &) const;
bool parseComparative (std::vector <std::pair <std::string, Lexer::Type> >&, int &) const; bool parseComparative (std::vector <std::pair <std::string, Lexer2::Type> >&, int &) const;
bool parseArithmetic (std::vector <std::pair <std::string, Lexer::Type> >&, int &) const; bool parseArithmetic (std::vector <std::pair <std::string, Lexer2::Type> >&, int &) const;
bool parseGeometric (std::vector <std::pair <std::string, Lexer::Type> >&, int &) const; bool parseGeometric (std::vector <std::pair <std::string, Lexer2::Type> >&, int &) const;
bool parseTag (std::vector <std::pair <std::string, Lexer::Type> >&, int &) const; bool parseTag (std::vector <std::pair <std::string, Lexer2::Type> >&, int &) const;
bool parseUnary (std::vector <std::pair <std::string, Lexer::Type> >&, int &) const; bool parseUnary (std::vector <std::pair <std::string, Lexer2::Type> >&, int &) const;
bool parseExponent (std::vector <std::pair <std::string, Lexer::Type> >&, int &) const; bool parseExponent (std::vector <std::pair <std::string, Lexer2::Type> >&, int &) const;
bool parsePrimitive (std::vector <std::pair <std::string, Lexer::Type> >&, int &) const; bool parsePrimitive (std::vector <std::pair <std::string, Lexer2::Type> >&, int &) const;
bool identifyOperator (const std::string&, char&, int&, char&) const; bool identifyOperator (const std::string&, char&, int&, char&) const;
std::string dump (std::vector <std::pair <std::string, Lexer::Type> >&) const; std::string dump (std::vector <std::pair <std::string, Lexer2::Type> >&) const;
private: private:
std::vector <bool (*)(const std::string&, Variant&)> _sources; std::vector <bool (*)(const std::string&, Variant&)> _sources;
bool _ambiguity; bool _ambiguity;
bool _debug; bool _debug;
std::vector <std::pair <std::string, Lexer::Type> > _compiled; std::vector <std::pair <std::string, Lexer2::Type> > _compiled;
}; };

View File

@@ -1,898 +0,0 @@
////////////////////////////////////////////////////////////////////////////////
//
// Copyright 2013 - 2015, Paul Beckingham, Federico Hernandez.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
// http://www.opensource.org/licenses/mit-license.php
//
////////////////////////////////////////////////////////////////////////////////
#include <cmake.h>
#include <ctype.h>
#include <utf8.h>
#include <ISO8601.h>
#include <Date.h>
#include <Duration.h>
#include <Lexer.h>
#include <i18n.h>
// Overridable static configuration:
//   dateFormat - legacy rc.dateformat pattern ("" disables legacy parsing)
//   isoEnabled - whether ISO-8601 date parsing is attempted first
std::string Lexer::dateFormat = "";
bool Lexer::isoEnabled = true;
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Constructor: primes a 4-character lookahead window (_n0.._n3) over 'input'.
// The lookahead registers are initialized to 32 (ASCII space) before the
// priming shifts overwrite them.
Lexer::Lexer (const std::string& input)
: _input (input)
, _i (0)
, _shift_counter (0)
, _n0 (32)
, _n1 (32)
, _n2 (32)
, _n3 (32)
, _boundary01 (false)
, _boundary12 (false)
, _boundary23 (false)
, _ambiguity (true)
{
  // Read 4 chars in preparation.  Even if there are < 4.  Take a deep breath.
  shift ();
  shift ();
  shift ();
  shift ();

  // Reset because the four shifts above do not represent advancement into the
  // _input.  All subsequent shifts do, though.
  _shift_counter = 0;
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Destructor: nothing to release; all members clean up themselves.
Lexer::~Lexer ()
{
}
////////////////////////////////////////////////////////////////////////////////
// Walk the input string, looking for transitions.
////////////////////////////////////////////////////////////////////////////////
// Walk the input string, looking for transitions.
//
// Produces one lexeme per call: the text in 'result', its classification in
// 'type'.  Returns true when a token was produced, false at end of input.
// Implemented as a state machine where 'type' doubles as the current state,
// driven by the lookahead characters _n0.._n3.
bool Lexer::token (std::string& result, Type& type)
{
  // Start with nothing.
  result = "";

  // Different types of matching quote: ', ".
  int quote = 0;

  type = typeNone;
  while (_n0)
  {
    switch (type)
    {
    case typeNone:
      // Initial state: classify the token that _n0 begins.
      if (is_ws (_n0))
        shift ();
      else if (_n0 == '"' || _n0 == '\'')
      {
        type = typeString;
        quote = _n0;
        result += utf8_character (_n0);
        shift ();
      }
      else if (_n0 == '0' &&
               _n1 == 'x' &&
               is_hex_digit (_n2))
      {
        // Hex literal: consume '0', 'x' and the first hex digit.
        type = typeHex;
        result += utf8_character (_n0);
        shift ();
        result += utf8_character (_n0);
        shift ();
        result += utf8_character (_n0);
        shift ();
      }
      else if (is_dec_digit (_n0))
      {
        // Speculatively try a date and duration parse.  Longest wins.
        if (is_date (result))
        {
          type = typeDate;
          return true;
        }

        if (is_duration (result))
        {
          type = typeDuration;
          return true;
        }

        type = typeNumber;
        result += utf8_character (_n0);
        shift ();
      }
      else if (_n0 == '.' && is_dec_digit (_n1))
      {
        // Decimal with no leading digits, e.g. '.5'.
        type = typeDecimal;
        result += utf8_character (_n0);
        shift ();
      }
      else if ((_n0 == '+' || _n0 == '-') && is_ident_start (_n1))
      {
        // '+tag' / '-tag'.
        type = typeTag;
        result += utf8_character (_n0);
        shift ();
      }
      else if (is_triple_op (_n0, _n1, _n2))
      {
        // Operators are complete tokens: return immediately.
        type = typeOperator;
        result += utf8_character (_n0);
        shift ();
        result += utf8_character (_n0);
        shift ();
        result += utf8_character (_n0);
        shift ();
        return true;
      }
      else if (is_double_op (_n0, _n1, _n2))
      {
        type = typeOperator;
        result += utf8_character (_n0);
        shift ();
        result += utf8_character (_n0);
        shift ();
        return true;
      }
      else if (is_single_op (_n0))
      {
        type = typeOperator;
        result += utf8_character (_n0);
        shift ();
        return true;
      }
      else if (_n0 == '\\')
      {
        // Backslash outside a string introduces an escaped identifier.
        type = typeIdentifierEscape;
        shift ();
      }
      else if (is_ident_start (_n0))
      {
        // Named dates/durations ('today', '2wks', ...) are recognized before
        // falling through to a generic identifier.
        if (is_date (result))
        {
          type = typeDate;
          return true;
        }

        if (is_duration (result))
        {
          type = typeDuration;
          return true;
        }

        type = typeIdentifier;
        result += utf8_character (_n0);
        shift ();
      }
      else
        throw std::string (STRING_LEX_IMMEDIATE_UNK);
      break;

    case typeString:
      if (_n0 == quote)
      {
        // Matching close quote terminates the string token.
        result += utf8_character (_n0);
        shift ();
        quote = 0;
        return true;
      }
      else if (_n0 == '\\')
      {
        type = typeEscape;
        shift ();
      }
      else
      {
        result += utf8_character (_n0);
        shift ();
      }
      break;

    case typeTag:
      if (is_ident_start (_n0))
      {
        result += utf8_character (_n0);
        shift ();
      }
      else
      {
        return true;
      }
      break;

    case typeIdentifier:
      if (is_ident (_n0))
      {
        result += utf8_character (_n0);
        shift ();
      }
      else
      {
        // typeIdentifier is a catch-all type. Anything word-like becomes an
        // identifier. At this point in the processing, an identifier is found,
        // and can be matched against a list of potential upgrades.
        if (result == "_hastag_" ||
            result == "_notag_" ||
            result == "_neg_" ||
            result == "_pos_")
          type = typeOperator;

        return true;
      }
      break;

    case typeIdentifierEscape:
      if (_n0 == 'u')
      {
        type = typeEscapeUnicode;
        shift ();
      }
      else
      {
        // Not a unicode escape: emit the pending quote (if any) and the
        // character verbatim.
        type = quote ? typeString : typeIdentifier;
        result += utf8_character (quote);
        result += utf8_character (_n0);
        shift ();
      }
      break;

    case typeEscape:
      if (_n0 == 'x')
      {
        type = typeEscapeHex;
        shift ();
      }
      else if (_n0 == 'u')
      {
        type = typeEscapeUnicode;
        shift ();
      }
      else
      {
        // Unrecognized escape: keep backslash and character literally.
        result += '\\';
        result += utf8_character (_n0);
        type = quote ? typeString : typeIdentifier;
        shift ();
      }
      break;

    case typeEscapeHex:
      if (is_hex_digit (_n0) && is_hex_digit (_n1))
      {
        // \xNN --> one character.
        result += utf8_character (hex_to_int (_n0, _n1));
        type = quote ? typeString : typeIdentifier;
        shift ();
        shift ();
      }
      else
      {
        // Malformed \x escape terminates the token.
        type = quote ? typeString : typeIdentifier;
        shift ();
        quote = 0;
        return true;
      }
      break;

    case typeEscapeUnicode:
      if (is_hex_digit (_n0) &&
          is_hex_digit (_n1) &&
          is_hex_digit (_n2) &&
          is_hex_digit (_n3))
      {
        // \uNNNN --> one Unicode character.
        result += utf8_character (hex_to_int (_n0, _n1, _n2, _n3));
        shift ();
        shift ();
        shift ();
        shift ();
        type = quote ? typeString : typeIdentifier;
      }
      else if (_n0 == quote)
      {
        type = typeString;
        shift ();
        quote = 0;
        return true;
      }
      break;

    case typeNumber:
      if (is_dec_digit (_n0))
      {
        result += utf8_character (_n0);
        shift ();
      }
      else if (_n0 == '.')
      {
        type = typeDecimal;
        result += utf8_character (_n0);
        shift ();
      }
      else if (_n0 == 'e' || _n0 == 'E')
      {
        type = typeExponentIndicator;
        result += utf8_character (_n0);
        shift ();
      }
      else if (is_ident_start (_n0))
      {
        // Digits followed by word characters, e.g. '1foo' --> identifier.
        type = typeIdentifier;
        result += utf8_character (_n0);
        shift ();
      }
      else
      {
        return true;
      }
      break;

    case typeDecimal:
      if (is_dec_digit (_n0))
      {
        result += utf8_character (_n0);
        shift ();
      }
      else if (_n0 == 'e' || _n0 == 'E')
      {
        type = typeExponentIndicator;
        result += utf8_character (_n0);
        shift ();
      }
      else if (is_ident_start (_n0))
      {
        type = typeIdentifier;
        result += utf8_character (_n0);
        shift ();
      }
      else
      {
        return true;
      }
      break;

    case typeExponentIndicator:
      // After 'e'/'E': optional sign, then digits.
      if (_n0 == '+' || _n0 == '-')
      {
        result += utf8_character (_n0);
        shift ();
      }
      else if (is_dec_digit (_n0))
      {
        type = typeExponent;
        result += utf8_character (_n0);
        shift ();
      }
      else if (is_ident_start (_n0))
      {
        type = typeIdentifier;
        result += utf8_character (_n0);
        shift ();
      }
      break;

    case typeExponent:
      if (is_dec_digit (_n0) || _n0 == '.')
      {
        result += utf8_character (_n0);
        shift ();
      }
      else
      {
        // Exponent forms are reported as typeDecimal.
        type = typeDecimal;
        return true;
      }
      break;

    case typeHex:
      if (is_hex_digit (_n0))
      {
        result += utf8_character (_n0);
        shift ();
      }
      else
      {
        return true;
      }
      break;

    default:
      throw std::string (STRING_LEX_TYPE_UNK);
      break;
    }

    // Fence post.
    if (!_n0 && result != "")
      return true;
  }

  return false;
}
////////////////////////////////////////////////////////////////////////////////
// Just like Lexer::token, but no operators, numbers, dates or durations.
////////////////////////////////////////////////////////////////////////////////
// Just like Lexer::token, but no operators, numbers, dates or durations.
//
// Splits on whitespace only, honoring quotes and backslash escapes; every
// produced token is reported as typeString.
bool Lexer::word (std::string& token, Type& type)
{
  // Start with nothing.
  token = "";

  // Different types of matching quote: ', ".
  int quote = 0;

  type = typeNone;
  while (_n0)
  {
    switch (type)
    {
    case typeNone:
      if (is_ws (_n0))
        shift ();
      else if (_n0 == '"' || _n0 == '\'')
      {
        type = typeString;
        quote = _n0;
        token += utf8_character (_n0);
        shift ();
      }
      else
      {
        // Any other character begins an unquoted word.
        type = typeString;
        token += utf8_character (_n0);
        shift ();
      }
      break;

    case typeString:
      if (_n0 == quote)
      {
        // Matching close quote terminates the word.
        token += utf8_character (_n0);
        shift ();
        quote = 0;
        return true;
      }
      else if (_n0 == '\\')
      {
        type = typeEscape;
        shift ();
      }
      else if (! quote && is_ws (_n0))
      {
        // Unquoted words end at whitespace.
        shift ();
        return true;
      }
      else
      {
        token += utf8_character (_n0);
        shift ();
      }
      break;

    case typeEscape:
      if (_n0 == 'x')
      {
        type = typeEscapeHex;
        shift ();
      }
      else if (_n0 == 'u')
      {
        type = typeEscapeUnicode;
        shift ();
      }
      else
      {
        // Unrecognized escape: keep backslash and character literally.
        token += '\\';
        token += utf8_character (_n0);
        type = typeString;
        shift ();
      }
      break;

    case typeEscapeHex:
      if (is_hex_digit (_n0) && is_hex_digit (_n1))
      {
        // \xNN --> one character.
        token += utf8_character (hex_to_int (_n0, _n1));
        type = typeString;
        shift ();
        shift ();
      }
      else
      {
        // Malformed \x escape terminates the word.
        type = typeString;
        shift ();
        quote = 0;
        return true;
      }
      break;

    case typeEscapeUnicode:
      if (is_hex_digit (_n0) &&
          is_hex_digit (_n1) &&
          is_hex_digit (_n2) &&
          is_hex_digit (_n3))
      {
        // \uNNNN --> one Unicode character.
        token += utf8_character (hex_to_int (_n0, _n1, _n2, _n3));
        shift ();
        shift ();
        shift ();
        shift ();
        type = typeString;
      }
      else if (_n0 == quote)
      {
        type = typeString;
        shift ();
        quote = 0;
        return true;
      }
      break;

    default:
      throw std::string (STRING_LEX_TYPE_UNK);
      break;
    }

    // Fence post.
    if (!_n0 && token != "")
      return true;
  }

  return false;
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Enables/disables ambiguous date parsing (forwarded to ISO8601d in is_date).
// Defaults to true (see constructor).
void Lexer::ambiguity (bool value)
{
  _ambiguity = value;
}
////////////////////////////////////////////////////////////////////////////////
// No L10N - these are for internal purposes.
////////////////////////////////////////////////////////////////////////////////
// No L10N - these are for internal purposes.
//
// Returns a human-readable name for a token type, for debug/diagnostic output.
const std::string Lexer::type_name (const Type& type)
{
  switch (type)
  {
  case Lexer::typeNone:              return "None";
  case Lexer::typeString:            return "String";
  case Lexer::typeIdentifier:        return "Identifier";
  case Lexer::typeIdentifierEscape:  return "IdentifierEscape";
  case Lexer::typeNumber:            return "Number";
  case Lexer::typeDecimal:           return "Decimal";
  case Lexer::typeExponentIndicator: return "ExponentIndicator";
  case Lexer::typeExponent:          return "Exponent";
  case Lexer::typeHex:               return "Hex";
  case Lexer::typeOperator:          return "Operator";
  case Lexer::typeEscape:            return "Escape";
  case Lexer::typeEscapeHex:         return "EscapeHex";
  case Lexer::typeEscapeUnicode:     return "EscapeUnicode";
  case Lexer::typeDate:              return "Date";
  case Lexer::typeDuration:          return "Duration";
  case Lexer::typeTag:               return "Tag";
  }

  // All enumerators are handled above, but without this fallback control
  // could flow off the end of a value-returning function (undefined behavior)
  // if an out-of-range value were cast to Type.
  return "Unknown";
}
////////////////////////////////////////////////////////////////////////////////
// Complete Unicode whitespace list.
//
// http://en.wikipedia.org/wiki/Whitespace_character
// Updated 2013-11-18
////////////////////////////////////////////////////////////////////////////////
// Complete Unicode whitespace list.
//
// http://en.wikipedia.org/wiki/Whitespace_character
// Updated 2013-11-18
bool Lexer::is_ws (int c)
{
  switch (c)
  {
  case 0x0020:   // space                      Common    Separator, space
  case 0x0009:   // HT, Horizontal Tab         Common    Other, control
  case 0x000A:   // LF, Line feed              Common    Other, control
  case 0x000B:   // VT, Vertical Tab           Common    Other, control
  case 0x000C:   // FF, Form feed              Common    Other, control
  case 0x000D:   // CR, Carriage return        Common    Other, control
  case 0x0085:   // NEL, Next line             Common    Other, control
  case 0x00A0:   // no-break space             Common    Separator, space
  case 0x1680:   // ogham space mark           Ogham     Separator, space
  case 0x180E:   // mongolian vowel separator  Mongolian Separator, space
  case 0x2000:   // en quad                    Common    Separator, space
  case 0x2001:   // em quad                    Common    Separator, space
  case 0x2002:   // en space                   Common    Separator, space
  case 0x2003:   // em space                   Common    Separator, space
  case 0x2004:   // three-per-em space         Common    Separator, space
  case 0x2005:   // four-per-em space          Common    Separator, space
  case 0x2006:   // six-per-em space           Common    Separator, space
  case 0x2007:   // figure space               Common    Separator, space
  case 0x2008:   // punctuation space          Common    Separator, space
  case 0x2009:   // thin space                 Common    Separator, space
  case 0x200A:   // hair space                 Common    Separator, space
  case 0x2028:   // line separator             Common    Separator, line
  case 0x2029:   // paragraph separator        Common    Separator, paragraph
  case 0x202F:   // narrow no-break space      Common    Separator, space
  case 0x205F:   // medium mathematical space  Common    Separator, space
  case 0x3000:   // ideographic space          Common    Separator, space
    return true;

  default:
    return false;
  }
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// A character may start an identifier if it is not NUL, not whitespace, not a
// decimal digit and not a single-character operator.
bool Lexer::is_ident_start (int c)
{
  if (c == 0)           return false;   // Null character check.
  if (is_ws (c))        return false;
  if (is_dec_digit (c)) return false;
  if (is_single_op (c)) return false;

  return true;
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// A character may continue an identifier if it is not NUL, not whitespace and
// not a single-character operator.  Unlike is_ident_start, digits are allowed.
bool Lexer::is_ident (int c)
{
  if (c == 0)           return false;   // Null character check.
  if (is_ws (c))        return false;
  if (is_single_op (c)) return false;

  return true;
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Single-character operators recognized by the lexer.
bool Lexer::is_single_op (int c)
{
  switch (c)
  {
  case '+': case '-': case '*': case '/':
  case '(': case ')': case '<': case '>':
  case '^': case '!': case '%': case '=':
  case '~':
    return true;

  default:
    return false;
  }
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// ASCII decimal digit test; deliberately locale-independent.
bool Lexer::is_dec_digit (int c)
{
  return '0' <= c && c <= '9';
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// True when a token boundary exists between two adjacent characters: the
// alpha/digit/space classifications differ (XOR), or either side is
// punctuation (OR).
bool Lexer::boundary (int left, int right)
{
  return (isalpha (left) != isalpha (right)) ||
         (isdigit (left) != isdigit (right)) ||
         (isspace (left) != isspace (right)) ||
         ispunct (left)                      ||
         ispunct (right);
}
////////////////////////////////////////////////////////////////////////////////
// Split 'input' into 'words' on Lexer::is_ws boundaries, observing quotes.
void Lexer::word_split (std::vector <std::string>& words, const std::string& input)
{
words.clear ();
std::string word;
Lexer::Type type;
Lexer lex (input);
while (lex.word (word, type))
words.push_back (word);
}
////////////////////////////////////////////////////////////////////////////////
// Split 'input' into 'tokens'.
void Lexer::token_split (std::vector <std::string>& words, const std::string& input)
{
words.clear ();
std::string word;
Lexer::Type type;
Lexer lex (input);
while (lex.token (word, type))
words.push_back (word);
}
////////////////////////////////////////////////////////////////////////////////
// Split 'input' into 'tokens', preserving type.
void Lexer::token_split (std::vector <std::pair <std::string, Lexer::Type> >& lexemes, const std::string& input)
{
lexemes.clear ();
std::string word;
Lexer::Type type;
Lexer lex (input);
while (lex.token (word, type))
lexemes.push_back (std::pair <std::string, Lexer::Type>(word, type));
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Strips one pair of matching quotes (' or ") from the ends of 'input',
// in place.  Leaves the string untouched if the quotes do not match.
void Lexer::dequote (std::string& input)
{
  size_t len = input.length ();

  // Guard: with fewer than two characters there is no quote *pair*.  The
  // original code dequoted a lone quote character against itself (input[0]
  // and input[len - 1] are the same char), silently emptying the string.
  if (len < 2)
    return;

  int quote = input[0];
  if ((quote == '\'' || quote == '"') &&
      quote == input[len - 1])
  {
    input = input.substr (1, len - 2);
  }
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Attempts to parse a date at the current input position.  On success stores
// the matched text in 'result', advances the lexer past it, and returns true.
// ISO-8601 is tried first, then the legacy rc.dateformat.
bool Lexer::is_date (std::string& result)
{
  // Try an ISO date parse.
  if (isoEnabled)
  {
    std::string::size_type iso_i = 0;
    std::string iso_result;
    ISO8601d iso;
    iso.ambiguity (_ambiguity);
    if (iso.parse (_input.substr (_shift_counter), iso_i))
    {
      // Consume exactly the characters the ISO parser accepted.
      // NOTE(review): iso_i counts bytes, while shift () advances one UTF-8
      // character at a time — presumably dates are ASCII-only; confirm.
      result = _input.substr (_shift_counter, iso_i);
      while (iso_i--) shift ();
      return true;
    }
  }

  // Try a legacy rc.dateformat parse here.
  if (Lexer::dateFormat != "")
  {
    try
    {
      std::string::size_type legacy_i = 0;
      Date legacyDate (_input.substr (_shift_counter), legacy_i, Lexer::dateFormat, false, false);

      result = _input.substr (_shift_counter, legacy_i);
      while (legacy_i--) shift ();
      return true;
    }

    // A parse failure simply means "not a date" here.
    catch (...) { /* Never mind. */ }
  }

  return false;
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Attempts to parse a duration at the current input position.  On success
// stores the matched text in 'result', advances the lexer past it, and
// returns true.  ISO-8601 periods are tried first, then Duration formats.
bool Lexer::is_duration (std::string& result)
{
  std::string::size_type iso_i = 0;
  std::string iso_result;
  ISO8601p iso;
  if (iso.parse (_input.substr (_shift_counter), iso_i))
  {
    // NOTE(review): iso_i counts bytes, while shift () advances one UTF-8
    // character at a time — presumably durations are ASCII-only; confirm.
    result = _input.substr (_shift_counter, iso_i);
    while (iso_i--) shift ();
    return true;
  }

  std::string::size_type dur_i = 0;
  std::string dur_result;
  Duration dur;
  if (dur.parse (_input.substr (_shift_counter), dur_i))
  {
    result = _input.substr (_shift_counter, dur_i);
    while (dur_i--) shift ();
    return true;
  }

  return false;
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Punctuation recognized by the lexer: comma and period only.
bool Lexer::is_punct (int c) const
{
  return c == ',' ||
         c == '.';
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// A numeric character: decimal digit or decimal point.
bool Lexer::is_num (int c) const
{
  return ('0' <= c && c <= '9') ||
         c == '.';
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Three-character operators: '!==' always; the word operators 'and' and 'xor'
// only when the third character is followed by a character-class boundary.
bool Lexer::is_triple_op (int c0, int c1, int c2) const
{
  if (c0 == '!' && c1 == '=' && c2 == '=')
    return true;

  if (_boundary23)
    return (c0 == 'a' && c1 == 'n' && c2 == 'd') ||
           (c0 == 'x' && c1 == 'o' && c2 == 'r');

  return false;
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Two-character operators.  The word operator 'or' additionally requires a
// character-class boundary after the second character.  (c2 is unused, kept
// for signature symmetry with is_triple_op.)
bool Lexer::is_double_op (int c0, int c1, int c2) const
{
  if (c0 == 'o' && c1 == 'r')
    return _boundary12;

  return (c0 == '=' && c1 == '=') ||
         (c0 == '!' && c1 == '=') ||
         (c0 == '<' && c1 == '=') ||
         (c0 == '>' && c1 == '=') ||
         (c0 == '|' && c1 == '|') ||
         (c0 == '&' && c1 == '&') ||
         (c0 == '!' && c1 == '~');
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// ASCII hex digit test; deliberately locale-independent (unlike isxdigit).
bool Lexer::is_hex_digit (int c) const
{
  if ('0' <= c && c <= '9') return true;
  if ('a' <= c && c <= 'f') return true;
  if ('A' <= c && c <= 'F') return true;

  return false;
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Maps the character following a backslash to the control character it
// denotes; unrecognized characters map to themselves.
int Lexer::decode_escape (int c) const
{
  if (c == 'b')  return 0x08;   // backspace
  if (c == 'f')  return 0x0C;   // form feed
  if (c == 'n')  return 0x0A;   // newline
  if (c == 'r')  return 0x0D;   // carriage return
  if (c == 't')  return 0x09;   // horizontal tab
  if (c == 'v')  return 0x0B;   // vertical tab
  if (c == '\'') return 0x27;   // single quote
  if (c == '"')  return 0x22;   // double quote

  return c;                     // Identity for everything else.
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Converts one hex digit to its value 0..15.  Assumes a valid hex digit
// (see is_hex_digit); other input yields garbage.
int Lexer::hex_to_int (int c) const
{
  if ('0' <= c && c <= '9') return c - '0';
  if ('a' <= c && c <= 'f') return c - 'a' + 10;

  return c - 'A' + 10;
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Converts two hex digits to their value 0..255.
int Lexer::hex_to_int (int c0, int c1) const
{
  return (hex_to_int (c0) << 4) | hex_to_int (c1);
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Converts four hex digits to their value 0..65535, by combining two pairs.
int Lexer::hex_to_int (int c0, int c1, int c2, int c3) const
{
  return (hex_to_int (c0, c1) << 8) | hex_to_int (c2, c3);
}
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// Slides the 4-character lookahead window (_n0.._n3) one character to the
// left, pulling the next UTF-8 character of _input into _n3, and recomputes
// the inter-character boundary flags used by the word-operator tests.
void Lexer::shift ()
{
  _n0 = _n1;
  _n1 = _n2;
  _n2 = _n3;
  _n3 = utf8_next_char (_input, _i);
  ++_shift_counter;

  // Detect type boundaries between characters.
  _boundary01 = boundary (_n0, _n1);
  _boundary12 = boundary (_n1, _n2);
  _boundary23 = boundary (_n2, _n3);
}
////////////////////////////////////////////////////////////////////////////////

View File

@@ -1,120 +0,0 @@
////////////////////////////////////////////////////////////////////////////////
//
// Copyright 2013 - 2015, Paul Beckingham, Federico Hernandez.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
//
// http://www.opensource.org/licenses/mit-license.php
//
////////////////////////////////////////////////////////////////////////////////
#ifndef INCLUDED_LEXER
#define INCLUDED_LEXER
#include <vector>
#include <string>
// Streaming tokenizer for command-line and filter expressions.  Construct
// with the input string, then call token () or word () repeatedly.
class Lexer
{
public:
  // Overridable static configuration (see Lexer.cpp for defaults).
  static std::string dateFormat;
  static bool isoEnabled;

  // Token classifications.  Values marked 'Intermediate' are internal state
  // machine states and are not returned to callers.
  enum Type
  {
    typeNone = 0,
    typeString,
    typeIdentifier,
    typeIdentifierEscape,   // Intermediate
    typeEscape,             // Intermediate
    typeEscapeHex,          // Intermediate
    typeEscapeUnicode,      // Intermediate
    typeNumber,
    typeDecimal,
    typeExponentIndicator,  // Intermediate
    typeExponent,           // Intermediate
    typeHex,
    typeOperator,
    typeDate,
    typeDuration,
    typeTag,

/*
  Recognizing more types means that Lexer::*_split and Lexer::token approach
  the ideal form, whereby the command line becomes just one string that is
  lexed into tokens. Those tokens are then simply dissected by type..

    typeUUID,
    typePattern,
    typeSubstitution,
    typeNameValue,
*/
  };

  Lexer (const std::string&);
  virtual ~Lexer ();
  Lexer (const Lexer&);            // Not implemented.
  Lexer& operator= (const Lexer&); // Not implemented.
  bool operator== (const Lexer&);  // Not implemented.

  // Produce the next token / whitespace-delimited word; false at end of input.
  bool token (std::string&, Type&);
  bool word (std::string&, Type&);
  void ambiguity (bool);

  static const std::string type_name (const Type&);
  static bool is_ws (int);
  static bool is_ident_start (int);
  static bool is_ident (int);
  static bool is_single_op (int);
  static bool is_dec_digit (int);
  static bool boundary (int, int);
  static void word_split (std::vector <std::string>&, const std::string&);
  static void token_split (std::vector <std::string>&, const std::string&);
  static void token_split (std::vector <std::pair <std::string, Lexer::Type> >&, const std::string&);
  static void dequote (std::string&);

private:
  bool is_date (std::string&);
  bool is_duration (std::string&);
  bool is_punct (int) const;
  bool is_num (int) const;
  bool is_triple_op (int, int, int) const;
  bool is_double_op (int, int, int) const;
  bool is_hex_digit (int) const;
  int decode_escape (int) const;
  int hex_to_int (int) const;
  int hex_to_int (int, int) const;
  int hex_to_int (int, int, int, int) const;
  void shift ();

private:
  const std::string _input;         // The full text being lexed.
  std::string::size_type _i;        // Byte offset of the next char to read.
  std::string::size_type _shift_counter;  // Characters consumed so far.
  int _n0;                          // 4-character lookahead window.
  int _n1;
  int _n2;
  int _n3;
  bool _boundary01;                 // Character-class boundaries between
  bool _boundary12;                 // adjacent lookahead characters.
  bool _boundary23;
  bool _ambiguity;                  // Allow ambiguous date parses.
};
#endif
////////////////////////////////////////////////////////////////////////////////

View File

@@ -37,13 +37,13 @@ static const int uuid_min_length = 8;
std::string Lexer2::dateFormat = ""; std::string Lexer2::dateFormat = "";
bool Lexer2::isoEnabled = true; bool Lexer2::isoEnabled = true;
bool Lexer2::ambiguity = true;
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
Lexer2::Lexer2 (const std::string& text) Lexer2::Lexer2 (const std::string& text)
: _text (text) : _text (text)
, _cursor (0) , _cursor (0)
, _eos (text.size ()) , _eos (text.size ())
, _ambiguity (false)
{ {
} }
@@ -52,6 +52,12 @@ Lexer2::~Lexer2 ()
{ {
} }
////////////////////////////////////////////////////////////////////////////////
void Lexer2::ambiguity (bool value)
{
_ambiguity = value;
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// When a Lexer2 object is constructed with a string, this method walks through // When a Lexer2 object is constructed with a string, this method walks through
// the stream of low-level tokens. // the stream of low-level tokens.
@@ -417,7 +423,7 @@ bool Lexer2::isDate (std::string& token, Lexer2::Type& type)
{ {
std::size_t iso_i = 0; std::size_t iso_i = 0;
ISO8601d iso; ISO8601d iso;
iso.ambiguity (Lexer2::ambiguity); iso.ambiguity (_ambiguity);
if (iso.parse (_text.substr (_cursor), iso_i)) if (iso.parse (_text.substr (_cursor), iso_i))
{ {
type = Lexer2::Type::date; type = Lexer2::Type::date;
@@ -504,10 +510,13 @@ bool Lexer2::isUUID (std::string& token, Lexer2::Type& type)
if (i >= uuid_min_length) if (i >= uuid_min_length)
{ {
token = _text.substr (_cursor, i + 1); token = _text.substr (_cursor, i);
type = Lexer2::Type::uuid; if (! isAllDigits (token))
_cursor += i; {
return true; type = Lexer2::Type::uuid;
_cursor += i;
return true;
}
} }
return false; return false;
@@ -545,7 +554,7 @@ bool Lexer2::isHexNumber (std::string& token, Lexer2::Type& type)
// Lexer2::Type::number // Lexer2::Type::number
// \d+ // \d+
// [ . \d+ ] // [ . \d+ ]
// [ e|E [ +|- ] \d+ ] // [ e|E [ +|- ] \d+ [ . \d+ ] ]
bool Lexer2::isNumber (std::string& token, Lexer2::Type& type) bool Lexer2::isNumber (std::string& token, Lexer2::Type& type)
{ {
std::size_t marker = _cursor; std::size_t marker = _cursor;
@@ -581,6 +590,17 @@ bool Lexer2::isNumber (std::string& token, Lexer2::Type& type)
++marker; ++marker;
while (isDigit (_text[marker])) while (isDigit (_text[marker]))
utf8_next_char (_text, marker); utf8_next_char (_text, marker);
if (_text[marker] == '.')
{
++marker;
if (isDigit (_text[marker]))
{
++marker;
while (isDigit (_text[marker]))
utf8_next_char (_text, marker);
}
}
} }
} }
@@ -667,7 +687,7 @@ bool Lexer2::isURL (std::string& token, Lexer2::Type& type)
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Lexer2::Type::pair // Lexer2::Type::pair
// <identifier> : [ <string> | <word> ] // <identifier> :|= [ <string> | <word> ]
bool Lexer2::isPair (std::string& token, Lexer2::Type& type) bool Lexer2::isPair (std::string& token, Lexer2::Type& type)
{ {
std::size_t marker = _cursor; std::size_t marker = _cursor;
@@ -698,11 +718,18 @@ bool Lexer2::isPair (std::string& token, Lexer2::Type& type)
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Lexer2::Type::tag // Lexer2::Type::tag
// [ +|- ] <isIdentifierStart> [ <isIdentifierNext> ]* // ^ | <isWhiteSpace> [ +|- ] <isIdentifierStart> [ <isIdentifierNext> ]*
bool Lexer2::isTag (std::string& token, Lexer2::Type& type) bool Lexer2::isTag (std::string& token, Lexer2::Type& type)
{ {
std::size_t marker = _cursor; std::size_t marker = _cursor;
// This test requires a tag to have a preceding space or start a string.
// bad: 'a+b' --> identifier tag
// good: 'a+b' --> identifier op identifier
if (marker > 0 &&
! isWhitespace (_text[marker - 1]))
return false;
if (_text[marker] == '+' || if (_text[marker] == '+' ||
_text[marker] == '-') _text[marker] == '-')
{ {
@@ -926,7 +953,7 @@ bool Lexer2::isWord (std::string& token, Lexer2::Type& type)
{ {
std::size_t marker = _cursor; std::size_t marker = _cursor;
while (! isWhitespace (_text[marker])) while (_text[marker] && ! isWhitespace (_text[marker]))
utf8_next_char (_text, marker); utf8_next_char (_text, marker);
if (marker > _cursor) if (marker > _cursor)

View File

@@ -40,7 +40,6 @@ public:
// These are overridable. // These are overridable.
static std::string dateFormat; static std::string dateFormat;
static bool isoEnabled; static bool isoEnabled;
static bool ambiguity;
enum class Type { uuid, number, hex, enum class Type { uuid, number, hex,
string, string,
@@ -54,6 +53,7 @@ public:
Lexer2 (const std::string&); Lexer2 (const std::string&);
~Lexer2 (); ~Lexer2 ();
void ambiguity (bool);
bool token (std::string&, Lexer2::Type&); bool token (std::string&, Lexer2::Type&);
static std::vector <std::pair <std::string, Lexer2::Type>> tokens (const std::string&); static std::vector <std::pair <std::string, Lexer2::Type>> tokens (const std::string&);
static std::vector <std::string> split (const std::string&); static std::vector <std::string> split (const std::string&);
@@ -101,8 +101,9 @@ public:
private: private:
std::string _text; std::string _text;
std::size_t _cursor = 0; std::size_t _cursor;
std::size_t _eos = 0; std::size_t _eos;
bool _ambiguity;
}; };
#endif #endif

View File

@@ -32,7 +32,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <Context.h> #include <Context.h>
#include <Filter.h> #include <Filter.h>
#include <Lexer.h> #include <Lexer2.h>
#include <ViewTask.h> #include <ViewTask.h>
#include <i18n.h> #include <i18n.h>
#include <text.h> #include <text.h>
@@ -83,8 +83,8 @@ int CmdCustom::execute (std::string& output)
// Prepend the argument list with those from the report filter. // Prepend the argument list with those from the report filter.
std::string lexeme; std::string lexeme;
Lexer::Type type; Lexer2::Type type;
Lexer lex (reportFilter); Lexer2 lex (reportFilter);
lex.ambiguity (false); lex.ambiguity (false);
while (lex.token (lexeme, type)) while (lex.token (lexeme, type))
context.cli.add (lexeme); context.cli.add (lexeme);

View File

@@ -28,7 +28,7 @@
#include <iostream> #include <iostream>
#include <vector> #include <vector>
#include <test.h> #include <test.h>
#include <Lexer.h> #include <Lexer2.h>
#include <Context.h> #include <Context.h>
Context context; Context context;
@@ -36,360 +36,349 @@ Context context;
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main (int argc, char** argv) int main (int argc, char** argv)
{ {
UnitTest t (212); UnitTest t (211);
std::vector <std::pair <std::string, Lexer::Type> > tokens; std::vector <std::pair <std::string, Lexer2::Type> > tokens;
std::string token; std::string token;
Lexer::Type type; Lexer2::Type type;
// White space detection. // White space detection.
t.notok (Lexer::is_ws (0x0041), "U+0041 (A) is not ws"); t.notok (Lexer2::isWhitespace (0x0041), "U+0041 (A) ! isWhitespace");
t.ok (Lexer::is_ws (0x0020), "U+0020 is_ws"); t.ok (Lexer2::isWhitespace (0x0020), "U+0020 isWhitespace");
t.ok (Lexer::is_ws (0x0009), "U+0009 is_ws"); t.ok (Lexer2::isWhitespace (0x0009), "U+0009 isWhitespace");
t.ok (Lexer::is_ws (0x000A), "U+000A is_ws"); t.ok (Lexer2::isWhitespace (0x000A), "U+000A isWhitespace");
t.ok (Lexer::is_ws (0x000B), "U+000B is_ws"); t.ok (Lexer2::isWhitespace (0x000B), "U+000B isWhitespace");
t.ok (Lexer::is_ws (0x000C), "U+000C is_ws"); t.ok (Lexer2::isWhitespace (0x000C), "U+000C isWhitespace");
t.ok (Lexer::is_ws (0x000D), "U+000D is_ws"); t.ok (Lexer2::isWhitespace (0x000D), "U+000D isWhitespace");
t.ok (Lexer::is_ws (0x0085), "U+0085 is_ws"); t.ok (Lexer2::isWhitespace (0x0085), "U+0085 isWhitespace");
t.ok (Lexer::is_ws (0x00A0), "U+00A0 is_ws"); t.ok (Lexer2::isWhitespace (0x00A0), "U+00A0 isWhitespace");
t.ok (Lexer::is_ws (0x1680), "U+1680 is_ws"); // 10 t.ok (Lexer2::isWhitespace (0x1680), "U+1680 isWhitespace"); // 10
t.ok (Lexer::is_ws (0x180E), "U+180E is_ws"); t.ok (Lexer2::isWhitespace (0x180E), "U+180E isWhitespace");
t.ok (Lexer::is_ws (0x2000), "U+2000 is_ws"); t.ok (Lexer2::isWhitespace (0x2000), "U+2000 isWhitespace");
t.ok (Lexer::is_ws (0x2001), "U+2001 is_ws"); t.ok (Lexer2::isWhitespace (0x2001), "U+2001 isWhitespace");
t.ok (Lexer::is_ws (0x2002), "U+2002 is_ws"); t.ok (Lexer2::isWhitespace (0x2002), "U+2002 isWhitespace");
t.ok (Lexer::is_ws (0x2003), "U+2003 is_ws"); t.ok (Lexer2::isWhitespace (0x2003), "U+2003 isWhitespace");
t.ok (Lexer::is_ws (0x2004), "U+2004 is_ws"); t.ok (Lexer2::isWhitespace (0x2004), "U+2004 isWhitespace");
t.ok (Lexer::is_ws (0x2005), "U+2005 is_ws"); t.ok (Lexer2::isWhitespace (0x2005), "U+2005 isWhitespace");
t.ok (Lexer::is_ws (0x2006), "U+2006 is_ws"); t.ok (Lexer2::isWhitespace (0x2006), "U+2006 isWhitespace");
t.ok (Lexer::is_ws (0x2007), "U+2007 is_ws"); t.ok (Lexer2::isWhitespace (0x2007), "U+2007 isWhitespace");
t.ok (Lexer::is_ws (0x2008), "U+2008 is_ws"); // 20 t.ok (Lexer2::isWhitespace (0x2008), "U+2008 isWhitespace"); // 20
t.ok (Lexer::is_ws (0x2009), "U+2009 is_ws"); t.ok (Lexer2::isWhitespace (0x2009), "U+2009 isWhitespace");
t.ok (Lexer::is_ws (0x200A), "U+200A is_ws"); t.ok (Lexer2::isWhitespace (0x200A), "U+200A isWhitespace");
t.ok (Lexer::is_ws (0x2028), "U+2028 is_ws"); t.ok (Lexer2::isWhitespace (0x2028), "U+2028 isWhitespace");
t.ok (Lexer::is_ws (0x2029), "U+2029 is_ws"); t.ok (Lexer2::isWhitespace (0x2029), "U+2029 isWhitespace");
t.ok (Lexer::is_ws (0x202F), "U+202F is_ws"); t.ok (Lexer2::isWhitespace (0x202F), "U+202F isWhitespace");
t.ok (Lexer::is_ws (0x205F), "U+205F is_ws"); t.ok (Lexer2::isWhitespace (0x205F), "U+205F isWhitespace");
t.ok (Lexer::is_ws (0x3000), "U+3000 is_ws"); t.ok (Lexer2::isWhitespace (0x3000), "U+3000 isWhitespace");
// static bool Lexer::boundary(int, int); // static bool Lexer2::isBoundary(int, int);
t.ok (Lexer::boundary (' ', 'a'), "' ' --> 'a' = boundary"); t.ok (Lexer2::isBoundary (' ', 'a'), "' ' --> 'a' = isBoundary");
t.ok (Lexer::boundary ('a', ' '), "'a' --> ' ' = boundary"); t.ok (Lexer2::isBoundary ('a', ' '), "'a' --> ' ' = isBoundary");
t.ok (Lexer::boundary (' ', '+'), "' ' --> '+' = boundary"); t.ok (Lexer2::isBoundary (' ', '+'), "' ' --> '+' = isBoundary");
t.ok (Lexer::boundary (' ', ','), "' ' --> ',' = boundary"); t.ok (Lexer2::isBoundary (' ', ','), "' ' --> ',' = isBoundary");
t.notok (Lexer::boundary ('3', '4'), "'3' --> '4' = boundary"); t.notok (Lexer2::isBoundary ('3', '4'), "'3' --> '4' = isBoundary");
t.ok (Lexer::boundary ('(', '('), "'(' --> '(' = boundary"); t.ok (Lexer2::isBoundary ('(', '('), "'(' --> '(' = isBoundary");
t.notok (Lexer::boundary ('r', 'd'), "'r' --> 'd' = boundary"); t.notok (Lexer2::isBoundary ('r', 'd'), "'r' --> 'd' = isBoundary");
// Should result in no tokens. // Should result in no tokens.
Lexer l0 (""); Lexer2 l0 ("");
t.notok (l0.token (token, type), "'' --> no tokens"); t.notok (l0.token (token, type), "'' --> no tokens");
// Should result in no tokens. // Should result in no tokens.
Lexer l1 (" \t "); Lexer2 l1 (" \t ");
t.notok (l1.token (token, type), "' \\t ' --> no tokens"); t.notok (l1.token (token, type), "' \\t ' --> no tokens");
// \u20ac = Euro symbol. // \u20ac = Euro symbol.
Lexer l2 (" one 'two \\'three\\''+456-(1.3*2 - 0x12) \\u0041 1.2e-3.4 foo.bar and '\\u20ac'"); Lexer2 l2 (" one 'two \\'three\\''+456-(1.3*2 - 0x12) 1.2e-3.4 foo.bar and '\\u20ac'");
tokens.clear (); tokens.clear ();
while (l2.token (token, type)) while (l2.token (token, type))
{ {
std::cout << "# «" << token << "» " << type << " " << Lexer::type_name (type) << "\n"; std::cout << "# «" << token << "» " << Lexer2::typeName (type) << "\n";
tokens.push_back (std::pair <std::string, Lexer::Type> (token, type)); tokens.push_back (std::pair <std::string, Lexer2::Type> (token, type));
} }
t.is (tokens[0].first, "one", "tokens[0] = 'left'"); // 30 t.is (tokens[0].first, "one", "tokens[0] = 'left'"); // 30
t.is (Lexer::type_name (tokens[0].second), "Identifier", "tokens[0] = Identifier"); t.is (Lexer2::typeName (tokens[0].second), "identifier", "tokens[0] = identifier");
t.is (tokens[1].first, "'two \\'three\\''", "tokens[1] = 'two \\'three\\''"); t.is (tokens[1].first, "two 'three'", "tokens[1] = 'two 'three''");
t.is (Lexer::type_name (tokens[1].second), "String", "tokens[1] = String"); t.is (Lexer2::typeName (tokens[1].second), "string", "tokens[1] = string");
t.is (tokens[2].first, "+", "tokens[2] = '+'"); t.is (tokens[2].first, "+", "tokens[2] = '+'");
t.is (Lexer::type_name (tokens[2].second), "Operator", "tokens[2] = Operator"); t.is (Lexer2::typeName (tokens[2].second), "op", "tokens[2] = op");
t.is (tokens[3].first, "456", "tokens[3] = '456'"); t.is (tokens[3].first, "456", "tokens[3] = '456'");
t.is (Lexer::type_name (tokens[3].second), "Number", "tokens[3] = Number"); t.is (Lexer2::typeName (tokens[3].second), "number", "tokens[3] = number");
t.is (tokens[4].first, "-", "tokens[4] = '-'"); t.is (tokens[4].first, "-", "tokens[4] = '-'");
t.is (Lexer::type_name (tokens[4].second), "Operator", "tokens[4] = Operator"); t.is (Lexer2::typeName (tokens[4].second), "op", "tokens[4] = op");
t.is (tokens[5].first, "(", "tokens[5] = '('"); // 40 t.is (tokens[5].first, "(", "tokens[5] = '('"); // 40
t.is (Lexer::type_name (tokens[5].second), "Operator", "tokens[5] = Operator"); t.is (Lexer2::typeName (tokens[5].second), "op", "tokens[5] = op");
t.is (tokens[6].first, "1.3", "tokens[6] = '1.3'"); t.is (tokens[6].first, "1.3", "tokens[6] = '1.3'");
t.is (Lexer::type_name (tokens[6].second), "Decimal", "tokens[6] = Decimal"); t.is (Lexer2::typeName (tokens[6].second), "number", "tokens[6] = number");
t.is (tokens[7].first, "*", "tokens[7] = '*'"); t.is (tokens[7].first, "*", "tokens[7] = '*'");
t.is (Lexer::type_name (tokens[7].second), "Operator", "tokens[7] = Operator"); t.is (Lexer2::typeName (tokens[7].second), "op", "tokens[7] = op");
t.is (tokens[8].first, "2", "tokens[8] = '2'"); t.is (tokens[8].first, "2", "tokens[8] = '2'");
t.is (Lexer::type_name (tokens[8].second), "Number", "tokens[8] = Number"); t.is (Lexer2::typeName (tokens[8].second), "number", "tokens[8] = number");
t.is (tokens[9].first, "-", "tokens[9] = '-'"); t.is (tokens[9].first, "-", "tokens[9] = '-'");
t.is (Lexer::type_name (tokens[9].second), "Operator", "tokens[9] = Operator"); t.is (Lexer2::typeName (tokens[9].second), "op", "tokens[9] = op");
t.is (tokens[10].first, "0x12", "tokens[10] = '0x12'"); // 50 t.is (tokens[10].first, "0x12", "tokens[10] = '0x12'"); // 50
t.is (Lexer::type_name (tokens[10].second), "Hex", "tokens[10] = Hex"); t.is (Lexer2::typeName (tokens[10].second), "hex", "tokens[10] = hex");
t.is (tokens[11].first, ")", "tokens[11] = ')'"); t.is (tokens[11].first, ")", "tokens[11] = ')'");
t.is (Lexer::type_name (tokens[11].second), "Operator", "tokens[11] = Operator"); t.is (Lexer2::typeName (tokens[11].second), "op", "tokens[11] = op");
t.is (tokens[12].first, "A", "tokens[12] = \\u0041 --> 'A'"); t.is (tokens[12].first, "1.2e-3.4", "tokens[12] = '1.2e-3.4'");
t.is (Lexer::type_name (tokens[12].second), "Identifier", "tokens[12] = Identifier"); t.is (Lexer2::typeName (tokens[12].second), "number", "tokens[12] = number");
t.is (tokens[13].first, "1.2e-3.4", "tokens[13] = '1.2e-3.4'"); t.is (tokens[13].first, "foo.bar", "tokens[13] = 'foo.bar'");
t.is (Lexer::type_name (tokens[13].second), "Decimal", "tokens[13] = Decimal"); t.is (Lexer2::typeName (tokens[13].second), "identifier", "tokens[13] = identifier");
t.is (tokens[14].first, "foo.bar", "tokens[14] = 'foo.bar'"); t.is (tokens[14].first, "and", "tokens[14] = 'and'"); // 60
t.is (Lexer::type_name (tokens[14].second), "Identifier", "tokens[14] = Identifier"); t.is (Lexer2::typeName (tokens[14].second), "op", "tokens[14] = op");
t.is (tokens[15].first, "and", "tokens[15] = 'and'"); // 60 t.is (tokens[15].first, "", "tokens[15] = \\u20ac --> '€'");
t.is (Lexer::type_name (tokens[15].second), "Operator", "tokens[15] = Operator"); t.is (Lexer2::typeName (tokens[15].second), "string", "tokens[15] = string");
t.is (tokens[16].first, "'€'", "tokens[16] = \\u20ac --> '€'");
t.is (Lexer::type_name (tokens[16].second), "String", "tokens[16] = String");
// Test for ISO-8601 dates (favoring dates in ambiguous cases). // Test for ISO-8601 dates (favoring dates in ambiguous cases).
Lexer l3 ("1 12 123 1234 12345 123456 1234567 12345678 20131129T225800Z 2013-11-29T22:58:00Z"); Lexer2 l3 ("1 12 123 1234 12345 123456 1234567 12345678 20131129T225800Z 2013-11-29T22:58:00Z");
l3.ambiguity (true); l3.ambiguity (true);
tokens.clear (); tokens.clear ();
while (l3.token (token, type)) while (l3.token (token, type))
{ {
std::cout << "# «" << token << "» " << type << " " << Lexer::type_name (type) << "\n"; std::cout << "# «" << token << "» " << Lexer2::typeName (type) << "\n";
tokens.push_back (std::pair <std::string, Lexer::Type> (token, type)); tokens.push_back (std::pair <std::string, Lexer2::Type> (token, type));
} }
t.is ((int)tokens.size (), 10, "10 tokens"); t.is ((int)tokens.size (), 10, "10 tokens");
t.is (tokens[0].first, "1", "tokens[0] == '1'"); t.is (tokens[0].first, "1", "tokens[0] == '1'");
t.is (tokens[0].second, Lexer::typeNumber, "tokens[0] == typeNumber"); t.is ((int) tokens[0].second, (int) Lexer2::Type::number, "tokens[0] == Type::number");
t.is (tokens[1].first, "12", "tokens[1] == '12'"); t.is (tokens[1].first, "12", "tokens[1] == '12'");
t.is (tokens[1].second, Lexer::typeDate, "tokens[1] == typeDate"); t.is ((int) tokens[1].second, (int) Lexer2::Type::date, "tokens[1] == Type::date");
t.is (tokens[2].first, "123", "tokens[2] == '123'"); t.is (tokens[2].first, "123", "tokens[2] == '123'");
t.is (tokens[2].second, Lexer::typeNumber, "tokens[2] == typeNumber"); // 70 t.is ((int) tokens[2].second, (int) Lexer2::Type::number, "tokens[2] == Type::number"); // 70
t.is (tokens[3].first, "1234", "tokens[3] == '1234'"); t.is (tokens[3].first, "1234", "tokens[3] == '1234'");
t.is (tokens[3].second, Lexer::typeDate, "tokens[3] == typeDate"); t.is ((int) tokens[3].second, (int) Lexer2::Type::date, "tokens[3] == Type::date");
t.is (tokens[4].first, "12345", "tokens[4] == '12345'"); t.is (tokens[4].first, "12345", "tokens[4] == '12345'");
t.is (tokens[4].second, Lexer::typeNumber, "tokens[4] == typeNumber"); t.is ((int) tokens[4].second, (int) Lexer2::Type::number, "tokens[4] == Type::number");
t.is (tokens[5].first, "123456", "tokens[5] == '123456'"); t.is (tokens[5].first, "123456", "tokens[5] == '123456'");
t.is (tokens[5].second, Lexer::typeDate, "tokens[5] == typeDate"); t.is ((int) tokens[5].second, (int) Lexer2::Type::date, "tokens[5] == Type::date");
t.is (tokens[6].first, "1234567", "tokens[6] == '1234567'"); t.is (tokens[6].first, "1234567", "tokens[6] == '1234567'");
t.is (tokens[6].second, Lexer::typeNumber, "tokens[6] == typeNumber"); t.is ((int) tokens[6].second, (int) Lexer2::Type::number, "tokens[6] == Type::number");
t.is (tokens[7].first, "12345678", "tokens[7] == '12345678'"); t.is (tokens[7].first, "12345678", "tokens[7] == '12345678'");
t.is (tokens[7].second, Lexer::typeNumber, "tokens[7] == typeNumber"); // 80 t.is ((int) tokens[7].second, (int) Lexer2::Type::number, "tokens[7] == Type::number"); // 80
t.is (tokens[8].first, "20131129T225800Z", "tokens[8] == '20131129T225800Z'"); t.is (tokens[8].first, "20131129T225800Z", "tokens[8] == '20131129T225800Z'");
t.is (tokens[8].second, Lexer::typeDate, "tokens[8] == typeDate"); t.is ((int) tokens[8].second, (int) Lexer2::Type::date, "tokens[8] == Type::date");
t.is (tokens[9].first, "2013-11-29T22:58:00Z", "tokens[9] == '2013-11-29T22:58:00Z'"); t.is (tokens[9].first, "2013-11-29T22:58:00Z", "tokens[9] == '2013-11-29T22:58:00Z'");
t.is (tokens[9].second, Lexer::typeDate, "tokens[9] == typeDate"); t.is ((int) tokens[9].second, (int) Lexer2::Type::date, "tokens[9] == Type::date");
// Test for ISO-8601 dates (favoring numbers in ambiguous cases). // Test for ISO-8601 dates (favoring numbers in ambiguous cases).
Lexer l4 ("1 12 123 1234 12345 123456 1234567 12345678 20131129T225800Z 2013-11-29T22:58:00Z"); Lexer2 l4 ("1 12 123 1234 12345 123456 1234567 12345678 20131129T225800Z 2013-11-29T22:58:00Z");
l4.ambiguity (false); l4.ambiguity (false);
tokens.clear (); tokens.clear ();
while (l4.token (token, type)) while (l4.token (token, type))
{ {
std::cout << "# «" << token << "» " << type << " " << Lexer::type_name (type) << "\n"; std::cout << "# «" << token << "» " << Lexer2::typeName (type) << "\n";
tokens.push_back (std::pair <std::string, Lexer::Type> (token, type)); tokens.push_back (std::pair <std::string, Lexer2::Type> (token, type));
} }
t.is ((int)tokens.size (), 10, "10 tokens"); t.is ((int)tokens.size (), 10, "10 tokens");
t.is (tokens[0].first, "1", "tokens[0] == '1'"); t.is (tokens[0].first, "1", "tokens[0] == '1'");
t.is (tokens[0].second, Lexer::typeNumber, "tokens[0] == typeNumber"); t.is ((int) tokens[0].second, (int) Lexer2::Type::number, "tokens[0] == Type::number");
t.is (tokens[1].first, "12", "tokens[1] == '12'"); t.is (tokens[1].first, "12", "tokens[1] == '12'");
t.is (tokens[1].second, Lexer::typeNumber, "tokens[1] == typeNumber"); t.is ((int) tokens[1].second, (int) Lexer2::Type::number, "tokens[1] == Type::number");
t.is (tokens[2].first, "123", "tokens[2] == '123'"); // 90 t.is (tokens[2].first, "123", "tokens[2] == '123'"); // 90
t.is (tokens[2].second, Lexer::typeNumber, "tokens[2] == typeNumber"); t.is ((int) tokens[2].second, (int) Lexer2::Type::number, "tokens[2] == Type::number");
t.is (tokens[3].first, "1234", "tokens[3] == '1234'"); t.is (tokens[3].first, "1234", "tokens[3] == '1234'");
t.is (tokens[3].second, Lexer::typeNumber, "tokens[3] == typeNumber"); t.is ((int) tokens[3].second, (int) Lexer2::Type::number, "tokens[3] == Type::number");
t.is (tokens[4].first, "12345", "tokens[4] == '12345'"); t.is (tokens[4].first, "12345", "tokens[4] == '12345'");
t.is (tokens[4].second, Lexer::typeNumber, "tokens[4] == typeNumber"); t.is ((int) tokens[4].second, (int) Lexer2::Type::number, "tokens[4] == Type::number");
t.is (tokens[5].first, "123456", "tokens[5] == '123456'"); t.is (tokens[5].first, "123456", "tokens[5] == '123456'");
t.is (tokens[5].second, Lexer::typeNumber, "tokens[5] == typeNumber"); t.is ((int) tokens[5].second, (int) Lexer2::Type::number, "tokens[5] == Type::number");
t.is (tokens[6].first, "1234567", "tokens[6] == '1234567'"); t.is (tokens[6].first, "1234567", "tokens[6] == '1234567'");
t.is (tokens[6].second, Lexer::typeNumber, "tokens[6] == typeNumber"); t.is ((int) tokens[6].second, (int) Lexer2::Type::number, "tokens[6] == Type::number");
t.is (tokens[7].first, "12345678", "tokens[7] == '12345678'"); // 100 t.is (tokens[7].first, "12345678", "tokens[7] == '12345678'"); // 100
t.is (tokens[7].second, Lexer::typeNumber, "tokens[7] == typeNumber"); t.is ((int) tokens[7].second, (int) Lexer2::Type::number, "tokens[7] == Type::number");
t.is (tokens[8].first, "20131129T225800Z", "tokens[8] == '20131129T225800Z'"); t.is (tokens[8].first, "20131129T225800Z", "tokens[8] == '20131129T225800Z'");
t.is (tokens[8].second, Lexer::typeDate, "tokens[8] == typeDate"); t.is ((int) tokens[8].second, (int) Lexer2::Type::date, "tokens[8] == Type::date");
t.is (tokens[9].first, "2013-11-29T22:58:00Z", "tokens[9] == '2013-11-29T22:58:00Z'"); t.is (tokens[9].first, "2013-11-29T22:58:00Z", "tokens[9] == '2013-11-29T22:58:00Z'");
t.is (tokens[9].second, Lexer::typeDate, "tokens[9] == typeDate"); t.is ((int) tokens[9].second, (int) Lexer2::Type::date, "tokens[9] == Type::date");
// Test for durations // Test for durations
Lexer l5 ("1second 1minute 2hour 3 days 4w 5mo 6 years"); Lexer2 l5 ("1second 1minute 2hour 3 days 4w 5mo 6 years");
tokens.clear (); tokens.clear ();
while (l5.token (token, type)) while (l5.token (token, type))
{ {
std::cout << "# «" << token << "» " << type << " " << Lexer::type_name (type) << "\n"; std::cout << "# «" << token << "» " << Lexer2::typeName (type) << "\n";
tokens.push_back (std::pair <std::string, Lexer::Type> (token, type)); tokens.push_back (std::pair <std::string, Lexer2::Type> (token, type));
} }
t.is ((int)tokens.size (), 7, "7 tokens"); t.is ((int)tokens.size (), 7, "7 tokens");
t.is (tokens[0].first, "1second", "tokens[0] == '1second'"); t.is (tokens[0].first, "1second", "tokens[0] == '1second'");
t.is (tokens[0].second, Lexer::typeDuration, "tokens[0] == typeDuration"); t.is ((int) tokens[0].second, (int) Lexer2::Type::duration, "tokens[0] == Type::duration");
t.is (tokens[1].first, "1minute", "tokens[1] == '1minute'"); t.is (tokens[1].first, "1minute", "tokens[1] == '1minute'");
t.is (tokens[1].second, Lexer::typeDuration, "tokens[1] == typeDuration"); // 110 t.is ((int) tokens[1].second, (int) Lexer2::Type::duration, "tokens[1] == Type::duration"); // 110
t.is (tokens[2].first, "2hour", "tokens[2] == '2hour'"); t.is (tokens[2].first, "2hour", "tokens[2] == '2hour'");
t.is (tokens[2].second, Lexer::typeDuration, "tokens[2] == typeDuration"); t.is ((int) tokens[2].second, (int) Lexer2::Type::duration, "tokens[2] == Type::duration");
t.is (tokens[3].first, "3 days", "tokens[3] == '3 days'"); t.is (tokens[3].first, "3 days", "tokens[3] == '3 days'");
t.is (tokens[3].second, Lexer::typeDuration, "tokens[3] == typeDuration"); t.is ((int) tokens[3].second, (int) Lexer2::Type::duration, "tokens[3] == Type::duration");
t.is (tokens[4].first, "4w", "tokens[4] == '4w'"); t.is (tokens[4].first, "4w", "tokens[4] == '4w'");
t.is (tokens[4].second, Lexer::typeDuration, "tokens[4] == typeDuration"); t.is ((int) tokens[4].second, (int) Lexer2::Type::duration, "tokens[4] == Type::duration");
t.is (tokens[5].first, "5mo", "tokens[5] == '5mo'"); t.is (tokens[5].first, "5mo", "tokens[5] == '5mo'");
t.is (tokens[5].second, Lexer::typeDuration, "tokens[5] == typeDuration"); t.is ((int) tokens[5].second, (int) Lexer2::Type::duration, "tokens[5] == Type::duration");
t.is (tokens[6].first, "6 years", "tokens[6] == '6 years'"); t.is (tokens[6].first, "6 years", "tokens[6] == '6 years'");
t.is (tokens[6].second, Lexer::typeDuration, "tokens[6] == typeDuration"); // 120 t.is ((int) tokens[6].second, (int) Lexer2::Type::duration, "tokens[6] == Type::duration"); // 120
// All the Eval operators. // All the Eval operators.
Lexer l6 ("P1Y PT1H P1Y1M1DT1H1M1S 1s 1second"); Lexer2 l6 ("P1Y PT1H P1Y1M1DT1H1M1S 1s 1second");
tokens.clear (); tokens.clear ();
while (l6.token (token, type)) while (l6.token (token, type))
{ {
std::cout << "# «" << token << "» " << type << " " << Lexer::type_name (type) << "\n"; std::cout << "# «" << token << "» " << Lexer2::typeName (type) << "\n";
tokens.push_back (std::pair <std::string, Lexer::Type> (token, type)); tokens.push_back (std::pair <std::string, Lexer2::Type> (token, type));
} }
t.is ((int)tokens.size (), 5, "5 ISO periods"); t.is ((int)tokens.size (), 5, "5 ISO periods");
t.is (tokens[0].first, "P1Y", "tokens[0] == 'P1Y'"); t.is (tokens[0].first, "P1Y", "tokens[0] == 'P1Y'");
t.is (tokens[0].second, Lexer::typeDuration, "tokens[0] == typeDuration"); t.is ((int) tokens[0].second, (int) Lexer2::Type::duration, "tokens[0] == Type::duration");
t.is (tokens[1].first, "PT1H", "tokens[1] == 'PT1H'"); t.is (tokens[1].first, "PT1H", "tokens[1] == 'PT1H'");
t.is (tokens[1].second, Lexer::typeDuration, "tokens[1] == typeDuration"); t.is ((int) tokens[1].second, (int) Lexer2::Type::duration, "tokens[1] == Type::duration");
t.is (tokens[2].first, "P1Y1M1DT1H1M1S", "tokens[2] == 'P1Y1M1DT1H1M1S'"); t.is (tokens[2].first, "P1Y1M1DT1H1M1S", "tokens[2] == 'P1Y1M1DT1H1M1S'");
t.is (tokens[2].second, Lexer::typeDuration, "tokens[2] == typeDuration"); t.is ((int) tokens[2].second, (int) Lexer2::Type::duration, "tokens[2] == Type::duration");
t.is (tokens[3].first, "1s", "tokens[3] == '1s'"); t.is (tokens[3].first, "1s", "tokens[3] == '1s'");
t.is (tokens[3].second, Lexer::typeDuration, "tokens[3] == typeDuration"); t.is ((int) tokens[3].second, (int) Lexer2::Type::duration, "tokens[3] == Type::duration");
t.is (tokens[4].first, "1second", "tokens[4] == '1second'"); t.is (tokens[4].first, "1second", "tokens[4] == '1second'");
t.is (tokens[4].second, Lexer::typeDuration, "tokens[4] == typeDuration"); t.is ((int) tokens[4].second, (int) Lexer2::Type::duration, "tokens[4] == Type::duration");
// All the Eval operators. // All (int) the Eval operators.
Lexer l7 ("and xor or <= >= !~ != == = ^ > ~ ! * / % + - < ( )"); Lexer2 l7 ("and xor or <= >= !~ != == = ^ > ~ ! * / % + - < ( )");
tokens.clear (); tokens.clear ();
while (l7.token (token, type)) while (l7.token (token, type))
{ {
std::cout << "# «" << token << "» " << type << " " << Lexer::type_name (type) << "\n"; std::cout << "# «" << token << "» " << Lexer2::typeName (type) << "\n";
tokens.push_back (std::pair <std::string, Lexer::Type> (token, type)); tokens.push_back (std::pair <std::string, Lexer2::Type> (token, type));
} }
t.is ((int)tokens.size (), 21, "21 operators"); t.is ((int)tokens.size (), 21, "21 operators");
t.is (tokens[0].first, "and", "tokens[0] == 'and'"); t.is (tokens[0].first, "and", "tokens[0] == 'and'");
t.is (tokens[0].second, Lexer::typeOperator, "tokens[0] == typeOperator"); // 130 t.is ((int) tokens[0].second, (int) Lexer2::Type::op, "tokens[0] == Type::op"); // 130
t.is (tokens[1].first, "xor", "tokens[1] == 'xor'"); t.is (tokens[1].first, "xor", "tokens[1] == 'xor'");
t.is (tokens[1].second, Lexer::typeOperator, "tokens[1] == typeOperator"); t.is ((int) tokens[1].second, (int) Lexer2::Type::op, "tokens[1] == Type::op");
t.is (tokens[2].first, "or", "tokens[2] == 'or'"); t.is (tokens[2].first, "or", "tokens[2] == 'or'");
t.is (tokens[2].second, Lexer::typeOperator, "tokens[2] == typeOperator"); t.is ((int) tokens[2].second, (int) Lexer2::Type::op, "tokens[2] == Type::op");
t.is (tokens[3].first, "<=", "tokens[3] == '<='"); t.is (tokens[3].first, "<=", "tokens[3] == '<='");
t.is (tokens[3].second, Lexer::typeOperator, "tokens[3] == typeOperator"); t.is ((int) tokens[3].second, (int) Lexer2::Type::op, "tokens[3] == Type::op");
t.is (tokens[4].first, ">=", "tokens[4] == '>='"); t.is (tokens[4].first, ">=", "tokens[4] == '>='");
t.is (tokens[4].second, Lexer::typeOperator, "tokens[4] == typeOperator"); t.is ((int) tokens[4].second, (int) Lexer2::Type::op, "tokens[4] == Type::op");
t.is (tokens[5].first, "!~", "tokens[5] == '!~'"); t.is (tokens[5].first, "!~", "tokens[5] == '!~'");
t.is (tokens[5].second, Lexer::typeOperator, "tokens[5] == typeOperator"); // 140 t.is ((int) tokens[5].second, (int) Lexer2::Type::op, "tokens[5] == Type::op"); // 140
t.is (tokens[6].first, "!=", "tokens[6] == '!='"); t.is (tokens[6].first, "!=", "tokens[6] == '!='");
t.is (tokens[6].second, Lexer::typeOperator, "tokens[6] == typeOperator"); t.is ((int) tokens[6].second, (int) Lexer2::Type::op, "tokens[6] == Type::op");
t.is (tokens[7].first, "==", "tokens[7] == '=='"); t.is (tokens[7].first, "==", "tokens[7] == '=='");
t.is (tokens[7].second, Lexer::typeOperator, "tokens[7] == typeOperator"); t.is ((int) tokens[7].second, (int) Lexer2::Type::op, "tokens[7] == Type::op");
t.is (tokens[8].first, "=", "tokens[8] == '='"); t.is (tokens[8].first, "=", "tokens[8] == '='");
t.is (tokens[8].second, Lexer::typeOperator, "tokens[8] == typeOperator"); t.is ((int) tokens[8].second, (int) Lexer2::Type::op, "tokens[8] == Type::op");
t.is (tokens[9].first, "^", "tokens[9] == '^'"); t.is (tokens[9].first, "^", "tokens[9] == '^'");
t.is (tokens[9].second, Lexer::typeOperator, "tokens[9] == typeOperator"); t.is ((int) tokens[9].second, (int) Lexer2::Type::op, "tokens[9] == Type::op");
t.is (tokens[10].first, ">", "tokens[10] == '>'"); t.is (tokens[10].first, ">", "tokens[10] == '>'");
t.is (tokens[10].second, Lexer::typeOperator, "tokens[10] == typeOperator"); // 150 t.is ((int) tokens[10].second, (int) Lexer2::Type::op, "tokens[10] == Type::op"); // 150
t.is (tokens[11].first, "~", "tokens[11] == '~'"); t.is (tokens[11].first, "~", "tokens[11] == '~'");
t.is (tokens[11].second, Lexer::typeOperator, "tokens[11] == typeOperator"); t.is ((int) tokens[11].second, (int) Lexer2::Type::op, "tokens[11] == Type::op");
t.is (tokens[12].first, "!", "tokens[12] == '!'"); t.is (tokens[12].first, "!", "tokens[12] == '!'");
t.is (tokens[12].second, Lexer::typeOperator, "tokens[12] == typeOperator"); t.is ((int) tokens[12].second, (int) Lexer2::Type::op, "tokens[12] == Type::op");
t.is (tokens[13].first, "*", "tokens[13] == '*'"); t.is (tokens[13].first, "*", "tokens[13] == '*'");
t.is (tokens[13].second, Lexer::typeOperator, "tokens[13] == typeOperator"); t.is ((int) tokens[13].second, (int) Lexer2::Type::op, "tokens[13] == Type::op");
t.is (tokens[14].first, "/", "tokens[14] == '/'"); t.is (tokens[14].first, "/", "tokens[14] == '/'");
t.is (tokens[14].second, Lexer::typeOperator, "tokens[14] == typeOperator"); t.is ((int) tokens[14].second, (int) Lexer2::Type::op, "tokens[14] == Type::op");
t.is (tokens[15].first, "%", "tokens[15] == '%'"); t.is (tokens[15].first, "%", "tokens[15] == '%'");
t.is (tokens[15].second, Lexer::typeOperator, "tokens[15] == typeOperator"); // 160 t.is ((int) tokens[15].second, (int) Lexer2::Type::op, "tokens[15] == Type::op"); // 160
t.is (tokens[16].first, "+", "tokens[16] == '+'"); t.is (tokens[16].first, "+", "tokens[16] == '+'");
t.is (tokens[16].second, Lexer::typeOperator, "tokens[16] == typeOperator"); t.is ((int) tokens[16].second, (int) Lexer2::Type::op, "tokens[16] == Type::op");
t.is (tokens[17].first, "-", "tokens[17] == '-'"); t.is (tokens[17].first, "-", "tokens[17] == '-'");
t.is (tokens[17].second, Lexer::typeOperator, "tokens[17] == typeOperator"); t.is ((int) tokens[17].second, (int) Lexer2::Type::op, "tokens[17] == Type::op");
t.is (tokens[18].first, "<", "tokens[18] == '<'"); t.is (tokens[18].first, "<", "tokens[18] == '<'");
t.is (tokens[18].second, Lexer::typeOperator, "tokens[18] == typeOperator"); t.is ((int) tokens[18].second, (int) Lexer2::Type::op, "tokens[18] == Type::op");
t.is (tokens[19].first, "(", "tokens[19] == '('"); t.is (tokens[19].first, "(", "tokens[19] == '('");
t.is (tokens[19].second, Lexer::typeOperator, "tokens[19] == typeOperator"); t.is ((int) tokens[19].second, (int) Lexer2::Type::op, "tokens[19] == Type::op");
t.is (tokens[20].first, ")", "tokens[20] == ')'"); t.is (tokens[20].first, ")", "tokens[20] == ')'");
t.is (tokens[20].second, Lexer::typeOperator, "tokens[20] == typeOperator"); // 170 t.is ((int) tokens[20].second, (int)Lexer2::Type::op, "tokens[20] == Type::op"); // 170
// Test ordinal dates. // Test ordinal dates.
Lexer l8 ("9th 10th"); Lexer2 l8 ("9th 10th");
l8.ambiguity (false); l8.ambiguity (false);
tokens.clear (); tokens.clear ();
while (l8.token (token, type)) while (l8.token (token, type))
{ {
std::cout << "# «" << token << "» " << type << " " << Lexer::type_name (type) << "\n"; std::cout << "# «" << token << "» " << Lexer2::typeName (type) << "\n";
tokens.push_back (std::pair <std::string, Lexer::Type> (token, type)); tokens.push_back (std::pair <std::string, Lexer2::Type> (token, type));
} }
t.is ((int)tokens.size (), 2, "2 tokens"); t.is ((int)tokens.size (), 2, "2 tokens");
t.is (tokens[0].first, "9th", "tokens[0] == '9th'"); t.is (tokens[0].first, "9th", "tokens[0] == '9th'");
t.is (tokens[0].second, Lexer::typeIdentifier, "tokens[0] == typeIdentifier"); t.is ((int) tokens[0].second, (int) Lexer2::Type::identifier, "tokens[0] == Type::identifier");
t.is (tokens[1].first, "10th", "tokens[1] == '10th'"); t.is (tokens[1].first, "10th", "tokens[1] == '10th'");
t.is (tokens[1].second, Lexer::typeIdentifier, "tokens[1] == typeIdentifier"); t.is ((int) tokens[1].second, (int) Lexer2::Type::identifier, "tokens[1] == Type::identifier");
// Test tag recognition. // Test tag recognition.
Lexer l9 ("+with -WITHOUT + 2"); Lexer2 l9 ("+with -WITHOUT + 2");
l9.ambiguity (false); l9.ambiguity (false);
tokens.clear (); tokens.clear ();
while (l9.token (token, type)) while (l9.token (token, type))
{ {
std::cout << "# «" << token << "» " << type << " " << Lexer::type_name (type) << "\n"; std::cout << "# «" << token << "» " << Lexer2::typeName (type) << "\n";
tokens.push_back (std::pair <std::string, Lexer::Type> (token, type)); tokens.push_back (std::pair <std::string, Lexer2::Type> (token, type));
} }
t.is ((int)tokens.size (), 4, "4 tokens"); t.is ((int)tokens.size (), 4, "4 tokens");
t.is (tokens[0].first, "+with", "tokens[0] == '+with'"); t.is (tokens[0].first, "+with", "tokens[0] == '+with'");
t.is (tokens[0].second, Lexer::typeTag, "tokens[0] == typeTag"); t.is ((int) tokens[0].second, (int) Lexer2::Type::tag, "tokens[0] == Type::tag");
t.is (tokens[1].first, "-WITHOUT", "tokens[1] == '-WITHOUT'"); t.is (tokens[1].first, "-WITHOUT", "tokens[1] == '-WITHOUT'");
t.is (tokens[1].second, Lexer::typeTag, "tokens[1] == typeTag"); t.is ((int) tokens[1].second, (int) Lexer2::Type::tag, "tokens[1] == Type::tag");
t.is (tokens[2].first, "+", "tokens[2] == '+'"); t.is (tokens[2].first, "+", "tokens[2] == '+'");
t.is (tokens[2].second, Lexer::typeOperator, "tokens[2] == typeOperator"); t.is ((int) tokens[2].second, (int) Lexer2::Type::op, "tokens[2] == Type::op");
t.is (tokens[3].first, "2", "tokens[3] == '2'"); t.is (tokens[3].first, "2", "tokens[3] == '2'");
t.is (tokens[3].second, Lexer::typeNumber, "tokens[3] == typeNumber"); t.is ((int) tokens[3].second, (int) Lexer2::Type::number, "tokens[3] == Type::number");
// void word_split (std::vector<std::string>&, const std::string&); // void split (std::vector<std::string>&, const std::string&);
std::string unsplit = " ( A or B ) "; std::string unsplit = " ( A or B ) ";
std::vector <std::string> items; std::vector <std::string> items;
Lexer::word_split (items, unsplit); items = Lexer2::split (unsplit);
t.is (items.size (), (size_t) 5, "word_split ' ( A or B ) '"); t.is (items.size (), (size_t) 5, "split ' ( A or B ) '");
t.is (items[0], "(", "word_split ' ( A or B ) ' -> [0] '('"); t.is (items[0], "(", "split ' ( A or B ) ' -> [0] '('");
t.is (items[1], "A", "word_split ' ( A or B ) ' -> [1] 'A'"); t.is (items[1], "A", "split ' ( A or B ) ' -> [1] 'A'");
t.is (items[2], "or", "word_split ' ( A or B ) ' -> [2] 'or'"); t.is (items[2], "or", "split ' ( A or B ) ' -> [2] 'or'");
t.is (items[3], "B", "word_split ' ( A or B ) ' -> [3] 'B'"); t.is (items[3], "B", "split ' ( A or B ) ' -> [3] 'B'");
t.is (items[4], ")", "word_split ' ( A or B ) ' -> [4] ')'"); t.is (items[4], ")", "split ' ( A or B ) ' -> [4] ')'");
// Test simple mode with contrived tokens that ordinarily split. // Test simple mode with contrived tokens that ordinarily split.
unsplit = " +-* a+b 12.3e4 'c d'"; unsplit = " +-* a+b 12.3e4 'c d'";
Lexer::word_split (items, unsplit); items = Lexer2::split (unsplit);
t.is (items.size (), (size_t) 4, "word_split ' +-* a+b 12.3e4 'c d''"); t.is (items.size (), (size_t) 8, "split ' +-* a+b 12.3e4 'c d''");
t.is (items[0], "+-*", "word_split ' +-* a+b 12.3e4 'c d'' -> [0] '+-*'"); t.is (items[0], "+", "split ' +-* a+b 12.3e4 'c d'' -> [0] '+'");
t.is (items[1], "a+b", "word_split ' +-* a+b 12.3e4 'c d'' -> [1] 'a+b'"); t.is (items[1], "-", "split ' +-* a+b 12.3e4 'c d'' -> [1] '-'");
t.is (items[2], "12.3e4", "word_split ' +-* a+b 12.3e4 'c d'' -> [2] '12.3e4'"); t.is (items[2], "*", "split ' +-* a+b 12.3e4 'c d'' -> [2] '*'");
t.is (items[3], "'c d'", "word_split ' +-* a+b 12.3e4 'c d'' -> [3] 'c d'"); t.is (items[3], "a", "split ' +-* a+b 12.3e4 'c d'' -> [3] 'a'");
t.is (items[4], "+", "split ' +-* a+b 12.3e4 'c d'' -> [4] '+'");
t.is (items[5], "b", "split ' +-* a+b 12.3e4 'c d'' -> [5] 'b'");
t.is (items[6], "12.3e4", "split ' +-* a+b 12.3e4 'c d'' -> [6] '12.3e4'");
t.is (items[7], "c d", "split ' +-* a+b 12.3e4 'c d'' -> [7] 'c d'");
// Test common expression element. // Test common expression element.
unsplit = "name=value"; unsplit = "name=value";
Lexer::token_split (items, unsplit); items = Lexer2::split (unsplit);
t.is (items.size (), (size_t) 3, "token_split 'name=value'"); t.is (items.size (), (size_t) 1, "split 'name=value'");
if (items.size () == 3)
{
t.is (items[0], "name", "token_split 'name=value' -> [0] 'name'");
t.is (items[1], "=", "token_split 'name=value' -> [1] '='");
t.is (items[2], "value", "token_split 'name=value' -> [2] 'value'");
}
else
{
t.fail ("token_split 'name=value' -> [0] 'name'");
t.fail ("token_split 'name=value' -> [1] '='");
t.fail ("token_split 'name=value' -> [2] 'value'");
}
// Test unterminated tokens. // Test unterminated tokens.
unsplit = " ordinary "; unsplit = " ordinary ";
Lexer::token_split (items, unsplit); items = Lexer2::split (unsplit);
t.is (items.size (), (size_t) 1, "token_split 'ordinary' --> 1 token"); t.is (items.size (), (size_t) 1, "split 'ordinary' --> 1 token");
t.is (items[0], "ordinary", "token_split 'ordinary' --> 'ordinary'"); t.is (items[0], "ordinary", "split 'ordinary' --> 'ordinary'");
return 0; return 0;
} }