Lexer

- Reduced Lexer::word to simply distinguish quoted and unquoted strings, which may contain escaped characters.
2014-04-24 09:06:04 -04:00
parent 386bcea60e
commit 26d9be8164
1 changed file with 13 additions and 172 deletions
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -293,6 +293,7 @@ bool Lexer::token (std::string& token, Type& type)
        quote = 0;
        return true;
      }
      break;
    case typeNumber:
      if (is_dec_digit (_n0))
@@ -417,72 +418,12 @@ bool Lexer::word (std::string& token, Type& type)
        quote = _n0;
        shift ();
      }
      else if (_n0 == '0' &&
               _n1 == 'x' &&
               is_hex_digit (_n2))
      {
        type = typeHex;
        token += utf8_character (_n0);
        shift ();
        token += utf8_character (_n0);
        shift ();
        token += utf8_character (_n0);
        shift ();
      }
      else if (is_dec_digit (_n0))
      {
        // Speculatively try a date and duration parse.  Longest wins.
        std::string::size_type iso_i = 0;
        std::string iso_token;
        ISO8601d iso;
        iso.ambiguity (_ambiguity);
        if (iso.parse (_input.substr (_i < 4 ? 0 : _i - 4), iso_i))
          iso_token = _input.substr ((_i < 4 ? 0 : _i - 4), iso_i);
        std::string::size_type dur_i = 0;
        std::string dur_token;
        Duration dur;
        if (dur.parse (_input.substr (_i < 4 ? 0 : _i - 4), dur_i))
          dur_token = _input.substr ((_i < 4 ? 0 : _i - 4), dur_i);
        if (iso_token.length () > dur_token.length ())
        {
          while (iso_i--) shift ();
          token = iso_token;
          type = typeDate;
          return true;
        }
        else if (dur_token.length () > iso_token.length ())
        {
          while (dur_i--) shift ();
          token = dur_token;
          type = typeDuration;
          return true;
        }
        type = typeNumber;
        token += utf8_character (_n0);
        shift ();
      }
      else if (_n0 == '.' && is_dec_digit (_n1))
      {
        type = typeDecimal;
        token += utf8_character (_n0);
        shift ();
      }
      else if (_n0 == '\\')
      {
        type = typeIdentifierEscape;
        shift ();
      }
      else if (is_ident_start (_n0))
      {
        type = typeIdentifier;
        token += utf8_character (_n0);
        shift ();
      }
      else
-        throw std::string ("Unexpected error 1");
+      {
        type = typeString;
        token += utf8_character (_n0);
        shift ();
      }
      break;
    case typeString:
@@ -497,29 +438,14 @@ bool Lexer::word (std::string& token, Type& type)
        type = typeEscape;
        shift ();
      }
-      else
+      else if (! quote && is_ws (_n0))
      {
        token += utf8_character (_n0);
        shift ();
      }
      break;
    case typeIdentifier:
      if (is_ident (_n0))
      {
        token += utf8_character (_n0);
        shift ();
      }
      else
      {
        return true;
      }
-      break;
+      else
    case typeIdentifierEscape:
      if (_n0 == 'u')
      {
-        type = typeEscapeUnicode;
+        token += utf8_character (_n0);
        shift ();
      }
      break;
@@ -538,7 +464,7 @@ bool Lexer::word (std::string& token, Type& type)
      else
      {
        token += decode_escape (_n0);
-        type = quote ? typeString : typeIdentifier;
+        type = typeString;
        shift ();
      }
      break;
@@ -547,13 +473,13 @@ bool Lexer::word (std::string& token, Type& type)
      if (is_hex_digit (_n0) && is_hex_digit (_n1))
      {
        token += utf8_character (hex_to_int (_n0, _n1));
-        type = quote ? typeString : typeIdentifier;
+        type = typeString;
        shift ();
        shift ();
      }
      else
      {
-        type = quote ? typeString : typeIdentifier;
+        type = typeString;
        shift ();
        quote = 0;
        return true;
@@ -571,7 +497,7 @@ bool Lexer::word (std::string& token, Type& type)
        shift ();
        shift ();
        shift ();
-        type = quote ? typeString : typeIdentifier;
+        type = typeString;
      }
      else if (_n0 == quote)
      {
@@ -580,91 +506,6 @@ bool Lexer::word (std::string& token, Type& type)
        quote = 0;
        return true;
      }
    case typeNumber:
      if (is_dec_digit (_n0))
      {
        token += utf8_character (_n0);
        shift ();
      }
      else if (_n0 == '.')
      {
        type = typeDecimal;
        token += utf8_character (_n0);
        shift ();
      }
      else if (_n0 == 'e' || _n0 == 'E')
      {
        type = typeExponentIndicator;
        token += utf8_character (_n0);
        shift ();
      }
      else
      {
        return true;
      }
      break;
    case typeDecimal:
      if (is_dec_digit (_n0))
      {
        token += utf8_character (_n0);
        shift ();
      }
      else if (_n0 == 'e' || _n0 == 'E')
      {
        type = typeExponentIndicator;
        token += utf8_character (_n0);
        shift ();
      }
      else
      {
        return true;
      }
      break;
    case typeExponentIndicator:
      if (_n0 == '+' || _n0 == '-')
      {
        token += utf8_character (_n0);
        shift ();
      }
      else if (is_dec_digit (_n0))
      {
        type = typeExponent;
        token += utf8_character (_n0);
        shift ();
      }
      break;
    case typeExponent:
      if (is_dec_digit (_n0))
      {
        token += utf8_character (_n0);
        shift ();
      }
      else if (_n0 == '.')
      {
        token += utf8_character (_n0);
        shift ();
      }
      else
      {
        type = typeDecimal;
        return true;
      }
      break;
    case typeHex:
      if (is_hex_digit (_n0))
      {
        token += utf8_character (_n0);
        shift ();
      }
      else
      {
        return true;
      }
      break;
    default: