// taskwarrior-2.x/src/cli/lexer.rs

use crate::util::datetime::DateTime;
use crate::util::duration::Duration;
use std::convert::TryFrom;

// based on src/Lexer.{h,cpp} in the Taskwarrior code

/// Template for the textual UUID form: `x` marks a position that must hold a hex digit,
/// `-` a position that must hold a literal dash (consumed by `Lexer::is_uuid`).
const UUID_PATTERN: &[u8] = b"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx";
/// Fewest leading characters of `UUID_PATTERN` that must match before `Lexer::is_uuid`
/// accepts the text as a (possibly abbreviated) UUID.
const UUID_MIN_LENGTH: usize = 8;
// NOTE(review): not referenced in the part of the file reviewed here; presumably the minimum
// length for an abbreviated keyword match -- confirm against its users.
const MINIMUM_MATCH_LEN: usize = 3;
/// Sub-element names that `Lexer::is_dom` accepts after a date-valued DOM reference.
const DATE_SUBELEMENTS: &[&str] = &[
    "year", "month", "day", "week", "weekday", "julian", "hour", "minute", "second",
];

/// Classification attached to every token produced by `Lexer::token`.
#[derive(PartialEq, Debug, Clone, Copy)]
enum Type {
    /// Full or abbreviated UUID (at least `UUID_MIN_LENGTH` characters of `UUID_PATTERN`).
    Uuid,
    /// Decimal number, optionally with a fraction and an exponent.
    Number,
    /// `0x` followed by one or more hex digits.
    Hex,
    /// Text enclosed in `'` or `"`; the token text keeps the quote characters.
    String,
    /// Text starting with `http://` or `https://`, running to the next whitespace.
    URL,
    /// Identifier, a separator (`::`, `:=`, `:` or `=`) and an optional value.
    Pair,
    /// Comma-separated integers and `a-b` ranges holding at least two integers.
    Set,
    /// The literal `--`.
    Separator,
    /// `+` or `-` immediately followed by an identifier.
    Tag,
    /// Slash-separated path containing more than three `/` characters.
    Path,
    /// `/from/to/`, optionally followed by `g`.
    Substitution,
    // NOTE(review): produced by `is_pattern`, whose body is not visible here.
    Pattern,
    // NOTE(review): produced by `is_operator`, defined outside the reviewed range.
    Op,
    /// DOM reference such as `rc.<name>`, `tw.version` or `<id>.tags.<word>`.
    DOM,
    // NOTE(review): produced by `is_identifier`, defined outside the reviewed range.
    Identifier,
    /// Produced by `is_word`, the last recognizer `Lexer::token` tries.
    Word,
    /// Text accepted by `DateTime::parse`.
    Date,
    /// Text accepted by `Duration::parse`.
    Duration,
}

/// Tokenizer state: the input text plus a cursor that the recognizers advance.
struct Lexer {
    /// The complete input being tokenized.
    text: String,
    /// Byte offset into `text` of the next unread character.
    cursor: usize,
    /// Byte length of `text`, captured at construction; `cursor == eos` means end of input.
    eos: usize,
    /// Attribute names registered through `add_attribute`; consulted by `is_dom`.
    attributes: Vec<String>,
}

// TaskWarrior uses some non-standard character definitions, so they are repeated verbatim here,
// rather than defaulting to the unicode functions available on the char type.

/// Tests whether `c` counts as whitespace -- horizontal or vertical -- under TaskWarrior's
/// libshared definitions.
fn unicode_whitespace(c: char) -> bool {
    if unicode_horizontal_whitespace(c) {
        true
    } else {
        unicode_vertical_whitespace(c)
    }
}

/// Tests whether `c` is one of the code points TaskWarrior's libshared treats as horizontal
/// whitespace.
fn unicode_horizontal_whitespace(c: char) -> bool {
    match c {
        '\u{0020}' // space
        | '\u{0009}' // HT, horizontal tab
        | '\u{00A0}' // no-break space
        | '\u{1680}' // ogham space mark
        | '\u{180E}' // mongolian vowel separator
        // en quad, em quad, en space, em space, three/four/six-per-em space, figure space,
        // punctuation space, thin space, hair space, zero width space, zero width non-joiner,
        // zero width joiner
        | '\u{2000}'..='\u{200D}'
        | '\u{202F}' // narrow no-break space
        | '\u{205F}' // medium mathematical space
        | '\u{2060}' // word joiner
        | '\u{3000}' => true, // ideographic space
        _ => false,
    }
}

/// Tests whether `c` is one of the code points TaskWarrior's libshared treats as vertical
/// whitespace.
fn unicode_vertical_whitespace(c: char) -> bool {
    match c {
        // LF line feed, VT vertical tab, FF form feed, CR carriage return
        '\u{000A}'..='\u{000D}'
        | '\u{0085}' // NEL, next line
        | '\u{2028}' // line separator
        | '\u{2029}' => true, // paragraph separator
        _ => false,
    }
}

/// Tests whether `c` is one of the ASCII digits `0` through `9`.
fn unicode_latin_digit(c: char) -> bool {
    char::is_ascii_digit(&c)
}

/// Tests whether `c` is an ASCII letter, upper or lower case.
fn unicode_latin_alpha(c: char) -> bool {
    char::is_ascii_alphabetic(&c)
}

/// Replicates the C function of the same name (in the "C" locale): true for the ASCII
/// printable range, U+0020 (space) through U+007E (`~`), and false for everything else.
fn isprint(c: char) -> bool {
    // `is_ascii_graphic` covers U+0021..=U+007E only; C's `isprint` also accepts the space.
    // `is_punctuation` excludes ' ' explicitly, so its result is unaffected by this.
    c == ' ' || c.is_ascii_graphic()
}

/// Tests whether `c` is punctuation: a printable ASCII character that is neither a letter
/// nor a digit, and is not one of ` `, `@`, `#`, `$` or `_`.
fn is_punctuation(c: char) -> bool {
    if !isprint(c) || unicode_latin_digit(c) || unicode_latin_alpha(c) {
        return false;
    }
    match c {
        ' ' | '@' | '#' | '$' | '_' => false,
        _ => true,
    }
}

/// Tests whether `c` is one of the single-character operators
/// `+ - * / ( ) < > ^ ! % = ~`.
fn is_single_char_operator(c: char) -> bool {
    "+-*/()<>^!%=~".contains(c)
}

/// Tests whether `c` may begin an identifier: anything that is not whitespace, an ASCII
/// digit, a single-character operator or punctuation.
fn is_identifier_start(c: char) -> bool {
    !(unicode_whitespace(c)
        || unicode_latin_digit(c)
        || is_single_char_operator(c)
        || is_punctuation(c))
}

/// Tests whether `c` may continue an identifier: anything except `:`, `=`, whitespace and
/// the single-character operators.
fn is_identifier_next(c: char) -> bool {
    match c {
        ':' | '=' => false,
        _ => !(unicode_whitespace(c) || is_single_char_operator(c)),
    }
}

/// Tests whether a token boundary lies between the adjacent characters `left` and `right`.
///
/// A boundary exists when `right` is NUL, when either side is punctuation, or when the two
/// sides differ in being a letter, a digit, or whitespace.
fn is_boundary(left: char, right: char) -> bool {
    if right == '\0' || is_punctuation(left) || is_punctuation(right) {
        return true;
    }
    // Character class as (alpha, digit, whitespace); any differing component is a boundary.
    let class = |c: char| {
        (
            unicode_latin_alpha(c),
            unicode_latin_digit(c),
            unicode_whitespace(c),
        )
    };
    class(left) != class(right)
}

/// Tests whether a hard token boundary lies between `left` and `right`: `right` is NUL, or
/// either character is a parenthesis.
fn is_hard_boundary(left: char, right: char) -> bool {
    let is_paren = |c: char| c == '(' || c == ')';
    right == '\0' || is_paren(left) || is_paren(right)
}

/// Tests whether `s` contains a character (space, tab, `(`, `)`, `<`, `>`, `&` or `~`)
/// that implies the string must have been shell-quoted.
fn was_quoted(s: &str) -> bool {
    s.chars().any(|c| match c {
        ' ' | '\t' | '(' | ')' | '<' | '>' | '&' | '~' => true,
        _ => false,
    })
}

/// Tests whether `c` is an ASCII hexadecimal digit (`0-9`, `a-f` or `A-F`).
fn is_unicode_hex_digit(c: char) -> bool {
    c.is_ascii_hexdigit()
}

/// Interprets `hex` as a hexadecimal code point and converts it to a `char`.
///
/// Returns `None` when `hex` contains a non-hex character or when the resulting number is
/// not a valid Unicode scalar value. An empty string yields `Some('\0')`.
fn hex_to_char(hex: &str) -> Option<char> {
    let mut code_point = 0u32;
    for digit in hex.chars() {
        // `to_digit(16)` accepts exactly 0-9, a-f and A-F.
        code_point = (code_point << 4) + digit.to_digit(16)?;
    }
    char::try_from(code_point).ok()
}

/// Removes one matching pair of quote characters from the ends of `s`.
///
/// The pair is removed only when the first and last characters of `s` are equal and that
/// character appears in `quotes`; otherwise `s` is returned untouched. A string consisting
/// solely of one or two such quote characters becomes the empty string.
///
/// Unlike the C++ original, this returns a sub-slice instead of modifying its argument.
fn dequote<'a, 'b>(s: &'a str, quotes: &'b str) -> &'a str {
    let (first, last) = match (s.chars().next(), s.chars().next_back()) {
        (Some(first), Some(last)) => (first, last),
        _ => return s,
    };
    if first != last || !quotes.contains(first) {
        return s;
    }

    let width = first.len_utf8();
    if s.len() > 2 * width {
        &s[width..s.len() - width]
    } else {
        ""
    }
}

fn read_word_quoted(text: &str, quotes: &str, cursor: usize) -> Option<(String, usize)> {
    let mut pos = cursor;
    let mut res = String::new();
    let mut skipchars = 0;

    let mut chars = text.get(cursor..)?.chars();
    let quote = chars.next();
    if quote.is_none() {
        return None;
    }
    let quote = quote.unwrap();
    if !quotes.contains(quote) {
        return None;
    }

    res.push(quote);
    pos += quote.len_utf8();

    for c in chars {
        if skipchars > 0 {
            skipchars -= 1;
            pos += c.len_utf8();
            continue;
        }
        if c == quote {
            res.push(c);
            pos += quote.len_utf8();
            return Some((res, pos));
        }

        if c == 'U' {
            if let Some('+') = text.get(pos + 1..).unwrap().chars().next() {
                if let Some(hex) = text.get(pos + 2..pos + 6) {
                    if let Some(c) = hex_to_char(hex) {
                        res.push(c);
                        skipchars += 5;
                    } else {
                        res.push('U');
                    }
                } else {
                    res.push('U');
                }
            } else {
                res.push('U');
            }
        } else if c == '\\' {
            match text.get(pos + 1..).unwrap().chars().next() {
                None => res.push(c),
                Some('b') => res.push('\x08'),
                Some('f') => res.push('\x0c'),
                Some('n') => res.push('\x0a'),
                Some('r') => res.push('\x0d'),
                Some('t') => res.push('\x09'),
                Some('v') => res.push('\x0b'),
                Some('u') => {
                    if let Some(hex) = text.get(pos + 2..pos + 6) {
                        if let Some(c) = hex_to_char(hex) {
                            res.push(c);
                            skipchars += 4;
                        } else {
                            res.push('u')
                        }
                    } else {
                        res.push('u')
                    }
                }
                Some(c @ _) => res.push(c),
            }
            skipchars += 1;
        } else {
            res.push(c);
        }

        pos += c.len_utf8();
    }

    None
}

fn read_word_unquoted(text: &str, cursor: usize) -> Option<(String, usize)> {
    let mut pos = cursor;
    let mut res = String::new();
    let mut prev = None;
    let mut skipchars = 0;

    for c in text.get(cursor..)?.chars() {
        if skipchars > 0 {
            skipchars -= 1;
            pos += c.len_utf8();
            prev = Some(c);
            continue;
        }
        if unicode_whitespace(c) {
            break;
        }
        if let Some(p) = prev {
            if is_hard_boundary(p, c) {
                break;
            }
        }

        if c == 'U' {
            if let Some('+') = text.get(pos + 1..).unwrap().chars().next() {
                if let Some(hex) = text.get(pos + 2..pos + 6) {
                    if let Some(c) = hex_to_char(hex) {
                        res.push(c);
                        skipchars += 5;
                    } else {
                        res.push('U');
                    }
                } else {
                    res.push('U');
                }
            } else {
                res.push('U');
            }
        } else if c == '\\' {
            match text.get(pos + 1..).unwrap().chars().next() {
                None => res.push(c),
                Some('b') => res.push('\x08'),
                Some('f') => res.push('\x0c'),
                Some('n') => res.push('\x0a'),
                Some('r') => res.push('\x0d'),
                Some('t') => res.push('\x09'),
                Some('v') => res.push('\x0b'),
                Some('u') => {
                    if let Some(hex) = text.get(pos + 2..pos + 6) {
                        if let Some(c) = hex_to_char(hex) {
                            res.push(c);
                            skipchars += 4;
                        } else {
                            res.push('u')
                        }
                    } else {
                        res.push('u')
                    }
                }
                Some(c @ _) => res.push(c),
            }
            skipchars += 1;
        } else {
            res.push(c);
        }

        pos += c.len_utf8();
        prev = Some(c);
    }

    if pos != cursor {
        Some((res, pos))
    } else {
        None
    }
}

/// Returns the number of leading characters (not bytes) that `s1` and `s2` have in common.
fn common_length(s1: &str, s2: &str) -> usize {
    // Count the matching pairs directly; there is no need to materialise them first.
    s1.chars()
        .zip(s2.chars())
        .take_while(|(c1, c2)| c1 == c2)
        .count()
}

/// The pieces of a `name[.modifier]<separator>value` pair, as produced by
/// `Lexer::decompose_pair`.
#[derive(Debug, PartialEq)]
pub struct DecomposedPair {
    /// Text before the separator, or before the first `.` when one precedes the separator.
    name: String,
    /// Text between that `.` and the separator; empty when there is no such `.`.
    modifier: String,
    /// The separator itself: `::`, `:=`, `:` or `=`.
    separator: String,
    /// Everything after the separator (possibly empty).
    value: String,
}

impl Lexer {
    /// Creates a lexer over `text`, positioned at the start and with no attributes
    /// registered.
    pub fn new<S: Into<String>>(text: S) -> Lexer {
        let text: String = text.into();
        Lexer {
            eos: text.len(),
            cursor: 0,
            attributes: Vec::new(),
            text,
        }
    }

    /// Registers an attribute name for `is_dom` to recognise.
    pub fn add_attribute<S: Into<String>>(&mut self, attribute: S) {
        let name: String = attribute.into();
        self.attributes.push(name);
    }

    /// Tokenizes `text` and returns only the token strings, discarding the type of each
    /// token.
    pub fn split<S: Into<String>>(text: S) -> Vec<String> {
        // The type half of each pair is deliberately ignored; `_` avoids an unused-variable
        // warning.
        Lexer::new(text)
            .into_iter()
            .map(|(token, _)| token)
            .collect()
    }

    /// Skips leading whitespace and returns the next token with its type, or `None` once
    /// the input is exhausted or no recognizer matches.
    pub fn token(&mut self) -> Option<(String, Type)> {
        // Eat whitespace
        let leading: usize = self.text[self.cursor..]
            .chars()
            .take_while(|&c| unicode_whitespace(c))
            .map(char::len_utf8)
            .sum();
        self.cursor += leading;

        if self.cursor == self.eos {
            return None;
        }

        // The sequence is specific, and must follow these rules:
        //   - date < duration < uuid < identifier
        //   - dom < uuid
        //   - uuid < hex < number
        //   - url < pair < identifier
        //   - hex < number
        //   - separator < tag < operator
        //   - path < substitution < pattern
        //   - set < number
        //   - word last
        //
        // Each recognizer runs only if all the ones before it declined.
        self.is_string("\"'")
            .or_else(|| self.is_date())
            .or_else(|| self.is_duration())
            .or_else(|| self.is_url())
            .or_else(|| self.is_pair())
            .or_else(|| self.is_uuid(true))
            .or_else(|| self.is_set())
            .or_else(|| self.is_dom())
            .or_else(|| self.is_hexnumber())
            .or_else(|| self.is_number())
            .or_else(|| self.is_separator())
            .or_else(|| self.is_tag())
            .or_else(|| self.is_path())
            .or_else(|| self.is_substitution())
            .or_else(|| self.is_pattern())
            .or_else(|| self.is_operator())
            .or_else(|| self.is_identifier())
            .or_else(|| self.is_word())
    }

    /// Splits `text` of the form `name[.modifier]<separator>value` into its parts.
    ///
    /// The separator is whichever of `::`, `:=`, `:` and `=` occurs first in `text`; where
    /// a one-character and a two-character form start at the same position, the
    /// two-character form wins. A `.` only introduces a modifier when it appears before the
    /// separator. Returns `None` when `text` contains no separator.
    pub fn decompose_pair(text: &str) -> Option<DecomposedPair> {
        // Listed in priority order: `min_by_key` keeps the first of several equal minima,
        // so `::` / `:=` take precedence over the `:` they overlap with.
        const SEPARATORS: [&str; 4] = ["::", ":=", ":", "="];

        let (sep, sep_end) = SEPARATORS
            .iter()
            .filter_map(|&pat| text.find(pat).map(|at| (at, at + pat.len())))
            .min_by_key(|&(at, _)| at)?;

        let (name, modifier) = match text.find('.') {
            Some(dot) if dot < sep => (text[..dot].to_string(), text[dot + 1..sep].to_string()),
            _ => (text[..sep].to_string(), String::new()),
        };

        Some(DecomposedPair {
            name,
            modifier,
            separator: text[sep..sep_end].to_string(),
            value: text[sep_end..].to_string(),
        })
    }

    // recognizers for the `token` method

    /// Recognizes a string delimited by one of `quotes` at the cursor; on success the
    /// cursor moves past the closing quote.
    fn is_string(&mut self, quotes: &str) -> Option<(String, Type)> {
        let (word, end) = read_word_quoted(&self.text, quotes, self.cursor)?;
        self.cursor = end;
        Some((word, Type::String))
    }

    /// Recognizes a date at the cursor by delegating to `DateTime::parse`; on success the
    /// cursor advances by the number of bytes the parser reports as read.
    fn is_date(&mut self) -> Option<(String, Type)> {
        let begin = self.cursor;
        let (_, consumed) = DateTime::parse(&self.text[begin..], "")?;
        self.cursor = begin + consumed;
        Some((self.text[begin..self.cursor].into(), Type::Date))
    }

    /// Recognizes a duration at the cursor by delegating to `Duration::parse`.
    ///
    /// Text that `is_operator` accepts at this position is never treated as a duration.
    fn is_duration(&mut self) -> Option<(String, Type)> {
        let checkpoint = self.cursor;

        // Only probing: undo whatever the operator recognizer consumed.
        if self.is_operator().is_some() {
            self.cursor = checkpoint;
            return None;
        }

        let begin = self.cursor;
        let (_, consumed) = Duration::parse(&self.text[begin..], "")?;
        self.cursor = begin + consumed;
        Some((self.text[begin..self.cursor].into(), Type::Duration))
    }

    /// Recognizes a URL at the cursor: text starting with `https://` or `http://`,
    /// extending to the next whitespace character or the end of the input.
    fn is_url(&mut self) -> Option<(String, Type)> {
        let rest = &self.text[self.cursor..];
        if !(rest.starts_with("https://") || rest.starts_with("http://")) {
            return None;
        }

        match rest.find(unicode_whitespace) {
            Some(len) => {
                let token = rest[..len].into();
                self.cursor += len;
                Some((token, Type::URL))
            }
            None => {
                // No whitespace left: the URL runs to the end of the input.
                let token = rest.into();
                self.cursor = self.eos;
                Some((token, Type::URL))
            }
        }
    }

    /// Recognizes a pair at the cursor: an identifier, one of the separators `::`, `:=`,
    /// `:` or `=`, and then a quoted string, an unquoted word, or nothing at all.
    ///
    /// The returned token is the raw identifier and separator followed by the decoded
    /// value. The cursor is restored when no pair is found.
    fn is_pair(&mut self) -> Option<(String, Type)> {
        let start = self.cursor;

        if self.is_identifier().is_none() {
            self.cursor = start;
            return None;
        }

        let separator_len = {
            let rest = &self.text[self.cursor..];
            if rest.starts_with("::") || rest.starts_with(":=") {
                2
            } else if rest.starts_with(":") || rest.starts_with("=") {
                1
            } else {
                self.cursor = start;
                return None;
            }
        };
        self.cursor += separator_len;

        // String, word, or nothing are all valid
        let value_start = self.cursor;
        let value = read_word_quoted(&self.text, "'\"", value_start)
            .or_else(|| read_word_unquoted(&self.text, value_start));
        if let Some((word, end)) = value {
            self.cursor = end;
            return Some((
                format!("{}{}", &self.text[start..value_start], word),
                Type::Pair,
            ));
        }

        if self.cursor == self.eos
            || unicode_whitespace(self.text[self.cursor..].chars().next().unwrap())
        {
            return Some((self.text[start..self.cursor].into(), Type::Pair));
        }

        self.cursor = start;
        None
    }

    /// Recognizes a full or abbreviated UUID at the cursor.
    ///
    /// Matching is greedy against `UUID_PATTERN` and needs at least `UUID_MIN_LENGTH`
    /// characters. When `end_boundary` is set, the match must be followed by the end of the
    /// input, whitespace, or a single-character operator.
    fn is_uuid(&mut self, end_boundary: bool) -> Option<(String, Type)> {
        // Every accepted character is ASCII, so this count is also a byte length.
        let matched = self.text[self.cursor..]
            .chars()
            .zip(UUID_PATTERN.iter())
            .take_while(|&(c, &slot)| {
                if slot == b'x' {
                    is_unicode_hex_digit(c)
                } else {
                    c == '-'
                }
            })
            .count();

        if matched < UUID_MIN_LENGTH {
            return None;
        }

        let end = self.cursor + matched;
        if end_boundary {
            if let Some(next) = self.text[end..].chars().next() {
                if !(unicode_whitespace(next) || is_single_char_operator(next)) {
                    return None;
                }
            }
        }

        let token = self.text[self.cursor..end].into();
        self.cursor = end;
        Some((token, Type::Uuid))
    }

    /// Recognizes a set at the cursor: comma-separated items, each an integer or an `a-b`
    /// range, containing at least two integers in total (`1,2`, `1-3`, `1,3-5`, ...).
    ///
    /// The set must be followed by the end of the input, whitespace, or a hard boundary.
    /// The cursor is restored when no set is found.
    fn is_set(&mut self) -> Option<(String, Type)> {
        let start = self.cursor;
        let mut integers = 0;

        loop {
            if self.is_integer().is_none() {
                self.cursor = start;
                return None;
            }
            integers += 1;

            // Optional upper end of a range.
            if self.is_literal("-", false, false) {
                if self.is_integer().is_none() {
                    self.cursor = start;
                    return None;
                }
                integers += 1;
            }

            if !self.is_literal(",", false, false) {
                break;
            }
        }

        if integers <= 1 {
            self.cursor = start;
            return None;
        }

        // -1 is OK here since integers are ASCII
        let previous = self.text[self.cursor - 1..].chars().next().unwrap();

        // look ahead a bit
        if let Some(next) = self.text[self.cursor..].chars().next() {
            if !(unicode_whitespace(next) || is_hard_boundary(previous, next)) {
                self.cursor = start;
                return None;
            }
        }

        Some((self.text[start..self.cursor].into(), Type::Set))
    }

    /// Recognizes a DOM reference at the cursor.
    ///
    /// The forms are tried in this order:
    ///   - `rc.<word>`
    ///   - one of the fixed `tw.*`, `context.*` and `system.*` literals
    ///   - an optional `<uuid>.` or `<id>.` prefix followed by one of:
    ///       - `tags.<word>`
    ///       - a registered attribute name
    ///       - a registered attribute named `date`, a `.`, and a date sub-element
    ///       - `annotations.count`, `annotations.<n>.description`,
    ///         `annotations.<n>.entry` or `annotations.<n>.entry.<date sub-element>`
    ///
    /// On success the token spans from the original cursor position to the new one; on
    /// failure the cursor is restored and `None` is returned.
    ///
    /// NOTE(review): `is_literal`, `is_one_of`, `is_integer` and `is_word` are defined
    /// outside the reviewed range. Going by the "(bounded)" / "(unbounded)" comments below,
    /// their last boolean argument presumably requests an end-boundary check -- confirm.
    fn is_dom(&mut self) -> Option<(String, Type)> {
        // Start of the whole reference, including any `<uuid>.` / `<id>.` prefix.
        let marker = self.cursor;

        // rc. ...
        if self.is_literal("rc.", false, false) && self.is_word().is_some() {
            return Some((self.text[marker..self.cursor].into(), Type::DOM));
        } else {
            self.cursor = marker;
        }

        // Literals
        if self.is_one_of(
            &vec![
                "tw.syncneeded",
                "tw.program",
                "tw.args",
                "tw.width",
                "tw.height",
                "tw.version",
                "context.program",
                "context.args",
                "context.width",
                "context.height",
                "system.version",
                "system.os",
            ],
            false,
            true,
        ) {
            return Some((self.text[marker..self.cursor].into(), Type::DOM));
        }

        // Optional:
        //   <uuid>.
        //   <id>.
        // (a uuid or id that is not followed by `.` is not a DOM reference at all)
        if self.is_uuid(false).is_some() || self.is_integer().is_some() {
            if !self.is_literal(".", false, false) {
                self.cursor = marker;
                return None;
            }
        }

        // Any failure after this line should rollback to the checkpoint.
        // (the checkpoint sits just past the optional prefix)
        let checkpoint = self.cursor;

        // [prefix]tags.<word>
        if self.is_literal("tags", false, false)
            && self.is_literal(".", false, false)
            && self.is_word().is_some()
        {
            return Some((self.text[marker..self.cursor].into(), Type::DOM));
        } else {
            self.cursor = checkpoint;
        }

        // [prefix]attribute (bounded)
        // (have to clone here to avoid double-borrowing self)
        let attributes = self.attributes.clone();
        if self.is_one_of(&attributes, false, true) {
            return Some((self.text[marker..self.cursor].into(), Type::DOM));
        }

        // [prefix]attribute. (unbounded)
        if self.is_one_of(&attributes, false, false) {
            if self.is_literal(".", false, false) {
                // The attribute name: from the checkpoint up to, but excluding, the `.`.
                let attribute = &self.text[checkpoint..self.cursor - 1];
                // if attribute type is 'date', then it has sub-elements.
                // NOTE(review): this compares the attribute *name* with "date"; no
                // attribute type information is available in this struct.
                if attribute == "date" && self.is_one_of(&DATE_SUBELEMENTS, false, true) {
                    return Some((self.text[marker..self.cursor].into(), Type::DOM));
                }
                self.cursor = checkpoint;
            }
            // Lookahead: !<alpha>
            // NOTE(review): this inspects the character at `marker` (the start of the
            // reference), not the one at the current cursor -- confirm this is intended.
            else if !self.text[marker..]
                .chars()
                .next()
                .map_or(false, |c| unicode_latin_alpha(c))
            {
                return Some((self.text[marker..self.cursor].into(), Type::DOM));
            }
            self.cursor = checkpoint;
        }

        // [prefix]annotations.
        if self.is_literal("annotations", true, false) && self.is_literal(".", false, false) {
            // annotations.count
            if self.is_literal("count", false, false) {
                return Some((self.text[marker..self.cursor].into(), Type::DOM));
            }

            // annotations.<n>.description | annotations.<n>.entry[.<date sub-element>]
            if self.is_integer().is_some() {
                if self.is_literal(".", false, false) {
                    if self.is_literal("description", false, true) {
                        return Some((self.text[marker..self.cursor].into(), Type::DOM));
                    } else if self.is_literal("entry", false, true) {
                        return Some((self.text[marker..self.cursor].into(), Type::DOM));
                    } else if self.is_literal("entry", false, false)
                        && self.is_literal(".", false, false)
                        && self.is_one_of(&DATE_SUBELEMENTS, false, true)
                    {
                        return Some((self.text[marker..self.cursor].into(), Type::DOM));
                    }
                }
            } else {
                self.cursor = checkpoint;
            }
        }

        // Nothing matched: undo everything, including the optional prefix.
        self.cursor = marker;
        None
    }

    /// Recognizes a hexadecimal number at the cursor: the prefix `0x` followed by at least
    /// one hex digit. The token covers the prefix and every hex digit that follows it.
    fn is_hexnumber(&mut self) -> Option<(String, Type)> {
        let rest = &self.text[self.cursor..];
        if !rest.starts_with("0x") {
            return None;
        }

        // Hex digits are single-byte ASCII, so the character count is also a byte count.
        let digits = rest[2..]
            .chars()
            .take_while(|&c| is_unicode_hex_digit(c))
            .count();
        if digits == 0 {
            return None;
        }

        let len = 2 + digits;
        self.cursor += len;
        Some((rest[..len].into(), Type::Hex))
    }

    /// Recognizes a decimal number at the cursor, optionally with a fractional part and an
    /// exponent, using a hand-written state machine.
    ///
    /// The text must start with an ASCII digit, and the number must be followed by the end
    /// of the input, whitespace, or a single-character operator; otherwise `None` is
    /// returned and the cursor is left untouched.
    ///
    /// NOTE(review): every state after the first simply stops scanning on an unexpected
    /// character, so partial forms such as a trailing `.` or a bare `e` appear to be
    /// accepted as long as the lookahead check passes -- confirm this is intended.
    fn is_number(&mut self) -> Option<(String, Type)> {
        let remainder = &self.text[self.cursor..];
        let mut chars = remainder.char_indices().peekable();
        // Byte offset within `remainder` of the end of the text accepted so far.
        let mut marker = 0;

        // A hand-rolled regexp.  States are as follows:
        //   \d \d* (. \d \d*)? ([eE] [+-]? \d \d* (.  \d \d*)?)?
        // 0 1  2    3 4  5      6    7     8  9    10 11 12
        let mut state = 0;

        loop {
            // Peek rather than consume, so that the character that ends the number is
            // still available to the lookahead below.
            let c = match chars.peek() {
                Some((i, c)) => {
                    marker = *i;
                    Some(*c)
                }
                None => None,
            };
            match (state, c) {
                (0, Some(c)) if unicode_latin_digit(c) => state = 1,

                (1, Some(c)) if unicode_latin_digit(c) => state = 2,
                (1, Some(c)) if c == '.' => state = 3,
                (1, Some(c)) if c == 'e' || c == 'E' => state = 6,
                (1, _) => break,

                (2, Some(c)) if unicode_latin_digit(c) => state = 2,
                (2, Some(c)) if c == '.' => state = 3,
                (2, Some(c)) if c == 'e' || c == 'E' => state = 6,
                (2, _) => break,

                (3, Some(c)) if unicode_latin_digit(c) => state = 4,
                (3, Some(c)) if c == 'e' || c == 'E' => state = 6,
                (3, _) => break,

                (4, Some(c)) if unicode_latin_digit(c) => state = 5,
                (4, Some(c)) if c == 'e' || c == 'E' => state = 6,
                (4, _) => break,

                (5, Some(c)) if unicode_latin_digit(c) => state = 5,
                (5, Some(c)) if c == 'e' || c == 'E' => state = 6,
                (5, _) => break,

                (6, Some(c)) if unicode_latin_digit(c) => state = 8,
                (6, Some(c)) if c == '-' || c == '+' => state = 7,
                (6, _) => break,

                (7, Some(c)) if unicode_latin_digit(c) => state = 8,
                (7, _) => break,

                (8, Some(c)) if unicode_latin_digit(c) => state = 9,
                (8, Some(c)) if c == '.' => state = 10,
                (8, _) => break,

                (9, Some(c)) if unicode_latin_digit(c) => state = 9,
                (9, Some(c)) if c == '.' => state = 10,
                (9, _) => break,

                (10, Some(c)) if unicode_latin_digit(c) => state = 11,
                (10, _) => break,

                (11, Some(c)) if unicode_latin_digit(c) => state = 11,
                (11, _) => break,

                // Only reachable from state 0: the text does not start with a digit.
                _ => return None,
            };
            // The peeked character was accepted: consume it and move the end marker past it.
            if let Some((i, c)) = chars.next() {
                marker = i + c.len_utf8();
            }
        }
        // lookahead
        if let Some((_, c)) = chars.peek() {
            if !unicode_whitespace(*c) && !is_single_char_operator(*c) {
                return None;
            }
        }
        self.cursor += marker;
        Some((remainder[..marker].into(), Type::Number))
    }

    /// Recognizes the separator `--` at the cursor.
    fn is_separator(&mut self) -> Option<(String, Type)> {
        if !self.text.get(self.cursor..)?.starts_with("--") {
            return None;
        }
        self.cursor += 2;
        Some(("--".into(), Type::Separator))
    }

    /// Recognizes a tag at the cursor: `+` or `-` immediately followed by an identifier.
    ///
    /// A tag is only accepted at the very start of the text or directly after whitespace,
    /// `(` or `)`. On success the cursor moves past the tag; otherwise it is left untouched.
    fn is_tag(&mut self) -> Option<(String, Type)> {
        // Lookbehind: Assert ^ or preceded by whitespace, (, or ).
        // Decode the previous *character* rather than peeking at the previous byte, so that
        // multi-byte whitespace (e.g. U+3000) in front of the tag is honoured as well.
        if let Some(previous) = self.text[..self.cursor].chars().next_back() {
            if !unicode_whitespace(previous) && previous != '(' && previous != ')' {
                return None;
            }
        }

        let mut chars = self.text[self.cursor..].chars();

        let sign = chars.next()?;
        if sign != '+' && sign != '-' {
            return None;
        }

        let first = chars.next()?;
        if !is_identifier_start(first) {
            return None;
        }

        // Extend over the rest of the identifier, tracking the end as a byte offset.
        let mut marker = self.cursor + sign.len_utf8() + first.len_utf8();
        for c in chars {
            if !is_identifier_next(c) {
                break;
            }
            marker += c.len_utf8();
        }

        let token = self.text[self.cursor..marker].into();
        self.cursor = marker;
        Some((token, Type::Tag))
    }

    /// Recognizes a path at the cursor: repeated `/segment` groups, where a segment is one
    /// or more characters that are neither whitespace nor `/`. A final `/` without a
    /// segment is included in the token.
    ///
    /// The text only counts as a path when it contains more than three slashes. On success
    /// the cursor moves past the path; otherwise it is left untouched.
    fn is_path(&mut self) -> Option<(String, Type)> {
        // Byte offset of the end of the path recognized so far.
        let mut marker = self.cursor;
        let mut slash_count = 0;
        let mut chars = self.text[self.cursor..].chars().peekable();

        loop {
            // Each group starts with a slash ...
            if let Some('/') = chars.next() {
                marker += 1;
                slash_count += 1;
            } else {
                break;
            }

            // ... followed by at least one segment character.
            match chars.next() {
                Some(c) if !unicode_whitespace(c) && c != '/' => {
                    // Advance by the encoded width: `marker` is a byte offset, and segment
                    // characters may be multi-byte.
                    marker += c.len_utf8();
                    while let Some(&c) = chars.peek() {
                        if unicode_whitespace(c) || c == '/' {
                            break;
                        }
                        marker += c.len_utf8();
                        chars.next();
                    }
                }
                _ => break,
            }
        }

        if marker > self.cursor && slash_count > 3 {
            let token = self.text[self.cursor..marker].into();
            self.cursor = marker;
            return Some((token, Type::Path));
        }

        None
    }

    /// Tries to lex a substitution of the form `/from/to/` or `/from/to/g` at the cursor.
    ///
    /// The substitution must be followed by whitespace or the end of the text.  On success the
    /// cursor is moved past the substitution (including the optional `g` flag) and the raw
    /// text is returned with `Type::Substitution`; otherwise the cursor is left untouched and
    /// `None` is returned.
    fn is_substitution(&mut self) -> Option<(String, Type)> {
        let start = self.cursor;

        // first `/`-delimited word: `/from/`
        let (_, middle) = read_word_quoted(&self.text, "/", start)?;
        // second `/`-delimited word: `/to/`, starting on the `/` that closed the first one
        let (_, mut end) = read_word_quoted(&self.text, "/", middle - 1)?;

        // optional trailing `g` flag
        if self.text[end..].starts_with('g') {
            end += 1;
        }

        // lookahead: only end-of-text or whitespace may follow
        match self.text[end..].chars().next() {
            Some(c) if !unicode_whitespace(c) => None,
            _ => {
                self.cursor = end;
                Some((self.text[start..end].into(), Type::Substitution))
            }
        }
    }

    /// Tries to lex a `/pattern/` at the cursor.
    ///
    /// The closing `/` must be followed by whitespace or sit at `eos`.  On success the cursor
    /// is moved past the pattern and the raw text is returned with `Type::Pattern`; otherwise
    /// the cursor is left untouched and `None` is returned.
    fn is_pattern(&mut self) -> Option<(String, Type)> {
        let start = self.cursor;
        let (_, end) = read_word_quoted(&self.text, "/", start)?;

        // NOTE(review): the unwrap relies on `eos` marking the end of `text`, so that a
        // character always exists when `end != eos` -- confirm against `Lexer::new`.
        let followed_by_boundary =
            end == self.eos || unicode_whitespace(self.text[end..].chars().next().unwrap());
        if !followed_by_boundary {
            return None;
        }

        self.cursor = end;
        Some((self.text[start..end].into(), Type::Pattern))
    }

    /// Tries to lex an operator at the cursor.
    ///
    /// Symbolic operators (and the `_hastag_`-style internal ones) match regardless of what
    /// follows them.  The word operators `and`, `xor` and `or` match only when they are
    /// followed by a boundary (as decided by `is_boundary`) or by the end of the text.  On
    /// success the cursor is moved past the operator and its text is returned with `Type::Op`;
    /// otherwise the cursor is left untouched and `None` is returned.
    fn is_operator(&mut self) -> Option<(String, Type)> {
        // Operators that do not require a boundary afterward.  Longer operators are listed
        // before their prefixes so that the first match is also the longest one.
        const UNBOUNDED_OPS: &[&str] = &[
            // custom stuff
            "_hastag_", "_notag_", "_neg_", "_pos_",
            // triple-char
            "!==",
            // double-char
            "==", "!=", "<=", ">=", "||", "&&", "!~",
            // single-char
            "+", "-", "*", "/", "(", ")", "<", ">", "^", "!", "%", "=", "~",
        ];
        // Operators that require a boundary afterward.
        const BOUNDED_OPS: &[&str] = &["and", "xor", "or"];

        let remainder = &self.text[self.cursor..];

        if let Some(op) = UNBOUNDED_OPS
            .iter()
            .find(|op| remainder.starts_with(**op))
        {
            self.cursor += op.len();
            return Some((String::from(*op), Type::Op));
        }

        for op in BOUNDED_OPS {
            if !remainder.starts_with(*op) {
                continue;
            }

            // The boundary is tested between the operator's last character and the
            // character that follows it; running out of text counts as a boundary.
            let bounded = self.cursor + op.len() == self.eos
                || match (op.chars().last(), remainder[op.len()..].chars().next()) {
                    (Some(last), Some(next)) => is_boundary(last, next),
                    _ => true,
                };

            if bounded {
                self.cursor += op.len();
                return Some((String::from(*op), Type::Op));
            }
        }

        None
    }

    /// Tries to lex an identifier at the cursor.
    ///
    /// An identifier is one character accepted by `is_identifier_start` followed by any number
    /// of characters accepted by `is_identifier_next`.  On success the cursor is moved past
    /// the identifier and its text is returned with `Type::Identifier`; if the first character
    /// does not qualify (or there is none), `None` is returned and the cursor is unchanged.
    fn is_identifier(&mut self) -> Option<(String, Type)> {
        let mut rest = self.text.get(self.cursor..)?.chars();

        let first = rest.next().filter(|&c| is_identifier_start(c))?;
        // byte length of the identifier: the first character plus every continuation character
        let len = first.len_utf8()
            + rest
                .take_while(|&c| is_identifier_next(c))
                .map(char::len_utf8)
                .sum::<usize>();

        let start = self.cursor;
        self.cursor += len;
        Some((self.text.get(start..self.cursor)?.into(), Type::Identifier))
    }

    /// Tries to lex a word at the cursor: the longest non-empty run of characters that are
    /// neither whitespace nor single-character operators.
    ///
    /// On success the cursor is moved past the word and its text is returned with
    /// `Type::Word`; if the run is empty, `None` is returned and the cursor is unchanged.
    fn is_word(&mut self) -> Option<(String, Type)> {
        let rest = &self.text[self.cursor..];

        // byte offset of the first terminating character, or the whole remainder if none
        let len = rest
            .find(|c: char| unicode_whitespace(c) || is_single_char_operator(c))
            .unwrap_or(rest.len());
        if len == 0 {
            return None;
        }

        let token = String::from(&rest[..len]);
        self.cursor += len;
        Some((token, Type::Word))
    }

    // utilities that may modify self

    /// Tries each of `options` in order with `is_literal`, stopping at the first that matches.
    ///
    /// Returns true if some option matched (in which case `is_literal` has already advanced
    /// the cursor), and false if none did.  `allow_abbreviations` and `end_boundary` are
    /// passed through to `is_literal` unchanged.
    fn is_one_of<S: AsRef<str>>(
        &mut self,
        options: &[S],
        allow_abbreviations: bool,
        end_boundary: bool,
    ) -> bool {
        options
            .iter()
            .any(|candidate| self.is_literal(candidate.as_ref(), allow_abbreviations, end_boundary))
    }

    /// Tries to match `literal` against the text at the cursor.
    ///
    /// * Without `allow_abbreviations`, the whole literal must be present.
    /// * With `allow_abbreviations`, a common prefix of at least `MINIMUM_MATCH_LEN` suffices.
    /// * With `end_boundary`, the character after the match (if there is one) must be
    ///   whitespace or a single-character operator.
    ///
    /// Returns true and advances the cursor by the length of the common prefix on a match;
    /// returns false and leaves the cursor unchanged otherwise.
    fn is_literal(&mut self, literal: &str, allow_abbreviations: bool, end_boundary: bool) -> bool {
        // how much of the literal the upcoming text shares with it
        let matched = common_length(literal, &self.text[self.cursor..]);

        // the shortest match that is acceptable in the requested mode
        let required = if allow_abbreviations {
            MINIMUM_MATCH_LEN
        } else {
            literal.len()
        };
        if matched < required {
            return false;
        }

        if end_boundary {
            // end of text is always an acceptable boundary
            if let Some(next) = self.text[self.cursor + matched..].chars().next() {
                if !(unicode_whitespace(next) || is_single_char_operator(next)) {
                    return false;
                }
            }
        }

        self.cursor += matched;
        true
    }

    /// Tries to lex a non-empty run of characters accepted by `unicode_latin_digit` at the
    /// cursor.
    ///
    /// On success the cursor is moved past the digits and they are returned with
    /// `Type::Number`; if there are no digits, `None` is returned and the cursor is unchanged.
    fn is_integer(&mut self) -> Option<(String, Type)> {
        let digits_len: usize = self.text[self.cursor..]
            .chars()
            .take_while(|&c| unicode_latin_digit(c))
            .map(char::len_utf8)
            .sum();
        if digits_len == 0 {
            return None;
        }

        let end = self.cursor + digits_len;
        let token = String::from(&self.text[self.cursor..end]);
        self.cursor = end;
        Some((token, Type::Number))
    }
}

/// Owning wrapper around a `Lexer`, used as the iterator type when a lexer is consumed with
/// `into_iter()`.
struct LexerIterator(Lexer);

impl Iterator for LexerIterator {
    type Item = (String, Type);

    fn next(&mut self) -> Option<Self::Item> {
        self.0.token()
    }
}

impl IntoIterator for Lexer {
    type Item = <LexerIterator as Iterator>::Item;
    type IntoIter = LexerIterator;

    /// Consumes the lexer, wrapping it in a `LexerIterator`.
    fn into_iter(self) -> LexerIterator {
        LexerIterator(self)
    }
}

#[cfg(test)]
mod test {
    use super::*;
    const NONE: Option<(String, Type)> = None;

    #[test]
    fn test_is_punctuation_comma() {
        assert!(is_punctuation(','));
    }

    #[test]
    fn test_is_punctuation_slash() {
        assert!(is_punctuation('/'));
    }

    #[test]
    fn test_is_punctuation_at() {
        assert!(!is_punctuation('@'));
    }

    #[test]
    fn test_is_punctuation_hash() {
        assert!(!is_punctuation('#'));
    }

    #[test]
    fn test_is_punctuation_dollar() {
        assert!(!is_punctuation('$'));
    }

    #[test]
    fn test_is_punctuation_underscore() {
        assert!(!is_punctuation('_'));
    }

    #[test]
    fn test_is_punctuation_space() {
        assert!(!is_punctuation(' '));
    }

    #[test]
    fn test_is_punctuation_a() {
        assert!(!is_punctuation('a'));
    }

    #[test]
    fn test_is_punctuation_9() {
        assert!(!is_punctuation('9'));
    }

    #[test]
    fn test_is_punctuation_latin() {
        assert!(!is_punctuation('é'));
    }

    #[test]
    fn test_is_punctuation_euro() {
        assert!(!is_punctuation('€'));
    }

    #[test]
    fn test_is_punctuation_smile() {
        assert!(!is_punctuation('☺'));
    }

    #[test]
    fn test_is_punctuation_numeric() {
        assert!(!is_punctuation('¾'));
    }

    #[test]
    fn test_is_boundary() {
        assert!(is_boundary(' ', 'a'));
        assert!(is_boundary('a', ' '));
        assert!(is_boundary(' ', '+'));
        assert!(is_boundary(' ', ','));
        assert!(!is_boundary('3', '4'));
        assert!(is_boundary('(', '('));
        assert!(!is_boundary('r', 'd'));
    }

    #[test]
    fn test_was_quoted() {
        assert!(!was_quoted(""));
        assert!(!was_quoted("foo"));
        assert!(was_quoted("a b"));
        assert!(was_quoted("(a)"));
    }

    #[test]
    fn test_dequote() {
        assert_eq!(dequote("foo", "'\""), "foo");
        assert_eq!(dequote("'foo'", "'\""), "foo");
        assert_eq!(dequote("\"foo\"", "'\""), "foo");
        assert_eq!(dequote("'o\\'clock'", "'\""), "o\\'clock");
        // single quote char
        assert_eq!(dequote("'", "'\""), "");
        // multibyte quote char
        assert_eq!(dequote("éo\\'clocké", "é"), "o\\'clock");
    }

    #[test]
    fn test_token_empty() {
        let mut l = Lexer::new("");
        assert_eq!(l.token(), NONE);
    }

    #[test]
    fn test_token_tokens() {
        let mut l = Lexer::new(
            " one 'two \\'three\\''+456-(1.3*2 - 0x12) 1.2e-3.4    foo.bar and '\\u20ac'",
        );
        assert_eq!(l.token(), Some((String::from("one"), Type::Identifier)));
        assert_eq!(
            l.token(),
            Some((String::from("'two 'three''"), Type::String))
        );
        assert_eq!(l.token(), Some((String::from("+"), Type::Op)));
        assert_eq!(l.token(), Some((String::from("456"), Type::Number)));
        assert_eq!(l.token(), Some((String::from("-"), Type::Op)));
        assert_eq!(l.token(), Some((String::from("("), Type::Op)));
        assert_eq!(l.token(), Some((String::from("1.3"), Type::Number)));
        assert_eq!(l.token(), Some((String::from("*"), Type::Op)));
        assert_eq!(l.token(), Some((String::from("2"), Type::Number)));
        assert_eq!(l.token(), Some((String::from("-"), Type::Op)));
        assert_eq!(l.token(), Some((String::from("0x12"), Type::Hex)));
        assert_eq!(l.token(), Some((String::from(")"), Type::Op)));
        assert_eq!(l.token(), Some((String::from("1.2e-3.4"), Type::Number)));
        assert_eq!(l.token(), Some((String::from("foo.bar"), Type::Identifier)));
        assert_eq!(l.token(), Some((String::from("and"), Type::Op)));
        assert_eq!(l.token(), Some((String::from("'€'"), Type::String)));
        assert_eq!(l.token(), None);
    }

    #[test]
    fn test_token_short_numbers() {
        let mut l = Lexer::new("1 12 123 1234 12345 123456 1234567 123.45e 12.34e+");
        assert_eq!(l.token(), Some((String::from("1"), Type::Number)));
        assert_eq!(l.token(), Some((String::from("12"), Type::Number)));
        assert_eq!(l.token(), Some((String::from("123"), Type::Number)));
        assert_eq!(l.token(), Some((String::from("1234"), Type::Number)));
        assert_eq!(l.token(), Some((String::from("12345"), Type::Number)));
        assert_eq!(l.token(), Some((String::from("123456"), Type::Number)));
        assert_eq!(l.token(), Some((String::from("1234567"), Type::Number)));
        assert_eq!(l.token(), Some((String::from("123.45e"), Type::Number)));
        assert_eq!(l.token(), Some((String::from("12.34e+"), Type::Number)));
        assert_eq!(l.token(), None);
    }

    #[test]
    fn test_read_word_quoted_simple() {
        assert_eq!(
            read_word_quoted("'one two'", "'\"", 0),
            Some((String::from("'one two'"), 9))
        );
    }

    #[test]
    fn test_read_word_quoted_unterminated() {
        assert_eq!(
            read_word_quoted("'one two", "'\"", 0),
            None as Option<(String, usize)>
        );
    }

    #[test]
    fn test_read_word_quoted_backslash_u() {
        assert_eq!(
            read_word_quoted("'pay \\u20a43'", "'\"", 0),
            Some((String::from("'pay ₤3'"), 13))
        );
    }

    #[test]
    fn test_read_word_quoted_u_plus() {
        assert_eq!(
            read_word_quoted("\"pay U+20AC5\"", "'\"", 0),
            Some((String::from("\"pay €5\""), 13))
        );
    }

    #[test]
    fn test_read_word_unquoted_simple() {
        assert_eq!(
            read_word_unquoted("input", 0),
            Some((String::from("input"), 5))
        );
    }

    #[test]
    fn test_read_word_unquoted_escaped_space() {
        assert_eq!(
            read_word_unquoted("one\\ two", 0),
            Some((String::from("one two"), 8))
        );
    }

    #[test]
    fn test_read_word_unquoted_escaped_quote() {
        assert_eq!(
            read_word_unquoted("one\\\"two", 0),
            Some((String::from("one\"two"), 8))
        );
    }

    #[test]
    fn test_read_word_unquoted_escaped_newline() {
        assert_eq!(
            read_word_unquoted("one\\ntwo", 0),
            Some((String::from("one\x0atwo"), 8))
        );
    }

    #[test]
    fn test_read_word_unquoted_escaped_backslash_u() {
        assert_eq!(
            read_word_unquoted("pay\\u20a43", 0),
            Some((String::from("pay₤3"), 10))
        );
    }

    #[test]
    fn test_read_word_unquoted_incomplete_escaped_backslash_u() {
        assert_eq!(
            read_word_unquoted("\\u203", 0),
            Some((String::from("u203"), 5))
        );
    }

    #[test]
    fn test_read_word_unquoted_nonhex_escaped_backslash_u() {
        assert_eq!(
            read_word_unquoted("\\u2fghk", 0),
            Some((String::from("u2fghk"), 7))
        );
    }

    #[test]
    fn test_read_word_unquoted_escaped_u_plus() {
        assert_eq!(
            read_word_unquoted("payU+20AC4", 0),
            Some((String::from("pay€4"), 10))
        );
    }

    #[test]
    fn test_read_word_unquoted_incomplete_u_plus() {
        assert_eq!(
            read_word_unquoted("U+20A", 0),
            Some((String::from("U+20A"), 5))
        );
    }

    #[test]
    fn test_read_word_trailing_whitespace() {
        assert_eq!(
            read_word_unquoted("one      ", 0),
            Some((String::from("one"), 3))
        );
    }

    #[test]
    fn test_read_word_unquoted_several_words() {
        let text = "one 'two' three\\ four";
        assert_eq!(read_word_unquoted(text, 0), Some((String::from("one"), 3)));
        assert_eq!(
            read_word_unquoted(text, 4),
            Some((String::from("'two'"), 9))
        );
        assert_eq!(
            read_word_unquoted(text, 10),
            Some((String::from("three four"), 21))
        );
    }

    #[test]
    fn test_common_length_empty() {
        assert_eq!(common_length("", ""), 0);
    }

    #[test]
    fn test_common_length_match_one() {
        assert_eq!(common_length("a", "a"), 1);
    }

    #[test]
    fn test_common_length_match_longer() {
        assert_eq!(common_length("abcde", "abcde"), 5);
    }

    #[test]
    fn test_common_length_match_s2_short() {
        assert_eq!(common_length("abc", ""), 0);
    }

    #[test]
    fn test_common_length_match_differ() {
        assert_eq!(common_length("abc", "def"), 0);
    }

    #[test]
    fn test_common_length_match_s2_prefix() {
        assert_eq!(common_length("foobar", "foo"), 3);
    }

    #[test]
    fn test_common_length_match_s1_prefix() {
        assert_eq!(common_length("foo", "foobar"), 3);
    }

    #[test]
    fn test_is_string() {
        let mut l = Lexer::new("'one'");
        assert_eq!(l.is_string("'\""), Some(("'one'".into(), Type::String)));
        assert_eq!(l.cursor, 5);
    }

    #[test]
    fn test_is_string_negative() {
        let mut l = Lexer::new("one");
        assert_eq!(l.is_string("'\""), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_string_empty() {
        let mut l = Lexer::new("''");
        assert_eq!(l.is_string("'\""), Some(("''".into(), Type::String)));
        assert_eq!(l.cursor, 2);
    }

    #[test]
    fn test_is_string_escape() {
        let mut l = Lexer::new("'one\ttwo'");
        assert_eq!(
            l.is_string("'\""),
            Some(("'one\ttwo'".into(), Type::String))
        );
        assert_eq!(l.cursor, 9);
    }

    #[test]
    fn test_is_date_year_eos() {
        let mut l = Lexer::new("2015");
        assert_eq!(l.is_date(), Some(("2015".into(), Type::Date)));
        assert_eq!(l.cursor, 4);
    }

    #[test]
    fn test_is_date_epoch() {
        let mut l = Lexer::new("315532800");
        assert_eq!(l.is_date(), Some(("315532800".into(), Type::Date)));
        assert_eq!(l.cursor, 9);
    }

    #[test]
    fn test_is_date_year_ws() {
        let mut l = Lexer::new("2015  ");
        assert_eq!(l.is_date(), Some(("2015".into(), Type::Date)));
        assert_eq!(l.cursor, 4);
    }

    #[test]
    fn test_is_date_year_ident() {
        let mut l = Lexer::new("2015abc");
        assert_eq!(l.is_date(), Some(("2015".into(), Type::Date)));
        assert_eq!(l.cursor, 4);
    }

    #[test]
    fn test_is_date_year_plus() {
        let mut l = Lexer::new("2015+");
        assert_eq!(l.is_date(), Some(("2015".into(), Type::Date)));
        assert_eq!(l.cursor, 4);
    }

    #[test]
    fn test_is_date_year_minus() {
        let mut l = Lexer::new("2015-xyz");
        assert_eq!(l.is_date(), Some(("2015-".into(), Type::Date)));
        assert_eq!(l.cursor, 5);
    }

    #[test]
    fn test_is_duration_1w() {
        let mut l = Lexer::new("1w");
        assert_eq!(l.is_duration(), Some(("1w".into(), Type::Duration)));
        assert_eq!(l.cursor, 2);
    }

    #[test]
    fn test_is_duration_op() {
        let mut l = Lexer::new("!!");
        assert_eq!(l.is_duration(), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_number_digit() {
        let mut l = Lexer::new("3");
        assert_eq!(l.is_number(), Some(("3".into(), Type::Number)));
        assert_eq!(l.cursor, 1);
    }

    #[test]
    fn test_is_number_integer() {
        let mut l = Lexer::new("13");
        assert_eq!(l.is_number(), Some(("13".into(), Type::Number)));
        assert_eq!(l.cursor, 2);
    }

    #[test]
    fn test_is_number_trailing_minus() {
        let mut l = Lexer::new("13-");
        assert_eq!(l.is_number(), Some(("13".into(), Type::Number)));
        assert_eq!(l.cursor, 2);
    }

    #[test]
    fn test_is_number_decimal() {
        let mut l = Lexer::new("1.3");
        assert_eq!(l.is_number(), Some(("1.3".into(), Type::Number)));
        assert_eq!(l.cursor, 3);
    }

    #[test]
    fn test_is_number_multiple_decimal() {
        let mut l = Lexer::new("1.3.4");
        assert_eq!(l.is_number(), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_number_decimal_no_digits() {
        let mut l = Lexer::new("1.");
        assert_eq!(l.is_number(), Some(("1.".into(), Type::Number)));
        assert_eq!(l.cursor, 2);
    }

    #[test]
    fn test_is_number_decimal_multi_digit() {
        let mut l = Lexer::new("12.32");
        assert_eq!(l.is_number(), Some(("12.32".into(), Type::Number)));
        assert_eq!(l.cursor, 5);
    }

    #[test]
    fn test_is_number_decimal_e_no_exponent() {
        let mut l = Lexer::new("12.32e");
        assert_eq!(l.is_number(), Some(("12.32e".into(), Type::Number)));
        assert_eq!(l.cursor, 6);
    }

    #[test]
    fn test_is_number_decimal_e_plus_no_exponent() {
        let mut l = Lexer::new("12.32e+");
        assert_eq!(l.is_number(), Some(("12.32e+".into(), Type::Number)));
        assert_eq!(l.cursor, 7);
    }

    #[test]
    fn test_is_number_decimal_e_integer_exponent() {
        let mut l = Lexer::new("12.32e-12");
        assert_eq!(l.is_number(), Some(("12.32e-12".into(), Type::Number)));
        assert_eq!(l.cursor, 9);
    }

    #[test]
    fn test_is_number_decimal_e_decimal_exponent() {
        let mut l = Lexer::new("12.32e12.34");
        assert_eq!(l.is_number(), Some(("12.32e12.34".into(), Type::Number)));
        assert_eq!(l.cursor, 11);
    }

    #[test]
    fn test_is_number_integer_invalid_lookahead() {
        let mut l = Lexer::new("13a");
        assert_eq!(l.is_number(), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_set_singletons() {
        let mut l = Lexer::new("12,13");
        assert_eq!(l.is_set(), Some(("12,13".into(), Type::Set)));
        assert_eq!(l.cursor, 5);
    }

    #[test]
    fn test_is_set_ranges() {
        let mut l = Lexer::new("12-13,19-200");
        assert_eq!(l.is_set(), Some(("12-13,19-200".into(), Type::Set)));
        assert_eq!(l.cursor, 12);
    }

    #[test]
    fn test_is_set_double_comma() {
        let mut l = Lexer::new("12-13,,19-200");
        assert_eq!(l.is_set(), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_set_trailing_comma() {
        let mut l = Lexer::new("12-13,");
        assert_eq!(l.is_set(), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_set_trailing_ws() {
        let mut l = Lexer::new("12-13  ");
        assert_eq!(l.is_set(), Some(("12-13".into(), Type::Set)));
        assert_eq!(l.cursor, 5);
    }

    #[test]
    fn test_is_set_trailing_non_hard_boundary() {
        let mut l = Lexer::new("12-13abc");
        assert_eq!(l.is_set(), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_separator() {
        let mut l = Lexer::new("  -- ");
        l.cursor = 2;
        assert_eq!(l.is_separator(), Some(("--".into(), Type::Separator)));
        assert_eq!(l.cursor, 4);
    }

    #[test]
    fn test_is_separator_negative() {
        let mut l = Lexer::new("- ");
        assert_eq!(l.is_separator(), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_tag_plus() {
        let mut l = Lexer::new("+foo");
        assert_eq!(l.is_tag(), Some(("+foo".into(), Type::Tag)));
        assert_eq!(l.cursor, 4);
    }

    #[test]
    fn test_is_tag_not_after_whitespace() {
        let mut l = Lexer::new("x+y");
        l.cursor = 1;
        assert_eq!(l.is_tag(), NONE);
        assert_eq!(l.cursor, 1);
    }

    #[test]
    fn test_is_tag_after_whitespace() {
        let mut l = Lexer::new(" +y");
        l.cursor = 1;
        assert_eq!(l.is_tag(), Some(("+y".into(), Type::Tag)));
        assert_eq!(l.cursor, 3);
    }

    #[test]
    fn test_is_tag_after_lparen() {
        let mut l = Lexer::new("(+y");
        l.cursor = 1;
        assert_eq!(l.is_tag(), Some(("+y".into(), Type::Tag)));
        assert_eq!(l.cursor, 3);
    }

    #[test]
    fn test_is_tag_after_rparen() {
        let mut l = Lexer::new(")+y");
        l.cursor = 1;
        assert_eq!(l.is_tag(), Some(("+y".into(), Type::Tag)));
        assert_eq!(l.cursor, 3);
    }

    #[test]
    fn test_is_tag_after_multibyte_char() {
        let mut l = Lexer::new("€+y");
        l.cursor = 3;
        assert_eq!(l.is_tag(), NONE);
        assert_eq!(l.cursor, 3);
    }

    #[test]
    fn test_is_url_http() {
        let mut l = Lexer::new("http://foo.com/bar");
        assert_eq!(l.is_url(), Some(("http://foo.com/bar".into(), Type::URL)));
        assert_eq!(l.cursor, 18);
    }

    #[test]
    fn test_is_url_https() {
        let mut l = Lexer::new("https://foo.com/bar");
        assert_eq!(l.is_url(), Some(("https://foo.com/bar".into(), Type::URL)));
        assert_eq!(l.cursor, 19);
    }

    #[test]
    fn test_is_url_ws() {
        let mut l = Lexer::new("https://foo.com/bar  ");
        assert_eq!(l.is_url(), Some(("https://foo.com/bar".into(), Type::URL)));
        assert_eq!(l.cursor, 19);
    }

    #[test]
    fn test_is_url_with_ops() {
        let mut l = Lexer::new("https://foo.com/bar()+-~");
        assert_eq!(
            l.is_url(),
            Some(("https://foo.com/bar()+-~".into(), Type::URL))
        );
        assert_eq!(l.cursor, 24);
    }

    #[test]
    fn test_is_url_negative() {
        let mut l = Lexer::new("file://foo.com/bar");
        assert_eq!(l.is_url(), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_pair_double_colon() {
        let mut l = Lexer::new("foo::bar  ");
        assert_eq!(l.is_pair(), Some(("foo::bar".into(), Type::Pair)));
        assert_eq!(l.cursor, 8);
    }

    #[test]
    fn test_is_pair_colon_eq() {
        let mut l = Lexer::new("foo:=bar  ");
        assert_eq!(l.is_pair(), Some(("foo:=bar".into(), Type::Pair)));
        assert_eq!(l.cursor, 8);
    }

    #[test]
    fn test_is_pair_colon() {
        let mut l = Lexer::new("foo:bar  ");
        assert_eq!(l.is_pair(), Some(("foo:bar".into(), Type::Pair)));
        assert_eq!(l.cursor, 7);
    }

    #[test]
    fn test_is_pair_equal() {
        let mut l = Lexer::new("foo=bar");
        assert_eq!(l.is_pair(), Some(("foo=bar".into(), Type::Pair)));
        assert_eq!(l.cursor, 7);
    }

    #[test]
    fn test_is_pair_quoted() {
        let mut l = Lexer::new("foo='abc def'");
        assert_eq!(l.is_pair(), Some(("foo='abc def'".into(), Type::Pair)));
        assert_eq!(l.cursor, 13);
    }

    #[test]
    fn test_is_pair_quoted_escapes() {
        let mut l = Lexer::new("foo='abc\\u20acdef'");
        assert_eq!(l.is_pair(), Some(("foo='abc€def'".into(), Type::Pair)));
        assert_eq!(l.cursor, 18);
    }

    #[test]
    fn test_is_uuid_long_eof() {
        let u = "ffffffff-ffff-ffff-ffff-ffffffffff";
        let mut l = Lexer::new(u);
        assert_eq!(l.is_uuid(true), Some((u.into(), Type::Uuid)));
        assert_eq!(l.cursor, 34);
    }

    #[test]
    fn test_is_uuid_long_ws() {
        let u = "ffffffff-ffff-ffff-ffff-ffffffffff  kjdf";
        let mut l = Lexer::new(u);
        assert_eq!(l.is_uuid(true), Some((u[..34].into(), Type::Uuid)));
        assert_eq!(l.cursor, 34);
    }

    #[test]
    fn test_is_uuid_long_op() {
        let u = "ffffffff-ffff-ffff-ffff-ffffffffff+";
        let mut l = Lexer::new(u);
        assert_eq!(l.is_uuid(true), Some((u[..34].into(), Type::Uuid)));
        assert_eq!(l.cursor, 34);
    }

    #[test]
    fn test_is_uuid_long_bad_boundary() {
        let u = "ffffffff-ffff-ffff-ffff-ffffffffff_";
        let mut l = Lexer::new(u);
        assert_eq!(l.is_uuid(true), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_uuid_long_bad_boundary_ignored() {
        let u = "ffffffff-ffff-ffff-ffff-ffffffffff_";
        let mut l = Lexer::new(u);
        assert_eq!(l.is_uuid(false), Some((u[..34].into(), Type::Uuid)));
        assert_eq!(l.cursor, 34);
    }

    #[test]
    fn test_is_uuid_too_short() {
        let u = "ffffff";
        let mut l = Lexer::new(u);
        assert_eq!(l.is_uuid(true), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_path_simple() {
        let mut l = Lexer::new("/path/to/a/file");
        assert_eq!(l.is_path(), Some(("/path/to/a/file".into(), Type::Path)));
        assert_eq!(l.cursor, 15);
    }

    #[test]
    fn test_is_path_too_short() {
        let mut l = Lexer::new("/a/file");
        assert_eq!(l.is_path(), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_path_trailing_slash() {
        let mut l = Lexer::new("/path/to/a/dir/");
        assert_eq!(l.is_path(), Some(("/path/to/a/dir/".into(), Type::Path)));
        assert_eq!(l.cursor, 15);
    }

    #[test]
    fn test_is_path_double_slash() {
        let mut l = Lexer::new("/a//file");
        assert_eq!(l.is_path(), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_path_no_initial_slash() {
        let mut l = Lexer::new("a/path/to/a/file");
        assert_eq!(l.is_path(), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_substitution_simple() {
        let mut l = Lexer::new("/foo/bar/");
        assert_eq!(
            l.is_substitution(),
            Some(("/foo/bar/".into(), Type::Substitution))
        );
        assert_eq!(l.cursor, 9);
    }

    #[test]
    fn test_is_substitution_simple_ws() {
        let mut l = Lexer::new("/foo/bar/  ");
        assert_eq!(
            l.is_substitution(),
            Some(("/foo/bar/".into(), Type::Substitution))
        );
        assert_eq!(l.cursor, 9);
    }

    #[test]
    fn test_is_substitution_simple_g() {
        let mut l = Lexer::new("/foo/bar/g");
        assert_eq!(
            l.is_substitution(),
            Some(("/foo/bar/g".into(), Type::Substitution))
        );
        assert_eq!(l.cursor, 10);
    }

    #[test]
    fn test_is_substitution_simple_g_ws() {
        let mut l = Lexer::new("/foo/bar/g  ");
        assert_eq!(
            l.is_substitution(),
            Some(("/foo/bar/g".into(), Type::Substitution))
        );
        assert_eq!(l.cursor, 10);
    }

    #[test]
    fn test_is_substitution_simple_not_g() {
        let mut l = Lexer::new("/foo/bar/h");
        assert_eq!(l.is_substitution(), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_substitution_simple_not_g_op() {
        let mut l = Lexer::new("/foo/bar/+");
        assert_eq!(l.is_substitution(), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_substitution_simple_g_but_not_ws() {
        let mut l = Lexer::new("/foo/bar/ghi");
        assert_eq!(l.is_substitution(), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_pattern_simple() {
        let mut l = Lexer::new("/foo/");
        assert_eq!(l.is_pattern(), Some(("/foo/".into(), Type::Pattern)));
        assert_eq!(l.cursor, 5);
    }

    #[test]
    fn test_is_pattern_escaped() {
        let mut l = Lexer::new("/f\\u20A4o/");
        assert_eq!(l.is_pattern(), Some(("/f\\u20A4o/".into(), Type::Pattern)));
        assert_eq!(l.cursor, 10);
    }

    #[test]
    fn test_is_pattern_simple_trailing_ws() {
        let mut l = Lexer::new("/foo/\n\t");
        assert_eq!(l.is_pattern(), Some(("/foo/".into(), Type::Pattern)));
        assert_eq!(l.cursor, 5);
    }

    #[test]
    fn test_is_operator_hastag() {
        let mut l = Lexer::new("_hastag_");
        assert_eq!(l.is_operator(), Some(("_hastag_".into(), Type::Op)));
        // the cursor must have moved past the whole operator
        assert_eq!(l.cursor, 8);
    }

    #[test]
    fn test_is_operator_notag() {
        let mut l = Lexer::new("_notag_");
        assert_eq!(l.is_operator(), Some(("_notag_".into(), Type::Op)));
        // the cursor must have moved past the whole operator
        assert_eq!(l.cursor, 7);
    }

    #[test]
    fn test_is_operator_neg() {
        let mut l = Lexer::new("_neg_");
        assert_eq!(l.is_operator(), Some(("_neg_".into(), Type::Op)));
        // the cursor must have moved past the whole operator
        assert_eq!(l.cursor, 5);
    }

    #[test]
    fn test_is_operator_xor() {
        // `xor` needs a boundary after it; the end of the text counts as one
        let mut l = Lexer::new("xor");
        assert_eq!(l.is_operator(), Some(("xor".into(), Type::Op)));
        assert_eq!(l.cursor, 3);
    }

    #[test]
    fn test_is_identifier_empty() {
        let mut l = Lexer::new("");
        assert_eq!(l.is_identifier(), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_identifier_multibyte_nonpunct_first_char() {
        let mut l = Lexer::new("☺");
        assert_eq!(l.is_identifier(), Some(("☺".into(), Type::Identifier)));
        assert_eq!(l.cursor, 3);
    }

    #[test]
    fn test_is_identifier_bad_first_char() {
        let mut l = Lexer::new("1abc");
        assert_eq!(l.is_identifier(), NONE);
        assert_eq!(l.cursor, 0);
    }

    #[test]
    fn test_is_identifier_bad_next_char() {
        let mut l = Lexer::new("a:bc");
        assert_eq!(l.is_identifier(), Some(("a".into(), Type::Identifier)));
        assert_eq!(l.cursor, 1);
    }

    #[test]
    fn test_is_identifier_ok() {
        let mut l = Lexer::new("abc");
        assert_eq!(l.is_identifier(), Some(("abc".into(), Type::Identifier)));
        assert_eq!(l.cursor, 3);
    }

    /// A lone "+" is not a word, and the cursor does not move.
    #[test]
    fn test_is_word_no() {
        let mut lexer = Lexer::new("+");
        let token = lexer.is_word();
        assert!(token.is_none());
        assert_eq!(lexer.cursor, 0);
    }

    /// Starting just past the dot in "foo.PENDING", the remainder is lexed as a word.
    #[test]
    fn test_is_word_pending() {
        let mut lexer = Lexer::new("foo.PENDING");
        // skip "foo."
        lexer.cursor = 4;
        let token = lexer.is_word();
        assert_eq!(token, Some(("PENDING".into(), Type::Word)));
        assert_eq!(lexer.cursor, 11);
    }

    /// A word running to the end of the input is consumed in full.
    #[test]
    fn test_is_word_to_eof() {
        let mut lexer = Lexer::new("abc");
        let token = lexer.is_word();
        assert_eq!(token, Some(("abc".into(), Type::Word)));
        assert_eq!(lexer.cursor, 3);
    }

    /// Word lexing begins at the current cursor position, not at the start of the text.
    #[test]
    fn test_is_word_nonzero_start() {
        let mut lexer = Lexer::new("--abc");
        // skip the leading "--"
        lexer.cursor = 2;
        let token = lexer.is_word();
        assert_eq!(token, Some(("abc".into(), Type::Word)));
        assert_eq!(lexer.cursor, 5);
    }

    /// A word stops at the first whitespace character.
    #[test]
    fn test_is_word_to_ws() {
        let mut lexer = Lexer::new("abc def");
        let token = lexer.is_word();
        assert_eq!(token, Some(("abc".into(), Type::Word)));
        assert_eq!(lexer.cursor, 3);
    }

    /// A word stops at the "*" character.
    #[test]
    fn test_is_word_to_op() {
        let mut lexer = Lexer::new("abc*def");
        let token = lexer.is_word();
        assert_eq!(token, Some(("abc".into(), Type::Word)));
        assert_eq!(lexer.cursor, 3);
    }

    /// Splitting a whitespace-separated expression yields each token, without the surrounding
    /// whitespace.
    #[test]
    fn test_split_simple() {
        let expected: Vec<String> = ["(", "A", "or", "B", ")"]
            .iter()
            .map(|token| String::from(*token))
            .collect();
        assert_eq!(Lexer::split(" ( A or B ) "), expected);
    }

    /// Splitting separates adjacent operators and operands, but keeps a number with an exponent
    /// and a quoted string containing a space as single tokens.
    #[test]
    fn test_split_confusing() {
        let expected: Vec<String> = ["+", "-", "*", "a", "+", "b", "12.3e4", "'c d'"]
            .iter()
            .map(|token| String::from(*token))
            .collect();
        assert_eq!(Lexer::split("  +-* a+b 12.3e4 'c d'"), expected);
    }

    /// Checks `Lexer::decompose_pair` over every combination of optional modifier, separator
    /// and value, including values that themselves contain separator characters.
    #[test]
    fn test_decompose_pair_combos() {
        let name = "name";
        for modifier in ["", "mod"].iter() {
            for separator in [":", "=", "::", ":="].iter() {
                for value in ["", "value", "a:b", "a::b", "a=b", "a:=b"].iter() {
                    // the "." is only present when there is a modifier for it to introduce
                    let dot = if modifier.is_empty() { "" } else { "." };
                    let input = format!("{}{}{}{}{}", name, dot, modifier, separator, value);
                    // include the input in the message so a failure identifies the combination
                    assert_eq!(
                        Lexer::decompose_pair(&input),
                        Some(DecomposedPair {
                            name: name.into(),
                            modifier: String::from(*modifier),
                            separator: String::from(*separator),
                            value: String::from(*value),
                        }),
                        "input: {:?}",
                        input
                    );
                }
            }
        }
    }

    /// With the final flag set, "Grumpy" followed by "." does not match and the cursor stays
    /// put; with it cleared, the match succeeds and the cursor moves past "Grumpy".
    #[test]
    fn test_is_one_of() {
        let mut lexer = Lexer::new("Grumpy.");
        let dwarves = vec![
            "Sneezy", "Doc", "Bashful", "Grumpy", "Happy", "Sleepy", "Dopey",
        ];

        let matched = lexer.is_one_of(&dwarves, false, true);
        assert!(!matched);
        assert_eq!(lexer.cursor, 0);

        let matched = lexer.is_one_of(&dwarves, false, false);
        assert!(matched);
        assert_eq!(lexer.cursor, 6);
    }

    /// Negative case: non-numeric text is not an integer, and the cursor does not move.
    #[test]
    fn test_is_integer_negative() {
        let mut lexer = Lexer::new("one");
        let token = lexer.is_integer();
        assert_eq!(token, NONE);
        assert_eq!(lexer.cursor, 0);
    }

    /// Positive case: a run of digits is consumed in full as a number.
    #[test]
    fn test_is_integer_positive() {
        let mut lexer = Lexer::new("123");
        let token = lexer.is_integer();
        assert_eq!(token, Some(("123".into(), Type::Number)));
        assert_eq!(lexer.cursor, 3);
    }

    /// The integer ends before a following ".", which is left unconsumed.
    #[test]
    fn test_is_integer_trailing_dot() {
        let mut lexer = Lexer::new("123.foo");
        let token = lexer.is_integer();
        assert_eq!(token, Some(("123".into(), Type::Number)));
        assert_eq!(lexer.cursor, 3);
    }

    /// Integer lexing begins at the current cursor position and stops at the next ".".
    #[test]
    fn test_is_integer_not_at_start() {
        let mut lexer = Lexer::new("abc.123.foo");
        // skip "abc."
        lexer.cursor = 4;
        let token = lexer.is_integer();
        assert_eq!(token, Some(("123".into(), Type::Number)));
        assert_eq!(lexer.cursor, 7);
    }

    /// A literal that is not present at the cursor does not match and does not move the cursor.
    #[test]
    fn test_is_literal_no_match() {
        let mut lexer = Lexer::new("one.two");
        let matched = lexer.is_literal("zero", false, false);
        assert!(!matched);
        assert_eq!(lexer.cursor, 0);
    }

    /// Successive literal matches each advance the cursor by the length of the matched literal.
    #[test]
    fn test_is_literal_multi() {
        let mut lexer = Lexer::new("one.two");

        let matched = lexer.is_literal("one", false, false);
        assert!(matched);
        assert_eq!(lexer.cursor, 3);

        let matched = lexer.is_literal(".", false, false);
        assert!(matched);
        assert_eq!(lexer.cursor, 4);

        let matched = lexer.is_literal("two", false, true);
        assert!(matched);
        assert_eq!(lexer.cursor, 7);
    }

    /// "wonder" matches the literal "wonderful" only when the abbreviation flag is set.
    #[test]
    fn test_is_literal_abbrev() {
        let mut lexer = Lexer::new("wonder");

        let matched = lexer.is_literal("wonderful", false, false);
        assert!(!matched);
        assert_eq!(lexer.cursor, 0);

        let matched = lexer.is_literal("wonderful", true, false);
        assert!(matched);
        assert_eq!(lexer.cursor, 6);
    }

    /// Integration-style tests that run the full lexer over an input string and compare the
    /// resulting sequence of `(token, Type)` pairs against an expected list.
    mod integ {
        use super::super::*;

        /// Lexes `input` twice -- once by itself ("isolated") and once surrounded by a single
        /// space on each side ("embedded") -- and asserts that both runs produce `expected`.
        ///
        /// The attributes `due`, `tags` and `description` are registered on the lexer before
        /// each run.
        fn lexer_test(input: &str, expected: Vec<(&str, Type)>) {
            let embedded = format!(" {} ", input);
            let cases = [("isolated", input), ("embedded", embedded.as_str())];
            for (label, text) in cases.iter() {
                let mut lexer = Lexer::new(*text);
                lexer.add_attribute("due");
                lexer.add_attribute("tags");
                lexer.add_attribute("description");
                let got: Vec<_> = lexer.into_iter().collect();
                let got_strs: Vec<_> = got.iter().map(|(s, t)| (s.as_ref(), *t)).collect();
                // name the failing case so isolated/embedded failures can be told apart
                assert_eq!(got_strs, expected, "{} case, input {:?}", label, text);
            }
        }

        // Pattern
        //
        #[test]
        fn test_pattern_foo() {
            lexer_test("/foo/", vec![("/foo/", Type::Pattern)]);
        }

        #[test]
        fn test_pattern_escaped_slash() {
            lexer_test("/a\\/b/", vec![("/a\\/b/", Type::Pattern)]);
        }

        #[test]
        fn test_pattern_quote() {
            lexer_test("/'/", vec![("/'/", Type::Pattern)]);
        }

        // Substitution
        //
        #[test]
        fn test_subst_g() {
            lexer_test("/from/to/g", vec![("/from/to/g", Type::Substitution)]);
        }

        #[test]
        fn test_subst() {
            lexer_test("/from/to/", vec![("/from/to/", Type::Substitution)]);
        }

        // Tag
        //
        #[test]
        fn test_tag_simple() {
            lexer_test("+tag", vec![("+tag", Type::Tag)]);
        }

        #[test]
        fn test_tag_negative() {
            lexer_test("-tag", vec![("-tag", Type::Tag)]);
        }

        #[test]
        fn test_tag_at() {
            lexer_test("+@tag", vec![("+@tag", Type::Tag)]);
        }

        // Path
        //
        #[test]
        fn test_path() {
            lexer_test(
                "/long/path/to/file.txt",
                vec![("/long/path/to/file.txt", Type::Path)],
            );
        }

        #[test]
        fn test_path_dir() {
            lexer_test(
                "/long/path/to/dir/",
                vec![("/long/path/to/dir/", Type::Path)],
            );
        }

        // Word
        //
        #[test]
        fn test_1_foo_bar() {
            lexer_test("1.foo.bar", vec![("1.foo.bar", Type::Word)]);
        }

        // Identifier
        //
        #[test]
        fn test_foo() {
            lexer_test("foo", vec![("foo", Type::Identifier)]);
        }

        #[test]
        fn test_multibyte_ident() {
            lexer_test("Çirçös", vec![("Çirçös", Type::Identifier)]);
        }

        #[test]
        fn test_multibyte_nonpunctuation_single_char() {
            lexer_test("☺", vec![("☺", Type::Identifier)]);
        }

        #[test]
        fn test_name() {
            lexer_test("name", vec![("name", Type::Identifier)]);
        }

        #[test]
        fn test_f1() {
            lexer_test("f1", vec![("f1", Type::Identifier)]);
        }

        #[test]
        fn test_foo_dot_bar() {
            lexer_test("foo.bar", vec![("foo.bar", Type::Identifier)]);
        }

        #[test]
        fn test_long_with_underscore() {
            lexer_test(
                "a1a1a1a1_a1a1_a1a1_a1a1_a1a1a1a1a1a1",
                vec![("a1a1a1a1_a1a1_a1a1_a1a1_a1a1a1a1a1a1", Type::Identifier)],
            );
        }

        // Word that starts with 'or', which is an operator, but should be ignored.
        //
        #[test]
        fn test_starts_with_or() {
            lexer_test("ordinary", vec![("ordinary", Type::Identifier)]);
        }

        // DOM
        //
        #[test]
        fn test_due() {
            lexer_test("due", vec![("due", Type::DOM)]);
        }

        #[test]
        fn test_123_tags() {
            lexer_test("123.tags", vec![("123.tags", Type::DOM)]);
        }

        #[test]
        fn test_123_tags_pending() {
            lexer_test("123.tags.PENDING", vec![("123.tags.PENDING", Type::DOM)]);
        }

        #[test]
        fn test_123_description() {
            lexer_test("123.description", vec![("123.description", Type::DOM)]);
        }

        #[test]
        fn test_123_annotations_count() {
            lexer_test(
                "123.annotations.count",
                vec![("123.annotations.count", Type::DOM)],
            );
        }

        #[test]
        fn test_123_annotations_1_description() {
            lexer_test(
                "123.annotations.1.description",
                vec![("123.annotations.1.description", Type::DOM)],
            );
        }

        #[test]
        fn test_123_annotations_1_entry() {
            lexer_test(
                "123.annotations.1.entry",
                vec![("123.annotations.1.entry", Type::DOM)],
            );
        }

        #[test]
        fn test_123_annotations_1_entry_year() {
            lexer_test(
                "123.annotations.1.entry.year",
                vec![("123.annotations.1.entry.year", Type::DOM)],
            );
        }

        #[test]
        fn test_uuid_due() {
            lexer_test(
                "a360fc44-315c-4366-b70c-ea7e7520b749.due",
                vec![("a360fc44-315c-4366-b70c-ea7e7520b749.due", Type::DOM)],
            );
        }

        #[test]
        fn test_numeric_uuid_due() {
            lexer_test(
                "12345678-1234-1234-1234-123456789012.due",
                vec![("12345678-1234-1234-1234-123456789012.due", Type::DOM)],
            );
        }

        #[test]
        fn test_system_os() {
            lexer_test("system.os", vec![("system.os", Type::DOM)]);
        }

        #[test]
        fn test_rc_foo() {
            lexer_test("rc.foo", vec![("rc.foo", Type::DOM)]);
        }

        // URL
        //
        #[test]
        fn test_lexer_31() {
            lexer_test(
                "http://example.com",
                vec![("http://example.com", Type::URL)],
            );
        }

        #[test]
        fn test_lexer_32() {
            lexer_test(
                "https://foo.example.com",
                vec![("https://foo.example.com", Type::URL)],
            );
        }

        // String
        //
        #[test]
        fn test_quoted_string() {
            lexer_test("'one two'", vec![("'one two'", Type::String)]);
        }

        #[test]
        fn test_double_quoted_string() {
            lexer_test("\"three\"", vec![("\"three\"", Type::String)]);
        }

        #[test]
        fn test_string_quoted_with_escapes() {
            lexer_test("'\\''", vec![("'''", Type::String)]);
        }

        #[test]
        fn test_string_quoted_quotes() {
            lexer_test("\"\\\"\"", vec![("\"\"\"", Type::String)]);
        }

        #[test]
        fn test_quoted_tabs() {
            lexer_test("\"\tfoo\t\"", vec![("\"\tfoo\t\"", Type::String)]);
        }

        #[test]
        fn test_multibyte_slash_u() {
            lexer_test("\"\\u20A43\"", vec![("\"₤3\"", Type::String)]);
        }

        #[test]
        fn test_multibyte_u_plus() {
            lexer_test("\"U+20AC4\"", vec![("\"€4\"", Type::String)]);
        }

        // Number
        //
        #[test]
        fn test_one() {
            lexer_test("1", vec![("1", Type::Number)]);
        }

        #[test]
        fn test_pi() {
            lexer_test("3.14", vec![("3.14", Type::Number)]);
        }

        #[test]
        fn test_avogadro() {
            lexer_test("6.02217e23", vec![("6.02217e23", Type::Number)]);
        }

        #[test]
        fn test_expo() {
            lexer_test("1.2e-3.4", vec![("1.2e-3.4", Type::Number)]);
        }

        #[test]
        fn test_hex() {
            lexer_test("0x2f", vec![("0x2f", Type::Hex)]);
        }

        // Set (1,2,4-7,9)
        //
        #[test]
        fn test_set_pair() {
            lexer_test("1,2", vec![("1,2", Type::Set)]);
        }

        #[test]
        fn test_set_range() {
            lexer_test("1-2", vec![("1-2", Type::Set)]);
        }

        #[test]
        fn test_set_range_pair() {
            lexer_test("1-2,4", vec![("1-2,4", Type::Set)]);
        }

        #[test]
        fn test_set_range_pair_ws() {
            lexer_test("1-2,4 ", vec![("1-2,4", Type::Set)]);
        }

        #[test]
        fn test_set_range_pair_paren() {
            lexer_test("1-2,4(", vec![("1-2,4", Type::Set), ("(", Type::Op)]);
        }

        #[test]
        fn test_ranges_and_singletons() {
            lexer_test("1-2,4,6-8", vec![("1-2,4,6-8", Type::Set)]);
        }

        #[test]
        fn test_set_more_ranges_and_singletons() {
            lexer_test("1-2,4,6-8,10-12", vec![("1-2,4,6-8,10-12", Type::Set)]);
        }

        // Pair
        //
        #[test]
        fn test_name_colon_value() {
            lexer_test("name:value", vec![("name:value", Type::Pair)]);
        }

        #[test]
        fn test_name_eq_value() {
            lexer_test("name=value", vec![("name=value", Type::Pair)]);
        }

        #[test]
        fn test_name_colon_eq_value() {
            lexer_test("name:=value", vec![("name:=value", Type::Pair)]);
        }

        #[test]
        fn test_name_dot_mod_colon_value() {
            lexer_test("name.mod:value", vec![("name.mod:value", Type::Pair)]);
        }

        #[test]
        fn test_name_dot_mod_eq_value() {
            lexer_test("name.mod=value", vec![("name.mod=value", Type::Pair)]);
        }

        #[test]
        fn test_name_colon() {
            lexer_test("name:", vec![("name:", Type::Pair)]);
        }

        #[test]
        fn test_name_eq() {
            lexer_test("name=", vec![("name=", Type::Pair)]);
        }

        #[test]
        fn test_name_dot_mod_colon() {
            lexer_test("name.mod:", vec![("name.mod:", Type::Pair)]);
        }

        #[test]
        fn test_name_dot_mod_equal() {
            lexer_test("name.mod=", vec![("name.mod=", Type::Pair)]);
        }

        #[test]
        fn test_pro_quoted() {
            lexer_test("pro:'P 1'", vec![("pro:'P 1'", Type::Pair)]);
        }

        #[test]
        fn test_rc_colon_x() {
            lexer_test("rc:x", vec![("rc:x", Type::Pair)]);
        }

        #[test]
        fn test_rc_dot_name_colon_value() {
            lexer_test("rc.name:value", vec![("rc.name:value", Type::Pair)]);
        }

        #[test]
        fn test_rc_dot_name_eq_value() {
            lexer_test("rc.name=value", vec![("rc.name=value", Type::Pair)]);
        }

        #[test]
        fn test_rc_dot_name_colon_eq_value() {
            lexer_test("rc.name:=value", vec![("rc.name:=value", Type::Pair)]);
        }

        #[test]
        fn test_due_colon_eq_quoted() {
            lexer_test("due:='eow - 2d'", vec![("due:='eow - 2d'", Type::Pair)]);
        }

        #[test]
        fn test_name_colon_quoted_with_newline() {
            lexer_test("name:'foo\nbar'", vec![("name:'foo\nbar'", Type::Pair)]);
        }

        // Operator - complete set
        //
        #[test]
        fn test_caret() {
            lexer_test("^", vec![("^", Type::Op)]);
        }

        #[test]
        fn test_bang() {
            lexer_test("!", vec![("!", Type::Op)]);
        }

        #[test]
        fn test_neg() {
            lexer_test("_neg_", vec![("_neg_", Type::Op)]);
        }

        #[test]
        fn test_pos() {
            lexer_test("_pos_", vec![("_pos_", Type::Op)]);
        }

        #[test]
        fn test_hastag() {
            lexer_test("_hastag_", vec![("_hastag_", Type::Op)]);
        }

        #[test]
        fn test_notag() {
            lexer_test("_notag_", vec![("_notag_", Type::Op)]);
        }

        #[test]
        fn test_star() {
            lexer_test("*", vec![("*", Type::Op)]);
        }

        #[test]
        fn test_slash() {
            lexer_test("/", vec![("/", Type::Op)]);
        }

        #[test]
        fn test_percent() {
            lexer_test("%", vec![("%", Type::Op)]);
        }

        #[test]
        fn test_plus() {
            lexer_test("+", vec![("+", Type::Op)]);
        }

        #[test]
        fn test_minus() {
            lexer_test("-", vec![("-", Type::Op)]);
        }

        #[test]
        fn test_leq() {
            lexer_test("<=", vec![("<=", Type::Op)]);
        }

        #[test]
        fn test_geq() {
            lexer_test(">=", vec![(">=", Type::Op)]);
        }

        #[test]
        fn test_gt() {
            lexer_test(">", vec![(">", Type::Op)]);
        }

        #[test]
        fn test_lt() {
            lexer_test("<", vec![("<", Type::Op)]);
        }

        #[test]
        fn test_eq() {
            lexer_test("=", vec![("=", Type::Op)]);
        }

        #[test]
        fn test_double_eq() {
            lexer_test("==", vec![("==", Type::Op)]);
        }

        #[test]
        fn test_not_eq() {
            lexer_test("!=", vec![("!=", Type::Op)]);
        }

        #[test]
        fn test_not_double_eq() {
            lexer_test("!==", vec![("!==", Type::Op)]);
        }

        #[test]
        fn test_tilde() {
            lexer_test("~", vec![("~", Type::Op)]);
        }

        #[test]
        fn test_not_tilde() {
            lexer_test("!~", vec![("!~", Type::Op)]);
        }

        #[test]
        fn test_and() {
            lexer_test("and", vec![("and", Type::Op)]);
        }

        #[test]
        fn test_or() {
            lexer_test("or", vec![("or", Type::Op)]);
        }

        #[test]
        fn test_xor() {
            lexer_test("xor", vec![("xor", Type::Op)]);
        }

        #[test]
        fn test_lparen() {
            lexer_test("(", vec![("(", Type::Op)]);
        }

        #[test]
        fn test_rparen() {
            lexer_test(")", vec![(")", Type::Op)]);
        }

        // UUID
        //
        #[test]
        fn test_uuid_ffs() {
            lexer_test(
                "ffffffff-ffff-ffff-ffff-ffffffffffff",
                vec![("ffffffff-ffff-ffff-ffff-ffffffffffff", Type::Uuid)],
            );
        }

        #[test]
        fn test_uuid_00s() {
            lexer_test(
                "00000000-0000-0000-0000-0000000",
                vec![("00000000-0000-0000-0000-0000000", Type::Uuid)],
            );
        }

        #[test]
        fn test_uuid_shorter() {
            lexer_test(
                "00000000-0000-0000-0000",
                vec![("00000000-0000-0000-0000", Type::Uuid)],
            );
        }

        #[test]
        fn test_uuid_shorter_still() {
            lexer_test(
                "00000000-0000-0000",
                vec![("00000000-0000-0000", Type::Uuid)],
            );
        }

        #[test]
        fn test_uuid_even_shorter() {
            lexer_test("00000000-0000", vec![("00000000-0000", Type::Uuid)]);
        }

        #[test]
        fn test_uuid_only_first_bit() {
            lexer_test("00000000", vec![("00000000", Type::Uuid)]);
        }

        #[test]
        fn test_real_uuid() {
            lexer_test(
                "a360fc44-315c-4366-b70c-ea7e7520b749",
                vec![("a360fc44-315c-4366-b70c-ea7e7520b749", Type::Uuid)],
            );
        }

        #[test]
        fn test_real_uuid_shorter() {
            lexer_test(
                "a360fc44-315c-4366-b70c-ea7e752",
                vec![("a360fc44-315c-4366-b70c-ea7e752", Type::Uuid)],
            );
        }

        #[test]
        fn test_real_uuid_shorter_still() {
            lexer_test(
                "a360fc44-315c-4366-b70c",
                vec![("a360fc44-315c-4366-b70c", Type::Uuid)],
            );
        }

        #[test]
        fn test_real_uuid_even_shorter() {
            lexer_test(
                "a360fc44-315c-4366",
                vec![("a360fc44-315c-4366", Type::Uuid)],
            );
        }

        #[test]
        fn test_real_uuid_naming_is_hard() {
            lexer_test("a360fc44-315c", vec![("a360fc44-315c", Type::Uuid)]);
        }

        #[test]
        fn test_real_uuid_only_first_bit() {
            lexer_test("a360fc44", vec![("a360fc44", Type::Uuid)]);
        }

        // Date
        //
        #[test]
        fn test_year_week() {
            lexer_test("2015-W01", vec![("2015-W01", Type::Date)]);
        }

        #[test]
        fn test_year_month_day() {
            lexer_test("2015-02-17", vec![("2015-02-17", Type::Date)]);
        }

        #[test]
        fn test_timestamp() {
            lexer_test(
                "2013-11-29T22:58:00Z",
                vec![("2013-11-29T22:58:00Z", Type::Date)],
            );
        }

        #[test]
        fn test_abbrev_timestamp() {
            lexer_test("20131129T225800Z", vec![("20131129T225800Z", Type::Date)]);
        }

        #[test]
        fn test_9thn() {
            lexer_test("9th", vec![("9th", Type::Date)]);
        }

        #[test]
        fn test_10th() {
            lexer_test("10th", vec![("10th", Type::Date)]);
        }

        #[test]
        fn test_today() {
            lexer_test("today", vec![("today", Type::Date)]);
        }

        // Duration
        //
        #[test]
        fn test_year() {
            lexer_test("year", vec![("year", Type::Duration)]);
        }

        #[test]
        fn test_4weeks() {
            lexer_test("4weeks", vec![("4weeks", Type::Duration)]);
        }

        #[test]
        fn test_pt23h() {
            lexer_test("PT23H", vec![("PT23H", Type::Duration)]);
        }

        #[test]
        fn test_1second() {
            lexer_test("1second", vec![("1second", Type::Duration)]);
        }

        #[test]
        fn test_1s() {
            lexer_test("1s", vec![("1s", Type::Duration)]);
        }

        #[test]
        fn test_1minute() {
            lexer_test("1minute", vec![("1minute", Type::Duration)]);
        }

        #[test]
        fn test_2hour() {
            lexer_test("2hour", vec![("2hour", Type::Duration)]);
        }

        #[test]
        fn test_3_days() {
            lexer_test("3 days", vec![("3 days", Type::Duration)]);
        }

        #[test]
        fn test_4w() {
            lexer_test("4w", vec![("4w", Type::Duration)]);
        }

        #[test]
        fn test_5mo() {
            lexer_test("5mo", vec![("5mo", Type::Duration)]);
        }

        #[test]
        fn test_6_years() {
            lexer_test("6 years", vec![("6 years", Type::Duration)]);
        }

        #[test]
        fn test_p1y() {
            lexer_test("P1Y", vec![("P1Y", Type::Duration)]);
        }

        #[test]
        fn test_pt1h() {
            lexer_test("PT1H", vec![("PT1H", Type::Duration)]);
        }

        #[test]
        fn test_p_full() {
            lexer_test("P1Y1M1DT1H1M1S", vec![("P1Y1M1DT1H1M1S", Type::Duration)]);
        }

        // Misc
        //
        #[test]
        fn test_separator() {
            lexer_test("--", vec![("--", Type::Separator)]);
        }

        #[test]
        fn test_separator_ws() {
            lexer_test("  --  ", vec![("--", Type::Separator)]);
        }

        #[test]
        fn test_separator_boundaries() {
            lexer_test(
                "123--123  ",
                vec![
                    ("123", Type::Number),
                    ("--", Type::Separator),
                    ("123", Type::Number),
                ],
            );
        }

        // Expression
        // due:eom-2w
        // due < eom + 1w + 1d
        // ( /pattern/ or 8ad2e3db-914d-4832-b0e6-72fa04f6e331,3b6218f9-726a-44fc-aa63-889ff52be442 )
        //
        #[test]
        fn test_expression() {
            lexer_test(
                "(1+2)",
                vec![
                    ("(", Type::Op),
                    ("1", Type::Number),
                    ("+", Type::Op),
                    ("2", Type::Number),
                    (")", Type::Op),
                ],
            );
        }

        #[test]
        fn test_expression_dom_tilde() {
            lexer_test(
                "description~pattern",
                vec![
                    ("description", Type::DOM),
                    ("~", Type::Op),
                    ("pattern", Type::Identifier),
                ],
            );
        }

        #[test]
        fn test_expression_paren_tag() {
            lexer_test(
                "(+tag)",
                vec![("(", Type::Op), ("+tag", Type::Tag), (")", Type::Op)],
            );
        }

        #[test]
        fn test_expression_paren_name_value() {
            lexer_test(
                "(name:value)",
                vec![("(", Type::Op), ("name:value", Type::Pair), (")", Type::Op)],
            );
        }
    }
}