places/
match_impl.rs

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

use crate::util;
use bitflags::bitflags;
use caseless::Caseless;
use rusqlite::{
    self,
    types::{FromSql, FromSqlError, FromSqlResult, ToSql, ToSqlOutput, ValueRef},
};
use std::borrow::Cow;

const MAX_CHARS_TO_SEARCH_THROUGH: usize = 255;

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
#[repr(u32)]
pub enum MatchBehavior {
    // Match anywhere in each searchable tearm
    Anywhere = 0,
    /// Match first on word boundaries, and if we do not get enough results, then
    /// match anywhere in each searchable term.
    BoundaryAnywhere = 1,
    /// Match on word boundaries in each searchable term.
    Boundary = 2,
    /// Match only the beginning of each search term.
    Beginning = 3,
    /// Match anywhere in each searchable term without doing any transformation
    /// or stripping on the underlying data.
    AnywhereUnmodified = 4,
    /// Match only the beginning of each search term using a case sensitive
    /// comparator
    BeginningCaseSensitive = 5,
}

impl FromSql for MatchBehavior {
    #[inline]
    fn column_result(value: ValueRef<'_>) -> FromSqlResult<Self> {
        Ok(match value.as_i64()? {
            0 => MatchBehavior::Anywhere,
            1 => MatchBehavior::BoundaryAnywhere,
            2 => MatchBehavior::Boundary,
            3 => MatchBehavior::Beginning,
            4 => MatchBehavior::AnywhereUnmodified,
            5 => MatchBehavior::BeginningCaseSensitive,
            _ => return Err(FromSqlError::InvalidType),
        })
    }
}

impl ToSql for MatchBehavior {
    #[inline]
    fn to_sql(&self) -> rusqlite::Result<ToSqlOutput<'_>> {
        Ok(ToSqlOutput::from(*self as u32))
    }
}

bitflags! {
    pub struct SearchBehavior: u32 {
        /// Search through history.
        const HISTORY = 1;

        /// Search through bookmarks.
        const BOOKMARK = 1 << 1;

        /// Search through tags.
        const TAG = 1 << 2;

        /// Search through the title of pages.
        const TITLE = 1 << 3;

        /// Search the URL of pages.
        const URL = 1 << 4;

        /// Search for typed pages
        const TYPED = 1 << 5;

        /// Search for javascript: urls
        const JAVASCRIPT = 1 << 6;

        /// Search for open pages (currently not meaningfully implemented)
        const OPENPAGE = 1 << 7;

        /// Use intersection between history, typed, bookmark, tag and openpage
        /// instead of union, when the restrict bit is set.
        const RESTRICT = 1 << 8;

        /// Include search suggestions from the currently selected search provider
        /// (currently not implemented)
        const SEARCHES = 1 << 9;
    }
}

impl Default for SearchBehavior {
    // See `defaultBehavior` in Desktop's `UrlbarPrefs.jsm`.
    fn default() -> SearchBehavior {
        SearchBehavior::HISTORY
            | SearchBehavior::BOOKMARK
            | SearchBehavior::OPENPAGE
            | SearchBehavior::SEARCHES
    }
}

impl SearchBehavior {
    #[inline]
    pub fn any() -> Self {
        SearchBehavior::all() & !SearchBehavior::RESTRICT
    }
}

impl FromSql for SearchBehavior {
    #[inline]
    fn column_result(value: ValueRef<'_>) -> FromSqlResult<Self> {
        SearchBehavior::from_bits(u32::column_result(value)?).ok_or(FromSqlError::InvalidType)
    }
}

impl ToSql for SearchBehavior {
    #[inline]
    fn to_sql(&self) -> rusqlite::Result<ToSqlOutput<'_>> {
        Ok(ToSqlOutput::from(self.bits()))
    }
}

/// Convert `c` to lower case if it's an alphabetic ascii character, or completely mangle it if it's
/// not. Just returns `c | 0x20`. I don't know if I actually believe this is faster in a way that
/// matters than the saner version.
#[inline(always)]
fn dubious_to_ascii_lower(c: u8) -> u8 {
    c | 0x20
}

/// A port of nextSearchCandidate in the desktop places's SQLFunctions.cpp:
///
/// > Scan forward through UTF-8 text until the next potential character that
/// > could match a given codepoint when lower-cased (false positives are okay).
/// > This avoids having to actually parse the UTF-8 text, which is slow.
///
/// It returns the byte index of the first character that could possibly match.
#[inline(always)]
fn next_search_candidate(to_search: &str, search_for: char) -> Option<usize> {
    // If the character we search for is ASCII, then we can scan until we find
    // it or its ASCII uppercase character, modulo the special cases
    // U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+212A KELVIN SIGN
    // (which are the only non-ASCII characters that lower-case to ASCII ones).
    // Since false positives are okay, we approximate ASCII lower-casing by
    // bit-ORing with 0x20, for increased performance.
    //
    // If the character we search for is *not* ASCII, we can ignore everything
    // that is, since all ASCII characters lower-case to ASCII.
    //
    // Because of how UTF-8 uses high-order bits, this will never land us
    // in the middle of a codepoint.
    //
    // The assumptions about Unicode made here are verified in test_casing.
    let search_bytes = to_search.as_bytes();
    if (search_for as u32) < 128 {
        // When searching for I or K, we pick out the first byte of the UTF-8
        // encoding of the corresponding special case character, and look for it
        // in the loop below.  For other characters we fall back to 0xff, which
        // is not a valid UTF-8 byte.
        let target = dubious_to_ascii_lower(search_for as u8);
        let special = if target == b'i' {
            0xc4u8
        } else if target == b'k' {
            0xe2u8
        } else {
            0xffu8
        };
        // Note: rustc doesn't do great at all on the more idiomatic
        // implementation of this (or below), but it does okay for this.
        let mut ci = 0;
        while ci < search_bytes.len() {
            let cur = search_bytes[ci];
            if dubious_to_ascii_lower(cur) == target || cur == special {
                return Some(ci);
            }
            ci += 1;
        }
    } else {
        let mut ci = 0;
        while ci < search_bytes.len() {
            let cur = search_bytes[ci];
            if (cur & 0x80) != 0 {
                return Some(ci);
            }
            ci += 1;
        }
    }
    None
}

#[inline(always)]
fn is_ascii_lower_alpha(c: u8) -> bool {
    // Equivalent to (but fewer operations than) `b'a' <= c && c <= b'z'`
    c.wrapping_sub(b'a') <= (b'z' - b'a')
}

/// port of isOnBoundary from gecko places.
///
/// > Check whether a character position is on a word boundary of a UTF-8 string
/// > (rather than within a word).  We define "within word" to be any position
/// > between [a-zA-Z] and [a-z] -- this lets us match CamelCase words.
/// > TODO: support non-latin alphabets.
#[inline(always)]
fn is_on_boundary(text: &str, index: usize) -> bool {
    if index == 0 {
        return true;
    }
    let bytes = text.as_bytes();
    if is_ascii_lower_alpha(bytes[index]) {
        let prev_lower = dubious_to_ascii_lower(bytes[index - 1]);
        !is_ascii_lower_alpha(prev_lower)
    } else {
        true
    }
}

/// Returns true if `source` starts with `token` ignoring case.
///
/// Loose port of stringMatch from places, which we've modified to perform more correct case
/// folding (if this turns out to be a perf issue we can always address it then).
#[inline]
fn string_match(token: &str, source: &str) -> bool {
    if source.len() < token.len() {
        return false;
    }
    let mut ti = token.chars().default_case_fold();
    let mut si = source.chars().default_case_fold();
    loop {
        match (ti.next(), si.next()) {
            (None, _) => return true,
            (Some(_), None) => return false,
            (Some(x), Some(y)) => {
                if x != y {
                    return false;
                }
            }
        }
    }
}

/// This performs single-codepoint case folding. It will do the wrong thing
/// for characters which have lowercase equivalents with multiple characters.
#[inline]
fn char_to_lower_single(c: char) -> char {
    c.to_lowercase().next().unwrap()
}

/// Read the next codepoint out of `s` and return it's lowercase variant, and the index of the
/// codepoint after it.
#[inline]
fn next_codepoint_lower(s: &str) -> (char, usize) {
    // This is super convoluted, and I wish a more direct way to do it was exposed. (In theory
    // this should be more efficient than this implementation is)
    let mut indices = s.char_indices();
    let (_, next_char) = indices.next().unwrap();
    let next_index = indices
        .next()
        .map(|(index, _)| index)
        .unwrap_or_else(|| s.len());
    (char_to_lower_single(next_char), next_index)
}

// Port of places `findInString`.
pub fn find_in_string(token: &str, src: &str, only_boundary: bool) -> bool {
    // Place's version has this restriction too
    assert!(!token.is_empty(), "Don't search for an empty string");
    if src.len() < token.len() {
        return false;
    }

    let token_first_char = next_codepoint_lower(token).0;
    // The C++ code is a big ol pointer party, and even indexes with negative numbers
    // in some places. We aren't quite this depraved, so we just use indices into slices.
    //
    // There's probably a higher cost to this than usual, and if we had more robust testing
    // (fuzzing, even) it might be worth measuring a version of this that avoids more of the
    // bounds checks.
    let mut cur_offset = 0;
    // Scan forward to the next viable candidate (if any).
    while let Some(src_idx) = next_search_candidate(&src[cur_offset..], token_first_char) {
        if cur_offset + src_idx >= src.len() {
            break;
        }
        cur_offset += src_idx;
        let src_cur = &src[cur_offset..];

        // Check whether the first character in the token matches the character
        // at src_cur. At the same time, get the index of the next character
        // in the source.
        let (src_next_char, next_offset_in_cur) = next_codepoint_lower(src_cur);

        // If it is the first character, and we either don't care about boundaries or
        // we're on one, do the more expensive string matching and return true if it hits.
        if src_next_char == token_first_char
            && (!only_boundary || is_on_boundary(src, cur_offset))
            && string_match(token, src_cur)
        {
            return true;
        }
        cur_offset += next_offset_in_cur;
    }
    false
}

// Search functions used as function pointers by AutocompleteMatch::Invoke

fn find_anywhere(token: &str, source: &str) -> bool {
    assert!(!token.is_empty(), "Don't search for an empty token");
    find_in_string(token, source, false)
}

fn find_on_boundary(token: &str, source: &str) -> bool {
    assert!(!token.is_empty(), "Don't search for an empty token");
    find_in_string(token, source, true)
}

fn find_beginning(token: &str, source: &str) -> bool {
    assert!(!token.is_empty(), "Don't search for an empty token");
    string_match(token, source)
}

fn find_beginning_case_sensitive(token: &str, source: &str) -> bool {
    assert!(!token.is_empty(), "Don't search for an empty token");
    source.starts_with(token)
}

// I can't wait for Rust 2018 when lifetime annotations are automatic.
pub struct AutocompleteMatch<'search, 'url, 'title, 'tags> {
    pub search_str: &'search str,
    pub url_str: &'url str,
    pub title_str: &'title str,
    pub tags: &'tags str,
    pub visit_count: u32,
    pub typed: bool,
    pub bookmarked: bool,
    pub open_page_count: u32,
    pub match_behavior: MatchBehavior,
    pub search_behavior: SearchBehavior,
}

impl AutocompleteMatch<'_, '_, '_, '_> {
    fn get_search_fn(&self) -> fn(&str, &str) -> bool {
        match self.match_behavior {
            MatchBehavior::Anywhere | MatchBehavior::AnywhereUnmodified => find_anywhere,
            MatchBehavior::Beginning => find_beginning,
            MatchBehavior::BeginningCaseSensitive => find_beginning_case_sensitive,
            _ => find_on_boundary,
        }
    }

    fn fixup_url_str<'a>(&self, mut s: &'a str) -> Cow<'a, str> {
        if self.match_behavior != MatchBehavior::AnywhereUnmodified {
            if s.starts_with("http://") {
                s = &s[7..];
            } else if s.starts_with("https://") {
                s = &s[8..];
            } else if s.starts_with("ftp://") {
                s = &s[6..];
            }
        }
        // Bail out early if we don't need to percent decode. It's a
        // little weird that it's measurably faster to check this
        // separately, but whatever.
        if memchr::memchr(b'%', s.as_bytes()).is_none() {
            return Cow::Borrowed(s);
        }
        // TODO: would be nice to decode punycode here too, but for now
        // this is probably fine.
        match percent_encoding::percent_decode(s.as_bytes()).decode_utf8() {
            Err(_) => Cow::Borrowed(s),
            Ok(decoded) => decoded,
        }
    }

    #[inline]
    fn has_behavior(&self, behavior: SearchBehavior) -> bool {
        self.search_behavior.intersects(behavior)
    }

    pub fn invoke(&self) -> bool {
        // We only want to filter javascript: URLs if we are not supposed to search
        // for them, and the search does not start with "javascript:".
        if self.match_behavior == MatchBehavior::AnywhereUnmodified
            && self.url_str.starts_with("javascript:")
            && !self.has_behavior(SearchBehavior::JAVASCRIPT)
            && !self.search_str.starts_with("javascript:")
        {
            return false;
        }
        let matches = if self.has_behavior(SearchBehavior::RESTRICT) {
            (!self.has_behavior(SearchBehavior::HISTORY) || self.visit_count > 0)
                && (!self.has_behavior(SearchBehavior::TYPED) || self.typed)
                && (!self.has_behavior(SearchBehavior::BOOKMARK) || self.bookmarked)
                && (!self.has_behavior(SearchBehavior::TAG) || !self.tags.is_empty())
                && (!self.has_behavior(SearchBehavior::OPENPAGE) || self.open_page_count > 0)
        } else {
            (self.has_behavior(SearchBehavior::HISTORY) && self.visit_count > 0)
                || (self.has_behavior(SearchBehavior::TYPED) && self.typed)
                || (self.has_behavior(SearchBehavior::BOOKMARK) && self.bookmarked)
                || (self.has_behavior(SearchBehavior::TAG) && !self.tags.is_empty())
                || (self.has_behavior(SearchBehavior::OPENPAGE) && self.open_page_count > 0)
        };
        if !matches {
            return false;
        }
        let fixed_url = self.fixup_url_str(self.url_str);
        let search_fn = self.get_search_fn();

        let trimmed_url = util::slice_up_to(fixed_url.as_ref(), MAX_CHARS_TO_SEARCH_THROUGH);
        let trimmed_title = util::slice_up_to(self.title_str, MAX_CHARS_TO_SEARCH_THROUGH);
        for token in self.search_str.split_ascii_whitespace() {
            let matches = match (
                self.has_behavior(SearchBehavior::TITLE),
                self.has_behavior(SearchBehavior::URL),
            ) {
                (true, true) => {
                    (search_fn(token, trimmed_title) || search_fn(token, self.tags))
                        && search_fn(token, trimmed_url)
                }
                (true, false) => search_fn(token, trimmed_title) || search_fn(token, self.tags),
                (false, true) => search_fn(token, trimmed_url),
                (false, false) => {
                    search_fn(token, trimmed_url)
                        || search_fn(token, trimmed_title)
                        || search_fn(token, self.tags)
                }
            };
            if !matches {
                return false;
            }
        }
        true
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_is_ascii_lower_alpha() {
        // just check exhaustively
        for c in 0u8..=255u8 {
            assert_eq!(
                is_ascii_lower_alpha(c),
                c.is_ascii_lowercase(),
                "is_lower_ascii_alpha is wrong for {}",
                c
            );
        }
    }

    // Test the various dubious things this code assumes about unicode / ascii text
    // in the name of performance. This is mostly a port of the test_casing gtests in places
    #[test]
    fn test_casing_assumptions() {
        use std::char;
        // Verify the assertion in next_search_candidate that the
        // only non-ASCII characters that lower-case to ASCII ones are:
        //  * U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE
        //  * U+212A KELVIN SIGN
        //
        // It also checks that U+0130 is the only single codepoint that lower cases
        // to multiple characters.
        for c in 128..0x11_0000 {
            if let Some(ch) = char::from_u32(c) {
                // Not quite the same (because codepoints aren't characters), but
                // should serve the same purpose.
                let mut li = ch.to_lowercase();
                let lc = li.next().unwrap();
                if c != 304 && c != 8490 {
                    assert!(
                        (lc as u32) >= 128,
                        "Lower case of non-ascii '{}' ({}) was unexpectedly ascii",
                        ch,
                        c
                    );
                    // This one we added (it's an implicit assumption in the utilities the
                    // places code uses).
                    assert!(
                        li.next().is_none(),
                        "Lower case of '{}' ({}) produced multiple codepoints unexpectedly",
                        ch,
                        c
                    );
                } else {
                    assert!(
                        (lc as u32) < 128,
                        "Lower case of non-ascii '{}' ({}) was unexpectedly not ascii",
                        ch,
                        c
                    );
                }
            }
        }

        // Verify the assertion that all ASCII characters lower-case to ASCII.
        for c in 0..128 {
            let ch = char::from_u32(c).unwrap();
            let mut li = ch.to_lowercase();
            let lc = li.next().unwrap();
            assert!(
                li.next().is_none() && (lc as u32) < 128,
                "Lower case of ascii '{}' ({}) wasn't ascii :(",
                ch,
                c
            );
        }

        for c in (b'a'..=b'z').chain(b'A'..=b'Z') {
            assert_eq!(
                dubious_to_ascii_lower(c),
                c.to_ascii_lowercase(),
                "c: '{}'",
                c as char
            );
        }
    }
}
places/match_impl.rs

places/
match_impl.rs