places/
match_impl.rs

1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5use crate::util;
6use bitflags::bitflags;
7use icu_casemap::CaseMapperBorrowed;
8use rusqlite::{
9    self,
10    types::{FromSql, FromSqlError, FromSqlResult, ToSql, ToSqlOutput, ValueRef},
11};
12use std::borrow::Cow;
13
14const MAX_CHARS_TO_SEARCH_THROUGH: usize = 255;
15
16static CASE_MAPPER: CaseMapperBorrowed<'_> = CaseMapperBorrowed::new();
17
18#[derive(Clone, Copy, PartialEq, Eq, Debug)]
19#[repr(u32)]
20pub enum MatchBehavior {
21    // Match anywhere in each searchable tearm
22    Anywhere = 0,
23    /// Match first on word boundaries, and if we do not get enough results, then
24    /// match anywhere in each searchable term.
25    BoundaryAnywhere = 1,
26    /// Match on word boundaries in each searchable term.
27    Boundary = 2,
28    /// Match only the beginning of each search term.
29    Beginning = 3,
30    /// Match anywhere in each searchable term without doing any transformation
31    /// or stripping on the underlying data.
32    AnywhereUnmodified = 4,
33    /// Match only the beginning of each search term using a case sensitive
34    /// comparator
35    BeginningCaseSensitive = 5,
36}
37
38impl FromSql for MatchBehavior {
39    #[inline]
40    fn column_result(value: ValueRef<'_>) -> FromSqlResult<Self> {
41        Ok(match value.as_i64()? {
42            0 => MatchBehavior::Anywhere,
43            1 => MatchBehavior::BoundaryAnywhere,
44            2 => MatchBehavior::Boundary,
45            3 => MatchBehavior::Beginning,
46            4 => MatchBehavior::AnywhereUnmodified,
47            5 => MatchBehavior::BeginningCaseSensitive,
48            _ => return Err(FromSqlError::InvalidType),
49        })
50    }
51}
52
53impl ToSql for MatchBehavior {
54    #[inline]
55    fn to_sql(&self) -> rusqlite::Result<ToSqlOutput<'_>> {
56        Ok(ToSqlOutput::from(*self as u32))
57    }
58}
59
60bitflags! {
61    pub struct SearchBehavior: u32 {
62        /// Search through history.
63        const HISTORY = 1;
64
65        /// Search through bookmarks.
66        const BOOKMARK = 1 << 1;
67
68        /// Search through tags.
69        const TAG = 1 << 2;
70
71        /// Search through the title of pages.
72        const TITLE = 1 << 3;
73
74        /// Search the URL of pages.
75        const URL = 1 << 4;
76
77        /// Search for typed pages
78        const TYPED = 1 << 5;
79
80        /// Search for javascript: urls
81        const JAVASCRIPT = 1 << 6;
82
83        /// Search for open pages (currently not meaningfully implemented)
84        const OPENPAGE = 1 << 7;
85
86        /// Use intersection between history, typed, bookmark, tag and openpage
87        /// instead of union, when the restrict bit is set.
88        const RESTRICT = 1 << 8;
89
90        /// Include search suggestions from the currently selected search provider
91        /// (currently not implemented)
92        const SEARCHES = 1 << 9;
93    }
94}
95
96impl Default for SearchBehavior {
97    // See `defaultBehavior` in Desktop's `UrlbarPrefs.jsm`.
98    fn default() -> SearchBehavior {
99        SearchBehavior::HISTORY
100            | SearchBehavior::BOOKMARK
101            | SearchBehavior::OPENPAGE
102            | SearchBehavior::SEARCHES
103    }
104}
105
106impl SearchBehavior {
107    #[inline]
108    pub fn any() -> Self {
109        SearchBehavior::all() & !SearchBehavior::RESTRICT
110    }
111}
112
113impl FromSql for SearchBehavior {
114    #[inline]
115    fn column_result(value: ValueRef<'_>) -> FromSqlResult<Self> {
116        SearchBehavior::from_bits(u32::column_result(value)?).ok_or(FromSqlError::InvalidType)
117    }
118}
119
120impl ToSql for SearchBehavior {
121    #[inline]
122    fn to_sql(&self) -> rusqlite::Result<ToSqlOutput<'_>> {
123        Ok(ToSqlOutput::from(self.bits()))
124    }
125}
126
127/// Convert `c` to lower case if it's an alphabetic ascii character, or completely mangle it if it's
128/// not. Just returns `c | 0x20`. I don't know if I actually believe this is faster in a way that
129/// matters than the saner version.
130#[inline(always)]
131fn dubious_to_ascii_lower(c: u8) -> u8 {
132    c | 0x20
133}
134
135/// A port of nextSearchCandidate in the desktop places's SQLFunctions.cpp:
136///
137/// > Scan forward through UTF-8 text until the next potential character that
138/// > could match a given codepoint when lower-cased (false positives are okay).
139/// > This avoids having to actually parse the UTF-8 text, which is slow.
140///
141/// It returns the byte index of the first character that could possibly match.
142#[inline(always)]
143fn next_search_candidate(to_search: &str, search_for: char) -> Option<usize> {
144    // If the character we search for is ASCII, then we can scan until we find
145    // it or its ASCII uppercase character, modulo the special cases
146    // U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+212A KELVIN SIGN
147    // (which are the only non-ASCII characters that lower-case to ASCII ones).
148    // Since false positives are okay, we approximate ASCII lower-casing by
149    // bit-ORing with 0x20, for increased performance.
150    //
151    // If the character we search for is *not* ASCII, we can ignore everything
152    // that is, since all ASCII characters lower-case to ASCII.
153    //
154    // Because of how UTF-8 uses high-order bits, this will never land us
155    // in the middle of a codepoint.
156    //
157    // The assumptions about Unicode made here are verified in test_casing.
158    let search_bytes = to_search.as_bytes();
159    if (search_for as u32) < 128 {
160        // When searching for I or K, we pick out the first byte of the UTF-8
161        // encoding of the corresponding special case character, and look for it
162        // in the loop below.  For other characters we fall back to 0xff, which
163        // is not a valid UTF-8 byte.
164        let target = dubious_to_ascii_lower(search_for as u8);
165        let special = if target == b'i' {
166            0xc4u8
167        } else if target == b'k' {
168            0xe2u8
169        } else {
170            0xffu8
171        };
172        // Note: rustc doesn't do great at all on the more idiomatic
173        // implementation of this (or below), but it does okay for this.
174        let mut ci = 0;
175        while ci < search_bytes.len() {
176            let cur = search_bytes[ci];
177            if dubious_to_ascii_lower(cur) == target || cur == special {
178                return Some(ci);
179            }
180            ci += 1;
181        }
182    } else {
183        let mut ci = 0;
184        while ci < search_bytes.len() {
185            let cur = search_bytes[ci];
186            if (cur & 0x80) != 0 {
187                return Some(ci);
188            }
189            ci += 1;
190        }
191    }
192    None
193}
194
195#[inline(always)]
196fn is_ascii_lower_alpha(c: u8) -> bool {
197    // Equivalent to (but fewer operations than) `b'a' <= c && c <= b'z'`
198    c.wrapping_sub(b'a') <= (b'z' - b'a')
199}
200
201/// port of isOnBoundary from gecko places.
202///
203/// > Check whether a character position is on a word boundary of a UTF-8 string
204/// > (rather than within a word).  We define "within word" to be any position
205/// > between [a-zA-Z] and [a-z] -- this lets us match CamelCase words.
206/// > TODO: support non-latin alphabets.
207#[inline(always)]
208fn is_on_boundary(text: &str, index: usize) -> bool {
209    if index == 0 {
210        return true;
211    }
212    let bytes = text.as_bytes();
213    if is_ascii_lower_alpha(bytes[index]) {
214        let prev_lower = dubious_to_ascii_lower(bytes[index - 1]);
215        !is_ascii_lower_alpha(prev_lower)
216    } else {
217        true
218    }
219}
220
221/// Returns true if `source` starts with `token` ignoring case.
222///
223/// Loose port of stringMatch from places, which we've modified to perform more correct case
224/// folding (if this turns out to be a perf issue we can always address it then).
225#[inline]
226fn string_match(token: &str, source: &str) -> bool {
227    if source.len() < token.len() {
228        return false;
229    }
230    let t_folded = CASE_MAPPER.fold_string(token);
231    let mut ti = t_folded.chars();
232    let s_folded = CASE_MAPPER.fold_string(source);
233    let mut si = s_folded.chars();
234    loop {
235        match (ti.next(), si.next()) {
236            (None, _) => return true,
237            (Some(_), None) => return false,
238            (Some(x), Some(y)) => {
239                if x != y {
240                    return false;
241                }
242            }
243        }
244    }
245}
246
247/// This performs single-codepoint case folding. It will do the wrong thing
248/// for characters which have lowercase equivalents with multiple characters.
249#[inline]
250fn char_to_lower_single(c: char) -> char {
251    c.to_lowercase().next().unwrap()
252}
253
254/// Read the next codepoint out of `s` and return it's lowercase variant, and the index of the
255/// codepoint after it.
256#[inline]
257fn next_codepoint_lower(s: &str) -> (char, usize) {
258    // This is super convoluted, and I wish a more direct way to do it was exposed. (In theory
259    // this should be more efficient than this implementation is)
260    let mut indices = s.char_indices();
261    let (_, next_char) = indices.next().unwrap();
262    let next_index = indices
263        .next()
264        .map(|(index, _)| index)
265        .unwrap_or_else(|| s.len());
266    (char_to_lower_single(next_char), next_index)
267}
268
269// Port of places `findInString`.
270pub fn find_in_string(token: &str, src: &str, only_boundary: bool) -> bool {
271    // Place's version has this restriction too
272    assert!(!token.is_empty(), "Don't search for an empty string");
273    if src.len() < token.len() {
274        return false;
275    }
276
277    let token_first_char = next_codepoint_lower(token).0;
278    // The C++ code is a big ol pointer party, and even indexes with negative numbers
279    // in some places. We aren't quite this depraved, so we just use indices into slices.
280    //
281    // There's probably a higher cost to this than usual, and if we had more robust testing
282    // (fuzzing, even) it might be worth measuring a version of this that avoids more of the
283    // bounds checks.
284    let mut cur_offset = 0;
285    // Scan forward to the next viable candidate (if any).
286    while let Some(src_idx) = next_search_candidate(&src[cur_offset..], token_first_char) {
287        if cur_offset + src_idx >= src.len() {
288            break;
289        }
290        cur_offset += src_idx;
291        let src_cur = &src[cur_offset..];
292
293        // Check whether the first character in the token matches the character
294        // at src_cur. At the same time, get the index of the next character
295        // in the source.
296        let (src_next_char, next_offset_in_cur) = next_codepoint_lower(src_cur);
297
298        // If it is the first character, and we either don't care about boundaries or
299        // we're on one, do the more expensive string matching and return true if it hits.
300        if src_next_char == token_first_char
301            && (!only_boundary || is_on_boundary(src, cur_offset))
302            && string_match(token, src_cur)
303        {
304            return true;
305        }
306        cur_offset += next_offset_in_cur;
307    }
308    false
309}
310
311// Search functions used as function pointers by AutocompleteMatch::Invoke
312
313fn find_anywhere(token: &str, source: &str) -> bool {
314    assert!(!token.is_empty(), "Don't search for an empty token");
315    find_in_string(token, source, false)
316}
317
318fn find_on_boundary(token: &str, source: &str) -> bool {
319    assert!(!token.is_empty(), "Don't search for an empty token");
320    find_in_string(token, source, true)
321}
322
323fn find_beginning(token: &str, source: &str) -> bool {
324    assert!(!token.is_empty(), "Don't search for an empty token");
325    string_match(token, source)
326}
327
328fn find_beginning_case_sensitive(token: &str, source: &str) -> bool {
329    assert!(!token.is_empty(), "Don't search for an empty token");
330    source.starts_with(token)
331}
332
333// I can't wait for Rust 2018 when lifetime annotations are automatic.
334pub struct AutocompleteMatch<'search, 'url, 'title, 'tags> {
335    pub search_str: &'search str,
336    pub url_str: &'url str,
337    pub title_str: &'title str,
338    pub tags: &'tags str,
339    pub visit_count: u32,
340    pub typed: bool,
341    pub bookmarked: bool,
342    pub open_page_count: u32,
343    pub match_behavior: MatchBehavior,
344    pub search_behavior: SearchBehavior,
345}
346
347impl AutocompleteMatch<'_, '_, '_, '_> {
348    fn get_search_fn(&self) -> fn(&str, &str) -> bool {
349        match self.match_behavior {
350            MatchBehavior::Anywhere | MatchBehavior::AnywhereUnmodified => find_anywhere,
351            MatchBehavior::Beginning => find_beginning,
352            MatchBehavior::BeginningCaseSensitive => find_beginning_case_sensitive,
353            _ => find_on_boundary,
354        }
355    }
356
357    fn fixup_url_str<'a>(&self, mut s: &'a str) -> Cow<'a, str> {
358        if self.match_behavior != MatchBehavior::AnywhereUnmodified {
359            if s.starts_with("http://") {
360                s = &s[7..];
361            } else if s.starts_with("https://") {
362                s = &s[8..];
363            } else if s.starts_with("ftp://") {
364                s = &s[6..];
365            }
366        }
367        // Bail out early if we don't need to percent decode. It's a
368        // little weird that it's measurably faster to check this
369        // separately, but whatever.
370        if memchr::memchr(b'%', s.as_bytes()).is_none() {
371            return Cow::Borrowed(s);
372        }
373        // TODO: would be nice to decode punycode here too, but for now
374        // this is probably fine.
375        match percent_encoding::percent_decode(s.as_bytes()).decode_utf8() {
376            Err(_) => Cow::Borrowed(s),
377            Ok(decoded) => decoded,
378        }
379    }
380
381    #[inline]
382    fn has_behavior(&self, behavior: SearchBehavior) -> bool {
383        self.search_behavior.intersects(behavior)
384    }
385
386    pub fn invoke(&self) -> bool {
387        // We only want to filter javascript: URLs if we are not supposed to search
388        // for them, and the search does not start with "javascript:".
389        if self.match_behavior == MatchBehavior::AnywhereUnmodified
390            && self.url_str.starts_with("javascript:")
391            && !self.has_behavior(SearchBehavior::JAVASCRIPT)
392            && !self.search_str.starts_with("javascript:")
393        {
394            return false;
395        }
396        let matches = if self.has_behavior(SearchBehavior::RESTRICT) {
397            (!self.has_behavior(SearchBehavior::HISTORY) || self.visit_count > 0)
398                && (!self.has_behavior(SearchBehavior::TYPED) || self.typed)
399                && (!self.has_behavior(SearchBehavior::BOOKMARK) || self.bookmarked)
400                && (!self.has_behavior(SearchBehavior::TAG) || !self.tags.is_empty())
401                && (!self.has_behavior(SearchBehavior::OPENPAGE) || self.open_page_count > 0)
402        } else {
403            (self.has_behavior(SearchBehavior::HISTORY) && self.visit_count > 0)
404                || (self.has_behavior(SearchBehavior::TYPED) && self.typed)
405                || (self.has_behavior(SearchBehavior::BOOKMARK) && self.bookmarked)
406                || (self.has_behavior(SearchBehavior::TAG) && !self.tags.is_empty())
407                || (self.has_behavior(SearchBehavior::OPENPAGE) && self.open_page_count > 0)
408        };
409        if !matches {
410            return false;
411        }
412        let fixed_url = self.fixup_url_str(self.url_str);
413        let search_fn = self.get_search_fn();
414
415        let trimmed_url = util::slice_up_to(fixed_url.as_ref(), MAX_CHARS_TO_SEARCH_THROUGH);
416        let trimmed_title = util::slice_up_to(self.title_str, MAX_CHARS_TO_SEARCH_THROUGH);
417        for token in self.search_str.split_ascii_whitespace() {
418            let matches = match (
419                self.has_behavior(SearchBehavior::TITLE),
420                self.has_behavior(SearchBehavior::URL),
421            ) {
422                (true, true) => {
423                    (search_fn(token, trimmed_title) || search_fn(token, self.tags))
424                        && search_fn(token, trimmed_url)
425                }
426                (true, false) => search_fn(token, trimmed_title) || search_fn(token, self.tags),
427                (false, true) => search_fn(token, trimmed_url),
428                (false, false) => {
429                    search_fn(token, trimmed_url)
430                        || search_fn(token, trimmed_title)
431                        || search_fn(token, self.tags)
432                }
433            };
434            if !matches {
435                return false;
436            }
437        }
438        true
439    }
440}
441
442#[cfg(test)]
443mod test {
444    use super::*;
445
446    #[test]
447    fn test_is_ascii_lower_alpha() {
448        // just check exhaustively
449        for c in 0u8..=255u8 {
450            assert_eq!(
451                is_ascii_lower_alpha(c),
452                c.is_ascii_lowercase(),
453                "is_lower_ascii_alpha is wrong for {}",
454                c
455            );
456        }
457    }
458
459    // Test the various dubious things this code assumes about unicode / ascii text
460    // in the name of performance. This is mostly a port of the test_casing gtests in places
461    #[test]
462    fn test_casing_assumptions() {
463        use std::char;
464        // Verify the assertion in next_search_candidate that the
465        // only non-ASCII characters that lower-case to ASCII ones are:
466        //  * U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE
467        //  * U+212A KELVIN SIGN
468        //
469        // It also checks that U+0130 is the only single codepoint that lower cases
470        // to multiple characters.
471        for c in 128..0x11_0000 {
472            if let Some(ch) = char::from_u32(c) {
473                // Not quite the same (because codepoints aren't characters), but
474                // should serve the same purpose.
475                let mut li = ch.to_lowercase();
476                let lc = li.next().unwrap();
477                if c != 304 && c != 8490 {
478                    assert!(
479                        (lc as u32) >= 128,
480                        "Lower case of non-ascii '{}' ({}) was unexpectedly ascii",
481                        ch,
482                        c
483                    );
484                    // This one we added (it's an implicit assumption in the utilities the
485                    // places code uses).
486                    assert!(
487                        li.next().is_none(),
488                        "Lower case of '{}' ({}) produced multiple codepoints unexpectedly",
489                        ch,
490                        c
491                    );
492                } else {
493                    assert!(
494                        (lc as u32) < 128,
495                        "Lower case of non-ascii '{}' ({}) was unexpectedly not ascii",
496                        ch,
497                        c
498                    );
499                }
500            }
501        }
502
503        // Verify the assertion that all ASCII characters lower-case to ASCII.
504        for c in 0..128 {
505            let ch = char::from_u32(c).unwrap();
506            let mut li = ch.to_lowercase();
507            let lc = li.next().unwrap();
508            assert!(
509                li.next().is_none() && (lc as u32) < 128,
510                "Lower case of ascii '{}' ({}) wasn't ascii :(",
511                ch,
512                c
513            );
514        }
515
516        for c in (b'a'..=b'z').chain(b'A'..=b'Z') {
517            assert_eq!(
518                dubious_to_ascii_lower(c),
519                c.to_ascii_lowercase(),
520                "c: '{}'",
521                c as char
522            );
523        }
524    }
525}