places/
match_impl.rs

1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5use crate::util;
6use bitflags::bitflags;
7use caseless::Caseless;
8use rusqlite::{
9    self,
10    types::{FromSql, FromSqlError, FromSqlResult, ToSql, ToSqlOutput, ValueRef},
11};
12use std::borrow::Cow;
13
14const MAX_CHARS_TO_SEARCH_THROUGH: usize = 255;
15
16#[derive(Clone, Copy, PartialEq, Eq, Debug)]
17#[repr(u32)]
18pub enum MatchBehavior {
19    // Match anywhere in each searchable tearm
20    Anywhere = 0,
21    /// Match first on word boundaries, and if we do not get enough results, then
22    /// match anywhere in each searchable term.
23    BoundaryAnywhere = 1,
24    /// Match on word boundaries in each searchable term.
25    Boundary = 2,
26    /// Match only the beginning of each search term.
27    Beginning = 3,
28    /// Match anywhere in each searchable term without doing any transformation
29    /// or stripping on the underlying data.
30    AnywhereUnmodified = 4,
31    /// Match only the beginning of each search term using a case sensitive
32    /// comparator
33    BeginningCaseSensitive = 5,
34}
35
36impl FromSql for MatchBehavior {
37    #[inline]
38    fn column_result(value: ValueRef<'_>) -> FromSqlResult<Self> {
39        Ok(match value.as_i64()? {
40            0 => MatchBehavior::Anywhere,
41            1 => MatchBehavior::BoundaryAnywhere,
42            2 => MatchBehavior::Boundary,
43            3 => MatchBehavior::Beginning,
44            4 => MatchBehavior::AnywhereUnmodified,
45            5 => MatchBehavior::BeginningCaseSensitive,
46            _ => return Err(FromSqlError::InvalidType),
47        })
48    }
49}
50
51impl ToSql for MatchBehavior {
52    #[inline]
53    fn to_sql(&self) -> rusqlite::Result<ToSqlOutput<'_>> {
54        Ok(ToSqlOutput::from(*self as u32))
55    }
56}
57
58bitflags! {
59    pub struct SearchBehavior: u32 {
60        /// Search through history.
61        const HISTORY = 1;
62
63        /// Search through bookmarks.
64        const BOOKMARK = 1 << 1;
65
66        /// Search through tags.
67        const TAG = 1 << 2;
68
69        /// Search through the title of pages.
70        const TITLE = 1 << 3;
71
72        /// Search the URL of pages.
73        const URL = 1 << 4;
74
75        /// Search for typed pages
76        const TYPED = 1 << 5;
77
78        /// Search for javascript: urls
79        const JAVASCRIPT = 1 << 6;
80
81        /// Search for open pages (currently not meaningfully implemented)
82        const OPENPAGE = 1 << 7;
83
84        /// Use intersection between history, typed, bookmark, tag and openpage
85        /// instead of union, when the restrict bit is set.
86        const RESTRICT = 1 << 8;
87
88        /// Include search suggestions from the currently selected search provider
89        /// (currently not implemented)
90        const SEARCHES = 1 << 9;
91    }
92}
93
94impl Default for SearchBehavior {
95    // See `defaultBehavior` in Desktop's `UrlbarPrefs.jsm`.
96    fn default() -> SearchBehavior {
97        SearchBehavior::HISTORY
98            | SearchBehavior::BOOKMARK
99            | SearchBehavior::OPENPAGE
100            | SearchBehavior::SEARCHES
101    }
102}
103
104impl SearchBehavior {
105    #[inline]
106    pub fn any() -> Self {
107        SearchBehavior::all() & !SearchBehavior::RESTRICT
108    }
109}
110
111impl FromSql for SearchBehavior {
112    #[inline]
113    fn column_result(value: ValueRef<'_>) -> FromSqlResult<Self> {
114        SearchBehavior::from_bits(u32::column_result(value)?).ok_or(FromSqlError::InvalidType)
115    }
116}
117
118impl ToSql for SearchBehavior {
119    #[inline]
120    fn to_sql(&self) -> rusqlite::Result<ToSqlOutput<'_>> {
121        Ok(ToSqlOutput::from(self.bits()))
122    }
123}
124
125/// Convert `c` to lower case if it's an alphabetic ascii character, or completely mangle it if it's
126/// not. Just returns `c | 0x20`. I don't know if I actually believe this is faster in a way that
127/// matters than the saner version.
128#[inline(always)]
129fn dubious_to_ascii_lower(c: u8) -> u8 {
130    c | 0x20
131}
132
133/// A port of nextSearchCandidate in the desktop places's SQLFunctions.cpp:
134///
135/// > Scan forward through UTF-8 text until the next potential character that
136/// > could match a given codepoint when lower-cased (false positives are okay).
137/// > This avoids having to actually parse the UTF-8 text, which is slow.
138///
139/// It returns the byte index of the first character that could possibly match.
140#[inline(always)]
141fn next_search_candidate(to_search: &str, search_for: char) -> Option<usize> {
142    // If the character we search for is ASCII, then we can scan until we find
143    // it or its ASCII uppercase character, modulo the special cases
144    // U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE and U+212A KELVIN SIGN
145    // (which are the only non-ASCII characters that lower-case to ASCII ones).
146    // Since false positives are okay, we approximate ASCII lower-casing by
147    // bit-ORing with 0x20, for increased performance.
148    //
149    // If the character we search for is *not* ASCII, we can ignore everything
150    // that is, since all ASCII characters lower-case to ASCII.
151    //
152    // Because of how UTF-8 uses high-order bits, this will never land us
153    // in the middle of a codepoint.
154    //
155    // The assumptions about Unicode made here are verified in test_casing.
156    let search_bytes = to_search.as_bytes();
157    if (search_for as u32) < 128 {
158        // When searching for I or K, we pick out the first byte of the UTF-8
159        // encoding of the corresponding special case character, and look for it
160        // in the loop below.  For other characters we fall back to 0xff, which
161        // is not a valid UTF-8 byte.
162        let target = dubious_to_ascii_lower(search_for as u8);
163        let special = if target == b'i' {
164            0xc4u8
165        } else if target == b'k' {
166            0xe2u8
167        } else {
168            0xffu8
169        };
170        // Note: rustc doesn't do great at all on the more idiomatic
171        // implementation of this (or below), but it does okay for this.
172        let mut ci = 0;
173        while ci < search_bytes.len() {
174            let cur = search_bytes[ci];
175            if dubious_to_ascii_lower(cur) == target || cur == special {
176                return Some(ci);
177            }
178            ci += 1;
179        }
180    } else {
181        let mut ci = 0;
182        while ci < search_bytes.len() {
183            let cur = search_bytes[ci];
184            if (cur & 0x80) != 0 {
185                return Some(ci);
186            }
187            ci += 1;
188        }
189    }
190    None
191}
192
193#[inline(always)]
194fn is_ascii_lower_alpha(c: u8) -> bool {
195    // Equivalent to (but fewer operations than) `b'a' <= c && c <= b'z'`
196    c.wrapping_sub(b'a') <= (b'z' - b'a')
197}
198
199/// port of isOnBoundary from gecko places.
200///
201/// > Check whether a character position is on a word boundary of a UTF-8 string
202/// > (rather than within a word).  We define "within word" to be any position
203/// > between [a-zA-Z] and [a-z] -- this lets us match CamelCase words.
204/// > TODO: support non-latin alphabets.
205#[inline(always)]
206fn is_on_boundary(text: &str, index: usize) -> bool {
207    if index == 0 {
208        return true;
209    }
210    let bytes = text.as_bytes();
211    if is_ascii_lower_alpha(bytes[index]) {
212        let prev_lower = dubious_to_ascii_lower(bytes[index - 1]);
213        !is_ascii_lower_alpha(prev_lower)
214    } else {
215        true
216    }
217}
218
219/// Returns true if `source` starts with `token` ignoring case.
220///
221/// Loose port of stringMatch from places, which we've modified to perform more correct case
222/// folding (if this turns out to be a perf issue we can always address it then).
223#[inline]
224fn string_match(token: &str, source: &str) -> bool {
225    if source.len() < token.len() {
226        return false;
227    }
228    let mut ti = token.chars().default_case_fold();
229    let mut si = source.chars().default_case_fold();
230    loop {
231        match (ti.next(), si.next()) {
232            (None, _) => return true,
233            (Some(_), None) => return false,
234            (Some(x), Some(y)) => {
235                if x != y {
236                    return false;
237                }
238            }
239        }
240    }
241}
242
243/// This performs single-codepoint case folding. It will do the wrong thing
244/// for characters which have lowercase equivalents with multiple characters.
245#[inline]
246fn char_to_lower_single(c: char) -> char {
247    c.to_lowercase().next().unwrap()
248}
249
250/// Read the next codepoint out of `s` and return it's lowercase variant, and the index of the
251/// codepoint after it.
252#[inline]
253fn next_codepoint_lower(s: &str) -> (char, usize) {
254    // This is super convoluted, and I wish a more direct way to do it was exposed. (In theory
255    // this should be more efficient than this implementation is)
256    let mut indices = s.char_indices();
257    let (_, next_char) = indices.next().unwrap();
258    let next_index = indices
259        .next()
260        .map(|(index, _)| index)
261        .unwrap_or_else(|| s.len());
262    (char_to_lower_single(next_char), next_index)
263}
264
265// Port of places `findInString`.
266pub fn find_in_string(token: &str, src: &str, only_boundary: bool) -> bool {
267    // Place's version has this restriction too
268    assert!(!token.is_empty(), "Don't search for an empty string");
269    if src.len() < token.len() {
270        return false;
271    }
272
273    let token_first_char = next_codepoint_lower(token).0;
274    // The C++ code is a big ol pointer party, and even indexes with negative numbers
275    // in some places. We aren't quite this depraved, so we just use indices into slices.
276    //
277    // There's probably a higher cost to this than usual, and if we had more robust testing
278    // (fuzzing, even) it might be worth measuring a version of this that avoids more of the
279    // bounds checks.
280    let mut cur_offset = 0;
281    // Scan forward to the next viable candidate (if any).
282    while let Some(src_idx) = next_search_candidate(&src[cur_offset..], token_first_char) {
283        if cur_offset + src_idx >= src.len() {
284            break;
285        }
286        cur_offset += src_idx;
287        let src_cur = &src[cur_offset..];
288
289        // Check whether the first character in the token matches the character
290        // at src_cur. At the same time, get the index of the next character
291        // in the source.
292        let (src_next_char, next_offset_in_cur) = next_codepoint_lower(src_cur);
293
294        // If it is the first character, and we either don't care about boundaries or
295        // we're on one, do the more expensive string matching and return true if it hits.
296        if src_next_char == token_first_char
297            && (!only_boundary || is_on_boundary(src, cur_offset))
298            && string_match(token, src_cur)
299        {
300            return true;
301        }
302        cur_offset += next_offset_in_cur;
303    }
304    false
305}
306
307// Search functions used as function pointers by AutocompleteMatch::Invoke
308
309fn find_anywhere(token: &str, source: &str) -> bool {
310    assert!(!token.is_empty(), "Don't search for an empty token");
311    find_in_string(token, source, false)
312}
313
314fn find_on_boundary(token: &str, source: &str) -> bool {
315    assert!(!token.is_empty(), "Don't search for an empty token");
316    find_in_string(token, source, true)
317}
318
319fn find_beginning(token: &str, source: &str) -> bool {
320    assert!(!token.is_empty(), "Don't search for an empty token");
321    string_match(token, source)
322}
323
324fn find_beginning_case_sensitive(token: &str, source: &str) -> bool {
325    assert!(!token.is_empty(), "Don't search for an empty token");
326    source.starts_with(token)
327}
328
329// I can't wait for Rust 2018 when lifetime annotations are automatic.
330pub struct AutocompleteMatch<'search, 'url, 'title, 'tags> {
331    pub search_str: &'search str,
332    pub url_str: &'url str,
333    pub title_str: &'title str,
334    pub tags: &'tags str,
335    pub visit_count: u32,
336    pub typed: bool,
337    pub bookmarked: bool,
338    pub open_page_count: u32,
339    pub match_behavior: MatchBehavior,
340    pub search_behavior: SearchBehavior,
341}
342
343impl AutocompleteMatch<'_, '_, '_, '_> {
344    fn get_search_fn(&self) -> fn(&str, &str) -> bool {
345        match self.match_behavior {
346            MatchBehavior::Anywhere | MatchBehavior::AnywhereUnmodified => find_anywhere,
347            MatchBehavior::Beginning => find_beginning,
348            MatchBehavior::BeginningCaseSensitive => find_beginning_case_sensitive,
349            _ => find_on_boundary,
350        }
351    }
352
353    fn fixup_url_str<'a>(&self, mut s: &'a str) -> Cow<'a, str> {
354        if self.match_behavior != MatchBehavior::AnywhereUnmodified {
355            if s.starts_with("http://") {
356                s = &s[7..];
357            } else if s.starts_with("https://") {
358                s = &s[8..];
359            } else if s.starts_with("ftp://") {
360                s = &s[6..];
361            }
362        }
363        // Bail out early if we don't need to percent decode. It's a
364        // little weird that it's measurably faster to check this
365        // separately, but whatever.
366        if memchr::memchr(b'%', s.as_bytes()).is_none() {
367            return Cow::Borrowed(s);
368        }
369        // TODO: would be nice to decode punycode here too, but for now
370        // this is probably fine.
371        match percent_encoding::percent_decode(s.as_bytes()).decode_utf8() {
372            Err(_) => Cow::Borrowed(s),
373            Ok(decoded) => decoded,
374        }
375    }
376
377    #[inline]
378    fn has_behavior(&self, behavior: SearchBehavior) -> bool {
379        self.search_behavior.intersects(behavior)
380    }
381
382    pub fn invoke(&self) -> bool {
383        // We only want to filter javascript: URLs if we are not supposed to search
384        // for them, and the search does not start with "javascript:".
385        if self.match_behavior == MatchBehavior::AnywhereUnmodified
386            && self.url_str.starts_with("javascript:")
387            && !self.has_behavior(SearchBehavior::JAVASCRIPT)
388            && !self.search_str.starts_with("javascript:")
389        {
390            return false;
391        }
392        let matches = if self.has_behavior(SearchBehavior::RESTRICT) {
393            (!self.has_behavior(SearchBehavior::HISTORY) || self.visit_count > 0)
394                && (!self.has_behavior(SearchBehavior::TYPED) || self.typed)
395                && (!self.has_behavior(SearchBehavior::BOOKMARK) || self.bookmarked)
396                && (!self.has_behavior(SearchBehavior::TAG) || !self.tags.is_empty())
397                && (!self.has_behavior(SearchBehavior::OPENPAGE) || self.open_page_count > 0)
398        } else {
399            (self.has_behavior(SearchBehavior::HISTORY) && self.visit_count > 0)
400                || (self.has_behavior(SearchBehavior::TYPED) && self.typed)
401                || (self.has_behavior(SearchBehavior::BOOKMARK) && self.bookmarked)
402                || (self.has_behavior(SearchBehavior::TAG) && !self.tags.is_empty())
403                || (self.has_behavior(SearchBehavior::OPENPAGE) && self.open_page_count > 0)
404        };
405        if !matches {
406            return false;
407        }
408        let fixed_url = self.fixup_url_str(self.url_str);
409        let search_fn = self.get_search_fn();
410
411        let trimmed_url = util::slice_up_to(fixed_url.as_ref(), MAX_CHARS_TO_SEARCH_THROUGH);
412        let trimmed_title = util::slice_up_to(self.title_str, MAX_CHARS_TO_SEARCH_THROUGH);
413        for token in self.search_str.split_ascii_whitespace() {
414            let matches = match (
415                self.has_behavior(SearchBehavior::TITLE),
416                self.has_behavior(SearchBehavior::URL),
417            ) {
418                (true, true) => {
419                    (search_fn(token, trimmed_title) || search_fn(token, self.tags))
420                        && search_fn(token, trimmed_url)
421                }
422                (true, false) => search_fn(token, trimmed_title) || search_fn(token, self.tags),
423                (false, true) => search_fn(token, trimmed_url),
424                (false, false) => {
425                    search_fn(token, trimmed_url)
426                        || search_fn(token, trimmed_title)
427                        || search_fn(token, self.tags)
428                }
429            };
430            if !matches {
431                return false;
432            }
433        }
434        true
435    }
436}
437
438#[cfg(test)]
439mod test {
440    use super::*;
441
442    #[test]
443    fn test_is_ascii_lower_alpha() {
444        // just check exhaustively
445        for c in 0u8..=255u8 {
446            assert_eq!(
447                is_ascii_lower_alpha(c),
448                c.is_ascii_lowercase(),
449                "is_lower_ascii_alpha is wrong for {}",
450                c
451            );
452        }
453    }
454
455    // Test the various dubious things this code assumes about unicode / ascii text
456    // in the name of performance. This is mostly a port of the test_casing gtests in places
457    #[test]
458    fn test_casing_assumptions() {
459        use std::char;
460        // Verify the assertion in next_search_candidate that the
461        // only non-ASCII characters that lower-case to ASCII ones are:
462        //  * U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE
463        //  * U+212A KELVIN SIGN
464        //
465        // It also checks that U+0130 is the only single codepoint that lower cases
466        // to multiple characters.
467        for c in 128..0x11_0000 {
468            if let Some(ch) = char::from_u32(c) {
469                // Not quite the same (because codepoints aren't characters), but
470                // should serve the same purpose.
471                let mut li = ch.to_lowercase();
472                let lc = li.next().unwrap();
473                if c != 304 && c != 8490 {
474                    assert!(
475                        (lc as u32) >= 128,
476                        "Lower case of non-ascii '{}' ({}) was unexpectedly ascii",
477                        ch,
478                        c
479                    );
480                    // This one we added (it's an implicit assumption in the utilities the
481                    // places code uses).
482                    assert!(
483                        li.next().is_none(),
484                        "Lower case of '{}' ({}) produced multiple codepoints unexpectedly",
485                        ch,
486                        c
487                    );
488                } else {
489                    assert!(
490                        (lc as u32) < 128,
491                        "Lower case of non-ascii '{}' ({}) was unexpectedly not ascii",
492                        ch,
493                        c
494                    );
495                }
496            }
497        }
498
499        // Verify the assertion that all ASCII characters lower-case to ASCII.
500        for c in 0..128 {
501            let ch = char::from_u32(c).unwrap();
502            let mut li = ch.to_lowercase();
503            let lc = li.next().unwrap();
504            assert!(
505                li.next().is_none() && (lc as u32) < 128,
506                "Lower case of ascii '{}' ({}) wasn't ascii :(",
507                ch,
508                c
509            );
510        }
511
512        for c in (b'a'..=b'z').chain(b'A'..=b'Z') {
513            assert_eq!(
514                dubious_to_ascii_lower(c),
515                c.to_ascii_lowercase(),
516                "c: '{}'",
517                c as char
518            );
519        }
520    }
521}