relevancy/
url_hash.rs

1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5use md5::{Digest, Md5};
6use url::{Host, Url};
7
/// A URL hash: 16 raw bytes, the size of the MD5 digest that `hash_url` produces.
pub type UrlHash = [u8; 16];
9
10/// Given a URL, extract the part of it that we want to use to identify it.
11pub fn url_hash_source(url: &str) -> Option<String> {
12    // We currently use the final 2 components of the URL domain.
13    const URL_COMPONENTS_TO_USE: usize = 2;
14
15    let url = Url::parse(url).ok()?;
16    let domain = match url.host() {
17        Some(Host::Domain(d)) => d,
18        _ => return None,
19    };
20    // This will store indexes of `.` chars as we search backwards.
21    let mut pos = domain.len();
22    for _ in 0..URL_COMPONENTS_TO_USE {
23        match domain[0..pos].rfind('.') {
24            Some(p) => pos = p,
25            // The domain has less than 3 dots, return it all
26            None => return Some(domain.to_owned()),
27        }
28    }
29    Some(domain[pos + 1..].to_owned())
30}
31
32pub fn hash_url(url: &str) -> Option<UrlHash> {
33    url_hash_source(url).map(|hash_source| {
34        let mut hasher = Md5::new();
35        hasher.update(hash_source);
36        let result = hasher.finalize();
37        result.into()
38    })
39}
40
#[cfg(test)]
mod test {
    use super::*;

    /// Runs `url_hash_source` over a table of URLs and checks the hash source
    /// produced for each one.
    #[test]
    fn test_url_hash_source() {
        // Each entry is (input URL, expected hash source); `None` means the
        // URL has no hash source.
        let cases: [(&str, Option<&str>); 6] = [
            // A bare two-label domain is kept whole.
            ("http://example.com/some-path", Some("example.com")),
            // Extra subdomain labels are dropped, however many there are.
            ("http://foo.example.com/some-path", Some("example.com")),
            (
                "http://foo.bar.baz.example.com/some-path",
                Some("example.com"),
            ),
            // Only the label count matters: the last two labels are taken as-is.
            ("http://foo.com.uk/some-path", Some("com.uk")),
            ("http://amazon.com/some-path", Some("amazon.com")),
            // IP-address hosts have no hash source.
            ("http://192.168.0.1/some-path", None),
        ];

        for &(input, want) in cases.iter() {
            let got = url_hash_source(input);
            assert_eq!(got.as_deref(), want)
        }
    }
}