relevancy/
ingest.rs

1/* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5use crate::db::RelevancyDao;
6use crate::rs::{
7    from_json, from_json_slice, RelevancyAttachmentData, RelevancyRecord,
8    RelevancyRemoteSettingsClient,
9};
10use crate::url_hash::UrlHash;
11use crate::{Error, Interest, RelevancyDb, Result};
12use base64::{engine::general_purpose::STANDARD, Engine};
13use remote_settings::RemoteSettingsRecord;
14
// Number of rows to write when inserting interest data before checking for interruption.
// `insert_interest_data` splits its input into chunks of this size and calls
// `err_if_interrupted()` once at the start of every chunk.
const WRITE_CHUNK_SIZE: usize = 100;
17
18pub fn ensure_interest_data_populated<C: RelevancyRemoteSettingsClient>(
19    db: &RelevancyDb,
20    client: C,
21) -> Result<()> {
22    if !db.read(|dao| dao.need_to_load_url_interests())? {
23        return Ok(());
24    }
25
26    match fetch_interest_data_inner(client) {
27        Ok(data) => {
28            db.read_write(move |dao| insert_interest_data(data, dao))?;
29        }
30        Err(e) => {
31            crate::warn!("error fetching interest data: {e}");
32            return Err(Error::FetchInterestDataError);
33        }
34    }
35    Ok(())
36}
37
38/// Fetch the interest data
39fn fetch_interest_data_inner<C: RelevancyRemoteSettingsClient>(
40    client: C,
41) -> Result<Vec<(Interest, UrlHash)>> {
42    let remote_settings_response = client.get_records()?;
43    let mut result = vec![];
44
45    for record in remote_settings_response {
46        let attachment_data = client.get_attachment(&record)?;
47        let interest = get_interest(&record)?;
48        let urls = get_hash_urls(attachment_data)?;
49        result.extend(std::iter::repeat(interest).zip(urls));
50    }
51    Ok(result)
52}
53
54fn get_hash_urls(attachment_data: Vec<u8>) -> Result<Vec<UrlHash>> {
55    let mut hash_urls = vec![];
56
57    let parsed_attachment_data: Vec<RelevancyAttachmentData> = from_json_slice(&attachment_data)?;
58
59    for attachment_data in parsed_attachment_data {
60        let hash_url = STANDARD
61            .decode(attachment_data.domain)
62            .map_err(|_| Error::Base64DecodeError("Invalid base64 error".to_string()))?;
63        let url_hash = hash_url.try_into().map_err(|_| {
64            Error::Base64DecodeError("Base64 string has wrong number of bytes".to_string())
65        })?;
66        hash_urls.push(url_hash);
67    }
68    Ok(hash_urls)
69}
70
71/// Extract Interest from the record info
72fn get_interest(record: &RemoteSettingsRecord) -> Result<Interest> {
73    let record_fields: RelevancyRecord =
74        from_json(serde_json::Value::Object(record.fields.clone()))?;
75    let custom_details = record_fields.record_custom_details;
76    let category_code = custom_details.category_to_domains.category_code;
77    Interest::try_from(category_code as u32)
78}
79
80/// Insert Interests into Db
81fn insert_interest_data(data: Vec<(Interest, UrlHash)>, dao: &mut RelevancyDao) -> Result<()> {
82    for chunk in data.chunks(WRITE_CHUNK_SIZE) {
83        dao.err_if_interrupted()?;
84        for (interest, hash_url) in chunk {
85            dao.add_url_interest(*hash_url, *interest)?;
86        }
87    }
88
89    Ok(())
90}
91
92#[cfg(test)]
93mod test {
94
95    use std::{cell::RefCell, collections::HashMap};
96
97    use anyhow::Context;
98    use serde_json::json;
99
100    use super::*;
101    use crate::{rs::RelevancyRemoteSettingsClient, url_hash::hash_url, InterestVector};
102
    /// A snapshot containing fake Remote Settings records and attachments for
    /// the store to ingest. We use snapshots to test the store's behavior in a
    /// data-driven way.
    struct Snapshot {
        /// Records handed out by the fake client's `get_records()`.
        records: Vec<RemoteSettingsRecord>,
        /// Serialized attachment bodies, keyed by the attachment `location`
        /// that the fake client looks up in `get_attachment()`.
        attachments: HashMap<&'static str, Vec<u8>>,
    }
110
111    impl Snapshot {
112        /// Creates a snapshot from a JSON value that represents a collection of
113        /// Relevancy Remote Settings records.
114        ///
115        /// You can use the [`serde_json::json!`] macro to construct the JSON
116        /// value, then pass it to this function. It's easier to use the
117        /// `Snapshot::with_records(json!(...))` idiom than to construct the
118        /// records by hand.
119        fn with_records(value: serde_json::Value) -> anyhow::Result<Self> {
120            Ok(Self {
121                records: serde_json::from_value(value)
122                    .context("Couldn't create snapshot with Remote Settings records")?,
123                attachments: HashMap::new(),
124            })
125        }
126
127        /// Adds a data attachment to the snapshot.
128        fn with_data(
129            mut self,
130            location: &'static str,
131            value: serde_json::Value,
132        ) -> anyhow::Result<Self> {
133            self.attachments.insert(
134                location,
135                serde_json::to_vec(&value).context("Couldn't add data attachment to snapshot")?,
136            );
137            Ok(self)
138        }
139    }
140
    /// A fake Remote Settings client that returns records and attachments from
    /// a snapshot.
    struct SnapshotSettingsClient {
        /// The current snapshot. You can modify it using
        /// [`RefCell::borrow_mut()`] to simulate remote updates in tests.
        /// A `RefCell` is used because the client trait methods implemented
        /// for this type only receive `&self`.
        snapshot: RefCell<Snapshot>,
    }
148
149    impl SnapshotSettingsClient {
150        /// Creates a client with an initial snapshot.
151        fn with_snapshot(snapshot: Snapshot) -> Self {
152            Self {
153                snapshot: RefCell::new(snapshot),
154            }
155        }
156    }
157
158    impl RelevancyRemoteSettingsClient for SnapshotSettingsClient {
159        fn get_records(&self) -> Result<Vec<RemoteSettingsRecord>> {
160            Ok(self.snapshot.borrow().records.clone())
161        }
162
163        fn get_attachment(&self, record: &RemoteSettingsRecord) -> Result<Vec<u8>> {
164            let location = record.attachment.as_ref().unwrap().location.as_str();
165            Ok(self
166                .snapshot
167                .borrow()
168                .attachments
169                .get(location)
170                .unwrap_or_else(|| unreachable!("Unexpected request for attachment `{}`", location))
171                .clone())
172        }
173
174        fn close(&self) {}
175    }
176
177    #[test]
178    fn test_interest_vectors() {
179        let db = RelevancyDb::new_for_test();
180        db.read_write(|dao| {
181            // Test that the interest data matches the values we started from in
182            // `bin/generate-test-data.rs`
183
184            dao.add_url_interest(hash_url("https://espn.com").unwrap(), Interest::Sports)?;
185            dao.add_url_interest(hash_url("https://dogs.com").unwrap(), Interest::Animals)?;
186            dao.add_url_interest(hash_url("https://cars.com").unwrap(), Interest::Autos)?;
187            dao.add_url_interest(
188                hash_url("https://www.vouge.com").unwrap(),
189                Interest::Fashion,
190            )?;
191            dao.add_url_interest(hash_url("https://slashdot.org").unwrap(), Interest::Tech)?;
192            dao.add_url_interest(hash_url("https://www.nascar.com").unwrap(), Interest::Autos)?;
193            dao.add_url_interest(
194                hash_url("https://www.nascar.com").unwrap(),
195                Interest::Sports,
196            )?;
197            dao.add_url_interest(
198                hash_url("https://unknown.url").unwrap(),
199                Interest::Inconclusive,
200            )?;
201
202            assert_eq!(
203                dao.get_url_interest_vector("https://espn.com/").unwrap(),
204                InterestVector {
205                    sports: 1,
206                    ..InterestVector::default()
207                }
208            );
209            assert_eq!(
210                dao.get_url_interest_vector("https://dogs.com/").unwrap(),
211                InterestVector {
212                    animals: 1,
213                    ..InterestVector::default()
214                }
215            );
216            assert_eq!(
217                dao.get_url_interest_vector("https://cars.com/").unwrap(),
218                InterestVector {
219                    autos: 1,
220                    ..InterestVector::default()
221                }
222            );
223            assert_eq!(
224                dao.get_url_interest_vector("https://www.vouge.com/")
225                    .unwrap(),
226                InterestVector {
227                    fashion: 1,
228                    ..InterestVector::default()
229                }
230            );
231            assert_eq!(
232                dao.get_url_interest_vector("https://slashdot.org/")
233                    .unwrap(),
234                InterestVector {
235                    tech: 1,
236                    ..InterestVector::default()
237                }
238            );
239            assert_eq!(
240                dao.get_url_interest_vector("https://www.nascar.com/")
241                    .unwrap(),
242                InterestVector {
243                    autos: 1,
244                    sports: 1,
245                    ..InterestVector::default()
246                }
247            );
248            assert_eq!(
249                dao.get_url_interest_vector("https://unknown.url/").unwrap(),
250                InterestVector {
251                    inconclusive: 1,
252                    ..InterestVector::default()
253                }
254            );
255            Ok(())
256        })
257        .unwrap();
258    }
259
260    #[test]
261    fn test_variations_on_the_url() {
262        let db = RelevancyDb::new_for_test();
263        db.read_write(|dao| {
264            dao.add_url_interest(hash_url("https://espn.com").unwrap(), Interest::Sports)?;
265            dao.add_url_interest(hash_url("https://nascar.com").unwrap(), Interest::Autos)?;
266            dao.add_url_interest(hash_url("https://nascar.com").unwrap(), Interest::Sports)?;
267
268            // Different paths/queries should work
269            assert_eq!(
270                dao.get_url_interest_vector("https://espn.com/foo/bar/?baz")
271                    .unwrap(),
272                InterestVector {
273                    sports: 1,
274                    ..InterestVector::default()
275                }
276            );
277            // Different schemes should too
278            assert_eq!(
279                dao.get_url_interest_vector("http://espn.com/").unwrap(),
280                InterestVector {
281                    sports: 1,
282                    ..InterestVector::default()
283                }
284            );
285            // But changes to the domain shouldn't
286            assert_eq!(
287                dao.get_url_interest_vector("http://espn2.com/").unwrap(),
288                InterestVector::default()
289            );
290            // However, extra components past the 2nd one in the domain are ignored
291            assert_eq!(
292                dao.get_url_interest_vector("https://www.nascar.com/")
293                    .unwrap(),
294                InterestVector {
295                    autos: 1,
296                    sports: 1,
297                    ..InterestVector::default()
298                }
299            );
300            Ok(())
301        })
302        .unwrap();
303    }
304
305    #[test]
306    fn test_parse_records() -> anyhow::Result<()> {
307        let snapshot = Snapshot::with_records(json!([{
308            "id": "animals-0001",
309            "last_modified": 15,
310            "type": "category_to_domains",
311            "attachment": {
312                "filename": "data-1.json",
313                "mimetype": "application/json",
314                "location": "data-1.json",
315                "hash": "",
316                "size": 0
317            },
318            "record_custom_details": {
319              "category_to_domains": {
320                "category": "animals",
321                "category_code": 1,
322                "version": 1
323              }
324            }
325        }]))?
326        .with_data(
327            "data-1.json",
328            json!([
329            {"domain": "J2jtyjQtYQ/+/p//xhz43Q=="},
330            {"domain": "Zd4awCwGZLkat59nIWje3g=="}]),
331        )?;
332        let rs_client = SnapshotSettingsClient::with_snapshot(snapshot);
333        assert_eq!(
334            fetch_interest_data_inner(rs_client).unwrap(),
335            vec![
336                (Interest::Animals, hash_url("https://dogs.com").unwrap()),
337                (Interest::Animals, hash_url("https://cats.com").unwrap())
338            ]
339        );
340
341        Ok(())
342    }
343
344    #[test]
345    fn test_parse_records_with_bad_domain_strings() -> anyhow::Result<()> {
346        let snapshot = Snapshot::with_records(json!([{
347            "id": "animals-0001",
348            "last_modified": 15,
349            "type": "category_to_domains",
350            "attachment": {
351                "filename": "data-1.json",
352                "mimetype": "application/json",
353                "location": "data-1.json",
354                "hash": "",
355                "size": 0
356            },
357            "record_custom_details": {
358              "category_to_domains": {
359                "category": "animals",
360                "category_code": 1,
361                "version": 1
362              }
363            }
364        }]))?
365        .with_data(
366            "data-1.json",
367            json!([
368                {"domain": "badString"},
369                {"domain": "notBase64"}]),
370        )?;
371        let rs_client = SnapshotSettingsClient::with_snapshot(snapshot);
372        fetch_interest_data_inner(rs_client).expect_err("Invalid base64 error");
373
374        Ok(())
375    }
376}