1use crate::db::RelevancyDao;
6use crate::rs::{
7 from_json, from_json_slice, RelevancyAttachmentData, RelevancyRecord,
8 RelevancyRemoteSettingsClient,
9};
10use crate::url_hash::UrlHash;
11use crate::{Error, Interest, RelevancyDb, Result};
12use base64::{engine::general_purpose::STANDARD, Engine};
13use remote_settings::RemoteSettingsRecord;
14
/// Number of `(Interest, UrlHash)` rows written between two interruption checks in
/// `insert_interest_data`.
const WRITE_CHUNK_SIZE: usize = 100;
17
18pub fn ensure_interest_data_populated<C: RelevancyRemoteSettingsClient>(
19 db: &RelevancyDb,
20 client: C,
21) -> Result<()> {
22 if !db.read(|dao| dao.need_to_load_url_interests())? {
23 return Ok(());
24 }
25
26 match fetch_interest_data_inner(client) {
27 Ok(data) => {
28 db.read_write(move |dao| insert_interest_data(data, dao))?;
29 }
30 Err(e) => {
31 crate::warn!("error fetching interest data: {e}");
32 return Err(Error::FetchInterestDataError);
33 }
34 }
35 Ok(())
36}
37
38fn fetch_interest_data_inner<C: RelevancyRemoteSettingsClient>(
40 client: C,
41) -> Result<Vec<(Interest, UrlHash)>> {
42 let remote_settings_response = client.get_records()?;
43 let mut result = vec![];
44
45 for record in remote_settings_response {
46 let attachment_data = client.get_attachment(&record)?;
47 let interest = get_interest(&record)?;
48 let urls = get_hash_urls(attachment_data)?;
49 result.extend(std::iter::repeat(interest).zip(urls));
50 }
51 Ok(result)
52}
53
54fn get_hash_urls(attachment_data: Vec<u8>) -> Result<Vec<UrlHash>> {
55 let mut hash_urls = vec![];
56
57 let parsed_attachment_data: Vec<RelevancyAttachmentData> = from_json_slice(&attachment_data)?;
58
59 for attachment_data in parsed_attachment_data {
60 let hash_url = STANDARD
61 .decode(attachment_data.domain)
62 .map_err(|_| Error::Base64DecodeError("Invalid base64 error".to_string()))?;
63 let url_hash = hash_url.try_into().map_err(|_| {
64 Error::Base64DecodeError("Base64 string has wrong number of bytes".to_string())
65 })?;
66 hash_urls.push(url_hash);
67 }
68 Ok(hash_urls)
69}
70
71fn get_interest(record: &RemoteSettingsRecord) -> Result<Interest> {
73 let record_fields: RelevancyRecord =
74 from_json(serde_json::Value::Object(record.fields.clone()))?;
75 let custom_details = record_fields.record_custom_details;
76 let category_code = custom_details.category_to_domains.category_code;
77 Interest::try_from(category_code as u32)
78}
79
80fn insert_interest_data(data: Vec<(Interest, UrlHash)>, dao: &mut RelevancyDao) -> Result<()> {
82 for chunk in data.chunks(WRITE_CHUNK_SIZE) {
83 dao.err_if_interrupted()?;
84 for (interest, hash_url) in chunk {
85 dao.add_url_interest(*hash_url, *interest)?;
86 }
87 }
88
89 Ok(())
90}
91
92#[cfg(test)]
93mod test {
94
95 use std::{cell::RefCell, collections::HashMap};
96
97 use anyhow::Context;
98 use serde_json::json;
99
100 use super::*;
101 use crate::{rs::RelevancyRemoteSettingsClient, url_hash::hash_url, InterestVector};
102
103 struct Snapshot {
107 records: Vec<RemoteSettingsRecord>,
108 attachments: HashMap<&'static str, Vec<u8>>,
109 }
110
111 impl Snapshot {
112 fn with_records(value: serde_json::Value) -> anyhow::Result<Self> {
120 Ok(Self {
121 records: serde_json::from_value(value)
122 .context("Couldn't create snapshot with Remote Settings records")?,
123 attachments: HashMap::new(),
124 })
125 }
126
127 fn with_data(
129 mut self,
130 location: &'static str,
131 value: serde_json::Value,
132 ) -> anyhow::Result<Self> {
133 self.attachments.insert(
134 location,
135 serde_json::to_vec(&value).context("Couldn't add data attachment to snapshot")?,
136 );
137 Ok(self)
138 }
139 }
140
141 struct SnapshotSettingsClient {
144 snapshot: RefCell<Snapshot>,
147 }
148
149 impl SnapshotSettingsClient {
150 fn with_snapshot(snapshot: Snapshot) -> Self {
152 Self {
153 snapshot: RefCell::new(snapshot),
154 }
155 }
156 }
157
158 impl RelevancyRemoteSettingsClient for SnapshotSettingsClient {
159 fn get_records(&self) -> Result<Vec<RemoteSettingsRecord>> {
160 Ok(self.snapshot.borrow().records.clone())
161 }
162
163 fn get_attachment(&self, record: &RemoteSettingsRecord) -> Result<Vec<u8>> {
164 let location = record.attachment.as_ref().unwrap().location.as_str();
165 Ok(self
166 .snapshot
167 .borrow()
168 .attachments
169 .get(location)
170 .unwrap_or_else(|| unreachable!("Unexpected request for attachment `{}`", location))
171 .clone())
172 }
173
174 fn close(&self) {}
175 }
176
177 #[test]
178 fn test_interest_vectors() {
179 let db = RelevancyDb::new_for_test();
180 db.read_write(|dao| {
181 dao.add_url_interest(hash_url("https://espn.com").unwrap(), Interest::Sports)?;
185 dao.add_url_interest(hash_url("https://dogs.com").unwrap(), Interest::Animals)?;
186 dao.add_url_interest(hash_url("https://cars.com").unwrap(), Interest::Autos)?;
187 dao.add_url_interest(
188 hash_url("https://www.vouge.com").unwrap(),
189 Interest::Fashion,
190 )?;
191 dao.add_url_interest(hash_url("https://slashdot.org").unwrap(), Interest::Tech)?;
192 dao.add_url_interest(hash_url("https://www.nascar.com").unwrap(), Interest::Autos)?;
193 dao.add_url_interest(
194 hash_url("https://www.nascar.com").unwrap(),
195 Interest::Sports,
196 )?;
197 dao.add_url_interest(
198 hash_url("https://unknown.url").unwrap(),
199 Interest::Inconclusive,
200 )?;
201
202 assert_eq!(
203 dao.get_url_interest_vector("https://espn.com/").unwrap(),
204 InterestVector {
205 sports: 1,
206 ..InterestVector::default()
207 }
208 );
209 assert_eq!(
210 dao.get_url_interest_vector("https://dogs.com/").unwrap(),
211 InterestVector {
212 animals: 1,
213 ..InterestVector::default()
214 }
215 );
216 assert_eq!(
217 dao.get_url_interest_vector("https://cars.com/").unwrap(),
218 InterestVector {
219 autos: 1,
220 ..InterestVector::default()
221 }
222 );
223 assert_eq!(
224 dao.get_url_interest_vector("https://www.vouge.com/")
225 .unwrap(),
226 InterestVector {
227 fashion: 1,
228 ..InterestVector::default()
229 }
230 );
231 assert_eq!(
232 dao.get_url_interest_vector("https://slashdot.org/")
233 .unwrap(),
234 InterestVector {
235 tech: 1,
236 ..InterestVector::default()
237 }
238 );
239 assert_eq!(
240 dao.get_url_interest_vector("https://www.nascar.com/")
241 .unwrap(),
242 InterestVector {
243 autos: 1,
244 sports: 1,
245 ..InterestVector::default()
246 }
247 );
248 assert_eq!(
249 dao.get_url_interest_vector("https://unknown.url/").unwrap(),
250 InterestVector {
251 inconclusive: 1,
252 ..InterestVector::default()
253 }
254 );
255 Ok(())
256 })
257 .unwrap();
258 }
259
260 #[test]
261 fn test_variations_on_the_url() {
262 let db = RelevancyDb::new_for_test();
263 db.read_write(|dao| {
264 dao.add_url_interest(hash_url("https://espn.com").unwrap(), Interest::Sports)?;
265 dao.add_url_interest(hash_url("https://nascar.com").unwrap(), Interest::Autos)?;
266 dao.add_url_interest(hash_url("https://nascar.com").unwrap(), Interest::Sports)?;
267
268 assert_eq!(
270 dao.get_url_interest_vector("https://espn.com/foo/bar/?baz")
271 .unwrap(),
272 InterestVector {
273 sports: 1,
274 ..InterestVector::default()
275 }
276 );
277 assert_eq!(
279 dao.get_url_interest_vector("http://espn.com/").unwrap(),
280 InterestVector {
281 sports: 1,
282 ..InterestVector::default()
283 }
284 );
285 assert_eq!(
287 dao.get_url_interest_vector("http://espn2.com/").unwrap(),
288 InterestVector::default()
289 );
290 assert_eq!(
292 dao.get_url_interest_vector("https://www.nascar.com/")
293 .unwrap(),
294 InterestVector {
295 autos: 1,
296 sports: 1,
297 ..InterestVector::default()
298 }
299 );
300 Ok(())
301 })
302 .unwrap();
303 }
304
305 #[test]
306 fn test_parse_records() -> anyhow::Result<()> {
307 let snapshot = Snapshot::with_records(json!([{
308 "id": "animals-0001",
309 "last_modified": 15,
310 "type": "category_to_domains",
311 "attachment": {
312 "filename": "data-1.json",
313 "mimetype": "application/json",
314 "location": "data-1.json",
315 "hash": "",
316 "size": 0
317 },
318 "record_custom_details": {
319 "category_to_domains": {
320 "category": "animals",
321 "category_code": 1,
322 "version": 1
323 }
324 }
325 }]))?
326 .with_data(
327 "data-1.json",
328 json!([
329 {"domain": "J2jtyjQtYQ/+/p//xhz43Q=="},
330 {"domain": "Zd4awCwGZLkat59nIWje3g=="}]),
331 )?;
332 let rs_client = SnapshotSettingsClient::with_snapshot(snapshot);
333 assert_eq!(
334 fetch_interest_data_inner(rs_client).unwrap(),
335 vec![
336 (Interest::Animals, hash_url("https://dogs.com").unwrap()),
337 (Interest::Animals, hash_url("https://cats.com").unwrap())
338 ]
339 );
340
341 Ok(())
342 }
343
344 #[test]
345 fn test_parse_records_with_bad_domain_strings() -> anyhow::Result<()> {
346 let snapshot = Snapshot::with_records(json!([{
347 "id": "animals-0001",
348 "last_modified": 15,
349 "type": "category_to_domains",
350 "attachment": {
351 "filename": "data-1.json",
352 "mimetype": "application/json",
353 "location": "data-1.json",
354 "hash": "",
355 "size": 0
356 },
357 "record_custom_details": {
358 "category_to_domains": {
359 "category": "animals",
360 "category_code": 1,
361 "version": 1
362 }
363 }
364 }]))?
365 .with_data(
366 "data-1.json",
367 json!([
368 {"domain": "badString"},
369 {"domain": "notBase64"}]),
370 )?;
371 let rs_client = SnapshotSettingsClient::with_snapshot(snapshot);
372 fetch_interest_data_inner(rs_client).expect_err("Invalid base64 error");
373
374 Ok(())
375 }
376}