# Source code for mozetl.taar.taar_utils

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import contextlib
import hashlib
import json
import logging
import os.path
import shutil
import tempfile

import boto3
from botocore.exceptions import ClientError

# Configure the root logger at INFO level when this module is imported, and
# create the module-level logger used by the functions below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# S3 bucket that load_amo_external_whitelist() downloads from.
AMO_DUMP_BUCKET = "telemetry-parquet"
# Object key of the add-ons database JSON.
# NOTE(review): presumably the full AMO dump -- confirm against the job that
# produces it.
AMO_DUMP_KEY = "telemetry-ml/addon_recommender/addons_database.json"

# Object key of the whitelist JSON fetched by load_amo_external_whitelist().
AMO_WHITELIST_KEY = "telemetry-ml/addon_recommender/whitelist_addons_database.json"
# Object key of the curated GUID list; this is the same S3 object that
# load_amo_curated_whitelist() reads.
AMO_CURATED_WHITELIST_KEY = "telemetry-ml/addon_recommender/only_guids_top_200.json"


@contextlib.contextmanager
def selfdestructing_path(dirname):
    """Context manager that yields ``dirname`` and deletes it on exit.

    The directory tree is removed with ``shutil.rmtree`` when the ``with``
    block is left, whether the block completes normally or raises.

    :param dirname: Path of an existing directory to delete on exit.
    :return: A context manager whose ``as`` target is ``dirname`` unchanged.
    """
    try:
        yield dirname
    finally:
        # Run the cleanup even when the body raised; without the ``finally``
        # an exception inside the ``with`` block would leak the directory.
        shutil.rmtree(dirname)
def read_from_s3(s3_dest_file_name, s3_prefix, bucket):
    """Fetch an object from S3 and return its contents parsed as JSON.

    The object key is ``s3_prefix`` immediately followed by
    ``s3_dest_file_name`` (no separator is inserted). The body is decoded as
    UTF-8 before being parsed.

    :param s3_dest_file_name: The name of the file on S3.
    :param s3_prefix: The S3 prefix in the bucket.
    :param bucket: The S3 bucket.
    :return: The decoded JSON value.
    """
    object_key = "{}{}".format(s3_prefix, s3_dest_file_name)
    s3 = boto3.resource("s3", region_name="us-west-2")
    response = s3.Object(bucket, object_key).get()
    raw_body = response["Body"].read()
    return json.loads(raw_body.decode("utf-8"))
def write_to_s3(source_file_name, s3_dest_file_name, s3_prefix, bucket):
    """Upload a local file to S3.

    The destination key is ``s3_prefix`` immediately followed by
    ``s3_dest_file_name`` (no separator is inserted).

    :param source_file_name: The name of the local source file.
    :param s3_dest_file_name: The name of the destination file on S3.
    :param s3_prefix: The S3 prefix in the bucket.
    :param bucket: The S3 bucket.
    """
    # Import the submodule explicitly: a bare ``import boto3`` does not by
    # itself guarantee that ``boto3.s3.transfer`` is reachable as an attribute.
    from boto3.s3.transfer import S3Transfer

    client = boto3.client("s3", "us-west-2")
    transfer = S3Transfer(client)

    key_path = s3_prefix + s3_dest_file_name
    transfer.upload_file(source_file_name, bucket, key_path)
def store_json_to_s3(json_data, base_filename, date, prefix, bucket):
    """Write JSON text to a temporary file and upload it to S3 twice.

    The same local file is uploaded under two names: first the dated backup
    "<base_filename><YYYYMMDD>.json", then the current copy
    "<base_filename>.json". The temporary directory is handed to
    ``selfdestructing_path`` for removal.

    :param json_data: A string with the JSON content to write.
    :param base_filename: A string with the base name of the file to use
        for saving locally and uploading to S3.
    :param date: A date string in the "YYYYMMDD" format.
    :param prefix: The S3 prefix.
    :param bucket: The S3 bucket name.
    """
    scratch_dir = tempfile.mkdtemp()

    with selfdestructing_path(scratch_dir):
        current_name = "{}.json".format(base_filename)
        local_path = os.path.join(scratch_dir, current_name)

        with open(local_path, "w+") as out_file:
            out_file.write(json_data)

        dated_name = "{}{}.json".format(base_filename, date)

        # The datestamped copy goes up first, then the current copy.
        for remote_name in (dated_name, current_name):
            write_to_s3(local_path, remote_name, prefix, bucket)
def load_amo_external_whitelist():
    """Download and parse the AMO add-on whitelist.

    Fetches ``AMO_WHITELIST_KEY`` from ``AMO_DUMP_BUCKET`` and keeps the GUID
    of every add-on for which at least one file of its ``current_version`` is
    flagged ``is_webextension``.

    A ``ClientError`` during the download is logged and not re-raised; the
    whitelist is then empty and the ``RuntimeError`` below is raised instead.

    :return: A list of add-on GUIDs.
    :raises RuntimeError: the AMO whitelist file cannot be downloaded or
        contains no valid add-ons.
    """
    amo_dump = {}
    try:
        # Load the most current AMO whitelist JSON resource.
        s3 = boto3.client("s3")
        s3_contents = s3.get_object(Bucket=AMO_DUMP_BUCKET, Key=AMO_WHITELIST_KEY)
        amo_dump = json.loads(s3_contents["Body"].read().decode("utf-8"))
    except ClientError:
        # Report the key that was actually requested, so that the log entry
        # points at the object that failed to download.
        logger.exception(
            "Failed to download from S3",
            extra={"bucket": AMO_DUMP_BUCKET, "key": AMO_WHITELIST_KEY},
        )

    final_whitelist = []
    for addon in amo_dump.values():
        # Treat a missing or null "current_version"/"files" entry as "no
        # files" rather than failing on it.
        addon_files = (addon.get("current_version") or {}).get("files") or []
        # If any of the addon files are web_extensions compatible, it can be
        # recommended.
        if any(f.get("is_webextension", False) for f in addon_files):
            final_whitelist.append(addon["guid"])

    if not final_whitelist:
        raise RuntimeError("Empty AMO whitelist detected")

    return final_whitelist
def load_amo_curated_whitelist():
    """Return the curated whitelist of addon GUIDs.

    Reads the JSON object stored at ``AMO_CURATED_WHITELIST_KEY`` in
    ``AMO_DUMP_BUCKET`` through ``read_from_s3``.

    :return: The decoded JSON content converted with ``list()``.
    """
    # AMO_CURATED_WHITELIST_KEY already holds the complete object key, so an
    # empty prefix is passed; read_from_s3 simply concatenates prefix + name.
    whitelist = read_from_s3(AMO_CURATED_WHITELIST_KEY, "", AMO_DUMP_BUCKET)
    return list(whitelist)
def hash_telemetry_id(telemetry_id):
    """Return the SHA-256 hex digest of a telemetry ID.

    This hashing function is a reference implementation based on:
    https://phabricator.services.mozilla.com/D8311

    :param telemetry_id: The telemetry ID as a text string; it is encoded as
        UTF-8 before hashing.
    :return: The digest as a hexadecimal string.
    """
    hasher = hashlib.sha256()
    hasher.update(telemetry_id.encode("utf8"))
    return hasher.hexdigest()