Source code for mozetl.utils

import csv
import datetime as DT
import logging
import os
import shutil
import tempfile

from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import boto3
import six
from six import text_type

# Maximum number of days allowed between client activity and ping submission.
ACTIVITY_SUBMISSION_LAG = DT.timedelta(10)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def format_as_submission_date(date):
    return DT.date.strftime(date, "%Y%m%d")

def parse_as_submission_date(date_string):
    return DT.datetime.strptime(date_string, "%Y%m%d").date()

def format_spark_path(bucket, prefix):
    return "s3://{}/{}".format(bucket, prefix)

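# A minimal usage sketch for the three helpers above; the bucket name and
# prefix are illustrative placeholders, not real resources:
#
#   format_as_submission_date(DT.date(2017, 1, 20))  # -> "20170120"
#   parse_as_submission_date("20170120")             # -> datetime.date(2017, 1, 20)
#   format_spark_path("telemetry-test-bucket", "main_summary/v4")
#   # -> "s3://telemetry-test-bucket/main_summary/v4"
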
def generate_filter_parameters(end_date, days_back):
    d = {}
    min_activity_date = end_date - DT.timedelta(days_back)
    d["min_activity_iso"] = min_activity_date.isoformat()
    d["max_activity_iso"] = (end_date + DT.timedelta(1)).isoformat()

    d["min_submission_string"] = format_as_submission_date(min_activity_date)
    max_submission_date = end_date + ACTIVITY_SUBMISSION_LAG
    d["max_submission_string"] = format_as_submission_date(max_submission_date)
    return d

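# Worked example (dates chosen for illustration). With the 10-day
# ACTIVITY_SUBMISSION_LAG, going five days back from 2017-01-20 gives:
#
#   generate_filter_parameters(DT.date(2017, 1, 20), 5)
#   # {'min_activity_iso': '2017-01-15',
#   #  'max_activity_iso': '2017-01-21',
#   #  'min_submission_string': '20170115',
#   #  'max_submission_string': '20170130'}
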
def write_csv(dataframe, path, header=True):
    """Write a dataframe to local disk.

    Disclaimer: Do not write csv files larger than driver memory. This
    is ~15GB for ec2 c3.xlarge (due to caching overhead).
    """
    # NOTE: Before spark 2.1, toLocalIterator will timeout on some dataframes
    # because rdd materialization can take a long time. Instead of using
    # an iterator over all partitions, collect everything into driver memory.
    logger.info("Writing {} rows to {}".format(dataframe.count(), path))
    with open(path, "wb") if six.PY2 else open(path, "w", newline="") as fout:
        writer = csv.writer(fout)
        if header:
            writer.writerow(dataframe.columns)
        for row in dataframe.collect():
            row = [text_type(s).encode("utf-8") for s in row] if six.PY2 else row
            writer.writerow(row)

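# Usage sketch, assuming an existing SparkSession named `spark`; the column
# names and output path are hypothetical:
#
#   frame = spark.createDataFrame(
#       [("client-a", 3), ("client-b", 7)], ["client_id", "n_pings"]
#   )
#   write_csv(frame, "/tmp/sample.csv")
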
def write_csv_to_s3(dataframe, bucket, key, header=True):
    path = tempfile.mkdtemp()
    if not os.path.exists(path):
        os.makedirs(path)

    filepath = os.path.join(path, "temp.csv")
    write_csv(dataframe, filepath, header)

    # create the s3 client for this transaction
    s3 = boto3.client("s3", region_name="us-west-2")

    # write the contents of the file to the right location
    upload_file_to_s3(s3, filepath, bucket, key)

    logger.info("Successfully wrote {} to {}".format(key, bucket))

    # clean up the temporary directory
    shutil.rmtree(path)

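# Usage sketch; bucket and key are placeholders, and `frame` is assumed to be
# a Spark DataFrame that fits in driver memory (see write_csv above):
#
#   write_csv_to_s3(frame, "telemetry-test-bucket", "exports/sample.csv")
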
def upload_file_to_s3(client, filepath, bucket, key, ACL="bucket-owner-full-control"):
    with open(filepath, "rb") as data:
        client.put_object(Bucket=bucket, Key=key, Body=data, ACL=ACL)

def delete_from_s3(bucket_name, keys_to_delete):
    bucket = boto3.resource("s3").Bucket(bucket_name)
    objects = [{"Key": key} for key in keys_to_delete]
    response = bucket.delete_objects(Delete={"Objects": objects})
    code = response["ResponseMetadata"]["HTTPStatusCode"]
    if code != 200:
        msg = "AWS returned {} when attempting to delete {}"
        raise RuntimeError(msg.format(code, keys_to_delete))

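# Usage sketch (placeholder bucket and keys); raises RuntimeError on a
# non-200 response:
#
#   delete_from_s3("telemetry-test-bucket",
#                  ["exports/old-1.csv", "exports/old-2.csv"])
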
def send_ses(fromaddr, subject, body, recipient, filename=""):
    """Send an email via the Amazon SES service.

    Example:
    ```
    send_ses('me@example.com', 'greetings', 'Hi!', 'you@example.com')
    ```

    Raises a RuntimeError if the message did not send correctly.
    """
    msg = MIMEMultipart()
    msg["Subject"] = subject
    msg["From"] = fromaddr
    msg["To"] = recipient
    msg.attach(MIMEText(body))

    if filename:
        with open(filename, "rb") as f:
            attachment = f.read()
        part = MIMEApplication(attachment)
        part.add_header("Content-Disposition", "attachment", filename=filename)
        msg.attach(part)

    ses = boto3.client("ses", region_name="us-west-2")
    result = ses.send_raw_email(RawMessage={"Data": msg.as_string()})
    if "ErrorResponse" in result:
        raise RuntimeError("Error sending email: {}".format(result))

def extract_submission_window_for_activity_day(frame, date, lag_days):
    """
    Extract rows with an activity_date of `date` minus `lag_days` and a
    submission_date between `activity_date` and `date` (inclusive).

    :frame DataFrame homologous with main_summary
    :date 'Y-m-d' of the end of the target period
    :lag_days number of days after `date` in the target period

    Note that the start_date will be `lag_days` days before `date`. In other
    words, if you pass in 2017-01-20 and set `lag_days` to 5, the aggregation
    will be processed for day 2017-01-15 (the resulting data will cover
    submission dates including the activity day itself plus 5 days of lag,
    for a total of 6 days).
    """
    from .clientsdaily.fields import ACTIVITY_DATE_COLUMN

    end_date = DT.datetime.strptime(date, "%Y-%m-%d").date()
    # Rewind by `lag_days` to the desired activity date.
    start_date = end_date - DT.timedelta(lag_days)
    frame = frame.select("*", ACTIVITY_DATE_COLUMN)
    activity_iso = start_date.strftime("%Y-%m-%d")
    submission_start_str = start_date.strftime("%Y%m%d")
    submission_end_str = end_date.strftime("%Y%m%d")
    result = (
        frame.where("submission_date_s3 >= '{}'".format(submission_start_str))
        .where("submission_date_s3 <= '{}'".format(submission_end_str))
        .where("activity_date = '{}'".format(activity_iso))
    )
    return result, start_date

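# Usage sketch, assuming `main_summary` is a DataFrame with the main_summary
# schema (the variable name is a placeholder):
#
#   window, activity_day = extract_submission_window_for_activity_day(
#       main_summary, "2017-01-20", 5
#   )
#   # `activity_day` is datetime.date(2017, 1, 15); `window` holds rows whose
#   # activity_date is 2017-01-15 and whose submission_date_s3 falls between
#   # 20170115 and 20170120 inclusive.
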
def stop_session_safely(spark_session):
    """Safely stop a Spark session.

    This is a no-op when running on Databricks: the session there is managed
    by the platform, and stopping it fails the job.
    """
    if not spark_session.conf.get("spark.home", "").startswith("/databricks"):
        spark_session.stop()
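
# Usage sketch at the end of an ETL job; the app name is a placeholder:
#
#   from pyspark.sql import SparkSession
#   spark = SparkSession.builder.appName("mozetl-job").getOrCreate()
#   ...  # run the ETL steps
#   stop_session_safely(spark)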