# Source code for mozanalysis.segments

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import warnings

import attr

# Importing this module is itself the deprecated act, so the notice is raised
# at import time. DeprecationWarning is hidden by Python's default filters in
# most contexts; installing the "default" action first makes the notice
# visible. NOTE: simplefilter() changes the process-wide warning filters.
warnings.simplefilter("default")
warnings.warn(
    message="""
    segments and segment data source objects created in mozanalysis are deprecated
    please create directly from metric-hub with ConfigLoader like

    from mozanalysis.config import ConfigLoader
    segment=ConfigLoader.get_segment(segment_slug="regular_users_v3",
                                        app_name="firefox_desktop")

    and data sources like
    segment_data_source=ConfigLoader.get_segment_data_source("clients_last_seen",
                                                                "firefox_desktop")

    """,
    category=DeprecationWarning,
    # stacklevel=1 attributes the warning to this module's own frame.
    stacklevel=1,
)


@attr.s(frozen=True, slots=True)
class SegmentDataSource:
    """Describe a table or view from which segments may be defined.

    ``window_start`` and ``window_end`` bound the window of data used to
    decide whether each client fits a segment. Both are integers giving a
    number of days before or after enrollment. Ideally the window ends
    at/before the moment of enrollment, so that a user's branch cannot bias
    the segment assignment.

    Args:
        name (str): Name for the Data Source. Should be unique to avoid
            confusion.
        from_expr (str): FROM expression - often just a fully-qualified
            table name, sometimes a subquery. May contain the string
            ``{dataset}``, which is replaced with an app-specific dataset
            for Glean apps. If the expression is templated on dataset,
            ``default_dataset`` is mandatory.
        window_start (int, optional): See above. Must be <= ``window_end``.
        window_end (int, optional): See above.
        client_id_column (str, optional): Name of the column that contains
            the ``client_id`` (join key). Defaults to 'client_id'.
        submission_date_column (str, optional): Name of the column that
            contains the submission date (as a date, not timestamp).
            Defaults to 'submission_date'.
        default_dataset (str, optional): The value to use for ``{dataset}``
            in ``from_expr`` if a value is not provided at runtime.
            Mandatory if ``from_expr`` contains a ``{dataset}`` parameter.
        app_name (str, optional): app_name used with metric-hub, used for
            validation.

    Raises:
        ValueError: At construction, if ``window_start > window_end`` or if
            ``from_expr`` cannot be expanded without an explicit dataset.
    """

    # Field order defines the positional order of the generated __init__.
    name = attr.ib(validator=attr.validators.instance_of(str))
    _from_expr = attr.ib(validator=attr.validators.instance_of(str))
    window_start = attr.ib(type=int, default=0)
    window_end = attr.ib(type=int, default=0)
    client_id_column = attr.ib(type=str, default="client_id")
    submission_date_column = attr.ib(type=str, default="submission_date")
    default_dataset = attr.ib(type=str | None, default=None)
    app_name = attr.ib(type=str | None, default=None)

    @window_start.validator
    def window_start_lte_window_end(self, attribute, value):
        """Reject a window whose start lies after its end."""
        if value > self.window_end:
            raise ValueError("window_start must be <= window_end")

    @default_dataset.validator
    def _check_default_dataset_provided_if_needed(self, attribute, value):
        """Fail early if ``from_expr`` cannot be expanded with defaults only."""
        # Expanding with no explicit dataset raises ValueError when the
        # template needs one and no default_dataset is available.
        self.from_expr_for(None)

    def from_expr_for(self, dataset: str | None) -> str:
        """Expand the ``from_expr`` template for the given dataset.

        If ``from_expr`` is not a template, it is returned unchanged.

        Args:
            dataset (str or None): Dataset name to substitute into the from
                expression. A falsy value falls back to ``default_dataset``.

        Returns:
            str: The expanded FROM expression.

        Raises:
            ValueError: If neither ``dataset`` nor ``default_dataset`` is
                available and formatting the template without arguments
                fails.
        """
        chosen = dataset or self.default_dataset
        if chosen is not None:
            return self._from_expr.format(dataset=chosen)

        # No dataset at all: this only succeeds for non-templated expressions.
        try:
            return self._from_expr.format()
        except Exception as e:
            raise ValueError(
                f"{self.name}: from_expr contains a dataset template but no value was provided."  # noqa:E501
            ) from e

    def build_query(
        self,
        segment_list,
        time_limits,
        experiment_slug,
        from_expr_dataset=None,
    ):
        """Return a nearly self contained SQL query.

        The query takes the ``client_id``s from ``raw_enrollments`` (which
        the surrounding query is expected to define) and adds one non-NULL
        boolean column per segment: True if the client is in the segment,
        False otherwise.

        Args:
            segment_list: Iterable of objects exposing ``select_expr`` and
                ``name``; each becomes ``<select_expr> AS <name>``.
            time_limits: Object exposing ``first_enrollment_date`` and
                ``last_enrollment_date``.
            experiment_slug: Accepted for interface compatibility; not used
                when building this query.
            from_expr_dataset (str, optional): Dataset passed to
                ``from_expr_for``.

        Returns:
            str: The SQL text.
        """
        sql_template = (
            "SELECT e.client_id, e.branch, {segments} "
            "FROM raw_enrollments e "
            "LEFT JOIN {from_expr} ds "
            "ON ds.{client_id} = e.client_id "
            "AND ds.{submission_date} BETWEEN "
            "DATE_ADD('{first_enrollment}', interval {window_start} day) "
            "AND DATE_ADD('{last_enrollment}', interval {window_end} day) "
            "AND ds.{submission_date} BETWEEN "
            "DATE_ADD(e.enrollment_date, interval {window_start} day) "
            "AND DATE_ADD(e.enrollment_date, interval {window_end} day) "
            "GROUP BY e.client_id, e.branch"
        )
        # Values are computed in the same order as the template's keyword
        # arguments were originally evaluated.
        params = {
            "client_id": self.client_id_column or "client_id",
            "submission_date": self.submission_date_column or "submission_date",
            "from_expr": self.from_expr_for(from_expr_dataset),
            "first_enrollment": time_limits.first_enrollment_date,
            "last_enrollment": time_limits.last_enrollment_date,
            "window_start": self.window_start,
            "window_end": self.window_end,
            "segments": ",\n ".join(
                [f"{seg.select_expr} AS {seg.name}" for seg in segment_list]
            ),
        }
        return sql_template.format(**params)

    def build_query_target(
        self,
        target,
        time_limits,
        from_expr_dataset=None,
    ):
        """Return a nearly-self contained SQL query for historical targets.

        Intended for use with ``mozanalysis.sizing.HistoricalTarget``. The
        query returns all distinct client IDs that satisfy the criteria for
        inclusion in a historical analysis using this datasource. Separate
        sub-queries are constructed for each additional Segment in the
        analysis.

        Args:
            target: Object exposing ``select_expr`` and ``name``; rows are
                kept only where the ``name`` column is true.
            time_limits: Object exposing ``first_enrollment_date`` and
                ``last_enrollment_date``, used as the submission-date range.
            from_expr_dataset (str, optional): Dataset passed to
                ``from_expr_for``.

        Returns:
            str: The SQL text.
        """
        sql_template = (
            " SELECT {client_id} as client_id, "
            "target_first_date, "
            "target_last_date, "
            "{target_name} "
            "FROM (SELECT {client_id}, "
            "MIN({submission_date}) as target_first_date, "
            "MAX({submission_date}) as target_last_date, "
            "{target} "
            "FROM {from_expr} "
            "WHERE {submission_date} BETWEEN '{fddr}' AND '{lddr}' "
            "GROUP BY {client_id}) "
            "WHERE {target_name}"
        )
        params = {
            "client_id": self.client_id_column or "client_id",
            "submission_date": self.submission_date_column or "submission_date",
            "from_expr": self.from_expr_for(from_expr_dataset),
            "fddr": time_limits.first_enrollment_date,
            "lddr": time_limits.last_enrollment_date,
            "target": f"{target.select_expr} AS {target.name}",
            "target_name": target.name,
        }
        return sql_template.format(**params)
@attr.s(frozen=True, slots=True)
class Segment:
    """Represent an experiment Segment.

    Args:
        name (str): The segment's name; will be a column name.
        data_source (SegmentDataSource): Data source that provides the
            columns referenced in ``select_expr``.
        select_expr (str): A SQL select expression that includes an
            aggregation function (we ``GROUP BY client_id``). Returns a
            non-NULL ``BOOL``: ``True`` if the user is in the segment,
            ``False`` otherwise.
        friendly_name (str): A human-readable dashboard title for this
            segment.
        description (str): A paragraph of Markdown-formatted text describing
            the segment in more detail, to be shown on dashboards.
        app_name (str, optional): app_name used with metric-hub, used for
            validation.
    """

    # Only data_source is validated at construction; the ``type`` arguments
    # on the other fields are metadata and are not enforced here.
    name = attr.ib(type=str)
    data_source = attr.ib(
        validator=attr.validators.instance_of(SegmentDataSource),
    )
    select_expr = attr.ib(type=str)
    friendly_name = attr.ib(default=None, type=str | None)
    description = attr.ib(default=None, type=str | None)
    app_name = attr.ib(default=None, type=str | None)