generator.namespaces

Generate namespaces.yaml.

  1"""Generate namespaces.yaml."""
  2
import fnmatch
import json
import re
import urllib.request
import warnings
from collections.abc import Mapping
from datetime import datetime, timezone
from itertools import groupby
from operator import itemgetter
from pathlib import Path
from typing import Any, Dict, List, Union

import click
import yaml
from google.cloud import bigquery

from generator import operational_monitoring_utils

from .explores import EXPLORE_TYPES
from .metrics_utils import LOOKER_METRIC_HUB_REPO, METRIC_HUB_REPO, MetricsConfigLoader
from .views import VIEW_TYPES, View, lookml_utils
 24
# Tarball of the bigquery-etl `generated-sql` branch; used to list views and
# determine whether they reference stable tables.
DEFAULT_GENERATED_SQL_URI = (
    "https://github.com/mozilla/bigquery-etl/archive/generated-sql.tar.gz"
)

# Base URI of the probe-info service; requests under this host get a
# cache-busting query parameter appended (see _get_glean_apps).
PROBE_INFO_BASE_URI = "https://probeinfo.telemetry.mozilla.org"
# Looker spoke project assigned to namespaces that don't specify one.
DEFAULT_SPOKE = "looker-spoke-default"
# BigQuery dataset holding operational monitoring output tables.
OPMON_DATASET = "operational_monitoring"
# BigQuery project containing the production datasets referenced above.
PROD_PROJECT = "moz-fx-data-shared-prod"
 33
 34
 35def _normalize_slug(name):
 36    return re.sub(r"[^a-zA-Z0-9_]", "_", name)
 37
 38
 39def _merge_namespaces(dct, merge_dct):
 40    """Recursively merge namespaces."""
 41    for k, _ in merge_dct.items():
 42        if k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], Mapping):
 43            if "glean_app" in merge_dct[k] and merge_dct[k]["glean_app"] is False:
 44                # if glean_app gets set to False, Glean views and explores should not be generated
 45                dct[k] = merge_dct[k]
 46            else:
 47                _merge_namespaces(dct[k], merge_dct[k])
 48        else:
 49            if k == "owners" and "owners" in dct:
 50                # combine owners
 51                dct[k] += merge_dct[k]
 52            else:
 53                dct[k] = merge_dct[k]
 54
 55
def _get_opmon(
    bq_client: bigquery.Client, namespaces: Dict[str, Any]
) -> Dict[str, Any]:
    """Build views, explores, and dashboards for operational monitoring.

    Reads the ``operational_monitoring`` namespace definition from
    ``namespaces``, lists active projects via the configured ``projects``
    view, and returns a dict with ``views``, ``explores`` and ``dashboards``
    entries per project. Returns ``{}`` if the namespace has no ``views``
    or no ``projects`` view configured.
    """
    om_content: Dict[str, Any] = {"views": {}, "explores": {}, "dashboards": {}}
    # get operational monitoring namespace information

    opmon_namespace = namespaces["operational_monitoring"]
    views = opmon_namespace.get("views")

    if views is None:
        print("No views defined for operational monitoring")
        return {}

    projects_view = views.get("projects")

    if projects_view is None:
        print("No projects view defined for operational monitoring")
        return {}

    # the first table of the "projects" view lists all opmon projects
    projects_table = projects_view["tables"][0]["table"]
    projects = operational_monitoring_utils.get_active_projects(
        bq_client, project_table=projects_table
    )

    # Iterating over all defined operational monitoring projects
    for project in projects:
        # slug normalized to a valid table/identifier prefix
        table_prefix = _normalize_slug(project["slug"])
        # human-readable title derived from the project name
        project_name = lookml_utils.slug_to_title(
            re.sub("[^0-9a-zA-Z_]+", "_", "_".join(project["name"].lower().split(" ")))
        )
        branches = project.get("branches", ["enabled", "disabled"])

        # append view and explore for data type
        table = f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_statistics"
        # dimension defaults are looked up from the statistics table itself
        dimensions = operational_monitoring_utils.get_dimension_defaults(
            bq_client, table, project["dimensions"]
        )
        om_content["views"][table_prefix] = {
            "type": "operational_monitoring_view",
            "tables": [
                {
                    "table": table,
                    "xaxis": project["xaxis"],
                    "dimensions": dimensions,
                }
            ],
        }
        om_content["explores"][table_prefix] = {
            "type": "operational_monitoring_explore",
            "views": {"base_view": f"{table_prefix}"},
            "branches": branches,
            "xaxis": project["xaxis"],
            "dimensions": dimensions,
            "summaries": project["summaries"],
        }

        if "alerting" in project and project["alerting"]:
            # create an alerting view if available
            om_content["views"][f"{table_prefix}_alerts"] = {
                "type": "operational_monitoring_alerting_view",
                "tables": [
                    {
                        "table": f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_alerts",
                    }
                ],
            }
            om_content["explores"][f"{table_prefix}_alerts"] = {
                "type": "operational_monitoring_alerting_explore",
                "views": {"base_view": f"{table_prefix}_alerts"},
            }

        # one dashboard per project, backed by the statistics explore
        om_content["dashboards"][table_prefix] = {
            "type": "operational_monitoring_dashboard",
            "title": project_name,
            "tables": [
                {
                    "explore": f"{table_prefix}",
                    "table": f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_statistics",
                    "branches": branches,
                    "xaxis": project["xaxis"],
                    "compact_visualization": project.get(
                        "compact_visualization", False
                    ),
                    "dimensions": dimensions,
                    "group_by_dimension": project.get("group_by_dimension", None),
                    "summaries": project["summaries"],
                }
            ],
        }

        if "alerting" in project and project["alerting"]:
            # the alerts explore is appended as a second dashboard table
            om_content["dashboards"][table_prefix]["tables"].append(
                {
                    "explore": f"{table_prefix}_alerts",
                    "table": f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_alerts",
                }
            )

    return om_content
153
154
def _get_metric_hub_namespaces(existing_namespaces):
    """Return namespace definitions for metric-hub data sources.

    Each data source that backs at least one metric definition becomes a
    ``metric_definitions_<slug>`` view/explore pair, grouped per platform
    namespace.

    ``existing_namespaces`` is currently unused but kept for interface
    compatibility with callers.
    """
    data_sources_by_namespace = _get_metric_hub_data_sources()

    metric_hub_namespaces = {}
    # NOTE: previously the loop variable shadowed the dict it iterated over;
    # use a distinct name for the per-namespace data source list.
    for namespace, data_sources in data_sources_by_namespace.items():
        # each data source definition is represented by a view and an explore
        explores = {}
        views = {}
        for data_source in sorted(data_sources):
            views[f"metric_definitions_{data_source}"] = {
                "type": "metric_definitions_view"
            }

            explores[f"metric_definitions_{data_source}"] = {
                "type": "metric_definitions_explore",
                "views": {"base_view": f"metric_definitions_{data_source}"},
            }

        metric_hub_namespaces[namespace] = {
            "pretty_name": lookml_utils.slug_to_title(namespace),
            "views": views,
            "explores": explores,
        }

    return metric_hub_namespaces
180
181
def _get_glean_apps(
    app_listings_uri: str,
) -> List[Dict[str, Union[str, List[Dict[str, str]]]]]:
    """Fetch the Glean app listings and build one entry per app.

    Variants of an app (one per channel) are grouped by ``app_name``; the
    release variant (or the first variant when no release channel exists)
    supplies the canonical name, v1 name, and notification emails. Apps
    whose channels are all deprecated are skipped.

    :param app_listings_uri: URI of the probeinfo v2 glean app-listings JSON.
    :return: list of app dicts with ``name``, ``pretty_name``, ``channels``,
        ``owners``, ``glean_app``, and ``v1_name`` keys.
    """
    # define key function and reuse it for sorted and groupby
    if app_listings_uri.startswith(PROBE_INFO_BASE_URI):
        # For probe-info-service requests, add query param to bypass cloudfront cache.
        # Use an aware UTC timestamp; datetime.utcnow() is deprecated (3.12+).
        app_listings_uri += f"?t={datetime.now(timezone.utc).isoformat()}"

    get_app_name = itemgetter("app_name")
    with urllib.request.urlopen(app_listings_uri) as f:
        # groupby requires input be sorted by key to produce one result per key
        app_listings = sorted(json.loads(f.read()), key=get_app_name)

    apps = []
    for app_name, group in groupby(app_listings, get_app_name):
        variants = list(group)

        # use canonical_app_name where channel=="release" or the first one
        release_variant = next(
            (
                channel
                for channel in variants
                if channel.get("app_channel") == "release"
            ),
            variants[0],
        )

        canonical_app_name = release_variant["canonical_app_name"]
        v1_name = release_variant["v1_name"]
        emails = release_variant["notification_emails"]

        # we use the `source_dataset` concept to figure out what reference
        # we should be looking for inside bigquery-etl
        # For release we are currently using an app-level dataset which
        # references the app id specific one (so we look for that view as
        # a reference).
        # For other channels, we refer to the stable tables
        channels = [
            {
                "channel": channel.get("app_channel"),
                "dataset": (
                    channel.get("app_name").replace("-", "_")
                    if channel.get("app_channel") == "release"
                    else channel.get("bq_dataset_family")
                ),
                "source_dataset": (
                    channel.get("bq_dataset_family")
                    if channel.get("app_channel") == "release"
                    else channel.get("bq_dataset_family") + "_stable"
                ),
            }
            for channel in variants
            if not channel.get("deprecated")
        ]

        # If all channels are deprecated, don't include this app
        if channels:
            apps.append(
                {
                    "name": app_name,
                    "pretty_name": canonical_app_name,
                    "channels": channels,
                    "owners": emails,
                    "glean_app": True,
                    "v1_name": v1_name,
                }
            )

    return apps
251
252
def _get_looker_views(
    app: Dict[str, Union[str, List[Dict[str, str]]]],
    db_views: Dict[str, Dict[str, List[List[str]]]],
) -> List[View]:
    """Build all Looker views for ``app``, rejecting duplicate view names."""
    views: List[View] = []
    seen_names: List[str] = []

    for view_class in VIEW_TYPES.values():
        generated = view_class.from_db_views(  # type: ignore
            app["name"], app["glean_app"], app["channels"], db_views
        )
        for view in generated:
            if view.name in seen_names:
                raise KeyError(
                    f"Duplicate Looker View name {view.name} "
                    f"when generating views for namespace {app['name']}"
                )
            seen_names.append(view.name)
            views.append(view)

    return views
274
275
def _get_explores(views: List[View]) -> dict:
    """Generate explore definitions for ``views``.

    Each registered explore type derives its explores from the given views;
    the results are merged into a single dict keyed by explore name.
    """
    explores: dict = {}
    # iterate values directly; the type names (keys) are not needed here
    for klass in EXPLORE_TYPES.values():
        for explore in klass.from_views(views):  # type: ignore
            explores.update(explore.to_dict())

    return explores
283
284
def _get_metric_hub_data_sources() -> Dict[str, List[str]]:
    """Get data source definitions from metric-hub repository for each namespace.

    :return: mapping of platform name to the list of data source slugs that
        back at least one metric definition on that platform.
    """
    data_sources_per_namespace: Dict[str, List[str]] = {}
    for definition in MetricsConfigLoader.configs.definitions:
        for data_source_slug in definition.spec.data_sources.definitions:
            metrics = MetricsConfigLoader.metrics_of_data_source(
                data_source_slug, definition.platform
            )
            # ignore data sources that are not used for any metric definition
            if len(metrics) > 0:
                data_sources_per_namespace.setdefault(definition.platform, []).append(
                    data_source_slug
                )

    return data_sources_per_namespace
306
307
@click.command(help=__doc__)
@click.option(
    "--custom-namespaces",
    default="custom-namespaces.yaml",
    type=click.File(),
    help="Path to a custom namespaces file",
)
@click.option(
    "--generated-sql-uri",
    default=DEFAULT_GENERATED_SQL_URI,
    help="URI of a tar archive of the bigquery-etl generated-sql branch, which is "
    "used to list views and determine whether they reference stable tables",
)
@click.option(
    "--app-listings-uri",
    default="https://probeinfo.telemetry.mozilla.org/v2/glean/app-listings",
    help="URI for probeinfo service v2 glean app listings",
)
@click.option(
    "--disallowlist",
    type=click.File(),
    default="namespaces-disallowlist.yaml",
    help="Path to namespace disallow list",
)
@click.option(
    "--metric-hub-repos",
    "--metric_hub_repos",
    multiple=True,
    default=[METRIC_HUB_REPO, LOOKER_METRIC_HUB_REPO],
    help="Repos to load metric configs from.",
)
def namespaces(
    custom_namespaces,
    generated_sql_uri,
    app_listings_uri,
    disallowlist,
    metric_hub_repos,
):
    """Generate namespaces.yaml."""
    # silence credential-discovery warnings emitted by google.auth
    warnings.filterwarnings("ignore", module="google.auth._default")
    glean_apps = _get_glean_apps(app_listings_uri)
    # map of BigQuery views to referenced tables, from the generated-sql archive
    db_views = lookml_utils.get_bigquery_view_reference_map(generated_sql_uri)

    # one namespace per Glean app, with generated views and explores
    namespaces = {}
    for app in glean_apps:
        looker_views = _get_looker_views(app, db_views)
        explores = _get_explores(looker_views)
        views_as_dict = {view.name: view.as_dict() for view in looker_views}

        namespaces[app["name"]] = {
            "owners": app["owners"],
            "pretty_name": app["pretty_name"],
            "views": views_as_dict,
            "explores": explores,
            "glean_app": True,
        }

    if custom_namespaces is not None:
        custom_namespaces = yaml.safe_load(custom_namespaces.read()) or {}

        # generating operational monitoring namespace, if available
        if "operational_monitoring" in custom_namespaces:
            client = bigquery.Client()
            opmon = _get_opmon(bq_client=client, namespaces=custom_namespaces)
            custom_namespaces["operational_monitoring"].update(opmon)

        # custom definitions win over / extend the generated Glean namespaces
        _merge_namespaces(namespaces, custom_namespaces)

    if metric_hub_repos:
        MetricsConfigLoader.update_repos(metric_hub_repos)

    # merge in metric-hub view/explore definitions last
    _merge_namespaces(namespaces, _get_metric_hub_namespaces(namespaces))

    # drop namespaces matching any glob pattern from the disallow list
    disallowed_namespaces = yaml.safe_load(disallowlist.read()) or {}
    disallowed_regex = [
        fnmatch.translate(namespace) for namespace in disallowed_namespaces
    ]
    disallowed_namespaces_pattern = re.compile("|".join(disallowed_regex))

    updated_namespaces = {}
    for namespace, _ in namespaces.items():
        if not disallowed_namespaces_pattern.fullmatch(namespace):
            # fill in spoke and glean_app defaults before writing out
            if "spoke" not in namespaces[namespace]:
                namespaces[namespace]["spoke"] = DEFAULT_SPOKE
            if "glean_app" not in namespaces[namespace]:
                namespaces[namespace]["glean_app"] = False
            updated_namespaces[namespace] = namespaces[namespace]

    Path("namespaces.yaml").write_text(yaml.safe_dump(updated_namespaces))
DEFAULT_GENERATED_SQL_URI = 'https://github.com/mozilla/bigquery-etl/archive/generated-sql.tar.gz'
PROBE_INFO_BASE_URI = 'https://probeinfo.telemetry.mozilla.org'
DEFAULT_SPOKE = 'looker-spoke-default'
OPMON_DATASET = 'operational_monitoring'
PROD_PROJECT = 'moz-fx-data-shared-prod'
namespaces = <Command namespaces>

Generate namespaces.yaml.