generator.namespaces

Generate namespaces.yaml.

  1"""Generate namespaces.yaml."""
  2
  3import fnmatch
  4import json
  5import re
  6import urllib.request
  7import warnings
  8from collections.abc import Mapping
  9from copy import deepcopy
 10from datetime import datetime
 11from itertools import groupby
 12from operator import itemgetter
 13from pathlib import Path
 14from typing import Any, Dict, List, Union
 15
 16import click
 17import yaml
 18from google.cloud import bigquery
 19
 20from generator import operational_monitoring_utils
 21
 22from .explores import EXPLORE_TYPES
 23from .metrics_utils import LOOKER_METRIC_HUB_REPO, METRIC_HUB_REPO, MetricsConfigLoader
 24from .views import VIEW_TYPES, View, lookml_utils
 25
# Default value of the `--generated-sql-uri` option: a tar archive of the
# bigquery-etl generated-sql branch.
DEFAULT_GENERATED_SQL_URI = (
    "https://github.com/mozilla/bigquery-etl/archive/generated-sql.tar.gz"
)

# App listing URIs starting with this prefix get a cache-busting query parameter.
PROBE_INFO_BASE_URI = "https://probeinfo.telemetry.mozilla.org"
# Spoke assigned to generated namespaces that do not declare a "spoke" themselves.
DEFAULT_SPOKE = "looker-spoke-default"
# Dataset and project used to build the fully qualified names of the
# operational monitoring `*_statistics` and `*_alerts` tables.
OPMON_DATASET = "operational_monitoring"
PROD_PROJECT = "moz-fx-data-shared-prod"
 34
 35
 36def _normalize_slug(name):
 37    return re.sub(r"[^a-zA-Z0-9_]", "_", name)
 38
 39
 40def _merge_namespaces(dct, merge_dct):
 41    """Recursively merge namespaces."""
 42    for k, _ in merge_dct.items():
 43        if k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], Mapping):
 44            if "glean_app" in merge_dct[k] and merge_dct[k]["glean_app"] is False:
 45                # if glean_app gets set to False, Glean views and explores should not be generated
 46                dct[k] = merge_dct[k]
 47            else:
 48                _merge_namespaces(dct[k], merge_dct[k])
 49        else:
 50            if k == "owners" and "owners" in dct:
 51                # combine owners
 52                dct[k] += merge_dct[k]
 53            else:
 54                dct[k] = merge_dct[k]
 55
 56
 57def _get_opmon(bq_client: bigquery.Client, namespaces: Dict[str, Any]):
 58    om_content: Dict[str, Any] = {"views": {}, "explores": {}, "dashboards": {}}
 59    # get operational monitoring namespace information
 60
 61    opmon_namespace = namespaces["operational_monitoring"]
 62    views = opmon_namespace.get("views")
 63
 64    if views is None:
 65        print("No views defined for operational monitoring")
 66        return {}
 67
 68    projects_view = views.get("projects")
 69
 70    if projects_view is None:
 71        print("No projects view defined for operational monitoring")
 72        return {}
 73
 74    projects_table = projects_view["tables"][0]["table"]
 75    projects = operational_monitoring_utils.get_active_projects(
 76        bq_client, project_table=projects_table
 77    )
 78
 79    # Iterating over all defined operational monitoring projects
 80    for project in projects:
 81        table_prefix = _normalize_slug(project["slug"])
 82        project_name = lookml_utils.slug_to_title(
 83            re.sub("[^0-9a-zA-Z_]+", "_", "_".join(project["name"].lower().split(" ")))
 84        )
 85        branches = project.get("branches", ["enabled", "disabled"])
 86
 87        # append view and explore for data type
 88        table = f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_statistics"
 89        dimensions = operational_monitoring_utils.get_dimension_defaults(
 90            bq_client, table, project["dimensions"]
 91        )
 92        om_content["views"][table_prefix] = {
 93            "type": "operational_monitoring_view",
 94            "tables": [
 95                {
 96                    "table": table,
 97                    "xaxis": project["xaxis"],
 98                    "dimensions": dimensions,
 99                }
100            ],
101        }
102        om_content["explores"][table_prefix] = {
103            "type": "operational_monitoring_explore",
104            "views": {"base_view": f"{table_prefix}"},
105            "branches": branches,
106            "xaxis": project["xaxis"],
107            "dimensions": dimensions,
108            "summaries": project["summaries"],
109        }
110
111        if "alerting" in project and project["alerting"]:
112            # create an alerting view if available
113            om_content["views"][f"{table_prefix}_alerts"] = {
114                "type": "operational_monitoring_alerting_view",
115                "tables": [
116                    {
117                        "table": f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_alerts",
118                    }
119                ],
120            }
121            om_content["explores"][f"{table_prefix}_alerts"] = {
122                "type": "operational_monitoring_alerting_explore",
123                "views": {"base_view": f"{table_prefix}_alerts"},
124            }
125
126        om_content["dashboards"][table_prefix] = {
127            "type": "operational_monitoring_dashboard",
128            "title": project_name,
129            "tables": [
130                {
131                    "explore": f"{table_prefix}",
132                    "table": f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_statistics",
133                    "branches": branches,
134                    "xaxis": project["xaxis"],
135                    "compact_visualization": project.get(
136                        "compact_visualization", False
137                    ),
138                    "dimensions": dimensions,
139                    "group_by_dimension": project.get("group_by_dimension", None),
140                    "summaries": project["summaries"],
141                }
142            ],
143        }
144
145        if "alerting" in project and project["alerting"]:
146            om_content["dashboards"][table_prefix]["tables"].append(
147                {
148                    "explore": f"{table_prefix}_alerts",
149                    "table": f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_alerts",
150                }
151            )
152
153    return om_content
154
155
def _get_metric_hub_namespaces(existing_namespaces):
    """Build namespace definitions for the data sources defined in metric-hub.

    For every namespace returned by `_get_metric_hub_data_sources`, each data
    source (in sorted order) is represented by a `metric_definitions_<slug>`
    view and an explore of the same name based on that view.

    `existing_namespaces` is not read; it is kept so existing callers that pass
    it keep working.

    Returns a dict mapping namespace name to a dict with the keys
    `pretty_name`, `views` and `explores`.
    """
    data_sources_by_namespace = _get_metric_hub_data_sources()

    metric_hub_namespaces = {}
    # The loop variable has its own name: the original rebound the name of the
    # dict being iterated, which only worked because the iterator had already
    # been created.
    for namespace, data_sources in data_sources_by_namespace.items():
        # each data source definition is represented by a view and an explore
        artifact_names = [
            f"metric_definitions_{data_source}" for data_source in sorted(data_sources)
        ]

        metric_hub_namespaces[namespace] = {
            "pretty_name": lookml_utils.slug_to_title(namespace),
            "views": {
                name: {"type": "metric_definitions_view"} for name in artifact_names
            },
            "explores": {
                name: {
                    "type": "metric_definitions_explore",
                    "views": {"base_view": name},
                }
                for name in artifact_names
            },
        }

    return metric_hub_namespaces
181
182
183def _get_glean_apps(
184    app_listings_uri: str,
185) -> List[Dict[str, Union[str, List[Dict[str, str]]]]]:
186    # define key function and reuse it for sorted and groupby
187    if app_listings_uri.startswith(PROBE_INFO_BASE_URI):
188        # For probe-info-service requests, add query param to bypass cloudfront cache
189        app_listings_uri += f"?t={datetime.utcnow().isoformat()}"
190
191    get_app_name = itemgetter("app_name")
192    with urllib.request.urlopen(app_listings_uri) as f:
193        # groupby requires input be sorted by key to produce one result per key
194        app_listings = sorted(json.loads(f.read()), key=get_app_name)
195
196    apps = []
197    for app_name, group in groupby(app_listings, get_app_name):
198        variants = list(group)
199
200        # use canonical_app_name where channel=="release" or the first one
201        release_variant = next(
202            (
203                channel
204                for channel in variants
205                if channel.get("app_channel") == "release"
206            ),
207            variants[0],
208        )
209
210        canonical_app_name = release_variant["canonical_app_name"]
211        v1_name = release_variant["v1_name"]
212        emails = release_variant["notification_emails"]
213
214        # we use the `source_dataset` concept to figure out what reference
215        # we should be looking for inside bigquery-etl
216        # For release we are currently using an app-level dataset which
217        # references the app id specific one (so we look for that view as
218        # a reference).
219        # For other channels, we refer to the stable tables
220        channels = [
221            {
222                "channel": channel.get("app_channel"),
223                "dataset": (
224                    channel.get("app_name").replace("-", "_")
225                    if channel.get("app_channel") == "release"
226                    else channel.get("bq_dataset_family")
227                ),
228                "source_dataset": (
229                    channel.get("bq_dataset_family")
230                    if channel.get("app_channel") == "release"
231                    else channel.get("bq_dataset_family") + "_stable"
232                ),
233            }
234            for channel in variants
235        ]
236
237        apps.append(
238            {
239                "name": app_name,
240                "pretty_name": canonical_app_name,
241                "channels": channels,
242                "owners": emails,
243                "glean_app": True,
244                "v1_name": v1_name,
245            }
246        )
247
248    return apps
249
250
def _get_looker_views(
    app: Dict[str, Union[str, List[Dict[str, str]]]],
    db_views: Dict[str, Dict[str, List[List[str]]]],
) -> List[View]:
    """Collect the Looker views every registered view type yields for `app`.

    Each class in `VIEW_TYPES` is asked, via `from_db_views`, for the views it
    can build from the app's name, `glean_app` flag and channels together with
    `db_views`. Views are returned in the order they are produced.

    Raises:
        KeyError: if two views with the same name are produced for the app.
    """
    views = []
    # A set keeps the duplicate check O(1) per view instead of scanning a list.
    seen_names = set()

    for klass in VIEW_TYPES.values():
        for view in klass.from_db_views(  # type: ignore
            app["name"], app["glean_app"], app["channels"], db_views
        ):
            if view.name in seen_names:
                raise KeyError(
                    (
                        f"Duplicate Looker View name {view.name} "
                        f"when generating views for namespace {app['name']}"
                    )
                )
            seen_names.add(view.name)
            views.append(view)

    return views
272
273
def _get_explores(views: List[View]) -> dict:
    """Merge the dict form of every explore the registered explore types build from `views`.

    Explores are merged with `dict.update` in the order they are produced, so a
    later explore overrides an earlier one that uses the same key.
    """
    merged_explores = {}
    for explore_class in EXPLORE_TYPES.values():
        for explore in explore_class.from_views(views):  # type: ignore
            merged_explores.update(explore.to_dict())

    return merged_explores
281
282
def _get_metric_hub_data_sources() -> Dict[str, List[str]]:
    """Get data source definitions from metric-hub repository for each namespace.

    Returns a dict that maps each definition's `platform` to the slugs of its
    data sources, keeping only data sources for which
    `MetricsConfigLoader.metrics_of_data_source` returns at least one metric.
    """
    slugs_by_platform: Dict[str, List[str]] = {}

    for definition in MetricsConfigLoader.configs.definitions:
        for slug in definition.spec.data_sources.definitions.keys():
            platform = definition.platform
            metrics_using_source = MetricsConfigLoader.metrics_of_data_source(
                slug, platform
            )
            # ignore data sources that are not used for any metric definition
            if len(metrics_using_source) == 0:
                continue

            slugs_by_platform.setdefault(platform, []).append(slug)

    return slugs_by_platform
304
305
@click.command(help=__doc__)
@click.option(
    "--custom-namespaces",
    default="custom-namespaces.yaml",
    type=click.File(),
    help="Path to a custom namespaces file",
)
@click.option(
    "--generated-sql-uri",
    default=DEFAULT_GENERATED_SQL_URI,
    help="URI of a tar archive of the bigquery-etl generated-sql branch, which is "
    "used to list views and determine whether they reference stable tables",
)
@click.option(
    "--app-listings-uri",
    default="https://probeinfo.telemetry.mozilla.org/v2/glean/app-listings",
    help="URI for probeinfo service v2 glean app listings",
)
@click.option(
    "--disallowlist",
    type=click.File(),
    default="namespaces-disallowlist.yaml",
    help="Path to namespace disallow list",
)
@click.option(
    "--metric-hub-repos",
    "--metric_hub_repos",
    multiple=True,
    default=[METRIC_HUB_REPO, LOOKER_METRIC_HUB_REPO],
    help="Repos to load metric configs from.",
)
@click.option(
    "--ignore",
    multiple=True,
    default=[],
    help="Namespaces to ignore during generation.",
)
@click.option(
    "--use_cloud_function",
    "--use-cloud-function",
    help="Use the Cloud Function to run dry runs during LookML generation.",
    type=bool,
)
def namespaces(
    custom_namespaces,
    generated_sql_uri,
    app_listings_uri,
    disallowlist,
    metric_hub_repos,
    ignore,
    use_cloud_function,
):
    """Generate namespaces.yaml.

    Namespaces are assembled in three layers that are merged in this order:
    one namespace per Glean app, the custom namespaces file (including the
    generated operational monitoring content, if that namespace is present)
    and the metric-hub namespaces. The result is filtered through the
    disallowlist, given default `spoke` and `glean_app` values and written to
    `namespaces.yaml` in the current working directory.
    """
    warnings.filterwarnings("ignore", module="google.auth._default")
    glean_apps = _get_glean_apps(app_listings_uri)
    db_views = lookml_utils.get_bigquery_view_reference_map(generated_sql_uri)

    # layer 1: namespaces derived from the Glean app listings
    generated = {}
    for app in glean_apps:
        if app["name"] in ignore:
            continue

        looker_views = _get_looker_views(app, db_views)
        explores = _get_explores(looker_views)
        views_as_dict = {view.name: view.as_dict() for view in looker_views}

        generated[app["name"]] = {
            "owners": app["owners"],
            "pretty_name": app["pretty_name"],
            "views": views_as_dict,
            "explores": explores,
            "glean_app": True,
        }

    # layer 2: custom namespaces
    if custom_namespaces is not None:
        custom = yaml.safe_load(custom_namespaces.read()) or {}
        # remove namespaces that should be ignored
        for ignored_namespace in ignore:
            custom.pop(ignored_namespace, None)

        # generating operational monitoring namespace, if available
        if "operational_monitoring" in custom:
            if use_cloud_function:
                raise Exception("Cannot generate OpMon using dry run Cloud Function")

            opmon = _get_opmon(bq_client=bigquery.Client(), namespaces=custom)
            custom["operational_monitoring"].update(opmon)

        _merge_namespaces(generated, custom)

    # layer 3: metric-hub namespaces
    if metric_hub_repos:
        MetricsConfigLoader.update_repos(metric_hub_repos)

    _merge_namespaces(generated, _get_metric_hub_namespaces(generated))

    updated_namespaces = _filter_disallowed(generated, disallowlist)
    for name, config in updated_namespaces.items():
        if name in ignore:
            continue
        # fill in defaults for namespaces that do not declare them
        config.setdefault("spoke", DEFAULT_SPOKE)
        config.setdefault("glean_app", False)

    Path("namespaces.yaml").write_text(yaml.safe_dump(updated_namespaces))
410
411
def _filter_disallowed(namespaces, disallowlist):
    """Filter models, explores and views from the generated namespaces config, based on the disallowlist.

    `disallowlist` is a readable file object containing a YAML list. Each item
    is either a string (a namespace pattern whose matches are removed entirely)
    or a mapping from a namespace pattern to sub-filters of the form
    `{artifact_type: [artifact name patterns]}`. A mapping with empty
    sub-filters also removes the matching namespaces entirely. Patterns are
    matched with `fnmatch`.

    Returns a filtered deep copy; `namespaces` itself is not modified.
    """

    def match_any(name, patterns):
        return any(fnmatch.fnmatch(name, p) for p in patterns)

    # transform namespace disallowlist to a dict
    disallowed_namespaces = yaml.safe_load(disallowlist.read()) or []
    disallowed_namespaces_dict = {}
    for item in disallowed_namespaces:
        if isinstance(item, str):
            disallowed_namespaces_dict[item] = {}
        else:
            disallowed_namespaces_dict.update(item)

    filtered_namespaces = deepcopy(namespaces)

    for pattern, sub_filters in disallowed_namespaces_dict.items():
        # iterate over a snapshot of the keys because entries are deleted below
        for namespace_name in list(filtered_namespaces):
            if not fnmatch.fnmatch(namespace_name, pattern):
                continue

            # if no sub_filters, remove entire section
            if not sub_filters:
                del filtered_namespaces[namespace_name]
                continue

            entry = filtered_namespaces[namespace_name]

            # remove matching artifact types (views, explores)
            for artifact_type, disallowed_artifact_names in sub_filters.items():
                if artifact_type not in entry:
                    continue

                artifacts = entry[artifact_type]
                # Distinct loop variable: the original reused `key` here and
                # clobbered the namespace loop variable of the enclosing loop.
                for artifact_name in list(artifacts):
                    if match_any(artifact_name, disallowed_artifact_names):
                        del artifacts[artifact_name]

    return filtered_namespaces
DEFAULT_GENERATED_SQL_URI = 'https://github.com/mozilla/bigquery-etl/archive/generated-sql.tar.gz'
PROBE_INFO_BASE_URI = 'https://probeinfo.telemetry.mozilla.org'
DEFAULT_SPOKE = 'looker-spoke-default'
OPMON_DATASET = 'operational_monitoring'
PROD_PROJECT = 'moz-fx-data-shared-prod'
namespaces = <Command namespaces>

Generate namespaces.yaml.