generator.namespaces

Generate namespaces.yaml.

  1"""Generate namespaces.yaml."""
  2
import fnmatch
import json
import re
import urllib.request
import warnings
from collections.abc import Mapping
from copy import deepcopy
from datetime import datetime, timezone
from itertools import groupby
from operator import itemgetter
from pathlib import Path
from typing import Any, Dict, List, Union

import click
import yaml
from google.cloud import bigquery

from generator import operational_monitoring_utils

from .explores import EXPLORE_TYPES
from .metrics_utils import LOOKER_METRIC_HUB_REPO, METRIC_HUB_REPO, MetricsConfigLoader
from .views import VIEW_TYPES, View, lookml_utils
 25
# Tarball of the bigquery-etl `generated-sql` branch; used to list views and
# determine whether they reference stable tables.
DEFAULT_GENERATED_SQL_URI = (
    "https://github.com/mozilla/bigquery-etl/archive/generated-sql.tar.gz"
)

# Base URI of the probe-info service; requests to it get a cache-busting param.
PROBE_INFO_BASE_URI = "https://probeinfo.telemetry.mozilla.org"
# Looker spoke assigned to namespaces that don't specify one.
DEFAULT_SPOKE = "looker-spoke-default"
# BigQuery dataset holding operational monitoring tables.
OPMON_DATASET = "operational_monitoring"
# BigQuery project the referenced tables live in.
PROD_PROJECT = "moz-fx-data-shared-prod"
# Apps whose deprecated channels are excluded from generation.
SKIP_DEPRECATED = ["mozilla-vpn"]
 35
 36
 37def _normalize_slug(name):
 38    return re.sub(r"[^a-zA-Z0-9_]", "_", name)
 39
 40
 41def _merge_namespaces(dct, merge_dct):
 42    """Recursively merge namespaces."""
 43    for k, _ in merge_dct.items():
 44        if k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], Mapping):
 45            if "glean_app" in merge_dct[k] and merge_dct[k]["glean_app"] is False:
 46                # if glean_app gets set to False, Glean views and explores should not be generated
 47                dct[k] = merge_dct[k]
 48            else:
 49                _merge_namespaces(dct[k], merge_dct[k])
 50        else:
 51            if k == "owners" and "owners" in dct:
 52                # combine owners
 53                dct[k] += merge_dct[k]
 54            else:
 55                dct[k] = merge_dct[k]
 56
 57
 58def _get_opmon(bq_client: bigquery.Client, namespaces: Dict[str, Any]):
 59    om_content: Dict[str, Any] = {"views": {}, "explores": {}, "dashboards": {}}
 60    # get operational monitoring namespace information
 61
 62    opmon_namespace = namespaces["operational_monitoring"]
 63    views = opmon_namespace.get("views")
 64
 65    if views is None:
 66        print("No views defined for operational monitoring")
 67        return {}
 68
 69    projects_view = views.get("projects")
 70
 71    if projects_view is None:
 72        print("No projects view defined for operational monitoring")
 73        return {}
 74
 75    projects_table = projects_view["tables"][0]["table"]
 76    projects = operational_monitoring_utils.get_active_projects(
 77        bq_client, project_table=projects_table
 78    )
 79
 80    # Iterating over all defined operational monitoring projects
 81    for project in projects:
 82        table_prefix = _normalize_slug(project["slug"])
 83        project_name = lookml_utils.slug_to_title(
 84            re.sub("[^0-9a-zA-Z_]+", "_", "_".join(project["name"].lower().split(" ")))
 85        )
 86        branches = project.get("branches", ["enabled", "disabled"])
 87
 88        # append view and explore for data type
 89        table = f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_statistics"
 90        dimensions = operational_monitoring_utils.get_dimension_defaults(
 91            bq_client, table, project["dimensions"]
 92        )
 93        om_content["views"][table_prefix] = {
 94            "type": "operational_monitoring_view",
 95            "tables": [
 96                {
 97                    "table": table,
 98                    "xaxis": project["xaxis"],
 99                    "dimensions": dimensions,
100                }
101            ],
102        }
103        om_content["explores"][table_prefix] = {
104            "type": "operational_monitoring_explore",
105            "views": {"base_view": f"{table_prefix}"},
106            "branches": branches,
107            "xaxis": project["xaxis"],
108            "dimensions": dimensions,
109            "summaries": project["summaries"],
110        }
111
112        if "alerting" in project and project["alerting"]:
113            # create an alerting view if available
114            om_content["views"][f"{table_prefix}_alerts"] = {
115                "type": "operational_monitoring_alerting_view",
116                "tables": [
117                    {
118                        "table": f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_alerts",
119                    }
120                ],
121            }
122            om_content["explores"][f"{table_prefix}_alerts"] = {
123                "type": "operational_monitoring_alerting_explore",
124                "views": {"base_view": f"{table_prefix}_alerts"},
125            }
126
127        om_content["dashboards"][table_prefix] = {
128            "type": "operational_monitoring_dashboard",
129            "title": project_name,
130            "tables": [
131                {
132                    "explore": f"{table_prefix}",
133                    "table": f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_statistics",
134                    "branches": branches,
135                    "xaxis": project["xaxis"],
136                    "compact_visualization": project.get(
137                        "compact_visualization", False
138                    ),
139                    "dimensions": dimensions,
140                    "group_by_dimension": project.get("group_by_dimension", None),
141                    "summaries": project["summaries"],
142                }
143            ],
144        }
145
146        if "alerting" in project and project["alerting"]:
147            om_content["dashboards"][table_prefix]["tables"].append(
148                {
149                    "explore": f"{table_prefix}_alerts",
150                    "table": f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_alerts",
151                }
152            )
153
154    return om_content
155
156
def _get_metric_hub_namespaces(existing_namespaces):
    """Build namespaces containing metric-hub views and explores.

    One ``metric_definitions_<data_source>`` view/explore pair is created per
    data source, grouped by platform namespace.

    NOTE(review): ``existing_namespaces`` is currently unused but kept for
    interface compatibility with callers.
    """
    # renamed from the original, where the loop variable shadowed the dict
    # being iterated (both were called `metric_hub_data_sources`)
    data_sources_by_namespace = _get_metric_hub_data_sources()

    metric_hub_namespaces = {}
    for namespace, data_sources in data_sources_by_namespace.items():
        # each data source definition is represented by a view and an explore
        explores = {}
        views = {}
        for data_source in sorted(data_sources):
            artifact_name = f"metric_definitions_{data_source}"
            views[artifact_name] = {"type": "metric_definitions_view"}
            explores[artifact_name] = {
                "type": "metric_definitions_explore",
                "views": {"base_view": artifact_name},
            }

        metric_hub_namespaces[namespace] = {
            "pretty_name": lookml_utils.slug_to_title(namespace),
            "views": views,
            "explores": explores,
        }

    return metric_hub_namespaces
182
183
def _get_glean_apps(
    app_listings_uri: str,
) -> List[Dict[str, Union[str, List[Dict[str, str]]]]]:
    """Fetch Glean app listings and group them into one config per app.

    Each returned app carries its pretty name, owner emails, v1 name, and the
    list of non-deprecated channels with the datasets to read from.
    """
    if app_listings_uri.startswith(PROBE_INFO_BASE_URI):
        # For probe-info-service requests, add query param to bypass cloudfront cache.
        # datetime.utcnow() is deprecated since Python 3.12; dropping the tzinfo from
        # an aware UTC timestamp yields the exact same isoformat() string as before.
        cache_buster = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
        app_listings_uri += f"?t={cache_buster}"

    # define key function and reuse it for sorted and groupby
    get_app_name = itemgetter("app_name")
    with urllib.request.urlopen(app_listings_uri) as f:
        # groupby requires input be sorted by key to produce one result per key
        app_listings = sorted(json.loads(f.read()), key=get_app_name)

    apps = []
    for app_name, group in groupby(app_listings, get_app_name):
        variants = list(group)

        # use canonical_app_name where channel=="release" or the first one
        release_variant = next(
            (
                channel
                for channel in variants
                if channel.get("app_channel") == "release"
            ),
            variants[0],
        )

        canonical_app_name = release_variant["canonical_app_name"]
        v1_name = release_variant["v1_name"]
        emails = release_variant["notification_emails"]

        # we use the `source_dataset` concept to figure out what reference
        # we should be looking for inside bigquery-etl
        # For release we are currently using an app-level dataset which
        # references the app id specific one (so we look for that view as
        # a reference).
        # For other channels, we refer to the stable tables
        channels = [
            {
                "channel": channel.get("app_channel"),
                "dataset": (
                    channel.get("app_name").replace("-", "_")
                    if channel.get("app_channel") == "release"
                    else channel.get("bq_dataset_family")
                ),
                "source_dataset": (
                    channel.get("bq_dataset_family")
                    if channel.get("app_channel") == "release"
                    else channel.get("bq_dataset_family") + "_stable"
                ),
            }
            for channel in variants
            if not channel.get("deprecated")
            or channel.get("app_name")
            not in SKIP_DEPRECATED  # TODO handling for deprecated apps
        ]

        # If all channels are deprecated, don't include this app
        if channels:
            apps.append(
                {
                    "name": app_name,
                    "pretty_name": canonical_app_name,
                    "channels": channels,
                    "owners": emails,
                    "glean_app": True,
                    "v1_name": v1_name,
                }
            )

    return apps
255
256
def _get_looker_views(
    app: Dict[str, Union[str, List[Dict[str, str]]]],
    db_views: Dict[str, Dict[str, List[List[str]]]],
) -> List[View]:
    """Instantiate every registered view type for *app*.

    Raises KeyError if two generated views share a name within the namespace.
    """
    views: List[View] = []
    seen_names: set = set()

    for view_class in VIEW_TYPES.values():
        generated = view_class.from_db_views(  # type: ignore
            app["name"], app["glean_app"], app["channels"], db_views
        )
        for view in generated:
            if view.name in seen_names:
                raise KeyError(
                    (
                        f"Duplicate Looker View name {view.name} "
                        f"when generating views for namespace {app['name']}"
                    )
                )
            seen_names.add(view.name)
            views.append(view)

    return views
278
279
def _get_explores(views: List[View]) -> dict:
    """Create explore definitions for *views* from every registered explore type.

    Returns a mapping of explore name to its serialized definition.
    """
    explores = {}
    # only the explore classes are needed, not their registry keys
    for klass in EXPLORE_TYPES.values():
        for explore in klass.from_views(views):  # type: ignore
            explores.update(explore.to_dict())

    return explores
287
288
def _get_metric_hub_data_sources() -> Dict[str, List[str]]:
    """Get data source definitions from metric-hub repository for each namespace."""
    data_sources_per_namespace: Dict[str, List[str]] = {}
    for definition in MetricsConfigLoader.configs.definitions:
        for data_source_slug in definition.spec.data_sources.definitions:
            # ignore data sources that are not used for any metric definition
            metrics = MetricsConfigLoader.metrics_of_data_source(
                data_source_slug, definition.platform
            )
            if metrics:
                data_sources_per_namespace.setdefault(
                    definition.platform, []
                ).append(data_source_slug)

    return data_sources_per_namespace
310
311
@click.command(help=__doc__)
@click.option(
    "--custom-namespaces",
    default="custom-namespaces.yaml",
    type=click.File(),
    help="Path to a custom namespaces file",
)
@click.option(
    "--generated-sql-uri",
    default=DEFAULT_GENERATED_SQL_URI,
    help="URI of a tar archive of the bigquery-etl generated-sql branch, which is "
    "used to list views and determine whether they reference stable tables",
)
@click.option(
    "--app-listings-uri",
    default="https://probeinfo.telemetry.mozilla.org/v2/glean/app-listings",
    help="URI for probeinfo service v2 glean app listings",
)
@click.option(
    "--disallowlist",
    type=click.File(),
    default="namespaces-disallowlist.yaml",
    help="Path to namespace disallow list",
)
@click.option(
    "--metric-hub-repos",
    "--metric_hub_repos",
    multiple=True,
    default=[METRIC_HUB_REPO, LOOKER_METRIC_HUB_REPO],
    help="Repos to load metric configs from.",
)
@click.option(
    "--ignore",
    multiple=True,
    default=[],
    help="Namespaces to ignore during generation.",
)
@click.option(
    "--use_cloud_function",
    "--use-cloud-function",
    help="Use the Cloud Function to run dry runs during LookML generation.",
    type=bool,
)
def namespaces(
    custom_namespaces,
    generated_sql_uri,
    app_listings_uri,
    disallowlist,
    metric_hub_repos,
    ignore,
    use_cloud_function,
):
    """Generate namespaces.yaml."""
    # silence warnings emitted by google.auth's default-credentials lookup
    warnings.filterwarnings("ignore", module="google.auth._default")
    glean_apps = _get_glean_apps(app_listings_uri)
    db_views = lookml_utils.get_bigquery_view_reference_map(generated_sql_uri)

    # base config: one namespace per Glean app (unless ignored), holding the
    # generated views and explores for all of the app's channels
    namespaces = {}
    for app in glean_apps:
        if app["name"] not in ignore:
            looker_views = _get_looker_views(app, db_views)
            explores = _get_explores(looker_views)
            views_as_dict = {view.name: view.as_dict() for view in looker_views}

            namespaces[app["name"]] = {
                "owners": app["owners"],
                "pretty_name": app["pretty_name"],
                "views": views_as_dict,
                "explores": explores,
                "glean_app": True,
            }

    if custom_namespaces is not None:
        custom_namespaces = yaml.safe_load(custom_namespaces.read()) or {}
        # remove namespaces that should be ignored
        for ignored_namespace in ignore:
            if ignored_namespace in custom_namespaces:
                del custom_namespaces[ignored_namespace]

        # generating operational monitoring namespace, if available
        if "operational_monitoring" in custom_namespaces:
            if use_cloud_function:
                # OpMon generation queries BigQuery directly, so it cannot run
                # through the dry-run Cloud Function
                raise Exception("Cannot generate OpMon using dry run Cloud Function")

            client = bigquery.Client()
            opmon = _get_opmon(bq_client=client, namespaces=custom_namespaces)
            custom_namespaces["operational_monitoring"].update(opmon)

        # custom namespace entries extend/override the generated ones
        _merge_namespaces(namespaces, custom_namespaces)

    if metric_hub_repos:
        MetricsConfigLoader.update_repos(metric_hub_repos)

    # layer metric-hub views/explores on top of everything merged so far
    _merge_namespaces(namespaces, _get_metric_hub_namespaces(namespaces))

    # apply the disallowlist, then fill in per-namespace defaults
    updated_namespaces = _filter_disallowed(namespaces, disallowlist)
    for namespace in updated_namespaces:
        if namespace not in ignore:
            if "spoke" not in updated_namespaces[namespace]:
                updated_namespaces[namespace]["spoke"] = DEFAULT_SPOKE
            if "glean_app" not in updated_namespaces[namespace]:
                updated_namespaces[namespace]["glean_app"] = False

    Path("namespaces.yaml").write_text(yaml.safe_dump(updated_namespaces))
416
417
def _filter_disallowed(namespaces, disallowlist):
    """Filter models, explores and views from the generated namespaces config, based on the disallowlist.

    ``disallowlist`` is a readable file of YAML entries that are either a bare
    namespace glob (drop the whole namespace) or a mapping of namespace glob
    to {artifact_type: [name globs]} (drop matching artifacts only).
    Returns a filtered deep copy; *namespaces* is left untouched.
    """

    def match_any(name, patterns):
        return any(fnmatch.fnmatch(name, p) for p in patterns)

    # transform namespace disallowlist to a dict: namespace glob -> sub-filters
    disallowed_namespaces = yaml.safe_load(disallowlist.read()) or []
    disallowed_namespaces_dict = {}
    for ns in [
        {namespace: {}} if isinstance(namespace, str) else namespace
        for namespace in disallowed_namespaces
    ]:
        disallowed_namespaces_dict.update(ns)

    filtered_namespaces = deepcopy(namespaces)

    for pattern, sub_filters in disallowed_namespaces_dict.items():
        for namespace_name in list(filtered_namespaces):
            if not fnmatch.fnmatch(namespace_name, pattern):
                continue

            # if no sub_filters, remove entire section
            if not sub_filters:
                del filtered_namespaces[namespace_name]
                continue

            entry = filtered_namespaces.get(namespace_name, {})

            # remove matching artifact types (views, explores); the inner loop
            # variable previously shadowed the outer `key`, renamed for clarity
            for artifact_type, disallowed_artifact_names in sub_filters.items():
                if artifact_type in entry:
                    for artifact_name in list(entry[artifact_type]):
                        if match_any(artifact_name, disallowed_artifact_names):
                            del entry[artifact_type][artifact_name]

    return filtered_namespaces
DEFAULT_GENERATED_SQL_URI = 'https://github.com/mozilla/bigquery-etl/archive/generated-sql.tar.gz'
PROBE_INFO_BASE_URI = 'https://probeinfo.telemetry.mozilla.org'
DEFAULT_SPOKE = 'looker-spoke-default'
OPMON_DATASET = 'operational_monitoring'
PROD_PROJECT = 'moz-fx-data-shared-prod'
SKIP_DEPRECATED = ['mozilla-vpn']
namespaces = <Command namespaces>

Generate namespaces.yaml.