generator.namespaces

Generate namespaces.yaml.
View Source
  1"""Generate namespaces.yaml."""
  2
  3import fnmatch
  4import json
  5import re
  6import urllib.request
  7import warnings
  8from collections.abc import Mapping
  9from copy import deepcopy
 10from datetime import datetime
 11from itertools import groupby
 12from operator import itemgetter
 13from pathlib import Path
 14from typing import Any, Dict, List, Union
 15
 16import click
 17import yaml
 18from google.cloud import bigquery
 19
 20from generator import operational_monitoring_utils
 21
 22from .explores import EXPLORE_TYPES
 23from .metrics_utils import LOOKER_METRIC_HUB_REPO, METRIC_HUB_REPO, MetricsConfigLoader
 24from .views import VIEW_TYPES, View, lookml_utils
 25
# Default value of the --generated-sql-uri option: tar archive of the
# bigquery-etl generated-sql branch.
DEFAULT_GENERATED_SQL_URI = (
    "https://github.com/mozilla/bigquery-etl/archive/generated-sql.tar.gz"
)

# App listing URIs starting with this prefix get a cache-busting query param.
PROBE_INFO_BASE_URI = "https://probeinfo.telemetry.mozilla.org"
# Spoke assigned to every generated namespace that does not declare one.
DEFAULT_SPOKE = "looker-spoke-default"
# Dataset and project used to build the fully qualified names of the
# operational monitoring statistics and alerts tables.
OPMON_DATASET = "operational_monitoring"
PROD_PROJECT = "moz-fx-data-shared-prod"
 34
 35
 36def _normalize_slug(name):
 37    return re.sub(r"[^a-zA-Z0-9_]", "_", name)
 38
 39
 40def _merge_namespaces(dct, merge_dct):
 41    """Recursively merge namespaces."""
 42    for k, _ in merge_dct.items():
 43        if k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], Mapping):
 44            if "glean_app" in merge_dct[k] and merge_dct[k]["glean_app"] is False:
 45                # if glean_app gets set to False, Glean views and explores should not be generated
 46                dct[k] = merge_dct[k]
 47            else:
 48                _merge_namespaces(dct[k], merge_dct[k])
 49        else:
 50            if k == "owners" and "owners" in dct:
 51                # combine owners
 52                dct[k] += merge_dct[k]
 53            else:
 54                dct[k] = merge_dct[k]
 55
 56
 57def _get_opmon(bq_client: bigquery.Client, namespaces: Dict[str, Any]):
 58    om_content: Dict[str, Any] = {"views": {}, "explores": {}, "dashboards": {}}
 59    # get operational monitoring namespace information
 60
 61    opmon_namespace = namespaces["operational_monitoring"]
 62    views = opmon_namespace.get("views")
 63
 64    if views is None:
 65        print("No views defined for operational monitoring")
 66        return {}
 67
 68    projects_view = views.get("projects")
 69
 70    if projects_view is None:
 71        print("No projects view defined for operational monitoring")
 72        return {}
 73
 74    projects_table = projects_view["tables"][0]["table"]
 75    projects = operational_monitoring_utils.get_active_projects(
 76        bq_client, project_table=projects_table
 77    )
 78
 79    # Iterating over all defined operational monitoring projects
 80    for project in projects:
 81        table_prefix = _normalize_slug(project["slug"])
 82        project_name = lookml_utils.slug_to_title(
 83            re.sub("[^0-9a-zA-Z_]+", "_", "_".join(project["name"].lower().split(" ")))
 84        )
 85        branches = project.get("branches", ["enabled", "disabled"])
 86
 87        # append view and explore for data type
 88        table = f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_statistics"
 89        dimensions = operational_monitoring_utils.get_dimension_defaults(
 90            bq_client, table, project["dimensions"]
 91        )
 92        om_content["views"][table_prefix] = {
 93            "type": "operational_monitoring_view",
 94            "tables": [
 95                {
 96                    "table": table,
 97                    "xaxis": project["xaxis"],
 98                    "dimensions": dimensions,
 99                }
100            ],
101        }
102        om_content["explores"][table_prefix] = {
103            "type": "operational_monitoring_explore",
104            "views": {"base_view": f"{table_prefix}"},
105            "branches": branches,
106            "xaxis": project["xaxis"],
107            "dimensions": dimensions,
108            "summaries": project["summaries"],
109        }
110
111        if "alerting" in project and project["alerting"]:
112            # create an alerting view if available
113            om_content["views"][f"{table_prefix}_alerts"] = {
114                "type": "operational_monitoring_alerting_view",
115                "tables": [
116                    {
117                        "table": f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_alerts",
118                    }
119                ],
120            }
121            om_content["explores"][f"{table_prefix}_alerts"] = {
122                "type": "operational_monitoring_alerting_explore",
123                "views": {"base_view": f"{table_prefix}_alerts"},
124            }
125
126        om_content["dashboards"][table_prefix] = {
127            "type": "operational_monitoring_dashboard",
128            "title": project_name,
129            "tables": [
130                {
131                    "explore": f"{table_prefix}",
132                    "table": f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_statistics",
133                    "branches": branches,
134                    "xaxis": project["xaxis"],
135                    "compact_visualization": project.get(
136                        "compact_visualization", False
137                    ),
138                    "dimensions": dimensions,
139                    "group_by_dimension": project.get("group_by_dimension", None),
140                    "summaries": project["summaries"],
141                }
142            ],
143        }
144
145        if "alerting" in project and project["alerting"]:
146            om_content["dashboards"][table_prefix]["tables"].append(
147                {
148                    "explore": f"{table_prefix}_alerts",
149                    "table": f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_alerts",
150                }
151            )
152
153    return om_content
154
155
def _get_metric_hub_namespaces(existing_namespaces):
    """Build one namespace per metric-hub platform that has data sources.

    Every data source is represented by a ``metric_definitions_<slug>`` view
    and an explore of the same name that uses that view as its base view.
    ``existing_namespaces`` is accepted but not used.
    """
    metric_hub_namespaces = {}

    for namespace, data_sources in _get_metric_hub_data_sources().items():
        names = [f"metric_definitions_{slug}" for slug in sorted(data_sources)]

        metric_hub_namespaces[namespace] = {
            "pretty_name": lookml_utils.slug_to_title(namespace),
            "views": {name: {"type": "metric_definitions_view"} for name in names},
            "explores": {
                name: {
                    "type": "metric_definitions_explore",
                    "views": {"base_view": name},
                }
                for name in names
            },
        }

    return metric_hub_namespaces
181
182
183def _get_glean_apps(
184    app_listings_uri: str,
185) -> List[Dict[str, Union[str, List[Dict[str, str]]]]]:
186    # define key function and reuse it for sorted and groupby
187    if app_listings_uri.startswith(PROBE_INFO_BASE_URI):
188        # For probe-info-service requests, add query param to bypass cloudfront cache
189        app_listings_uri += f"?t={datetime.utcnow().isoformat()}"
190
191    get_app_name = itemgetter("app_name")
192    with urllib.request.urlopen(app_listings_uri) as f:
193        # groupby requires input be sorted by key to produce one result per key
194        app_listings = sorted(json.loads(f.read()), key=get_app_name)
195
196    apps = []
197    for app_name, group in groupby(app_listings, get_app_name):
198        variants = list(group)
199
200        # use canonical_app_name where channel=="release" or the first one
201        release_variant = next(
202            (
203                channel
204                for channel in variants
205                if channel.get("app_channel") == "release"
206            ),
207            variants[0],
208        )
209
210        canonical_app_name = release_variant["canonical_app_name"]
211        v1_name = release_variant["v1_name"]
212        emails = release_variant["notification_emails"]
213
214        # we use the `source_dataset` concept to figure out what reference
215        # we should be looking for inside bigquery-etl
216        # For release we are currently using an app-level dataset which
217        # references the app id specific one (so we look for that view as
218        # a reference).
219        # For other channels, we refer to the stable tables
220        channels = [
221            {
222                "channel": channel.get("app_channel"),
223                "dataset": (
224                    channel.get("app_name").replace("-", "_")
225                    if channel.get("app_channel") == "release"
226                    else channel.get("bq_dataset_family")
227                ),
228                "source_dataset": (
229                    channel.get("bq_dataset_family")
230                    if channel.get("app_channel") == "release"
231                    else channel.get("bq_dataset_family") + "_stable"
232                ),
233            }
234            for channel in variants
235            if not channel.get("deprecated")
236        ]
237
238        # If all channels are deprecated, don't include this app
239        if channels:
240            apps.append(
241                {
242                    "name": app_name,
243                    "pretty_name": canonical_app_name,
244                    "channels": channels,
245                    "owners": emails,
246                    "glean_app": True,
247                    "v1_name": v1_name,
248                }
249            )
250
251    return apps
252
253
def _get_looker_views(
    app: Dict[str, Union[str, List[Dict[str, str]]]],
    db_views: Dict[str, Dict[str, List[List[str]]]],
) -> List[View]:
    """Collect the views every registered view type produces for ``app``.

    Raises:
        KeyError: if two generated views share the same name.
    """
    collected = []
    seen_names = set()

    for view_type in VIEW_TYPES.values():
        generated = view_type.from_db_views(  # type: ignore
            app["name"], app["glean_app"], app["channels"], db_views
        )
        for view in generated:
            if view.name in seen_names:
                raise KeyError(
                    (
                        f"Duplicate Looker View name {view.name} "
                        f"when generating views for namespace {app['name']}"
                    )
                )
            seen_names.add(view.name)
            collected.append(view)

    return collected
275
276
def _get_explores(views: List[View]) -> dict:
    """Merge the dict form of every explore the registered explore types build from ``views``."""
    merged = {}
    for explore_type in EXPLORE_TYPES.values():
        for explore in explore_type.from_views(views):  # type: ignore
            merged.update(explore.to_dict())

    return merged
284
285
def _get_metric_hub_data_sources() -> Dict[str, List[str]]:
    """Get data source definitions from metric-hub repository for each namespace.

    Returns a mapping of platform to the slugs of its data sources. Data
    sources that no metric definition uses are left out.
    """
    data_sources_per_namespace: Dict[str, List[str]] = {}

    for definition in MetricsConfigLoader.configs.definitions:
        for data_source_slug in definition.spec.data_sources.definitions.keys():
            metrics = MetricsConfigLoader.metrics_of_data_source(
                data_source_slug, definition.platform
            )
            if not len(metrics) > 0:
                # ignore data sources that are not used for any metric definition
                continue

            data_sources_per_namespace.setdefault(definition.platform, []).append(
                data_source_slug
            )

    return data_sources_per_namespace
307
308
@click.command(help=__doc__)
@click.option(
    "--custom-namespaces",
    default="custom-namespaces.yaml",
    type=click.File(),
    help="Path to a custom namespaces file",
)
@click.option(
    "--generated-sql-uri",
    default=DEFAULT_GENERATED_SQL_URI,
    help="URI of a tar archive of the bigquery-etl generated-sql branch, which is "
    "used to list views and determine whether they reference stable tables",
)
@click.option(
    "--app-listings-uri",
    default="https://probeinfo.telemetry.mozilla.org/v2/glean/app-listings",
    help="URI for probeinfo service v2 glean app listings",
)
@click.option(
    "--disallowlist",
    type=click.File(),
    default="namespaces-disallowlist.yaml",
    help="Path to namespace disallow list",
)
@click.option(
    "--metric-hub-repos",
    "--metric_hub_repos",
    multiple=True,
    default=[METRIC_HUB_REPO, LOOKER_METRIC_HUB_REPO],
    help="Repos to load metric configs from.",
)
@click.option(
    "--ignore",
    multiple=True,
    default=[],
    help="Namespaces to ignore during generation.",
)
@click.option(
    "--use_cloud_function",
    "--use-cloud-function",
    help="Use the Cloud Function to run dry runs during LookML generation.",
    type=bool,
)
def namespaces(
    custom_namespaces,
    generated_sql_uri,
    app_listings_uri,
    disallowlist,
    metric_hub_repos,
    ignore,
    use_cloud_function,
):
    """Generate namespaces.yaml.

    Builds a namespace per Glean app, merges the custom namespaces (including
    the generated operational monitoring content) and the metric-hub
    namespaces on top, applies the disallowlist and writes the result to
    ``namespaces.yaml`` relative to the current working directory.
    """
    warnings.filterwarnings("ignore", module="google.auth._default")
    glean_apps = _get_glean_apps(app_listings_uri)
    db_views = lookml_utils.get_bigquery_view_reference_map(generated_sql_uri)

    all_namespaces = {}
    for app in glean_apps:
        app_name = app["name"]
        if app_name in ignore:
            continue

        looker_views = _get_looker_views(app, db_views)
        explores = _get_explores(looker_views)
        all_namespaces[app_name] = {
            "owners": app["owners"],
            "pretty_name": app["pretty_name"],
            "views": {view.name: view.as_dict() for view in looker_views},
            "explores": explores,
            "glean_app": True,
        }

    if custom_namespaces is not None:
        custom_namespaces = yaml.safe_load(custom_namespaces.read()) or {}
        # remove namespaces that should be ignored
        for ignored_namespace in ignore:
            custom_namespaces.pop(ignored_namespace, None)

        # generating operational monitoring namespace, if available
        if "operational_monitoring" in custom_namespaces:
            if use_cloud_function:
                raise Exception("Cannot generate OpMon using dry run Cloud Function")

            opmon = _get_opmon(
                bq_client=bigquery.Client(), namespaces=custom_namespaces
            )
            custom_namespaces["operational_monitoring"].update(opmon)

        _merge_namespaces(all_namespaces, custom_namespaces)

    if metric_hub_repos:
        MetricsConfigLoader.update_repos(metric_hub_repos)

    _merge_namespaces(all_namespaces, _get_metric_hub_namespaces(all_namespaces))

    updated_namespaces = _filter_disallowed(all_namespaces, disallowlist)

    # fill in the settings a namespace may leave unspecified
    fallback_settings = (("spoke", DEFAULT_SPOKE), ("glean_app", False))
    for namespace, config in updated_namespaces.items():
        if namespace in ignore:
            continue
        for setting, fallback in fallback_settings:
            if setting not in config:
                config[setting] = fallback

    Path("namespaces.yaml").write_text(yaml.safe_dump(updated_namespaces))
413
414
def _filter_disallowed(namespaces, disallowlist):
    """Return a copy of ``namespaces`` with everything on the disallowlist removed.

    ``disallowlist`` is a readable file object whose YAML content is a list.
    A plain string entry is a namespace pattern that removes whole namespaces.
    A mapping entry maps a namespace pattern to sub-filters of the form
    ``{artifact_type: [name patterns]}`` and removes only the matching
    artifacts of matching namespaces. All patterns use ``fnmatch`` matching.
    The input ``namespaces`` is not modified.
    """
    # transform namespace disallowlist to a dict
    listed = yaml.safe_load(disallowlist.read()) or []
    normalized = [
        {item: {}} if isinstance(item, str) else item for item in listed
    ]
    filters_by_pattern = {}
    for single_filter in normalized:
        filters_by_pattern.update(single_filter)

    filtered_namespaces = deepcopy(namespaces)

    for namespace_pattern, sub_filters in filters_by_pattern.items():
        matching_names = [
            name
            for name in filtered_namespaces
            if fnmatch.fnmatch(name, namespace_pattern)
        ]

        for namespace_name in matching_names:
            # if no sub_filters, remove entire section
            if not sub_filters:
                del filtered_namespaces[namespace_name]
                continue

            entry = filtered_namespaces.get(namespace_name, {})

            # remove matching artifact types (views, explores)
            for artifact_type, name_patterns in sub_filters.items():
                if artifact_type not in entry:
                    continue

                artifacts = entry[artifact_type]
                for artifact_name in list(artifacts):
                    if any(
                        fnmatch.fnmatch(artifact_name, pattern)
                        for pattern in name_patterns
                    ):
                        del artifacts[artifact_name]

    return filtered_namespaces
DEFAULT_GENERATED_SQL_URI = 'https://github.com/mozilla/bigquery-etl/archive/generated-sql.tar.gz'
PROBE_INFO_BASE_URI = 'https://probeinfo.telemetry.mozilla.org'
DEFAULT_SPOKE = 'looker-spoke-default'
OPMON_DATASET = 'operational_monitoring'
PROD_PROJECT = 'moz-fx-data-shared-prod'
namespaces = <Command namespaces>
Generate namespaces.yaml.