generator.namespaces

Generate namespaces.yaml.

  1"""Generate namespaces.yaml."""
  2
import fnmatch
import json
import re
import urllib.request
import warnings
from collections.abc import Mapping
from datetime import datetime, timezone
from itertools import groupby
from operator import itemgetter
from pathlib import Path
from typing import Any, Dict, List, Union

import click
import yaml
from google.cloud import bigquery

from generator import operational_monitoring_utils

from .explores import EXPLORE_TYPES
from .metrics_utils import LOOKER_METRIC_HUB_REPO, METRIC_HUB_REPO, MetricsConfigLoader
from .views import VIEW_TYPES, View, lookml_utils
 24
# Default tarball of the bigquery-etl generated-sql branch; used to build the
# BigQuery view reference map.
DEFAULT_GENERATED_SQL_URI = (
    "https://github.com/mozilla/bigquery-etl/archive/generated-sql.tar.gz"
)

# Base URI of the probe-info service; requests to it get a cache-busting param
# appended (see _get_glean_apps).
PROBE_INFO_BASE_URI = "https://probeinfo.telemetry.mozilla.org"
# Spoke assigned to a namespace when none is configured explicitly.
DEFAULT_SPOKE = "looker-spoke-default"
# BigQuery dataset holding Operational Monitoring result tables.
OPMON_DATASET = "operational_monitoring"
# BigQuery project the OpMon tables live in.
PROD_PROJECT = "moz-fx-data-shared-prod"
 33
 34
 35def _normalize_slug(name):
 36    return re.sub(r"[^a-zA-Z0-9_]", "_", name)
 37
 38
 39def _merge_namespaces(dct, merge_dct):
 40    """Recursively merge namespaces."""
 41    for k, _ in merge_dct.items():
 42        if k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], Mapping):
 43            if "glean_app" in merge_dct[k] and merge_dct[k]["glean_app"] is False:
 44                # if glean_app gets set to False, Glean views and explores should not be generated
 45                dct[k] = merge_dct[k]
 46            else:
 47                _merge_namespaces(dct[k], merge_dct[k])
 48        else:
 49            if k == "owners" and "owners" in dct:
 50                # combine owners
 51                dct[k] += merge_dct[k]
 52            else:
 53                dct[k] = merge_dct[k]
 54
 55
 56def _get_opmon(bq_client: bigquery.Client, namespaces: Dict[str, Any]):
 57    om_content: Dict[str, Any] = {"views": {}, "explores": {}, "dashboards": {}}
 58    # get operational monitoring namespace information
 59
 60    opmon_namespace = namespaces["operational_monitoring"]
 61    views = opmon_namespace.get("views")
 62
 63    if views is None:
 64        print("No views defined for operational monitoring")
 65        return {}
 66
 67    projects_view = views.get("projects")
 68
 69    if projects_view is None:
 70        print("No projects view defined for operational monitoring")
 71        return {}
 72
 73    projects_table = projects_view["tables"][0]["table"]
 74    projects = operational_monitoring_utils.get_active_projects(
 75        bq_client, project_table=projects_table
 76    )
 77
 78    # Iterating over all defined operational monitoring projects
 79    for project in projects:
 80        table_prefix = _normalize_slug(project["slug"])
 81        project_name = lookml_utils.slug_to_title(
 82            re.sub("[^0-9a-zA-Z_]+", "_", "_".join(project["name"].lower().split(" ")))
 83        )
 84        branches = project.get("branches", ["enabled", "disabled"])
 85
 86        # append view and explore for data type
 87        table = f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_statistics"
 88        dimensions = operational_monitoring_utils.get_dimension_defaults(
 89            bq_client, table, project["dimensions"]
 90        )
 91        om_content["views"][table_prefix] = {
 92            "type": "operational_monitoring_view",
 93            "tables": [
 94                {
 95                    "table": table,
 96                    "xaxis": project["xaxis"],
 97                    "dimensions": dimensions,
 98                }
 99            ],
100        }
101        om_content["explores"][table_prefix] = {
102            "type": "operational_monitoring_explore",
103            "views": {"base_view": f"{table_prefix}"},
104            "branches": branches,
105            "xaxis": project["xaxis"],
106            "dimensions": dimensions,
107            "summaries": project["summaries"],
108        }
109
110        if "alerting" in project and project["alerting"]:
111            # create an alerting view if available
112            om_content["views"][f"{table_prefix}_alerts"] = {
113                "type": "operational_monitoring_alerting_view",
114                "tables": [
115                    {
116                        "table": f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_alerts",
117                    }
118                ],
119            }
120            om_content["explores"][f"{table_prefix}_alerts"] = {
121                "type": "operational_monitoring_alerting_explore",
122                "views": {"base_view": f"{table_prefix}_alerts"},
123            }
124
125        om_content["dashboards"][table_prefix] = {
126            "type": "operational_monitoring_dashboard",
127            "title": project_name,
128            "tables": [
129                {
130                    "explore": f"{table_prefix}",
131                    "table": f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_statistics",
132                    "branches": branches,
133                    "xaxis": project["xaxis"],
134                    "compact_visualization": project.get(
135                        "compact_visualization", False
136                    ),
137                    "dimensions": dimensions,
138                    "group_by_dimension": project.get("group_by_dimension", None),
139                    "summaries": project["summaries"],
140                }
141            ],
142        }
143
144        if "alerting" in project and project["alerting"]:
145            om_content["dashboards"][table_prefix]["tables"].append(
146                {
147                    "explore": f"{table_prefix}_alerts",
148                    "table": f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_alerts",
149                }
150            )
151
152    return om_content
153
154
def _get_metric_hub_namespaces(existing_namespaces):
    """Build namespaces exposing metric-hub data sources as views/explores.

    ``existing_namespaces`` is currently unused but kept for interface
    compatibility with callers.
    """
    # NOTE: the original code reused the name ``metric_hub_data_sources`` for
    # both the dict and the per-namespace loop variable; renamed for clarity.
    data_sources_by_namespace = _get_metric_hub_data_sources()

    metric_hub_namespaces = {}
    for namespace, data_sources in data_sources_by_namespace.items():
        # each data source definition is represented by a view and an explore
        explores = {}
        views = {}
        for data_source in sorted(data_sources):
            views[f"metric_definitions_{data_source}"] = {
                "type": "metric_definitions_view"
            }

            explores[f"metric_definitions_{data_source}"] = {
                "type": "metric_definitions_explore",
                "views": {"base_view": f"metric_definitions_{data_source}"},
            }

        metric_hub_namespaces[namespace] = {
            "pretty_name": lookml_utils.slug_to_title(namespace),
            "views": views,
            "explores": explores,
        }

    return metric_hub_namespaces
180
181
def _get_glean_apps(
    app_listings_uri: str,
) -> List[Dict[str, Union[str, List[Dict[str, str]]]]]:
    """Fetch Glean app listings and group variants into per-app definitions.

    Downloads the probe-info-service app listings, groups entries by
    ``app_name``, and returns one record per app containing its pretty name,
    owners, v1 name, and the non-deprecated channels with their dataset
    references. Apps whose channels are all deprecated are skipped.
    """
    # define key function and reuse it for sorted and groupby
    if app_listings_uri.startswith(PROBE_INFO_BASE_URI):
        # For probe-info-service requests, add query param to bypass cloudfront cache
        # (datetime.utcnow() is deprecated; use an aware UTC timestamp instead)
        app_listings_uri += f"?t={datetime.now(timezone.utc).isoformat()}"

    get_app_name = itemgetter("app_name")
    with urllib.request.urlopen(app_listings_uri) as f:
        # groupby requires input be sorted by key to produce one result per key
        app_listings = sorted(json.loads(f.read()), key=get_app_name)

    apps = []
    for app_name, group in groupby(app_listings, get_app_name):
        variants = list(group)

        # use canonical_app_name where channel=="release" or the first one
        release_variant = next(
            (
                channel
                for channel in variants
                if channel.get("app_channel") == "release"
            ),
            variants[0],
        )

        canonical_app_name = release_variant["canonical_app_name"]
        v1_name = release_variant["v1_name"]
        emails = release_variant["notification_emails"]

        # we use the `source_dataset` concept to figure out what reference
        # we should be looking for inside bigquery-etl
        # For release we are currently using an app-level dataset which
        # references the app id specific one (so we look for that view as
        # a reference).
        # For other channels, we refer to the stable tables
        channels = [
            {
                "channel": channel.get("app_channel"),
                "dataset": (
                    channel.get("app_name").replace("-", "_")
                    if channel.get("app_channel") == "release"
                    else channel.get("bq_dataset_family")
                ),
                "source_dataset": (
                    channel.get("bq_dataset_family")
                    if channel.get("app_channel") == "release"
                    else channel.get("bq_dataset_family") + "_stable"
                ),
            }
            for channel in variants
            if not channel.get("deprecated")
        ]

        # If all channels are deprecated, don't include this app
        if channels:
            apps.append(
                {
                    "name": app_name,
                    "pretty_name": canonical_app_name,
                    "channels": channels,
                    "owners": emails,
                    "glean_app": True,
                    "v1_name": v1_name,
                }
            )

    return apps
251
252
def _get_looker_views(
    app: Dict[str, Union[str, List[Dict[str, str]]]],
    db_views: Dict[str, Dict[str, List[List[str]]]],
) -> List[View]:
    """Instantiate every view type for *app*, erroring on duplicate view names.

    Raises ``KeyError`` when two view classes would generate a view with the
    same name for this namespace.
    """
    views: List[View] = []
    # set instead of a list: duplicate detection is O(1) per view rather than
    # a linear scan
    seen_names: set = set()

    for klass in VIEW_TYPES.values():
        for view in klass.from_db_views(  # type: ignore
            app["name"], app["glean_app"], app["channels"], db_views
        ):
            if view.name in seen_names:
                raise KeyError(
                    f"Duplicate Looker View name {view.name} "
                    f"when generating views for namespace {app['name']}"
                )
            views.append(view)
            seen_names.add(view.name)

    return views
274
275
def _get_explores(views: List[View]) -> dict:
    """Generate explore definitions for *views*, keyed by explore name."""
    explores: dict = {}
    # iterate values directly; the original used .items() and discarded keys
    for klass in EXPLORE_TYPES.values():
        for explore in klass.from_views(views):  # type: ignore
            explores.update(explore.to_dict())

    return explores
283
284
def _get_metric_hub_data_sources() -> Dict[str, List[str]]:
    """Get data source definitions from metric-hub repository for each namespace.

    Returns a mapping of platform -> list of data source slugs, including only
    data sources that at least one metric definition uses.
    """
    data_sources_per_namespace: Dict[str, List[str]] = {}
    for definition in MetricsConfigLoader.configs.definitions:
        for data_source_slug in definition.spec.data_sources.definitions:
            metrics = MetricsConfigLoader.metrics_of_data_source(
                data_source_slug, definition.platform
            )
            # ignore data sources that are not used for any metric definition
            if len(metrics) > 0:
                # setdefault replaces the manual "key present?" if/else branch
                data_sources_per_namespace.setdefault(
                    definition.platform, []
                ).append(data_source_slug)

    return data_sources_per_namespace
306
307
@click.command(help=__doc__)
@click.option(
    "--custom-namespaces",
    default="custom-namespaces.yaml",
    type=click.File(),
    help="Path to a custom namespaces file",
)
@click.option(
    "--generated-sql-uri",
    default=DEFAULT_GENERATED_SQL_URI,
    help="URI of a tar archive of the bigquery-etl generated-sql branch, which is "
    "used to list views and determine whether they reference stable tables",
)
@click.option(
    "--app-listings-uri",
    default="https://probeinfo.telemetry.mozilla.org/v2/glean/app-listings",
    help="URI for probeinfo service v2 glean app listings",
)
@click.option(
    "--disallowlist",
    type=click.File(),
    default="namespaces-disallowlist.yaml",
    help="Path to namespace disallow list",
)
@click.option(
    "--metric-hub-repos",
    "--metric_hub_repos",
    multiple=True,
    default=[METRIC_HUB_REPO, LOOKER_METRIC_HUB_REPO],
    help="Repos to load metric configs from.",
)
@click.option(
    "--ignore",
    multiple=True,
    default=[],
    help="Namespaces to ignore during generation.",
)
@click.option(
    "--use_cloud_function",
    "--use-cloud-function",
    help="Use the Cloud Function to run dry runs during LookML generation.",
    type=bool,
)
def namespaces(
    custom_namespaces,
    generated_sql_uri,
    app_listings_uri,
    disallowlist,
    metric_hub_repos,
    ignore,
    use_cloud_function,
):
    """Generate namespaces.yaml."""
    warnings.filterwarnings("ignore", module="google.auth._default")
    # Step 1: build a namespace (views + explores) for every non-ignored
    # Glean app, using the bigquery-etl view reference map.
    glean_apps = _get_glean_apps(app_listings_uri)
    db_views = lookml_utils.get_bigquery_view_reference_map(generated_sql_uri)

    namespaces = {}
    for app in glean_apps:
        if app["name"] not in ignore:
            looker_views = _get_looker_views(app, db_views)
            explores = _get_explores(looker_views)
            views_as_dict = {view.name: view.as_dict() for view in looker_views}

            namespaces[app["name"]] = {
                "owners": app["owners"],
                "pretty_name": app["pretty_name"],
                "views": views_as_dict,
                "explores": explores,
                "glean_app": True,
            }

    # Step 2: overlay custom namespaces (including OpMon) onto the generated
    # Glean namespaces.
    if custom_namespaces is not None:
        custom_namespaces = yaml.safe_load(custom_namespaces.read()) or {}
        # remove namespaces that should be ignored
        for ignored_namespace in ignore:
            if ignored_namespace in custom_namespaces:
                del custom_namespaces[ignored_namespace]

        # generating operational monitoring namespace, if available
        if "operational_monitoring" in custom_namespaces:
            # OpMon generation queries BigQuery directly, which the dry-run
            # Cloud Function cannot do.
            if use_cloud_function:
                raise Exception("Cannot generate OpMon using dry run Cloud Function")

            client = bigquery.Client()
            opmon = _get_opmon(bq_client=client, namespaces=custom_namespaces)
            custom_namespaces["operational_monitoring"].update(opmon)

        _merge_namespaces(namespaces, custom_namespaces)

    # Step 3: merge in metric-hub-derived namespaces.
    if metric_hub_repos:
        MetricsConfigLoader.update_repos(metric_hub_repos)

    _merge_namespaces(namespaces, _get_metric_hub_namespaces(namespaces))

    # Step 4: drop namespaces matching the disallow list (glob patterns) or
    # the --ignore flags, then fill in spoke/glean_app defaults.
    disallowed_namespaces = yaml.safe_load(disallowlist.read()) or {}
    disallowed_regex = [
        fnmatch.translate(namespace) for namespace in disallowed_namespaces
    ]
    disallowed_namespaces_pattern = re.compile("|".join(disallowed_regex))

    updated_namespaces = {}
    for namespace, _ in namespaces.items():
        if (
            not disallowed_namespaces_pattern.fullmatch(namespace)
            and namespace not in ignore
        ):
            if "spoke" not in namespaces[namespace]:
                namespaces[namespace]["spoke"] = DEFAULT_SPOKE
            if "glean_app" not in namespaces[namespace]:
                namespaces[namespace]["glean_app"] = False
            updated_namespaces[namespace] = namespaces[namespace]

    Path("namespaces.yaml").write_text(yaml.safe_dump(updated_namespaces))
DEFAULT_GENERATED_SQL_URI = 'https://github.com/mozilla/bigquery-etl/archive/generated-sql.tar.gz'
PROBE_INFO_BASE_URI = 'https://probeinfo.telemetry.mozilla.org'
DEFAULT_SPOKE = 'looker-spoke-default'
OPMON_DATASET = 'operational_monitoring'
PROD_PROJECT = 'moz-fx-data-shared-prod'
namespaces = <Command namespaces>

Generate namespaces.yaml.