generator.namespaces
Generate namespaces.yaml.
"""Generate namespaces.yaml."""

import fnmatch
import json
import re
import urllib.request
import warnings
from collections.abc import Mapping
from copy import deepcopy
from datetime import datetime, timezone
from itertools import groupby
from operator import itemgetter
from pathlib import Path
from typing import Any, Dict, List, Union

import click
import yaml
from google.cloud import bigquery

from generator import operational_monitoring_utils

from .explores import EXPLORE_TYPES
from .metrics_utils import LOOKER_METRIC_HUB_REPO, METRIC_HUB_REPO, MetricsConfigLoader
from .views import VIEW_TYPES, View, lookml_utils

DEFAULT_GENERATED_SQL_URI = (
    "https://github.com/mozilla/bigquery-etl/archive/generated-sql.tar.gz"
)

PROBE_INFO_BASE_URI = "https://probeinfo.telemetry.mozilla.org"
DEFAULT_SPOKE = "looker-spoke-default"
OPMON_DATASET = "operational_monitoring"
PROD_PROJECT = "moz-fx-data-shared-prod"
SKIP_DEPRECATED = ["mozilla-vpn"]


def _normalize_slug(name):
    """Replace every character other than ASCII letters, digits and `_` with `_`."""
    return re.sub(r"[^a-zA-Z0-9_]", "_", name)


def _merge_namespaces(dct, merge_dct):
    """Recursively merge `merge_dct` into `dct`, modifying `dct` in place.

    Nested mappings are merged key by key, except that a mapping which sets
    `glean_app` to `False` replaces the existing entry wholesale. `owners`
    values are concatenated; every other value is overwritten.
    """
    for k, v in merge_dct.items():
        if k in dct and isinstance(dct[k], dict) and isinstance(v, Mapping):
            if "glean_app" in v and v["glean_app"] is False:
                # if glean_app gets set to False, Glean views and explores
                # should not be generated
                dct[k] = v
            else:
                _merge_namespaces(dct[k], v)
        elif k == "owners" and "owners" in dct:
            # combine owners
            dct[k] += v
        else:
            dct[k] = v


def _get_opmon(bq_client: bigquery.Client, namespaces: Dict[str, Any]):
    """Build views, explores and dashboards for operational monitoring projects.

    The table listing the projects is read from the `projects` view of the
    `operational_monitoring` entry in `namespaces`. Returns an empty dict when
    that entry defines no views or no `projects` view; otherwise returns a dict
    with the keys `views`, `explores` and `dashboards`.
    """
    om_content: Dict[str, Any] = {"views": {}, "explores": {}, "dashboards": {}}

    # get operational monitoring namespace information
    opmon_namespace = namespaces["operational_monitoring"]
    views = opmon_namespace.get("views")

    if views is None:
        print("No views defined for operational monitoring")
        return {}

    projects_view = views.get("projects")

    if projects_view is None:
        print("No projects view defined for operational monitoring")
        return {}

    projects_table = projects_view["tables"][0]["table"]
    projects = operational_monitoring_utils.get_active_projects(
        bq_client, project_table=projects_table
    )

    # Iterating over all defined operational monitoring projects
    for project in projects:
        table_prefix = _normalize_slug(project["slug"])
        project_name = lookml_utils.slug_to_title(
            re.sub("[^0-9a-zA-Z_]+", "_", "_".join(project["name"].lower().split(" ")))
        )
        branches = project.get("branches", ["enabled", "disabled"])
        # evaluated once; used for both the alerting view and dashboard entry
        has_alerting = bool(project.get("alerting"))
        alerts_name = f"{table_prefix}_alerts"
        alerts_table = f"{PROD_PROJECT}.{OPMON_DATASET}.{alerts_name}"

        # append view and explore for data type
        table = f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_statistics"
        dimensions = operational_monitoring_utils.get_dimension_defaults(
            bq_client, table, project["dimensions"]
        )
        om_content["views"][table_prefix] = {
            "type": "operational_monitoring_view",
            "tables": [
                {
                    "table": table,
                    "xaxis": project["xaxis"],
                    "dimensions": dimensions,
                }
            ],
        }
        om_content["explores"][table_prefix] = {
            "type": "operational_monitoring_explore",
            "views": {"base_view": f"{table_prefix}"},
            "branches": branches,
            "xaxis": project["xaxis"],
            "dimensions": dimensions,
            "summaries": project["summaries"],
        }

        if has_alerting:
            # create an alerting view if available
            om_content["views"][alerts_name] = {
                "type": "operational_monitoring_alerting_view",
                "tables": [
                    {
                        "table": alerts_table,
                    }
                ],
            }
            om_content["explores"][alerts_name] = {
                "type": "operational_monitoring_alerting_explore",
                "views": {"base_view": alerts_name},
            }

        om_content["dashboards"][table_prefix] = {
            "type": "operational_monitoring_dashboard",
            "title": project_name,
            "tables": [
                {
                    "explore": f"{table_prefix}",
                    "table": table,
                    "branches": branches,
                    "xaxis": project["xaxis"],
                    "compact_visualization": project.get(
                        "compact_visualization", False
                    ),
                    "dimensions": dimensions,
                    "group_by_dimension": project.get("group_by_dimension", None),
                    "summaries": project["summaries"],
                }
            ],
        }

        if has_alerting:
            om_content["dashboards"][table_prefix]["tables"].append(
                {
                    "explore": alerts_name,
                    "table": alerts_table,
                }
            )

    return om_content


def _get_metric_hub_namespaces(existing_namespaces):
    """Build one namespace per metric-hub platform that has used data sources.

    `existing_namespaces` is accepted for interface compatibility but is not
    read by this function.
    """
    data_sources_per_namespace = _get_metric_hub_data_sources()

    metric_hub_namespaces = {}
    for namespace, data_sources in data_sources_per_namespace.items():
        # each data source definition is represented by a view and an explore
        explores = {}
        views = {}
        for data_source in sorted(data_sources):
            name = f"metric_definitions_{data_source}"
            views[name] = {"type": "metric_definitions_view"}
            explores[name] = {
                "type": "metric_definitions_explore",
                "views": {"base_view": name},
            }

        metric_hub_namespaces[namespace] = {
            "pretty_name": lookml_utils.slug_to_title(namespace),
            "views": views,
            "explores": explores,
        }

    return metric_hub_namespaces


def _get_glean_apps(
    app_listings_uri: str,
) -> List[Dict[str, Union[str, List[Dict[str, str]]]]]:
    """Fetch the app listings and group them into one entry per `app_name`.

    Each returned entry carries the app name, the canonical name, v1 name and
    notification emails of its release variant (or of the first variant when
    there is no release channel), and the list of its channels. Apps whose
    channel list ends up empty are omitted.
    """
    if app_listings_uri.startswith(PROBE_INFO_BASE_URI):
        # For probe-info-service requests, add query param to bypass cloudfront cache.
        # An aware "now" stripped of tzinfo renders exactly like the deprecated
        # datetime.utcnow().isoformat().
        timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
        app_listings_uri += f"?t={timestamp}"

    # define key function and reuse it for sorted and groupby
    get_app_name = itemgetter("app_name")
    with urllib.request.urlopen(app_listings_uri) as f:
        # groupby requires input be sorted by key to produce one result per key
        app_listings = sorted(json.loads(f.read()), key=get_app_name)

    apps = []
    for app_name, group in groupby(app_listings, get_app_name):
        variants = list(group)

        # use canonical_app_name where channel=="release" or the first one
        release_variant = next(
            (
                channel
                for channel in variants
                if channel.get("app_channel") == "release"
            ),
            variants[0],
        )

        canonical_app_name = release_variant["canonical_app_name"]
        v1_name = release_variant["v1_name"]
        emails = release_variant["notification_emails"]

        # we use the `source_dataset` concept to figure out what reference
        # we should be looking for inside bigquery-etl
        # For release we are currently using an app-level dataset which
        # references the app id specific one (so we look for that view as
        # a reference).
        # For other channels, we refer to the stable tables
        channels = [
            {
                "channel": channel.get("app_channel"),
                "dataset": (
                    channel.get("app_name").replace("-", "_")
                    if channel.get("app_channel") == "release"
                    else channel.get("bq_dataset_family")
                ),
                "source_dataset": (
                    channel.get("bq_dataset_family")
                    if channel.get("app_channel") == "release"
                    else channel.get("bq_dataset_family") + "_stable"
                ),
            }
            for channel in variants
            if not channel.get("deprecated")
            or channel.get("app_name")
            not in SKIP_DEPRECATED  # TODO handling for deprecated apps
        ]

        # If all channels are deprecated, don't include this app
        if channels:
            apps.append(
                {
                    "name": app_name,
                    "pretty_name": canonical_app_name,
                    "channels": channels,
                    "owners": emails,
                    "glean_app": True,
                    "v1_name": v1_name,
                }
            )

    return apps


def _get_looker_views(
    app: Dict[str, Union[str, List[Dict[str, str]]]],
    db_views: Dict[str, Dict[str, List[List[str]]]],
) -> List[View]:
    """Collect the views every registered view type produces for `app`.

    Raises:
        KeyError: if two views share the same name.
    """
    views: List[View] = []
    seen_names = set()

    for klass in VIEW_TYPES.values():
        for view in klass.from_db_views(  # type: ignore
            app["name"], app["glean_app"], app["channels"], db_views
        ):
            if view.name in seen_names:
                raise KeyError(
                    (
                        f"Duplicate Looker View name {view.name} "
                        f"when generating views for namespace {app['name']}"
                    )
                )
            views.append(view)
            seen_names.add(view.name)

    return views


def _get_explores(views: List[View]) -> dict:
    """Merge the dict form of the explores every explore type derives from `views`."""
    explores = {}
    for klass in EXPLORE_TYPES.values():
        for explore in klass.from_views(views):  # type: ignore
            explores.update(explore.to_dict())

    return explores


def _get_metric_hub_data_sources() -> Dict[str, List[str]]:
    """Get data source definitions from metric-hub repository for each namespace."""
    data_sources_per_namespace: Dict[str, List[str]] = {}
    for definition in MetricsConfigLoader.configs.definitions:
        for data_source_slug in definition.spec.data_sources.definitions.keys():
            metrics = MetricsConfigLoader.metrics_of_data_source(
                data_source_slug, definition.platform
            )
            # ignore data sources that are not used for any metric definition
            if len(metrics) > 0:
                data_sources_per_namespace.setdefault(definition.platform, []).append(
                    data_source_slug
                )

    return data_sources_per_namespace


@click.command(help=__doc__)
@click.option(
    "--custom-namespaces",
    default="custom-namespaces.yaml",
    type=click.File(),
    help="Path to a custom namespaces file",
)
@click.option(
    "--generated-sql-uri",
    default=DEFAULT_GENERATED_SQL_URI,
    help="URI of a tar archive of the bigquery-etl generated-sql branch, which is "
    "used to list views and determine whether they reference stable tables",
)
@click.option(
    "--app-listings-uri",
    default="https://probeinfo.telemetry.mozilla.org/v2/glean/app-listings",
    help="URI for probeinfo service v2 glean app listings",
)
@click.option(
    "--disallowlist",
    type=click.File(),
    default="namespaces-disallowlist.yaml",
    help="Path to namespace disallow list",
)
@click.option(
    "--metric-hub-repos",
    "--metric_hub_repos",
    multiple=True,
    default=[METRIC_HUB_REPO, LOOKER_METRIC_HUB_REPO],
    help="Repos to load metric configs from.",
)
@click.option(
    "--ignore",
    multiple=True,
    default=[],
    help="Namespaces to ignore during generation.",
)
@click.option(
    "--use_cloud_function",
    "--use-cloud-function",
    help="Use the Cloud Function to run dry runs during LookML generation.",
    type=bool,
)
def namespaces(
    custom_namespaces,
    generated_sql_uri,
    app_listings_uri,
    disallowlist,
    metric_hub_repos,
    ignore,
    use_cloud_function,
):
    """Generate namespaces.yaml."""
    warnings.filterwarnings("ignore", module="google.auth._default")
    glean_apps = _get_glean_apps(app_listings_uri)
    db_views = lookml_utils.get_bigquery_view_reference_map(generated_sql_uri)

    namespaces = {}
    for app in glean_apps:
        if app["name"] not in ignore:
            looker_views = _get_looker_views(app, db_views)
            explores = _get_explores(looker_views)
            views_as_dict = {view.name: view.as_dict() for view in looker_views}

            namespaces[app["name"]] = {
                "owners": app["owners"],
                "pretty_name": app["pretty_name"],
                "views": views_as_dict,
                "explores": explores,
                "glean_app": True,
            }

    if custom_namespaces is not None:
        custom_namespaces = yaml.safe_load(custom_namespaces.read()) or {}
        # remove namespaces that should be ignored
        for ignored_namespace in ignore:
            if ignored_namespace in custom_namespaces:
                del custom_namespaces[ignored_namespace]

        # generating operational monitoring namespace, if available
        if "operational_monitoring" in custom_namespaces:
            if use_cloud_function:
                raise Exception("Cannot generate OpMon using dry run Cloud Function")

            client = bigquery.Client()
            opmon = _get_opmon(bq_client=client, namespaces=custom_namespaces)
            custom_namespaces["operational_monitoring"].update(opmon)

        _merge_namespaces(namespaces, custom_namespaces)

    if metric_hub_repos:
        MetricsConfigLoader.update_repos(metric_hub_repos)

    _merge_namespaces(namespaces, _get_metric_hub_namespaces(namespaces))

    updated_namespaces = _filter_disallowed(namespaces, disallowlist)
    for namespace in updated_namespaces:
        if namespace not in ignore:
            if "spoke" not in updated_namespaces[namespace]:
                updated_namespaces[namespace]["spoke"] = DEFAULT_SPOKE
            if "glean_app" not in updated_namespaces[namespace]:
                updated_namespaces[namespace]["glean_app"] = False

    Path("namespaces.yaml").write_text(yaml.safe_dump(updated_namespaces))


def _filter_disallowed(namespaces, disallowlist):
    """Filter models, explores and views from the generated namespaces config, based on the disallowlist.

    `disallowlist` is a readable file object containing a YAML list. A plain
    string entry is a namespace pattern whose matches are removed entirely. A
    mapping entry maps a namespace pattern to `{artifact_type: [name patterns]}`
    and removes only the matching artifacts. Patterns use `fnmatch` syntax.
    Returns a filtered deep copy; `namespaces` itself is not modified.
    """

    def match_any(name, patterns):
        return any(fnmatch.fnmatch(name, p) for p in patterns)

    # transform namespace disallowlist to a dict
    disallowed_namespaces = yaml.safe_load(disallowlist.read()) or []
    disallowed_namespaces_dict = {}
    for namespace in disallowed_namespaces:
        disallowed_namespaces_dict.update(
            {namespace: {}} if isinstance(namespace, str) else namespace
        )

    filtered_namespaces = deepcopy(namespaces)

    for pattern, sub_filters in disallowed_namespaces_dict.items():
        for key in list(filtered_namespaces):
            if not fnmatch.fnmatch(key, pattern):
                continue

            # if no sub_filters, remove entire section
            if not sub_filters:
                del filtered_namespaces[key]
                continue

            entry = filtered_namespaces.get(key, {})

            # remove matching artifact types (views, explores)
            for artifact_type, disallowed_artifact_names in sub_filters.items():
                if artifact_type in entry:
                    # distinct name so the namespace `key` is not clobbered
                    for artifact_name in list(entry[artifact_type]):
                        if match_any(artifact_name, disallowed_artifact_names):
                            del entry[artifact_type][artifact_name]

    return filtered_namespaces
DEFAULT_GENERATED_SQL_URI =
'https://github.com/mozilla/bigquery-etl/archive/generated-sql.tar.gz'
PROBE_INFO_BASE_URI =
'https://probeinfo.telemetry.mozilla.org'
DEFAULT_SPOKE =
'looker-spoke-default'
OPMON_DATASET =
'operational_monitoring'
PROD_PROJECT =
'moz-fx-data-shared-prod'
SKIP_DEPRECATED =
['mozilla-vpn']
namespaces =
<Command namespaces>
Generate namespaces.yaml.