generator.namespaces
Generate namespaces.yaml.
"""Generate namespaces.yaml."""

import fnmatch
import json
import re
import urllib.request
import warnings
from collections.abc import Mapping
from copy import deepcopy
from datetime import datetime, timezone
from itertools import groupby
from operator import itemgetter
from pathlib import Path
from typing import Any, Dict, List, Union

import click
import yaml
from google.cloud import bigquery

from generator import operational_monitoring_utils

from .explores import EXPLORE_TYPES
from .metrics_utils import LOOKER_METRIC_HUB_REPO, METRIC_HUB_REPO, MetricsConfigLoader
from .views import VIEW_TYPES, View, lookml_utils

DEFAULT_GENERATED_SQL_URI = (
    "https://github.com/mozilla/bigquery-etl/archive/generated-sql.tar.gz"
)

PROBE_INFO_BASE_URI = "https://probeinfo.telemetry.mozilla.org"
DEFAULT_SPOKE = "looker-spoke-default"
OPMON_DATASET = "operational_monitoring"
PROD_PROJECT = "moz-fx-data-shared-prod"


def _normalize_slug(name):
    """Replace every character that is not a letter, digit or underscore with `_`."""
    return re.sub(r"[^a-zA-Z0-9_]", "_", name)


def _merge_namespaces(dct, merge_dct):
    """Recursively merge namespaces.

    `dct` is updated in place with the content of `merge_dct`:

    * when both sides hold a mapping for a key, the mappings are merged
      recursively, unless the incoming one sets `glean_app` to `False`, in
      which case it replaces the existing entry entirely;
    * `owners` values are concatenated onto the existing value;
    * any other value from `merge_dct` overwrites the one in `dct`.
    """
    for k, merge_value in merge_dct.items():
        if k in dct and isinstance(dct[k], dict) and isinstance(merge_value, Mapping):
            if merge_value.get("glean_app") is False:
                # if glean_app gets set to False, Glean views and explores should not be generated
                dct[k] = merge_value
            else:
                _merge_namespaces(dct[k], merge_value)
        elif k == "owners" and "owners" in dct:
            # combine owners
            dct[k] += merge_value
        else:
            dct[k] = merge_value


def _get_opmon(bq_client: bigquery.Client, namespaces: Dict[str, Any]):
    """Build views, explores and dashboards for operational monitoring projects.

    Reads the `projects` view of the `operational_monitoring` namespace in
    `namespaces` to find the projects table, fetches the active projects via
    `operational_monitoring_utils.get_active_projects` and returns a dict with
    the keys `views`, `explores` and `dashboards`.

    Returns an empty dict (after printing a message) when the namespace has no
    `views` entry or no `projects` view.

    Raises `KeyError` if `namespaces` has no `operational_monitoring` entry.
    """
    om_content: Dict[str, Any] = {"views": {}, "explores": {}, "dashboards": {}}

    # get operational monitoring namespace information
    opmon_namespace = namespaces["operational_monitoring"]
    views = opmon_namespace.get("views")

    if views is None:
        print("No views defined for operational monitoring")
        return {}

    projects_view = views.get("projects")

    if projects_view is None:
        print("No projects view defined for operational monitoring")
        return {}

    projects_table = projects_view["tables"][0]["table"]
    projects = operational_monitoring_utils.get_active_projects(
        bq_client, project_table=projects_table
    )

    # Iterating over all defined operational monitoring projects
    for project in projects:
        table_prefix = _normalize_slug(project["slug"])
        project_name = lookml_utils.slug_to_title(
            re.sub("[^0-9a-zA-Z_]+", "_", "_".join(project["name"].lower().split(" ")))
        )
        # NOTE: `branches` and `dimensions` are intentionally the *same* objects
        # in the explore and the dashboard entries below; copying them would
        # change how yaml.safe_dump serializes the repeated references.
        branches = project.get("branches", ["enabled", "disabled"])
        has_alerting = bool(project.get("alerting"))

        # append view and explore for data type
        table = f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_statistics"
        dimensions = operational_monitoring_utils.get_dimension_defaults(
            bq_client, table, project["dimensions"]
        )
        om_content["views"][table_prefix] = {
            "type": "operational_monitoring_view",
            "tables": [
                {
                    "table": table,
                    "xaxis": project["xaxis"],
                    "dimensions": dimensions,
                }
            ],
        }
        om_content["explores"][table_prefix] = {
            "type": "operational_monitoring_explore",
            "views": {"base_view": table_prefix},
            "branches": branches,
            "xaxis": project["xaxis"],
            "dimensions": dimensions,
            "summaries": project["summaries"],
        }

        alerts_name = f"{table_prefix}_alerts"
        alerts_table = f"{PROD_PROJECT}.{OPMON_DATASET}.{alerts_name}"

        if has_alerting:
            # create an alerting view if available
            om_content["views"][alerts_name] = {
                "type": "operational_monitoring_alerting_view",
                "tables": [
                    {
                        "table": alerts_table,
                    }
                ],
            }
            om_content["explores"][alerts_name] = {
                "type": "operational_monitoring_alerting_explore",
                "views": {"base_view": alerts_name},
            }

        dashboard_tables = [
            {
                "explore": table_prefix,
                "table": table,
                "branches": branches,
                "xaxis": project["xaxis"],
                "compact_visualization": project.get("compact_visualization", False),
                "dimensions": dimensions,
                "group_by_dimension": project.get("group_by_dimension", None),
                "summaries": project["summaries"],
            }
        ]
        if has_alerting:
            dashboard_tables.append(
                {
                    "explore": alerts_name,
                    "table": alerts_table,
                }
            )

        om_content["dashboards"][table_prefix] = {
            "type": "operational_monitoring_dashboard",
            "title": project_name,
            "tables": dashboard_tables,
        }

    return om_content


def _get_metric_hub_namespaces(existing_namespaces):
    """Build one namespace per metric-hub platform that has data sources in use.

    Every data source yields a `metric_definitions_<data_source>` view and an
    explore of the same name that uses that view as its base view.

    `existing_namespaces` is currently not read by this function; the
    parameter is kept so existing callers keep working.
    """
    metric_hub_namespaces = {}
    for namespace, data_sources in _get_metric_hub_data_sources().items():
        # each data source definition is represented by a view and an explore
        explores = {}
        views = {}
        for data_source in sorted(data_sources):
            definition_name = f"metric_definitions_{data_source}"
            views[definition_name] = {"type": "metric_definitions_view"}
            explores[definition_name] = {
                "type": "metric_definitions_explore",
                "views": {"base_view": definition_name},
            }

        metric_hub_namespaces[namespace] = {
            "pretty_name": lookml_utils.slug_to_title(namespace),
            "views": views,
            "explores": explores,
        }

    return metric_hub_namespaces


def _get_glean_apps(
    app_listings_uri: str,
) -> List[Dict[str, Union[str, List[Dict[str, str]]]]]:
    """Download the Glean app listings and group them into one entry per app.

    The listings are fetched from `app_listings_uri` as JSON and grouped by
    `app_name`. Each returned dict carries the app `name`, `pretty_name`,
    `channels`, `owners`, `glean_app` (always `True`) and `v1_name`.
    """
    if app_listings_uri.startswith(PROBE_INFO_BASE_URI):
        # For probe-info-service requests, add query param to bypass cloudfront cache.
        # datetime.utcnow() is deprecated; dropping tzinfo keeps the timestamp
        # format identical to what utcnow().isoformat() produced.
        timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
        separator = "&" if "?" in app_listings_uri else "?"
        app_listings_uri += f"{separator}t={timestamp}"

    # define key function and reuse it for sorted and groupby
    get_app_name = itemgetter("app_name")
    with urllib.request.urlopen(app_listings_uri) as f:
        # groupby requires input be sorted by key to produce one result per key
        app_listings = sorted(json.loads(f.read()), key=get_app_name)

    apps = []
    for app_name, group in groupby(app_listings, get_app_name):
        variants = list(group)

        # use canonical_app_name where channel=="release" or the first one
        release_variant = next(
            (
                channel
                for channel in variants
                if channel.get("app_channel") == "release"
            ),
            variants[0],
        )

        canonical_app_name = release_variant["canonical_app_name"]
        v1_name = release_variant["v1_name"]
        emails = release_variant["notification_emails"]

        # we use the `source_dataset` concept to figure out what reference
        # we should be looking for inside bigquery-etl
        # For release we are currently using an app-level dataset which
        # references the app id specific one (so we look for that view as
        # a reference).
        # For other channels, we refer to the stable tables
        channels = [
            {
                "channel": channel.get("app_channel"),
                "dataset": (
                    channel.get("app_name").replace("-", "_")
                    if channel.get("app_channel") == "release"
                    else channel.get("bq_dataset_family")
                ),
                "source_dataset": (
                    channel.get("bq_dataset_family")
                    if channel.get("app_channel") == "release"
                    else channel.get("bq_dataset_family") + "_stable"
                ),
            }
            for channel in variants
        ]

        apps.append(
            {
                "name": app_name,
                "pretty_name": canonical_app_name,
                "channels": channels,
                "owners": emails,
                "glean_app": True,
                "v1_name": v1_name,
            }
        )

    return apps


def _get_looker_views(
    app: Dict[str, Union[str, List[Dict[str, str]]]],
    db_views: Dict[str, Dict[str, List[List[str]]]],
) -> List[View]:
    """Collect the views every registered view type generates for `app`.

    Raises `KeyError` when two generated views share the same name.
    """
    views = []
    seen_names = set()

    for klass in VIEW_TYPES.values():
        for view in klass.from_db_views(  # type: ignore
            app["name"], app["glean_app"], app["channels"], db_views
        ):
            if view.name in seen_names:
                raise KeyError(
                    (
                        f"Duplicate Looker View name {view.name} "
                        f"when generating views for namespace {app['name']}"
                    )
                )
            views.append(view)
            seen_names.add(view.name)

    return views


def _get_explores(views: List[View]) -> dict:
    """Merge the dict form of every explore the registered explore types build from `views`."""
    explores = {}
    for klass in EXPLORE_TYPES.values():
        for explore in klass.from_views(views):  # type: ignore
            explores.update(explore.to_dict())

    return explores


def _get_metric_hub_data_sources() -> Dict[str, List[str]]:
    """Get data source definitions from metric-hub repository for each namespace."""
    data_sources_per_namespace: Dict[str, List[str]] = {}
    for definition in MetricsConfigLoader.configs.definitions:
        for data_source_slug in definition.spec.data_sources.definitions.keys():
            used_by_metrics = MetricsConfigLoader.metrics_of_data_source(
                data_source_slug, definition.platform
            )
            # ignore data sources that are not used for any metric definition
            if len(used_by_metrics) > 0:
                data_sources_per_namespace.setdefault(definition.platform, []).append(
                    data_source_slug
                )

    return data_sources_per_namespace


@click.command(help=__doc__)
@click.option(
    "--custom-namespaces",
    default="custom-namespaces.yaml",
    type=click.File(),
    help="Path to a custom namespaces file",
)
@click.option(
    "--generated-sql-uri",
    default=DEFAULT_GENERATED_SQL_URI,
    help="URI of a tar archive of the bigquery-etl generated-sql branch, which is "
    "used to list views and determine whether they reference stable tables",
)
@click.option(
    "--app-listings-uri",
    default="https://probeinfo.telemetry.mozilla.org/v2/glean/app-listings",
    help="URI for probeinfo service v2 glean app listings",
)
@click.option(
    "--disallowlist",
    type=click.File(),
    default="namespaces-disallowlist.yaml",
    help="Path to namespace disallow list",
)
@click.option(
    "--metric-hub-repos",
    "--metric_hub_repos",
    multiple=True,
    default=[METRIC_HUB_REPO, LOOKER_METRIC_HUB_REPO],
    help="Repos to load metric configs from.",
)
@click.option(
    "--ignore",
    multiple=True,
    default=[],
    help="Namespaces to ignore during generation.",
)
@click.option(
    "--use_cloud_function",
    "--use-cloud-function",
    help="Use the Cloud Function to run dry runs during LookML generation.",
    type=bool,
)
def namespaces(
    custom_namespaces,
    generated_sql_uri,
    app_listings_uri,
    disallowlist,
    metric_hub_repos,
    ignore,
    use_cloud_function,
):
    """Generate namespaces.yaml.

    Combines, in this order, the namespaces generated for Glean apps, the
    custom namespaces file (including operational monitoring content, if that
    namespace is present) and the metric-hub namespaces. The disallowlist is
    then applied, defaults for `spoke` and `glean_app` are filled in, and the
    result is written to `namespaces.yaml` in the current working directory.
    """
    warnings.filterwarnings("ignore", module="google.auth._default")
    glean_apps = _get_glean_apps(app_listings_uri)
    db_views = lookml_utils.get_bigquery_view_reference_map(generated_sql_uri)

    # named `all_namespaces` so the local does not shadow this command's own name
    all_namespaces = {}
    for app in glean_apps:
        if app["name"] in ignore:
            continue

        looker_views = _get_looker_views(app, db_views)
        explores = _get_explores(looker_views)
        views_as_dict = {view.name: view.as_dict() for view in looker_views}

        all_namespaces[app["name"]] = {
            "owners": app["owners"],
            "pretty_name": app["pretty_name"],
            "views": views_as_dict,
            "explores": explores,
            "glean_app": True,
        }

    if custom_namespaces is not None:
        custom_namespaces = yaml.safe_load(custom_namespaces.read()) or {}
        # remove namespaces that should be ignored
        for ignored_namespace in ignore:
            if ignored_namespace in custom_namespaces:
                del custom_namespaces[ignored_namespace]

        # generating operational monitoring namespace, if available
        if "operational_monitoring" in custom_namespaces:
            if use_cloud_function:
                raise Exception("Cannot generate OpMon using dry run Cloud Function")

            client = bigquery.Client()
            opmon = _get_opmon(bq_client=client, namespaces=custom_namespaces)
            custom_namespaces["operational_monitoring"].update(opmon)

        _merge_namespaces(all_namespaces, custom_namespaces)

    if metric_hub_repos:
        MetricsConfigLoader.update_repos(metric_hub_repos)

    _merge_namespaces(all_namespaces, _get_metric_hub_namespaces(all_namespaces))

    updated_namespaces = _filter_disallowed(all_namespaces, disallowlist)
    for namespace, config in updated_namespaces.items():
        if namespace not in ignore:
            config.setdefault("spoke", DEFAULT_SPOKE)
            config.setdefault("glean_app", False)

    Path("namespaces.yaml").write_text(yaml.safe_dump(updated_namespaces))


def _filter_disallowed(namespaces, disallowlist):
    """Filter models, explores and views from the generated namespaces config.

    `disallowlist` is a readable file object holding a YAML list. Each entry is
    either a namespace pattern (string), which removes every matching
    namespace, or a mapping of namespace pattern to
    `{artifact_type: [name patterns]}`, which removes only the matching
    artifacts of matching namespaces. Patterns are matched with
    `fnmatch.fnmatch`.

    Returns a filtered deep copy; `namespaces` itself is not modified.
    """

    def match_any(name, patterns):
        return any(fnmatch.fnmatch(name, p) for p in patterns)

    # transform namespace disallowlist to a dict
    disallowed_namespaces = yaml.safe_load(disallowlist.read()) or []
    disallowed_namespaces_dict = {}
    for namespace in disallowed_namespaces:
        disallowed_namespaces_dict.update(
            {namespace: {}} if isinstance(namespace, str) else namespace
        )

    filtered_namespaces = deepcopy(namespaces)

    for pattern, sub_filters in disallowed_namespaces_dict.items():
        for namespace_name in list(filtered_namespaces):
            if not fnmatch.fnmatch(namespace_name, pattern):
                continue

            # if no sub_filters, remove entire section
            if not sub_filters:
                del filtered_namespaces[namespace_name]
                continue

            entry = filtered_namespaces.get(namespace_name, {})

            # remove matching artifact types (views, explores)
            for artifact_type, disallowed_artifact_names in sub_filters.items():
                if artifact_type not in entry:
                    continue
                # distinct name: the inner loop must not clobber the namespace key
                for artifact_name in list(entry[artifact_type]):
                    if match_any(artifact_name, disallowed_artifact_names):
                        del entry[artifact_type][artifact_name]

    return filtered_namespaces
DEFAULT_GENERATED_SQL_URI =
'https://github.com/mozilla/bigquery-etl/archive/generated-sql.tar.gz'
PROBE_INFO_BASE_URI =
'https://probeinfo.telemetry.mozilla.org'
DEFAULT_SPOKE =
'looker-spoke-default'
OPMON_DATASET =
'operational_monitoring'
PROD_PROJECT =
'moz-fx-data-shared-prod'
namespaces =
<Command namespaces>
Generate namespaces.yaml.