generator.namespaces
Generate namespaces.yaml.
"""Generate namespaces.yaml."""

import fnmatch
import json
import re
import urllib.request
import warnings
from collections.abc import Mapping
from copy import deepcopy
from datetime import datetime, timezone
from itertools import groupby
from operator import itemgetter
from pathlib import Path
from typing import Any, Dict, List, Union

import click
import yaml
from google.cloud import bigquery

from generator import operational_monitoring_utils

from .explores import EXPLORE_TYPES
from .metrics_utils import LOOKER_METRIC_HUB_REPO, METRIC_HUB_REPO, MetricsConfigLoader
from .views import VIEW_TYPES, View, lookml_utils

DEFAULT_GENERATED_SQL_URI = (
    "https://github.com/mozilla/bigquery-etl/archive/generated-sql.tar.gz"
)

PROBE_INFO_BASE_URI = "https://probeinfo.telemetry.mozilla.org"
DEFAULT_SPOKE = "looker-spoke-default"
OPMON_DATASET = "operational_monitoring"
PROD_PROJECT = "moz-fx-data-shared-prod"


def _normalize_slug(name: str) -> str:
    """Replace every character that is not alphanumeric or underscore with `_`."""
    return re.sub(r"[^a-zA-Z0-9_]", "_", name)


def _merge_namespaces(dct, merge_dct):
    """Recursively merge namespaces.

    `merge_dct` values win, except that nested dicts are merged key-by-key
    and `owners` lists are concatenated.  Mutates `dct` in place.
    """
    for k in merge_dct:
        if k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], Mapping):
            if "glean_app" in merge_dct[k] and merge_dct[k]["glean_app"] is False:
                # if glean_app gets set to False, Glean views and explores
                # should not be generated; replace the subtree wholesale
                dct[k] = merge_dct[k]
            else:
                _merge_namespaces(dct[k], merge_dct[k])
        else:
            if k == "owners" and "owners" in dct:
                # combine owners
                dct[k] += merge_dct[k]
            else:
                dct[k] = merge_dct[k]


def _get_opmon(bq_client: bigquery.Client, namespaces: Dict[str, Any]) -> Dict[str, Any]:
    """Build views, explores and dashboards for operational monitoring projects.

    Reads the list of active OpMon projects from the `projects` view configured
    in the `operational_monitoring` namespace and returns the generated content,
    or an empty dict if the namespace is not configured with the required views.
    """
    om_content: Dict[str, Any] = {"views": {}, "explores": {}, "dashboards": {}}

    # get operational monitoring namespace information
    opmon_namespace = namespaces["operational_monitoring"]
    views = opmon_namespace.get("views")

    if views is None:
        print("No views defined for operational monitoring")
        return {}

    projects_view = views.get("projects")

    if projects_view is None:
        print("No projects view defined for operational monitoring")
        return {}

    projects_table = projects_view["tables"][0]["table"]
    projects = operational_monitoring_utils.get_active_projects(
        bq_client, project_table=projects_table
    )

    # Iterating over all defined operational monitoring projects
    for project in projects:
        table_prefix = _normalize_slug(project["slug"])
        project_name = lookml_utils.slug_to_title(
            re.sub("[^0-9a-zA-Z_]+", "_", "_".join(project["name"].lower().split(" ")))
        )
        branches = project.get("branches", ["enabled", "disabled"])

        # append view and explore for data type
        table = f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_statistics"
        dimensions = operational_monitoring_utils.get_dimension_defaults(
            bq_client, table, project["dimensions"]
        )
        om_content["views"][table_prefix] = {
            "type": "operational_monitoring_view",
            "tables": [
                {
                    "table": table,
                    "xaxis": project["xaxis"],
                    "dimensions": dimensions,
                }
            ],
        }
        om_content["explores"][table_prefix] = {
            "type": "operational_monitoring_explore",
            "views": {"base_view": f"{table_prefix}"},
            "branches": branches,
            "xaxis": project["xaxis"],
            "dimensions": dimensions,
            "summaries": project["summaries"],
        }

        if "alerting" in project and project["alerting"]:
            # create an alerting view if available
            om_content["views"][f"{table_prefix}_alerts"] = {
                "type": "operational_monitoring_alerting_view",
                "tables": [
                    {
                        "table": f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_alerts",
                    }
                ],
            }
            om_content["explores"][f"{table_prefix}_alerts"] = {
                "type": "operational_monitoring_alerting_explore",
                "views": {"base_view": f"{table_prefix}_alerts"},
            }

        om_content["dashboards"][table_prefix] = {
            "type": "operational_monitoring_dashboard",
            "title": project_name,
            "tables": [
                {
                    "explore": f"{table_prefix}",
                    "table": f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_statistics",
                    "branches": branches,
                    "xaxis": project["xaxis"],
                    "compact_visualization": project.get(
                        "compact_visualization", False
                    ),
                    "dimensions": dimensions,
                    "group_by_dimension": project.get("group_by_dimension", None),
                    "summaries": project["summaries"],
                }
            ],
        }

        if "alerting" in project and project["alerting"]:
            om_content["dashboards"][table_prefix]["tables"].append(
                {
                    "explore": f"{table_prefix}_alerts",
                    "table": f"{PROD_PROJECT}.{OPMON_DATASET}.{table_prefix}_alerts",
                }
            )

    return om_content


def _get_metric_hub_namespaces(existing_namespaces) -> Dict[str, Any]:
    """Build namespaces for metric-hub data source definitions.

    `existing_namespaces` is accepted for interface compatibility with the
    caller but is not consulted here.
    """
    metric_hub_data_sources = _get_metric_hub_data_sources()

    metric_hub_namespaces = {}
    # NOTE: loop variable renamed from the original, which shadowed the
    # `metric_hub_data_sources` dict being iterated
    for namespace, data_sources in metric_hub_data_sources.items():
        # each data source definition is represented by a view and an explore
        explores = {}
        views = {}
        for data_source in sorted(data_sources):
            views[f"metric_definitions_{data_source}"] = {
                "type": "metric_definitions_view"
            }

            explores[f"metric_definitions_{data_source}"] = {
                "type": "metric_definitions_explore",
                "views": {"base_view": f"metric_definitions_{data_source}"},
            }

        metric_hub_namespaces[namespace] = {
            "pretty_name": lookml_utils.slug_to_title(namespace),
            "views": views,
            "explores": explores,
        }

    return metric_hub_namespaces


def _get_glean_apps(
    app_listings_uri: str,
) -> List[Dict[str, Union[str, List[Dict[str, str]]]]]:
    """Fetch the Glean app listings and group them into one entry per app.

    Each returned app carries its non-deprecated channels; apps whose channels
    are all deprecated are omitted entirely.
    """
    if app_listings_uri.startswith(PROBE_INFO_BASE_URI):
        # For probe-info-service requests, add query param to bypass cloudfront cache.
        # datetime.utcnow() is deprecated; use an aware UTC timestamp instead
        # (only used as a cache-busting value, so the exact format is not load-bearing).
        app_listings_uri += f"?t={datetime.now(timezone.utc).isoformat()}"

    # define key function and reuse it for sorted and groupby
    get_app_name = itemgetter("app_name")
    with urllib.request.urlopen(app_listings_uri) as f:
        # groupby requires input be sorted by key to produce one result per key
        app_listings = sorted(json.loads(f.read()), key=get_app_name)

    apps = []
    for app_name, group in groupby(app_listings, get_app_name):
        variants = list(group)

        # use canonical_app_name where channel=="release" or the first one
        release_variant = next(
            (
                channel
                for channel in variants
                if channel.get("app_channel") == "release"
            ),
            variants[0],
        )

        canonical_app_name = release_variant["canonical_app_name"]
        v1_name = release_variant["v1_name"]
        emails = release_variant["notification_emails"]

        # we use the `source_dataset` concept to figure out what reference
        # we should be looking for inside bigquery-etl
        # For release we are currently using an app-level dataset which
        # references the app id specific one (so we look for that view as
        # a reference).
        # For other channels, we refer to the stable tables
        channels = [
            {
                "channel": channel.get("app_channel"),
                "dataset": (
                    channel.get("app_name").replace("-", "_")
                    if channel.get("app_channel") == "release"
                    else channel.get("bq_dataset_family")
                ),
                "source_dataset": (
                    channel.get("bq_dataset_family")
                    if channel.get("app_channel") == "release"
                    else channel.get("bq_dataset_family") + "_stable"
                ),
            }
            for channel in variants
            if not channel.get("deprecated")
        ]

        # If all channels are deprecated, don't include this app
        if channels:
            apps.append(
                {
                    "name": app_name,
                    "pretty_name": canonical_app_name,
                    "channels": channels,
                    "owners": emails,
                    "glean_app": True,
                    "v1_name": v1_name,
                }
            )

    return apps


def _get_looker_views(
    app: Dict[str, Union[str, List[Dict[str, str]]]],
    db_views: Dict[str, Dict[str, List[List[str]]]],
) -> List[View]:
    """Instantiate every registered view type for `app`, rejecting duplicate names.

    Raises:
        KeyError: if two generated views share a name within the namespace.
    """
    views, view_names = [], []

    for klass in VIEW_TYPES.values():
        for view in klass.from_db_views(  # type: ignore
            app["name"], app["glean_app"], app["channels"], db_views
        ):
            if view.name in view_names:
                raise KeyError(
                    (
                        f"Duplicate Looker View name {view.name} "
                        f"when generating views for namespace {app['name']}"
                    )
                )
            views.append(view)
            view_names.append(view.name)

    return views


def _get_explores(views: List[View]) -> dict:
    """Generate explore definitions from `views` for every registered explore type."""
    explores = {}
    for _, klass in EXPLORE_TYPES.items():
        for explore in klass.from_views(views):  # type: ignore
            explores.update(explore.to_dict())

    return explores


def _get_metric_hub_data_sources() -> Dict[str, List[str]]:
    """Get data source definitions from metric-hub repository for each namespace."""
    data_sources_per_namespace: Dict[str, List[str]] = {}
    for definition in MetricsConfigLoader.configs.definitions:
        for data_source_slug in definition.spec.data_sources.definitions.keys():
            if (
                len(
                    MetricsConfigLoader.metrics_of_data_source(
                        data_source_slug, definition.platform
                    )
                )
                > 0  # ignore data sources that are not used for any metric definition
            ):
                if definition.platform in data_sources_per_namespace:
                    data_sources_per_namespace[definition.platform].append(
                        data_source_slug
                    )
                else:
                    data_sources_per_namespace[definition.platform] = [data_source_slug]

    return data_sources_per_namespace


@click.command(help=__doc__)
@click.option(
    "--custom-namespaces",
    default="custom-namespaces.yaml",
    type=click.File(),
    help="Path to a custom namespaces file",
)
@click.option(
    "--generated-sql-uri",
    default=DEFAULT_GENERATED_SQL_URI,
    help="URI of a tar archive of the bigquery-etl generated-sql branch, which is "
    "used to list views and determine whether they reference stable tables",
)
@click.option(
    "--app-listings-uri",
    default="https://probeinfo.telemetry.mozilla.org/v2/glean/app-listings",
    help="URI for probeinfo service v2 glean app listings",
)
@click.option(
    "--disallowlist",
    type=click.File(),
    default="namespaces-disallowlist.yaml",
    help="Path to namespace disallow list",
)
@click.option(
    "--metric-hub-repos",
    "--metric_hub_repos",
    multiple=True,
    default=[METRIC_HUB_REPO, LOOKER_METRIC_HUB_REPO],
    help="Repos to load metric configs from.",
)
@click.option(
    "--ignore",
    multiple=True,
    default=[],
    help="Namespaces to ignore during generation.",
)
@click.option(
    "--use_cloud_function",
    "--use-cloud-function",
    help="Use the Cloud Function to run dry runs during LookML generation.",
    type=bool,
)
def namespaces(
    custom_namespaces,
    generated_sql_uri,
    app_listings_uri,
    disallowlist,
    metric_hub_repos,
    ignore,
    use_cloud_function,
):
    """Generate namespaces.yaml."""
    warnings.filterwarnings("ignore", module="google.auth._default")
    glean_apps = _get_glean_apps(app_listings_uri)
    db_views = lookml_utils.get_bigquery_view_reference_map(generated_sql_uri)

    namespaces = {}
    for app in glean_apps:
        if app["name"] not in ignore:
            looker_views = _get_looker_views(app, db_views)
            explores = _get_explores(looker_views)
            views_as_dict = {view.name: view.as_dict() for view in looker_views}

            namespaces[app["name"]] = {
                "owners": app["owners"],
                "pretty_name": app["pretty_name"],
                "views": views_as_dict,
                "explores": explores,
                "glean_app": True,
            }

    if custom_namespaces is not None:
        custom_namespaces = yaml.safe_load(custom_namespaces.read()) or {}
        # remove namespaces that should be ignored
        for ignored_namespace in ignore:
            if ignored_namespace in custom_namespaces:
                del custom_namespaces[ignored_namespace]

        # generating operational monitoring namespace, if available
        if "operational_monitoring" in custom_namespaces:
            if use_cloud_function:
                raise Exception("Cannot generate OpMon using dry run Cloud Function")

            client = bigquery.Client()
            opmon = _get_opmon(bq_client=client, namespaces=custom_namespaces)
            custom_namespaces["operational_monitoring"].update(opmon)

        _merge_namespaces(namespaces, custom_namespaces)

    if metric_hub_repos:
        MetricsConfigLoader.update_repos(metric_hub_repos)

    _merge_namespaces(namespaces, _get_metric_hub_namespaces(namespaces))

    updated_namespaces = _filter_disallowed(namespaces, disallowlist)
    for namespace in updated_namespaces:
        if namespace not in ignore:
            # fill in defaults for namespaces that didn't set them explicitly
            if "spoke" not in updated_namespaces[namespace]:
                updated_namespaces[namespace]["spoke"] = DEFAULT_SPOKE
            if "glean_app" not in updated_namespaces[namespace]:
                updated_namespaces[namespace]["glean_app"] = False

    Path("namespaces.yaml").write_text(yaml.safe_dump(updated_namespaces))


def _filter_disallowed(namespaces, disallowlist):
    """Filter models, explores and views from the generated namespaces config, based on the disallowlist."""

    def match_any(name, patterns):
        return any(fnmatch.fnmatch(name, p) for p in patterns)

    # transform namespace disallowlist to a dict; bare strings mean
    # "remove the whole namespace" (empty sub-filter)
    disallowed_namespaces = yaml.safe_load(disallowlist.read()) or []
    disallowed_namespaces_dict = {}
    for ns in [
        {namespace: {}} if isinstance(namespace, str) else namespace
        for namespace in disallowed_namespaces
    ]:
        disallowed_namespaces_dict.update(ns)

    filtered_namespaces = deepcopy(namespaces)

    for pattern, sub_filters in disallowed_namespaces_dict.items():
        for key in list(filtered_namespaces):
            if fnmatch.fnmatch(key, pattern):
                # if no sub_filters, remove entire section
                if not sub_filters:
                    del filtered_namespaces[key]
                    continue

                entry = filtered_namespaces.get(key, {})

                # remove matching artifact types (views, explores)
                # NOTE: inner loop variable renamed from `key`, which shadowed
                # the namespace key in the enclosing loop
                for artifact_type, disallowed_artifact_names in sub_filters.items():
                    if artifact_type in entry:
                        for artifact_name in list(entry[artifact_type]):
                            if match_any(artifact_name, disallowed_artifact_names):
                                del entry[artifact_type][artifact_name]

    return filtered_namespaces
DEFAULT_GENERATED_SQL_URI =
'https://github.com/mozilla/bigquery-etl/archive/generated-sql.tar.gz'
PROBE_INFO_BASE_URI =
'https://probeinfo.telemetry.mozilla.org'
DEFAULT_SPOKE =
'looker-spoke-default'
OPMON_DATASET =
'operational_monitoring'
PROD_PROJECT =
'moz-fx-data-shared-prod'
namespaces =
<Command namespaces>
Generate namespaces.yaml.