generator.views.lookml_utils
Utils for generating lookml.
"""Utils for generating lookml."""

import re
import tarfile
import urllib.request
from collections import defaultdict
from io import BytesIO
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

import click
import yaml
from jinja2 import Environment, FileSystemLoader

# Root of the generator package (two levels up from this module).
GENERATOR_PATH = Path(__file__).parent.parent

# Maps BigQuery column types to the Looker dimension `type` parameter.
# Types absent from this map produce hidden dimensions (see _get_dimension).
BIGQUERY_TYPE_TO_DIMENSION_TYPE = {
    "BIGNUMERIC": "string",
    "BOOLEAN": "yesno",
    "BYTES": "string",
    "DATE": "time",
    "DATETIME": "time",
    "FLOAT": "number",
    "INTEGER": "number",
    "NUMERIC": "number",
    "STRING": "time" == "never" or "string",  # plain "string"
    "TIME": "time",
    "TIMESTAMP": "time",
}

# Field paths (as tuples of nested names) that are always hidden,
# e.g. identifiers that should not be exposed for exploration.
HIDDEN_DIMENSIONS = {
    ("document_id",),
    ("client_id",),
    ("client_info", "client_id"),
    ("context_id",),
    ("additional_properties",),
}

# Field paths that get a Looker `map_layer_name` for geo visualizations.
MAP_LAYER_NAMES = {
    ("country",): "countries",
    ("metadata", "geo", "country"): "countries",
}

# Default value for the Looker `suggest_persist_for` dimension parameter.
DEFAULT_MAX_SUGGEST_PERSIST_FOR = "24 hours"

# Matches whole words that should be rendered in uppercase in labels
# (case-insensitive), e.g. "http" -> "HTTP". Used by slug_to_title.
UPPERCASE_INITIALISMS_PATTERN = re.compile(
    (
        r"\b("
        + "|".join(
            [
                "CPU",
                "DB",
                "DNS",
                "DNT",
                "DOM",
                "GC",
                "GPU",
                "HTTP",
                "ID",
                "IO",
                "IP",
                "ISP",
                "JWE",
                "LB",
                "OS",
                "SDK",
                "SERP",
                "SSL",
                "TLS",
                "UI",
                "URI",
                "URL",
                "UTM",
                "UUID",
            ]
        )
        + r")\b"
    ),
    re.IGNORECASE,
)


def _get_dimension(
    path: Tuple[str, ...], field_type: str, mode: str, description: Optional[str]
) -> Dict[str, Any]:
    """Build a LookML dimension (or dimension group) dict for one schema field.

    :param path: Nested field path, e.g. ("metadata", "geo", "country").
    :param field_type: BigQuery type name, e.g. "TIMESTAMP".
    :param mode: BigQuery field mode; "REPEATED" fields are hidden.
    :param description: Optional field description copied into the dimension.
    :return: Dict of LookML dimension attributes, including "name".
    """
    result: Dict[str, Any] = {}
    result["sql"] = "${TABLE}." + ".".join(path)
    name = path
    if (
        mode == "REPEATED"
        or path in HIDDEN_DIMENSIONS
        or field_type not in BIGQUERY_TYPE_TO_DIMENSION_TYPE
    ):
        # Repeated, explicitly-hidden, or unmapped types are kept but hidden.
        result["hidden"] = "yes"
    else:
        result["type"] = BIGQUERY_TYPE_TO_DIMENSION_TYPE[field_type]
        result["suggest_persist_for"] = DEFAULT_MAX_SUGGEST_PERSIST_FOR

        group_label, group_item_label = None, None
        if len(path) > 1:
            group_label = ": ".join(slug_to_title(slug) for slug in path[:-1])
            group_item_label = slug_to_title(path[-1])
        if result["type"] == "time":
            # Remove _{type} suffix from the last path element for dimension group
            # names For example submission_date and submission_timestamp become
            # submission, and metadata.header.parsed_date becomes
            # metadata__header__parsed. This is because the timeframe will add a _{type}
            # suffix to the individual dimension names.
            name = *path[:-1], re.sub("_(date|time(stamp)?)$", "", path[-1])
            result["timeframes"] = [
                "raw",
                "time",
                "date",
                "week",
                "month",
                "quarter",
                "year",
            ]
            if field_type == "DATE":
                # DATE columns have no time-of-day component.
                result["timeframes"].remove("time")
                result["convert_tz"] = "no"
                result["datatype"] = "date"
            if group_label and group_item_label:
                # Dimension groups should not be nested, see issue #82
                result["label"] = f"{group_label}: {group_item_label}"
            # `suggest_persist_for` is not supported for dimension groups.
            del result["suggest_persist_for"]
        elif len(path) > 1:
            result["group_label"] = group_label
            result["group_item_label"] = group_item_label
        if path in MAP_LAYER_NAMES:
            result["map_layer_name"] = MAP_LAYER_NAMES[path]
    result["name"] = "__".join(name)

    if description:
        result["description"] = description

    return result


def _generate_dimensions_helper(schema: List[Any], *prefix: str) -> Iterable[dict]:
    """Yield dimension dicts for every leaf field in *schema*, sorted by name.

    Non-repeated RECORD fields are recursed into (their children get the
    record's name prepended to their path); everything else becomes one
    dimension via _get_dimension.
    """
    for field in sorted(schema, key=lambda f: f["name"]):
        if field["type"] == "RECORD" and not field.get("mode", "") == "REPEATED":
            yield from _generate_dimensions_helper(
                field["fields"], *prefix, field["name"]
            )
        else:
            yield _get_dimension(
                (*prefix, field["name"]),
                field["type"],
                field.get("mode", ""),
                field.get("description", ""),
            )


def _remove_conflicting_dimension_group_timeframes(
    dimensions: list[dict[str, Any]]
) -> None:
    """Remove timeframes from dimension groups if the resulting dimension would conflict with an existing dimension."""
    dimension_names = set(dimension["name"] for dimension in dimensions)
    for dimension in dimensions:
        if "timeframes" in dimension:
            # Looker expands a group into {name}_{timeframe} dimensions;
            # drop any timeframe whose expanded name already exists.
            dimension["timeframes"] = [
                timeframe
                for timeframe in dimension["timeframes"]
                if f"{dimension['name']}_{timeframe}" not in dimension_names
            ]


def _accumulate_dimension(
    dimensions: Dict[str, Dict[str, Any]],
    dimension: Dict[str, Any],
    error_context: str,
) -> None:
    """Insert *dimension* into *dimensions* under a collision-safe key.

    `time` dimension groups get a "_time" key suffix so they cannot silently
    overwrite a plain dimension of the same name. Duplicate time dimension
    groups named "submission" or ending in "end"/"start" are allowed to
    overwrite each other (last one sorted by field name wins, which is
    submission_timestamp); any other duplicate raises ClickException with
    *error_context* appended to the message.
    """
    name_key = dimension["name"]

    # This prevents `time` dimension groups from overwriting other dimensions below
    if dimension.get("type") == "time":
        name_key += "_time"

    # overwrite duplicate "submission", "end", "start" dimension group, thus picking the
    # last value sorted by field name, which is submission_timestamp
    # See also https://github.com/mozilla/lookml-generator/issues/471
    if name_key in dimensions and not (
        dimension.get("type") == "time"
        and (
            dimension["name"] == "submission"
            or dimension["name"].endswith("end")
            or dimension["name"].endswith("start")
        )
    ):
        raise click.ClickException(f"duplicate dimension {name_key!r} {error_context}")
    dimensions[name_key] = dimension


def _generate_dimensions(table: str, dryrun) -> List[Dict[str, Any]]:
    """Generate dimensions and dimension groups from a bigquery table.

    When schema contains both submission_timestamp and submission_date, only produce
    a dimension group for submission_timestamp.

    Raise ClickException if schema results in duplicate dimensions.
    """
    dimensions: Dict[str, Dict[str, Any]] = {}
    [project, dataset, table] = table.split(".")
    table_schema = dryrun.create(
        project=project,
        dataset=dataset,
        table=table,
    ).get_table_schema()

    for dimension in _generate_dimensions_helper(table_schema):
        _accumulate_dimension(dimensions, dimension, f"for table {table!r}")

    dimensions_list = list(dimensions.values())
    _remove_conflicting_dimension_group_timeframes(dimensions_list)
    return dimensions_list


def _generate_dimensions_from_query(query: str, dryrun) -> List[Dict[str, Any]]:
    """Generate dimensions and dimension groups from a SQL query.

    Same dedup rules as _generate_dimensions; raises ClickException on
    duplicate dimensions.
    """
    schema = dryrun.create(sql=query).get_schema()
    dimensions: Dict[str, Dict[str, Any]] = {}
    for dimension in _generate_dimensions_helper(schema or []):
        _accumulate_dimension(dimensions, dimension, "in query")

    dimensions_list = list(dimensions.values())
    _remove_conflicting_dimension_group_timeframes(dimensions_list)
    return dimensions_list


def _generate_nested_dimension_views(
    schema: List[dict], view_name: str
) -> List[Dict[str, Any]]:
    """
    Recursively generate views for nested fields.

    Nested fields are created as views, with dimensions and optionally measures.
    """
    views: List[Dict[str, Any]] = []
    for field in sorted(schema, key=lambda f: f["name"]):
        if field["type"] == "RECORD" and field["name"] != "labeled_counter":
            # labeled_counter is handled explicitly in glean ping views; hidden for other views
            if field.get("mode") == "REPEATED":
                # Repeated records become their own view named
                # {parent}__{field}; recurse for deeper nesting.
                nested_field_view: Dict[str, Any] = {
                    "name": f"{view_name}__{field['name']}"
                }
                dimensions = list(_generate_dimensions_helper(schema=field["fields"]))
                _remove_conflicting_dimension_group_timeframes(dimensions)
                nested_field_view["dimensions"] = [
                    d for d in dimensions if not _is_dimension_group(d)
                ]
                nested_field_view["dimension_groups"] = [
                    d for d in dimensions if _is_dimension_group(d)
                ]
                views = (
                    views
                    + [nested_field_view]
                    + _generate_nested_dimension_views(
                        field["fields"], f"{view_name}__{field['name']}"
                    )
                )
            else:
                # Non-repeated records don't need their own view, but may
                # contain repeated records further down.
                views = views + _generate_nested_dimension_views(
                    field["fields"], f"{view_name}__{field['name']}"
                )

    return views


def _is_dimension_group(dimension: dict):
    """Determine if a dimension is actually a dimension group."""
    return "timeframes" in dimension or "intervals" in dimension


def escape_filter_expr(expr: str) -> str:
    """Escape filter expression for special Looker chars."""
    # Looker filter expressions escape special characters with a caret;
    # a leading "-" (negation) is only special at the start of the string.
    return re.sub(r'((?:^-)|["_%,^])', r"^\1", expr)


def _is_nested_dimension(dimension: dict):
    """Return True if *dimension* is marked both hidden and nested."""
    return (
        "hidden" in dimension
        and dimension["hidden"]
        and "nested" in dimension
        and dimension["nested"]
    )


def render_template(filename, template_folder, **kwargs) -> str:
    """Render a given template using Jinja.

    :param filename: Template file name within the folder's templates dir.
    :param template_folder: Folder under GENERATOR_PATH containing templates/.
    :param kwargs: Variables passed through to the template.
    :return: The rendered template as a string.
    """
    env = Environment(
        loader=FileSystemLoader(GENERATOR_PATH / f"{template_folder}/templates")
    )
    template = env.get_template(filename)
    rendered = template.render(**kwargs)
    return rendered


def slug_to_title(slug):
    """Convert a slug to title case, with some common initialisms uppercased."""
    return UPPERCASE_INITIALISMS_PATTERN.sub(
        lambda match: match.group(0).upper(), slug.replace("_", " ").title()
    )


# Map from view to qualified references {dataset: {view: [[project, dataset, table],]}}
BQViewReferenceMap = Dict[str, Dict[str, List[List[str]]]]


def get_bigquery_view_reference_map(
    generated_sql_uri: str,
) -> BQViewReferenceMap:
    """Get a mapping from BigQuery datasets to views with references.

    Downloads the generated-sql tarball from *generated_sql_uri* and reads
    each view's metadata.yaml, keeping only views in the
    moz-fx-data-shared-prod project that declare "view.sql" references.
    """
    with urllib.request.urlopen(generated_sql_uri) as f:
        tarbytes = BytesIO(f.read())
    views: BQViewReferenceMap = defaultdict(dict)
    with tarfile.open(fileobj=tarbytes, mode="r:gz") as tar:
        for tarinfo in tar:
            if tarinfo.name.endswith("/metadata.yaml"):
                metadata = yaml.safe_load(tar.extractfile(tarinfo.name))  # type: ignore
                references = metadata.get("references", {})
                if "view.sql" not in references:
                    continue
                # Path layout: .../{project}/{dataset}/{view}/metadata.yaml
                *_, project, dataset_id, view_id, _ = tarinfo.name.split("/")
                if project == "moz-fx-data-shared-prod":
                    views[dataset_id][view_id] = [
                        ref.split(".") for ref in references["view.sql"]
                    ]
    return views
GENERATOR_PATH =
PosixPath('/home/runner/work/lookml-generator/lookml-generator/generator')
BIGQUERY_TYPE_TO_DIMENSION_TYPE =
{'BIGNUMERIC': 'string', 'BOOLEAN': 'yesno', 'BYTES': 'string', 'DATE': 'time', 'DATETIME': 'time', 'FLOAT': 'number', 'INTEGER': 'number', 'NUMERIC': 'number', 'STRING': 'string', 'TIME': 'time', 'TIMESTAMP': 'time'}
HIDDEN_DIMENSIONS =
{('client_id',), ('context_id',), ('client_info', 'client_id'), ('additional_properties',), ('document_id',)}
MAP_LAYER_NAMES =
{('country',): 'countries', ('metadata', 'geo', 'country'): 'countries'}
DEFAULT_MAX_SUGGEST_PERSIST_FOR =
'24 hours'
UPPERCASE_INITIALISMS_PATTERN =
re.compile('\\b(CPU|DB|DNS|DNT|DOM|GC|GPU|HTTP|ID|IO|IP|ISP|JWE|LB|OS|SDK|SERP|SSL|TLS|UI|URI|URL|UTM|UUID)\\b', re.IGNORECASE)
def
escape_filter_expr(expr: str) -> str:
288def escape_filter_expr(expr: str) -> str: 289 """Escape filter expression for special Looker chars.""" 290 return re.sub(r'((?:^-)|["_%,^])', r"^\1", expr, count=0)
Escape filter expression for special Looker chars.
def
render_template(filename, template_folder, **kwargs) -> str:
302def render_template(filename, template_folder, **kwargs) -> str: 303 """Render a given template using Jinja.""" 304 env = Environment( 305 loader=FileSystemLoader(GENERATOR_PATH / f"{template_folder}/templates") 306 ) 307 template = env.get_template(filename) 308 rendered = template.render(**kwargs) 309 return rendered
Render a given template using Jinja.
def
slug_to_title(slug):
312def slug_to_title(slug): 313 """Convert a slug to title case, with some common initialisms uppercased.""" 314 return UPPERCASE_INITIALISMS_PATTERN.sub( 315 lambda match: match.group(0).upper(), slug.replace("_", " ").title() 316 )
Convert a slug to title case, with some common initialisms uppercased.
BQViewReferenceMap =
typing.Dict[str, typing.Dict[str, typing.List[typing.List[str]]]]
def
get_bigquery_view_reference_map(generated_sql_uri: str) -> Dict[str, Dict[str, List[List[str]]]]:
323def get_bigquery_view_reference_map( 324 generated_sql_uri: str, 325) -> BQViewReferenceMap: 326 """Get a mapping from BigQuery datasets to views with references.""" 327 with urllib.request.urlopen(generated_sql_uri) as f: 328 tarbytes = BytesIO(f.read()) 329 views: BQViewReferenceMap = defaultdict(dict) 330 with tarfile.open(fileobj=tarbytes, mode="r:gz") as tar: 331 for tarinfo in tar: 332 if tarinfo.name.endswith("/metadata.yaml"): 333 metadata = yaml.safe_load(tar.extractfile(tarinfo.name)) # type: ignore 334 references = metadata.get("references", {}) 335 if "view.sql" not in references: 336 continue 337 *_, project, dataset_id, view_id, _ = tarinfo.name.split("/") 338 if project == "moz-fx-data-shared-prod": 339 views[dataset_id][view_id] = [ 340 ref.split(".") for ref in references["view.sql"] 341 ] 342 return views
Get a mapping from BigQuery datasets to views with references.