generator.views.lookml_utils
Utils for generating lookml.
1"""Utils for generating lookml.""" 2 3import re 4import tarfile 5import urllib.request 6from collections import defaultdict 7from io import BytesIO 8from pathlib import Path 9from typing import Any, Dict, Iterable, List, Optional, Tuple 10 11import click 12import yaml 13from jinja2 import Environment, FileSystemLoader 14 15GENERATOR_PATH = Path(__file__).parent.parent 16 17BIGQUERY_TYPE_TO_DIMENSION_TYPE = { 18 "BIGNUMERIC": "string", 19 "BOOLEAN": "yesno", 20 "BYTES": "string", 21 "DATE": "time", 22 "DATETIME": "time", 23 "FLOAT": "number", 24 "INTEGER": "number", 25 "NUMERIC": "number", 26 "STRING": "string", 27 "TIME": "time", 28 "TIMESTAMP": "time", 29} 30 31HIDDEN_DIMENSIONS = { 32 ("document_id",), 33 ("client_id",), 34 ("client_info", "client_id"), 35 ("context_id",), 36 ("additional_properties",), 37} 38 39MAP_LAYER_NAMES = { 40 ("country",): "countries", 41 ("metadata", "geo", "country"): "countries", 42} 43 44DEFAULT_MAX_SUGGEST_PERSIST_FOR = "24 hours" 45 46UPPERCASE_INITIALISMS_PATTERN = re.compile( 47 ( 48 r"\b(" 49 + "|".join( 50 [ 51 "CPU", 52 "DB", 53 "DNS", 54 "DNT", 55 "DOM", 56 "GC", 57 "GPU", 58 "HTTP", 59 "ID", 60 "IO", 61 "IP", 62 "ISP", 63 "JWE", 64 "LB", 65 "OS", 66 "SDK", 67 "SERP", 68 "SSL", 69 "TLS", 70 "UI", 71 "URI", 72 "URL", 73 "UTM", 74 "UUID", 75 ] 76 ) 77 + r")\b" 78 ), 79 re.IGNORECASE, 80) 81 82 83def _get_dimension( 84 path: Tuple[str, ...], field_type: str, mode: str, description: Optional[str] 85) -> Dict[str, Any]: 86 result: Dict[str, Any] = {} 87 result["sql"] = "${TABLE}." + ".".join(path) 88 name = path 89 if ( 90 mode == "REPEATED" 91 or path in HIDDEN_DIMENSIONS 92 or field_type not in BIGQUERY_TYPE_TO_DIMENSION_TYPE 93 ): 94 result["hidden"] = "yes" 95 else: 96 result["type"] = BIGQUERY_TYPE_TO_DIMENSION_TYPE[field_type] 97 result["suggest_persist_for"] = DEFAULT_MAX_SUGGEST_PERSIST_FOR 98 99 group_label, group_item_label = None, None 100 if len(path) > 1: 101 group_label = slug_to_title(" ".join(path[:-1])) 102 group_item_label = slug_to_title(path[-1]) 103 if result["type"] == "time": 104 # Remove _{type} suffix from the last path element for dimension group 105 # names For example submission_date and submission_timestamp become 106 # submission, and metadata.header.parsed_date becomes 107 # metadata__header__parsed. This is because the timeframe will add a _{type} 108 # suffix to the individual dimension names. 
109 name = *path[:-1], re.sub("_(date|time(stamp)?)$", "", path[-1]) 110 result["timeframes"] = [ 111 "raw", 112 "time", 113 "date", 114 "week", 115 "month", 116 "quarter", 117 "year", 118 ] 119 if field_type == "DATE": 120 result["timeframes"].remove("time") 121 result["convert_tz"] = "no" 122 result["datatype"] = "date" 123 if group_label and group_item_label: 124 # Dimension groups should not be nested, see issue #82 125 result["label"] = f"{group_label}: {group_item_label}" 126 elif len(path) > 1: 127 result["group_label"] = group_label 128 result["group_item_label"] = group_item_label 129 if path in MAP_LAYER_NAMES: 130 result["map_layer_name"] = MAP_LAYER_NAMES[path] 131 result["name"] = "__".join(name) 132 133 if description: 134 result["description"] = description 135 136 return result 137 138 139def _generate_dimensions_helper(schema: List[Any], *prefix: str) -> Iterable[dict]: 140 for field in sorted(schema, key=lambda f: f["name"]): 141 if field["type"] == "RECORD" and not field.get("mode", "") == "REPEATED": 142 yield from _generate_dimensions_helper( 143 field["fields"], *prefix, field["name"] 144 ) 145 else: 146 yield _get_dimension( 147 (*prefix, field["name"]), 148 field["type"], 149 field.get("mode", ""), 150 field.get("description", ""), 151 ) 152 153 154def _generate_dimensions(table: str, dryrun) -> List[Dict[str, Any]]: 155 """Generate dimensions and dimension groups from a bigquery table. 156 157 When schema contains both submission_timestamp and submission_date, only produce 158 a dimension group for submission_timestamp. 159 160 Raise ClickException if schema results in duplicate dimensions. 161 """ 162 dimensions = {} 163 [project, dataset, table] = table.split(".") 164 table_schema = dryrun.create( 165 project=project, 166 dataset=dataset, 167 table=table, 168 ).get_table_schema() 169 170 for dimension in _generate_dimensions_helper(table_schema): 171 name_key = dimension["name"] 172 173 # This prevents `time` dimension groups from overwriting other dimensions below 174 if dimension.get("type") == "time": 175 name_key += "_time" 176 # overwrite duplicate "submission", "end", "start" dimension group, thus picking the 177 # last value sorted by field name, which is submission_timestamp 178 # See also https://github.com/mozilla/lookml-generator/issues/471 179 if name_key in dimensions and not ( 180 dimension.get("type") == "time" 181 and ( 182 dimension["name"] == "submission" 183 or dimension["name"].endswith("end") 184 or dimension["name"].endswith("start") 185 ) 186 ): 187 raise click.ClickException( 188 f"duplicate dimension {name_key!r} for table {table!r}" 189 ) 190 dimensions[name_key] = dimension 191 return list(dimensions.values()) 192 193 194def _generate_dimensions_from_query(query: str, dryrun) -> List[Dict[str, Any]]: 195 """Generate dimensions and dimension groups from a SQL query.""" 196 schema = dryrun.create(sql=query).get_schema() 197 dimensions = {} 198 for dimension in _generate_dimensions_helper(schema or []): 199 name_key = dimension["name"] 200 201 # This prevents `time` dimension groups from overwriting other dimensions below 202 if dimension.get("type") == "time": 203 name_key += "_time" 204 205 # overwrite duplicate "submission", "end", "start" dimension group, thus picking the 206 # last value sorted by field name, which is submission_timestamp 207 # See also https://github.com/mozilla/lookml-generator/issues/471 208 if name_key in dimensions and not ( 209 dimension.get("type") == "time" 210 and ( 211 dimension["name"] == "submission" 212 
or dimension["name"].endswith("end") 213 or dimension["name"].endswith("start") 214 ) 215 ): 216 raise click.ClickException(f"duplicate dimension {name_key!r} in query") 217 dimensions[name_key] = dimension 218 return list(dimensions.values()) 219 220 221def _generate_nested_dimension_views( 222 schema: List[dict], view_name: str 223) -> List[Dict[str, Any]]: 224 """ 225 Recursively generate views for nested fields. 226 227 Nested fields are created as views, with dimensions and optionally measures. 228 """ 229 views: List[Dict[str, Any]] = [] 230 for field in sorted(schema, key=lambda f: f["name"]): 231 if field["type"] == "RECORD" and field["name"] != "labeled_counter": 232 # labeled_counter is handled explicitly in glean ping views; hidden for other views 233 if field.get("mode") == "REPEATED": 234 nested_field_view: Dict[str, Any] = { 235 "name": f"{view_name}__{field['name']}" 236 } 237 dimensions = _generate_dimensions_helper(schema=field["fields"]) 238 nested_field_view["dimensions"] = [ 239 d for d in dimensions if not _is_dimension_group(d) 240 ] 241 nested_field_view["dimension_groups"] = [ 242 d for d in dimensions if _is_dimension_group(d) 243 ] 244 views = ( 245 views 246 + [nested_field_view] 247 + _generate_nested_dimension_views( 248 field["fields"], f"{view_name}__{field['name']}" 249 ) 250 ) 251 else: 252 views = views + _generate_nested_dimension_views( 253 field["fields"], f"{view_name}__{field['name']}" 254 ) 255 256 return views 257 258 259def _is_dimension_group(dimension: dict): 260 """Determine if a dimension is actually a dimension group.""" 261 return "timeframes" in dimension or "intervals" in dimension 262 263 264def escape_filter_expr(expr: str) -> str: 265 """Escape filter expression for special Looker chars.""" 266 return re.sub(r'((?:^-)|["_%,^])', r"^\1", expr, count=0) 267 268 269def _is_nested_dimension(dimension: dict): 270 return ( 271 "hidden" in dimension 272 and dimension["hidden"] 273 and "nested" in dimension 274 and dimension["nested"] 275 ) 276 277 278def render_template(filename, template_folder, **kwargs) -> str: 279 """Render a given template using Jinja.""" 280 env = Environment( 281 loader=FileSystemLoader(GENERATOR_PATH / f"{template_folder}/templates") 282 ) 283 template = env.get_template(filename) 284 rendered = template.render(**kwargs) 285 return rendered 286 287 288def slug_to_title(slug): 289 """Convert a slug to title case, with some common initialisms uppercased.""" 290 return UPPERCASE_INITIALISMS_PATTERN.sub( 291 lambda match: match.group(0).upper(), slug.replace("_", " ").title() 292 ) 293 294 295# Map from view to qualified references {dataset: {view: [[project, dataset, table],]}} 296BQViewReferenceMap = Dict[str, Dict[str, List[List[str]]]] 297 298 299def get_bigquery_view_reference_map( 300 generated_sql_uri: str, 301) -> BQViewReferenceMap: 302 """Get a mapping from BigQuery datasets to views with references.""" 303 with urllib.request.urlopen(generated_sql_uri) as f: 304 tarbytes = BytesIO(f.read()) 305 views: BQViewReferenceMap = defaultdict(dict) 306 with tarfile.open(fileobj=tarbytes, mode="r:gz") as tar: 307 for tarinfo in tar: 308 if tarinfo.name.endswith("/metadata.yaml"): 309 metadata = yaml.safe_load(tar.extractfile(tarinfo.name)) # type: ignore 310 references = metadata.get("references", {}) 311 if "view.sql" not in references: 312 continue 313 *_, project, dataset_id, view_id, _ = tarinfo.name.split("/") 314 if project == "moz-fx-data-shared-prod": 315 views[dataset_id][view_id] = [ 316 ref.split(".") for ref in 
references["view.sql"] 317 ] 318 return views
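As a hand-traced illustration of the dimension logic above (worked out from the code, not captured output), a nullable TIMESTAMP column becomes a dimension group whose name drops the _timestamp suffix; the description string here is hypothetical:

dim = _get_dimension(
    ("submission_timestamp",),
    "TIMESTAMP",
    "NULLABLE",
    "Time when the ping was submitted",  # hypothetical description
)
# dim == {
#     "sql": "${TABLE}.submission_timestamp",
#     "type": "time",
#     "suggest_persist_for": "24 hours",
#     "timeframes": ["raw", "time", "date", "week", "month", "quarter", "year"],
#     "name": "submission",
#     "description": "Time when the ping was submitted",
# }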
Functions

def escape_filter_expr(expr: str) -> str:
    Escape filter expression for special Looker chars.
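Hand-traced examples: a leading minus and any double quote, underscore, percent sign, comma, or caret is prefixed with Looker's ^ escape character.

escape_filter_expr('-10%, "beta"')
# -> '^-10^%^, ^"beta^"'
escape_filter_expr("release_channel")
# -> 'release^_channel'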
def render_template(filename, template_folder, **kwargs) -> str:
    Render a given template using Jinja.
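A minimal usage sketch. The template name and keyword arguments below are hypothetical; the template is looked up under GENERATOR_PATH / "<template_folder>/templates":

# "example.view.lookml" is a hypothetical template name; pass whatever
# variables the template expects as keyword arguments.
lookml = render_template("example.view.lookml", "views", view_name="baseline")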
def slug_to_title(slug):
    Convert a slug to title case, with some common initialisms uppercased.
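Hand-traced examples:

slug_to_title("os_version")
# -> 'OS Version'
slug_to_title("http_status")
# -> 'HTTP Status'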
BQViewReferenceMap = Dict[str, Dict[str, List[List[str]]]]
    Map from view to qualified references: {dataset: {view: [[project, dataset, table], ...]}}
def get_bigquery_view_reference_map(generated_sql_uri: str) -> BQViewReferenceMap:
    Get a mapping from BigQuery datasets to views with references.
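A usage sketch. The URL below is an assumption, not confirmed by this module: any gzipped tarball of generated SQL containing per-view metadata.yaml files with a references section will do.

# Assumed archive location; substitute the generated-sql tarball you use.
refs = get_bigquery_view_reference_map(
    "https://github.com/mozilla/bigquery-etl/archive/generated-sql.tar.gz"
)
# refs is keyed by dataset, then view; e.g. (illustrative values only):
# refs["telemetry"]["main"] == [["moz-fx-data-shared-prod", "telemetry_stable", "main_v5"]]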