generator.views.lookml_utils
Utils for generating lookml.
1"""Utils for generating lookml.""" 2 3import re 4import tarfile 5import urllib.request 6from collections import defaultdict 7from io import BytesIO 8from pathlib import Path 9from typing import Any, Dict, Iterable, List, Optional, Tuple 10 11import click 12import yaml 13from jinja2 import Environment, FileSystemLoader 14 15GENERATOR_PATH = Path(__file__).parent.parent 16 17BIGQUERY_TYPE_TO_DIMENSION_TYPE = { 18 "BIGNUMERIC": "string", 19 "BOOLEAN": "yesno", 20 "BYTES": "string", 21 "DATE": "time", 22 "DATETIME": "time", 23 "FLOAT": "number", 24 "INTEGER": "number", 25 "NUMERIC": "number", 26 "STRING": "string", 27 "TIME": "time", 28 "TIMESTAMP": "time", 29} 30 31HIDDEN_DIMENSIONS = { 32 ("document_id",), 33 ("client_id",), 34 ("client_info", "client_id"), 35 ("context_id",), 36 ("additional_properties",), 37} 38 39MAP_LAYER_NAMES = { 40 ("country",): "countries", 41 ("metadata", "geo", "country"): "countries", 42} 43 44 45def _get_dimension( 46 path: Tuple[str, ...], field_type: str, mode: str, description: Optional[str] 47) -> Dict[str, Any]: 48 result: Dict[str, Any] = {} 49 result["sql"] = "${TABLE}." + ".".join(path) 50 name = path 51 if ( 52 mode == "REPEATED" 53 or path in HIDDEN_DIMENSIONS 54 or field_type not in BIGQUERY_TYPE_TO_DIMENSION_TYPE 55 ): 56 result["hidden"] = "yes" 57 else: 58 result["type"] = BIGQUERY_TYPE_TO_DIMENSION_TYPE[field_type] 59 60 group_label, group_item_label = None, None 61 if len(path) > 1: 62 group_label = slug_to_title(" ".join(path[:-1])) 63 group_item_label = slug_to_title(path[-1]) 64 if result["type"] == "time": 65 # Remove _{type} suffix from the last path element for dimension group 66 # names For example submission_date and submission_timestamp become 67 # submission, and metadata.header.parsed_date becomes 68 # metadata__header__parsed. This is because the timeframe will add a _{type} 69 # suffix to the individual dimension names. 70 name = *path[:-1], re.sub("_(date|time(stamp)?)$", "", path[-1]) 71 result["timeframes"] = [ 72 "raw", 73 "time", 74 "date", 75 "week", 76 "month", 77 "quarter", 78 "year", 79 ] 80 if field_type == "DATE": 81 result["timeframes"].remove("time") 82 result["convert_tz"] = "no" 83 result["datatype"] = "date" 84 if group_label and group_item_label: 85 # Dimension groups should not be nested, see issue #82 86 result["label"] = f"{group_label}: {group_item_label}" 87 elif len(path) > 1: 88 result["group_label"] = group_label 89 result["group_item_label"] = group_item_label 90 if path in MAP_LAYER_NAMES: 91 result["map_layer_name"] = MAP_LAYER_NAMES[path] 92 result["name"] = "__".join(name) 93 94 if description: 95 result["description"] = description 96 97 return result 98 99 100def _generate_dimensions_helper(schema: List[Any], *prefix: str) -> Iterable[dict]: 101 for field in sorted(schema, key=lambda f: f["name"]): 102 if field["type"] == "RECORD" and not field.get("mode", "") == "REPEATED": 103 yield from _generate_dimensions_helper( 104 field["fields"], *prefix, field["name"] 105 ) 106 else: 107 yield _get_dimension( 108 (*prefix, field["name"]), 109 field["type"], 110 field.get("mode", ""), 111 field.get("description", ""), 112 ) 113 114 115def _generate_dimensions(table: str, dryrun) -> List[Dict[str, Any]]: 116 """Generate dimensions and dimension groups from a bigquery table. 117 118 When schema contains both submission_timestamp and submission_date, only produce 119 a dimension group for submission_timestamp. 120 121 Raise ClickException if schema results in duplicate dimensions. 122 """ 123 dimensions = {} 124 [project, dataset, table] = table.split(".") 125 table_schema = dryrun.create( 126 project=project, 127 dataset=dataset, 128 table=table, 129 ).get_table_schema() 130 131 for dimension in _generate_dimensions_helper(table_schema): 132 name_key = dimension["name"] 133 134 # This prevents `time` dimension groups from overwriting other dimensions below 135 if dimension.get("type") == "time": 136 name_key += "_time" 137 # overwrite duplicate "submission", "end", "start" dimension group, thus picking the 138 # last value sorted by field name, which is submission_timestamp 139 # See also https://github.com/mozilla/lookml-generator/issues/471 140 if name_key in dimensions and not ( 141 dimension.get("type") == "time" 142 and ( 143 dimension["name"] == "submission" 144 or dimension["name"].endswith("end") 145 or dimension["name"].endswith("start") 146 ) 147 ): 148 raise click.ClickException( 149 f"duplicate dimension {name_key!r} for table {table!r}" 150 ) 151 dimensions[name_key] = dimension 152 return list(dimensions.values()) 153 154 155def _generate_dimensions_from_query(query: str, dryrun) -> List[Dict[str, Any]]: 156 """Generate dimensions and dimension groups from a SQL query.""" 157 schema = dryrun.create(sql=query).get_schema() 158 dimensions = {} 159 for dimension in _generate_dimensions_helper(schema or []): 160 name_key = dimension["name"] 161 162 # This prevents `time` dimension groups from overwriting other dimensions below 163 if dimension.get("type") == "time": 164 name_key += "_time" 165 166 # overwrite duplicate "submission", "end", "start" dimension group, thus picking the 167 # last value sorted by field name, which is submission_timestamp 168 # See also https://github.com/mozilla/lookml-generator/issues/471 169 if name_key in dimensions and not ( 170 dimension.get("type") == "time" 171 and ( 172 dimension["name"] == "submission" 173 or dimension["name"].endswith("end") 174 or dimension["name"].endswith("start") 175 ) 176 ): 177 raise click.ClickException(f"duplicate dimension {name_key!r} in query") 178 dimensions[name_key] = dimension 179 return list(dimensions.values()) 180 181 182def _generate_nested_dimension_views( 183 schema: List[dict], view_name: str 184) -> List[Dict[str, Any]]: 185 """ 186 Recursively generate views for nested fields. 187 188 Nested fields are created as views, with dimensions and optionally measures. 189 """ 190 views: List[Dict[str, Any]] = [] 191 for field in sorted(schema, key=lambda f: f["name"]): 192 if field["type"] == "RECORD" and field["name"] != "labeled_counter": 193 # labeled_counter is handled explicitly in glean ping views; hidden for other views 194 if field.get("mode") == "REPEATED": 195 nested_field_view: Dict[str, Any] = { 196 "name": f"{view_name}__{field['name']}" 197 } 198 dimensions = _generate_dimensions_helper(schema=field["fields"]) 199 nested_field_view["dimensions"] = [ 200 d for d in dimensions if not _is_dimension_group(d) 201 ] 202 nested_field_view["dimension_groups"] = [ 203 d for d in dimensions if _is_dimension_group(d) 204 ] 205 views = ( 206 views 207 + [nested_field_view] 208 + _generate_nested_dimension_views( 209 field["fields"], f"{view_name}__{field['name']}" 210 ) 211 ) 212 else: 213 views = views + _generate_nested_dimension_views( 214 field["fields"], f"{view_name}__{field['name']}" 215 ) 216 217 return views 218 219 220def _is_dimension_group(dimension: dict): 221 """Determine if a dimension is actually a dimension group.""" 222 return "timeframes" in dimension or "intervals" in dimension 223 224 225def escape_filter_expr(expr: str) -> str: 226 """Escape filter expression for special Looker chars.""" 227 return re.sub(r'((?:^-)|["_%,^])', r"^\1", expr, count=0) 228 229 230def _is_nested_dimension(dimension: dict): 231 return ( 232 "hidden" in dimension 233 and dimension["hidden"] 234 and "nested" in dimension 235 and dimension["nested"] 236 ) 237 238 239def render_template(filename, template_folder, **kwargs) -> str: 240 """Render a given template using Jinja.""" 241 env = Environment( 242 loader=FileSystemLoader(GENERATOR_PATH / f"{template_folder}/templates") 243 ) 244 template = env.get_template(filename) 245 rendered = template.render(**kwargs) 246 return rendered 247 248 249def slug_to_title(slug): 250 """Convert a slug to title case.""" 251 return slug.replace("_", " ").title() 252 253 254# Map from view to qualified references {dataset: {view: [[project, dataset, table],]}} 255BQViewReferenceMap = Dict[str, Dict[str, List[List[str]]]] 256 257 258def get_bigquery_view_reference_map( 259 generated_sql_uri: str, 260) -> BQViewReferenceMap: 261 """Get a mapping from BigQuery datasets to views with references.""" 262 with urllib.request.urlopen(generated_sql_uri) as f: 263 tarbytes = BytesIO(f.read()) 264 views: BQViewReferenceMap = defaultdict(dict) 265 with tarfile.open(fileobj=tarbytes, mode="r:gz") as tar: 266 for tarinfo in tar: 267 if tarinfo.name.endswith("/metadata.yaml"): 268 metadata = yaml.safe_load(tar.extractfile(tarinfo.name)) # type: ignore 269 references = metadata.get("references", {}) 270 if "view.sql" not in references: 271 continue 272 *_, project, dataset_id, view_id, _ = tarinfo.name.split("/") 273 if project == "moz-fx-data-shared-prod": 274 views[dataset_id][view_id] = [ 275 ref.split(".") for ref in references["view.sql"] 276 ] 277 return views
GENERATOR_PATH =
PosixPath('/root/project/generator')
BIGQUERY_TYPE_TO_DIMENSION_TYPE =
{'BIGNUMERIC': 'string', 'BOOLEAN': 'yesno', 'BYTES': 'string', 'DATE': 'time', 'DATETIME': 'time', 'FLOAT': 'number', 'INTEGER': 'number', 'NUMERIC': 'number', 'STRING': 'string', 'TIME': 'time', 'TIMESTAMP': 'time'}
HIDDEN_DIMENSIONS =
{('additional_properties',), ('client_info', 'client_id'), ('client_id',), ('document_id',), ('context_id',)}
MAP_LAYER_NAMES =
{('country',): 'countries', ('metadata', 'geo', 'country'): 'countries'}
def
escape_filter_expr(expr: str) -> str:
226def escape_filter_expr(expr: str) -> str: 227 """Escape filter expression for special Looker chars.""" 228 return re.sub(r'((?:^-)|["_%,^])', r"^\1", expr, count=0)
Escape filter expression for special Looker chars.
def
render_template(filename, template_folder, **kwargs) -> str:
240def render_template(filename, template_folder, **kwargs) -> str: 241 """Render a given template using Jinja.""" 242 env = Environment( 243 loader=FileSystemLoader(GENERATOR_PATH / f"{template_folder}/templates") 244 ) 245 template = env.get_template(filename) 246 rendered = template.render(**kwargs) 247 return rendered
Render a given template using Jinja.
def
slug_to_title(slug):
250def slug_to_title(slug): 251 """Convert a slug to title case.""" 252 return slug.replace("_", " ").title()
Convert a slug to title case.
BQViewReferenceMap =
typing.Dict[str, typing.Dict[str, typing.List[typing.List[str]]]]
def
get_bigquery_view_reference_map(generated_sql_uri: str) -> Dict[str, Dict[str, List[List[str]]]]:
259def get_bigquery_view_reference_map( 260 generated_sql_uri: str, 261) -> BQViewReferenceMap: 262 """Get a mapping from BigQuery datasets to views with references.""" 263 with urllib.request.urlopen(generated_sql_uri) as f: 264 tarbytes = BytesIO(f.read()) 265 views: BQViewReferenceMap = defaultdict(dict) 266 with tarfile.open(fileobj=tarbytes, mode="r:gz") as tar: 267 for tarinfo in tar: 268 if tarinfo.name.endswith("/metadata.yaml"): 269 metadata = yaml.safe_load(tar.extractfile(tarinfo.name)) # type: ignore 270 references = metadata.get("references", {}) 271 if "view.sql" not in references: 272 continue 273 *_, project, dataset_id, view_id, _ = tarinfo.name.split("/") 274 if project == "moz-fx-data-shared-prod": 275 views[dataset_id][view_id] = [ 276 ref.split(".") for ref in references["view.sql"] 277 ] 278 return views
Get a mapping from BigQuery datasets to views with references.