generator.views.lookml_utils
Utils for generating lookml.
1"""Utils for generating lookml.""" 2 3import re 4import tarfile 5import urllib.request 6from collections import defaultdict 7from io import BytesIO 8from pathlib import Path 9from typing import Any, Dict, Iterable, List, Optional, Tuple 10 11import click 12import yaml 13from jinja2 import Environment, FileSystemLoader 14 15GENERATOR_PATH = Path(__file__).parent.parent 16 17BIGQUERY_TYPE_TO_DIMENSION_TYPE = { 18 "BIGNUMERIC": "string", 19 "BOOLEAN": "yesno", 20 "BYTES": "string", 21 "DATE": "time", 22 "DATETIME": "time", 23 "FLOAT": "number", 24 "INTEGER": "number", 25 "NUMERIC": "number", 26 "STRING": "string", 27 "TIME": "time", 28 "TIMESTAMP": "time", 29} 30 31HIDDEN_DIMENSIONS = { 32 ("document_id",), 33 ("client_id",), 34 ("client_info", "client_id"), 35 ("context_id",), 36 ("additional_properties",), 37} 38 39MAP_LAYER_NAMES = { 40 ("country",): "countries", 41 ("metadata", "geo", "country"): "countries", 42} 43 44DEFAULT_MAX_SUGGEST_PERSIST_FOR = "24 hours" 45 46 47def _get_dimension( 48 path: Tuple[str, ...], field_type: str, mode: str, description: Optional[str] 49) -> Dict[str, Any]: 50 result: Dict[str, Any] = {} 51 result["sql"] = "${TABLE}." + ".".join(path) 52 name = path 53 if ( 54 mode == "REPEATED" 55 or path in HIDDEN_DIMENSIONS 56 or field_type not in BIGQUERY_TYPE_TO_DIMENSION_TYPE 57 ): 58 result["hidden"] = "yes" 59 else: 60 result["type"] = BIGQUERY_TYPE_TO_DIMENSION_TYPE[field_type] 61 result["suggest_persist_for"] = DEFAULT_MAX_SUGGEST_PERSIST_FOR 62 63 group_label, group_item_label = None, None 64 if len(path) > 1: 65 group_label = slug_to_title(" ".join(path[:-1])) 66 group_item_label = slug_to_title(path[-1]) 67 if result["type"] == "time": 68 # Remove _{type} suffix from the last path element for dimension group 69 # names For example submission_date and submission_timestamp become 70 # submission, and metadata.header.parsed_date becomes 71 # metadata__header__parsed. This is because the timeframe will add a _{type} 72 # suffix to the individual dimension names. 73 name = *path[:-1], re.sub("_(date|time(stamp)?)$", "", path[-1]) 74 result["timeframes"] = [ 75 "raw", 76 "time", 77 "date", 78 "week", 79 "month", 80 "quarter", 81 "year", 82 ] 83 if field_type == "DATE": 84 result["timeframes"].remove("time") 85 result["convert_tz"] = "no" 86 result["datatype"] = "date" 87 if group_label and group_item_label: 88 # Dimension groups should not be nested, see issue #82 89 result["label"] = f"{group_label}: {group_item_label}" 90 elif len(path) > 1: 91 result["group_label"] = group_label 92 result["group_item_label"] = group_item_label 93 if path in MAP_LAYER_NAMES: 94 result["map_layer_name"] = MAP_LAYER_NAMES[path] 95 result["name"] = "__".join(name) 96 97 if description: 98 result["description"] = description 99 100 return result 101 102 103def _generate_dimensions_helper(schema: List[Any], *prefix: str) -> Iterable[dict]: 104 for field in sorted(schema, key=lambda f: f["name"]): 105 if field["type"] == "RECORD" and not field.get("mode", "") == "REPEATED": 106 yield from _generate_dimensions_helper( 107 field["fields"], *prefix, field["name"] 108 ) 109 else: 110 yield _get_dimension( 111 (*prefix, field["name"]), 112 field["type"], 113 field.get("mode", ""), 114 field.get("description", ""), 115 ) 116 117 118def _generate_dimensions(table: str, dryrun) -> List[Dict[str, Any]]: 119 """Generate dimensions and dimension groups from a bigquery table. 120 121 When schema contains both submission_timestamp and submission_date, only produce 122 a dimension group for submission_timestamp. 123 124 Raise ClickException if schema results in duplicate dimensions. 125 """ 126 dimensions = {} 127 [project, dataset, table] = table.split(".") 128 table_schema = dryrun.create( 129 project=project, 130 dataset=dataset, 131 table=table, 132 ).get_table_schema() 133 134 for dimension in _generate_dimensions_helper(table_schema): 135 name_key = dimension["name"] 136 137 # This prevents `time` dimension groups from overwriting other dimensions below 138 if dimension.get("type") == "time": 139 name_key += "_time" 140 # overwrite duplicate "submission", "end", "start" dimension group, thus picking the 141 # last value sorted by field name, which is submission_timestamp 142 # See also https://github.com/mozilla/lookml-generator/issues/471 143 if name_key in dimensions and not ( 144 dimension.get("type") == "time" 145 and ( 146 dimension["name"] == "submission" 147 or dimension["name"].endswith("end") 148 or dimension["name"].endswith("start") 149 ) 150 ): 151 raise click.ClickException( 152 f"duplicate dimension {name_key!r} for table {table!r}" 153 ) 154 dimensions[name_key] = dimension 155 return list(dimensions.values()) 156 157 158def _generate_dimensions_from_query(query: str, dryrun) -> List[Dict[str, Any]]: 159 """Generate dimensions and dimension groups from a SQL query.""" 160 schema = dryrun.create(sql=query).get_schema() 161 dimensions = {} 162 for dimension in _generate_dimensions_helper(schema or []): 163 name_key = dimension["name"] 164 165 # This prevents `time` dimension groups from overwriting other dimensions below 166 if dimension.get("type") == "time": 167 name_key += "_time" 168 169 # overwrite duplicate "submission", "end", "start" dimension group, thus picking the 170 # last value sorted by field name, which is submission_timestamp 171 # See also https://github.com/mozilla/lookml-generator/issues/471 172 if name_key in dimensions and not ( 173 dimension.get("type") == "time" 174 and ( 175 dimension["name"] == "submission" 176 or dimension["name"].endswith("end") 177 or dimension["name"].endswith("start") 178 ) 179 ): 180 raise click.ClickException(f"duplicate dimension {name_key!r} in query") 181 dimensions[name_key] = dimension 182 return list(dimensions.values()) 183 184 185def _generate_nested_dimension_views( 186 schema: List[dict], view_name: str 187) -> List[Dict[str, Any]]: 188 """ 189 Recursively generate views for nested fields. 190 191 Nested fields are created as views, with dimensions and optionally measures. 192 """ 193 views: List[Dict[str, Any]] = [] 194 for field in sorted(schema, key=lambda f: f["name"]): 195 if field["type"] == "RECORD" and field["name"] != "labeled_counter": 196 # labeled_counter is handled explicitly in glean ping views; hidden for other views 197 if field.get("mode") == "REPEATED": 198 nested_field_view: Dict[str, Any] = { 199 "name": f"{view_name}__{field['name']}" 200 } 201 dimensions = _generate_dimensions_helper(schema=field["fields"]) 202 nested_field_view["dimensions"] = [ 203 d for d in dimensions if not _is_dimension_group(d) 204 ] 205 nested_field_view["dimension_groups"] = [ 206 d for d in dimensions if _is_dimension_group(d) 207 ] 208 views = ( 209 views 210 + [nested_field_view] 211 + _generate_nested_dimension_views( 212 field["fields"], f"{view_name}__{field['name']}" 213 ) 214 ) 215 else: 216 views = views + _generate_nested_dimension_views( 217 field["fields"], f"{view_name}__{field['name']}" 218 ) 219 220 return views 221 222 223def _is_dimension_group(dimension: dict): 224 """Determine if a dimension is actually a dimension group.""" 225 return "timeframes" in dimension or "intervals" in dimension 226 227 228def escape_filter_expr(expr: str) -> str: 229 """Escape filter expression for special Looker chars.""" 230 return re.sub(r'((?:^-)|["_%,^])', r"^\1", expr, count=0) 231 232 233def _is_nested_dimension(dimension: dict): 234 return ( 235 "hidden" in dimension 236 and dimension["hidden"] 237 and "nested" in dimension 238 and dimension["nested"] 239 ) 240 241 242def render_template(filename, template_folder, **kwargs) -> str: 243 """Render a given template using Jinja.""" 244 env = Environment( 245 loader=FileSystemLoader(GENERATOR_PATH / f"{template_folder}/templates") 246 ) 247 template = env.get_template(filename) 248 rendered = template.render(**kwargs) 249 return rendered 250 251 252def slug_to_title(slug): 253 """Convert a slug to title case.""" 254 return slug.replace("_", " ").title() 255 256 257# Map from view to qualified references {dataset: {view: [[project, dataset, table],]}} 258BQViewReferenceMap = Dict[str, Dict[str, List[List[str]]]] 259 260 261def get_bigquery_view_reference_map( 262 generated_sql_uri: str, 263) -> BQViewReferenceMap: 264 """Get a mapping from BigQuery datasets to views with references.""" 265 with urllib.request.urlopen(generated_sql_uri) as f: 266 tarbytes = BytesIO(f.read()) 267 views: BQViewReferenceMap = defaultdict(dict) 268 with tarfile.open(fileobj=tarbytes, mode="r:gz") as tar: 269 for tarinfo in tar: 270 if tarinfo.name.endswith("/metadata.yaml"): 271 metadata = yaml.safe_load(tar.extractfile(tarinfo.name)) # type: ignore 272 references = metadata.get("references", {}) 273 if "view.sql" not in references: 274 continue 275 *_, project, dataset_id, view_id, _ = tarinfo.name.split("/") 276 if project == "moz-fx-data-shared-prod": 277 views[dataset_id][view_id] = [ 278 ref.split(".") for ref in references["view.sql"] 279 ] 280 return views
GENERATOR_PATH =
PosixPath('/root/project/generator')
BIGQUERY_TYPE_TO_DIMENSION_TYPE =
{'BIGNUMERIC': 'string', 'BOOLEAN': 'yesno', 'BYTES': 'string', 'DATE': 'time', 'DATETIME': 'time', 'FLOAT': 'number', 'INTEGER': 'number', 'NUMERIC': 'number', 'STRING': 'string', 'TIME': 'time', 'TIMESTAMP': 'time'}
HIDDEN_DIMENSIONS =
{('additional_properties',), ('client_id',), ('client_info', 'client_id'), ('document_id',), ('context_id',)}
MAP_LAYER_NAMES =
{('country',): 'countries', ('metadata', 'geo', 'country'): 'countries'}
DEFAULT_MAX_SUGGEST_PERSIST_FOR =
'24 hours'
def
escape_filter_expr(expr: str) -> str:
229def escape_filter_expr(expr: str) -> str: 230 """Escape filter expression for special Looker chars.""" 231 return re.sub(r'((?:^-)|["_%,^])', r"^\1", expr, count=0)
Escape filter expression for special Looker chars.
def
render_template(filename, template_folder, **kwargs) -> str:
243def render_template(filename, template_folder, **kwargs) -> str: 244 """Render a given template using Jinja.""" 245 env = Environment( 246 loader=FileSystemLoader(GENERATOR_PATH / f"{template_folder}/templates") 247 ) 248 template = env.get_template(filename) 249 rendered = template.render(**kwargs) 250 return rendered
Render a given template using Jinja.
def
slug_to_title(slug):
253def slug_to_title(slug): 254 """Convert a slug to title case.""" 255 return slug.replace("_", " ").title()
Convert a slug to title case.
BQViewReferenceMap =
typing.Dict[str, typing.Dict[str, typing.List[typing.List[str]]]]
def
get_bigquery_view_reference_map(generated_sql_uri: str) -> Dict[str, Dict[str, List[List[str]]]]:
262def get_bigquery_view_reference_map( 263 generated_sql_uri: str, 264) -> BQViewReferenceMap: 265 """Get a mapping from BigQuery datasets to views with references.""" 266 with urllib.request.urlopen(generated_sql_uri) as f: 267 tarbytes = BytesIO(f.read()) 268 views: BQViewReferenceMap = defaultdict(dict) 269 with tarfile.open(fileobj=tarbytes, mode="r:gz") as tar: 270 for tarinfo in tar: 271 if tarinfo.name.endswith("/metadata.yaml"): 272 metadata = yaml.safe_load(tar.extractfile(tarinfo.name)) # type: ignore 273 references = metadata.get("references", {}) 274 if "view.sql" not in references: 275 continue 276 *_, project, dataset_id, view_id, _ = tarinfo.name.split("/") 277 if project == "moz-fx-data-shared-prod": 278 views[dataset_id][view_id] = [ 279 ref.split(".") for ref in references["view.sql"] 280 ] 281 return views
Get a mapping from BigQuery datasets to views with references.