generator.views.lookml_utils

Utils for generating lookml.

  1"""Utils for generating lookml."""
  2
  3import re
  4import tarfile
  5import urllib.request
  6from collections import defaultdict
  7from io import BytesIO
  8from pathlib import Path
  9from typing import Any, Dict, Iterable, List, Optional, Tuple
 10
 11import click
 12import yaml
 13from jinja2 import Environment, FileSystemLoader
 14
# Root of the generator package; used to resolve Jinja template folders
# (see render_template below).
GENERATOR_PATH = Path(__file__).parent.parent

# Maps BigQuery column types to Looker dimension types.
# Types absent from this map (e.g. RECORD, GEOGRAPHY) result in hidden
# dimensions (see _get_dimension).
BIGQUERY_TYPE_TO_DIMENSION_TYPE = {
    "BIGNUMERIC": "string",
    "BOOLEAN": "yesno",
    "BYTES": "string",
    "DATE": "time",
    "DATETIME": "time",
    "FLOAT": "number",
    "INTEGER": "number",
    "NUMERIC": "number",
    "STRING": "time",
    "TIME": "time",
    "TIMESTAMP": "time",
}

# Field paths (tuples of nested field names) whose dimensions are always
# emitted with `hidden: yes`.
HIDDEN_DIMENSIONS = {
    ("document_id",),
    ("client_id",),
    ("client_info", "client_id"),
    ("context_id",),
    ("additional_properties",),
}

# Field paths rendered with a Looker `map_layer_name` for geographic display.
MAP_LAYER_NAMES = {
    ("country",): "countries",
    ("metadata", "geo", "country"): "countries",
}

# Default Looker `suggest_persist_for` value applied to regular (non-group)
# dimensions.
DEFAULT_MAX_SUGGEST_PERSIST_FOR = "24 hours"

# Case-insensitive whole-word pattern of initialisms that slug_to_title()
# re-uppercases after str.title() has lowercased them (e.g. "Cpu" -> "CPU").
UPPERCASE_INITIALISMS_PATTERN = re.compile(
    (
        r"\b("
        + "|".join(
            [
                "CPU",
                "DB",
                "DNS",
                "DNT",
                "DOM",
                "GC",
                "GPU",
                "HTTP",
                "ID",
                "IO",
                "IP",
                "ISP",
                "JWE",
                "LB",
                "OS",
                "SDK",
                "SERP",
                "SSL",
                "TLS",
                "UI",
                "URI",
                "URL",
                "UTM",
                "UUID",
            ]
        )
        + r")\b"
    ),
    re.IGNORECASE,
)
 81
 82
def _get_dimension(
    path: Tuple[str, ...], field_type: str, mode: str, description: Optional[str]
) -> Dict[str, Any]:
    """Build a LookML dimension (or dimension group) definition for one field.

    :param path: tuple of nested field names leading to this column.
    :param field_type: BigQuery type name (e.g. "STRING", "TIMESTAMP").
    :param mode: BigQuery field mode; "REPEATED" fields are hidden.
    :param description: optional description copied into the result.
    :return: dict of LookML attributes; time-typed fields get dimension-group
        attributes (``timeframes`` etc.) instead of plain-dimension ones.
    """
    result: Dict[str, Any] = {}
    result["sql"] = "${TABLE}." + ".".join(path)
    name = path
    if (
        mode == "REPEATED"
        or path in HIDDEN_DIMENSIONS
        or field_type not in BIGQUERY_TYPE_TO_DIMENSION_TYPE
    ):
        # Repeated fields, explicitly hidden paths, and types without a
        # dimension mapping are emitted hidden (the SQL is still available).
        result["hidden"] = "yes"
    else:
        result["type"] = BIGQUERY_TYPE_TO_DIMENSION_TYPE[field_type]
        result["suggest_persist_for"] = DEFAULT_MAX_SUGGEST_PERSIST_FOR

        group_label, group_item_label = None, None
        if len(path) > 1:
            # Nested fields are grouped under their parent path in the UI.
            group_label = ": ".join(slug_to_title(slug) for slug in path[:-1])
            group_item_label = slug_to_title(path[-1])
        if result["type"] == "time":
            # Remove _{type} suffix from the last path element for dimension group
            # names For example submission_date and submission_timestamp become
            # submission, and metadata.header.parsed_date becomes
            # metadata__header__parsed. This is because the timeframe will add a _{type}
            # suffix to the individual dimension names.
            name = *path[:-1], re.sub("_(date|time(stamp)?)$", "", path[-1])
            result["timeframes"] = [
                "raw",
                "time",
                "date",
                "week",
                "month",
                "quarter",
                "year",
            ]
            if field_type == "DATE":
                # DATE columns have no intra-day component and need no
                # timezone conversion.
                result["timeframes"].remove("time")
                result["convert_tz"] = "no"
                result["datatype"] = "date"
            if group_label and group_item_label:
                # Dimension groups should not be nested, see issue #82
                result["label"] = f"{group_label}: {group_item_label}"
            # `suggest_persist_for` is not supported for dimension groups.
            del result["suggest_persist_for"]
        elif len(path) > 1:
            result["group_label"] = group_label
            result["group_item_label"] = group_item_label
        if path in MAP_LAYER_NAMES:
            result["map_layer_name"] = MAP_LAYER_NAMES[path]
    result["name"] = "__".join(name)

    if description:
        result["description"] = description

    return result
139
140
def _generate_dimensions_helper(schema: List[Any], *prefix: str) -> Iterable[dict]:
    """Yield a dimension dict for each leaf field in the schema.

    Non-repeated RECORD fields are flattened recursively, accumulating the
    nested field names in *prefix*; repeated records and scalar fields are
    emitted as single dimensions via _get_dimension. Fields are visited in
    name order so the output is deterministic.
    """
    for field in sorted(schema, key=lambda f: f["name"]):
        # Use `!=` rather than the original `not ... == ...` for clarity.
        if field["type"] == "RECORD" and field.get("mode", "") != "REPEATED":
            yield from _generate_dimensions_helper(
                field["fields"], *prefix, field["name"]
            )
        else:
            yield _get_dimension(
                (*prefix, field["name"]),
                field["type"],
                field.get("mode", ""),
                field.get("description", ""),
            )
154
155
156def _remove_conflicting_dimension_group_timeframes(
157    dimensions: list[dict[str, Any]]
158) -> None:
159    """Remove timeframes from dimension groups if the resulting dimension would conflict with an existing dimension."""
160    dimension_names = set(dimension["name"] for dimension in dimensions)
161    for dimension in dimensions:
162        if "timeframes" in dimension:
163            dimension["timeframes"] = [
164                timeframe
165                for timeframe in dimension["timeframes"]
166                if f"{dimension['name']}_{timeframe}" not in dimension_names
167            ]
168
169
def _generate_dimensions(table: str, dryrun) -> List[Dict[str, Any]]:
    """Generate dimensions and dimension groups from a bigquery table.

    When schema contains both submission_timestamp and submission_date, only produce
    a dimension group for submission_timestamp.

    Raise ClickException if schema results in duplicate dimensions.
    """
    project, dataset, table = table.split(".")
    table_schema = dryrun.create(
        project=project,
        dataset=dataset,
        table=table,
    ).get_table_schema()

    by_key: Dict[str, Dict[str, Any]] = {}
    for dimension in _generate_dimensions_helper(table_schema):
        name = dimension["name"]
        is_time_group = dimension.get("type") == "time"

        # Suffix `time` dimension groups so they can't clobber plain
        # dimensions of the same name.
        name_key = f"{name}_time" if is_time_group else name

        # Duplicate "submission"/"*end"/"*start" time groups are allowed to
        # overwrite: the last value sorted by field name wins, which is
        # submission_timestamp.
        # See also https://github.com/mozilla/lookml-generator/issues/471
        overwrite_ok = is_time_group and (
            name == "submission" or name.endswith(("end", "start"))
        )
        if name_key in by_key and not overwrite_ok:
            raise click.ClickException(
                f"duplicate dimension {name_key!r} for table {table!r}"
            )
        by_key[name_key] = dimension

    result = list(by_key.values())
    _remove_conflicting_dimension_group_timeframes(result)
    return result
211
212
def _generate_dimensions_from_query(query: str, dryrun) -> List[Dict[str, Any]]:
    """Generate dimensions and dimension groups from a SQL query."""
    schema = dryrun.create(sql=query).get_schema()
    collected: Dict[str, Dict[str, Any]] = {}
    for dimension in _generate_dimensions_helper(schema or []):
        name = dimension["name"]
        is_time_group = dimension.get("type") == "time"

        # Suffix `time` dimension groups so they can't clobber plain
        # dimensions of the same name.
        name_key = name + "_time" if is_time_group else name

        # Duplicate "submission"/"*end"/"*start" time groups may overwrite
        # earlier entries, thus picking the last value sorted by field name
        # (submission_timestamp over submission_date).
        # See also https://github.com/mozilla/lookml-generator/issues/471
        overwrite_ok = is_time_group and (
            name == "submission" or name.endswith(("end", "start"))
        )
        if name_key in collected and not overwrite_ok:
            raise click.ClickException(f"duplicate dimension {name_key!r} in query")
        collected[name_key] = dimension

    result = list(collected.values())
    _remove_conflicting_dimension_group_timeframes(result)
    return result
241
242
def _generate_nested_dimension_views(
    schema: List[dict], view_name: str
) -> List[Dict[str, Any]]:
    """
    Recursively generate views for nested fields.

    Nested fields are created as views, with dimensions and optionally measures.

    :param schema: list of BigQuery field dicts to scan for RECORD fields.
    :param view_name: name of the parent view; nested views are named
        ``{view_name}__{field_name}``.
    """
    views: List[Dict[str, Any]] = []
    for field in sorted(schema, key=lambda f: f["name"]):
        # labeled_counter is handled explicitly in glean ping views; hidden
        # for other views
        if field["type"] != "RECORD" or field["name"] == "labeled_counter":
            continue
        nested_view_name = f"{view_name}__{field['name']}"
        if field.get("mode") == "REPEATED":
            # Repeated records become their own view, with dimensions split
            # out from dimension groups.
            nested_field_view: Dict[str, Any] = {"name": nested_view_name}
            dimensions = list(_generate_dimensions_helper(schema=field["fields"]))
            _remove_conflicting_dimension_group_timeframes(dimensions)
            nested_field_view["dimensions"] = [
                d for d in dimensions if not _is_dimension_group(d)
            ]
            nested_field_view["dimension_groups"] = [
                d for d in dimensions if _is_dimension_group(d)
            ]
            views.append(nested_field_view)
        # Non-repeated records don't get a view of their own, but their
        # children may contain further repeated records. Use append/extend
        # instead of repeated list concatenation, which was O(n^2).
        views.extend(
            _generate_nested_dimension_views(field["fields"], nested_view_name)
        )

    return views
280
281
282def _is_dimension_group(dimension: dict):
283    """Determine if a dimension is actually a dimension group."""
284    return "timeframes" in dimension or "intervals" in dimension
285
286
def escape_filter_expr(expr: str) -> str:
    """Escape filter expression for special Looker chars."""
    # Prefix a leading "-" and any of " _ % , ^ with Looker's escape char "^".
    special_chars = re.compile(r'((?:^-)|["_%,^])')
    return special_chars.sub(r"^\1", expr)
290
291
292def _is_nested_dimension(dimension: dict):
293    return (
294        "hidden" in dimension
295        and dimension["hidden"]
296        and "nested" in dimension
297        and dimension["nested"]
298    )
299
300
def render_template(filename, template_folder, **kwargs) -> str:
    """Render a given template using Jinja."""
    loader = FileSystemLoader(GENERATOR_PATH / f"{template_folder}/templates")
    return Environment(loader=loader).get_template(filename).render(**kwargs)
309
310
def slug_to_title(slug):
    """Convert a slug to title case, with some common initialisms uppercased."""
    titled = slug.replace("_", " ").title()
    # str.title() lowercases initialisms like "CPU" -> "Cpu"; restore them.
    return UPPERCASE_INITIALISMS_PATTERN.sub(lambda m: m.group(0).upper(), titled)
316
317
# Map from view to qualified references, shaped as
# {dataset: {view: [[project, dataset, table], ...]}}.
BQViewReferenceMap = Dict[str, Dict[str, List[List[str]]]]
320
321
def get_bigquery_view_reference_map(
    generated_sql_uri: str,
) -> BQViewReferenceMap:
    """Get a mapping from BigQuery datasets to views with references."""
    # Download the generated-sql tarball fully into memory before parsing.
    with urllib.request.urlopen(generated_sql_uri) as response:
        archive = BytesIO(response.read())
    view_map: BQViewReferenceMap = defaultdict(dict)
    with tarfile.open(fileobj=archive, mode="r:gz") as tar:
        for member in tar:
            if not member.name.endswith("/metadata.yaml"):
                continue
            metadata = yaml.safe_load(tar.extractfile(member.name))  # type: ignore
            references = metadata.get("references", {})
            if "view.sql" not in references:
                continue
            # Path layout: .../<project>/<dataset>/<view>/metadata.yaml
            *_, project, dataset_id, view_id, _ = member.name.split("/")
            if project == "moz-fx-data-shared-prod":
                view_map[dataset_id][view_id] = [
                    ref.split(".") for ref in references["view.sql"]
                ]
    return view_map
GENERATOR_PATH = PosixPath('/home/runner/work/lookml-generator/lookml-generator/generator')
BIGQUERY_TYPE_TO_DIMENSION_TYPE = {'BIGNUMERIC': 'string', 'BOOLEAN': 'yesno', 'BYTES': 'string', 'DATE': 'time', 'DATETIME': 'time', 'FLOAT': 'number', 'INTEGER': 'number', 'NUMERIC': 'number', 'STRING': 'string', 'TIME': 'time', 'TIMESTAMP': 'time'}
HIDDEN_DIMENSIONS = {('client_id',), ('context_id',), ('client_info', 'client_id'), ('additional_properties',), ('document_id',)}
MAP_LAYER_NAMES = {('country',): 'countries', ('metadata', 'geo', 'country'): 'countries'}
DEFAULT_MAX_SUGGEST_PERSIST_FOR = '24 hours'
UPPERCASE_INITIALISMS_PATTERN = re.compile('\\b(CPU|DB|DNS|DNT|DOM|GC|GPU|HTTP|ID|IO|IP|ISP|JWE|LB|OS|SDK|SERP|SSL|TLS|UI|URI|URL|UTM|UUID)\\b', re.IGNORECASE)
def escape_filter_expr(expr: str) -> str:
288def escape_filter_expr(expr: str) -> str:
289    """Escape filter expression for special Looker chars."""
290    return re.sub(r'((?:^-)|["_%,^])', r"^\1", expr, count=0)

Escape filter expression for special Looker chars.

def render_template(filename, template_folder, **kwargs) -> str:
302def render_template(filename, template_folder, **kwargs) -> str:
303    """Render a given template using Jinja."""
304    env = Environment(
305        loader=FileSystemLoader(GENERATOR_PATH / f"{template_folder}/templates")
306    )
307    template = env.get_template(filename)
308    rendered = template.render(**kwargs)
309    return rendered

Render a given template using Jinja.

def slug_to_title(slug):
312def slug_to_title(slug):
313    """Convert a slug to title case, with some common initialisms uppercased."""
314    return UPPERCASE_INITIALISMS_PATTERN.sub(
315        lambda match: match.group(0).upper(), slug.replace("_", " ").title()
316    )

Convert a slug to title case, with some common initialisms uppercased.

BQViewReferenceMap = typing.Dict[str, typing.Dict[str, typing.List[typing.List[str]]]]
def get_bigquery_view_reference_map(generated_sql_uri: str) -> Dict[str, Dict[str, List[List[str]]]]:
323def get_bigquery_view_reference_map(
324    generated_sql_uri: str,
325) -> BQViewReferenceMap:
326    """Get a mapping from BigQuery datasets to views with references."""
327    with urllib.request.urlopen(generated_sql_uri) as f:
328        tarbytes = BytesIO(f.read())
329    views: BQViewReferenceMap = defaultdict(dict)
330    with tarfile.open(fileobj=tarbytes, mode="r:gz") as tar:
331        for tarinfo in tar:
332            if tarinfo.name.endswith("/metadata.yaml"):
333                metadata = yaml.safe_load(tar.extractfile(tarinfo.name))  # type: ignore
334                references = metadata.get("references", {})
335                if "view.sql" not in references:
336                    continue
337                *_, project, dataset_id, view_id, _ = tarinfo.name.split("/")
338                if project == "moz-fx-data-shared-prod":
339                    views[dataset_id][view_id] = [
340                        ref.split(".") for ref in references["view.sql"]
341                    ]
342    return views

Get a mapping from BigQuery datasets to views with references.