generator.views.lookml_utils

Utils for generating LookML.

  1"""Utils for generating lookml."""
  2
  3import re
  4import tarfile
  5import urllib.request
  6from collections import defaultdict
  7from io import BytesIO
  8from pathlib import Path
  9from typing import Any, Dict, Iterable, List, Optional, Tuple
 10
 11import click
 12import yaml
 13from jinja2 import Environment, FileSystemLoader
 14
# Root of the generator package; used to locate Jinja template folders.
GENERATOR_PATH = Path(__file__).parent.parent

# BigQuery column types mapped to the Looker dimension type used when
# generating views. Types missing from this map produce hidden dimensions
# (see _get_dimension).
BIGQUERY_TYPE_TO_DIMENSION_TYPE = {
    "BIGNUMERIC": "string",
    "BOOLEAN": "yesno",
    "BYTES": "string",
    "DATE": "time",
    "DATETIME": "time",
    "FLOAT": "number",
    "INTEGER": "number",
    "NUMERIC": "number",
    "STRING": "time",
    "TIME": "time",
    "TIMESTAMP": "time",
}

# Schema field paths (tuples of nested field names) whose dimensions are
# always generated as hidden.
HIDDEN_DIMENSIONS = {
    ("document_id",),
    ("client_id",),
    ("client_info", "client_id"),
    ("context_id",),
    ("additional_properties",),
}

# Field paths that are rendered with a named Looker map layer.
MAP_LAYER_NAMES = {
    ("country",): "countries",
    ("metadata", "geo", "country"): "countries",
}

# Default value for the suggest_persist_for parameter on generated dimensions.
DEFAULT_MAX_SUGGEST_PERSIST_FOR = "24 hours"
 45
 46
 47def _get_dimension(
 48    path: Tuple[str, ...], field_type: str, mode: str, description: Optional[str]
 49) -> Dict[str, Any]:
 50    result: Dict[str, Any] = {}
 51    result["sql"] = "${TABLE}." + ".".join(path)
 52    name = path
 53    if (
 54        mode == "REPEATED"
 55        or path in HIDDEN_DIMENSIONS
 56        or field_type not in BIGQUERY_TYPE_TO_DIMENSION_TYPE
 57    ):
 58        result["hidden"] = "yes"
 59    else:
 60        result["type"] = BIGQUERY_TYPE_TO_DIMENSION_TYPE[field_type]
 61        result["suggest_persist_for"] = DEFAULT_MAX_SUGGEST_PERSIST_FOR
 62
 63        group_label, group_item_label = None, None
 64        if len(path) > 1:
 65            group_label = slug_to_title(" ".join(path[:-1]))
 66            group_item_label = slug_to_title(path[-1])
 67        if result["type"] == "time":
 68            # Remove _{type} suffix from the last path element for dimension group
 69            # names For example submission_date and submission_timestamp become
 70            # submission, and metadata.header.parsed_date becomes
 71            # metadata__header__parsed. This is because the timeframe will add a _{type}
 72            # suffix to the individual dimension names.
 73            name = *path[:-1], re.sub("_(date|time(stamp)?)$", "", path[-1])
 74            result["timeframes"] = [
 75                "raw",
 76                "time",
 77                "date",
 78                "week",
 79                "month",
 80                "quarter",
 81                "year",
 82            ]
 83            if field_type == "DATE":
 84                result["timeframes"].remove("time")
 85                result["convert_tz"] = "no"
 86                result["datatype"] = "date"
 87            if group_label and group_item_label:
 88                # Dimension groups should not be nested, see issue #82
 89                result["label"] = f"{group_label}: {group_item_label}"
 90        elif len(path) > 1:
 91            result["group_label"] = group_label
 92            result["group_item_label"] = group_item_label
 93        if path in MAP_LAYER_NAMES:
 94            result["map_layer_name"] = MAP_LAYER_NAMES[path]
 95    result["name"] = "__".join(name)
 96
 97    if description:
 98        result["description"] = description
 99
100    return result
101
102
def _generate_dimensions_helper(schema: List[Any], *prefix: str) -> Iterable[dict]:
    """Yield a dimension definition for every leaf field in ``schema``.

    Non-repeated RECORD fields are flattened by recursing with the record
    name appended to ``prefix``; every other field becomes one dimension.
    Fields are visited in name order.
    """
    for field in sorted(schema, key=lambda f: f["name"]):
        path = (*prefix, field["name"])
        mode = field.get("mode", "")
        if field["type"] == "RECORD" and mode != "REPEATED":
            yield from _generate_dimensions_helper(field["fields"], *path)
        else:
            yield _get_dimension(
                path, field["type"], mode, field.get("description", "")
            )
116
117
def _generate_dimensions(table: str, dryrun) -> List[Dict[str, Any]]:
    """Generate dimensions and dimension groups from a bigquery table.

    When schema contains both submission_timestamp and submission_date, only produce
    a dimension group for submission_timestamp.

    Raise ClickException if schema results in duplicate dimensions.
    """
    project, dataset, table_id = table.split(".")
    table_schema = dryrun.create(
        project=project,
        dataset=dataset,
        table=table_id,
    ).get_table_schema()

    dimensions: Dict[str, Dict[str, Any]] = {}
    for dimension in _generate_dimensions_helper(table_schema):
        name_key = dimension["name"]
        is_time_group = dimension.get("type") == "time"

        # This prevents `time` dimension groups from overwriting other dimensions below
        if is_time_group:
            name_key += "_time"

        # overwrite duplicate "submission", "end", "start" dimension group, thus picking the
        # last value sorted by field name, which is submission_timestamp
        # See also https://github.com/mozilla/lookml-generator/issues/471
        overwrite_allowed = is_time_group and (
            dimension["name"] == "submission"
            or dimension["name"].endswith("end")
            or dimension["name"].endswith("start")
        )
        if name_key in dimensions and not overwrite_allowed:
            raise click.ClickException(
                f"duplicate dimension {name_key!r} for table {table_id!r}"
            )
        dimensions[name_key] = dimension
    return list(dimensions.values())
156
157
def _generate_dimensions_from_query(query: str, dryrun) -> List[Dict[str, Any]]:
    """Generate dimensions and dimension groups from a SQL query."""
    schema = dryrun.create(sql=query).get_schema()
    dimensions: Dict[str, Dict[str, Any]] = {}
    for dimension in _generate_dimensions_helper(schema or []):
        name_key = dimension["name"]
        is_time_group = dimension.get("type") == "time"

        # This prevents `time` dimension groups from overwriting other dimensions below
        if is_time_group:
            name_key += "_time"

        # overwrite duplicate "submission", "end", "start" dimension group, thus picking the
        # last value sorted by field name, which is submission_timestamp
        # See also https://github.com/mozilla/lookml-generator/issues/471
        overwrite_allowed = is_time_group and (
            dimension["name"] == "submission"
            or dimension["name"].endswith("end")
            or dimension["name"].endswith("start")
        )
        if name_key in dimensions and not overwrite_allowed:
            raise click.ClickException(f"duplicate dimension {name_key!r} in query")
        dimensions[name_key] = dimension
    return list(dimensions.values())
183
184
def _generate_nested_dimension_views(
    schema: List[dict], view_name: str
) -> List[Dict[str, Any]]:
    """
    Recursively generate views for nested fields.

    Nested fields are created as views, with dimensions and optionally measures.
    """
    views: List[Dict[str, Any]] = []
    for field in sorted(schema, key=lambda f: f["name"]):
        if field["type"] == "RECORD" and field["name"] != "labeled_counter":
            # labeled_counter is handled explicitly in glean ping views; hidden for other views
            if field.get("mode") == "REPEATED":
                nested_field_view: Dict[str, Any] = {
                    "name": f"{view_name}__{field['name']}"
                }
                # Materialize the generator: it is consumed by TWO
                # comprehensions below. A bare generator would be exhausted
                # by the first one, leaving "dimension_groups" always empty.
                dimensions = list(
                    _generate_dimensions_helper(schema=field["fields"])
                )
                nested_field_view["dimensions"] = [
                    d for d in dimensions if not _is_dimension_group(d)
                ]
                nested_field_view["dimension_groups"] = [
                    d for d in dimensions if _is_dimension_group(d)
                ]
                views = (
                    views
                    + [nested_field_view]
                    + _generate_nested_dimension_views(
                        field["fields"], f"{view_name}__{field['name']}"
                    )
                )
            else:
                views = views + _generate_nested_dimension_views(
                    field["fields"], f"{view_name}__{field['name']}"
                )

    return views
221
222
223def _is_dimension_group(dimension: dict):
224    """Determine if a dimension is actually a dimension group."""
225    return "timeframes" in dimension or "intervals" in dimension
226
227
def escape_filter_expr(expr: str) -> str:
    """Escape filter expression for special Looker chars."""
    # A leading "-" (negation) and Looker's special characters
    # (double quote, underscore, percent, comma, caret) get a "^" prefix.
    special = re.compile(r'((?:^-)|["_%,^])')
    return special.sub(lambda match: "^" + match.group(1), expr)
231
232
233def _is_nested_dimension(dimension: dict):
234    return (
235        "hidden" in dimension
236        and dimension["hidden"]
237        and "nested" in dimension
238        and dimension["nested"]
239    )
240
241
def render_template(filename, template_folder, **kwargs) -> str:
    """Render a given template using Jinja."""
    # Templates live under <generator root>/<template_folder>/templates.
    templates_dir = GENERATOR_PATH / f"{template_folder}/templates"
    environment = Environment(loader=FileSystemLoader(templates_dir))
    return environment.get_template(filename).render(**kwargs)
250
251
def slug_to_title(slug):
    """Convert a slug to title case."""
    # Turn underscores into spaces, then capitalize each word.
    return " ".join(slug.split("_")).title()
255
256
# Map from view to qualified references {dataset: {view: [[project, dataset, table],]}}
# Each inner list is one dotted reference split into its components.
BQViewReferenceMap = Dict[str, Dict[str, List[List[str]]]]
259
260
def get_bigquery_view_reference_map(
    generated_sql_uri: str,
) -> BQViewReferenceMap:
    """Get a mapping from BigQuery datasets to views with references.

    Downloads the generated-sql tarball from ``generated_sql_uri`` and reads
    each metadata.yaml, keeping only views in moz-fx-data-shared-prod that
    declare "view.sql" references.
    """
    with urllib.request.urlopen(generated_sql_uri) as response:
        archive_bytes = BytesIO(response.read())

    view_references: BQViewReferenceMap = defaultdict(dict)
    with tarfile.open(fileobj=archive_bytes, mode="r:gz") as tar:
        for member in tar:
            if not member.name.endswith("/metadata.yaml"):
                continue
            metadata = yaml.safe_load(tar.extractfile(member.name))  # type: ignore
            references = metadata.get("references", {})
            if "view.sql" not in references:
                continue
            # Member path layout: .../<project>/<dataset>/<view>/metadata.yaml
            *_, project, dataset_id, view_id, _ = member.name.split("/")
            if project == "moz-fx-data-shared-prod":
                view_references[dataset_id][view_id] = [
                    ref.split(".") for ref in references["view.sql"]
                ]
    return view_references
GENERATOR_PATH = PosixPath('/root/project/generator')
BIGQUERY_TYPE_TO_DIMENSION_TYPE = {'BIGNUMERIC': 'string', 'BOOLEAN': 'yesno', 'BYTES': 'string', 'DATE': 'time', 'DATETIME': 'time', 'FLOAT': 'number', 'INTEGER': 'number', 'NUMERIC': 'number', 'STRING': 'string', 'TIME': 'time', 'TIMESTAMP': 'time'}
HIDDEN_DIMENSIONS = {('additional_properties',), ('client_id',), ('client_info', 'client_id'), ('document_id',), ('context_id',)}
MAP_LAYER_NAMES = {('country',): 'countries', ('metadata', 'geo', 'country'): 'countries'}
DEFAULT_MAX_SUGGEST_PERSIST_FOR = '24 hours'
def escape_filter_expr(expr: str) -> str:
229def escape_filter_expr(expr: str) -> str:
230    """Escape filter expression for special Looker chars."""
231    return re.sub(r'((?:^-)|["_%,^])', r"^\1", expr, count=0)

Escape filter expression for special Looker chars.

def render_template(filename, template_folder, **kwargs) -> str:
243def render_template(filename, template_folder, **kwargs) -> str:
244    """Render a given template using Jinja."""
245    env = Environment(
246        loader=FileSystemLoader(GENERATOR_PATH / f"{template_folder}/templates")
247    )
248    template = env.get_template(filename)
249    rendered = template.render(**kwargs)
250    return rendered

Render a given template using Jinja.

def slug_to_title(slug):
253def slug_to_title(slug):
254    """Convert a slug to title case."""
255    return slug.replace("_", " ").title()

Convert a slug to title case.

BQViewReferenceMap = typing.Dict[str, typing.Dict[str, typing.List[typing.List[str]]]]
def get_bigquery_view_reference_map(generated_sql_uri: str) -> Dict[str, Dict[str, List[List[str]]]]:
262def get_bigquery_view_reference_map(
263    generated_sql_uri: str,
264) -> BQViewReferenceMap:
265    """Get a mapping from BigQuery datasets to views with references."""
266    with urllib.request.urlopen(generated_sql_uri) as f:
267        tarbytes = BytesIO(f.read())
268    views: BQViewReferenceMap = defaultdict(dict)
269    with tarfile.open(fileobj=tarbytes, mode="r:gz") as tar:
270        for tarinfo in tar:
271            if tarinfo.name.endswith("/metadata.yaml"):
272                metadata = yaml.safe_load(tar.extractfile(tarinfo.name))  # type: ignore
273                references = metadata.get("references", {})
274                if "view.sql" not in references:
275                    continue
276                *_, project, dataset_id, view_id, _ = tarinfo.name.split("/")
277                if project == "moz-fx-data-shared-prod":
278                    views[dataset_id][view_id] = [
279                        ref.split(".") for ref in references["view.sql"]
280                    ]
281    return views

Get a mapping from BigQuery datasets to views with references.