generator.views.lookml_utils

Utils for generating lookml.

  1"""Utils for generating lookml."""
  2
  3import re
  4import tarfile
  5import urllib.request
  6from collections import defaultdict
  7from io import BytesIO
  8from pathlib import Path
  9from typing import Any, Dict, Iterable, List, Optional, Tuple
 10
 11import click
 12import yaml
 13from jinja2 import Environment, FileSystemLoader
 14
# Root of the generator package; used to locate Jinja template directories.
GENERATOR_PATH = Path(__file__).parent.parent

# Maps BigQuery column types to Looker dimension types.
# Fields mapped to "time" are emitted as dimension groups rather than plain
# dimensions (see _get_dimension).
BIGQUERY_TYPE_TO_DIMENSION_TYPE = {
    "BIGNUMERIC": "string",
    "BOOLEAN": "yesno",
    "BYTES": "string",
    "DATE": "time",
    "DATETIME": "time",
    "FLOAT": "number",
    "INTEGER": "number",
    "NUMERIC": "number",
    "STRING": "string",
    "TIME": "time",
    "TIMESTAMP": "time",
}

# Field paths (tuples of path components) whose dimensions are always hidden.
HIDDEN_DIMENSIONS = {
    ("document_id",),
    ("client_id",),
    ("client_info", "client_id"),
    ("context_id",),
    ("additional_properties",),
}

# Field paths that get a Looker map_layer_name so they render on map charts.
MAP_LAYER_NAMES = {
    ("country",): "countries",
    ("metadata", "geo", "country"): "countries",
}
 43
 44
 45def _get_dimension(
 46    path: Tuple[str, ...], field_type: str, mode: str, description: Optional[str]
 47) -> Dict[str, Any]:
 48    result: Dict[str, Any] = {}
 49    result["sql"] = "${TABLE}." + ".".join(path)
 50    name = path
 51    if (
 52        mode == "REPEATED"
 53        or path in HIDDEN_DIMENSIONS
 54        or field_type not in BIGQUERY_TYPE_TO_DIMENSION_TYPE
 55    ):
 56        result["hidden"] = "yes"
 57    else:
 58        result["type"] = BIGQUERY_TYPE_TO_DIMENSION_TYPE[field_type]
 59
 60        group_label, group_item_label = None, None
 61        if len(path) > 1:
 62            group_label = slug_to_title(" ".join(path[:-1]))
 63            group_item_label = slug_to_title(path[-1])
 64        if result["type"] == "time":
 65            # Remove _{type} suffix from the last path element for dimension group
 66            # names For example submission_date and submission_timestamp become
 67            # submission, and metadata.header.parsed_date becomes
 68            # metadata__header__parsed. This is because the timeframe will add a _{type}
 69            # suffix to the individual dimension names.
 70            name = *path[:-1], re.sub("_(date|time(stamp)?)$", "", path[-1])
 71            result["timeframes"] = [
 72                "raw",
 73                "time",
 74                "date",
 75                "week",
 76                "month",
 77                "quarter",
 78                "year",
 79            ]
 80            if field_type == "DATE":
 81                result["timeframes"].remove("time")
 82                result["convert_tz"] = "no"
 83                result["datatype"] = "date"
 84            if group_label and group_item_label:
 85                # Dimension groups should not be nested, see issue #82
 86                result["label"] = f"{group_label}: {group_item_label}"
 87        elif len(path) > 1:
 88            result["group_label"] = group_label
 89            result["group_item_label"] = group_item_label
 90        if path in MAP_LAYER_NAMES:
 91            result["map_layer_name"] = MAP_LAYER_NAMES[path]
 92    result["name"] = "__".join(name)
 93
 94    if description:
 95        result["description"] = description
 96
 97    return result
 98
 99
def _generate_dimensions_helper(schema: List[Any], *prefix: str) -> Iterable[dict]:
    """Yield a dimension dict for every leaf field in *schema*, sorted by name.

    Non-repeated RECORD fields are flattened recursively, accumulating the
    field names traversed so far in ``prefix``.
    """
    for field in sorted(schema, key=lambda f: f["name"]):
        mode = field.get("mode", "")
        if field["type"] == "RECORD" and mode != "REPEATED":
            # Recurse into nested structs; repeated records stay as leaves.
            yield from _generate_dimensions_helper(
                field["fields"], *prefix, field["name"]
            )
        else:
            yield _get_dimension(
                (*prefix, field["name"]),
                field["type"],
                mode,
                field.get("description", ""),
            )
113
114
def _generate_dimensions(table: str, dryrun) -> List[Dict[str, Any]]:
    """Generate dimensions and dimension groups from a bigquery table.

    When schema contains both submission_timestamp and submission_date, only produce
    a dimension group for submission_timestamp.

    Raise ClickException if schema results in duplicate dimensions.
    """
    [project, dataset, table] = table.split(".")
    table_schema = dryrun.create(
        project=project,
        dataset=dataset,
        table=table,
    ).get_table_schema()

    dimensions: Dict[str, Dict[str, Any]] = {}
    for dimension in _generate_dimensions_helper(table_schema):
        name_key = dimension["name"]
        is_time_group = dimension.get("type") == "time"

        # Suffix time dimension groups so they cannot clobber plain dimensions.
        if is_time_group:
            name_key += "_time"

        # A duplicate key is tolerated only for "submission"/"...end"/"...start"
        # time groups: the last field sorted by name (submission_timestamp) wins.
        # See also https://github.com/mozilla/lookml-generator/issues/471
        may_overwrite = is_time_group and (
            dimension["name"] == "submission"
            or dimension["name"].endswith(("end", "start"))
        )
        if name_key in dimensions and not may_overwrite:
            raise click.ClickException(
                f"duplicate dimension {name_key!r} for table {table!r}"
            )
        dimensions[name_key] = dimension
    return list(dimensions.values())
153
154
def _generate_dimensions_from_query(query: str, dryrun) -> List[Dict[str, Any]]:
    """Generate dimensions and dimension groups from a SQL query."""
    schema = dryrun.create(sql=query).get_schema()
    dimensions: Dict[str, Dict[str, Any]] = {}
    for dimension in _generate_dimensions_helper(schema or []):
        name_key = dimension["name"]
        is_time_group = dimension.get("type") == "time"

        # Suffix time dimension groups so they cannot clobber plain dimensions.
        if is_time_group:
            name_key += "_time"

        # A duplicate key is tolerated only for "submission"/"...end"/"...start"
        # time groups: the last field sorted by name (submission_timestamp) wins.
        # See also https://github.com/mozilla/lookml-generator/issues/471
        may_overwrite = is_time_group and (
            dimension["name"] == "submission"
            or dimension["name"].endswith(("end", "start"))
        )
        if name_key in dimensions and not may_overwrite:
            raise click.ClickException(f"duplicate dimension {name_key!r} in query")
        dimensions[name_key] = dimension
    return list(dimensions.values())
180
181
def _generate_nested_dimension_views(
    schema: List[dict], view_name: str
) -> List[Dict[str, Any]]:
    """
    Recursively generate views for nested fields.

    Nested fields are created as views, with dimensions and optionally measures.
    """
    views: List[Dict[str, Any]] = []
    for field in sorted(schema, key=lambda f: f["name"]):
        if field["type"] == "RECORD" and field["name"] != "labeled_counter":
            # labeled_counter is handled explicitly in glean ping views; hidden for other views
            nested_view_name = f"{view_name}__{field['name']}"
            if field.get("mode") == "REPEATED":
                # Materialize the generator: it is consumed by two
                # comprehensions below, and a bare generator would be exhausted
                # after the first one, leaving dimension_groups always empty.
                dimensions = list(
                    _generate_dimensions_helper(schema=field["fields"])
                )
                nested_field_view: Dict[str, Any] = {"name": nested_view_name}
                nested_field_view["dimensions"] = [
                    d for d in dimensions if not _is_dimension_group(d)
                ]
                nested_field_view["dimension_groups"] = [
                    d for d in dimensions if _is_dimension_group(d)
                ]
                views = (
                    views
                    + [nested_field_view]
                    + _generate_nested_dimension_views(
                        field["fields"], nested_view_name
                    )
                )
            else:
                views = views + _generate_nested_dimension_views(
                    field["fields"], nested_view_name
                )

    return views
218
219
220def _is_dimension_group(dimension: dict):
221    """Determine if a dimension is actually a dimension group."""
222    return "timeframes" in dimension or "intervals" in dimension
223
224
def escape_filter_expr(expr: str) -> str:
    """Escape filter expression for special Looker chars."""
    # Prefix a caret to a leading "-" and to each of " _ % , ^ anywhere.
    special = re.compile(r'((?:^-)|["_%,^])')
    return special.sub(r"^\1", expr)
228
229
230def _is_nested_dimension(dimension: dict):
231    return (
232        "hidden" in dimension
233        and dimension["hidden"]
234        and "nested" in dimension
235        and dimension["nested"]
236    )
237
238
def render_template(filename, template_folder, **kwargs) -> str:
    """Render a given template using Jinja."""
    templates_dir = GENERATOR_PATH / f"{template_folder}/templates"
    environment = Environment(loader=FileSystemLoader(templates_dir))
    return environment.get_template(filename).render(**kwargs)
247
248
def slug_to_title(slug):
    """Convert a slug to title case."""
    return " ".join(slug.split("_")).title()
252
253
# Map from view to qualified references {dataset: {view: [[project, dataset, table],]}}
# Each reference is a BigQuery identifier split on "." into its components.
BQViewReferenceMap = Dict[str, Dict[str, List[List[str]]]]
256
257
def get_bigquery_view_reference_map(
    generated_sql_uri: str,
) -> BQViewReferenceMap:
    """Get a mapping from BigQuery datasets to views with references."""
    # Download the generated-sql tarball fully into memory before parsing.
    with urllib.request.urlopen(generated_sql_uri) as response:
        archive = BytesIO(response.read())

    views: BQViewReferenceMap = defaultdict(dict)
    with tarfile.open(fileobj=archive, mode="r:gz") as tar:
        for member in tar:
            if not member.name.endswith("/metadata.yaml"):
                continue
            metadata = yaml.safe_load(tar.extractfile(member.name))  # type: ignore
            references = metadata.get("references", {})
            if "view.sql" not in references:
                continue
            # Path layout: .../<project>/<dataset>/<view>/metadata.yaml
            *_, project, dataset_id, view_id, _ = member.name.split("/")
            if project == "moz-fx-data-shared-prod":
                views[dataset_id][view_id] = [
                    ref.split(".") for ref in references["view.sql"]
                ]
    return views
GENERATOR_PATH = PosixPath('/root/project/generator')
BIGQUERY_TYPE_TO_DIMENSION_TYPE = {'BIGNUMERIC': 'string', 'BOOLEAN': 'yesno', 'BYTES': 'string', 'DATE': 'time', 'DATETIME': 'time', 'FLOAT': 'number', 'INTEGER': 'number', 'NUMERIC': 'number', 'STRING': 'string', 'TIME': 'time', 'TIMESTAMP': 'time'}
HIDDEN_DIMENSIONS = {('client_id',), ('client_info', 'client_id'), ('document_id',), ('additional_properties',), ('context_id',)}
MAP_LAYER_NAMES = {('country',): 'countries', ('metadata', 'geo', 'country'): 'countries'}
def escape_filter_expr(expr: str) -> str:
226def escape_filter_expr(expr: str) -> str:
227    """Escape filter expression for special Looker chars."""
228    return re.sub(r'((?:^-)|["_%,^])', r"^\1", expr, count=0)

Escape filter expression for special Looker chars.

def render_template(filename, template_folder, **kwargs) -> str:
240def render_template(filename, template_folder, **kwargs) -> str:
241    """Render a given template using Jinja."""
242    env = Environment(
243        loader=FileSystemLoader(GENERATOR_PATH / f"{template_folder}/templates")
244    )
245    template = env.get_template(filename)
246    rendered = template.render(**kwargs)
247    return rendered

Render a given template using Jinja.

def slug_to_title(slug):
250def slug_to_title(slug):
251    """Convert a slug to title case."""
252    return slug.replace("_", " ").title()

Convert a slug to title case.

BQViewReferenceMap = typing.Dict[str, typing.Dict[str, typing.List[typing.List[str]]]]
def get_bigquery_view_reference_map(generated_sql_uri: str) -> Dict[str, Dict[str, List[List[str]]]]:
259def get_bigquery_view_reference_map(
260    generated_sql_uri: str,
261) -> BQViewReferenceMap:
262    """Get a mapping from BigQuery datasets to views with references."""
263    with urllib.request.urlopen(generated_sql_uri) as f:
264        tarbytes = BytesIO(f.read())
265    views: BQViewReferenceMap = defaultdict(dict)
266    with tarfile.open(fileobj=tarbytes, mode="r:gz") as tar:
267        for tarinfo in tar:
268            if tarinfo.name.endswith("/metadata.yaml"):
269                metadata = yaml.safe_load(tar.extractfile(tarinfo.name))  # type: ignore
270                references = metadata.get("references", {})
271                if "view.sql" not in references:
272                    continue
273                *_, project, dataset_id, view_id, _ = tarinfo.name.split("/")
274                if project == "moz-fx-data-shared-prod":
275                    views[dataset_id][view_id] = [
276                        ref.split(".") for ref in references["view.sql"]
277                    ]
278    return views

Get a mapping from BigQuery datasets to views with references.