generator.views.lookml_utils

Utilities for generating LookML.

  1"""Utils for generating lookml."""
  2
  3import re
  4import tarfile
  5import urllib.request
  6from collections import defaultdict
  7from io import BytesIO
  8from pathlib import Path
  9from typing import Any, Dict, Iterable, List, Optional, Tuple
 10
 11import click
 12import yaml
 13from jinja2 import Environment, FileSystemLoader
 14
# Root of the generator package; render_template resolves template folders
# relative to this path.
GENERATOR_PATH = Path(__file__).parent.parent

# Mapping from BigQuery field types to Looker dimension types. Field types
# absent from this mapping are emitted as hidden dimensions (see _get_dimension).
BIGQUERY_TYPE_TO_DIMENSION_TYPE = {
    "BIGNUMERIC": "string",
    "BOOLEAN": "yesno",
    "BYTES": "string",
    "DATE": "time",
    "DATETIME": "time",
    "FLOAT": "number",
    "INTEGER": "number",
    "NUMERIC": "number",
    "STRING": "string",
    "TIME": "time",
    "TIMESTAMP": "time",
}

# Field paths (tuples of nested field names) whose dimensions are always hidden.
HIDDEN_DIMENSIONS = {
    ("document_id",),
    ("client_id",),
    ("client_info", "client_id"),
    ("context_id",),
    ("additional_properties",),
}

# Field paths that get a Looker map_layer_name (for map visualizations).
MAP_LAYER_NAMES = {
    ("country",): "countries",
    ("metadata", "geo", "country"): "countries",
}

# suggest_persist_for value applied to every non-hidden dimension.
DEFAULT_MAX_SUGGEST_PERSIST_FOR = "24 hours"

# Case-insensitive, word-bounded pattern of initialisms that slug_to_title
# re-uppercases after str.title() has lowercased them.
UPPERCASE_INITIALISMS_PATTERN = re.compile(
    (
        r"\b("
        + "|".join(
            [
                "CPU",
                "DB",
                "DNS",
                "DNT",
                "DOM",
                "GC",
                "GPU",
                "HTTP",
                "ID",
                "IO",
                "IP",
                "ISP",
                "JWE",
                "LB",
                "OS",
                "SDK",
                "SERP",
                "SSL",
                "TLS",
                "UI",
                "URI",
                "URL",
                "UTM",
                "UUID",
            ]
        )
        + r")\b"
    ),
    re.IGNORECASE,
)
 81
 82
 83def _get_dimension(
 84    path: Tuple[str, ...], field_type: str, mode: str, description: Optional[str]
 85) -> Dict[str, Any]:
 86    result: Dict[str, Any] = {}
 87    result["sql"] = "${TABLE}." + ".".join(path)
 88    name = path
 89    if (
 90        mode == "REPEATED"
 91        or path in HIDDEN_DIMENSIONS
 92        or field_type not in BIGQUERY_TYPE_TO_DIMENSION_TYPE
 93    ):
 94        result["hidden"] = "yes"
 95    else:
 96        result["type"] = BIGQUERY_TYPE_TO_DIMENSION_TYPE[field_type]
 97        result["suggest_persist_for"] = DEFAULT_MAX_SUGGEST_PERSIST_FOR
 98
 99        group_label, group_item_label = None, None
100        if len(path) > 1:
101            group_label = slug_to_title(" ".join(path[:-1]))
102            group_item_label = slug_to_title(path[-1])
103        if result["type"] == "time":
104            # Remove _{type} suffix from the last path element for dimension group
105            # names For example submission_date and submission_timestamp become
106            # submission, and metadata.header.parsed_date becomes
107            # metadata__header__parsed. This is because the timeframe will add a _{type}
108            # suffix to the individual dimension names.
109            name = *path[:-1], re.sub("_(date|time(stamp)?)$", "", path[-1])
110            result["timeframes"] = [
111                "raw",
112                "time",
113                "date",
114                "week",
115                "month",
116                "quarter",
117                "year",
118            ]
119            if field_type == "DATE":
120                result["timeframes"].remove("time")
121                result["convert_tz"] = "no"
122                result["datatype"] = "date"
123            if group_label and group_item_label:
124                # Dimension groups should not be nested, see issue #82
125                result["label"] = f"{group_label}: {group_item_label}"
126        elif len(path) > 1:
127            result["group_label"] = group_label
128            result["group_item_label"] = group_item_label
129        if path in MAP_LAYER_NAMES:
130            result["map_layer_name"] = MAP_LAYER_NAMES[path]
131    result["name"] = "__".join(name)
132
133    if description:
134        result["description"] = description
135
136    return result
137
138
def _generate_dimensions_helper(schema: List[Any], *prefix: str) -> Iterable[dict]:
    """Yield a dimension dict for every leaf field in *schema*, sorted by name.

    Non-repeated RECORD fields are flattened into their parent by recursing
    with the field name appended to *prefix*.
    """
    for field in sorted(schema, key=lambda entry: entry["name"]):
        mode = field.get("mode", "")
        if field["type"] == "RECORD" and mode != "REPEATED":
            yield from _generate_dimensions_helper(
                field["fields"], *prefix, field["name"]
            )
            continue
        yield _get_dimension(
            (*prefix, field["name"]),
            field["type"],
            mode,
            field.get("description", ""),
        )
152
153
def _generate_dimensions(table: str, dryrun) -> List[Dict[str, Any]]:
    """Generate dimensions and dimension groups from a BigQuery table.

    When the schema contains both submission_timestamp and submission_date,
    only a dimension group for submission_timestamp is produced.

    Raise ClickException if the schema results in duplicate dimensions.
    """
    project, dataset, table = table.split(".")
    table_schema = dryrun.create(
        project=project,
        dataset=dataset,
        table=table,
    ).get_table_schema()

    dimensions: Dict[str, Dict[str, Any]] = {}
    for dim in _generate_dimensions_helper(table_schema):
        name_key = dim["name"]
        if dim.get("type") == "time":
            # Suffix keeps `time` dimension groups from clobbering
            # same-named plain dimensions in the dict below.
            name_key += "_time"

        if name_key in dimensions:
            # Duplicate "submission"/"*end"/"*start" time groups are silently
            # overwritten, keeping the last value sorted by field name
            # (i.e. submission_timestamp wins over submission_date).
            # See https://github.com/mozilla/lookml-generator/issues/471
            overwritable = dim.get("type") == "time" and (
                dim["name"] == "submission"
                or dim["name"].endswith(("end", "start"))
            )
            if not overwritable:
                raise click.ClickException(
                    f"duplicate dimension {name_key!r} for table {table!r}"
                )
        dimensions[name_key] = dim
    return list(dimensions.values())
192
193
def _generate_dimensions_from_query(query: str, dryrun) -> List[Dict[str, Any]]:
    """Generate dimensions and dimension groups from a SQL query."""
    schema = dryrun.create(sql=query).get_schema()
    dimensions: Dict[str, Dict[str, Any]] = {}
    for dim in _generate_dimensions_helper(schema or []):
        name_key = dim["name"]
        if dim.get("type") == "time":
            # Suffix keeps `time` dimension groups from clobbering
            # same-named plain dimensions in the dict below.
            name_key += "_time"

        # Duplicate "submission"/"*end"/"*start" time groups are silently
        # overwritten, keeping the last value sorted by field name
        # (i.e. submission_timestamp wins over submission_date).
        # See https://github.com/mozilla/lookml-generator/issues/471
        overwritable = dim.get("type") == "time" and (
            dim["name"] == "submission"
            or dim["name"].endswith(("end", "start"))
        )
        if name_key in dimensions and not overwritable:
            raise click.ClickException(f"duplicate dimension {name_key!r} in query")
        dimensions[name_key] = dim
    return list(dimensions.values())
219
220
def _generate_nested_dimension_views(
    schema: List[dict], view_name: str
) -> List[Dict[str, Any]]:
    """
    Recursively generate views for nested fields.

    Nested fields are created as views, with dimensions and optionally measures.
    """
    views: List[Dict[str, Any]] = []
    for field in sorted(schema, key=lambda f: f["name"]):
        if field["type"] == "RECORD" and field["name"] != "labeled_counter":
            # labeled_counter is handled explicitly in glean ping views; hidden for other views
            nested_view_name = f"{view_name}__{field['name']}"
            if field.get("mode") == "REPEATED":
                # Materialize the generator: it is consumed by the two
                # comprehensions below, and a bare generator would be exhausted
                # after the first one, leaving "dimension_groups" always empty.
                dimensions = list(_generate_dimensions_helper(schema=field["fields"]))
                nested_field_view: Dict[str, Any] = {"name": nested_view_name}
                nested_field_view["dimensions"] = [
                    d for d in dimensions if not _is_dimension_group(d)
                ]
                nested_field_view["dimension_groups"] = [
                    d for d in dimensions if _is_dimension_group(d)
                ]
                views.append(nested_field_view)
            # Recurse into the record either way; only repeated records get a
            # view of their own at this level.
            views.extend(
                _generate_nested_dimension_views(field["fields"], nested_view_name)
            )

    return views
257
258
259def _is_dimension_group(dimension: dict):
260    """Determine if a dimension is actually a dimension group."""
261    return "timeframes" in dimension or "intervals" in dimension
262
263
def escape_filter_expr(expr: str) -> str:
    """Escape filter expression for special Looker chars.

    Prefixes a leading "-" and each occurrence of ", _, %, comma, or ^
    with a caret, so the text is treated literally by Looker filters.
    """
    pattern = re.compile(r'((?:^-)|["_%,^])')
    return pattern.sub(r"^\1", expr)
267
268
269def _is_nested_dimension(dimension: dict):
270    return (
271        "hidden" in dimension
272        and dimension["hidden"]
273        and "nested" in dimension
274        and dimension["nested"]
275    )
276
277
def render_template(filename, template_folder, **kwargs) -> str:
    """Render a given template using Jinja.

    Templates are loaded from ``{template_folder}/templates`` under
    GENERATOR_PATH; **kwargs become the template context.
    """
    loader = FileSystemLoader(GENERATOR_PATH / f"{template_folder}/templates")
    environment = Environment(loader=loader)
    return environment.get_template(filename).render(**kwargs)
286
287
def slug_to_title(slug):
    """Convert a slug to title case, with some common initialisms uppercased.

    Underscores become spaces, each word is title-cased, then known
    initialisms (ID, URL, HTTP, ...) are restored to full uppercase.
    """
    titled = slug.replace("_", " ").title()
    return UPPERCASE_INITIALISMS_PATTERN.sub(
        lambda match: match.group(0).upper(), titled
    )
293
294
# Map from view to qualified references {dataset: {view: [[project, dataset, table],]}}
# Type alias for the structure returned by get_bigquery_view_reference_map.
BQViewReferenceMap = Dict[str, Dict[str, List[List[str]]]]
297
298
def get_bigquery_view_reference_map(
    generated_sql_uri: str,
) -> BQViewReferenceMap:
    """Get a mapping from BigQuery datasets to views with references.

    Downloads the generated-sql tarball from *generated_sql_uri* and reads
    each view's metadata.yaml for its "view.sql" references.
    """
    with urllib.request.urlopen(generated_sql_uri) as response:
        archive = BytesIO(response.read())
    view_map: BQViewReferenceMap = defaultdict(dict)
    with tarfile.open(fileobj=archive, mode="r:gz") as tar:
        for member in tar:
            if not member.name.endswith("/metadata.yaml"):
                continue
            metadata = yaml.safe_load(tar.extractfile(member.name))  # type: ignore
            references = metadata.get("references", {})
            if "view.sql" not in references:
                continue
            # Path layout: .../<project>/<dataset>/<view>/metadata.yaml
            *_, project, dataset_id, view_id, _ = member.name.split("/")
            if project == "moz-fx-data-shared-prod":
                view_map[dataset_id][view_id] = [
                    ref.split(".") for ref in references["view.sql"]
                ]
    return view_map
GENERATOR_PATH = PosixPath('/root/project/generator')
BIGQUERY_TYPE_TO_DIMENSION_TYPE = {'BIGNUMERIC': 'string', 'BOOLEAN': 'yesno', 'BYTES': 'string', 'DATE': 'time', 'DATETIME': 'time', 'FLOAT': 'number', 'INTEGER': 'number', 'NUMERIC': 'number', 'STRING': 'string', 'TIME': 'time', 'TIMESTAMP': 'time'}
HIDDEN_DIMENSIONS = {('additional_properties',), ('document_id',), ('client_info', 'client_id'), ('client_id',), ('context_id',)}
MAP_LAYER_NAMES = {('country',): 'countries', ('metadata', 'geo', 'country'): 'countries'}
DEFAULT_MAX_SUGGEST_PERSIST_FOR = '24 hours'
UPPERCASE_INITIALISMS_PATTERN = re.compile('\\b(CPU|DB|DNS|DNT|DOM|GC|GPU|HTTP|ID|IO|IP|ISP|JWE|LB|OS|SDK|SERP|SSL|TLS|UI|URI|URL|UTM|UUID)\\b', re.IGNORECASE)
def escape_filter_expr(expr: str) -> str:
265def escape_filter_expr(expr: str) -> str:
266    """Escape filter expression for special Looker chars."""
267    return re.sub(r'((?:^-)|["_%,^])', r"^\1", expr, count=0)

Escape filter expression for special Looker chars.

def render_template(filename, template_folder, **kwargs) -> str:
279def render_template(filename, template_folder, **kwargs) -> str:
280    """Render a given template using Jinja."""
281    env = Environment(
282        loader=FileSystemLoader(GENERATOR_PATH / f"{template_folder}/templates")
283    )
284    template = env.get_template(filename)
285    rendered = template.render(**kwargs)
286    return rendered

Render a given template using Jinja.

def slug_to_title(slug):
289def slug_to_title(slug):
290    """Convert a slug to title case, with some common initialisms uppercased."""
291    return UPPERCASE_INITIALISMS_PATTERN.sub(
292        lambda match: match.group(0).upper(), slug.replace("_", " ").title()
293    )

Convert a slug to title case, with some common initialisms uppercased.

BQViewReferenceMap = typing.Dict[str, typing.Dict[str, typing.List[typing.List[str]]]]
def get_bigquery_view_reference_map(generated_sql_uri: str) -> Dict[str, Dict[str, List[List[str]]]]:
300def get_bigquery_view_reference_map(
301    generated_sql_uri: str,
302) -> BQViewReferenceMap:
303    """Get a mapping from BigQuery datasets to views with references."""
304    with urllib.request.urlopen(generated_sql_uri) as f:
305        tarbytes = BytesIO(f.read())
306    views: BQViewReferenceMap = defaultdict(dict)
307    with tarfile.open(fileobj=tarbytes, mode="r:gz") as tar:
308        for tarinfo in tar:
309            if tarinfo.name.endswith("/metadata.yaml"):
310                metadata = yaml.safe_load(tar.extractfile(tarinfo.name))  # type: ignore
311                references = metadata.get("references", {})
312                if "view.sql" not in references:
313                    continue
314                *_, project, dataset_id, view_id, _ = tarinfo.name.split("/")
315                if project == "moz-fx-data-shared-prod":
316                    views[dataset_id][view_id] = [
317                        ref.split(".") for ref in references["view.sql"]
318                    ]
319    return views

Get a mapping from BigQuery datasets to views with references.