# Source code for presc.report.runner

from pathlib import Path
import shelve
from tempfile import TemporaryDirectory
import shutil
import subprocess
import warnings
import webbrowser

import yaml

from presc.utils import PrescError, include_exclude_list
from presc.configuration import PrescConfig
from presc import global_config

# Directory holding the report source files, located next to this module.
REPORT_SOURCE_PATH = Path(__file__).parent.joinpath("resources")
# Names of the jupyter-book config and table-of-contents files found in the
# report source dir.
JB_CONFIG_FILENAME = "_config.yml"
JB_TOC_FILENAME = "_toc.yml"
# Subdir names nested under the user-supplied output and execution paths.
REPORT_OUTPUT_DIR = "presc_report"
REPORT_EXECUTION_DIR = "_exec"
# Name of the convenience link to the main page in the top-level output dir.
REPORT_MAIN_PAGE = "report.html"
# Location of the built index page, relative to the report output dir.
SPHINX_INDEX_PAGE = Path("_build", "html", "index.html")
# Log files capturing the output of the jupyter-book commands.
JB_CLEAN_LOG = "jupyterbook_clean.log"
JB_BUILD_LOG = "jupyterbook_build.log"
# Base name of the data store holding the report inputs. It is resolved
# relative to the execution dir for the report.
CONTEXT_STORE_BASENAME = "_context_store"


def _updated_jb_config(report_config):
    """Build the jupyter-book config for a report run.

    Starts from the default `_config.yml` in the report source dir, applies
    the title and author from the PRESC report options, and excludes the
    report pages that are not selected by the include/exclude options.

    report_config: PRESC config options for the report

    Returns the resulting JB config as a YAML-formatted string that can be
    written to a _config.yml.
    """
    config_path = REPORT_SOURCE_PATH / JB_CONFIG_FILENAME
    with open(config_path) as config_file:
        book_config = yaml.load(config_file, Loader=yaml.FullLoader)

    for option in ("title", "author"):
        book_config[option] = report_config[option].get()

    # Gather the names of all report pages listed in the TOC. Page entries
    # are recognized textually by their "- file: " prefix.
    page_prefix = "- file: "
    pages = []
    with open(REPORT_SOURCE_PATH / JB_TOC_FILENAME) as toc_file:
        for raw_line in toc_file:
            entry = raw_line.strip()
            if entry.startswith(page_prefix):
                pages.append(entry[len(page_prefix) :])

    selected = include_exclude_list(
        pages,
        report_config["evaluations_include"].get(),
        report_config["evaluations_exclude"].get(),
    )
    # The landing page is always part of the report.
    if "landing" not in selected:
        selected.append("landing")

    # Pages that were not selected are excluded by notebook file name.
    excluded = [f"{page}.ipynb" for page in pages if page not in selected]
    if excluded:
        book_config["exclude_patterns"] = excluded

    return yaml.dump(book_config)


class ReportRunner:
    """Main entrypoint to run the PRESC report for the given modeling inputs.

    The report is written to `<output_path>/presc_report`. If this dir already
    exists, it will be overwritten.

    To generate the report:
    ```
    pr = ReportRunner()
    pr.run(...)
    ```

    The path to the report is accessible as `pr.report_html` and will open in
    the default browser by calling `pr.open()`.

    Parameters
    ----------
    output_path : str
        Path to the dir where the report will be written. If not specified,
        defaults to the current working dir. The report itself goes into a
        nested subdir, which is what the `output_path` attribute points to.
    execution_path : str
        Path from which the report is built. If not specified, a temporary dir
        is used. The build runs from a nested subdir, which is what the
        `execution_path` attribute points to.
    config_filepath : str
        Path to a YAML file listing overrides to the default config options.
    """

    def __init__(self, output_path=".", execution_path=None, config_filepath=None):
        report_config = PrescConfig(global_config)
        if config_filepath:
            report_config.update_from_file(config_filepath)
        self.config = report_config
        # Path where the report output is written.
        # Outputs are nested in a subdir.
        self.output_path = Path(output_path) / REPORT_OUTPUT_DIR
        self.output_path.mkdir(parents=True, exist_ok=True)
        # Path where the report is built from.
        # The report source files are copied here, and the model inputs are
        # written to a data store.
        # If missing, a temp dir will be used on execution.
        self.execution_path = None
        if execution_path is not None:
            self.execution_path = Path(execution_path) / REPORT_EXECUTION_DIR

        # Build artifacts:
        # The main entry page for the report.
        self.report_main_page = self.output_path / SPHINX_INDEX_PAGE
        # Log files for jupyter-book execution.
        self.jb_clean_log = self.output_path / JB_CLEAN_LOG
        self.jb_build_log = self.output_path / JB_BUILD_LOG
        # The main page will be linked to the top-level output dir, if possible.
        self._linked_main_page = self.output_path / REPORT_MAIN_PAGE
        # Cache the process results from running jupyter-book commands for
        # debugging.
        self._jb_clean_result = None
        self._jb_build_result = None

    def _presc_artifacts(self):
        """List of paths to remove from the top-level output dir on clean."""
        return [self._linked_main_page, self.jb_clean_log, self.jb_build_log]

    def run(self, model, test_dataset, train_dataset=None, settings=None, clean=True):
        """Runs the PRESC report for the given modeling inputs.

        The report is written to `<output_path>/presc_report`. If this dir already
        exists, it will be overwritten.

        A warning is issued if the build command fails or if the expected
        main page is missing after the build.

        Parameters
        ----------
        model: presc.model.ClassificationModel
            A pre-trained ClassificationModel instance to evaluate
        test_dataset : presc.dataset.Dataset
            A test Dataset instance used to evaluate model performance
        train_dataset: presc.dataset.Dataset
            The Dataset instance used to train the model. This is not required for every evaluation.
        settings : dict
            A dict specifying option values to override report settings,
            eg. `{"report.title": "My Report"}`.
        clean : bool
            Should previous outputs be cleaned? Default: True

        Raises
        ------
        PrescError
            If the report source files could not be copied to the execution dir.
        """
        if settings:
            # Apply the overrides to a copy so the instance config is untouched.
            run_config = PrescConfig(self.config)
            run_config.set(settings)
        else:
            run_config = self.config

        if clean:
            self.clean()

        tmpdir = None
        if self.execution_path is not None:
            # If using a user-defined execution path, need to make sure
            # it doesn't exist for `shutil.copytree` to work.
            # Note that this will only remove the nested subdir, not the actual
            # user-specified dir.
            if self.execution_path.exists():
                shutil.rmtree(self.execution_path)
            exec_path = self.execution_path
        else:
            # Create a temp dir to run the build from.
            # We set up the temp dir here rather than using jupyter-book's
            # `run_in_temp` option so that we have access to the temp path.
            tmpdir = TemporaryDirectory()
            exec_path = Path(tmpdir.name) / REPORT_EXECUTION_DIR

        try:
            # Copy the report source files to the execution dir and
            # execute from there. The data store for the inputs is saved to
            # the same dir. That way, since the notebooks' working dir on
            # execution is set to where they are located by jupyter-book, they
            # can find the data store without needing to know the calling path.
            try:
                shutil.copytree(REPORT_SOURCE_PATH, exec_path)
            except shutil.Error as e:
                msg = f"Failed to copy report source to execution dir {exec_path}"
                raise PrescError(msg) from e

            # Update the default JB config files based on the PRESC config options.
            with open(exec_path / JB_CONFIG_FILENAME, "w") as f:
                f.write(_updated_jb_config(run_config["report"]))

            # Write the inputs to the data store.
            ctx = Context(store_dir=exec_path)
            ctx.store_inputs(
                model=model,
                test_dataset=test_dataset,
                train_dataset=train_dataset,
                config=run_config,
            )

            # Build the report.
            self._run_jb_build(exec_path)
        finally:
            # Remove the temp dir even when a step above fails, rather than
            # relying on implicit cleanup at garbage collection.
            if tmpdir is not None:
                tmpdir.cleanup()

        # The build should have created index.html at the `report_main_page`
        # path.
        if self.report_main_page.exists():
            self._link_main_page()
        else:
            msg = f"The expected report main page {self.report_main_page} does not appear to exist."
            msg += " There may have been an error generating the report."
            msg += f" Output is written to {self.jb_build_log}"
            warnings.warn(msg)

    def _link_main_page(self):
        """Symlink the built main page into the top-level output dir, if possible."""
        try:
            # Use a relative link target so the output dir can be moved.
            main_page_target = self.report_main_page.relative_to(
                self._linked_main_page.parent
            )
            self._linked_main_page.symlink_to(main_page_target)
        except OSError:
            # Best effort only: the link is a convenience, and `report_html`
            # falls back to the built page when the link is unavailable
            # (eg. symlinks not permitted, or a link left from a previous run).
            pass

    @property
    def report_html(self):
        """The main page of the HTML report.

        Returns the absolute path as a string. Raises AttributeError if the
        report file does not exist.
        """
        # Return symlink, if available, for a more user-friendly experience.
        if self._linked_main_page.exists():
            # Resolve to an absolute path up to the symlink.
            report_path = (
                self._linked_main_page.parent.resolve() / self._linked_main_page.name
            )
        else:
            report_path = self.report_main_page.resolve()
        if not report_path.exists():
            msg = "Report file does not appear to exist."
            msg += " Make sure the report has already been built."
            raise AttributeError(msg)

        return str(report_path)

    def open(self):
        """Open the report in the default web browser."""
        # `as_uri()` builds a well-formed, percent-encoded file URL on every
        # platform, unlike prefixing the path with "file://".
        webbrowser.open_new_tab(Path(self.report_html).as_uri())

    def clean(self):
        """Remove artifacts from a previous run, if any."""
        for p in self._presc_artifacts():
            try:
                p.unlink()
            except FileNotFoundError:
                pass
        self._run_jb_clean()

    def _run_jb_clean(self):
        """Run `jupyter-book clean` on the report output dir.

        Command output is written to the clean log file. A warning is issued
        if the command exits with a non-zero return code.
        """
        with open(self.jb_clean_log, "w") as outfile:
            result = subprocess.run(
                ["jupyter-book", "clean", str(self.output_path)],
                stdout=outfile,
                stderr=subprocess.STDOUT,
            )
        # Negative return codes (process terminated by a signal) are failures
        # too.
        if result.returncode != 0:
            msg = f"`jupyter-book clean {self.output_path}` did not succeed."
            msg += f" Output is written to {self.jb_clean_log}"
            warnings.warn(msg)

        self._jb_clean_result = result

    def _run_jb_build(self, input_path):
        """Run `jupyter-book build` on the given path.

        The build output goes to the report output dir, and command output is
        written to the build log file. A warning is issued if the command
        exits with a non-zero return code.
        """
        with open(self.jb_build_log, "w") as outfile:
            result = subprocess.run(
                [
                    "jupyter-book",
                    "build",
                    "--path-output",
                    str(self.output_path.resolve()),
                    str(input_path.resolve()),
                ],
                stdout=outfile,
                stderr=subprocess.STDOUT,
            )
        # Negative return codes (process terminated by a signal) are failures
        # too.
        if result.returncode != 0:
            msg = f"`jupyter-book build {input_path}` did not succeed."
            msg += f" Output is written to {self.jb_build_log}"
            warnings.warn(msg)

        self._jb_build_result = result


class Context:
    """Persistent data store for sharing report inputs across notebooks.

    Note that the store implementation does not support concurrent access. It is
    up to the caller to ensure that multiple instances each have a unique store
    location.

    Parameters
    ----------
    store_dir : str
        The dir to contain the data store, implemented as one or more
        database files. If not specified, defaults to the current working dir.
    """

    def __init__(self, store_dir="."):
        store_path = Path(store_dir) / CONTEXT_STORE_BASENAME
        # Keep an absolute path so the store location does not depend on
        # later changes to the working dir.
        self._store_path = str(store_path.resolve())

    def store_inputs(
        self, model=None, test_dataset=None, train_dataset=None, config=None
    ):
        """Write the report inputs to the data store.

        Any existing values will be overwritten. Inputs left as None are not
        written, so previously stored values for them are kept.

        Parameters
        ----------
        model: presc.model.ClassificationModel
            A ClassificationModel instance
        test_dataset:  presc.dataset.Dataset
            A Dataset instance
        train_dataset: presc.dataset.Dataset
            A Dataset instance
        config: dict
            A dict of config options
        """
        inputs = {
            "model": model,
            "test_dataset": test_dataset,
            "train_dataset": train_dataset,
            "config": config,
        }
        with shelve.open(self._store_path) as ctx:
            for key, value in inputs.items():
                # Compare against None explicitly: a valid input may be falsy
                # (eg. an empty config) or may not support truth testing.
                if value is not None:
                    ctx[key] = value

    def _get(self, key):
        """Read the value stored under `key`.

        Raises PrescError if no value has been stored for the key.
        """
        try:
            with shelve.open(self._store_path, flag="r") as ctx:
                val = ctx[key]
            return val
        except KeyError as e:
            raise PrescError(f"Could not find stored value for '{key}'") from e

    @property
    def model(self):
        """The stored model."""
        return self._get("model")

    @property
    def test_dataset(self):
        """The stored test dataset."""
        return self._get("test_dataset")

    @property
    def train_dataset(self):
        """The stored training dataset."""
        return self._get("train_dataset")

    @property
    def config(self):
        """The stored config options."""
        return self._get("config")