Source code for weaver.processes.builtin.utils

import os
import tempfile
from typing import TYPE_CHECKING
from urllib.parse import urlparse

from weaver import WEAVER_ROOT_DIR
from weaver.formats import ContentType, get_extension

if TYPE_CHECKING:
    from typing import Any, Tuple



[docs]
def is_netcdf_url(url):
    # type: (Any) -> bool
    """
    Checks that the reference is an accepted file location carrying the NetCDF file extension.

    The reference is first submitted to :func:`validate_reference` as a file reference. If that validation
    raises :class:`TypeError` or :class:`ValueError`, the reference is reported as not being a NetCDF reference
    instead of propagating the error.

    :param url: Candidate reference to verify.
    :returns: ``True`` only if the reference passed validation and its extension equals the one returned by
        :func:`get_extension` for ``ContentType.APP_NETCDF``.
    """
    try:
        validate_reference(url, is_file=True)
    except (TypeError, ValueError):
        # any invalid reference is simply "not a NetCDF URL", never an error for the caller
        return False
    _, extension = os.path.splitext(url)
    netcdf_extension = get_extension(ContentType.APP_NETCDF)
    return extension == netcdf_extension




[docs]
def is_geojson_url(url):
    # type: (Any) -> bool
    """
    Checks that the reference is an accepted file location carrying a GeoJSON or JSON file extension.

    The reference is first submitted to :func:`validate_reference` as a file reference. If that validation
    raises :class:`TypeError` or :class:`ValueError`, the reference is reported as not being a GeoJSON reference
    instead of propagating the error.

    :param url: Candidate reference to verify.
    :returns: ``True`` only if the reference passed validation and its extension equals the one returned by
        :func:`get_extension` for either ``ContentType.APP_GEOJSON`` or ``ContentType.APP_JSON``.
    """
    try:
        validate_reference(url, is_file=True)
    except (TypeError, ValueError):
        # any invalid reference is simply "not a GeoJSON URL", never an error for the caller
        return False
    _, extension = os.path.splitext(url)
    geojson_extension = get_extension(ContentType.APP_GEOJSON)
    json_extension = get_extension(ContentType.APP_JSON)
    return extension in (geojson_extension, json_extension)




[docs]
def validate_reference(url, is_file):
    # type: (str, bool) -> None
    """
    Ensures that the provided reference points to a valid remote file or a temporary intermediate file.

    In order to avoid bypassing security validation of server file access between jobs, remote locations must be
    enforced. However, :term:`CWL` temporary files must be allowed through for intermediate locations passed around
    between :term:`Workflow` steps or employed as temporary writing locations for file extraction purposes.

    Temporary locations are recognized only by their leading prefix. To prevent a reference from using such a prefix
    to escape toward another location (e.g.: ``/tmp/cwltool_out_x/../../etc/passwd``), references matching
    a temporary prefix are refused when they contain a ``..`` path segment.

    :param url: Reference to validate.
    :param is_file:
        When true, the reference must not end with ``/``. Otherwise, it is considered a directory reference and
        it must end with ``/``.
    :raises TypeError: If the reference is not a string.
    :raises ValueError:
        If the trailing ``/`` does not match the expected file or directory type, if a temporary location contains
        a ``..`` path segment, or if the reference is neither a temporary location nor an ``http``, ``https`` or
        ``s3`` URL.
    """
    if not isinstance(url, str):
        raise TypeError(f"Not a valid URL: [{url!s}]")
    # A trailing slash is forbidden for files and mandatory for directories.
    if bool(is_file) == url.endswith("/"):
        dir_msg = "not supported" if is_file else "required"
        raise ValueError(f"Not a valid file URL reference [{url}]. Directory path {dir_msg}.")
    # When in a CWL step, tempdir will return the `/tmp/cwltool_tmp_...' path (since enforced by the tool).
    # When executed in other situations, it will map to the environment variable or platform-specific tmp path.
    # Although CWL will set TMPDIR for the current step, the source file could be coming from a previous step.
    # Therefore, the random part of the path after 'cwltool_tmp_'/'cwltool_out_' could differ from the current ones.
    tmp_dir = tempfile.gettempdir()
    tmp_paths = (
        f"file://{tmp_dir}/",
        f"{tmp_dir}/",
        "file:///tmp/cwltool_out_",
        "file:///tmp/cwltool_tmp_",
        "/tmp/cwltool_out_",  # nosec: B108
        "/tmp/cwltool_tmp_",  # nosec: B108
    )
    if url.startswith(tmp_paths):
        # The prefix alone does not guarantee the location stays within the temporary directory.
        # Refuse parent directory segments that would otherwise allow reaching arbitrary server files.
        if ".." in url.split("/"):
            raise ValueError(f"Not a valid file URL reference [{url}]. Parent directory traversal not supported.")
        return
    if urlparse(url).scheme not in ["http", "https", "s3"]:
        raise ValueError(f"Not a valid file URL reference [{url}]. Scheme not supported.")




[docs]
def get_package_details(file):
    # type: (os.PathLike[str]) -> Tuple[str, str, str]
    """
    Obtains the ``builtin`` process details from its file reference.

    :param file: Location of the file that defines the process.
    :returns:
        Tuple of the following items.

        - ``name``: last component of the file location, without its extension.
        - ``path``: portion of the location found after the last occurrence of ``WEAVER_ROOT_DIR``, with the
          file name itself removed (trailing ``/`` preserved).
        - ``mod``: ``path`` joined with ``name``, using ``.`` instead of ``/`` as separator.
    """
    name = os.path.split(os.path.splitext(file)[0])[-1]
    root = WEAVER_ROOT_DIR.rstrip("/")  # avoid double //
    relative = str(file).rsplit(f"{root}/", 1)[-1]
    # Remove only the trailing file name component.
    # Splitting on every occurrence of the name would truncate the path at the first directory that happens to
    # contain that name (e.g.: 'weaver/processes/builtin/weaver.py' would produce an empty path).
    path = relative[:len(relative) - len(os.path.basename(relative))]
    mod = f"{path}{name}".replace("/", ".")
    return name, path, mod