
Upload Utilities

Utilities for uploading files and assets to a Deriva catalog's Hatrac object store.

This module provides functions that help structure local directories for uploading to a DerivaML catalog and that generate an upload specification for those directories.

Here is the directory layout we support:

deriva-ml/
    execution/
        execution-asset/
            file1, file2, ...        <- Need to update execution_asset association table.
        execution-metadata/
        feature/
            asset/
                file1, file2, ...
            .jsonl                   <- needs to have asset_name column remapped before uploading
        table/
            record_table.csv
        asset/
            file1, file2, ...
        asset-type/
            file1.jsonl, file2.jsonl

NULL_SENTINEL module-attribute

NULL_SENTINEL = '__NULL__'

Directory-segment marker for nullable asset-metadata columns with no value. Written into the staging tree by the three path-builders (Execution._build_upload_staging, _invoke_deriva_py_uploader, asset_file_path) and translated back to Python None by deriva_ml.asset.null_sentinel_processor.NullSentinelProcessor before deriva-py builds the catalog insert. See the Bug C design doc.
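
As a minimal sketch of the round trip (the metadata column names here are hypothetical, not names from this module):

# "Species" and "Stain" are hypothetical nullable metadata columns.
metadata = {"Species": "mouse", "Stain": None}

# Staging side: each metadata column becomes a path segment, with None
# written as the sentinel.
segments = [NULL_SENTINEL if v is None else str(v) for v in metadata.values()]
assert segments == ["mouse", "__NULL__"]

# Upload side: the processor maps the sentinel back to Python None before
# the catalog insert is built.
restored = [None if s == NULL_SENTINEL else s for s in segments]
assert restored == ["mouse", None]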

asset_file_path

asset_file_path(
    prefix: Path | str,
    exec_rid: RID,
    asset_table: Table,
    file_name: str,
    metadata: dict[str, Any],
) -> Path

Return the path of the file in which an asset of the specified type should be placed for upload.

Parameters:

    prefix (Path | str): Path prefix to use. [required]
    exec_rid (RID): RID to use. [required]
    asset_table (Table): Table in which to place assets. [required]
    file_name (str): File name to use. [required]
    metadata (dict[str, Any]): Any additional metadata to add to the asset. [required]

Returns:

    Path: Path of the file in which to place the asset.

Source code in src/deriva_ml/dataset/upload.py
def asset_file_path(
    prefix: Path | str,
    exec_rid: RID,
    asset_table: Table,
    file_name: str,
    metadata: dict[str, Any],
) -> Path:
    """Return the file in which to place  assets of a specified type are to be uploaded.

    Args:
        prefix: Path prefix to use.
        exec_rid: RID to use.
        asset_table: Table in which to place assets.
        file_name: File name to use.
        metadata: Any additional metadata to add to the asset
    Returns:
        Path to directory in which to place assets of type asset_type.
    """
    schema = asset_table.schema.name
    asset_name = asset_table.name

    path = execution_root(prefix, exec_rid) / "asset" / schema / asset_name
    metadata = metadata or {}
    asset_columns = {
        "Filename",
        "URL",
        "Length",
        "MD5",
        "Description",
    }.union(set(DerivaSystemColumns))
    asset_metadata = {c.name for c in asset_table.columns} - asset_columns

    if not (asset_metadata >= set(metadata.keys())):
        raise DerivaMLException(f"Metadata {metadata} does not match asset metadata {asset_metadata}")

    for m in asset_metadata:
        path = path / str(metadata.get(m, NULL_SENTINEL))
    path.mkdir(parents=True, exist_ok=True)
    return path / file_name
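
For example, a hypothetical call for an "Image" asset table with a single extra metadata column "Modality" (all names and values below are illustrative):

# image_table is assumed to be the ermrest Table object for the asset table.
target = asset_file_path(
    prefix="/tmp/staging",
    exec_rid="1-ABCD",
    asset_table=image_table,
    file_name="scan_001.png",
    metadata={"Modality": "MRI"},
)
# -> /tmp/staging/deriva-ml/execution/1-ABCD/asset/<schema>/Image/MRI/scan_001.png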

asset_root

asset_root(
    prefix: Path | str, exec_rid: str
) -> Path

Return the directory for staging asset uploads for a specific execution.

The directory is created if it does not already exist.

Parameters:

    prefix (Path | str): Location of the upload root directory. [required]
    exec_rid (str): RID of the execution whose asset files are being staged. [required]

Returns:

    Path: Path to the asset upload directory for the given execution.

Source code in src/deriva_ml/dataset/upload.py
def asset_root(prefix: Path | str, exec_rid: str) -> Path:
    """Return the directory for staging asset uploads for a specific execution.

    The directory is created if it does not already exist.

    Args:
        prefix: Location of the upload root directory.
        exec_rid: RID of the execution whose asset files are being staged.

    Returns:
        Path to the asset upload directory for the given execution.
    """
    path = execution_root(prefix, exec_rid) / "asset"
    path.mkdir(parents=True, exist_ok=True)
    return path

asset_table_upload_spec

asset_table_upload_spec(
    model: DerivaModel,
    asset_table: str | Table,
    chunk_size: int | None = None,
)

Generate upload specification for an asset table.

Parameters:

    model (DerivaModel): The DerivaModel instance. [required]
    asset_table (str | Table): The asset table name or Table object. [required]
    chunk_size (int | None): Optional chunk size in bytes for hatrac uploads. If provided, large files will be uploaded in chunks of this size. [default: None]

Returns:

    A dictionary containing the upload specification for the asset table.

Source code in src/deriva_ml/dataset/upload.py
def asset_table_upload_spec(model: DerivaModel, asset_table: str | Table, chunk_size: int | None = None):
    """Generate upload specification for an asset table.

    Args:
        model: The DerivaModel instance.
        asset_table: The asset table name or Table object.
        chunk_size: Optional chunk size in bytes for hatrac uploads. If provided,
            large files will be uploaded in chunks of this size.

    Returns:
        A dictionary containing the upload specification for the asset table.
    """
    metadata_columns = sorted(model.asset_metadata(asset_table))
    asset_table = model.name_to_table(asset_table)
    schema = model.name_to_table(asset_table).schema.name

    # Be careful here, as a metadata value might be a string which can contain special characters.
    # metadata_columns is sorted to ensure deterministic directory order matching the regex.
    metadata_path = "/".join([rf"(?P<{c}>[-:._ \w]+)" for c in metadata_columns])
    # Bug E.2: capture pre-allocated RID as an additional path segment
    # after metadata columns and before the filename.
    rid_path = r"(?P<RID>[-A-Z0-9]+)"
    parts = [metadata_path, rid_path] if metadata_path else [rid_path]
    asset_path = (
        f"{exec_dir_regex}/asset/{schema}/{asset_table.name}/"
        f"{'/'.join(parts)}/{asset_file_regex}"
    )

    # Build hatrac_options with optional chunk_size
    hatrac_options = {"versioned_urls": True}
    if chunk_size is not None:
        hatrac_options["chunk_size"] = chunk_size

    # Create upload specification
    spec = {
        # Upload assets into the asset table.
        "column_map": {
            "MD5": "{md5}",
            "URL": "{URI}",
            "Length": "{file_size}",
            "Filename": "{file_name}",
            "RID": "{RID}",  # Bug E.2: pre-allocated RID
        }
        | {c: f"{{{c}}}" for c in metadata_columns},
        "file_pattern": asset_path,  # Sets schema, asset_table, file
        "asset_type": "file",
        "target_table": [schema, asset_table.name],
        "checksum_types": ["sha256", "md5"],
        "hatrac_options": hatrac_options,
        "hatrac_templates": {
            "hatrac_uri": f"/hatrac/{asset_table.name}/{{md5}}.{{file_name}}",
            "content-disposition": "filename*=UTF-8''{file_name}",
        },
        "record_query_template": "/entity/{target_table}/MD5={md5}&Filename={file_name}",
        "use_pre_allocated_rid": True,  # Bug E.2: use caller-supplied RID
    }
    # Wire the NullSentinelProcessor only when the table has metadata
    # columns — otherwise no sentinel values can appear and the
    # processor would be a no-op. See Bug C design doc.
    if metadata_columns:
        spec["pre_processors"] = [
            {
                "processor": "NullSentinelProcessor",
                "processor_type": (
                    "deriva_ml.asset.null_sentinel_processor."
                    "NullSentinelProcessor"
                ),
            }
        ]
    return spec
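
As a sketch, for a hypothetical asset table "Image" with metadata columns "Modality" and "Subject":

spec = asset_table_upload_spec(model, "Image", chunk_size=8 * 1024 * 1024)

# The file_pattern ends with one capturing group per sorted metadata column,
# then the pre-allocated RID, then the file name, roughly:
#   .../asset/<schema>/Image/(?P<Modality>...)/(?P<Subject>...)/(?P<RID>[-A-Z0-9]+)/<file>
assert spec["target_table"][1] == "Image"
assert spec["hatrac_options"] == {"versioned_urls": True, "chunk_size": 8 * 1024 * 1024}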

asset_type_path

asset_type_path(
    prefix: Path | str,
    exec_rid: RID,
    asset_table: Table,
) -> Path

Return the path to a JSON Lines file in which to place asset_type information.

Parameters:

    prefix (Path | str): Location of the upload root directory. [required]
    exec_rid (RID): Execution RID. [required]
    asset_table (Table): Table in which to place assets. [required]

Returns:

    Path: Path to the file in which to place asset_type values for the named asset.

Source code in src/deriva_ml/dataset/upload.py
def asset_type_path(prefix: Path | str, exec_rid: RID, asset_table: Table) -> Path:
    """Return the path to a JSON line file in which to place asset_type information.

    Args:
        prefix: Location of upload root directory
        exec_rid: Execution RID
        asset_table: Table in which to place assets.

    Returns:
        Path to the file in which to place asset_type values for the named asset.
    """
    path = execution_root(prefix, exec_rid=exec_rid) / "asset-type" / asset_table.schema.name
    path.mkdir(parents=True, exist_ok=True)
    return path / f"{asset_table.name}.jsonl"

bulk_upload_configuration

bulk_upload_configuration(
    model: DerivaModel,
    chunk_size: int | None = None,
) -> dict[str, Any]

Return an upload specification for deriva-ml.

Parameters:

    model (DerivaModel): Model from which to generate the upload configuration. [required]
    chunk_size (int | None): Optional chunk size in bytes for hatrac uploads. If provided, large files will be uploaded in chunks of this size. [default: None]

Returns:

    dict[str, Any]: A bulk upload configuration suitable for deriva-py's GenericUploader.

Source code in src/deriva_ml/dataset/upload.py
def bulk_upload_configuration(model: DerivaModel, chunk_size: int | None = None) -> dict[str, Any]:
    """Return an upload specification for deriva-ml

    Args:
        model: Model from which to generate the upload configuration.
        chunk_size: Optional chunk size in bytes for hatrac uploads. If provided,
            large files will be uploaded in chunks of this size.
    """
    # Bug E.2: Use asset_table_upload_spec for ALL asset tables, not
    # just those with metadata columns. The spec handles zero-metadata
    # tables correctly (RID is the only capturing segment). This
    # unifies the regex + use_pre_allocated_rid semantics across all
    # asset tables, fixing the mismatch where Execution_Metadata (and
    # other zero-metadata tables) went through a fallback mapping that
    # did not include the RID segment.
    all_asset_mappings = [
        asset_table_upload_spec(model=model, asset_table=t, chunk_size=chunk_size)
        for t in model.find_assets()
    ]

    return {
        "asset_mappings": all_asset_mappings
        + [
            {
                # Upload the records into a table.
                "asset_type": "table",
                "default_columns": ["RID", "RCB", "RMB", "RCT", "RMT"],
                "file_pattern": table_regex,  # Sets schema, table,
                "ext_pattern": "^.*[.](?P<file_ext>json|csv)$",
                "target_table": ["{schema}", "{table}"],
            },
        ],
        "version_update_url": "https://github.com/informatics-isi-edu/deriva-client",
        "version_compatibility": [[">=1.4.0", "<2.0.0"]],
    }
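
A sketch of handing the configuration to deriva-py's GenericUploader, mirroring what upload_directory does internally (the file path is illustrative):

import json
from pathlib import Path

config = bulk_upload_configuration(model, chunk_size=16 * 1024 * 1024)
spec_file = Path("/tmp/config.json")
spec_file.write_text(json.dumps(config, indent=2))
# The resulting file can then be passed to GenericUploader via config_file=spec_file.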

execution_rids

execution_rids(
    prefix: Path | str,
) -> list[RID]

Return all execution RIDs that have files staged for upload.

Scans the execution/ subdirectory under the upload root and returns the name of each immediate child directory, which corresponds to an execution RID.

Parameters:

    prefix (Path | str): Location of the upload root directory. [required]

Returns:

    list[RID]: List of execution RID strings found under the upload root.

Source code in src/deriva_ml/dataset/upload.py
def execution_rids(prefix: Path | str) -> list[RID]:
    """Return all execution RIDs that have files staged for upload.

    Scans the ``execution/`` subdirectory under the upload root and returns the
    name of each immediate child directory, which corresponds to an execution RID.

    Args:
        prefix: Location of the upload root directory.

    Returns:
        List of execution RID strings found under the upload root.
    """
    path = upload_root(prefix) / "execution"
    return [d.name for d in path.iterdir()]

execution_root

execution_root(
    prefix: Path | str, exec_rid: str
) -> Path

Return the directory for staging upload files for a specific execution.

The directory is created if it does not already exist.

Parameters:

    prefix (Path | str): Location of the upload root directory. [required]
    exec_rid (str): RID of the execution whose upload files are being staged. [required]

Returns:

    Path: Path to the execution-specific upload directory.

Source code in src/deriva_ml/dataset/upload.py
def execution_root(prefix: Path | str, exec_rid: str) -> Path:
    """Return the directory for staging upload files for a specific execution.

    The directory is created if it does not already exist.

    Args:
        prefix: Location of the upload root directory.
        exec_rid: RID of the execution whose upload files are being staged.

    Returns:
        Path to the execution-specific upload directory.
    """
    path = upload_root(prefix) / "execution" / exec_rid
    path.mkdir(exist_ok=True, parents=True)
    return path
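
The path helpers nest as follows (the prefix and RID are illustrative):

upload_root("/tmp/staging")               # /tmp/staging/deriva-ml
execution_root("/tmp/staging", "1-ABCD")  # /tmp/staging/deriva-ml/execution/1-ABCD
asset_root("/tmp/staging", "1-ABCD")      # /tmp/staging/deriva-ml/execution/1-ABCD/asset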

flat_asset_dir

flat_asset_dir(
    prefix: Path | str,
    exec_rid: str,
    asset_table_name: str,
) -> Path

Return the flat per-table asset directory for the manifest-first storage layout.

Files are stored in assets/{AssetTable}/ without metadata encoding in the path. Metadata lives in the manifest JSON file instead.

Parameters:

    prefix (Path | str): Location of upload root directory. [required]
    exec_rid (str): Execution RID. [required]
    asset_table_name (str): Name of the asset table (e.g., "Image", "Model"). [required]

Returns:

    Path: Path to the flat asset directory (created if it doesn't exist).

Source code in src/deriva_ml/dataset/upload.py
def flat_asset_dir(prefix: Path | str, exec_rid: str, asset_table_name: str) -> Path:
    """Return the flat per-table asset directory for the manifest-first storage layout.

    Files are stored in ``assets/{AssetTable}/`` without metadata encoding in the path.
    Metadata lives in the manifest JSON file instead.

    Args:
        prefix: Location of upload root directory.
        exec_rid: Execution RID.
        asset_table_name: Name of the asset table (e.g., "Image", "Model").

    Returns:
        Path to the flat asset directory (created if it doesn't exist).
    """
    path = execution_root(prefix, exec_rid) / "assets" / asset_table_name
    path.mkdir(parents=True, exist_ok=True)
    return path

manifest_path

manifest_path(
    prefix: Path | str, exec_rid: str
) -> Path

Return the path to the asset-manifest.json file for an execution.

Source code in src/deriva_ml/dataset/upload.py
def manifest_path(prefix: Path | str, exec_rid: str) -> Path:
    """Return the path to the asset-manifest.json file for an execution."""
    return execution_root(prefix, exec_rid) / "asset-manifest.json"

normalize_asset_dir

normalize_asset_dir(
    path: str | Path,
) -> Optional[tuple[str, str]]

Parse a path to an asset file and return the asset table name and file name.

Parameters:

    path (str | Path): Path to the asset file. [required]

Returns:

    Optional[tuple[str, str]]: Tuple of (schema/table, filename), or None if the path doesn't match the pattern.

Source code in src/deriva_ml/dataset/upload.py
def normalize_asset_dir(path: str | Path) -> Optional[tuple[str, str]]:
    """Parse a path to an asset file and return the asset table name and file name.

    Args:
        path: Path to the asset file

    Returns:
        Tuple of (schema/table, filename) or None if path doesn't match pattern
    """
    path = Path(path)
    if not (m := re.match(asset_path_regex, str(path))):
        return None
    return f"{m['schema']}/{m['asset_table']}", path.name

table_path

table_path(
    prefix: Path | str,
    schema: str,
    table: str,
) -> Path

Return the path to a CSV file in which to place table values that are to be uploaded.

Parameters:

    prefix (Path | str): Location of upload root directory. [required]
    schema (str): Domain schema. [required]
    table (str): Name of the table to be uploaded. [required]

Returns:

    Path: Path to the file in which to place table values that are to be uploaded.

Source code in src/deriva_ml/dataset/upload.py
def table_path(prefix: Path | str, schema: str, table: str) -> Path:
    """Return the path to a CSV file in which to place table values that are to be uploaded.

    Args:
        prefix: Location of upload root directory
        schema: Domain schema
        table: Name of the table to be uploaded.

    Returns:
        Path to the file in which to place table values that are to be uploaded.
    """
    path = upload_root(prefix) / "table" / schema / table
    path.mkdir(parents=True, exist_ok=True)
    return path / f"{table}.csv"

upload_asset

upload_asset(
    model: DerivaModel,
    file: Path | str,
    table: Table,
    **kwargs: Any,
) -> dict

Upload the specified file into Hatrac and update the associated asset table.

Parameters:

    model (DerivaModel): Model to upload assets to. [required]
    file (Path | str): Path to the file to upload. [required]
    table (Table): The asset table. [required]
    kwargs (Any): Keyword arguments supplying values for additional columns of the asset table. [default: {}]

Returns:

    dict: The inserted asset table record.

Source code in src/deriva_ml/dataset/upload.py
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def upload_asset(model: DerivaModel, file: Path | str, table: Table, **kwargs: Any) -> dict:
    """Upload the specified file into Hatrac and update the associated asset table.

    Args:
        model: Model to upload assets to.
        file: Path to the file to upload.
        table: The asset table.
        kwargs: Keyword arguments supplying values for additional columns of the asset table.

    Returns:
        The inserted asset table record.
    """
    if not model.is_asset(table):
        raise DerivaMLException(f"Table {table} is not an asset table.")

    file_path = Path(file)
    file_name = file_path.name
    file_size = file_path.stat().st_size

    hatrac_path = f"/hatrac/{table.name}/"
    hs = HatracStore(
        "https",
        server=model.catalog.deriva_server.server,
        credentials=model.catalog.deriva_server.credentials,
    )
    md5_hashes = hash_utils.compute_file_hashes(file, frozenset(["md5"]))["md5"]
    sanitized_filename = urlquote(re.sub("[^a-zA-Z0-9_.-]", "_", md5_hashes[0] + "." + file_name))
    hatrac_path = f"{hatrac_path}{sanitized_filename}"

    # Upload the file to hatrac.
    hatrac_uri = hs.put_obj(
        hatrac_path,
        file,
        md5=md5_hashes[1],
        content_type=mime_utils.guess_content_type(file),
        content_disposition="filename*=UTF-8''" + file_name,
    )

    # Now update the asset table with the new object's metadata.
    ipath = model.catalog.getPathBuilder().schemas[table.schema.name].tables[table.name]
    return list(
        ipath.insert(
            [
                {
                    "URL": hatrac_uri,
                    "Filename": file_name,
                    "Length": file_size,
                    "MD5": md5_hashes[0],
                }
                | kwargs
            ]
        )
    )[0]
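
A sketch of a typical call; the table object and the extra column value are hypothetical:

record = upload_asset(
    model,
    file="/data/scan_001.png",
    table=image_table,            # ermrest Table object for an asset table
    Description="Baseline scan",  # value for an additional asset-table column
)
print(record["URL"], record["MD5"])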

upload_directory

upload_directory(
    model: DerivaModel,
    directory: Path | str,
    progress_callback: Callable[
        [UploadProgress], None
    ]
    | None = None,
    max_retries: int = 3,
    retry_delay: float = 5.0,
    timeout: tuple[int, int]
    | None = None,
    chunk_size: int | None = None,
) -> dict[Any, FileUploadState] | None

Upload assets from a directory. This routine assumes that the current upload specification includes a configuration for the specified directory. Every asset in the specified directory is uploaded.

Parameters:

    model (DerivaModel): Model to upload assets to. [required]
    directory (Path | str): Directory containing the assets and tables to upload. [required]
    progress_callback (Callable[[UploadProgress], None] | None): Optional callback function to receive upload progress updates. Called with UploadProgress objects containing file information and progress. [default: None]
    max_retries (int): Maximum number of retry attempts for failed uploads. [default: 3]
    retry_delay (float): Initial delay in seconds between retries; doubles with each attempt. [default: 5.0]
    timeout (tuple[int, int] | None): Tuple of (connect_timeout, read_timeout) in seconds. Default is (600, 600). Note: urllib3 uses connect_timeout as the socket timeout during request body writes, so it must be large enough for a full chunk upload. Both values should be set generously for large file uploads. [default: None]
    chunk_size (int | None): Optional chunk size in bytes for hatrac uploads. If provided, large files will be uploaded in chunks of this size. [default: None]

Returns:

    dict[Any, FileUploadState] | None: Results of the upload operation.

Raises:

    DerivaMLException: If there is an issue with uploading the assets.

Source code in src/deriva_ml/dataset/upload.py
@validate_call(config=ConfigDict(arbitrary_types_allowed=True))
def upload_directory(
    model: DerivaModel,
    directory: Path | str,
    progress_callback: Callable[[UploadProgress], None] | None = None,
    max_retries: int = 3,
    retry_delay: float = 5.0,
    timeout: tuple[int, int] | None = None,
    chunk_size: int | None = None,
) -> dict[Any, FileUploadState] | None:
    """Upload assets from a directory. This routine assumes that the current upload specification includes a
    configuration for the specified directory.  Every asset in the specified directory is uploaded

    Args:
        model: Model to upload assets to.
        directory: Directory containing the assets and tables to upload.
        progress_callback: Optional callback function to receive upload progress updates.
            Called with UploadProgress objects containing file information and progress.
        max_retries: Maximum number of retry attempts for failed uploads (default: 3).
        retry_delay: Initial delay in seconds between retries, doubles with each attempt (default: 5.0).
        timeout: Tuple of (connect_timeout, read_timeout) in seconds. Default is (600, 600).
            Note: urllib3 uses connect_timeout as the socket timeout during request body
            writes, so it must be large enough for a full chunk upload. Both values should
            be set generously for large file uploads.
        chunk_size: Optional chunk size in bytes for hatrac uploads. If provided,
            large files will be uploaded in chunks of this size.

    Returns:
        Results of the upload operation.

    Raises:
        DerivaMLException: If there is an issue with uploading the assets.
    """
    import logging
    import time

    from deriva.core import DEFAULT_SESSION_CONFIG

    logger = logging.getLogger("deriva_ml")

    directory = Path(directory)
    if not directory.is_dir():
        raise DerivaMLException("Directory does not exist")

    # Track upload progress across files
    # status_callback is called twice per file: once before upload starts, once after it completes
    upload_state = {"completed_files": 0, "total_files": 0, "status_calls": 0}

    # Count total files to upload
    for _root, _dirs, files in os.walk(directory):
        upload_state["total_files"] += len(files)

    # Create wrapper callbacks for GenericUploader if a progress callback was provided
    def file_callback(**kwargs) -> bool:
        """Callback for per-chunk progress updates from GenericUploader.

        The deriva GenericUploader passes kwargs with: completed, total, file_path, host, job_info.
        Note: This callback is only invoked for large files (> 25MB) that use chunked uploads.
        Small files are uploaded in a single request and this callback won't be called.
        """
        if progress_callback is not None:
            file_path = kwargs.get("file_path", "")
            completed_chunks = kwargs.get("completed", 0)
            total_chunks = kwargs.get("total", 0)

            progress = UploadProgress(
                file_path=file_path,
                file_name=Path(file_path).name if file_path else "",
                bytes_completed=completed_chunks,
                bytes_total=total_chunks,
                percent_complete=(completed_chunks / total_chunks * 100) if total_chunks > 0 else 0,
                phase="uploading_chunks",
                message=f"Uploading large file: chunk {completed_chunks} of {total_chunks}",
            )
            progress_callback(progress)
        return True  # Continue upload

    def status_callback() -> None:
        """Callback for per-file status updates from GenericUploader.

        GenericUploader calls this twice per file: once before upload starts (odd calls)
        and once after upload completes (even calls). We use even calls to track completed files.
        """
        if progress_callback is not None:
            upload_state["status_calls"] += 1

            # Even calls indicate file completion (after upload)
            if upload_state["status_calls"] % 2 == 0:
                upload_state["completed_files"] += 1

            # Report progress with current file count
            current_file = (upload_state["status_calls"] + 1) // 2  # 1-indexed current file
            progress = UploadProgress(
                phase="uploading",
                message=f"Uploading file {current_file} of {upload_state['total_files']}",
                percent_complete=(upload_state["completed_files"] / upload_state["total_files"] * 100)
                if upload_state["total_files"] > 0
                else 0,
            )
            progress_callback(progress)

    def do_upload(uploader) -> dict[str, dict]:
        """Perform the upload and return raw results."""
        uploader.getUpdatedConfig()
        uploader.scanDirectory(directory, purge_state=True)
        return uploader.uploadFiles(
            file_callback=file_callback if progress_callback else None,
            status_callback=status_callback if progress_callback else None,
        )

    # Use provided timeout or default
    upload_timeout = timeout if timeout is not None else DEFAULT_UPLOAD_TIMEOUT

    # Now upload the files by creating an upload spec and then calling the uploader.
    with TemporaryDirectory() as temp_dir:
        spec_file = Path(temp_dir) / "config.json"
        with spec_file.open("w+") as cfile:
            json.dump(bulk_upload_configuration(model, chunk_size=chunk_size), cfile)

        # Create session config with longer timeout for large file uploads
        session_config = DEFAULT_SESSION_CONFIG.copy()
        session_config["timeout"] = upload_timeout
        logger.debug(f"Upload session config timeout: {session_config['timeout']}")

        all_results = {}
        attempt = 0
        current_delay = retry_delay

        while attempt <= max_retries:
            uploader = GenericUploader(
                server={
                    "host": model.hostname,
                    "protocol": "https",
                    "catalog_id": model.catalog.catalog_id,
                    "session": session_config,
                },
                config_file=spec_file,
            )
            try:
                raw_results = do_upload(uploader)

                # Process results and check for failures
                failed_files = []
                for path, result in raw_results.items():
                    state = UploadState(result["State"])
                    if state == UploadState.failed or result["Result"] is None:
                        failed_files.append((path, result["Status"]))
                    else:
                        # Store successful results
                        all_results[path] = FileUploadState(
                            state=state,
                            status=result["Status"],
                            result=result["Result"],
                        )

                if not failed_files:
                    # All uploads successful
                    break

                attempt += 1
                if attempt > max_retries:
                    # Final attempt failed, raise error with details
                    error_details = "; ".join([f"{path}: {msg}" for path, msg in failed_files])
                    raise DerivaMLException(
                        f"Failed to upload {len(failed_files)} file(s) after {max_retries} retries: {error_details}"
                    )

                # Log retry attempt and wait before retrying
                logger.warning(
                    f"Upload failed for {len(failed_files)} file(s), retrying in {current_delay:.1f}s "
                    f"(attempt {attempt}/{max_retries}): {[p for p, _ in failed_files]}"
                )
                if progress_callback:
                    progress_callback(
                        UploadProgress(
                            phase="retrying",
                            message=f"Retrying {len(failed_files)} failed upload(s) in {current_delay:.1f}s (attempt {attempt}/{max_retries})",
                            percent_complete=0,
                        )
                    )

                time.sleep(current_delay)
                current_delay *= 2  # Exponential backoff

                # Reset upload state for retry
                upload_state["status_calls"] = 0

            finally:
                uploader.cleanup()

        return all_results
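
A sketch of a typical call with a simple progress printer (the staging directory is illustrative and must already follow the layout expected by the bulk upload configuration):

def print_progress(p: UploadProgress) -> None:
    print(f"[{p.phase}] {p.message} ({p.percent_complete:.0f}%)")

results = upload_directory(
    model,
    "/tmp/staging",
    progress_callback=print_progress,
    max_retries=2,
    chunk_size=8 * 1024 * 1024,
)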

upload_root

upload_root(prefix: Path | str) -> Path

Return the top-level directory in which to place files to be uploaded.

Source code in src/deriva_ml/dataset/upload.py
def upload_root(prefix: Path | str) -> Path:
    """Return the top level directory of where to put files to be uploaded."""
    path = Path(prefix) / "deriva-ml"
    path.mkdir(exist_ok=True, parents=True)
    return path

upload_staging_root

upload_staging_root(
    prefix: Path | str, exec_rid: str
) -> Path

Return the ephemeral upload-staging directory, created at upload time only.

This directory holds symlinks arranged in the regex-expected tree structure that GenericUploader needs. It is created from manifest data at upload time and cleaned up after upload completes.

Source code in src/deriva_ml/dataset/upload.py
190
191
192
193
194
195
196
197
198
199
def upload_staging_root(prefix: Path | str, exec_rid: str) -> Path:
    """Return the ephemeral upload-staging directory, created at upload time only.

    This directory holds symlinks arranged in the regex-expected tree structure
    that GenericUploader needs. It is created from manifest data at upload time
    and cleaned up after upload completes.
    """
    path = execution_root(prefix, exec_rid) / "upload-staging"
    path.mkdir(parents=True, exist_ok=True)
    return path