Configuration Groups

This guide describes each configuration group and explains how to customize it.

DerivaML Connection (`deriva_ml`)

File: src/configs/deriva.py

Purpose: Define catalog connection settings.

from hydra_zen import store
from deriva_ml.core.config import DerivaMLConfig

# Store bound to the `deriva_ml` config group. Each entry registered below is
# selected on the command line by its name, e.g. `deriva_ml=eye_ai`.
deriva_store = store(group="deriva_ml")

# Local development server (catalog "45" on localhost).
# Note that catalog_id is passed as a string, not an int.
deriva_store(
    DerivaMLConfig(
        hostname="localhost",
        catalog_id="45",
    ),
    name="local",
)

# Production server. "<hostname>" is a placeholder for the real host name.
deriva_store(
    DerivaMLConfig(
        hostname="<hostname>",
        catalog_id="2",
    ),
    name="eye_ai",
)

# REQUIRED: default configuration. Every group needs a default entry; for
# `deriva_ml` it must be named "default_deriva". Here it repeats the same
# settings as the "local" entry.
deriva_store(
    DerivaMLConfig(
        hostname="localhost",
        catalog_id="45",
    ),
    name="default_deriva",
)

Usage:

uv run deriva-ml-run deriva_ml=eye_ai

Datasets (`datasets`)

File: src/configs/datasets.py

Purpose: Define input dataset specifications.

from hydra_zen import store
from deriva_ml.dataset import DatasetSpecConfig

# Store bound to the `datasets` config group. Each entry is a list of
# DatasetSpecConfig objects and is selected with `datasets=<name>`.
datasets_store = store(group="datasets")

# Training dataset
training = [
    DatasetSpecConfig(
        rid="ABC1",  # Dataset RID (required)
        version="1.0.0",  # Version string
        materialize=True,  # Download asset files (True is also the default)
    ),
]

# Testing dataset (materialize is left at its default)
testing = [
    DatasetSpecConfig(rid="ABC2", version="2.0.0"),
]

# Multiple datasets: one entry may list several dataset specs
combined = [
    DatasetSpecConfig(rid="ABC1", version="1.0.0"),
    DatasetSpecConfig(rid="ABC2", version="2.0.0"),
]

# Register configurations under the names used on the command line
datasets_store(training, name="training")
datasets_store(testing, name="testing")
datasets_store(combined, name="combined")
# REQUIRED: the group's default entry; here it reuses the training list
datasets_store(training, name="default_dataset")

DatasetSpecConfig Options:

Field	Type	Description
`rid`	str	Dataset RID (required)
`version`	str	Version string (e.g., "1.0.0")
`materialize`	bool	Download asset files (default: True)
`description`	str	Human-readable description

Usage:

uv run deriva-ml-run datasets=testing

Assets (`assets`)

File: src/configs/assets.py

Purpose: Define input assets such as model weights or configuration files.

from hydra_zen import store
from deriva_ml.asset.aux_classes import AssetSpecConfig
from deriva_ml.execution import with_description

# Store bound to the `assets` config group. Each entry is selected on the
# command line by its name, e.g. `assets=pretrained`.
assets_store = store(group="assets")

# Plain RID strings, wrapped with a description
assets_store(
    with_description(
        ["XYZ1", "XYZ2"],
        "Model weights and config file.",
    ),
    name="pretrained",
)

# For large files with caching: use AssetSpecConfig(cache=True) instead of a
# bare RID string
assets_store(
    with_description(
        [AssetSpecConfig(rid="XYZ1", cache=True)],
        "Large model weights, cached locally.",
    ),
    name="cached_weights",
)

# REQUIRED: default_asset (plain list, no with_description)
assets_store([], name="default_asset")

Usage:

uv run deriva-ml-run assets=pretrained

Model Configuration (`model_config`)

File: src/configs/<model_name>.py

Purpose: Define model hyperparameters and variants.

from hydra_zen import builds, store
from models.my_model import my_model

# Build base configuration: a hydra-zen config targeting `my_model`, with the
# default hyperparameters (learning_rate, epochs, batch_size) set here.
# NOTE(review): per hydra-zen's `builds` documentation,
# populate_full_signature=True should expose the remaining parameters of
# `my_model` in the config, and zen_partial=True should make instantiation
# produce a partial of `my_model` rather than call it -- confirm against the
# installed hydra-zen version.
MyModelConfig = builds(
    my_model,
    learning_rate=1e-3,
    epochs=10,
    batch_size=64,
    populate_full_signature=True,
    zen_partial=True,
)

# Store bound to the `model_config` group; entries are selected with
# `model_config=<name>`.
model_store = store(group="model_config")

# Register variants. Extra keyword arguments (epochs, learning_rate) override
# the corresponding base values for that named variant only.
model_store(MyModelConfig, name="default_model")  # REQUIRED: group default
model_store(MyModelConfig, epochs=3, name="quick")  # 3 epochs instead of 10
model_store(MyModelConfig, epochs=50, name="extended")  # 50 epochs instead of 10
model_store(MyModelConfig, learning_rate=1e-2, name="fast_lr")  # 1e-2 instead of 1e-3

Usage:

# Use a variant
uv run deriva-ml-run model_config=quick

# Override inline
uv run deriva-ml-run model_config.epochs=25

Workflow (`workflow`)

File: src/configs/workflow.py

Purpose: Define workflow metadata for provenance tracking.

from hydra_zen import store, builds
from deriva_ml.execution import Workflow

# Config targeting `Workflow`, carrying the metadata recorded for provenance
# tracking: a display name, one or more workflow types, and a description.
Cifar10CNNWorkflow = builds(
    Workflow,
    name="CIFAR-10 2-Layer CNN",
    workflow_type=["Training", "Image Classification"],  # a list: several types may apply
    description="Train a CNN on CIFAR-10.",
    populate_full_signature=True,
)

# Store bound to the `workflow` config group.
workflow_store = store(group="workflow")
# REQUIRED: the group's default entry must be named "default_workflow"
workflow_store(Cifar10CNNWorkflow, name="default_workflow")

Required Defaults

Each configuration group must have a default configuration. The naming convention is:

Group	Default Name
`deriva_ml`	`default_deriva`
`datasets`	`default_dataset`
`assets`	`default_asset`
`model_config`	`default_model`
`workflow`	`default_workflow`

If a default is missing, Hydra will fail with a composition error.

Configuration Groups

DerivaML Connection (deriva_ml)

Datasets (datasets)

Assets (assets)

Model Configuration (model_config)

Workflow (workflow)

Required Defaults

DerivaML Connection (`deriva_ml`)

Datasets (`datasets`)

Assets (`assets`)

Model Configuration (`model_config`)

Workflow (`workflow`)