Skip to content

Configuration Groups

This guide details each configuration group and how to customize them.

DerivaML Connection (deriva_ml)

File: src/configs/deriva.py

Purpose: Define catalog connection settings.

from hydra_zen import store
from deriva_ml.core.config import DerivaMLConfig

# Every entry registered through this store goes into the "deriva_ml" config
# group; an entry is picked on the command line as `deriva_ml=<name>`.
deriva_store = store(group="deriva_ml")

# Local development server
deriva_store(
    DerivaMLConfig(
        hostname="localhost",
        catalog_id="45",
    ),
    name="local",
)

# Production server
deriva_store(
    DerivaMLConfig(
        hostname="<hostname>",  # placeholder: substitute the real server hostname
        catalog_id="2",
    ),
    name="eye_ai",
)

# REQUIRED: default configuration
# Must be registered under the name "default_deriva". The values here are the
# same as the "local" entry above; change them if the default should point at
# a different catalog.
deriva_store(
    DerivaMLConfig(
        hostname="localhost",
        catalog_id="45",
    ),
    name="default_deriva",
)

Usage:

uv run deriva-ml-run deriva_ml=eye_ai

Datasets (datasets)

File: src/configs/datasets.py

Purpose: Define input dataset specifications.

from hydra_zen import store
from deriva_ml.dataset import DatasetSpecConfig

# Entries registered through this store belong to the "datasets" config group.
datasets_store = store(group="datasets")

# Each configuration is a plain list of DatasetSpecConfig entries, so one
# named configuration can reference one dataset or several.

# Training dataset
training = [
    DatasetSpecConfig(
        rid="ABC1",
        version="1.0.0",
        materialize=True,  # Download files
    ),
]

# Testing dataset
# `materialize` is not passed here, so the field's default applies.
testing = [
    DatasetSpecConfig(rid="ABC2", version="2.0.0"),
]

# Multiple datasets
# Same RIDs/versions as the training and testing entries above, listed together.
combined = [
    DatasetSpecConfig(rid="ABC1", version="1.0.0"),
    DatasetSpecConfig(rid="ABC2", version="2.0.0"),
]

# Register configurations
# The `name` given here is what is used on the command line: `datasets=<name>`.
datasets_store(training, name="training")
datasets_store(testing, name="testing")
datasets_store(combined, name="combined")
# The group's default is the same list object as the "training" configuration.
datasets_store(training, name="default_dataset")  # REQUIRED

DatasetSpecConfig Options:

| Field | Type | Description |
|-------|------|-------------|
| `rid` | `str` | Dataset RID (required) |
| `version` | `str` | Version string (e.g., "1.0.0") |
| `materialize` | `bool` | Download asset files (default: True) |
| `description` | `str` | Human-readable description |

Usage:

uv run deriva-ml-run datasets=testing

Assets (assets)

File: src/configs/assets.py

Purpose: Define input assets like model weights or configuration files.

from hydra_zen import store
from deriva_ml.execution import with_description

# Entries registered through this store belong to the "assets" config group.
assets_store = store(group="assets")

# Plain RID strings
# with_description takes the asset list first and the description text second.
assets_store(
    with_description(
        ["XYZ1", "XYZ2"],
        "Model weights and config file.",
    ),
    name="pretrained",
)

# For large files with caching
# AssetSpecConfig is only needed for this variant, where an asset entry
# carries an option (cache=True) in addition to its RID.
from deriva_ml.asset.aux_classes import AssetSpecConfig
assets_store(
    with_description(
        [AssetSpecConfig(rid="XYZ1", cache=True)],
        "Large model weights, cached locally.",
    ),
    name="cached_weights",
)

# REQUIRED: default_asset (plain list, no with_description)
# The default is an empty list, i.e. no input assets.
assets_store([], name="default_asset")

Usage:

uv run deriva-ml-run assets=pretrained

Model Configuration (model_config)

File: src/configs/<model_name>.py

Purpose: Define model hyperparameters and variants.

from hydra_zen import builds, store
from models.my_model import my_model

# Build base configuration
# The keyword values below are the baseline hyperparameters that every
# registered variant starts from.
MyModelConfig = builds(
    my_model,
    learning_rate=1e-3,
    epochs=10,
    batch_size=64,
    populate_full_signature=True,
    # NOTE(review): zen_partial=True presumably makes the config instantiate to
    # a partially-applied my_model that the runner calls later with its
    # remaining arguments -- confirm against the hydra-zen `builds` docs.
    zen_partial=True,
)

# Entries registered through this store belong to the "model_config" group.
model_store = store(group="model_config")

# Register variants
# A keyword argument passed alongside the config (e.g. epochs=3) sets that one
# field for the named variant; the other fields keep the MyModelConfig values.
model_store(MyModelConfig, name="default_model")  # REQUIRED
model_store(MyModelConfig, epochs=3, name="quick")
model_store(MyModelConfig, epochs=50, name="extended")
model_store(MyModelConfig, learning_rate=1e-2, name="fast_lr")

Usage:

# Use a variant
uv run deriva-ml-run model_config=quick

# Override inline
uv run deriva-ml-run model_config.epochs=25

Workflow (workflow)

File: src/configs/workflow.py

Purpose: Define workflow metadata for provenance tracking.

from hydra_zen import store, builds
from deriva_ml.execution import Workflow

# Config describing the workflow: a display name, one or more workflow type
# labels, and a free-text description.
Cifar10CNNWorkflow = builds(
    Workflow,
    name="CIFAR-10 2-Layer CNN",
    workflow_type=["Training", "Image Classification"],  # a list: two type labels
    description="Train a CNN on CIFAR-10.",
    populate_full_signature=True,
)

# Entries registered through this store belong to the "workflow" config group.
workflow_store = store(group="workflow")
workflow_store(Cifar10CNNWorkflow, name="default_workflow")  # REQUIRED

Required Defaults

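Each configuration group must register a default configuration under the exact name listed below. Note that the default names do not simply mirror the group names (for example, the `datasets` group uses `default_dataset`, singular):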

| Group | Default Name |
|-------|--------------|
| `deriva_ml` | `default_deriva` |
| `datasets` | `default_dataset` |
| `assets` | `default_asset` |
| `model_config` | `default_model` |
| `workflow` | `default_workflow` |

If a default is missing, Hydra will fail with a composition error.