Examples

Ready-to-run code examples and configuration files for common Dataphy workflows.

Configuration Examples

Basic Episode Augmentation

Simple augmentation config for getting started:

basic_aug.yaml
version: 1
pipeline:
  sync_views: true
  steps:
    - name: color_jitter
      magnitude: 0.1
    - name: cutout
      holes: 1
      size_range: [8, 16]
  background:
    adapter: none
seed: 42
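
To apply this config, point the augment command at it (a minimal sketch; ./datasets/my_dataset is a placeholder for your own dataset path):

# Apply basic_aug.yaml to a single episode
dataphy augment dataset \
  --dataset-path ./datasets/my_dataset \
  --config basic_aug.yaml \
  --episode 0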

Robotics-Optimized Augmentation

Configuration optimized for robotics datasets:

robotics_aug.yaml
version: 1
pipeline:
  # Synchronize augmentations across all camera views
  sync_views: true

  steps:
    # Preserve spatial context for robot-object relationships
    - name: random_crop_pad
      keep_ratio_min: 0.88

    # Small spatial shifts to simulate minor camera movements
    - name: random_translate
      px: 8

    # Lighting variations for different environments
    - name: color_jitter
      magnitude: 0.15

    # Subtle texture effects for sensor realism
    - name: random_conv
      kernel_variance: 0.035

    # Small occlusion patches to simulate partial blocking
    - name: cutout
      holes: 1
      size_range: [8, 16]

  background:
    adapter: none

seed: 42
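
The same command applies here; as a sketch, a small loop (mirroring the workflow script later on this page) runs the config over the first few episodes. The dataset path is again a placeholder:

# Apply robotics_aug.yaml to episodes 0-2
for episode in 0 1 2; do
  dataphy augment dataset \
    --dataset-path ./datasets/my_dataset \
    --config robotics_aug.yaml \
    --episode "$episode"
done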

Python Scripts

Dataset Exploration Script

explore_dataset.py
#!/usr/bin/env python3
"""
Explore a robotics dataset and print detailed information.
"""

from dataphy.dataset.registry import create_dataset_loader

def explore_dataset(dataset_path: str):
    """Explore and print dataset information."""

    # Load dataset (auto-detect format)
    loader = create_dataset_loader(dataset_path)

    # Get dataset info
    info = loader.get_dataset_info()
    print(f"Dataset Overview:")
    print(f"  Format: {info.format}")
    print(f"  Episodes: {info.total_episodes}")
    print(f"  Total timesteps: {info.total_timesteps:,}")

    # List episodes
    episodes = loader.get_episode_ids()
    print(f"\nEpisodes: {len(episodes)}")
    for i, episode_id in enumerate(episodes[:5]):
        print(f"  {i}: {episode_id}")
    if len(episodes) > 5:
        print(f"  ... and {len(episodes) - 5} more")

    # Examine first episode
    if episodes:
        episode_data = loader.get_episode(episodes[0])
        print(f"\nFirst Episode ({episodes[0]}):")
        print(f"  Timesteps: {len(episode_data)}")

        # Show first timestep structure
        if episode_data:
            timestep = episode_data[0]
            print(f"  Action shape: {timestep['action'].shape}")
            print(f"  Observation keys: {list(timestep['observation'].keys())}")

            # Show image cameras if available
            obs = timestep['observation']
            if 'images' in obs:
                cameras = list(obs['images'].keys())
                print(f"  Cameras: {cameras}")
                for cam in cameras:
                    shape = obs['images'][cam].shape
                    print(f"    {cam}: {shape}")

if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        print("Usage: python explore_dataset.py <dataset_path>")
        sys.exit(1)

    dataset_path = sys.argv[1]
    explore_dataset(dataset_path)

Batch Augmentation Script

batch_augment.py
#!/usr/bin/env python3
"""
Apply augmentations to multiple episodes in batch.
"""

from dataphy.dataset.registry import create_dataset_loader, DatasetFormat
from dataphy.dataset.episode_augmentor import EpisodeAugmentor

def batch_augment_episodes(
    dataset_path: str,
    config_file: str,
    episode_indices: list,
    cameras: list | None = None
):
    """Augment multiple episodes in batch."""

    # Setup
    loader = create_dataset_loader(dataset_path, DatasetFormat.LEROBOT)
    augmentor = EpisodeAugmentor(loader)

    # Get available episodes
    episodes = augmentor.list_episodes()
    print(f"Dataset has {len(episodes)} episodes")

    # Validate episode indices
    valid_indices = []
    for idx in episode_indices:
        if 0 <= idx < len(episodes):
            valid_indices.append(idx)
        else:
            print(f"Skipping invalid episode index: {idx}")

    print(f"Will augment {len(valid_indices)} episodes")
    if cameras:
        print(f"Target cameras: {cameras}")
    else:
        print(f"Target cameras: ALL")

    # Process each episode
    for i, episode_idx in enumerate(valid_indices):
        episode_name = episodes[episode_idx]
        print(f"\n[{i+1}/{len(valid_indices)}] Augmenting {episode_name}")

        try:
            # Check available cameras
            available_cameras = augmentor.get_available_cameras(episode_name)
            print(f"  Available cameras: {available_cameras}")

            # Validate requested cameras
            if cameras:
                invalid_cameras = [c for c in cameras if c not in available_cameras]
                if invalid_cameras:
                    print(f"  Invalid cameras for this episode: {invalid_cameras}")
                    continue

            # Apply augmentation
            augmentor.augment_episode(
                episode_id=episode_idx,
                config_file=config_file,
                camera_streams=cameras,
                preserve_original=True
            )
            print(f"  Completed successfully")

        except Exception as e:
            print(f"  Error: {e}")
            continue

    # Show final status
    backups = augmentor.list_backups()
    print(f"\nFinal Status:")
    print(f"  Episodes augmented: {len([idx for idx in valid_indices if episodes[idx] in backups])}")
    print(f"  Total backups: {len(backups)}")

if __name__ == "__main__":
    # Configuration
    dataset_path = "./dataset"
    config_file = "./examples/full_augmentation_pipeline.yaml"

    # Episodes to augment (indices)
    target_episodes = [0, 1, 2, 5, 10]

    # Specific cameras (None = all cameras)
    target_cameras = ["observation.images.webcam"]
    # target_cameras = None  # For all cameras

    batch_augment_episodes(
        dataset_path=dataset_path,
        config_file=config_file,
        episode_indices=target_episodes,
        cameras=target_cameras
    )
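
Because augment_episode is called with preserve_original=True, each augmented episode keeps a backup. A batch run can therefore be undone one episode at a time with the CLI restore command shown later on this page (a sketch; it assumes the CLI operates on the same backups, and episode_000000 is an illustrative episode name):

# Restore one episode from its backup
dataphy augment dataset \
  --dataset-path ./dataset \
  --restore episode_000000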

CLI Usage Examples

Complete Dataset Workflow

dataset_workflow.sh
#!/bin/bash
# Complete dataset workflow from fetch to visualization

DATASET_NAME="giraffe_cleaning"
REPO_ID="carpit680/giraffe_clean_desk2"
CONFIG_FILE="examples/full_augmentation_pipeline.yaml"

echo "Starting complete dataset workflow"

# Step 1: Fetch dataset
echo "\n📥 Step 1: Fetching dataset..."
dataphy dataset fetch \
  --format lerobot \
  --repo-id $REPO_ID \
  --output ./datasets/$DATASET_NAME

# Step 2: Explore dataset
echo "\nStep 2: Exploring dataset..."
dataphy dataset load \
  --dataset-path ./datasets/$DATASET_NAME \
  --info

# Step 3: List episodes
echo "\nStep 3: Listing episodes..."
dataphy augment dataset \
  --dataset-path ./datasets/$DATASET_NAME \
  --list-episodes

# Step 4: Augment first few episodes
echo "\nStep 4: Augmenting episodes..."
for episode in 0 1 2; do
  echo "  Augmenting episode $episode"
  dataphy augment dataset \
    --dataset-path ./datasets/$DATASET_NAME \
    --config $CONFIG_FILE \
    --episode $episode
done

# Step 5: Visualize results
echo "\nStep 5: Visualizing results..."
dataphy dataset visualize \
  --format lerobot \
  --dataset-path ./datasets/$DATASET_NAME \
  --episode 0

echo "\nWorkflow completed!"

Augmentation Parameter Testing

test_augmentations.sh
#!/bin/bash
# Test different augmentation parameters

DATASET_PATH="./datasets/test_dataset"
EPISODE=0

echo "🧪 Testing different augmentation parameters"

# Test 1: Gentle augmentation
echo "\n🔹 Test 1: Gentle augmentation"
cat > gentle_aug.yaml << EOF
version: 1
pipeline:
  sync_views: true
  steps:
    - name: color_jitter
      magnitude: 0.05
    - name: cutout
      holes: 1
      size_range: [4, 8]
  background:
    adapter: none
seed: 42
EOF

dataphy augment dataset \
  --dataset-path "$DATASET_PATH" \
  --config gentle_aug.yaml \
  --episode "$EPISODE"

# Test 2: Aggressive augmentation
echo "\n🔹 Test 2: Aggressive augmentation"
cat > aggressive_aug.yaml << EOF
version: 1
pipeline:
  sync_views: true
  steps:
    - name: random_crop_pad
      keep_ratio_min: 0.75
    - name: color_jitter
      magnitude: 0.25
    - name: cutout
      holes: 2
      size_range: [16, 32]
  background:
    adapter: none
seed: 42
EOF

# Restore the original episode first, then test
dataphy augment dataset \
  --dataset-path "$DATASET_PATH" \
  --restore episode_000000

dataphy augment dataset \
  --dataset-path "$DATASET_PATH" \
  --config aggressive_aug.yaml \
  --episode "$EPISODE"

echo "\nParameter testing completed"

Usage Instructions

Running Python Examples

  1. Ensure Dataphy is installed:
     poetry install --extras rerun
  2. Run the exploration script:
     python examples/explore_dataset.py ./path/to/dataset
  3. Run the batch augmentation script:
     python examples/batch_augment.py

Running CLI Examples

  1. Make the scripts executable:
     chmod +x examples/*.sh
  2. Run the complete workflow:
     ./examples/dataset_workflow.sh
  3. Test augmentation parameters:
     ./examples/test_augmentations.sh

Next Steps

  • API Reference: Explore all available functions
  • Tutorials: Learn step-by-step workflows
  • Experiment: Modify these examples for your specific use case