Datasets

Released datasets are versioned collections exported as WebDataset shards and hosted on Hugging Face. Stream them directly into your training pipeline.
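
The snippets on this page assume a client instance is already constructed. A minimal setup sketch, matching the no-argument Client() call used in the training example further down (how credentials are supplied is left to your SDK configuration):

from buildai import Client

# Create the client reused by every example below.
client = Client()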

List datasets

datasets = client.datasets.list()
for ds in datasets.items:
    print(ds.name, ds.clip_count)

Get dataset details

dataset = client.datasets.get("egocentric-100k")
print(dataset.name, dataset.clip_count)

Browse clips

View the actual video and frames before you train on anything.
from IPython.display import Image, display, Video

# Pick a clip
clips = client.clips.list(factory_id="FACTORY_ID", page_size=5)
clip = clips.items[0]

# Watch it
video = client.clips.video(clip.clip_id)
print(video.signed_url)  # paste in browser, or:
display(Video(url=video.signed_url, width=600))

# See individual frames
frames = client.clips.frames(clip.clip_id, page_size=8)
for frame in frames.items:
    print(f"frame {frame.frame_index} @ {frame.timestamp_seconds}s")
    display(Image(url=frame.image_url, width=400))

WebDataset shard URLs

Each dataset is exported as .tar shards. Each shard contains MP4 clips + JSON metadata.
urls = client.datasets.webdataset_urls("egocentric-100k")
print(f"{len(urls)} shards")

# Or page through shard metadata
shards = client.datasets.list_webdataset_artifacts("egocentric-100k", page_size=10)
for shard in shards.items:
    print(shard.shard_path, shard.clip_count, shard.size_bytes)
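
The shard metadata is enough to estimate download size and epoch length before you stream anything. A quick sketch over the page fetched above (loop over additional pages if the dataset has more shards than page_size):

# Rough totals from the first page of shard metadata.
total_clips = sum(s.clip_count for s in shards.items)
total_gb = sum(s.size_bytes for s in shards.items) / 1e9
print(f"{total_clips} clips, ~{total_gb:.1f} GB across {len(shards.items)} shards")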

Shard format

shard-000000.tar
├── clip-00000000.mp4    # H.265 video clip
├── clip-00000000.json   # metadata (factory, worker, timestamps, duration)
├── clip-00000001.mp4
├── clip-00000001.json
└── ...
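
To confirm a shard matches this layout before training, stream just the first one and inspect a couple of samples. A sketch using the same webdataset package as the training example (samples are left undecoded here, so both entries arrive as raw bytes):

import json

import webdataset as wds

urls = client.datasets.webdataset_urls("egocentric-100k")

# Look at the first two samples of the first shard.
for i, sample in enumerate(wds.WebDataset(urls[:1])):
    if i >= 2:
        break
    metadata = json.loads(sample["json"])
    print(sample["__key__"], f"({len(sample['mp4'])} video bytes)")
    print("  metadata keys:", sorted(metadata.keys()))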

Train: fine-tune VideoMAE on factory footage

Classify what a worker is doing: soldering, assembly, inspection, and so on. This example uses decord for video decoding and VideoMAE from Hugging Face.
pip install buildai-sdk webdataset decord transformers torch matplotlib scikit-learn
import io
import json

import torch
import webdataset as wds
from buildai import Client
from decord import VideoReader, cpu
from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor

# --- 1. Get shard URLs from Build AI ---

client = Client()
urls = client.datasets.webdataset_urls("egocentric-100k")
print(f"Training on {len(urls)} shards")

# --- 2. Define action labels ---

ACTIONS = ["soldering", "assembly", "inspection", "packaging", "tool_change", "idle"]
label_map = {a: i for i, a in enumerate(ACTIONS)}

# --- 3. Video decoding + preprocessing ---

processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base")

def decode_sample(sample):
    video_bytes = sample["mp4"]
    metadata = json.loads(sample["json"])

    vr = VideoReader(io.BytesIO(video_bytes), ctx=cpu(0))
    indices = torch.linspace(0, len(vr) - 1, 16).long().tolist()
    frames = [vr[i].asnumpy() for i in indices]

    inputs = processor(frames, return_tensors="pt")
    label = label_map.get(metadata.get("action", "idle"), label_map["idle"])

    return inputs["pixel_values"].squeeze(0), torch.tensor(label)

# --- 4. Build streaming dataloader ---

dataset = (
    wds.WebDataset(urls[:-1], shardshuffle=True)  # keep the last shard for evaluation
    .shuffle(500)
    .decode()
    .map(decode_sample)
    .batched(4)
)
loader = wds.WebLoader(dataset, batch_size=None, num_workers=4)
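
# (Optional) sanity check before training: pull one batch and confirm the
# shapes VideoMAE expects. With the 16-frame, 224x224 preprocessing above,
# pixel_values should come out as (4, 16, 3, 224, 224) and labels as (4,).
# pixel_values, labels = next(iter(loader))
# print(pixel_values.shape, labels.shape)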

# --- 5. Fine-tune ---

model = VideoMAEForVideoClassification.from_pretrained(
    "MCG-NJU/videomae-base",
    num_labels=len(ACTIONS),
    ignore_mismatched_sizes=True,
)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

model.train()
losses = []
for step, (pixel_values, labels) in enumerate(loader):
    outputs = model(pixel_values=pixel_values, labels=labels)
    outputs.loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    losses.append(outputs.loss.item())
    if step % 10 == 0:
        print(f"step {step} — loss: {outputs.loss.item():.4f}")

Visualize training

Loss curve

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 4))
plt.plot(losses, linewidth=0.8)
plt.xlabel("Step")
plt.ylabel("Loss")
plt.title("VideoMAE fine-tuning loss")
plt.tight_layout()
plt.savefig("training_loss.png", dpi=150)
plt.show()

Confusion matrix

Run inference on a held-out shard and plot predictions vs ground truth.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

model.eval()
all_preds, all_labels = [], []

eval_dataset = (
    wds.WebDataset(urls[-1:])  # hold out last shard
    .decode()
    .map(decode_sample)
    .batched(4)
)

with torch.no_grad():
    for pixel_values, labels in wds.WebLoader(eval_dataset, batch_size=None):
        logits = model(pixel_values=pixel_values).logits
        preds = logits.argmax(dim=-1)
        all_preds.extend(preds.tolist())
        all_labels.extend(labels.tolist())

cm = confusion_matrix(all_labels, all_preds, labels=range(len(ACTIONS)))
disp = ConfusionMatrixDisplay(cm, display_labels=ACTIONS)
fig, ax = plt.subplots(figsize=(8, 8))
disp.plot(ax=ax, cmap="Blues", xticks_rotation=45)
plt.title("Action classification — confusion matrix")
plt.tight_layout()
plt.savefig("confusion_matrix.png", dpi=150)
plt.show()
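
For per-class precision and recall to go with the confusion matrix, scikit-learn's classification_report works directly on the lists collected above:

from sklearn.metrics import classification_report

print(classification_report(
    all_labels,
    all_preds,
    labels=range(len(ACTIONS)),
    target_names=ACTIONS,
    zero_division=0,
))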

Sample predictions

import matplotlib.pyplot as plt

model.eval()
samples = wds.WebDataset(urls[-1:]).decode()

fig, axes = plt.subplots(2, 4, figsize=(16, 6))
for idx, sample in enumerate(samples):
    if idx >= 4:
        break
    pixel_values, label = decode_sample(sample)

    with torch.no_grad():
        logits = model(pixel_values=pixel_values.unsqueeze(0)).logits
        pred = logits.argmax(dim=-1).item()

    vr = VideoReader(io.BytesIO(sample["mp4"]), ctx=cpu(0))
    axes[0, idx].imshow(vr[0].asnumpy())
    axes[0, idx].set_title(f"true: {ACTIONS[label]}", fontsize=9)
    axes[0, idx].axis("off")

    axes[1, idx].imshow(vr[len(vr) // 2].asnumpy())
    color = "green" if pred == label else "red"
    axes[1, idx].set_title(f"pred: {ACTIONS[pred]}", fontsize=9, color=color)
    axes[1, idx].axis("off")

plt.suptitle("Sample predictions", fontsize=13)
plt.tight_layout()
plt.savefig("sample_predictions.png", dpi=150)
plt.show()
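
To tie the pieces together, you can classify a clip browsed through the clips API from earlier. A sketch, assuming the signed URL is directly downloadable with urllib and reusing the same preprocessing as decode_sample (FACTORY_ID is a placeholder, as above):

import io
import urllib.request

import torch
from decord import VideoReader, cpu

# Grab a clip via the clips API and download its signed URL.
clip = client.clips.list(factory_id="FACTORY_ID", page_size=1).items[0]
video = client.clips.video(clip.clip_id)
video_bytes = urllib.request.urlopen(video.signed_url).read()

# Same 16-frame sampling + VideoMAE preprocessing as decode_sample above.
vr = VideoReader(io.BytesIO(video_bytes), ctx=cpu(0))
indices = torch.linspace(0, len(vr) - 1, 16).long().tolist()
inputs = processor([vr[i].asnumpy() for i in indices], return_tensors="pt")

model.eval()
with torch.no_grad():
    pred = model(pixel_values=inputs["pixel_values"]).logits.argmax(dim=-1).item()
print(f"{clip.clip_id}: predicted action = {ACTIONS[pred]}")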