Swin Transformer: Empirical Evaluation on Small Fine-Grained Data

A controlled four-family architecture comparison (Swin T/S/B, RegNetY CNNs, EfficientNet B3-B7, ViT-B/16) on the Oxford-IIIT Pet Dataset under RTX 4090 constraints. Three findings: Swin's hierarchical attention transfers cleanly to small datasets (93.8-96.35%), EfficientNet's compound scaling breaks down (B3 beats B7 by 8.66 points), and ViT fails catastrophically (7.17%, barely above the 2.7% random baseline).

PyTorch · timm · Swin-T/S/B · RegNetY · EfficientNet B3-B7 · ViT-B/16 · Oxford-IIIT Pet · RTX 4090

Languages

Jupyter Notebook 98%

Python 2%

efficientnet_benchmark.py

"""
EfficientNet B3-B7 benchmark: the family that produced the inverse-scaling
result. Each variant is trained at its native input resolution (300² for B3
through 600² for B7) so the compound-scaling design isn't broken by forcing
a common input size.

Result: B3 (11M params, 300²) at 80.58% accuracy *beat* B7 (64M params, 600²)
at 71.92%. See `baseline-analysis.md` for the analysis.
"""
import time
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import timm
from torchvision import transforms

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)
# Size of the classification head requested from timm for every variant.
NUM_CLASSES = 37


# One row per variant: (display label, timm model name, native input side).
# Native input sizes follow the EfficientNet paper; forcing a common size
# would have defeated the compound-scaling comparison.
EFFICIENTNET_MODELS = {
    label: {"timm_name": timm_name, "image_size": image_size}
    for label, timm_name, image_size in (
        ("EffNet-B3", "tf_efficientnet_b3", 300),
        ("EffNet-B4", "tf_efficientnet_b4", 380),
        ("EffNet-B5", "tf_efficientnet_b5", 456),
        ("EffNet-B6", "tf_efficientnet_b6", 528),
        ("EffNet-B7", "tf_efficientnet_b7", 600),
    )
}


def build_transforms(image_size: int) -> transforms.Compose:
    """Build the preprocessing pipeline for a square input of ``image_size``.

    The pipeline resizes to exactly ``image_size`` x ``image_size`` (both
    sides are given explicitly), converts the image to a tensor, and applies
    a fixed per-channel normalization.
    """
    target_hw = (image_size, image_size)
    # Per-channel statistics (the standard ImageNet mean / std values).
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    steps = [
        transforms.Resize(target_hw),
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]
    return transforms.Compose(steps)


def measure_throughput(model: nn.Module, image_size: int, batch_size: int = 16,
                       n_batches: int = 20) -> float:
    """Return forward-pass throughput in images per second.

    Only repeated forward passes over one fixed random batch are timed, so
    data loading and host-to-device copies are excluded from the figure.

    Args:
        model: Network to time. It is switched to eval mode and left there.
        image_size: Side length of the square, 3-channel random input.
        batch_size: Images per forward pass.
        n_batches: Number of timed forward passes (after one warmup pass).

    Returns:
        ``n_batches * batch_size`` divided by the elapsed seconds.
    """
    model.eval()

    # Build the input on the device the model actually lives on, so the two
    # can never disagree; parameter-free models fall back to the global DEVICE.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else DEVICE
    on_cuda = device.type == "cuda"

    x = torch.randn(batch_size, 3, image_size, image_size, device=device)

    with torch.no_grad():
        # Warmup: first batch always slow due to kernel autotuning
        model(x)
        # CUDA kernels are launched asynchronously, so the clock may only be
        # read after a synchronize. On CPU there is nothing to wait for, and
        # calling torch.cuda.synchronize() there would raise.
        if on_cuda:
            torch.cuda.synchronize(device)

        # perf_counter is monotonic and high resolution, unlike time.time().
        start = time.perf_counter()
        for _ in range(n_batches):
            model(x)
        if on_cuda:
            torch.cuda.synchronize(device)
        elapsed = time.perf_counter() - start

    return (n_batches * batch_size) / elapsed


def _train_one_epoch(model, loader, criterion, optimizer):
    """Run one optimization pass over ``loader`` with the model in train mode."""
    model.train()
    for images, labels in loader:
        # non_blocking only has an effect when the loader yields pinned
        # memory and the target is a CUDA device; otherwise it is a no-op.
        images = images.to(DEVICE, non_blocking=True)
        labels = labels.to(DEVICE, non_blocking=True)
        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()


def _evaluate(model, loader):
    """Return the top-1 accuracy of ``model`` over ``loader`` as a fraction."""
    model.eval()
    correct, seen = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images = images.to(DEVICE, non_blocking=True)
            labels = labels.to(DEVICE, non_blocking=True)
            predictions = model(images).argmax(1)
            correct += (predictions == labels).sum().item()
            seen += labels.size(0)
    if seen == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError.
        raise ValueError(
            "validation loader produced no samples; cannot compute accuracy"
        )
    return correct / seen


def _benchmark_variant(label, spec, train_ds_factory, val_ds_factory,
                       epochs, batch_size, lr, num_workers):
    """Fine-tune one variant and return its result row.

    Keeping the model and optimizer local to this function means they are
    released as soon as it returns, before the next variant is built.
    """
    image_size = spec["image_size"]
    model = timm.create_model(spec["timm_name"], pretrained=True,
                              num_classes=NUM_CLASSES).to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    # Pinned host memory only helps host-to-GPU copies.
    pin_memory = DEVICE.type == "cuda"
    train_loader = DataLoader(train_ds_factory(image_size),
                              batch_size=batch_size, shuffle=True,
                              num_workers=num_workers, pin_memory=pin_memory)
    val_loader = DataLoader(val_ds_factory(image_size),
                            batch_size=batch_size, shuffle=False,
                            num_workers=num_workers, pin_memory=pin_memory)

    peak_val_acc = 0.0
    for epoch in range(1, epochs + 1):
        _train_one_epoch(model, train_loader, criterion, optimizer)
        val_acc = _evaluate(model, val_loader)
        peak_val_acc = max(peak_val_acc, val_acc)
        print(f"  [{label}] epoch {epoch}: val_acc={val_acc:.4f}")

    # Throughput uses measure_throughput's own default batch size rather
    # than ``batch_size``.
    throughput = measure_throughput(model, image_size)
    params = sum(p.numel() for p in model.parameters()) / 1e6
    return {
        "model": label,
        "image_size": image_size,
        "params_M": params,
        "peak_val_acc": peak_val_acc,
        "throughput_img_per_s": throughput,
    }


def benchmark_family(train_ds_factory, val_ds_factory, epochs: int = 5,
                     batch_size: int = 16, *, lr: float = 1e-3,
                     num_workers: int = 4):
    """Train each EfficientNet variant from B3 to B7 with identical
    hyperparameters apart from input size.

    Args:
        train_ds_factory: Callable ``image_size -> Dataset`` for training data.
        val_ds_factory: Callable ``image_size -> Dataset`` for validation data.
        epochs: Training epochs per variant; validation runs after each one.
        batch_size: Batch size for both the training and validation loaders.
        lr: AdamW learning rate, shared by every variant.
        num_workers: Worker processes per DataLoader.

    Returns:
        A list with one dict per variant, in ``EFFICIENTNET_MODELS`` order,
        with keys ``model``, ``image_size``, ``params_M`` (parameter count in
        millions), ``peak_val_acc`` (best validation accuracy over all
        epochs, as a fraction) and ``throughput_img_per_s``.

    Raises:
        ValueError: If a validation loader yields no samples.
    """
    results = []
    for label, spec in EFFICIENTNET_MODELS.items():
        results.append(
            _benchmark_variant(label, spec, train_ds_factory, val_ds_factory,
                               epochs, batch_size, lr, num_workers)
        )
        # The finished variant's tensors are unreferenced now; return the
        # cached blocks so the next, larger variant starts from a clean
        # allocator.
        if DEVICE.type == "cuda":
            torch.cuda.empty_cache()

    return results