rpmjp/projects/swin-transformer-study/efficientnet_benchmark.py
Completed · May to Dec 2025
Swin Transformer: Empirical Evaluation on Small Fine-Grained Data
A controlled four-family architecture comparison (Swin T/S/B, RegNetY CNNs, EfficientNet B3-B7, ViT-B/16) on the Oxford-IIIT Pet Dataset under RTX 4090 constraints. Three findings: Swin's hierarchical attention transfers cleanly to small datasets (93.8-96.35%), EfficientNet's compound scaling breaks (B3 beats B7 by 8.66 points), and ViT catastrophically fails (7.17%: barely above the 2.7% random baseline).
PyTorch · timm · Swin-T/S/B · RegNetY · EfficientNet B3-B7 · ViT-B/16 · Oxford-IIIT Pet · RTX 4090
Languages
Jupyter Notebook: 98%
Python: 2%
efficientnet_benchmark.py
"""
EfficientNet B3-B7 benchmark: the family that produced the inverse-scaling
result. Each variant is trained at its native input resolution (300² for B3
through 600² for B7) so the compound-scaling design isn't broken by forcing
a common input size.
Result: B3 (11M params, 300²) at 80.58% accuracy *beat* B7 (64M params, 600²)
at 71.92%. See `baseline-analysis.md` for the analysis.
"""
import time
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import timm
from torchvision import transforms
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
DEVICE = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)

# Size of the classification head requested from timm for every variant.
NUM_CLASSES = 37

# One entry per benchmarked variant, smallest first. Each variant keeps the
# native input resolution from the EfficientNet paper; a shared resolution
# would undermine the compound-scaling comparison this benchmark is about.
EFFICIENTNET_MODELS = {
    "EffNet-B3": {
        "timm_name": "tf_efficientnet_b3",
        "image_size": 300,
    },
    "EffNet-B4": {
        "timm_name": "tf_efficientnet_b4",
        "image_size": 380,
    },
    "EffNet-B5": {
        "timm_name": "tf_efficientnet_b5",
        "image_size": 456,
    },
    "EffNet-B6": {
        "timm_name": "tf_efficientnet_b6",
        "image_size": 528,
    },
    "EffNet-B7": {
        "timm_name": "tf_efficientnet_b7",
        "image_size": 600,
    },
}
def build_transforms(image_size: int) -> transforms.Compose:
    """Build the preprocessing pipeline for a square ``image_size`` input.

    The pipeline resizes every image to exactly ``(image_size, image_size)``
    (aspect ratio is not preserved), converts it to a tensor, and normalizes
    each channel with the fixed mean/std listed below.
    """
    target_hw = (image_size, image_size)
    # Per-channel statistics (the standard ImageNet values).
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    steps = [
        transforms.Resize(target_hw),
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]
    return transforms.Compose(steps)
def measure_throughput(model: nn.Module, image_size: int, batch_size: int = 16,
                       n_batches: int = 20) -> float:
    """Return forward-pass throughput in images per second.

    Times ``n_batches`` forward passes over one fixed random batch of shape
    ``(batch_size, 3, image_size, image_size)``. Data loading and
    host-to-device copies are outside the timed region.

    Args:
        model: Network to time. It is switched to eval mode and left there.
        image_size: Height and width of the square synthetic input.
        batch_size: Number of images per forward pass.
        n_batches: Number of timed forward passes. One extra untimed warmup
            pass always runs first.

    Returns:
        ``n_batches * batch_size`` divided by the elapsed seconds.
    """
    model.eval()

    # Create the input on the device the model actually lives on so the
    # forward pass cannot fail on a device mismatch; only parameter-free
    # models fall back to the module-level default device.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else DEVICE
    x = torch.randn(batch_size, 3, image_size, image_size, device=device)

    def _sync() -> None:
        # CUDA kernels are launched asynchronously, so the clock is only
        # meaningful once queued work has finished. On CPU there is nothing
        # to wait for, and torch.cuda.synchronize() raises without CUDA.
        if device.type == "cuda":
            torch.cuda.synchronize(device)

    with torch.no_grad():
        # Warmup: the first pass is slow because of one-time setup such as
        # kernel autotuning, so it is excluded from the measurement.
        model(x)
        _sync()

        # perf_counter is monotonic and high resolution, unlike time.time().
        start = time.perf_counter()
        for _ in range(n_batches):
            model(x)
        _sync()
        elapsed = time.perf_counter() - start

    return (n_batches * batch_size) / elapsed
def _train_one_epoch(model: nn.Module, loader: DataLoader,
                     criterion: nn.Module,
                     optimizer: torch.optim.Optimizer) -> None:
    """Run one optimization pass over ``loader`` with ``model`` in train mode."""
    model.train()
    for images, labels in loader:
        # non_blocking lets the copy overlap with compute when the loader
        # provides pinned memory; it is a no-op otherwise.
        images = images.to(DEVICE, non_blocking=True)
        labels = labels.to(DEVICE, non_blocking=True)
        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()


def _evaluate(model: nn.Module, loader: DataLoader) -> float:
    """Return top-1 accuracy of ``model`` over ``loader`` as a fraction."""
    model.eval()
    correct, seen = 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images = images.to(DEVICE, non_blocking=True)
            labels = labels.to(DEVICE, non_blocking=True)
            correct += (model(images).argmax(1) == labels).sum().item()
            seen += labels.size(0)
    if seen == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError.
        raise ValueError("validation loader yielded no samples")
    return correct / seen


def benchmark_family(train_ds_factory, val_ds_factory, epochs: int = 5,
                     batch_size: int = 16):
    """Train and score every EfficientNet variant in ``EFFICIENTNET_MODELS``.

    All variants share the same hyperparameters (AdamW, lr=1e-3,
    cross-entropy loss, ``batch_size``, ``epochs``); only the input
    resolution differs per variant. Per-epoch validation accuracy is printed
    as training progresses.

    Args:
        train_ds_factory: Callable ``image_size -> Dataset`` for training data.
        val_ds_factory: Callable ``image_size -> Dataset`` for validation data.
        epochs: Number of training epochs per variant.
        batch_size: Batch size for both the training and validation loaders.
            Throughput is measured separately at ``measure_throughput``'s
            default batch size.

    Returns:
        A list with one dict per variant, in ``EFFICIENTNET_MODELS`` order,
        with keys ``model``, ``image_size``, ``params_M`` (parameter count in
        millions), ``peak_val_acc`` (best validation accuracy over all
        epochs, as a fraction) and ``throughput_img_per_s``.

    Raises:
        ValueError: If the validation loader yields no samples.
    """
    results = []
    for label, spec in EFFICIENTNET_MODELS.items():
        image_size = spec["image_size"]
        model = timm.create_model(spec["timm_name"], pretrained=True,
                                  num_classes=NUM_CLASSES).to(DEVICE)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
        train_loader = DataLoader(train_ds_factory(image_size),
                                  batch_size=batch_size, shuffle=True,
                                  num_workers=4, pin_memory=True)
        val_loader = DataLoader(val_ds_factory(image_size),
                                batch_size=batch_size, shuffle=False,
                                num_workers=4, pin_memory=True)

        peak_val_acc = 0.0
        for epoch in range(1, epochs + 1):
            _train_one_epoch(model, train_loader, criterion, optimizer)
            val_acc = _evaluate(model, val_loader)
            peak_val_acc = max(peak_val_acc, val_acc)
            print(f" [{label}] epoch {epoch}: val_acc={val_acc:.4f}")

        throughput = measure_throughput(model, image_size)
        params = sum(p.numel() for p in model.parameters()) / 1e6
        results.append({
            "model": label,
            "image_size": image_size,
            "params_M": params,
            "peak_val_acc": peak_val_acc,
            "throughput_img_per_s": throughput,
        })

        # Drop this variant's weights and optimizer state before the next
        # (larger) variant is created. Otherwise both models are resident at
        # once and peak GPU memory is needlessly inflated.
        del model, optimizer, train_loader, val_loader
        if DEVICE.type == "cuda":
            torch.cuda.empty_cache()
    return results