Data loading
Alexandre Strube // Sabrina Benassou
June 25, 2024
Time | Title |
---|---|
10:00 - 10:15 | Welcome, questions |
10:15 - 11:30 | Data loading |
11:30 - 12:00 | Coffee Break (flexible) |
12:30 - 14:00 | Parallelize Training |
`$PROJECT_projectname` for code (`projectname` is `training2425` in this case)
`$DATA_projectname` for big data (*)
`$SCRATCH_projectname` for temporary files (fast, but not permanent)
`/dev/shm` is a filesystem in RAM - ultra fast ⚡️
ILSVRC
|-- Data/
    `-- CLS-LOC
        |-- test
        |-- train
        |   |-- n01440764
        |   |   |-- n01440764_10026.JPEG
        |   |   |-- n01440764_10027.JPEG
        |   |   |-- n01440764_10029.JPEG
        |   |-- n01695060
        |   |   |-- n01695060_10009.JPEG
        |   |   |-- n01695060_10022.JPEG
        |   |   |-- n01695060_10028.JPEG
        |   |   |-- ...
        |   |...
        |-- val
            |-- ILSVRC2012_val_00000001.JPEG
            |-- ILSVRC2012_val_00016668.JPEG
            |-- ILSVRC2012_val_00033335.JPEG
            |-- ...
imagenet_train.json
{
'ILSVRC/Data/CLS-LOC/train/n03146219/n03146219_8050.JPEG': 524,
'ILSVRC/Data/CLS-LOC/train/n03146219/n03146219_12728.JPEG': 524,
'ILSVRC/Data/CLS-LOC/train/n03146219/n03146219_9736.JPEG': 524,
...
'ILSVRC/Data/CLS-LOC/train/n03146219/n03146219_7460.JPEG': 524,
...
}
imagenet_val.json
def __getitem__(self, idx):
    """Return the ``(image, label)`` pair stored in record batch ``idx``.

    The Arrow file and its IPC reader are created on the first call
    instead of up front.
    NOTE(review): presumably so that every DataLoader worker process
    opens its own file handle -- confirm against the enclosing class.

    Args:
        idx: Index of the record batch to read; the first row of that
            batch supplies both the image bytes and the label.

    Returns:
        A tuple ``(img, target)``: the image decoded to RGB (passed
        through ``self.transform`` when that attribute is truthy) and
        the label converted to a plain Python object.
    """
    # One-time lazy setup: open the file, then layer the reader on it.
    if self.arrowfile is None:
        self.arrowfile = pa.OSFile(self.data_root, 'rb')
        self.reader = pa.ipc.open_file(self.arrowfile)

    batch = self.reader.get_batch(idx)
    encoded = batch['image_data'][0].as_py()
    target = batch['label'][0].as_py()

    # convert() hands back a separate image object (per the Pillow docs),
    # so the result stays usable once both context managers have closed.
    with io.BytesIO(encoded) as buffer, Image.open(buffer) as decoded:
        img = decoded.convert("RGB")

    if not self.transform:
        return img, target
    return self.transform(img), target
def __getitem__(self, idx):
    """Return the ``(image, label)`` pair at position ``idx``.

    The HDF5 file is opened on the first call instead of up front, and
    the ``"images"`` / ``"targets"`` entries of the ``self.split`` group
    are cached on the instance for later calls.
    NOTE(review): presumably so that every DataLoader worker process
    opens its own file handle -- confirm against the enclosing class.

    Args:
        idx: Index into the ``"images"`` and ``"targets"`` datasets.

    Returns:
        A tuple ``(img, target)``: the image decoded to RGB (passed
        through ``self.transform`` when that attribute is truthy) and
        the entry read from ``"targets"`` as-is.
    """
    # One-time lazy setup: open the file read-only and pick the split.
    if self.h5file is None:
        self.h5file = h5py.File(self.train_data_path, 'r')[self.split]
        self.imgs = self.h5file["images"]
        self.targets = self.h5file["targets"]

    encoded = self.imgs[idx]
    target = self.targets[idx]

    # convert() hands back a separate image object (per the Pillow docs),
    # so the result stays usable once both context managers have closed.
    with io.BytesIO(encoded) as buffer, Image.open(buffer) as decoded:
        img = decoded.convert("RGB")

    if not self.transform:
        return img, target
    return self.transform(img), target
/p/scratch/training2402/data/Flickr30K/
and read it using a dataloader?