目錄
背景
代碼
總結
背景
之前一篇論文的代碼使用此方式將資料打包後再進行訓練,實際上用這種方式處理資料集能夠減少讀取耗費的時間。但是由於電腦性能有限,不得不將打包檔案的方式替換掉,故也查了下這種方式打包資料的思路,以後備用。代碼未細緻整理,但是親測可用。祝好!
代碼
打包
import os
import struct
def is_image_file(filename):
    """Report whether ``filename`` ends with one of the accepted suffixes.

    The suffixes are read from the global ``IMG_EXTENSIONS``, which must be
    defined before this function is called. Matching is done with
    ``str.endswith`` and is therefore case-sensitive.

    Args:
        filename: File name (or path) to check.

    Returns:
        True as soon as one suffix matches, otherwise False.
    """
    for suffix in IMG_EXTENSIONS:
        if filename.endswith(suffix):
            return True
    return False
def make_dataset(dir):
    """Collect the paths of all accepted files found anywhere under ``dir``.

    The directory tree is traversed with ``os.walk``; every file name accepted
    by ``is_image_file`` is joined with the directory it was found in.

    Args:
        dir: Root directory to scan.

    Returns:
        A list of file paths. Directories are visited in sorted order of
        their path; files inside one directory keep the order ``os.walk``
        reported them in.

    Raises:
        AssertionError: If ``dir`` is not an existing directory.
    """
    assert os.path.isdir(dir), '%s is not a valid directory' % dir
    # os.walk yields one (root, dirnames, filenames) triple per directory;
    # sorting the triples orders them by root path.
    return [
        os.path.join(root, fname)
        for root, _, fnames in sorted(os.walk(dir))
        for fname in fnames
        if is_image_file(fname)
    ]
def pack(out_dir, indir, target_folders):
    """Bundle the files of each target folder into one ``<folder>.bigfile``.

    For every name in ``target_folders`` the files returned by
    ``make_dataset(os.path.join(indir, name))`` are sorted by path and written
    to ``os.path.join(out_dir, name + '.bigfile')`` in this layout::

        count                      struct 'i'
        repeated ``count`` times:
            name length            struct 'i'
            file name              UTF-8 bytes, basename only
            payload length         struct 'i'
            payload                raw file bytes

    All integers use ``struct`` format ``'i'`` (native byte order and size).
    Only the basename is stored, so files with the same name in different
    sub-folders end up with identical names inside the bundle.

    Args:
        out_dir: Directory that receives the ``.bigfile`` outputs. It is
            created if it does not exist yet.
        indir: Directory that contains the folders to pack.
        target_folders: Iterable of folder names located inside ``indir``.

    Raises:
        AssertionError: Propagated from ``make_dataset`` when a target folder
            is not a directory.
        struct.error: If a length does not fit the ``'i'`` format.
    """
    # Create the destination up front; open(..., 'wb') does not create
    # missing parent directories. An empty out_dir means "current directory".
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for target_folder in target_folders:
        curr_indir = os.path.join(indir, target_folder)
        # Full path of the bundle, including its file name.
        curr_out_file = os.path.join(out_dir, '%s.bigfile' % (target_folder))
        image_lists = sorted(make_dataset(curr_indir))

        with open(curr_out_file, 'wb') as wfid:
            # Header: number of files in this bundle.
            wfid.write(struct.pack('i', len(image_lists)))
            for i, img_path in enumerate(image_lists):
                # Length-prefixed file name.
                img_name_bytes = os.path.basename(img_path).encode('utf-8')
                wfid.write(struct.pack('i', len(img_name_bytes)))
                wfid.write(img_name_bytes)
                # Length-prefixed file contents.
                with open(img_path, 'rb') as img_fid:
                    img_bytes = img_fid.read()
                wfid.write(struct.pack('i', len(img_bytes)))
                wfid.write(img_bytes)
                # Progress is reported after every file.
                print('write %d files done' % i)
# File-name suffixes accepted by is_image_file (matched case-sensitively).
# Defined at module level so the helpers also work when this file is imported
# instead of being run as a script.
IMG_EXTENSIONS = [
    '.jpg', '.JPG', '.jpeg', '.JPEG',
    '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP', '.npy'
]

if __name__ == '__main__':
    # Directory that receives the generated .bigfile bundles.
    out_dir = 'C:/Users/Administrator/Desktop/test/bigdata'
    # Directory that contains the folders to be packed.
    indir = 'C:/Users/Administrator/Desktop/test'
    # Names of the folders (inside indir) whose files are packed.
    target_folders = ['image', 'npy']
    pack(out_dir, indir, target_folders)
解包
# -*- coding:utf-8 -*-
import io
import struct
import os
from PIL import Image
import numpy as np
def unpack(file_path, save_path, flag=1):
    """Restore the files stored in a ``.bigfile`` bundle into ``save_path``.

    The bundle is read completely into memory first and written out
    afterwards. Expected layout (all integers read as 4 bytes with ``struct``
    format ``'i'``, native byte order)::

        count
        repeated ``count`` times:
            name length, UTF-8 file name
            payload length, payload bytes

    Args:
        file_path: Path of the ``.bigfile`` to read.
        save_path: Directory the restored files are written to. It is created
            if it does not exist yet.
        flag: ``1`` decodes each payload with ``PIL.Image``, converts it to
            RGB and re-saves it under its stored name (so the output is a
            re-encoded image, not necessarily a byte copy of the input);
            ``0`` loads each payload with ``np.load`` and writes it back with
            ``np.save``. Any other value loads the bundle but writes nothing.

    Raises:
        struct.error: If the file ends before a length field is complete.

    Failures while decoding or saving a single entry are reported on stdout
    and do not stop the remaining entries.
    """
    print('start load bigfile (%0.02f GB) into memory' % (os.path.getsize(file_path) / 1024 / 1024 / 1024))
    with open(file_path, 'rb') as fid:
        img_num = struct.unpack('i', fid.read(4))[0]
        img_names = []
        img_bytes = []
        print('find total %d images' % img_num)
        for i in range(img_num):
            img_name_len = struct.unpack('i', fid.read(4))[0]
            img_names.append(fid.read(img_name_len).decode('utf-8'))
            img_bytes_len = struct.unpack('i', fid.read(4))[0]
            img_bytes.append(fid.read(img_bytes_len))
            if i % 5000 == 0:
                print('load %d images done' % i)
    print('load all %d images done' % img_num)

    # Without this every save below fails when the target directory is new,
    # and the failure would only surface as a per-file error message.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    for index, (img_name, payload) in enumerate(zip(img_names, img_bytes)):
        out_path = os.path.join(save_path, img_name)
        try:
            if flag == 1:  # image payloads
                img = Image.open(io.BytesIO(payload)).convert('RGB')
                img.save(out_path)
            elif flag == 0:  # .npy payloads
                npy = np.load(io.BytesIO(payload))
                np.save(out_path, npy)
        except Exception as exc:
            # Best effort: keep going, but show the actual cause instead of
            # discarding it.
            print('file read error for index %d: %s' % (index, img_name), exc)
if __name__ == '__main__':
    # Bundle built from the image folder, and the directory its contents
    # would be restored to.
    filepath1, save_path1 = (
        'C:/Users/Administrator/Desktop/test/bigdata/image.bigfile',
        'C:/Users/Administrator/Desktop/test/out1',
    )
    # Bundle built from the npy folder, and the directory its contents are
    # restored to.
    filepath2, save_path2 = (
        'C:/Users/Administrator/Desktop/test/bigdata/npy.bigfile',
        'C:/Users/Administrator/Desktop/test/out2',
    )
    # To restore the image bundle instead, call:
    #   unpack(filepath1, save_path1, flag=1)
    # Restore the .npy arrays.
    unpack(filepath2, save_path2, flag=0)