天天看點

實例:使用 struct.pack 打包檔案、解包檔案(圖像)

目錄

​​背景​​

​​代碼​​

​​總結​​

背景

之前一篇論文的代碼使用此方式將資料打包後再進行訓練;實際上,這種處理資料集的方式能夠減少讀取資料所耗費的時間。但是由於電腦性能有限,不得不將打包檔案的方式替換掉,故也查了一下這種打包資料的思路,留作以後備用。代碼未細緻整理,但親測可用。祝好!

代碼

打包

import os
import struct

# Default set of filename suffixes accepted by is_image_file(). It lives at
# module level so the function also works when this module is imported
# rather than run as a script; code that runs later may rebind this name to
# use a different set.
IMG_EXTENSIONS = [
    '.jpg', '.JPG', '.jpeg', '.JPEG',
    '.png', '.PNG', '.ppm', '.PPM', '.bmp', '.BMP', '.npy',
]


def is_image_file(filename):
    """Return True if ``filename`` ends with one of ``IMG_EXTENSIONS``.

    The comparison is case-sensitive: only the exact spellings listed in
    ``IMG_EXTENSIONS`` match. An empty extension list matches nothing.
    """
    # str.endswith accepts a tuple of suffixes, so one call covers them all.
    return filename.endswith(tuple(IMG_EXTENSIONS))


def make_dataset(dir):
    """Collect the paths of all accepted files below ``dir``.

    The directory tree is walked recursively with ``os.walk``, which yields
    one ``(root, dirnames, filenames)`` triple per directory. The triples
    are sorted (effectively by ``root``) before their file names are
    visited, and every name accepted by ``is_image_file`` is joined with
    its directory.

    Returns:
        A list of path strings.

    Raises:
        AssertionError: if ``dir`` is not an existing directory.
    """
    assert os.path.isdir(dir), '%s is not a valid directory' % dir

    return [
        os.path.join(folder, name)
        for folder, _, names in sorted(os.walk(dir))
        for name in names
        if is_image_file(name)
    ]


def pack(out_dir, indir, target_folders):
    """Pack the files of each target folder into a single ``.bigfile``.

    For every name in ``target_folders``, the files returned by
    ``make_dataset(indir/<name>)`` are written, in sorted path order, to
    ``out_dir/<name>.bigfile``.

    File layout (every integer is written with ``struct.pack('i', ...)``,
    i.e. a native-order C ``int``):

        file count
        for each file:
            length of the UTF-8 encoded base name
            UTF-8 encoded base name
            length of the file content
            raw file content

    Args:
        out_dir: directory that receives the ``.bigfile`` outputs; it is
            created if it does not exist yet.
        indir: directory that contains the target folders.
        target_folders: iterable of folder names located under ``indir``.
    """
    # Create the destination up front so open(..., 'wb') below does not fail
    # on a missing directory. An empty out_dir means the current directory.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for target_folder in target_folders:
        # Folder holding the files to pack.
        curr_indir = os.path.join(indir, target_folder)
        # Full path (including file name) of the archive to produce.
        curr_out_file = os.path.join(out_dir, '%s.bigfile' % target_folder)
        image_lists = make_dataset(curr_indir)
        image_lists.sort()
        with open(curr_out_file, 'wb') as wfid:
            # Header: number of packed files.
            wfid.write(struct.pack('i', len(image_lists)))
            # Count from 1 so the progress message reports how many files
            # have actually been written.
            for done, img_path in enumerate(image_lists, start=1):
                # Record name: only the base name is stored, not the path.
                img_name_bytes = os.path.basename(img_path).encode('utf-8')
                wfid.write(struct.pack('i', len(img_name_bytes)))
                wfid.write(img_name_bytes)

                # Record payload: the file content, copied verbatim.
                with open(img_path, 'rb') as img_fid:
                    img_bytes = img_fid.read()
                wfid.write(struct.pack('i', len(img_bytes)))
                wfid.write(img_bytes)

                print('write %d files done' % done)


if __name__ == '__main__':
    # Filename suffixes that is_image_file() accepts (case-sensitive).
    IMG_EXTENSIONS = [
        '.jpg',
        '.JPG',
        '.jpeg',
        '.JPEG',
        '.png',
        '.PNG',
        '.ppm',
        '.PPM',
        '.bmp',
        '.BMP',
        '.npy',
    ]

    # Directory where the packed .bigfile outputs are stored.
    out_dir = 'C:/Users/Administrator/Desktop/test/bigdata'
    # Directory that contains the folders to pack.
    indir = 'C:/Users/Administrator/Desktop/test'
    # Names of the folders (under indir) whose files get packed.
    target_folders = ['image', 'npy']

    pack(out_dir=out_dir, indir=indir, target_folders=target_folders)

解包

# -*- coding:utf-8 -*-
import io
import struct
import os
from PIL import Image
import numpy as np

def unpack(file_path, save_path, flag=1):
    """Extract every record of a ``.bigfile`` archive into ``save_path``.

    The archive is read completely into memory first and written out
    afterwards. Expected layout (every integer is read with
    ``struct.unpack('i', ...)`` from 4 bytes):

        record count
        for each record:
            name length, UTF-8 name, payload length, payload bytes

    Args:
        file_path: path of the archive to read.
        save_path: directory that receives the extracted files; it is
            created if it does not exist yet.
        flag: ``1`` decodes each payload as an image with PIL, converts it
            to RGB and saves it under the stored name; ``0`` loads each
            payload with ``np.load`` and writes it back with ``np.save``.
            Any other value extracts nothing.

    A record that fails to decode or save is reported on stdout together
    with the cause and then skipped; the remaining records are still
    processed.
    """
    print('start load bigfile (%0.02f GB) into memory' % (os.path.getsize(file_path) / 1024 / 1024 / 1024))
    img_names = []
    img_bytes = []
    with open(file_path, 'rb') as fid:
        img_num = struct.unpack('i', fid.read(4))[0]
        print('find total %d images' % img_num)
        for i in range(img_num):
            img_name_len = struct.unpack('i', fid.read(4))[0]
            img_names.append(fid.read(img_name_len).decode('utf-8'))
            img_bytes_len = struct.unpack('i', fid.read(4))[0]
            img_bytes.append(fid.read(img_bytes_len))
            if i % 5000 == 0:
                print('load %d images done' % i)
        print('load all %d images done' % img_num)

    # Without this, a missing output directory makes every single save fail
    # and only surfaces as a per-record error message.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    for index, (name, payload) in enumerate(zip(img_names, img_bytes)):
        out_path = os.path.join(save_path, name)
        try:
            if flag == 1:  # payload is an encoded image
                img = Image.open(io.BytesIO(payload)).convert('RGB')
                img.save(out_path)
            elif flag == 0:  # payload is a serialized .npy array
                npy = np.load(io.BytesIO(payload))
                np.save(out_path, npy)
        except Exception as err:
            # Best effort: keep going, but say why this record failed.
            print('file read error for index %d: %s (%s)' % (index, name, err))


if __name__ == '__main__':
    # Archive produced from the image folder, and where to extract it.
    filepath1 = 'C:/Users/Administrator/Desktop/test/bigdata/image.bigfile'
    save_path1 = 'C:/Users/Administrator/Desktop/test/out1'

    # Archive produced from the npy folder, and where to extract it.
    filepath2 = 'C:/Users/Administrator/Desktop/test/bigdata/npy.bigfile'
    save_path2 = 'C:/Users/Administrator/Desktop/test/out2'

    # To extract the image archive instead, call:
    #     unpack(filepath1, save_path1, flag=1)

    # Extract the archive of .npy arrays.
    unpack(file_path=filepath2, save_path=save_path2, flag=0)

總結

繼續閱讀