天天看點

通過DarkLabel建構DeepSort标注格式和ReID資料集

上一篇文章推薦了DarkLabel标注軟體,承諾會附上配套的代碼,本文主要分享的是格式轉換的幾個腳本。

先附上腳本位址: https://github.com/pprp/SimpleCVReproduction/tree/master/DarkLabel

先來了解一下為何DarkLabel能生成這麼多格式的資料集,來看看DarkLabel的格式:

frame(從0開始計), 數量, id(從0開始), box(x1,y1,x2,y2), class=null
0,4,0,450,194,558,276,null,1,408,147,469,206,null,2,374,199,435,307,null,3,153,213,218,314,null
1,4,0,450,194,558,276,null,1,408,147,469,206,null,2,374,199,435,307,null,3,153,213,218,314,null
2,4,0,450,194,558,276,null,1,408,147,469,206,null,2,374,199,435,307,null,3,153,213,218,314,null
           

每一幀,每張圖檔上的目标都可以提取到,并且每個目标有bbox、被分配了一個ID、class

這些資訊足以用來建構目标檢測、ReID、跟蹤等資料集。

ps:說明一下,以下腳本都是筆者自己寫的,專用于單類的檢測、跟蹤、重識别的代碼,如果有需要多類的,還需要自己修改多類部分的代碼。

1. DarkLabel轉Detection

這裡筆者寫了一個腳本轉成VOC2007中的xml格式的标注,代碼如下:

import cv2
import os
import shutil
import tqdm
import sys

# Root directory holding the source videos (<name>.mp4) and their DarkLabel
# annotation files (<name>_gt.txt); the Annotations/ and JPEGImages/ outputs
# are written under this directory as well.
root_path = r"I:\Dataset\VideoAnnotation"

def print_flush(msg):
    """Print *msg* on a single, repeatedly-overwritten console line.

    Ends with a carriage return instead of a newline so successive
    progress messages overwrite each other, then flushes stdout so the
    text appears immediately.

    Note: the parameter was originally named ``str`` which shadowed the
    builtin; renamed to ``msg`` (all call sites pass positionally).
    """
    print(msg, end='\r')
    sys.stdout.flush()

def genXML(xml_dir, outname, bboxes, width, height):
    """Write a VOC2007-style XML annotation file.

    Args:
        xml_dir: directory the ``<outname>.xml`` file is written into.
        outname: file stem; also written as ``<filename>`` (with ``.jpg``).
        bboxes: iterable of ``[x1, y1, x2, y2]`` boxes in absolute pixels.
        width, height: image dimensions for the ``<size>`` element.

    Every object is written with the fixed class name ``cow`` — this
    script is single-class by design (see the article note).
    """
    lines = [
        '<annotation>',
        '    <folder>VOC2007</folder>',
        '    <filename>' + outname + '.jpg' + '</filename>',
        '    <size>',
        '        <width>' + str(width) + '</width>',
        '        <height>' + str(height) + '</height>',
        '        <depth>3</depth>',
        '    </size>',
    ]

    for bbox in bboxes:
        x1, y1, x2, y2 = bbox
        lines.extend([
            '    <object>',
            '        <name>cow</name>',
            '        <pose>Unspecified</pose>',
            '        <truncated>0</truncated>',
            '        <difficult>0</difficult>',
            '        <bndbox>',
            '            <xmin>' + str(x1) + '</xmin>',
            '            <ymin>' + str(y1) + '</ymin>',
            '            <xmax>' + str(x2) + '</xmax>',
            '            <ymax>' + str(y2) + '</ymax>',
            '        </bndbox>',
            '    </object>',
        ])

    lines.append('</annotation>')

    # `with` guarantees the handle is closed (the old code leaked it).
    with open(os.path.join(xml_dir, outname + '.xml'), 'w') as xml_file:
        xml_file.write('\n'.join(lines))


def gen_empty_xml(xml_dir, outname, width, height):
    """Write a VOC2007-style XML annotation containing no objects.

    Args:
        xml_dir: directory the ``<outname>.xml`` file is written into.
        outname: file stem, also used for the ``<filename>`` element.
        width, height: image dimensions for the ``<size>`` element.

    NOTE(review): the filename extension here is ``.png`` while genXML
    writes ``.jpg`` — confirm whether this difference is intentional.
    """
    # `with` guarantees the handle is closed (the old code leaked it).
    with open(os.path.join(xml_dir, outname + '.xml'), 'w') as xml_file:
        xml_file.write('<annotation>\n')
        xml_file.write('    <folder>VOC2007</folder>\n')
        xml_file.write('    <filename>' + outname + '.png' + '</filename>\n')
        xml_file.write('    <size>\n')
        xml_file.write('        <width>' + str(width) + '</width>\n')
        xml_file.write('        <height>' + str(height) + '</height>\n')
        xml_file.write('        <depth>3</depth>\n')
        xml_file.write('    </size>\n')
        xml_file.write('</annotation>')


def getJPG(src_video_file, tmp_video_frame_save_dir):
    """Dump every frame of a video to JPEG files.

    Frames are written as ``<append_name>_<n>.jpg`` into
    *tmp_video_frame_save_dir* (created if missing). ``append_name`` is a
    module-level global set by the main loop.

    Returns:
        (width, height) of the video frames.

    Raises:
        IOError: if the video cannot be opened or no frame can be read.
    """
    cap = cv2.VideoCapture(src_video_file)
    if not os.path.exists(tmp_video_frame_save_dir):
        os.makedirs(tmp_video_frame_save_dir)

    frame_cnt = 0
    isrun, frame = cap.read()
    # Fail fast with a clear message instead of crashing on frame.shape
    # when the video cannot be decoded.
    if not isrun or frame is None:
        cap.release()
        raise IOError("cannot read any frame from %s" % src_video_file)
    width, height = frame.shape[1], frame.shape[0]

    while isrun:
        save_name = append_name + "_" + str(frame_cnt) + ".jpg"
        cv2.imwrite(os.path.join(tmp_video_frame_save_dir, save_name), frame)
        frame_cnt += 1
        print_flush("Extracting frame :%d" % frame_cnt)
        isrun, frame = cap.read()

    cap.release()  # release the capture handle (previously leaked)
    return width, height

def delTmpFrame(tmp_video_frame_save_dir):
    """Delete the temporary per-video frame directory if it exists."""
    if os.path.exists(tmp_video_frame_save_dir):
        shutil.rmtree(tmp_video_frame_save_dir)
        # Only report success when something was actually removed
        # (previously the message printed even for a missing directory).
        print('delete %s success!' % tmp_video_frame_save_dir)

def assign_jpgAndAnnot(src_annot_file, dst_annot_dir, dst_jpg_dir,
                       tmp_video_frame_save_dir, width, height):
    """Turn one DarkLabel txt file into VOC XMLs plus matching JPEGs.

    For every annotated frame this writes ``<append_name>_<frame>.xml``
    (via genXML) into *dst_annot_dir* and copies the corresponding frame
    JPEG from *tmp_video_frame_save_dir* into *dst_jpg_dir*.
    ``append_name`` is a module-level global set by the main loop.

    DarkLabel line layout: ``frame,count`` followed by 6 fields per
    object: ``id, x1, y1, x2, y2, class``.
    """
    # `with` closes the file even if an exception is raised mid-loop.
    with open(src_annot_file, "r") as txt_file:
        content = txt_file.readlines()

    for line in content:
        # rstrip is safer than line[:-1], which eats a digit when the
        # final line has no trailing newline.
        items = line.rstrip('\n').split(',')
        frame_id, num_of_cow = items[0], items[1]
        print_flush("Assign jpg and annotion : %s" % frame_id)

        bboxes = []

        for i in range(int(num_of_cow)):
            # per-object record starts at offset 1 + i*6; id sits at +1,
            # the four corner coordinates at +2..+5
            obj_x1, obj_y1 = int(items[1 + i * 6 + 2]), int(items[1 + i * 6 + 3])
            obj_x2, obj_y2 = int(items[1 + i * 6 + 4]), int(items[1 + i * 6 + 5])
            # Clamp the box to the valid image area.
            obj_x1 = max(1, obj_x1)
            obj_y1 = max(1, obj_y1)
            obj_x2 = min(width, obj_x2)
            obj_y2 = min(height, obj_y2)
            bboxes.append([obj_x1, obj_y1, obj_x2, obj_y2])

        genXML(dst_annot_dir, append_name + "_" + str(frame_id), bboxes, width,
               height)
        frame_jpg = append_name + "_" + str(frame_id) + ".jpg"
        shutil.copy(os.path.join(tmp_video_frame_save_dir, frame_jpg),
                    os.path.join(dst_jpg_dir, frame_jpg))


if __name__ == "__main__":
    # Process clips cutout19 .. cutout65 that exist under root_path.
    append_names = ["cutout%d" % i for i in range(19, 66)]

    for append_name in append_names:
        print("processing", append_name)
        src_video_file = os.path.join(root_path, append_name + ".mp4")
        src_annot_file = os.path.join(root_path, append_name + "_gt.txt")

        # Skip clips missing either the video or the annotation file
        # (previously a missing _gt.txt crashed inside assign_jpgAndAnnot).
        if not os.path.exists(src_video_file) or not os.path.exists(src_annot_file):
            continue

        dst_annot_dir = os.path.join(root_path, "Annotations")
        dst_jpg_dir = os.path.join(root_path, "JPEGImages")
        # Ensure output dirs exist before writing XMLs / copying JPGs
        # (previously missing dirs made genXML / shutil.copy fail).
        os.makedirs(dst_annot_dir, exist_ok=True)
        os.makedirs(dst_jpg_dir, exist_ok=True)

        tmp_video_frame_save_dir = os.path.join(root_path, append_name)

        width, height = getJPG(src_video_file, tmp_video_frame_save_dir)

        assign_jpgAndAnnot(src_annot_file, dst_annot_dir, dst_jpg_dir,
                           tmp_video_frame_save_dir, width, height)

        delTmpFrame(tmp_video_frame_save_dir)
           

如果想轉成U版yolo需要的格式可以點選 https://github.com/pprp/voc2007_for_yolo_torch 使用這裡的腳本。

2. DarkLabel轉ReID資料集

ReID資料集其實與分類資料集很相似,最出名的是Market1501資料集,對這個資料集不熟悉的可以先百度一下。簡單來說ReID資料集隻比分類中多了query, gallery的概念,也很簡單。轉換代碼如下:

import os
import shutil
import cv2
import numpy as np
import glob
import sys
import random
"""[summary]
根據視訊和darklabel得到的标注檔案
"""

def preprocessVideo(video_path):
    """Split *video_path* into per-frame JPEG files.

    Frames are written as ``frame_<n>.jpg`` into the module-level global
    ``video_frame_save_path`` (assigned by the main loop before calling
    this — TODO confirm this coupling is intended).

    Returns:
        (width, height) of the video frames.

    Raises:
        IOError: if the video cannot be opened or no frame can be read.
    """
    if not os.path.exists(video_frame_save_path):
        # makedirs (not mkdir) so intermediate directories are created too
        os.makedirs(video_frame_save_path)

    vidcap = cv2.VideoCapture(video_path)
    ok, frame = vidcap.read()
    # Fail fast instead of crashing on frame.shape for unreadable videos,
    # and release the handle on that path (previously leaked).
    if not ok or frame is None:
        vidcap.release()
        raise IOError("cannot read any frame from %s" % video_path)

    height = frame.shape[0]
    width = frame.shape[1]

    cnt_frame = 0
    while ok:
        cv2.imwrite(
            os.path.join(video_frame_save_path, "frame_%d.jpg" % (cnt_frame)),
            frame)
        cnt_frame += 1
        print(cnt_frame, end="\r")
        sys.stdout.flush()
        ok, frame = vidcap.read()
    vidcap.release()
    return width, height


def postprocess(video_frame_save_path):
    """Remove the temporary frame directory created by preprocessVideo.

    A missing directory is silently ignored.
    """
    try:
        shutil.rmtree(video_frame_save_path)
    except FileNotFoundError:
        pass


def extractVideoImgs(frame, video_frame_save_path, coords):
    """Crop the region *coords* = (x1, y1, x2, y2) out of saved frame #*frame*.

    Reads ``frame_<frame>.jpg`` from *video_frame_save_path* and returns
    the cropped sub-image, or None when the frame file cannot be read.
    """
    left, top, right, bottom = coords
    frame_file = os.path.join(video_frame_save_path, "frame_%d.jpg" % (frame))
    full_img = cv2.imread(frame_file)
    if full_img is None:
        return None
    # numpy slicing: rows first (y range), then columns (x range)
    return full_img[top:bottom, left:right]


def bbox_ious(box1, box2):
    """IoU of two axis-aligned boxes given as [x1, y1, x2, y2].

    Bug fix: the intersection is clamped at zero — previously two
    disjoint boxes produced a negative "intersection" area and a
    meaningless negative IoU.
    """
    b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3]
    b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3]

    # Intersection area (0 when the boxes do not overlap)
    inter_w = max(0, min(b1_x2, b2_x2) - max(b1_x1, b2_x1))
    inter_h = max(0, min(b1_y2, b2_y2) - max(b1_y1, b2_y1))
    inter_area = inter_w * inter_h

    # Union area (epsilon guards against division by zero for
    # degenerate zero-area boxes)
    w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1
    w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1
    union_area = (w1 * h1 + 1e-16) + w2 * h2 - inter_area

    return inter_area / union_area


def bbox_iou(box1, box2):
    """IoU of box1 = [x1, y1, x2, y2] and box2 = [a1, b1, a2, b2].

    Bug fix: the intersection is clamped at zero — previously disjoint
    boxes yielded a negative "intersection" and a negative IoU (the main
    loop's background-patch test relied on that broken sign).
    """
    x1, y1, x2, y2 = box1
    a1, b1, a2, b2 = box2

    inter_w = min(a2, x2) - max(a1, x1)
    inter_h = min(b2, y2) - max(b1, y1)
    # Clamp each extent at 0 so non-overlapping boxes give IoU == 0.
    intersection = max(0, inter_w) * max(0, inter_h)

    area_two_box = (x2 - x1) * (y2 - y1) + (a2 - a1) * (b2 - b1)

    return intersection * 1.0 / (area_two_box - intersection)


def restrictCoords(width, height, x, y):
    """Clamp a point into the valid image area [1, width] x [1, height]."""
    clamped_x = min(max(x, 1), width)
    clamped_y = min(max(y, 1), height)
    return clamped_x, clamped_y


if __name__ == "__main__":

    total_cow_num = 0

    root_dir = "./data/videoAndLabel"
    reid_dst_path = "./data/reid"
    done_dir = "./data/done"

    # Sort both lists so the txt/video pairing by index is deterministic
    # (glob order is filesystem-dependent). Assumes matching file stems —
    # TODO confirm every video has exactly one txt and vice versa.
    txt_list = sorted(glob.glob(os.path.join(root_dir, "*.txt")))
    video_list = sorted(glob.glob(os.path.join(root_dir, "*.mp4")))

    for i in range(len(txt_list)):
        txt_path = txt_list[i]
        video_path = video_list[i]

        print("processing:", video_path)

        if not os.path.exists(txt_path):
            continue

        video_name = os.path.basename(video_path).split('.')[0]
        video_frame_save_path = os.path.join(os.path.dirname(video_path),
                                             video_name)

        f_txt = open(txt_path, "r")

        width, height = preprocessVideo(video_path)

        print("done")

        for line in f_txt.readlines():
            bboxes = line.split(',')
            frame_id = int(bboxes[0])

            box_list = []

            # Sample one frame every 30 to avoid near-duplicate crops.
            if frame_id % 30 != 0:
                continue

            num_object = int(bboxes[1])
            for num_obj in range(num_object):
                # Record layout per object: id, x1, y1, x2, y2, class(null)
                obj_id = bboxes[1 + num_obj * 6 + 1]
                obj_x1 = int(bboxes[1 + num_obj * 6 + 2])
                obj_y1 = int(bboxes[1 + num_obj * 6 + 3])
                obj_x2 = int(bboxes[1 + num_obj * 6 + 4])
                obj_y2 = int(bboxes[1 + num_obj * 6 + 5])

                # Clamp to the image BEFORE storing, so the overlap test
                # below sees the same box that is actually cropped.
                obj_x1, obj_y1 = restrictCoords(width, height, obj_x1, obj_y1)
                obj_x2, obj_y2 = restrictCoords(width, height, obj_x2, obj_y2)
                box_list.append([obj_x1, obj_y1, obj_x2, obj_y2])

                specific_object_name = video_name + "_" + obj_id

                # One directory per identity, Market1501-style.
                id_dir = os.path.join(reid_dst_path, specific_object_name)
                if not os.path.exists(id_dir):
                    os.makedirs(id_dir)

                # Crop the identity out of the saved frame.
                img = extractVideoImgs(frame_id, video_frame_save_path,
                                       (obj_x1, obj_y1, obj_x2, obj_y2))

                if img is None or img.shape[0] == 0 or img.shape[1] == 0:
                    print(specific_object_name + " is empty")
                    continue

                img = cv2.resize(img, (256, 256))

                normalizedImg = np.zeros((256, 256))
                img = cv2.normalize(img, normalizedImg, 0, 255,
                                    cv2.NORM_MINMAX)

                cv2.imwrite(
                    os.path.join(
                        id_dir,
                        "%s_%d.jpg" % (specific_object_name, frame_id)), img)

            # ---- sample one random 256x256 background patch per frame ----
            max_w = width - 256
            max_h = height - 256

            # Guard: randint(1, n) raises for n < 1 on small frames.
            if max_w < 1 or max_h < 1:
                continue

            # Random top-left corner for the candidate background patch.
            select_x = random.randint(1, max_w)
            select_y = random.randint(1, max_h)
            rand_box = [select_x, select_y, select_x + 256, select_y + 256]

            # Background patches all go into one "bg" pseudo-identity.
            bg_dir = os.path.join(reid_dst_path, "bg")
            if not os.path.exists(bg_dir):
                os.makedirs(bg_dir)

            iou_list = [bbox_iou(cow_box, rand_box) for cow_box in box_list]

            # Keep the patch only when it overlaps no object box.
            # Bug fix: the old test `np.array(iou_list).all() < 0` compared
            # a bool with 0 and was always False, so no background patch
            # was ever saved.
            if all(iou <= 0 for iou in iou_list):
                img = extractVideoImgs(frame_id, video_frame_save_path,
                                       rand_box)
                # Bug fix: the old message used specific_object_name, which
                # is unbound when the frame has no objects.
                if img is None or img.shape[0] == 0 or img.shape[1] == 0:
                    print("bg of %s frame %d is empty" % (video_name, frame_id))
                    continue
                normalizedImg = np.zeros((256, 256))
                img = cv2.normalize(img, normalizedImg, 0, 255,
                                    cv2.NORM_MINMAX)
                cv2.imwrite(
                    os.path.join(bg_dir,
                                 "bg_%s_%d.jpg" % (video_name, frame_id)),
                    img)

        f_txt.close()
        postprocess(video_frame_save_path)
        # Move processed inputs aside so reruns skip them.
        shutil.move(video_path, done_dir)
        shutil.move(txt_path, done_dir)
           

資料集配套代碼在: https://github.com/pprp/reid_for_deepsort

3. DarkLabel轉MOT16格式

其實DarkLabel标注得到資訊和MOT16是幾乎一緻的,隻不過需要轉化一下,腳本如下:

import os
'''
gt.txt:
---------
frame(從1開始計), id, box(left top w, h),ignore=1(不忽略), class=1(從1開始),覆寫=1), 
1,1,1363,569,103,241,1,1,0.86014
2,1,1362,568,103,241,1,1,0.86173
3,1,1362,568,103,241,1,1,0.86173
4,1,1362,568,103,241,1,1,0.86173

cutout24_gt.txt
---
frame(從0開始計), 數量, id(從0開始), box(x1,y1,x2,y2), class=null
0,4,0,450,194,558,276,null,1,408,147,469,206,null,2,374,199,435,307,null,3,153,213,218,314,null
1,4,0,450,194,558,276,null,1,408,147,469,206,null,2,374,199,435,307,null,3,153,213,218,314,null
2,4,0,450,194,558,276,null,1,408,147,469,206,null,2,374,199,435,307,null,3,153,213,218,314,null
'''


def xyxy2xywh(x):
    """Convert a box from corner format to center format.

    Input:  ``[x1, y1, x2, y2]`` (top-left and bottom-right corners).
    Output: ``[cx, cy, w, h]`` — note this is the YOLO-style CENTER
    coordinate plus size, not a top-left box.
    """
    return [
        (x[0] + x[2]) / 2,  # center x
        (x[1] + x[3]) / 2,  # center y
        x[2] - x[0],        # width
        x[3] - x[1],        # height
    ]

def process_darklabel(video_label_path, mot_label_path):
    """Convert a DarkLabel annotation file to a MOT16-style ``gt.txt``.

    DarkLabel line: ``frame,count`` then 6 fields per object
    ``id,x1,y1,x2,y2,class`` — frame and id are 0-based, boxes are
    corner coordinates.
    MOT16 line: ``frame,id,bb_left,bb_top,bb_width,bb_height,1,1,1`` —
    frame and id are 1-based, boxes are top-left corner plus size
    (see the module docstring's gt.txt sample).

    Bug fix: the old code converted the box through a YOLO-style
    xyxy2xywh and therefore wrote the box CENTER into the bb_left/bb_top
    columns; MOT16 expects the top-left corner.
    """
    # `with` closes both files even if parsing raises mid-way.
    with open(video_label_path, "r") as f_in, open(mot_label_path, "w") as f_out:
        for line in f_in:
            # rstrip is safer than line[:-1], which eats a digit when the
            # final line has no trailing newline.
            fields = line.rstrip('\n').split(',')
            if len(fields) < 2 or fields[0] == '':
                continue  # skip blank or malformed lines

            frame_id = int(fields[0]) + 1  # MOT frames are 1-based
            total_num = int(fields[1])

            base = 2  # first object record starts after frame and count

            for i in range(total_num):
                _id = int(fields[base + i * 6]) + 1  # MOT ids are 1-based
                box_x1 = int(fields[base + i * 6 + 1])
                box_y1 = int(fields[base + i * 6 + 2])
                box_x2 = int(fields[base + i * 6 + 3])
                box_y2 = int(fields[base + i * 6 + 4])

                # left, top, width, height + fixed flag/class/visibility
                f_out.write("%d,%d,%d,%d,%d,%d,1,1,1\n" % (
                    frame_id, _id, box_x1, box_y1,
                    box_x2 - box_x1, box_y2 - box_y1))

if __name__ == "__main__":
    root_dir = "./data/videosample"

    # Each sample lives in its own sub-directory containing
    # <item>.mp4 and <item>_gt.txt; gt.txt is written alongside them.
    for item in os.listdir(root_dir):
        full_path = os.path.join(root_dir, item)

        # Skip stray files at the top level.
        if not os.path.isdir(full_path):
            continue

        video_label_path = os.path.join(full_path, item + "_gt.txt")
        mot_label_path = os.path.join(full_path, "gt.txt")

        # Skip directories without a DarkLabel annotation file
        # (previously this crashed opening a missing <item>_gt.txt).
        if not os.path.exists(video_label_path):
            continue

        process_darklabel(video_label_path, mot_label_path)
           

以上就是DarkLabel轉各種資料集格式的腳本了,DarkLabel還是非常友善的,可以快速建構自己的資料集。通常兩分鐘的視訊可以生成2880張之多的圖檔,但是在目标檢測中并不推薦将所有的圖檔都作為訓練集,因為前後幀之間差距太小了,幾乎是一模一樣的。這種資料會導緻訓練速度很慢、泛化能力變差。

有兩種解決方案:

  • 可以選擇隔幾幀選取一幀作為資料集,比如每隔10幀作為資料集。具體選擇多少作為間隔還是具體問題具體分析,如果視訊中變化目标變化較快,可以适當縮短間隔;如果視訊中大部分都是靜止對象,可以适當增大間隔。
  • 還有一種更好的方案是:對原視訊用ffmpeg提取關鍵幀,将關鍵幀的内容作為資料集。關鍵幀和關鍵幀之間的差距比較大,适合作為目标檢測資料集。

代碼改變世界