PaddleOCR 識别資料制作
-
- OCR 常見資料集準備
- 生成PaddleOCR識别資料集
- Python常見轉義符号
- ChineseOCR 資料集轉PaddleOCR 資料集訓練格式
以PaddleOCR 為基礎
OCR 常見資料集準備
OCR資料集整理
通用中英文OCR資料集
手寫資料集
垂類多語言
生成PaddleOCR識别資料集
根據描述,街景資料集需要根據真值對圖檔進行裁切(crop)。下載資料集後打開标注,會發現有 illegibility 和 difficult 這兩個字段;根據這些字段的真值去 crop 圖,points 是相對應的坐标。
import json
import cv2
import os
import numpy as np
import re
def get_region(box):
    """Return the axis-aligned bounding box that encloses a set of points.

    Args:
        box: Sequence of ``[x, y]`` points (anything ``np.max`` / ``np.min``
            accept with ``axis=0``).

    Returns:
        Tuple ``(x_max, y_max, x_min, y_min)``; every value is truncated to
        ``int``.
    """
    # Column-wise extremes: index 0 is the x column, index 1 the y column.
    highs, lows = np.max(box, axis=0), np.min(box, axis=0)
    return int(highs[0]), int(highs[1]), int(lows[0]), int(lows[1])
def cv_imread(file_path = ""):
    """Load an image by reading the file's bytes and decoding them in memory.

    The file is read with ``np.fromfile`` and decoded with ``cv2.imdecode``
    rather than opened by ``cv2.imread`` -- presumably so that paths OpenCV
    cannot open directly (e.g. non-ASCII file names) still load; confirm
    against the target platform.

    Args:
        file_path: Path of the image file to read.

    Returns:
        Whatever ``cv2.imdecode`` returns for the file's bytes, decoded with
        the "unchanged" flag.
    """
    raw_bytes = np.fromfile(file_path, dtype=np.uint8)
    # IMREAD_UNCHANGED is the named constant for the flag value -1.
    return cv2.imdecode(raw_bytes, cv2.IMREAD_UNCHANGED)
def cv_imwrite(file_path , frame ):
    """Encode ``frame`` as PNG in memory and write the bytes to ``file_path``.

    The encoding is always PNG, whatever extension ``file_path`` carries.

    Args:
        file_path: Destination path for the encoded image.
        frame: Image array to encode.
    """
    # imencode returns (success_flag, buffer); only the buffer is written out.
    _flag, encoded = cv2.imencode('.png', frame)
    encoded.tofile(file_path)
def protxt(txt):
    """Convert a bracketed, comma-separated number string into a list of ints.

    The first and last characters (the brackets) are discarded; each remaining
    comma-separated token goes through ``float`` and is then truncated to
    ``int``, so ``"[3.7,4]"`` yields ``[3, 4]``.

    Args:
        txt: String such as ``"[12,34]"``.

    Returns:
        List of ints, one per comma-separated token.
    """
    inner = txt[1:-1]
    return [int(float(token)) for token in inner.split(',')]
def find_txt(txt):
    """Return the numbers found inside the first ``[...]`` group of ``txt``.

    Numbers may carry a minus sign and a decimal part; each one is converted
    through ``float`` and truncated to ``int``, matching how ``protxt`` treats
    coordinates (a bare ``\\d+`` would split ``1.5`` into ``1`` and ``5`` and
    silently drop signs).

    Args:
        txt: Text expected to contain at least one ``[...]`` group.

    Returns:
        List of ints from the first bracketed group, or an empty list when
        ``txt`` contains no bracketed group at all.
    """
    groups = re.findall(r'\[(.+?)\]', txt)
    if not groups:
        # Nothing bracketed: report "no numbers" instead of raising IndexError.
        return []
    numbers = re.findall(r'-?\d+(?:\.\d+)?', groups[0])
    return [int(float(number)) for number in numbers]
def get_dict(txt):
    """Parse the annotation column of a detection label line into region dicts.

    ``txt`` is the textual list of records that follows the tab on a label
    line, e.g. ``[{"transcription": "AB", "points": [[1, 2], ...],
    "difficult": false}, ...]``. The text is parsed by string slicing, so each
    record must start with the ``transcription`` key, followed by ``points``.

    Args:
        txt: Annotation string; surrounding whitespace (such as the trailing
            newline kept by ``readlines()``) is ignored.

    Returns:
        List of dicts, one per record, with keys ``"transcription"`` (str),
        ``"points"`` (list of ``[x, y]`` int pairs), ``"difficult"`` (str,
        ``''`` when absent) and, only when present in the record,
        ``"illegibility"`` (str). An empty annotation list yields ``[]``.

    Raises:
        ValueError: If a record lacks the expected ``points`` structure or a
            coordinate is not numeric.
    """
    # Strip whitespace before dropping the outer brackets. Without this a
    # trailing "\n" is removed instead of the closing "]", and the last
    # record's difficult/illegibility value keeps a stray quote character.
    txt = txt.strip()[1:-1]
    if not txt.strip():
        return []

    # Quote bare booleans, mark record boundaries with "$", unify quotes and
    # drop slashes/backslashes.
    txt = (
        txt.replace(': false', ': "false"')
        .replace(': True', ': "True"')
        .replace(': False', ': "False"')
        .replace(': true', ': "true"')
        .replace('}, {', '}${')
        .replace("'", '"')
        .replace('/', '')
        .replace('\\', '')
    )
    # NOTE(review): as written this replaces each parenthesis with itself;
    # presumably it was meant to normalise full-width parentheses -- confirm.
    txt = txt.replace('(','(').replace(')',')')

    # Number of characters before the transcription text in every record.
    prefix_len = len('{"transcription": "')

    res = []
    for record in txt.split('$'):
        d = {"transcription": '', "points": '', "difficult": ''}
        points_key = record.index(', "points"')
        points_start = record.index('[[')
        points_end = record.index(']]')
        # "- 1" drops the closing quote of the transcription value.
        d["transcription"] = record[prefix_len:points_key - 1]

        # "[x1, y1], [x2, y2], ..." -> ["[x1,y1]", "[x2,y2]", ...]
        flat = record[points_start + 1:points_end + 1]
        pairs = flat.replace(' ', '').replace('],[', ']&[').split('&')
        d["points"] = [
            [int(float(value)) for value in pair[1:-1].split(',')]
            for pair in pairs
        ]

        # "+ 8" skips 'cult": "' / 'lity": "'; "-2" drops the closing '"}'.
        if 'difficult' in record:
            d["difficult"] = record[record.index('cult":') + 8:-2]
        if 'illegibility' in record:
            d["illegibility"] = record[record.index('lity":') + 8:-2]
        res.append(d)
    return res
# Crop every text region listed in the detection label file and save each crop
# as a PNG named after its transcription.
file_path = 'det_data/train_det.txt'
with open(file_path, 'r', encoding='UTF-8') as f_obj:
    lines = f_obj.readlines()

for line in lines:
    # Each label line is "<image path relative to det_data>\t<annotations>".
    line = line.split('\t')
    path_img = './det_data/' + line[0]
    # Reset per image so the error message below never hits an unbound name or
    # reports a transcription left over from the previous image.
    transcription = ''
    try:
        img = cv_imread(path_img)
        con_txts = get_dict(line[1])
        for txt in con_txts:
            points = txt["points"]
            transcription = txt["transcription"]
            # The transcription becomes the file name, so double quotes are
            # replaced before it is used in the path.
            transcription = transcription.replace('"', "''")
            # '###' regions are skipped (presumably the dataset's marker for
            # unreadable text -- confirm against the dataset description).
            if transcription != '###':
                x_max, y_max, x_min, y_min = get_region(points)
                # Clamp to 0: a negative start index would wrap around to the
                # other side of the image instead of starting at its edge.
                img_crop = img[max(y_min, 0):y_max, max(x_min, 0):x_max]
                cv_imwrite('E:/rec_data/rec_train_data/' + transcription + '.png', img_crop)
    except Exception as e:
        # Best effort: report the failing image (with the cause) and move on.
        print(path_img + ' ' + transcription + '寫入錯誤', repr(e))
最終效果:
在編寫腳本的時候遇到很多轉義的問題 發現2019ICDAR中文資料集标注使用的是雙引号,2015ICDAR使用的是雙引号,在處理單雙引号的時候遇到了一些轉義的問題。
Python常見轉義符号
轉義字元 | 描述 |
---|---|
\\(在行尾時) | 續行符 |
\\\\ | 反斜杠符号 |
\\' | 單引号 |
\\" | 雙引号 |
\a | 響鈴 |
\b | 倒退(Backspace) |
\e | 轉義 |
\000 | 空 |
\n | 換行 |
\v | 縱向制表符 |
\t | 橫向制表符 |
\r | 回車 |
\f | 換頁 |
\yyy | 八進制數yyy代表的字元,例如:\012代表換行 |
\xyy | 十六進制數yy代表的字元,例如:\x0a代表換行 |
\other | 其它的字元以普通格式輸出 |
ChineseOCR 資料集轉PaddleOCR 資料集訓練格式
ChineseOCR Github 位址
主要是将chineseOCR 數字序列轉換為paddleOCR的文字
chineseOCR 的資料标注 這些數字一一對應着字典裡的字
轉換後
# Convert ChineseOCR label lines ("<image> <id> <id> ...") into PaddleOCR
# recognition labels ("<image>\t<text>"), using char_dict.txt as the lookup
# table from numeric id to character.
with open('char_dict.txt', 'r', encoding='utf-8') as f_obj:
    dict_key = f_obj.readlines()
# Map each zero-based line number of the dictionary file to that line's text.
dict_key = dict(enumerate(dict_key))

with open('data_test.txt', 'r', encoding='utf-8') as f_obj:
    data_labels = f_obj.readlines()

# Open the output once instead of reopening it for every label line. Append
# mode is kept, so existing content in test_label.txt is preserved.
with open('test_label.txt', 'a+', encoding='utf-8') as f:
    for data_label in data_labels:
        # split() with no argument tolerates repeated spaces and the trailing
        # newline; blank lines produce no fields and are skipped.
        label = data_label.split()
        if not label:
            continue
        img_file = label[0]
        img_labels = label[1:]
        label_txt = ''.join(dict_key[int(img_label)] for img_label in img_labels)
        # Dictionary entries still carry the newline kept by readlines().
        label_txt = label_txt.replace('\n', '')
        f.write(img_file + '\t' + label_txt + '\n')