制作目标检测数据集常用python脚本整理

摘要

制作个人目标检测数据集时,经常需要查找爬取到的损坏图片、修改标注文件中的图片路径等,因此整理了以下 Python 脚本。

代码

查找错误图片

遍历 ./JPEGImages 下的所有文件,把无法识别为图片的文件路径写入 check_error.txt。

import os
import imghdr
from progressbar import ProgressBar

# Walk ./JPEGImages, collect every file path, and record the files that
# imghdr cannot identify as an image in check_error.txt.
path = './JPEGImages'

original_images = sorted(
    os.path.join(root, filename)
    for root, dirs, filenames in os.walk(path)
    for filename in filenames
)
print('num:', len(original_images))

error_images = []
progress = ProgressBar()
# NOTE(review): imghdr is deprecated since Python 3.11 and removed in 3.13;
# this script needs another detector on newer interpreters.
# The report is written inside a `with` block so the file is closed even if
# a file in the data set cannot be read.
with open('check_error.txt', 'w') as f:
    for filename in progress(original_images):
        # imghdr.what() returns None when the image type is not recognised.
        if imghdr.what(filename) is None:
            f.write(filename + '\n')
            error_images.append(filename)

# Summary: number of unrecognised files followed by their paths.
print(len(error_images))
for filename in error_images:
    print(filename)

按顺序重命名标注文件和图片(图片和标注文件分成两部分分别处理,防止两者数量不一致时出错)

两个目录内的文件各自按文件名排序后,从 00001 开始依次编号。

import os

class BatchRename():
    """Renumber the files of the image folder and of the annotation folder.

    Each folder is handled independently: its entries are sorted by name and
    renamed to 00001, 00002, ... with a fixed extension ('.jpg' for
    ``path1``, '.xml' for ``path2``). Because the two folders are numbered
    separately, an image and its annotation keep matching numbers only if
    both folders sort into the same order.
    """

    def __init__(self):
        self.path1 = './JPEGImages'
        self.path2 = './Annotations'

    @staticmethod
    def _renumber(directory, extension):
        """Rename every entry of *directory* to a zero-padded running number.

        The rename happens in two passes. Renaming straight to the final
        name can overwrite a file that has not been processed yet (e.g.
        '0.jpg' -> '00001.jpg' while an old '00001.jpg' still exists), which
        silently loses data. Parking every file under a unique temporary
        name first makes the final names collision-free.
        """
        import uuid  # local import: only needed for the temporary names

        folder = os.path.abspath(directory)
        names = sorted(os.listdir(folder))
        token = uuid.uuid4().hex

        staged = []
        for index, name in enumerate(names, start=1):
            temp_path = os.path.join(folder, '{}_{}.tmp'.format(token, index))
            os.rename(os.path.join(folder, name), temp_path)
            staged.append(temp_path)

        for index, temp_path in enumerate(staged, start=1):
            final_name = str(index).zfill(5) + extension
            os.rename(temp_path, os.path.join(folder, final_name))

    def rename(self):
        """Renumber the images first, then the annotation files."""
        self._renumber(self.path1, '.jpg')
        self._renumber(self.path2, '.xml')


if __name__ == '__main__':
    demo = BatchRename()
    demo.rename()

修改标注文件中的图片路径为本目录下

把每个标注文件中的 filename 和 path 两行改写为指向本目录 JPEGImages 下的对应图片。

import os,sys

class FilesChange():
    """Point the <filename>/<path> lines of every annotation at ./JPEGImages."""

    def __init__(self):
        self.path = './Annotations'

    def Change(self):
        """Rewrite lines 3 and 4 of each XML file in ``self.path``.

        The third line becomes the <filename> entry and the fourth the
        <path> entry, both built from the annotation's own file name
        (zero-padded to five characters) with a '.webp' extension. The image
        folder is taken to be 'JPEGImages' next to the annotation folder.

        Non-XML entries and files with fewer than four lines are skipped.
        """
        annotation_dir = os.path.abspath(self.path)
        # Derive the image folder from the annotation folder instead of
        # slicing a fixed number of characters off the file path, which only
        # worked for nine-character file names such as '00001.xml'.
        image_dir = os.path.join(os.path.dirname(annotation_dir), 'JPEGImages')
        line_replace = 2

        for item in os.listdir(annotation_dir):
            # splitext removes the suffix; str.rstrip('.xml') strips any
            # trailing '.', 'x', 'm', 'l' characters ('max.xml' -> 'ma').
            stem, extension = os.path.splitext(item)
            if extension.lower() != '.xml':
                continue
            stem = stem.zfill(5)
            srcfile = os.path.join(annotation_dir, item)

            with open(srcfile, 'r') as fd:
                lines = fd.readlines()
            if len(lines) <= line_replace + 1:
                print('skip (too few lines):', srcfile)
                continue

            lines[line_replace] = (' <filename>' + stem + '.webp</filename>\n')
            lines[line_replace + 1] = (
                ' <path>' + os.path.join(image_dir, stem + '.webp') + '</path>\n'
            )
            with open(srcfile, 'w') as fd:
                fd.writelines(lines)


if __name__ == '__main__':
    demo = FilesChange()
    demo.Change()

输出没有标注的图片和标注文件名到 txt 文件

在每个标注文件中查找目标类别名(示例为 apple),把不包含该类别的文件名(不含扩展名)写入 NoAppleXml.txt。

import os,sys,re

class AppleSearch():
    """List the annotation files that never mention the label 'apple'."""

    def __init__(self):
        self.path = './Annotations'

    def Search(self):
        """Write the base names of apple-free XML files to NoAppleXml.txt.

        Every '.xml' file in ``self.path`` is read as text; when the
        substring 'apple' does not occur in it, the file name without its
        extension is recorded. The names are printed and written, one per
        line, to 'NoAppleXml.txt' in the current working directory.
        """
        filetxt = 'NoAppleXml.txt'
        annotation_dir = os.path.abspath(self.path)
        txtlist = []

        # Sorted so the report is reproducible; os.listdir order is arbitrary.
        for item in sorted(os.listdir(annotation_dir)):
            # splitext removes the suffix; str.rstrip('.xml') strips any
            # trailing '.', 'x', 'm', 'l' characters ('max.xml' -> 'ma').
            stem, extension = os.path.splitext(item)
            if extension.lower() != '.xml':
                continue
            with open(os.path.join(annotation_dir, item), 'r') as fd:
                if 'apple' not in fd.read():
                    txtlist.append(stem)

        txtpath = os.path.join(os.path.abspath('./'), filetxt)
        with open(txtpath, 'w') as fd:
            for i in txtlist:
                print(i)
                fd.write("{}\n".format(i))


if __name__ == '__main__':
    demo = AppleSearch()
    demo.Search()

从 txt 文件读取要删除的图片和标注文件

读取 NoAppleXml.txt 中的文件名,删除 JPEGImages 和 Annotations 中对应的图片和标注文件。

import os,sys,fileinput,re

class DeleteNoApple():
    """Delete the image/annotation pairs listed in NoAppleXml.txt."""

    def __init__(self):
        self.path1 = './JPEGImages'
        self.path2 = './Annotations'

    def Delete(self):
        """Remove '<name>.webp' and '<name>.xml' for every listed name.

        Names are read, one per line, from './NoAppleXml.txt' in the current
        working directory. Blank lines are ignored. A file that is already
        missing is reported and skipped, so one missing image no longer
        stops the remaining files from being removed.
        """
        filetxt = './NoAppleXml.txt'
        txtpath = os.path.join(os.path.abspath('./'), filetxt)

        # A plain `with open` closes the list file deterministically;
        # fileinput.input() kept module-global state open.
        with open(txtpath) as fd:
            stripped = (line.rstrip('\r\n') for line in fd)
            content = [name for name in stripped if name]

        image_dir = os.path.abspath(self.path1)
        xml_dir = os.path.abspath(self.path2)
        for i in content:
            targets = (
                os.path.join(image_dir, i + '.webp'),
                os.path.join(xml_dir, i + '.xml'),
            )
            for target in targets:
                try:
                    os.remove(target)
                except FileNotFoundError:
                    print('not found, skipped:', target)


if __name__ == '__main__':
    demo = DeleteNoApple()
    demo.Delete()

批量图片格式转换(不跳过 jpg 图片,所有图片统一重新保存,以防止图片编码错误)

import os
from PIL import Image

# Re-encode every file in ./JPEGImages as '<name>.webp' in the same folder.
# Files that already have the target type are deliberately not skipped, so
# that every image goes through the same encoder.
image_dir = './JPEGImages'
imgList = os.listdir(image_dir)
for item in imgList:
    file_name, file_type = os.path.splitext(item)
    # os.listdir returns bare names, so they must be joined with the folder
    # before opening; otherwise the files are looked up in the cwd.
    src_path = os.path.join(image_dir, item)
    dst_path = os.path.join(image_dir, "%s.webp" % (file_name))
    try:
        with Image.open(src_path) as img:
            # The format is inferred from the output extension so that the
            # file contents always agree with the file name. 'jpg' is not a
            # registered Pillow format name and raised KeyError.
            img.save(dst_path)
    except IOError:
        print("picture convert error", item)

批量更改标签名

把所有标注文件中的旧标签名(示例为 Apple)替换为新标签名(示例为 apple)。

import os,sys

class FilesChange():
    """Replace one label text with another in every annotation file."""

    def __init__(self):
        self.path = './Annotations'

    def Change(self, old="Apple", new="apple"):
        """Replace every occurrence of *old* with *new* in each file.

        The replacement is a plain text substitution over the whole file, so
        it also affects occurrences outside <name> elements (for example in
        a path). Files that do not contain *old* are left untouched.

        Args:
            old: text to search for; must not be empty.
            new: replacement text.

        Raises:
            ValueError: if *old* is empty.
        """
        if not old:
            raise ValueError("old must be a non-empty string")

        annotation_dir = os.path.abspath(self.path)
        for item in os.listdir(annotation_dir):
            srcfile = os.path.join(annotation_dir, item)
            with open(srcfile, 'r+') as fd:
                content = fd.read()
                if old not in content:
                    continue
                fd.seek(0)
                fd.write(content.replace(old, new))
                # Without truncate(), a replacement shorter than the original
                # leaves stale bytes at the end of the file.
                fd.truncate()


if __name__ == '__main__':
    demo = FilesChange()
    demo.Change()

查找无对应图片的标签和无对应标签的图片

比较 JPEGImages 与 Annotations 中的文件名(不含扩展名),分别列出只存在于其中一侧的文件。

# -*- coding: utf-8 -*-
import os


path1 = r'./JPEGImages'
path2 = r'./Annotations'


def file_name(image_dir, xml_dir):
    """Report annotations without an image and images without an annotation.

    Files are matched by base name (file name without extension); both
    directories are walked recursively.

    Args:
        image_dir: folder containing the images.
        xml_dir: folder containing the XML annotation files.

    Returns:
        A tuple ``(no_image, no_xml)`` of two sets of base names:
        annotations that have no image, and images that have no annotation.
        The same information is printed, sorted by name.
    """
    jpg_list = [
        os.path.splitext(file)[0]
        for root, dirs, files in os.walk(image_dir)
        for file in files
    ]
    xml_list = [
        os.path.splitext(file)[0]
        for root, dirs, files in os.walk(xml_dir)
        for file in files
    ]
    print(len(jpg_list))

    # Names that have an annotation but no image.
    diff = set(xml_list).difference(jpg_list)
    for name in sorted(diff):
        print("no jpg", name + ".xml")

    # Names that have an image but no annotation.
    diff2 = set(jpg_list).difference(xml_list)
    print(len(diff2))
    for name in sorted(diff2):
        print("no xml", name + ".webp")

    return diff, diff2


if __name__ == '__main__':

    file_name(path1, path2)

使用 imgaug 库图像增广

对每张图片做若干次随机增强(镜像、亮度、模糊、仿射变换),并同步变换标注框,结果保存到 aug_JPEGImages 和 aug_Annotations。

import xml.etree.ElementTree as ET
import pickle
import os
from os import getcwd
import numpy as np
from PIL import Image

import imgaug as ia
from imgaug import augmenters as iaa
from PIL import ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True


ia.seed(1)

def read_xml_annotation(root, image_id):
    """Read all bounding boxes from one annotation file.

    Args:
        root: directory that contains the annotation file.
        image_id: file name of the annotation, including its extension.

    Returns:
        A list with one ``[xmin, ymin, xmax, ymax]`` entry (ints) per
        <object> element; an empty list when the file has no objects.
    """
    # Passing the path lets ElementTree open and close the file itself.
    tree = ET.parse(os.path.join(root, image_id))
    bndboxlist = []

    for obj in tree.getroot().findall('object'):
        bndbox = obj.find('bndbox')
        # Coordinates may be written as floats ('506.0000'); truncate to int.
        bndboxlist.append([
            int(float(bndbox.find(tag).text))
            for tag in ('xmin', 'ymin', 'xmax', 'ymax')
        ])

    return bndboxlist
# (506.0000, 330.0000, 528.0000, 348.0000) -> (520.4747, 381.5080, 540.5596, 398.6603)
def change_xml_annotation(root, image_id, new_target):
    """Write a copy of an annotation with its first bounding box replaced.

    Only the first <object> element is updated. The result is saved next to
    the source file as '<image_id>_aug.xml'.

    Args:
        root: directory that contains '<image_id>.xml'; the copy is written
            to the same directory.
        image_id: annotation name without the '.xml' extension.
        new_target: sequence ``(xmin, ymin, xmax, ymax)`` with the new
            coordinates.
    """
    # Passing the path lets ElementTree open and close the file itself.
    tree = ET.parse(os.path.join(root, str(image_id) + '.xml'))
    bndbox = tree.getroot().find('object').find('bndbox')

    for position, tag in enumerate(('xmin', 'ymin', 'xmax', 'ymax')):
        bndbox.find(tag).text = str(new_target[position])

    tree.write(os.path.join(root, str(image_id) + "_aug" + '.xml'))

def change_xml_list_annotation(root, image_id, new_target, saveroot, id):
    """Write a copy of an annotation with every bounding box replaced.

    The n-th <object> element receives the n-th entry of *new_target*. The
    result is saved as '<image_id>_aug_<id>.xml' inside *saveroot*.

    Args:
        root: directory that contains '<image_id>.xml'.
        image_id: annotation name without the '.xml' extension.
        new_target: list of ``[xmin, ymin, xmax, ymax]`` entries, one per
            <object> element and in the same order.
        saveroot: directory the modified copy is written to.
        id: suffix that distinguishes several copies of the same annotation.

    Raises:
        IndexError: if *new_target* has fewer entries than the file has
            <object> elements.
    """
    # Passing the path lets ElementTree open and close the file itself.
    tree = ET.parse(os.path.join(root, str(image_id) + '.xml'))

    for index, obj in enumerate(tree.getroot().findall('object')):
        bndbox = obj.find('bndbox')
        box = new_target[index]
        for position, tag in enumerate(('xmin', 'ymin', 'xmax', 'ymax')):
            bndbox.find(tag).text = str(box[position])

    tree.write(os.path.join(saveroot, str(image_id) + "_aug_" + str(id) + '.xml'))


def mkdir(path):
    """Create *path* (including parents) unless it already exists.

    Leading/trailing whitespace and trailing backslashes are removed from
    *path* before it is used.

    Returns:
        True when the directory was created, False when the path already
        existed. A status message is printed in both cases.
    """
    path = path.strip()
    path = path.rstrip("\\")

    # Attempt the creation directly instead of checking os.path.exists()
    # first: the check-then-create sequence fails if the directory appears
    # between the two calls.
    try:
        os.makedirs(path)
    except FileExistsError:
        print(path + ' 目录已存在')
        return False

    print(path + ' 创建成功')
    return True

if __name__ == "__main__":

    IMG_DIR = "./JPEGImages"
    XML_DIR = "./Annotations"

    AUG_XML_DIR = "./aug_Annotations"  # output folder for the augmented XML files
    mkdir(AUG_XML_DIR)

    AUG_IMG_DIR = "./aug_JPEGImages"  # output folder for the augmented images
    mkdir(AUG_IMG_DIR)

    AUGLOOP = 2  # number of augmented copies generated per image

    # Augmentation pipeline applied to every image.
    seq = iaa.Sequential([
        iaa.Fliplr(0.5),  # horizontal mirror for half of the images
        iaa.Multiply((1.2, 1.5)),  # brightness change, does not affect boxes
        iaa.GaussianBlur(sigma=(0, 3.0)),
        iaa.Affine(
            translate_px={"x": 15, "y": 15},
            scale=(0.8, 0.95),
            rotate=(-30, 30)
        )  # shift by 15px, scale to 80-95%, rotate up to 30 degrees; affects boxes
    ])

    for root, sub_folders, files in os.walk(XML_DIR):

        for name in files:
            stem, extension = os.path.splitext(name)
            # Ignore anything in the annotation tree that is not an XML file.
            if extension.lower() != '.xml':
                continue

            # Read from the folder os.walk is currently visiting so that
            # annotations in sub-folders are found as well.
            bndbox = read_xml_annotation(root, name)

            # Decode the image once per annotation instead of once per epoch.
            with Image.open(os.path.join(IMG_DIR, stem + '.webp')) as pil_image:
                img = np.array(pil_image)

            for epoch in range(AUGLOOP):
                # A deterministic copy applies the same random transform to
                # the image and to each of its bounding boxes.
                seq_det = seq.to_deterministic()

                # new_bndbox_list: [[x1, y1, x2, y2], ...], one entry per box
                new_bndbox_list = []
                for box in bndbox:
                    bbs = ia.BoundingBoxesOnImage([
                        ia.BoundingBox(x1=box[0], y1=box[1], x2=box[2], y2=box[3]),
                    ], shape=img.shape)

                    bbs_aug = seq_det.augment_bounding_boxes([bbs])[0]
                    moved = bbs_aug.bounding_boxes[0]
                    new_bndbox_list.append([int(moved.x1),
                                            int(moved.y1),
                                            int(moved.x2),
                                            int(moved.y2)])

                # Save the augmented image. A copy is passed so the decoded
                # source array is reused unchanged by the next epoch.
                image_aug = seq_det.augment_images([img.copy()])[0]
                path = os.path.join(AUG_IMG_DIR, str(stem) + "_aug_" + str(epoch) + '.webp')
                Image.fromarray(image_aug).save(path)

                # Save the annotation with the transformed boxes.
                change_xml_list_annotation(root, stem, new_bndbox_list, AUG_XML_DIR, epoch)
                print(str(stem) + "_aug_" + str(epoch) + '.webp')
- Thank you for reading -
  • Copyright: Unless otherwise stated, all posts on this blog are licensed under CC BY-NC-ND 4.0. Please credit the source when reprinting.