# Import Packages
import os
import re
import pandas as pd
from bs4 import BeautifulSoup as soup
from PIL import Image
# Annotation Extraction Function
def Extract(path):
    """Parse one annotation XML file into a list of per-object label dicts.

    Args:
        path: Path to the annotation file. The first run of digits in the
            file name (not in its directories) is used as the image index.

    Returns:
        A list with one dict per ``<object>`` element. Every dict has the
        keys ``index``, ``width``, ``height``, ``depth`` (image metadata,
        repeated for each object), ``label`` (text of the object's
        ``<name>``) and ``xmin``, ``xmax``, ``ymin``, ``ymax`` (ints).
        The list is empty when the file has no ``<object>`` elements.

    Raises:
        AttributeError: If the file name contains no digits, or an expected
            XML element (``size``, ``width``, ``name``, ``xmin``...) is missing.
        ValueError: If a numeric element does not contain an integer.
    """
    # Safely open annotation and read it with Beautiful Soup
    with open(path) as annotation:
        bsXML = soup(annotation.read(), "xml")

    # Derive the index from the file name only: searching the whole path
    # would pick up digits from a directory name (e.g. "./run2/annotations/")
    # and no longer match the index computed from image file names.
    index = int(re.search(r'\d+', os.path.basename(path)).group())

    # Image metadata shared by every object in this annotation
    size = bsXML.find('size')
    width = int(size.find('width').get_text())
    height = int(size.find('height').get_text())
    depth = int(size.find('depth').get_text())

    # One dictionary per annotated object; key order defines the column
    # order of any DataFrame built from the result.
    faces = []
    for face in bsXML.find_all('object'):
        faces.append({
            'index': index,
            'width': width,
            'height': height,
            'depth': depth,
            'label': face.find('name').get_text(),
            'xmin': int(face.find('xmin').get_text()),
            'xmax': int(face.find('xmax').get_text()),
            'ymin': int(face.find('ymin').get_text()),
            'ymax': int(face.find('ymax').get_text()),
        })
    return faces
# Collect Images: one row per file in imagePath, indexed by the first run of
# digits found in the file name.
imagePath = "./Data/images/"
imageList = [
    [int(re.search(r'\d+', f).group()), os.path.join(imagePath, f)]
    for f in os.listdir(imagePath)
    if os.path.isfile(os.path.join(imagePath, f))
]
images = pd.DataFrame(imageList, columns=['index', 'path']).set_index('index')

# Collect Labels: one row per annotated object across all annotation files.
annotationPath = "./Data/annotations/"
labelList = [
    x
    for f in os.listdir(annotationPath)
    for x in Extract(os.path.join(annotationPath, f))
]
labels = pd.DataFrame(labelList)

# Find and remove labels with incorrectly worn mask label or faces smaller than 16x16.
# Both dimensions are checked: a box is dropped if it is narrower OR shorter
# than 16 pixels (the height test previously repeated the width test).
too_narrow = (labels['xmax'] - labels['xmin']) < 16
too_short = (labels['ymax'] - labels['ymin']) < 16
exclude = labels[(labels['label'] == 'mask_weared_incorrect') | too_narrow | too_short].index
labels.drop(exclude, inplace=True)
labels = labels.set_index('index')

# Keep only the images that still have at least one usable label.
images = images.loc[labels.index.unique()]
# Move balanced images to Train (300 with / 300 without) & Validation (100 with / 100 without) Directories.
# For each class the train split is filled first, then validation; crops
# beyond both quotas are skipped.
split_quota = {"train": 300, "validation": 100}
saved = {
    (cls, split): 0
    for cls in ("with_mask", "without_mask")
    for split in split_quota
}

# Image.save() does not create missing directories, so create them up front.
for cls, split in saved:
    os.makedirs(os.path.join("./Data", split, cls), exist_ok=True)

# Series.iteritems() was removed in pandas 2.0; items() is the supported name.
for idx, path in images['path'].items():
    # Nothing more can be saved once every quota is full.
    if all(saved[cls, split] >= split_quota[split] for cls, split in saved):
        break

    # Convert to RGB first, then to single-channel grayscale. The converted
    # image is an independent copy, so it stays usable after the file closes.
    with Image.open(path) as im:
        imGRAY = im.convert('RGB').convert('L')

    # labels.loc[[idx]] (list indexer) always yields a DataFrame, even when
    # the image has a single labelled face.
    faces = labels.loc[[idx]]
    for i, face in enumerate(faces.itertuples(index=False)):
        # Any label other than 'with_mask' is treated as 'without_mask'.
        cls = 'with_mask' if face.label == 'with_mask' else 'without_mask'
        # First split of this class that still has room, or None if both are full.
        split = next(
            (name for name, quota in split_quota.items() if saved[cls, name] < quota),
            None,
        )
        if split is None:
            continue
        box = (int(face.xmin), int(face.ymin), int(face.xmax), int(face.ymax))
        imGRAY.crop(box).save(f"./Data/{split}/{cls}/{idx}_{i}.jpg")
        saved[cls, split] += 1

# Legacy counter names, kept with the values they historically held in case
# later code reads them. NOTE: the names do not match what they count.
train_with = saved['with_mask', 'train']
train_without = saved['with_mask', 'validation']
test_with = saved['without_mask', 'train']
test_without = saved['without_mask', 'validation']