# Import Packages
import os
import re
import tensorflow as tf
import pandas as pd

from bs4 import BeautifulSoup as soup
from PIL import Image

import matplotlib.pyplot as plot
import matplotlib.patches as patches


# Sample Image
with Image.open("./Data/images/maksssksksss0.png") as rawImage:
    display(rawImage)


# Sample Annotation
with open("./Data/annotations/maksssksksss0.xml") as annotation:
    rawXML = annotation.read()
    print(rawXML)

<annotation>
    <folder>images</folder>
    <filename>maksssksksss0.png</filename>
    <size>
        <width>512</width>
        <height>366</height>
        <depth>3</depth>
    </size>
    <segmented>0</segmented>
    <object>
        <name>without_mask</name>
        <pose>Unspecified</pose>
        <truncated>0</truncated>
        <occluded>0</occluded>
        <difficult>0</difficult>
        <bndbox>
            <xmin>79</xmin>
            <ymin>105</ymin>
            <xmax>109</xmax>
            <ymax>142</ymax>
        </bndbox>
    </object>
    <object>
        <name>with_mask</name>
        <pose>Unspecified</pose>
        <truncated>0</truncated>
        <occluded>0</occluded>
        <difficult>0</difficult>
        <bndbox>
            <xmin>185</xmin>
            <ymin>100</ymin>
            <xmax>226</xmax>
            <ymax>144</ymax>
        </bndbox>
    </object>
    <object>
        <name>without_mask</name>
        <pose>Unspecified</pose>
        <truncated>0</truncated>
        <occluded>0</occluded>
        <difficult>0</difficult>
        <bndbox>
            <xmin>325</xmin>
            <ymin>90</ymin>
            <xmax>360</xmax>
            <ymax>141</ymax>
        </bndbox>
    </object>
</annotation>


# Data Extraction Function
def Extract(path):
    # Safely open annotation
    with open(path) as annotation:
        # Read annotation with Beautiful Soup
        rawXML = annotation.read()
        bsXML = soup(rawXML,"xml")

        # Stores Metadata in a Dictionary
        meta = {}
        size = bsXML.find('size')
        meta['index']=int(re.search(r'\d+',path).group())
        meta['width'] = int(size.find('width').get_text())
        meta['height'] = int(size.find('height').get_text())
        meta['depth'] = int(size.find('depth').get_text())

        # Stores Labels in a List of Dictionaries
        faces = []
        objects = bsXML.find_all('object')
        for face in objects:
            labels={}
            labels['index']=meta['index']
            labels['label'] = face.find('name').get_text()
            labels['pose'] = face.find('pose').get_text()
            labels['truncated'] = int(face.find('truncated').get_text())
            labels['occluded'] = int(face.find('occluded').get_text())
            labels['difficult'] = int(face.find('difficult').get_text())
            labels['xmin'] = int(face.find('xmin').get_text())
            labels['xmax'] = int(face.find('xmax').get_text())
            labels['ymin'] = int(face.find('ymin').get_text())
            labels['ymax'] = int(face.find('ymax').get_text())
            faces.append(labels)
        return(meta,faces)


# Extract data from sample
meta,faces = Extract("./Data/annotations/maksssksksss0.xml")
print("Metadata: ")
print(meta)
print("Face Objects: ")
print(faces)

Metadata: 
{'index': 0, 'width': 512, 'height': 366, 'depth': 3}
Face Objects: 
[{'index': 0, 'label': 'without_mask', 'pose': 'Unspecified', 'truncated': 0, 'occluded': 0, 'difficult': 0, 'xmin': 79, 'xmax': 109, 'ymin': 105, 'ymax': 142}, {'index': 0, 'label': 'with_mask', 'pose': 'Unspecified', 'truncated': 0, 'occluded': 0, 'difficult': 0, 'xmin': 185, 'xmax': 226, 'ymin': 100, 'ymax': 144}, {'index': 0, 'label': 'without_mask', 'pose': 'Unspecified', 'truncated': 0, 'occluded': 0, 'difficult': 0, 'xmin': 325, 'xmax': 360, 'ymin': 90, 'ymax': 141}]


# Show labels on Image
with Image.open("./Data/images/maksssksksss0.png") as rawImage:
    figure, axis = plot.subplots()
    axis.imshow(rawImage)
    meta,objects = Extract("./Data/annotations/maksssksksss0.xml")
    for face in objects:
        color = 'green' if face['label']=='with_mask' else 'red'
        corner = (face['xmin'],face['ymin'])
        width = face['xmax']-face['xmin']
        height = face['ymax']-face['ymin']
        rect = patches.Rectangle(corner, width, height, linewidth=2, edgecolor=color, facecolor='none')
        axis.add_patch(rect)
    plot.show()


# Collect Samples
imagePath = "./Data/images/"
imagesList = [[int(re.search(r'\d+',os.path.join(imagePath,f)).group()),os.path.join(imagePath,f)] for f in os.listdir(imagePath) if os.path.isfile(os.path.join(imagePath,f))]
images = pd.DataFrame(imagesList,columns=['index','path']).set_index('index')

# Collect Labels
annotationpath = "./Data/annotations/"
metaList=[]
facesList=[]
for f in os.listdir(annotationpath):
    path = os.path.join(annotationpath,f)
    meta, faces = Extract(path)
    metaList.append(meta)
    facesList=facesList+faces
meta = pd.DataFrame(metaList).set_index('index')
labels = pd.DataFrame(facesList).set_index('index')


# Metadata Summary Statistics
meta.describe()


# Explore Image Sizes
groupedMeta = meta.groupby(['height', 'width']).size().reset_index().rename(columns={0:'count'})
print("Most images of the same size:")
print(groupedMeta[groupedMeta['count']==groupedMeta['count'].max()])
plot.scatter(groupedMeta['width'],groupedMeta['height'],c='darkblue',s=3*groupedMeta['count'])
plot.xlabel('width (pixels)')
plot.ylabel('height (pixels)')
plot.title('Sizes of Images in Dataset')
plot.show()

Most images of the same size:
     height  width  count
115     400    301    184


# Labels Summary Statistics
nlabels = len(labels)
nWithMask = len(labels[labels['label']=='with_mask'])
nWithoutMask = len(labels[labels['label']=='without_mask'])
nIncorrectMask = len(labels[labels['label']=='mask_weared_incorrect'])
nOther = nlabels - nWithMask - nWithoutMask - nIncorrectMask
labelTable = pd.DataFrame([
    ["With","{:.1f}".format(100*nWithMask/nlabels)+'%'],
    ["Without","{:.1f}".format(100*nWithoutMask/nlabels)+"%"],
    ["Incorrect","{:.1f}".format(100*nIncorrectMask/nlabels)+"%"]],columns=["Mask Label","Label Classifications ("+str(nlabels)+" Total Labels)"]).set_index("Mask Label")
labelTable


# Explore Number of Labels per Image
groupedLabels = labels.groupby(by='index').size()
plot.hist(groupedLabels,bins=25,color='teal')
plot.xlabel('Labels in Image')
plot.ylabel('Number of Images')
plot.title('Distribution of Labels in Images')
plot.show()


# Look into other label variables
print(labels.groupby(by='pose').size())
labels.describe()

pose
Unspecified    4072
dtype: int64

	width	height	depth
count	853.000000	853.000000	853.0
mean	370.589683	309.289566	3.0
std	56.224049	74.474840	0.0
min	193.000000	156.000000	3.0
25%	301.000000	254.000000	3.0
50%	400.000000	280.000000	3.0
75%	400.000000	400.000000	3.0
max	600.000000	600.000000	3.0

	truncated	occluded	difficult	xmin	xmax	ymin	ymax
count	4072.0	4072.0	4072.0	4072.000000	4072.000000	4072.000000	4072.000000
mean	0.0	0.0	0.0	182.207024	213.356090	85.780697	120.785609
std	0.0	0.0	0.0	104.471254	102.712267	52.571821	70.355098
min	0.0	0.0	0.0	1.000000	8.000000	1.000000	6.000000
25%	0.0	0.0	0.0	96.000000	134.000000	49.000000	73.000000
50%	0.0	0.0	0.0	177.000000	212.000000	75.000000	103.000000
75%	0.0	0.0	0.0	266.000000	292.000000	113.000000	148.000000
max	0.0	0.0	0.0	569.000000	592.000000	330.000000	495.000000

Detection and Classification of People With and Without Masks in Images¶

Exploratory Data Analysis¶

	Label Classifications (4072 Total Labels)
Mask Label
With	79.4%
Without	17.6%
Incorrect	3.0%