implementation for 3rd individual project

2020-05-24 13:56:26 +00:00 · 2020-05-24 13:56:26 +00:00 · b023049f27
parent 752b8866c7
commit b023049f27
3 changed files with 316 additions and 0 deletions
--- a/ClassificationGarbage.rar
+++ b/ClassificationGarbage.rar
--- a/Report_Klaudia_Przybylska.md
+++ b/Report_Klaudia_Przybylska.md
@ -0,0 +1,85 @@
+# Report - Individual Project Klaudia Przybylska
+## General information
+In our project, our agent - a garbage truck - collects trash from dumpsters on the grid and then brings it to the garbage dump. However, to make sure that the trash wasn't sorted incorrectly or mixed on the way because the road was bumpy, the waste is checked again before the truck is emptied and is sorted accordingly.
+The program uses a Random Forest Classifier to recognize five types of rubbish:
+* cardboard
+* glass
+* metal
+* paper
+* plastic
+Before running the program it is obligatory to unpack "Garbage classifier.rar" and "ClassificationGarbage.rar".
+## Extracting information from images
+In order to use Random Forest Classifier to classify pictures, I used three global feature descriptors:
+* Hu Moments - responsible for capturing information about shapes, because they are computed from the intensity and position of pixels. They are invariant to image transformations such as translation, scaling and rotation (unlike raw moments or central moments).
+```
+def hu_moments(image):
+    """Return the Hu moments of a BGR image as a flat feature vector."""
+    # Moments are computed on a single-channel image, so convert to grayscale first.
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    moments = cv2.moments(gray)
+    # Flatten the result of cv2.HuMoments so it can be stacked with the other features.
+    huMoments = cv2.HuMoments(moments).flatten()
+    return huMoments
+```
+* Color histogram - representation of the distribution of colors in an image.
+```
+def histogram(image, mask=None):
+    """Return the normalized, flattened 8x8x8 HSV color histogram of a BGR image."""
+    # NOTE: `mask` is accepted but not forwarded - None is passed to cv2.calcHist below.
+    image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
+    # 8 bins per channel over all three HSV channels, each binned over the range 0-256.
+    hist  = cv2.calcHist([image], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
+    # Normalize in place (source and destination are the same array).
+    cv2.normalize(hist, hist)
+    histogram = hist.flatten()
+    return histogram
+```
+* Haralick Texture is used to quantify an image based on texture (the consistency of patterns and colors in an image).
+```
+def haralick(image):
+    """Return the Haralick texture features of a BGR image, averaged over axis 0."""
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    # mahotas returns a 2-D array; presumably one row per direction - the rows
+    # are averaged into a single feature vector.
+    haralick = mahotas.features.haralick(gray).mean(axis=0)
+    return haralick
+```
+* All three features are then stacked into one feature vector per image, which is used for training the classifier and, in the same way, for testing it.
+```
+allFeatures = np.hstack([histo, hara, huMoments])
+```
+## Creating test and training sets
+The data is divided between two sets: the training set contains 80% of all data and the test set the remaining 20%. Images are randomly shuffled before the split.
+```
+allFileNames = os.listdir(sourceDir)
+np.random.shuffle(allFileNames)
+trainingFileNames, testFileNames = np.split(np.array(allFileNames), [int(len(allFileNames) * (1 - testRatio))])
+```
+## Implementation
+Functions in garbageDumpSorting.py:
+* createSets - divides images between test and training set. This function should be run only once, unless the folders with training and test set are removed,
+```
+trainingFileNames, testFileNames = np.split(np.array(allFileNames), [int(len(allFileNames) * (1 - testRatio))])
+```
+* hu_moments, haralick, histogram - calculate global feature descriptors,
+* processTrainData, processTestData - both work in the same way: they iterate over the files in the train or test directory, collect the features as a matrix and then save the results to h5 files. It is recommended to run them only once, as they take some time to finish.
+```
+allFeatures = np.hstack([histo, hara, huMoments])
+```
+* trainAndTest - creates classifier, trains it and scores it,
+```
+clf  = RandomForestClassifier(n_estimators=100, max_depth=15, random_state=9)
+```
+* classifyImage - predicts what kind of garbage is visible on a single image,
+```
+prediction = clf.predict(features)[0]
+```
+* sortDump - checks what kinds of trash are inside the garbage truck and their quantity, empties the garbage truck and sorts its contents on the garbage dump.
+
+## Changes in common part
+I created a class `Dump` (in the `garbageDump` module) in which I store information about the quantity of trash present on the garbage dump. I had to add a small method to the `GarbageTruck` class in order to remove waste from the garbage truck. In main, I initialize the garbage dump and, at the end, display its contents.
+
+## Libraries
+The following libraries are required to run the program:
+```
+import os
+import numpy as np
+import shutil
+import cv2
+import mahotas
+import h5py
+from sklearn.preprocessing import LabelEncoder
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.ensemble import RandomForestClassifier
+import random
+```
--- a/garbageDumpSorting.py
+++ b/garbageDumpSorting.py
@ -0,0 +1,231 @@
+#Creating training and test set
+import os
+import numpy as np
+import shutil
+#Feature descriptors
+import cv2
+import mahotas
+#saving data
+import h5py
+from sklearn.preprocessing import LabelEncoder
+from sklearn.preprocessing import MinMaxScaler
+#classifier
+from sklearn.ensemble import RandomForestClassifier
+#other
+import random
+from models.Garbagetruck import GarbageTruck
+from models.garbageDump import Dump
+
+#https://www.kaggle.com/asdasdasasdas/garbage-classification - dataset
+
+
+def createSets():
+    rootDir = 'ClassificationGarbage'
+    typesDir = ['/cardboard', '/glass', '/metal', '/paper', '/plastic']
+    testRatio = 0.2
+
+    for cls in typesDir:
+        os.makedirs(rootDir + '/trainSet' + cls)
+        os.makedirs(rootDir + '/testSet' + cls)
+        sourceDir = rootDir + cls
+        allFileNames = os.listdir(sourceDir)
+        np.random.shuffle(allFileNames)
+        trainingFileNames, testFileNames = np.split(np.array(allFileNames), [int(len(allFileNames) * (1 - testRatio))])
+        trainingFileNames = [sourceDir +'/' + name for name in trainingFileNames.tolist()]
+        testFileNames = [sourceDir +'/' + name for name in testFileNames.tolist()]
+        print(cls + ':')
+        print('Total images: ', len(allFileNames))
+        print('Training: ', len(trainingFileNames))
+        print('Testing: ', len(testFileNames))
+
+        for name in trainingFileNames:
+            shutil.copy(name, rootDir +'/trainSet' + cls)
+        for name in testFileNames:
+            shutil.copy(name, rootDir +'/testSet' + cls)
+        print("Images copied.")
+
+
+def processTrainData():
+    trainTypes = os.listdir('ClassificationGarbage/trainSet')
+    trainTypes.sort()
+    features = []
+    types = []
+    trainDir = 'ClassificationGarbage/trainSet/'
+    size = tuple((500, 500))
+
+    #process data
+
+    for type in trainTypes:
+        dir = os.path.join(trainDir, type)
+        currentType = type
+        print("Processing " + type + "...")
+        for imagename in os.listdir(dir):
+            file = dir + "/" + imagename
+            image = cv2.imread(file)
+            image = cv2.resize(image, size)
+            #Global features
+            huMoments = hu_moments(image)
+            hara = haralick(image)
+            histo = histogram(image)
+            allFeatures = np.hstack([histo, hara, huMoments])
+            types.append(currentType)
+            features.append(allFeatures)
+        print("Done.")
+    print("All processed.")
+    print("Training...")
+
+    #save data
+
+    h5Data = 'output/data.h5'
+    h5Types = 'output/types.h5'
+
+    targetNames = np.unique(types)
+    le = LabelEncoder()
+    target = le.fit_transform(types)
+    scaler = MinMaxScaler(feature_range=(0, 1))
+    rescaledFeatures = scaler.fit_transform(features)
+
+    fileData = h5py.File(h5Data, 'w')
+    fileData.create_dataset('dataset_1', data=np.array(rescaledFeatures))
+    fileTypes = h5py.File(h5Types, 'w')
+    fileTypes.create_dataset('dataset_1', data=np.array(target))
+    fileData.close()
+    fileTypes.close()
+
+
+def processTestData():
+    trainTypes = os.listdir('ClassificationGarbage/trainSet')
+    trainTypes.sort()
+    testDir = 'ClassificationGarbage/testSet/'
+    size = tuple((500, 500))
+    testTypes = []
+    testFeatures = []
+    print("Testing...")
+
+    #process data
+
+    for type in trainTypes:
+        dir = os.path.join(testDir, type)
+        currentType = type
+        for imagename in os.listdir(dir):
+            file = dir + "/" + imagename
+            image = cv2.imread(file)
+            image = cv2.resize(image, size)
+            #Global features
+            huMoments = hu_moments(image)
+            hara = haralick(image)
+            histo = histogram(image)
+            allFeatures = np.hstack([histo, hara, huMoments])
+            testTypes.append(currentType)
+            testFeatures.append(allFeatures)
+
+    #save data
+
+    h5TestData = 'output/testdata.h5'
+    h5TestTypes = 'output/testtypes.h5'
+
+    targetNames = np.unique(testTypes)
+    le = LabelEncoder()
+    target = le.fit_transform(testTypes)
+    scaler = MinMaxScaler(feature_range=(0, 1))
+    rescaledFeatures = scaler.fit_transform(testFeatures)
+
+    fileTestData = h5py.File(h5TestData, 'w')
+    fileTestData.create_dataset('dataset_1', data=np.array(rescaledFeatures))
+    fileTestTypes = h5py.File(h5TestTypes, 'w')
+    fileTestTypes.create_dataset('dataset_1', data=np.array(target))
+    fileTestData.close()
+    fileTestTypes.close()
+
+
+
+
+def trainAndTest():
+    h5Data = 'output/data.h5'
+    h5Types = 'output/types.h5'
+    h5TestData = 'output/testdata.h5'
+    h5TestTypes = 'output/testtypes.h5'
+
+    #import train data
+    fileData = h5py.File(h5Data, 'r')
+    fileTypes = h5py.File(h5Types, 'r')
+    features = fileData['dataset_1']
+    types = fileTypes['dataset_1']
+    allFeatures = np.array(features)
+    allTypes = np.array(types)
+    fileData.close()
+    fileTypes.close()
+
+    # create model
+    clf  = RandomForestClassifier(n_estimators=100, random_state=9)
+    clf.fit(allFeatures, allTypes)
+
+    #import test data
+    fileTestData = h5py.File(h5TestData, 'r')
+    fileTestTypes = h5py.File(h5TestTypes, 'r')
+    features = fileTestData['dataset_1']
+    types = fileTestTypes['dataset_1']
+    allFeatures = np.array(features)
+    allTypes = np.array(types)
+    fileTestData.close()
+    fileTestTypes.close()
+
+    #Rfc score
+    print("Random Forest Classifier score:")
+    print(clf.score(allFeatures, allTypes))
+    return clf
+
+
+#global features
+
+def hu_moments(image):
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    moments = cv2.moments(gray)
+    huMoments = cv2.HuMoments(moments).flatten()
+    return huMoments
+
+def haralick(image):
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    haralick = mahotas.features.haralick(gray).mean(axis=0)
+    return haralick
+
+def histogram(image, mask=None):
+    image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
+    hist  = cv2.calcHist([image], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
+    cv2.normalize(hist, hist)
+    histogram = hist.flatten()
+    return histogram
+
+
+#Test one image
+def classifyImage(file, clf):
+    size = tuple((500, 500))
+    types = os.listdir('ClassificationGarbage/testSet')
+    types.sort()
+    image = cv2.imread(file)
+    image = cv2.resize(image, size)
+    #Global features
+    huMoments = hu_moments(image)
+    hara   = haralick(image)
+    histo  = histogram(image)
+    allFeatures = np.hstack([histo, hara, huMoments])
+    features = allFeatures.reshape(1,-1)
+    prediction = clf.predict(features)[0]
+    return types[prediction]
+
+
+#At the garbage dump
+def sortDump(cardboard, glass, metal, paper, plastic, clf, GT, dump):
+    testDir = 'ClassificationGarbage/testSet'
+    testTypes = os.listdir(testDir)
+    testTypes.sort()
+    noGarbage = [cardboard, glass, metal, paper, plastic]
+    for i in range(len(testTypes)):
+        print("Among " + testTypes[i] + ", we found: ")
+        for j in range(0, noGarbage[i]):
+            image = random.choice(os.listdir(testDir + '/' + testTypes[i]))
+            file = testDir + '/' + testTypes[i] + "/"+ image
+            prediction = classifyImage(file, clf)
+            print(prediction)
+            GT.empty(testTypes[i])
+            dump.addGarbage(prediction)