docker exercise 1

2022-04-03 10:54:43 +02:00 · 2022-04-03 10:54:43 +02:00 · df72fdef61
commit df72fdef61
parent dd575691bf
3 changed files with 10893 additions and 0 deletions
--- a/16
+++ b/16
@ -0,0 +1,16 @@
+# Pinned to 20.04: the python3.8 package is not available on every Ubuntu release.
+FROM ubuntu:20.04
+
+# Update and install in one layer so a cached package index is never reused stale.
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y python3.8 python3-pip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies before copying sources so code edits keep this layer cached.
+RUN pip3 install --no-cache-dir pandas numpy
+
+WORKDIR /app
+COPY ./data_expl.py ./googleplaystore.csv ./
+
+# Exec form runs python3 directly instead of through /bin/sh -c.
+CMD ["python3", "data_expl.py"]
--- a/data_expl.py
+++ b/data_expl.py
@ -0,0 +1,35 @@
+"""Clean and min-max normalize the Google Play Store data, then split it 60/20/20."""
+import numpy as np
+import pandas as pd
+
+data = pd.read_csv('./googleplaystore.csv')
+
+# Drop incomplete rows first: the subset still needs 'Current Ver'/'Android Ver' here.
+data.dropna(subset=['Rating', 'Type','Content Rating','Current Ver','Android Ver'], inplace=True)
+data.reset_index(drop=True, inplace=True)
+# DataFrame.drop returns a new frame, so the result must be assigned back.
+data = data.drop(columns=["Size", "Android Ver", "Current Ver", "Last Updated"])
+
+# normalizing text
+to_lowercase = ['App', 'Category', 'Type', 'Content Rating', 'Genres']
+for column in to_lowercase:
+    data[column] = data[column].str.lower()
+
+# Strip '+' and ',' so values like "10,000+" can be parsed as numbers.
+data["Installs"] = data["Installs"].replace({r'[+,]': ''}, regex=True)
+
+# normalizing numbers: rescale each count column to the [0, 1] range
+for column in ("Reviews", "Installs"):
+    data[column] = pd.to_numeric(data[column], errors='coerce')
+    min_value = data[column].min()
+    max_value = data[column].max()
+    data[column] = (data[column] - min_value) / (max_value - min_value)
+
+# splitting into sets: shuffle once, then cut at 60% and 80% of the rows
+np.random.seed(123)
+shuffled = data.sample(frac=1, random_state=42)
+train_end, validate_end = int(.6*len(data)), int(.8*len(data))
+train = shuffled.iloc[:train_end]
+validate = shuffled.iloc[train_end:validate_end]
+test = shuffled.iloc[validate_end:]
+print(f"Data shape: {data.shape}\nTrain shape: {train.shape}\nTest shape: {test.shape}\nValidation shape:{validate.shape}")
--- a/googleplaystore.csv
+++ b/googleplaystore.csv