docker exercise 1
This commit is contained in:
parent
dd575691bf
commit
df72fdef61
16
Dockerfile
Normal file
16
Dockerfile
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
# Image that runs data_expl.py against googleplaystore.csv.
#
# The base image is pinned: python3.8 is the Python packaged by Ubuntu 20.04,
# whereas the moving "latest" tag points at newer releases whose repositories
# do not provide a python3.8 package, which makes the install step fail.
FROM ubuntu:20.04

# Build-time only: keep apt from prompting during package configuration.
ARG DEBIAN_FRONTEND=noninteractive

# Refresh the package index and install in the same layer so a cached, stale
# index is never reused by a later install; drop the index afterwards.
RUN apt-get update \
    && apt-get install -y --no-install-recommends python3.8 python3-pip \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install the Python dependencies before copying the sources so this layer is
# reused when only the script or the data file changes.
RUN pip3 install --no-cache-dir pandas numpy

COPY ./data_expl.py ./
COPY ./googleplaystore.csv ./

# Exec form: python3 is started directly (no intermediate shell) and
# therefore receives container stop signals itself.
CMD ["python3", "data_expl.py"]
|
35
data_expl.py
Normal file
35
data_expl.py
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# Load the CSV that is expected to sit next to this script.
data = pd.read_csv('./googleplaystore.csv')

# Discard rows with a missing value in any of the listed columns, then
# renumber the index so it is contiguous again after the removal.
data.dropna(subset=['Rating', 'Type','Content Rating','Current Ver','Android Ver'], inplace=True)
data.reset_index(drop=True, inplace=True)
# DataFrame.drop returns a new frame unless inplace=True is given; without it
# the result is thrown away and the columns are never actually removed.
data.drop(columns=["Size", "Android Ver", "Current Ver", "Last Updated"], inplace=True)
|
||||||
|
|
||||||
|
# normalizing text
# Lower-case every text column listed below.
to_lowercase = ['App', 'Category', 'Type', 'Content Rating', 'Genres']
for column in to_lowercase:
    data[column] = data[column].apply(str.lower)

# Remove every "+" and "," from the install counts so the column can later be
# parsed as a number. The pattern is a raw string: '\+' in a plain literal is
# an invalid escape sequence and triggers a warning on current Python versions.
data["Installs"] = data["Installs"].replace({r'[+,]': ''}, regex=True)
|
||||||
|
|
||||||
|
# normalizing numbers
# Min-max scale each count column: values that cannot be parsed become NaN
# (errors='coerce'), the rest are mapped linearly using the column's own
# minimum and maximum.
for numeric_column in ("Reviews", "Installs"):
    data[numeric_column] = pd.to_numeric(data[numeric_column], errors='coerce')
    max_value = data[numeric_column].max()
    min_value = data[numeric_column].min()
    value_range = max_value - min_value
    data[numeric_column] = (data[numeric_column] - min_value) / value_range
|
||||||
|
|
||||||
|
#print(data)
|
||||||
|
|
||||||
|
|
||||||
|
# splitting into sets
# Seeds NumPy's global generator; the shuffle below is driven by its own
# random_state and does not depend on this seed.
np.random.seed(123)

# Shuffle all rows once (reproducibly), then cut the shuffled frame at 60% and
# 80% of its length: first part -> train, middle -> validate, rest -> test.
# Plain positional slices are used instead of np.split, which for a DataFrame
# relies on DataFrame.swapaxes, deprecated in newer pandas releases.
shuffled = data.sample(frac=1, random_state=42)
train_end = int(.6*len(data))
validate_end = int(.8*len(data))
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

print(f"Data shape: {data.shape}\nTrain shape: {train.shape}\nTest shape: {test.shape}\nValidation shape:{validate.shape}")
|
10842
googleplaystore.csv
Normal file
10842
googleplaystore.csv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user