ium_444517/data_expl.py
2022-04-03 12:30:04 +02:00

48 lines
1.5 KiB
Python

import pandas as pd
import numpy as np
# Load the raw dataset from ./googleplaystore.csv (path is relative to the
# current working directory).
data = pd.read_csv('./googleplaystore.csv')
# Discard every row that has a missing value in any of the listed columns,
# then renumber the remaining rows from 0.
data.dropna(subset=['Rating', 'Type','Content Rating','Current Ver','Android Ver'], inplace=True)
data.reset_index(drop=True, inplace=True)
# DataFrame.drop returns a new frame unless inplace=True is given; without it
# the result was thrown away and these columns were never actually removed.
data.drop(columns=["Size", "Android Ver", "Current Ver", "Last Updated"], inplace=True)
# normalizing text
# Columns whose values are lower-cased so that comparisons are case-insensitive.
to_lowercase = ['App', 'Category', 'Type', 'Content Rating', 'Genres']
for column in to_lowercase:
    # The vectorised .str.lower() passes missing values through as NaN,
    # whereas apply(str.lower) raises TypeError on the first NaN it meets.
    data[column] = data[column].str.lower()
# Remove every '+' and ',' from the install counts in one regex pass so the
# column can later be parsed as a number. The pattern is a raw string so the
# regex is not subject to Python's own backslash-escape processing.
data["Installs"] = data["Installs"].replace({r'[+,]': ''}, regex=True)
# normalizing numbers
# Min-max scale each count column: parse it as numeric (entries that cannot be
# parsed become NaN), then shift and scale so the column minimum maps to 0 and
# the column maximum maps to 1.
for numeric_column in ("Reviews", "Installs"):
    parsed = pd.to_numeric(data[numeric_column], errors='coerce')
    min_value = parsed.min()
    max_value = parsed.max()
    data[numeric_column] = (parsed - min_value) / (max_value - min_value)
#print(data)
# splitting into sets
# NOTE: the shuffle below is driven by random_state=42, not by this global
# seed; the call is kept so NumPy's global RNG state stays as it was before.
np.random.seed(123)
# Shuffle all rows once, then cut at 60% and 80% to obtain a 60/20/20
# train/validate/test partition. Positional slicing replaces np.split, which
# relies on the deprecated DataFrame.swapaxes when handed a DataFrame.
shuffled = data.sample(frac=1, random_state=42)
train_end = int(.6*len(data))
validate_end = int(.8*len(data))
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]
print(f"Data shape: {data.shape}\nTrain shape: {train.shape}\nTest shape: {test.shape}\nValidation shape:{validate.shape}")
# Persist each subset as a real CSV file. Writing str(frame) stored pandas'
# console repr instead (aligned columns, with rows/columns elided by "..."
# for large frames), which cannot be read back as CSV and silently loses data.
# index=False leaves out the shuffled row labels, which are not a feature.
train.to_csv("apps_train.csv", index=False)
test.to_csv("apps_test.csv", index=False)
validate.to_csv("apps_validate.csv", index=False)