40 lines
1.2 KiB
Python
40 lines
1.2 KiB
Python
|
import pandas as pd
|
||
|
from sklearn.model_selection import train_test_split
|
||
|
|
||
|
|
||
|
def main():
    """Clean the reviews dataset, split it, and print summary statistics.

    Steps, in order:

    1. Read ``resources/data.csv`` and keep only the columns listed below.
    2. Lower-case every text column.
    3. Drop every row that has a missing value in any kept column.
    4. Overwrite ``resources/data.csv`` with the cleaned frame.
    5. Split the cleaned frame 60/20/20 into train/test/dev (fixed seed)
       and write ``resources/train.csv``, ``resources/test.csv`` and
       ``resources/dev.csv``.
    6. Print the mean rating per primary category, the primary-category
       counts, and ``describe(include='all')`` for the cleaned frame.

    All CSVs are written with pandas' default settings, so the row index
    is stored as the first column of each file.
    """
    data = pd.read_csv('resources/data.csv', header=0, sep=',')

    columns = ['reviews.date', 'reviews.numHelpful', 'reviews.rating', 'reviews.doRecommend']
    string_columns = ['name', 'brand', 'categories', 'primaryCategories', 'keys', 'manufacturer', 'reviews.title',
                      'reviews.username', 'reviews.text']

    # Take an explicit copy of the column subset so the in-place column
    # assignments below operate on an independent frame and do not trigger
    # pandas' SettingWithCopyWarning.
    data = data[string_columns + columns].copy()

    for c in string_columns:
        data[c] = data[c].str.lower()

    # dropna() returns a new DataFrame; the result must be rebound,
    # otherwise rows with missing values silently stay in the data.
    data = data.dropna()

    # NOTE: this overwrites the input file with the cleaned data.
    data.to_csv('resources/data.csv')

    # 60% train; the remaining 40% is halved into test and dev (20% each).
    train, holdout = train_test_split(data, train_size=0.6, random_state=1)
    test, dev = train_test_split(holdout, test_size=0.5, random_state=1)
    test.to_csv('resources/test.csv')
    train.to_csv('resources/train.csv')
    dev.to_csv('resources/dev.csv')

    print("Mean reviews rating for each primary category: ")
    print(data[["primaryCategories", "reviews.rating"]].groupby("primaryCategories").mean())

    print("\n\nCounted primary categories: ")
    print(data["primaryCategories"].value_counts())

    print("\n\nGeneral data statistics: ")
    print(data.describe(include='all'))
|
||
|
|
||
|
|
||
|
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
|