ium_434780/main.py

62 lines
1.9 KiB
Python
Raw Normal View History

2021-05-08 22:32:28 +02:00
import string
2021-04-17 18:54:01 +02:00
import pandas as pd
from sklearn.model_selection import train_test_split
2021-05-08 22:32:28 +02:00
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Built once at import time: maps every character of string.punctuation to
# None so that str.translate deletes it in a single pass.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def remove_punct(text: str) -> str:
    """Return *text* with every ``string.punctuation`` character removed.

    Only the characters listed in ``string.punctuation`` are stripped; all
    other characters, including whitespace, are left untouched, so removing
    punctuation between two spaces leaves both spaces in place.
    """
    return text.translate(_PUNCT_TABLE)
# Stop-word lookup set consulted by remove_stopwords; built once at import
# time from the "english" list of the NLTK stopwords corpus.
stop = set(stopwords.words("english"))
def remove_stopwords(text: str) -> str:
    """Return *text* lowercased with every word found in ``stop`` dropped.

    The text is split on whitespace, words contained in the module-level
    ``stop`` set are discarded, and the remaining words are joined with single
    spaces (so runs of whitespace in the input collapse to one space).
    """
    # Lowercase the whole text once instead of lowercasing each word twice
    # (once for the membership test and once for the output).
    kept_words = [word for word in text.lower().split() if word not in stop]
    return " ".join(kept_words)
2021-04-17 18:54:01 +02:00
def main():
    """Clean the ``dataset-Amazon.csv`` reviews data, split it and print statistics.

    Reads ``dataset-Amazon.csv`` from the working directory, keeps a fixed set
    of columns, normalises the text columns (lowercase, punctuation and stop
    words removed), rewrites ``reviews.doRecommend`` from ``reviews.rating``
    and writes ``data.csv`` plus the ``train.csv`` / ``test.csv`` / ``dev.csv``
    splits. Null counts and summary statistics are printed along the way.
    """
    data = pd.read_csv('dataset-Amazon.csv')

    columns = ['reviews.date', 'reviews.numHelpful', 'reviews.rating', 'reviews.doRecommend']
    string_columns = ['name', 'categories', 'primaryCategories', 'manufacturer', 'reviews.title',
                      'reviews.username', 'reviews.text']
    data = data[string_columns + columns]

    for c in string_columns:
        lowered = data[c].str.lower()
        # na_action='ignore' leaves missing cells as NaN instead of passing a
        # float NaN to helpers that call str methods on their argument.
        without_punct = lowered.map(remove_punct, na_action='ignore')
        data[c] = without_punct.map(remove_stopwords, na_action='ignore')

    print("Empty rows summary:")
    print(data.isnull().sum())

    # Derive the recommendation flag from the rating: above 3 -> True,
    # 3 or below -> False. Rows with a missing (NaN) rating match neither mask
    # and keep their existing value.
    rating = data["reviews.rating"]
    data.loc[rating > 3, 'reviews.doRecommend'] = True
    data.loc[rating <= 3, 'reviews.doRecommend'] = False
    data["reviews.doRecommend"] = data["reviews.doRecommend"].astype(int)
    print(data.isnull().sum())

    data.to_csv('data.csv')

    # 60% train; the remaining 40% is halved into test and dev (20% each).
    train, test = train_test_split(data, train_size=0.6, random_state=1)
    test, dev = train_test_split(test, test_size=0.5, random_state=1)
    test.to_csv('test.csv')
    train.to_csv('train.csv')
    dev.to_csv('dev.csv')

    print("\n\nMean reviews rating for each primary category: ")
    print(data[["primaryCategories", "reviews.rating"]].groupby("primaryCategories").mean())
    print("\n\nCounted primary categories: ")
    print(data["primaryCategories"].value_counts())
    print("\n\nGeneral data statistics: ")
    print(data.describe(include='all'))
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()