import pandas as pd
from sklearn.model_selection import train_test_split


def main():
    """Clean the Amazon consumer reviews CSV, split it, and print statistics.

    Reads ``resources/Amazon_Consumer_Reviews.csv``, keeps a fixed subset of
    columns, lower-cases the text columns, prints the per-column count of
    missing values, and drops every row that has a missing value. The cleaned
    frame is written to ``resources/data.csv`` and then split 60/20/20 into
    ``train.csv``, ``test.csv`` and ``dev.csv`` in the same directory. Finally
    a few summary statistics of the cleaned data are printed.

    All paths are relative to the current working directory. The function
    takes no arguments and returns nothing.
    """
    data = pd.read_csv(
        'resources/Amazon_Consumer_Reviews.csv', header=0, sep=','
    )

    columns = [
        'reviews.date',
        'reviews.numHelpful',
        'reviews.rating',
        'reviews.doRecommend',
    ]
    string_columns = [
        'name',
        'brand',
        'categories',
        'primaryCategories',
        'keys',
        'manufacturer',
        'reviews.title',
        'reviews.username',
        'reviews.text',
    ]

    # Take an explicit copy so the column assignments below act on an
    # independent frame rather than on a selection of the frame that was read.
    data = data[string_columns + columns].copy()
    for c in string_columns:
        data[c] = data[c].str.lower()

    # Report missing values before they are removed.
    print("Empty rows summary:")
    print(data.isnull().sum())

    # dropna() returns a new frame instead of modifying in place; without the
    # reassignment the rows with missing values would be kept everywhere below.
    data = data.dropna()
    data.to_csv('resources/data.csv')

    # 60% train; the remaining 40% is halved into test and dev (20% each).
    # A fixed random_state keeps the split reproducible between runs.
    train, test = train_test_split(data, train_size=0.6, random_state=1)
    test, dev = train_test_split(test, test_size=0.5, random_state=1)
    test.to_csv('resources/test.csv')
    train.to_csv('resources/train.csv')
    dev.to_csv('resources/dev.csv')

    print("\n\nMean reviews rating for each primary category: ")
    print(
        data[["primaryCategories", "reviews.rating"]]
        .groupby("primaryCategories")
        .mean()
    )

    print("\n\nCounted primary categories: ")
    print(data["primaryCategories"].value_counts())

    print("\n\nGeneral data statistics: ")
    print(data.describe(include='all'))


if __name__ == '__main__':
    main()