import pandas as pd
from sklearn.model_selection import train_test_split


def _load_reviews(path, string_columns, other_columns):
    """Read the reviews CSV, keep the selected columns and lowercase the text.

    Args:
        path: Location of the CSV file to read.
        string_columns: Names of the text columns; each is lowercased.
        other_columns: Names of the remaining columns to keep unchanged.

    Returns:
        A DataFrame holding ``string_columns + other_columns`` (in that
        order) with every column in ``string_columns`` lowercased.
    """
    raw = pd.read_csv(path, header=0, sep=',')
    # Take an explicit copy: assigning into a frame obtained by column
    # selection can trigger SettingWithCopyWarning on pandas versions
    # without copy-on-write.
    data = raw[string_columns + other_columns].copy()
    for column in string_columns:
        # Missing values are not imputed here; ``.str.lower()`` leaves
        # them as NaN.
        data[column] = data[column].str.lower()
    return data


def _split_train_test_dev(data):
    """Split ``data`` into train / test / dev parts with a fixed seed.

    60% of the rows go to train; the remaining rows are halved into
    test and dev. ``random_state=1`` is used for both splits.

    Returns:
        A ``(train, test, dev)`` tuple of DataFrames.
    """
    train, holdout = train_test_split(data, train_size=0.6, random_state=1)
    test, dev = train_test_split(holdout, test_size=0.5, random_state=1)
    return train, test, dev


def _print_summary(data):
    """Print per-category mean rating, category counts and overall stats."""
    print("\n\nMean reviews rating for each primary category: ")
    print(data[["primaryCategories", "reviews.rating"]].groupby("primaryCategories").mean())

    print("\n\nCounted primary categories: ")
    print(data["primaryCategories"].value_counts())

    print("\n\nGeneral data statistics: ")
    print(data.describe(include='all'))


def main():
    """Prepare the Amazon consumer reviews dataset and report basic statistics.

    Reads ``Amazon_Consumer_Reviews.csv`` from the working directory, keeps
    a fixed set of columns, lowercases the text columns, and writes the
    result to ``data.csv``. The data is then split into ``train.csv``,
    ``test.csv`` and ``dev.csv`` and summary statistics are printed.
    """
    columns = ['reviews.date', 'reviews.numHelpful', 'reviews.rating', 'reviews.doRecommend']
    string_columns = ['name', 'categories', 'primaryCategories', 'manufacturer',
                      'reviews.title', 'reviews.username', 'reviews.text']

    data = _load_reviews('Amazon_Consumer_Reviews.csv', string_columns, columns)
    data.to_csv('data.csv')

    train, test, dev = _split_train_test_dev(data)
    test.to_csv('test.csv')
    train.to_csv('train.csv')
    dev.to_csv('dev.csv')

    _print_summary(data)


if __name__ == '__main__':
    main()