"""Clean an Amazon product-review CSV and split it into train/test/dev sets.

The script lowercases the text columns, strips punctuation and English stop
words from them, derives a binary ``reviews.doRecommend`` flag from the rating,
writes the cleaned data and its splits to CSV, and prints summary statistics.
"""

import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# Fetch the stop-word corpus at import time so ``stop`` below can be built.
nltk.download('stopwords')

# Translation table that deletes every ``string.punctuation`` character.
# Built once because ``remove_punct`` is applied to every text cell.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)

stop = set(stopwords.words("english"))


def remove_punct(text: str) -> str:
    """Return *text* with all ``string.punctuation`` characters removed."""
    return text.translate(_PUNCT_TABLE)


def remove_stopwords(text: str) -> str:
    """Return *text* lowercased, without English stop words.

    The text is split on whitespace; surviving words are re-joined with a
    single space.
    """
    return " ".join(word for word in text.lower().split() if word not in stop)


def main() -> None:
    """Run the cleaning pipeline on ``dataset-Amazon.csv``.

    Writes ``data.csv`` (full cleaned data) plus ``train.csv``, ``test.csv``
    and ``dev.csv``, and prints null counts and descriptive statistics.
    """
    data = pd.read_csv('dataset-Amazon.csv')

    columns = ['reviews.date', 'reviews.numHelpful', 'reviews.rating',
               'reviews.doRecommend']
    string_columns = ['name', 'categories', 'primaryCategories',
                      'manufacturer', 'reviews.title', 'reviews.username',
                      'reviews.text']

    # Copy the selection so the column assignments below act on an
    # independent frame instead of a slice of the raw one.
    data = data[string_columns + columns].copy()

    for c in string_columns:
        lowered = data[c].str.lower()
        # na_action="ignore" keeps missing cells as NaN; without it the
        # helpers receive a float and fail on ``.translate`` / ``.split``.
        data[c] = (
            lowered
            .map(remove_punct, na_action="ignore")
            .map(remove_stopwords, na_action="ignore")
        )

    print("Empty rows summary:")
    print(data.isnull().sum())

    # Ratings above 3 count as a recommendation, 3 and below do not.
    # Rows whose rating is missing match neither mask and keep their
    # original ``reviews.doRecommend`` value.
    data.loc[data["reviews.rating"] > 3, 'reviews.doRecommend'] = True
    data.loc[data["reviews.rating"] <= 3, 'reviews.doRecommend'] = False
    data["reviews.doRecommend"] = data["reviews.doRecommend"].astype(int)
    print(data.isnull().sum())

    data.to_csv('data.csv')

    # 60% train; the remaining 40% is halved into test and dev.
    train, test = train_test_split(data, train_size=0.6, random_state=1)
    test, dev = train_test_split(test, test_size=0.5, random_state=1)

    test.to_csv('test.csv')
    train.to_csv('train.csv')
    dev.to_csv('dev.csv')

    print("\n\nMean reviews rating for each primary category: ")
    print(data[["primaryCategories", "reviews.rating"]]
          .groupby("primaryCategories").mean())
    print("\n\nCounted primary categories: ")
    print(data["primaryCategories"].value_counts())
    print("\n\nGeneral data statistics: ")
    print(data.describe(include='all'))


if __name__ == '__main__':
    main()