# Reconstructed from the patch of commit ebbc4f57406cfd2cf0ea31ed24994bfd7e7d408f
# (Author: Maks Kulikowski, Date: Sun Jan 29 17:46:11 2023 +0100,
# message: "adding project files").
# The same commit adds kindle_reviews.csv (reported by git as a binary file)
# and this script as projekt.py.
"""Predict review ratings from review text with four different models.

The script loads the first ``size_of_dataset`` rows of ``kindle_reviews.csv``
(columns ``Rating`` and ``Review``), converts the reviews to TF-IDF features,
trains a linear regression, a ridge regression, a logistic regression and a
small dense neural network, and prints evaluation metrics plus a few sample
predictions from the test split.
"""
import functools
import math
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import (
    accuracy_score,
    mean_squared_error,
    precision_recall_fscore_support,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
# Use the public tf.keras paths: the standalone ``keras.layers.core`` module is
# not part of the public API and mixing ``keras`` with ``tensorflow.keras``
# objects is fragile.
from tensorflow.keras.layers import Activation, Dense, Dropout
from tensorflow.keras.models import Sequential


# --------parameters
size_of_dataset = 3000
classes = 5
batch_size = 16
epochs = 25

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

_SEPARATOR = "-----------------------------------------------------"


@functools.lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loaded once.

    The corpus is downloaded only when it is not available locally.
    """
    try:
        words = stopwords.words("english")
    except LookupError:
        nltk.download("stopwords")
        words = stopwords.words("english")
    return frozenset(words)


# --------removal of common words and punctuation
def stop_word_removal(text):
    """Lower-case ``text`` and drop English stop words and punctuation.

    A token is discarded when it is a stop word either as written or after
    its punctuation has been stripped, so stop words adjacent to punctuation
    (for example ``"the,"``) are removed as well. Tokens that consist only of
    punctuation are discarded.

    Args:
        text: The raw review text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stop_words = _english_stop_words()
    kept = []
    for raw_token in text.split():
        token = raw_token.lower()
        stripped = token.translate(_PUNCTUATION_TABLE)
        if not stripped or token in stop_words or stripped in stop_words:
            continue
        kept.append(stripped)
    return " ".join(kept)


# --------conversion of text to tf-idf
def preprocess_x(data):
    """Append TF-IDF feature columns computed from ``data["Review"]``.

    Rows without a review are dropped before cleaning (a missing value is a
    float and has no ``split``), and the index is reset so that the TF-IDF
    rows line up with the input rows when the frames are concatenated.

    Args:
        data: DataFrame with at least a ``Review`` column.

    Returns:
        A new DataFrame holding the original columns followed by one column
        per TF-IDF term, without rows that contain missing values.
    """
    data = data.dropna(subset=["Review"]).reset_index(drop=True)
    cleaned_reviews = [stop_word_removal(review) for review in data["Review"]]

    # NOTE(review): the vectorizer is fitted on every row, i.e. before the
    # train/test split performed by the caller.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(cleaned_reviews)
    tfidf_frame = pd.DataFrame(
        data=tfidf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
    )

    combined = pd.concat([data, tfidf_frame], axis="columns")
    return combined.dropna()


# --------creation and training of the linear regression model
def linear_reggresion(x, y):
    """Fit and return a ``LinearRegression`` model on ``x`` and ``y``."""
    model = LinearRegression()
    model.fit(x, y)

    return model


# --------creation and training of the regularized linear regression model
def regularized_linear_reggresion(x, y):
    """Fit and return a ``StandardScaler`` + ``Ridge`` pipeline."""
    model = make_pipeline(StandardScaler(), Ridge(alpha=0.00000001))
    model.fit(x, y)

    return model


# --------creation and training of the logistic regression model
def logistic_reggresion(x, y):
    """Fit and return a ``LogisticRegression`` model (``sag`` solver).

    ``y`` is expected to expose ``.values`` (e.g. a DataFrame); it is
    flattened to one dimension before fitting.
    """
    y = y.values.ravel()

    model = LogisticRegression(solver='sag')
    model.fit(x, y)

    return model


# --------creation and training of the neural network model
def nn(x, y):
    """Train and return a dense softmax classifier on ``x`` and ``y``.

    The targets are one-hot encoded into ``classes + 1`` slots.
    NOTE(review): presumably ratings are integers in 1..classes, which is why
    one extra slot (index 0) is reserved - confirm against the CSV.
    """
    y = keras.utils.to_categorical(y, classes + 1)
    x = x.values

    model = Sequential()
    model.add(Dense(1000, input_shape=(x.shape[1],)))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(100))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(50))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(classes + 1))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    model.fit(x, y, batch_size=batch_size, epochs=epochs, verbose=1)

    return model


def main():
    """Run the full load / train / evaluate / report pipeline."""
    # --------loading and preparing the data
    # ``nrows`` reads only the rows that are used instead of loading the whole
    # file and discarding everything past ``size_of_dataset``.
    alldata = pd.read_csv(
        "kindle_reviews.csv",
        header=0,
        usecols=[
            "Rating",
            "Review",
        ],
        nrows=size_of_dataset,
    )

    alldata_preprocessed = preprocess_x(alldata)
    print(_SEPARATOR)
    print(alldata_preprocessed)
    print(_SEPARATOR)

    # --------preparing the training and test sets
    data_train, data_test = train_test_split(alldata_preprocessed, test_size=0.15)

    y_train = pd.DataFrame(data_train["Rating"])
    x_train = data_train.drop(columns=["Review", "Rating"])

    y_expected = pd.DataFrame(data_test["Rating"])
    x_test_text = pd.DataFrame(data_test["Review"])
    x_test = data_test.drop(columns=["Review", "Rating"])

    # --------training the models
    linear_model = linear_reggresion(x_train, y_train)
    regularized_model = regularized_linear_reggresion(x_train, y_train)
    logistic_model = logistic_reggresion(x_train, y_train)
    nn_model = nn(x_train, y_train)

    # --------predicting results on the test set
    y_predicted_linear = linear_model.predict(x_test)
    y_predicted_reg = regularized_model.predict(x_test)
    y_predicted_logistic = logistic_model.predict(x_test)
    x_test_numpy = x_test.values
    y_predicted_nn = nn_model.predict(x_test_numpy, batch_size=batch_size).argmax(axis=1)

    # --------evaluating the results
    mse = mean_squared_error(y_expected, y_predicted_linear)
    rmse = math.sqrt(mse)

    mse_reg = mean_squared_error(y_expected, y_predicted_reg)
    rmse_reg = math.sqrt(mse_reg)

    # NOTE(review): with single-label multiclass targets, micro-averaged
    # precision, recall and F-score all equal the accuracy; consider
    # average="macro" if per-class behaviour matters.
    accuracy_lr = accuracy_score(y_expected, y_predicted_logistic)
    precision_lr, recall_lr, fscore_lr, _ = precision_recall_fscore_support(
        y_expected, y_predicted_logistic, average="micro"
    )

    accuracy_nn = accuracy_score(y_expected, y_predicted_nn)
    precision_nn, recall_nn, fscore_nn, _ = precision_recall_fscore_support(
        y_expected, y_predicted_nn, average="micro"
    )

    print(_SEPARATOR)
    print("LINEAR REGRESSION EVALUATION")
    print(f"RMSE: {rmse}")
    print(_SEPARATOR)
    print("REGULARIZED LINEAR REGRESSION EVALUATION")
    print(f"RMSE: {rmse_reg}")
    print(_SEPARATOR)
    print("LOGISTIC REGGRESION EVALUATION")
    print(f"Accuracy: {accuracy_lr}")
    print(f"Precision: {precision_lr}")
    print(f"Recall: {recall_lr}")
    print(f"F-score: {fscore_lr}")
    print(_SEPARATOR)
    print("NEURAL NETWORK EVALUATION")
    print(f"Accuracy: {accuracy_nn}")
    print(f"Precision: {precision_nn}")
    print(f"Recall: {recall_nn}")
    print(f"F-score: {fscore_nn}")
    print(_SEPARATOR)

    # --------printing the first few examples from the test set
    pd.options.display.max_colwidth = 150
    # The regressors were fitted on a single-column DataFrame target, so their
    # predictions come back with one value per row; flatten for indexing.
    linear_flat = y_predicted_linear.ravel()
    reg_flat = y_predicted_reg.ravel()
    # Never index past the end of a small test split.
    for i in range(min(5, len(x_test_text))):
        print(f"Text from test dataset: {x_test_text.iloc[i].to_string(index=False)}")
        print(f"Real rating: {y_expected.iloc[i].to_string(index=False)}")
        print(f"Predicted rating by Linear Reggresion: {linear_flat[i]}")
        print(f"Predicted rating by Logistic Reggresion: {y_predicted_logistic[i]}")
        print(f"Predicted rating by Neural Network: {y_predicted_nn[i]}")
        print(f"Predicted rating by Regularized Linear Reggresion: {reg_flat[i]}")
    print(_SEPARATOR)


if __name__ == "__main__":
    main()