import pandas as pd
import math
import string
import nltk
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, precision_recall_fscore_support, accuracy_score
from nltk.corpus import stopwords

nltk.download("stopwords")

#--------parameters
size_of_dataset = 3000   # number of reviews kept from the CSV
classes = 5              # ratings range from 1 to 5 stars
batch_size = 16
epochs = 25


#--------removal of common (stop) words and punctuation
def stop_word_removal(text):
    stop_words = stopwords.words("english")
    punct = string.punctuation
    # lowercase, drop English stop words and newlines, then strip punctuation
    clean_text = ' '.join([word.lower() for word in text.split() if word.lower() not in stop_words]).replace('\n', ' ')
    return clean_text.translate(str.maketrans('', '', punct))


#--------conversion of the review text into tf-idf features
def preprocess_x(data):
    data_cleaned = [stop_word_removal(review) for review in data["Review"]]
    vectorizer = TfidfVectorizer()
    data_tfidf = vectorizer.fit_transform(data_cleaned)
    data_tfidf = pd.DataFrame(data=data_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
    data = pd.concat([data, data_tfidf], axis="columns")
    data = data.dropna()
    return data


#--------creation and training of the linear regression model
def linear_regression(x, y):
    model = LinearRegression()
    model.fit(x, y)
    return model


#--------creation and training of the regularized (ridge) linear regression model
def regularized_linear_regression(x, y):
    model = make_pipeline(StandardScaler(), Ridge(alpha=0.00000001))  # 0.000000001
    model.fit(x, y)
    return model


#--------creation and training of the logistic regression model
def logistic_regression(x, y):
    y = y.values.ravel()
    model = LogisticRegression(solver='sag')
    model.fit(x, y)
    return model


#--------creation and training of the neural network model
def nn(x, y):
    # one-hot encode the ratings; classes+1 outputs so that rating k maps to index k
    y = keras.utils.to_categorical(y, classes + 1)
    x = x.values
    model = Sequential()
    model.add(Dense(1000, input_shape=(x.shape[1],)))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(100))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(50))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(classes + 1))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    model.fit(x, y, batch_size=batch_size, epochs=epochs, verbose=1)
    return model


#--------loading and preparing the data
alldata = pd.read_csv(
    "kindle_reviews.csv",
    header=0,
    usecols=[
        "Rating",
        "Review",
    ],
)
alldata.drop(alldata.index[size_of_dataset:len(alldata.index)], inplace=True)
alldata_preprocessed = preprocess_x(alldata)
print("-----------------------------------------------------")
print(alldata_preprocessed)
print("-----------------------------------------------------")

#--------preparation of the training and test sets
data_train, data_test = train_test_split(alldata_preprocessed, test_size=0.15)

y_train = pd.DataFrame(data_train["Rating"])
x_train = pd.DataFrame(data_train)
del x_train["Review"]
del x_train["Rating"]

y_expected = pd.DataFrame(data_test["Rating"])
x_test = pd.DataFrame(data_test)
x_test_text = pd.DataFrame(data_test["Review"])
del x_test["Review"]
del x_test["Rating"]
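#--------optional sketch: cross-validated choice of the Ridge alpha
# The regularized model trained below uses a hard-coded alpha of 1e-8. As a minimal,
# illustrative check, RidgeCV can pick an alpha from a small candidate grid on the
# training data; the grid used here is only an assumption, not a tuned choice.
from sklearn.linear_model import RidgeCV

ridge_cv = make_pipeline(StandardScaler(), RidgeCV(alphas=[1e-8, 1e-4, 1e-2, 1.0, 10.0]))
ridge_cv.fit(x_train, y_train)
print(f"Alpha selected by RidgeCV: {ridge_cv.named_steps['ridgecv'].alpha_}")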
#--------training the models
linear_model = linear_regression(x_train, y_train)
regularized_model = regularized_linear_regression(x_train, y_train)
logistic_model = logistic_regression(x_train, y_train)
nn_model = nn(x_train, y_train)

#--------prediction of results on the test set
y_predicted_linear = linear_model.predict(x_test)
y_predicted_reg = regularized_model.predict(x_test)
y_predicted_logistic = logistic_model.predict(x_test)
x_test_numpy = x_test.values
y_predicted_nn = nn_model.predict(x_test_numpy, batch_size=batch_size).argmax(axis=1)

#--------evaluation of the results
mse = mean_squared_error(y_expected, y_predicted_linear)
rmse = math.sqrt(mse)
mse_reg = mean_squared_error(y_expected, y_predicted_reg)
rmse_reg = math.sqrt(mse_reg)
accuracy_lr = accuracy_score(y_expected, y_predicted_logistic)
precision_lr, recall_lr, fscore_lr, support_lr = precision_recall_fscore_support(y_expected, y_predicted_logistic, average="micro")
accuracy_nn = accuracy_score(y_expected, y_predicted_nn)
precision_nn, recall_nn, fscore_nn, support_nn = precision_recall_fscore_support(y_expected, y_predicted_nn, average="micro")

print("-----------------------------------------------------")
print("LINEAR REGRESSION EVALUATION")
print(f"RMSE: {rmse}")
print("-----------------------------------------------------")
print("REGULARIZED LINEAR REGRESSION EVALUATION")
print(f"RMSE: {rmse_reg}")
print("-----------------------------------------------------")
print("LOGISTIC REGRESSION EVALUATION")
print(f"Accuracy: {accuracy_lr}")
print(f"Precision: {precision_lr}")
print(f"Recall: {recall_lr}")
print(f"F-score: {fscore_lr}")
print("-----------------------------------------------------")
print("NEURAL NETWORK EVALUATION")
print(f"Accuracy: {accuracy_nn}")
print(f"Precision: {precision_nn}")
print(f"Recall: {recall_nn}")
print(f"F-score: {fscore_nn}")
print("-----------------------------------------------------")

#--------printing the first few examples from the test set
pd.options.display.max_colwidth = 150
for i in range(0, 5):
    print(f"Text from test dataset: {x_test_text.iloc[i].to_string(index=False)}")
    print(f"Real rating: {y_expected.iloc[i].to_string(index=False)}")
    # the linear models return 2D arrays of shape (n_samples, 1), so unpack the single value
    for result in y_predicted_linear[i]:
        print(f"Predicted rating by Linear Regression: {result}")
    print(f"Predicted rating by Logistic Regression: {y_predicted_logistic[i]}")
    print(f"Predicted rating by Neural Network: {y_predicted_nn[i]}")
    for result in y_predicted_reg[i]:
        print(f"Predicted rating by Regularized Linear Regression: {result}")
    print("-----------------------------------------------------")
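#--------optional sketch: per-class breakdown of the classifier results
# The micro-averaged scores above compress everything into a single number; as a
# minimal sketch using scikit-learn's standard metrics, a confusion matrix and a
# per-class report show which star ratings each classifier tends to confuse.
from sklearn.metrics import classification_report, confusion_matrix

print("Logistic regression confusion matrix:")
print(confusion_matrix(y_expected, y_predicted_logistic))
print(classification_report(y_expected, y_predicted_logistic, zero_division=0))
print("Neural network confusion matrix:")
print(confusion_matrix(y_expected, y_predicted_nn))
print(classification_report(y_expected, y_predicted_nn, zero_division=0))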