UM-projekt/projekt.py

import pandas as pd
import math
import string
import nltk
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, precision_recall_fscore_support, accuracy_score
from nltk.corpus import stopwords
nltk.download("stopwords")
#--------parameters
size_of_dataset = 3000
classes = 5
batch_size = 16
epochs = 25
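# size_of_dataset caps how many reviews are used, classes is the number of
# rating levels (1-5), and batch_size / epochs control the neural-network training.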
#--------remove common stop words and punctuation
def stop_word_removal(text):
    stop_words = stopwords.words("english")
    punct = string.punctuation
    clean_text = ' '.join([word.lower() for word in text.split() if word.lower() not in stop_words]).replace('\n', ' ')
    return clean_text.translate(str.maketrans('', '', punct))
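# Illustrative example (assuming NLTK's English stopword list, which contains
# "this", "is" and "a"):
#   stop_word_removal("This is a GREAT book!")  ->  "great book"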
#--------convert the review text to tf-idf features
def preprocess_x(data):
    data_cleaned = [stop_word_removal(review) for review in data["Review"]]
    vectorizer = TfidfVectorizer()
    data_tfidf = vectorizer.fit_transform(data_cleaned)
    data_tfidf = pd.DataFrame(data=data_tfidf.toarray(), columns=vectorizer.get_feature_names_out())
    data = pd.concat([data, data_tfidf], axis="columns")
    data = data.dropna()
    return data
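# The returned DataFrame keeps the original "Rating" and "Review" columns and
# appends one tf-idf column per vocabulary term. Note that the vectorizer is
# fitted on the full dataset here, i.e. before the train/test split below.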
#--------create and train the linear regression model
def linear_regression(x, y):
    model = LinearRegression()
    model.fit(x, y)
    return model
#--------create and train the regularized linear regression model
def regularized_linear_regression(x, y):
    model = make_pipeline(StandardScaler(), Ridge(alpha=0.00000001))  # 0.000000001
    model.fit(x, y)
    return model
#--------create and train the logistic regression model
def logistic_regression(x, y):
    y = y.values.ravel()
    model = LogisticRegression(solver='sag')
    model.fit(x, y)
    return model
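# ravel() turns the single-column "Rating" DataFrame into the 1-D array that
# LogisticRegression.fit() expects; 'sag' is a stochastic-gradient solver that
# scales to the many tf-idf features.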
#--------create and train the neural network model
def nn(x, y):
    y = keras.utils.to_categorical(y, classes + 1)
    x = x.values
    model = Sequential()
    model.add(Dense(1000, input_shape=(x.shape[1],)))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(100))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(50))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(classes + 1))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    model.fit(x, y, batch_size=batch_size, epochs=epochs, verbose=1)
    return model
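# Ratings 1-5 are one-hot encoded into classes+1 = 6 columns so that the class
# index equals the rating (index 0 stays unused); the softmax output layer has
# the same width, and argmax over the predictions recovers the rating directly.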
#--------load the data and prepare it
alldata = pd.read_csv(
    "kindle_reviews.csv",
    header=0,
    usecols=[
        "Rating",
        "Review",
    ],
)
alldata.drop(alldata.index[size_of_dataset:len(alldata.index)], inplace=True)
alldata_preprocessed = preprocess_x(alldata)
print("-----------------------------------------------------")
print(alldata_preprocessed)
print("-----------------------------------------------------")
#--------prepare the training and test sets
data_train, data_test = train_test_split(alldata_preprocessed, test_size=0.15)
y_train = pd.DataFrame(data_train["Rating"])
x_train = pd.DataFrame(data_train)
del x_train["Review"]
del x_train["Rating"]
y_expected = pd.DataFrame(data_test["Rating"])
x_test = pd.DataFrame(data_test)
x_test_text = pd.DataFrame(data_test["Review"])
del x_test["Review"]
del x_test["Rating"]
#--------train the models
linear_model = linear_regression(x_train, y_train)
regularized_model = regularized_linear_regression(x_train, y_train)
logistic_model = logistic_regression(x_train, y_train)
nn_model = nn(x_train, y_train)
#--------predict results on the test set
y_predicted_linear = linear_model.predict(x_test)
y_predicted_reg = regularized_model.predict(x_test)
y_predicted_logistic = logistic_model.predict(x_test)
x_test_numpy = x_test.values
y_predicted_nn = nn_model.predict(x_test_numpy, batch_size=batch_size).argmax(axis=1)
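# The two linear models return continuous scores; LogisticRegression returns a
# class label directly, while the network returns per-class probabilities, so
# argmax(axis=1) maps each row back to a predicted rating.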
#--------evaluate the results
mse = mean_squared_error(y_expected, y_predicted_linear)
rmse = math.sqrt(mse)
mse_reg = mean_squared_error(y_expected, y_predicted_reg)
rmse_reg = math.sqrt(mse_reg)
accuracy_lr = accuracy_score(y_expected, y_predicted_logistic)
precision_lr, recall_lr, fscore_lr, support_lr = precision_recall_fscore_support(y_expected, y_predicted_logistic, average="micro")
accuracy_nn = accuracy_score(y_expected, y_predicted_nn)
precision_nn, recall_nn, fscore_nn, support_nn = precision_recall_fscore_support(y_expected, y_predicted_nn, average="micro")
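# The regression models are scored with RMSE on their continuous predictions;
# the two classifiers are scored with accuracy and micro-averaged
# precision/recall/F-score (for single-label multi-class problems the
# micro-averaged scores all coincide with accuracy).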
print("-----------------------------------------------------")
print("LINEAR REGRESSION EVALUATION")
print(f"RMSE: {rmse}")
print("-----------------------------------------------------")
print("REGULARIZED LINEAR REGRESSION EVALUATION")
print(f"RMSE: {rmse_reg}")
print("-----------------------------------------------------")
print("LOGISTIC REGGRESION EVALUATION")
print(f"Accuracy: {accuracy_lr}")
print(f"Precision: {precision_lr}")
print(f"Recall: {recall_lr}")
print(f"F-score: {fscore_lr}")
print("-----------------------------------------------------")
print("NEURAL NETWORK EVALUATION")
print(f"Accuracy: {accuracy_nn}")
print(f"Precision: {precision_nn}")
print(f"Recall: {recall_nn}")
print(f"F-score: {fscore_nn}")
print("-----------------------------------------------------")
#--------print the first few examples from the test set
pd.options.display.max_colwidth = 150
for i in range(0, 5):
    print(f"Text from test dataset: {x_test_text.iloc[i].to_string(index=False)}")
    print(f"Real rating: {y_expected.iloc[i].to_string(index=False)}")
    for result in y_predicted_linear[i]:
        print(f"Predicted rating by Linear Regression: {result}")
    print(f"Predicted rating by Logistic Regression: {y_predicted_logistic[i]}")
    print(f"Predicted rating by Neural Network: {y_predicted_nn[i]}")
    for result in y_predicted_reg[i]:
        print(f"Predicted rating by Regularized Linear Regression: {result}")
    print("-----------------------------------------------------")