# Listing metadata: 191 lines, 6.2 KiB, Python
|
import pandas as pd
|
||
|
import math
|
||
|
import string
|
||
|
import nltk
|
||
|
from tensorflow import keras
|
||
|
from keras.models import Sequential
|
||
|
from keras.layers.core import Dense, Dropout, Activation
|
||
|
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
|
||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||
|
from sklearn.preprocessing import StandardScaler
|
||
|
from sklearn.pipeline import make_pipeline
|
||
|
from sklearn.model_selection import train_test_split
|
||
|
from sklearn.metrics import mean_squared_error, precision_recall_fscore_support, accuracy_score
|
||
|
from nltk.corpus import stopwords
|
||
|
# Fetch the NLTK "stopwords" corpus that stopwords.words("english") reads later in this file.
nltk.download("stopwords")
|
||
|
|
||
|
|
||
|
#-------- parameters
# Number of leading rows of the CSV that are kept for the experiment.
size_of_dataset = 3000
# Number of rating classes; the neural network uses classes + 1 output units.
classes = 5
# Mini-batch size used for neural-network training and prediction.
batch_size = 16
# Number of training epochs for the neural network.
epochs = 25
|
||
|
|
||
|
|
||
|
#-------- removal of common (stop) words and punctuation
def stop_word_removal(text):
    """Return *text* lower-cased, without English stop words or punctuation.

    The text is split on whitespace, every token is lower-cased, tokens found
    in the NLTK English stop-word list are dropped, the remaining tokens are
    joined with single spaces, and finally every character in
    ``string.punctuation`` is deleted.

    Stop words are matched *before* punctuation is stripped, so a token with
    punctuation attached (for example ``"the,"``) is not recognised as a stop
    word; only its punctuation is removed afterwards.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The cleaned, space-separated string.
    """
    # A set gives constant-time membership tests; the plain list returned by
    # NLTK would otherwise be scanned once per token.
    stop_words = set(stopwords.words("english"))
    punctuation_table = str.maketrans('', '', string.punctuation)

    # split() with no argument splits on any whitespace (newlines included),
    # so no newline can survive into the joined string.
    lowered_tokens = (token.lower() for token in text.split())
    clean_text = ' '.join(
        token for token in lowered_tokens if token not in stop_words
    )
    return clean_text.translate(punctuation_table)
|
||
|
|
||
|
|
||
|
#-------- conversion of the text to TF-IDF features
def preprocess_x(data):
    """Append TF-IDF features of the ``Review`` column to *data*.

    Each review is cleaned with ``stop_word_removal``, the cleaned texts are
    vectorised with a freshly fitted ``TfidfVectorizer``, and one column per
    vocabulary term is appended to the original columns.

    Args:
        data: A DataFrame with a ``Review`` column holding the review texts.
            The input frame is not modified.

    Returns:
        A new DataFrame containing the original columns followed by the
        TF-IDF columns, with every row that still contains a missing value
        removed.
    """
    # A missing review (NaN) cannot be tokenised, so such rows are dropped up
    # front instead of crashing inside stop_word_removal.
    data = data.dropna(subset=["Review"])
    data_cleaned = [stop_word_removal(review) for review in data["Review"]]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(data_cleaned)

    # Reuse the index of `data` so that concat pairs every review with its own
    # feature row even when the caller's index is not 0..n-1.
    data_tfidf = pd.DataFrame(
        data=tfidf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=data.index,
    )

    data = pd.concat([data, data_tfidf], axis="columns")
    return data.dropna()
|
||
|
|
||
|
|
||
|
#-------- build and fit the linear regression model
def linear_reggresion(x, y):
    """Fit a ``LinearRegression`` estimator on (*x*, *y*) and return it.

    Args:
        x: Feature matrix passed unchanged to ``fit``.
        y: Target values passed unchanged to ``fit``.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    # fit() returns the estimator itself, so the call can be chained.
    return LinearRegression().fit(x, y)
|
||
|
|
||
|
|
||
|
#-------- build and fit the regularized linear regression model
def regularized_linear_reggresion(x, y):
    """Fit a standardise-then-Ridge pipeline on (*x*, *y*) and return it.

    Args:
        x: Feature matrix passed unchanged to the pipeline's ``fit``.
        y: Target values passed unchanged to the pipeline's ``fit``.

    Returns:
        The fitted pipeline (``StandardScaler`` followed by ``Ridge``).
    """
    # NOTE(review): the original carried a trailing "#0.000000001" comment,
    # presumably a previously tried alpha -- confirm before tuning.
    ridge_alpha = 1e-8
    scaler = StandardScaler()
    regressor = Ridge(alpha=ridge_alpha)
    pipeline = make_pipeline(scaler, regressor)
    return pipeline.fit(x, y)
|
||
|
|
||
|
|
||
|
#-------- build and fit the logistic regression model
def logistic_reggresion(x, y):
    """Fit a ``LogisticRegression`` classifier (``sag`` solver) and return it.

    Args:
        x: Feature matrix passed unchanged to ``fit``.
        y: Targets exposing ``.values`` (e.g. a DataFrame); they are flattened
            to a 1-D array before fitting.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    flat_labels = y.values.ravel()
    classifier = LogisticRegression(solver='sag')
    return classifier.fit(x, flat_labels)
|
||
|
|
||
|
|
||
|
#-------- build and train the neural network model
def nn(x, y):
    """Build, compile and train a dense softmax classifier; return the model.

    The network is Dense(1000) -> Dense(100) -> Dense(50), each followed by a
    ReLU activation and 50% dropout, and ends in a softmax layer with
    ``classes + 1`` units. It is trained with categorical cross-entropy and
    the Adam optimizer, using the module-level ``batch_size`` and ``epochs``.

    Args:
        x: Feature table exposing ``.values`` (e.g. a DataFrame).
        y: Integer class labels; one-hot encoded to ``classes + 1`` columns.

    Returns:
        The trained ``Sequential`` model.
    """
    # NOTE(review): classes + 1 outputs presumably exist because labels start
    # at 1, leaving index 0 unused -- confirm against the dataset.
    n_outputs = classes + 1
    targets = keras.utils.to_categorical(y, n_outputs)
    features = x.values

    model = Sequential()

    # First hidden stack declares the input width.
    model.add(Dense(1000, input_shape=(features.shape[1],)))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))

    # Remaining hidden stacks share the same Dense/ReLU/Dropout pattern.
    for hidden_units in (100, 50):
        model.add(Dense(hidden_units))
        model.add(Activation('relu'))
        model.add(Dropout(0.5))

    model.add(Dense(n_outputs))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    model.fit(
        features,
        targets,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
    )
    return model
|
||
|
|
||
|
|
||
|
#-------- load and prepare the data
# Only the first `size_of_dataset` rows are needed, so let the parser stop
# there instead of reading the whole file and dropping the tail afterwards.
alldata = pd.read_csv(
    "kindle_reviews.csv",
    header=0,
    usecols=[
        "Rating",
        "Review",
    ],
    nrows=size_of_dataset,
)

# Original columns plus one TF-IDF column per vocabulary term.
alldata_preprocessed = preprocess_x(alldata)
print("-----------------------------------------------------")
print(alldata_preprocessed)
print("-----------------------------------------------------")
|
||
|
|
||
|
|
||
|
#-------- prepare the training and test sets
# 15% of the rows are held out for testing; no random_state is passed, so the
# split differs between runs.
data_train, data_test = train_test_split(alldata_preprocessed, test_size=0.15)

# Targets are kept as single-column DataFrames. Features are every column
# except the raw text and the label. drop() returns new frames, so the split
# frames themselves are never modified.
y_train = data_train[["Rating"]]
x_train = data_train.drop(columns=["Review", "Rating"])

y_expected = data_test[["Rating"]]
x_test_text = data_test[["Review"]]
x_test = data_test.drop(columns=["Review", "Rating"])
|
||
|
|
||
|
|
||
|
#-------- train the models
linear_model = linear_reggresion(x_train, y_train)
regularized_model = regularized_linear_reggresion(x_train, y_train)
logistic_model = logistic_reggresion(x_train, y_train)
nn_model = nn(x_train, y_train)


#-------- predict results on the test set
y_predicted_linear = linear_model.predict(x_test)
y_predicted_reg = regularized_model.predict(x_test)
y_predicted_logistic = logistic_model.predict(x_test)

# The network is fed a plain array; its per-class scores are reduced to the
# index of the highest-scoring class.
x_test_numpy = x_test.values
nn_class_scores = nn_model.predict(x_test_numpy, batch_size=batch_size)
y_predicted_nn = nn_class_scores.argmax(axis=1)
|
||
|
|
||
|
|
||
|
#-------- evaluation of the results
# Regression models: root-mean-squared error against the held-out ratings.
mse = mean_squared_error(y_expected, y_predicted_linear)
mse_reg = mean_squared_error(y_expected, y_predicted_reg)
rmse = math.sqrt(mse)
rmse_reg = math.sqrt(mse_reg)

# Classifiers: accuracy plus micro-averaged precision / recall / F-score.
# NOTE(review): for single-label multiclass targets the micro-averaged scores
# are expected to coincide with accuracy -- consider "macro" or "weighted" if
# distinct numbers are wanted.
accuracy_lr = accuracy_score(y_expected, y_predicted_logistic)
(precision_lr, recall_lr, fscore_lr, support_lr) = precision_recall_fscore_support(
    y_expected, y_predicted_logistic, average="micro"
)

accuracy_nn = accuracy_score(y_expected, y_predicted_nn)
(precision_nn, recall_nn, fscore_nn, support_nn) = precision_recall_fscore_support(
    y_expected, y_predicted_nn, average="micro"
)

separator = "-----------------------------------------------------"

regression_reports = (
    ("LINEAR REGRESSION EVALUATION", rmse),
    ("REGULARIZED LINEAR REGRESSION EVALUATION", rmse_reg),
)
for report_title, report_rmse in regression_reports:
    print(separator)
    print(report_title)
    print(f"RMSE: {report_rmse}")

classifier_reports = (
    ("LOGISTIC REGGRESION EVALUATION", accuracy_lr, precision_lr, recall_lr, fscore_lr),
    ("NEURAL NETWORK EVALUATION", accuracy_nn, precision_nn, recall_nn, fscore_nn),
)
for report_title, report_acc, report_prec, report_rec, report_f in classifier_reports:
    print(separator)
    print(report_title)
    print(f"Accuracy: {report_acc}")
    print(f"Precision: {report_prec}")
    print(f"Recall: {report_rec}")
    print(f"F-score: {report_f}")

print(separator)
|
||
|
|
||
|
|
||
|
#-------- print the first few examples from the test set
pd.options.display.max_colwidth = 150

# Show at most five examples, but never index past the end of a small test set.
for i in range(min(5, len(y_expected))):
    print(f"Text from test dataset: {x_test_text.iloc[i].to_string(index=False)}")
    print(f"Real rating: {y_expected.iloc[i].to_string(index=False)}")
    # The regression predictions are iterated per row because each row is
    # itself a sequence of values (one per target column).
    for result in y_predicted_linear[i]:
        print(f"Predicted rating by Linear Reggresion: {result}")
    print(f"Predicted rating by Logistic Reggresion: {y_predicted_logistic[i]}")
    print(f"Predicted rating by Neural Network: {y_predicted_nn[i]}")
    for result in y_predicted_reg[i]:
        print(f"Predicted rating by Regularized Linear Reggresion: {result}")
    print("-----------------------------------------------------")
|