135 KiB
135 KiB
# Read in the training data and clean up column names.
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', 100)

# The TSV is headerless (label <TAB> text) — the original rename-after-read
# let pandas consume the first example as column names and silently drop it.
# Passing header=None/names keeps every row as data.
data = pd.read_csv("train.tsv", sep="\t", on_bad_lines='skip',
                   header=None, names=["label", "text"])
data.head()
label | text | |
---|---|---|
0 | 1 | Przyjmujący reprezentacji Polski wrócił do PGE Skry Bełchatów Tylko rok trwał rozbrat Artura Sza... |
1 | 0 | FEN 9: Zapowiedź walki Róża Gumienna vs Katarzyna Posiadała (wideo) Podczas Fight Exclusive Nigh... |
2 | 1 | Aleksander Filipiak: Czuję się dobrze w nowym klubie Aleksander Filipiak w przerwie letniej zami... |
3 | 0 | Victoria Carl i Aleksiej Czerwotkin mistrzami świata juniorów na 5 i 10 kilometrów Biegi na 5 i ... |
4 | 1 | Świat poznał ją na mundialu. Francuska WAG czaruje pięknym ciałem Rachel Legrain-Trapani to jedn... |
# Lowercase/tokenize every article into a list of unicode tokens.
# (apply takes the callable directly; the identity lambda added nothing.)
data['text_clean'] = data['text'].apply(gensim.utils.simple_preprocess)
data.head()
label | text | text_clean | |
---|---|---|---|
0 | 1 | Przyjmujący reprezentacji Polski wrócił do PGE Skry Bełchatów Tylko rok trwał rozbrat Artura Sza... | [przyjmujący, reprezentacji, polski, wrócił, do, pge, skry, bełchatów, tylko, rok, trwał, rozbra... |
1 | 0 | FEN 9: Zapowiedź walki Róża Gumienna vs Katarzyna Posiadała (wideo) Podczas Fight Exclusive Nigh... | [fen, zapowiedź, walki, róża, gumienna, vs, katarzyna, posiadała, wideo, podczas, fight, exclusi... |
2 | 1 | Aleksander Filipiak: Czuję się dobrze w nowym klubie Aleksander Filipiak w przerwie letniej zami... | [aleksander, filipiak, czuję, się, dobrze, nowym, klubie, aleksander, filipiak, przerwie, letnie... |
3 | 0 | Victoria Carl i Aleksiej Czerwotkin mistrzami świata juniorów na 5 i 10 kilometrów Biegi na 5 i ... | [victoria, carl, aleksiej, czerwotkin, mistrzami, świata, juniorów, na, kilometrów, biegi, na, k... |
4 | 1 | Świat poznał ją na mundialu. Francuska WAG czaruje pięknym ciałem Rachel Legrain-Trapani to jedn... | [świat, poznał, ją, na, mundialu, francuska, wag, czaruje, pięknym, ciałem, rachel, legrain, tra... |
# Train a Word2Vec model on the cleaned token lists:
# 500-dimensional vectors, 5-word context window, words seen fewer than
# 2 times are dropped from the vocabulary, 4 worker threads.
w2v_model = gensim.models.Word2Vec(data["text_clean"],
                                   vector_size=500,
                                   window=5,
                                   min_count=2,
                                   workers=4)
from keras.src.utils import pad_sequences
from keras.src.legacy.preprocessing.text import Tokenizer

# Index the cleaned tokens (keeping the 7229 most frequent words) and
# encode every document as a fixed-length sequence of 75 word ids,
# zero-padded on the left.
token = Tokenizer(7229)
token.fit_on_texts(data['text_clean'])
padded = pad_sequences(token.texts_to_sequences(data['text_clean']), 75)

# 80/20 train/validation split.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(padded), data['label'], test_size=0.2)
X_train
array([[ 0, 0, 0, ..., 2, 20, 1957], [ 0, 0, 0, ..., 7153, 441, 292], [ 0, 0, 0, ..., 3702, 2385, 9], ..., [ 0, 0, 0, ..., 520, 1094, 3132], [ 0, 0, 0, ..., 44, 287, 1800], [ 0, 0, 0, ..., 160, 57, 187]])
# Build the embedding matrix row-by-row from the trained Word2Vec vectors.
# Row 0 (the padding id) and words pruned by Word2Vec's min_count keep
# all-zero rows.
vocab_size = len(token.word_index) + 1
embedding_matrix = np.zeros((vocab_size, 500))
for w, row in token.word_index.items():
    try:
        embedding_matrix[row] = w2v_model.wv[w]
    except KeyError:
        # word is in the tokenizer vocabulary but not in Word2Vec's
        pass
import keras
from keras.src.layers import Dropout, Dense, Activation, Embedding, MaxPooling1D, GlobalMaxPooling1D
from keras.src.layers import Conv1D
from keras import Sequential

opt = keras.optimizers.Adam(learning_rate=0.001)

# 1D-CNN over frozen Word2Vec embeddings: three convolutional stages
# (50 -> 100 -> 200 filters, two conv layers each) with pooling and
# dropout, then a dense sigmoid head for binary classification.
keras_model = Sequential([
    Embedding(vocab_size, 500, weights=[embedding_matrix], trainable=False),
    Dropout(0.4),
    Conv1D(50, 3, activation='relu', padding='same', strides=1),
    Conv1D(50, 3, activation='relu', padding='same', strides=1),
    MaxPooling1D(),
    Dropout(0.4),
    Conv1D(100, 3, activation='relu', padding='same', strides=1),
    Conv1D(100, 3, activation='relu', padding='same', strides=1),
    MaxPooling1D(),
    Dropout(0.2),
    Conv1D(200, 3, activation='relu', padding='same', strides=1),
    Conv1D(200, 3, activation='relu', padding='same', strides=1),
    GlobalMaxPooling1D(),
    Dropout(0.4),
    Dense(200),
    Activation('relu'),
    Dropout(0.4),
    Dense(1),
    Activation('sigmoid'),
])
keras_model.compile(loss='binary_crossentropy', metrics=['acc'], optimizer=opt)
keras_model.fit(X_train, y_train, batch_size=64, epochs=10,
                validation_data=(X_test, y_test))
Epoch 1/10 [1m1227/1227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 22ms/step - acc: 0.9434 - loss: 0.1327 - val_acc: 0.9795 - val_loss: 0.0647 Epoch 2/10 [1m1227/1227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 21ms/step - acc: 0.9801 - loss: 0.0614 - val_acc: 0.9798 - val_loss: 0.0675 Epoch 3/10 [1m1227/1227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 22ms/step - acc: 0.9831 - loss: 0.0532 - val_acc: 0.9822 - val_loss: 0.0542 Epoch 4/10 [1m1227/1227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 22ms/step - acc: 0.9851 - loss: 0.0455 - val_acc: 0.9832 - val_loss: 0.0502 Epoch 5/10 [1m1227/1227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 22ms/step - acc: 0.9850 - loss: 0.0458 - val_acc: 0.9803 - val_loss: 0.0574 Epoch 6/10 [1m1227/1227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 21ms/step - acc: 0.9860 - loss: 0.0419 - val_acc: 0.9836 - val_loss: 0.0552 Epoch 7/10 [1m1227/1227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 22ms/step - acc: 0.9862 - loss: 0.0395 - val_acc: 0.9830 - val_loss: 0.0646 Epoch 8/10 [1m1227/1227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 22ms/step - acc: 0.9874 - loss: 0.0362 - val_acc: 0.9787 - val_loss: 0.0723 Epoch 9/10 [1m1227/1227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 27ms/step - acc: 0.9876 - loss: 0.0346 - val_acc: 0.9798 - val_loss: 0.0796 Epoch 10/10 [1m1227/1227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 40ms/step - acc: 0.9885 - loss: 0.0347 - val_acc: 0.9829 - val_loss: 0.0487
<keras.src.callbacks.history.History at 0x1ffe75a94d0>
model = keras_model  # alias used by the evaluation code below


def preprocess(path):
    """Read a one-column TSV of articles and return the id-encoded
    token sequences, zero-padded to length 75, ready for model.predict.

    NOTE(review): like the training read, this does not pass header=None,
    so pandas treats the first row as a header — confirm the challenge
    files really carry a header row, otherwise one example is dropped.
    """
    frame = pd.read_csv(path, sep="\t", on_bad_lines='skip')
    frame.columns = ["text"]
    frame['text_clean'] = frame['text'].apply(gensim.utils.simple_preprocess)
    sequences = token.texts_to_sequences(frame['text_clean'])
    return pad_sequences(sequences, 75)
# Encode the held-out test set (test-A) and score it with the trained CNN.
x = preprocess("test-A/in.tsv")
res = model.predict(x)
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
# Binarize the sigmoid probabilities at 0.49 (threshold chosen from the
# discrimination-threshold plot on the validation split).
y_predictions = (res >= 0.49).astype(int)
out = pd.DataFrame(y_predictions)
out
0 | |
---|---|
0 | 1 |
1 | 1 |
2 | 0 |
3 | 1 |
4 | 1 |
... | ... |
5439 | 1 |
5440 | 1 |
5441 | 1 |
5442 | 0 |
5443 | 1 |
5444 rows × 1 columns
# Write the test-set predictions as a single-column TSV.
out.to_csv('out.tsv', sep="\t", index=False)
import pandas as pd
import pathlib
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
from sklearn.metrics import confusion_matrix as cm_sklearn
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
def plot_discrimination_threshold(clf, X_test, y_test, argmax='f1',
                                  title='Metrics vs Discriminant Threshold',
                                  fig_size=(10, 8), dpi=100,
                                  save_fig_path=None):
    """
    Plot precision, recall, f1-score, FPR and FNR versus the discriminant
    threshold for the given pre-fitted model.

    Parameters
    ----------
    clf : estimator instance (either sklearn.Pipeline, imblearn.Pipeline or a classifier)
        PRE-FITTED classifier or a PRE-FITTED Pipeline in which the last
        estimator is a classifier. Must expose predict(X) returning
        positive-class scores/probabilities in [0, 1].
    X_test : pandas.DataFrame of shape (n_samples, n_features)
        Test features.
    y_test : pandas.Series of shape (n_samples,)
        Target values.
    argmax : str, default: 'f1'
        Annotate the threshold maximized by the supplied metric.
        Options: 'f1', 'precision', 'recall'.
    title : str, default = 'Metrics vs Discriminant Threshold'
        Plot title.
    fig_size : tuple, default = (10, 8)
        Size (inches) of the plot.
    dpi : int, default = 100
        Image DPI.
    save_fig_path : str, default = None
        Full path where to save the plot. Will generate the folders if
        they don't exist already.

    Returns
    -------
    fig : matplotlib.pyplot.Figure
        Figure from matplotlib.
    ax : matplotlib.axes.Axes
        Axes object from matplotlib.
    disc_threshold : float
        Threshold (rounded to 2 decimals) that maximizes the `argmax` metric.
    """
    thresholds = np.linspace(0, 1, 100)
    precision_ls = []
    recall_ls = []
    f1_ls = []
    fpr_ls = []
    fnr_ls = []
    # obtain probabilities once; each threshold just re-binarizes them
    probs = clf.predict(X_test)
    for threshold in thresholds:
        # obtain class prediction based on threshold
        y_predictions = np.where(probs >= threshold, 1, 0)
        # obtain confusion matrix
        tn, fp, fn, tp = cm_sklearn(y_test, y_predictions).ravel()
        # obtain FPR and FNR
        FPR = fp / (tn + fp)
        FNR = fn / (tp + fn)
        # obtain precision, recall and f1 scores
        # (at the extreme thresholds one class may be empty; sklearn then
        # returns 0 with a zero-division warning)
        precision = precision_score(y_test, y_predictions, average='binary')
        recall = recall_score(y_test, y_predictions, average='binary')
        f1 = f1_score(y_test, y_predictions, average='binary')
        precision_ls.append(precision)
        recall_ls.append(recall)
        f1_ls.append(f1)
        fpr_ls.append(FPR)
        fnr_ls.append(FNR)

    # one row per threshold, one column per metric
    metrics = pd.concat([
        pd.Series(precision_ls),
        pd.Series(recall_ls),
        pd.Series(f1_ls),
        pd.Series(fpr_ls),
        pd.Series(fnr_ls)], axis=1)
    metrics.columns = ['precision', 'recall', 'f1', 'fpr', 'fnr']
    metrics.index = thresholds

    plt.rcParams["figure.facecolor"] = 'white'
    plt.rcParams["axes.facecolor"] = 'white'
    plt.rcParams["savefig.facecolor"] = 'white'
    fig, ax = plt.subplots(1, 1, figsize=fig_size, dpi=dpi)
    ax.plot(metrics['precision'], label='Precision')
    ax.plot(metrics['recall'], label='Recall')
    ax.plot(metrics['f1'], label='f1')
    ax.plot(metrics['fpr'], label='False Positive Rate (FPR)', linestyle='dotted')
    ax.plot(metrics['fnr'], label='False Negative Rate (FNR)', linestyle='dotted')

    # Draw a vertical line at the threshold that maximizes `argmax`
    disc_threshold = round(metrics[argmax].idxmax(), 2)
    ax.axvline(x=metrics[argmax].idxmax(), color='black', linestyle='dashed',
               label="$t_r$=" + str(disc_threshold))

    ax.xaxis.set_major_locator(MultipleLocator(0.1))
    ax.xaxis.set_major_formatter('{x:.1f}')
    ax.yaxis.set_major_locator(MultipleLocator(0.1))
    ax.yaxis.set_major_formatter('{x:.1f}')
    ax.xaxis.set_minor_locator(MultipleLocator(0.05))
    ax.yaxis.set_minor_locator(MultipleLocator(0.05))
    ax.tick_params(which='both', width=2)
    ax.tick_params(which='major', length=7)
    ax.tick_params(which='minor', length=4, color='black')
    plt.grid(True)
    plt.xlabel('Probability Threshold', fontsize=18)
    plt.ylabel('Scores', fontsize=18)
    plt.title(title, fontsize=18)
    leg = ax.legend(loc='best', frameon=True, framealpha=0.7)
    leg_frame = leg.get_frame()
    leg_frame.set_color('gold')
    plt.show()

    if save_fig_path is not None:
        path = pathlib.Path(save_fig_path)
        path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_fig_path, dpi=dpi)
    return fig, ax, disc_threshold
# Inspect the metric/threshold trade-off on the validation split; the
# returned best-f1 threshold (0.49 in the recorded run) is the cut-off
# used for the final predictions.
plot_discrimination_threshold(model,X_test, y_test)
[1m614/614[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step
(<Figure size 1000x800 with 1 Axes>, <Axes: title={'center': 'Metrics vs Discriminant Threshold'}, xlabel='Probability Threshold', ylabel='Scores'>, 0.49)
# Score the dev-0 split with the same 0.49 threshold and write the
# predictions as a single-column TSV.
dev_x = preprocess("dev-0/in.tsv")
res = model.predict(dev_x)
y_predictions = (res >= 0.49).astype(int)
out = pd.DataFrame(y_predictions)
out.to_csv('dev-0/out.tsv', sep="\t", index=False)
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
from sklearn.metrics import accuracy_score

# Compare dev-0 predictions against the gold labels.
# NOTE(review): expected.tsv is read without header=None, so its first row
# becomes the column header — this mirrors preprocess(), but confirm both
# files agree on having (or not having) a header so y and out stay aligned.
y = pd.read_csv("./dev-0/expected.tsv")
score = accuracy_score(y_true=y, y_pred=out)
score
0.9814712896716199
import math

# Convert the dev-0 accuracy into a 0-7 point grade, rounding up.
points = math.ceil(score * 7.0)
points
7