dl_word2vec_pilka/training.ipynb
2024-05-18 18:20:35 +02:00

105 KiB

import numpy as np
from gensim.models import KeyedVectors
# Pretrained Polish fastText vectors, downloaded from:
# https://github.com/sdadas/polish-nlp-resources?tab=readme-ov-file#fasttext
fasttext_model = KeyedVectors.load("fasttext_100_3_polish.bin")
# Sanity check: look up a single word vector, requesting the normalized form.
fasttext_model.wv.get_vector('office', norm=True)
array([-0.10575686,  0.00275842, -0.15149923,  0.04684225, -0.09484185,
        0.27445596, -0.13551135, -0.08899829, -0.16027121,  0.0851451 ,
        0.18053234, -0.0375574 ,  0.08118784, -0.03693336,  0.13899295,
        0.0148702 , -0.03542297,  0.07549705, -0.06520785, -0.09142417,
       -0.00903711,  0.15116395, -0.100382  ,  0.0491643 ,  0.00223149,
       -0.06079744,  0.0959003 , -0.12254302, -0.06442568, -0.01175186,
        0.14357556,  0.082293  , -0.00633075,  0.12356292, -0.12873764,
        0.03582585,  0.00486956,  0.02265417,  0.09742602,  0.00961361,
       -0.07241934,  0.05235291, -0.15645239,  0.05410094, -0.03922489,
       -0.19014828,  0.05091096, -0.16526255, -0.04351336,  0.02157344,
       -0.15707618, -0.01369421,  0.05524002, -0.12716308,  0.10982089,
        0.11500968,  0.00536837,  0.16475938, -0.13811931, -0.02000868,
        0.06066024, -0.03149116, -0.12379967, -0.21108894,  0.07293601,
       -0.14373247, -0.10874739, -0.03041346,  0.24131383,  0.06944644,
       -0.00836486, -0.11847664, -0.04725966,  0.00336932,  0.04964857,
        0.01957623, -0.02785001, -0.0883517 , -0.12014113, -0.02970322,
       -0.00858476, -0.0711842 ,  0.04591263, -0.05298669, -0.0397255 ,
        0.06707988, -0.01675842,  0.08076061, -0.01310711,  0.01628348,
        0.03469754, -0.04314699, -0.00516709,  0.2871206 ,  0.05852846,
       -0.18093199, -0.00342047, -0.147456  , -0.04751889, -0.02945601],
      dtype=float32)
# Sanity check: similarity between two sets of words according to the embeddings.
fasttext_model.wv.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])
0.72575206
import string
import numpy as np
# Translation table that deletes every ASCII punctuation character. Built once
# at definition time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def get_document_vector(document_string):
    """Embed one document as a single fastText sentence vector.

    ASCII punctuation (``string.punctuation``) is removed, the text is split
    on whitespace, and the resulting tokens are passed to
    ``fasttext_model.wv.get_sentence_vector``.

    Args:
        document_string: Raw text of one document.

    Returns:
        The sentence vector as a NumPy array, or ``np.nan`` when the embedding
        call raises ``ValueError`` so that callers can drop or replace the row.
    """
    tokens = document_string.translate(_PUNCTUATION_TABLE).split()
    try:
        return np.array(fasttext_model.wv.get_sentence_vector(tokens))
    except ValueError:
        # np.nan rather than np.NAN: the upper-case alias was removed in
        # NumPy 2.0 and would turn this fallback into an AttributeError.
        return np.nan
# Smoke test: a punctuation-heavy string should still come back as a vector.
get_document_vector("This, is - a test.")
array([ 0.01578879, -0.0966718 , -0.12096456,  0.03103824,  0.11989886,
        0.08134278, -0.02491791, -0.04889391, -0.05900102,  0.05264781,
        0.07366802,  0.05264994,  0.03538202,  0.03959122,  0.08029908,
       -0.05133899, -0.00391489,  0.05500277,  0.02347905,  0.08629225,
       -0.08096454,  0.033292  , -0.07492353,  0.03558746, -0.03898185,
        0.01894082, -0.00977144, -0.02125431, -0.09896845, -0.07426734,
        0.07132851,  0.05021148,  0.06596912, -0.02060991, -0.06927098,
       -0.0590184 ,  0.03158417, -0.00033762,  0.18291356,  0.02761706,
       -0.0305428 , -0.07682855, -0.0167096 ,  0.02518708, -0.01596445,
       -0.0379869 ,  0.02503271, -0.10872342, -0.0715234 , -0.10176589,
       -0.03008098, -0.1061382 ,  0.04008991, -0.01109458,  0.01513245,
        0.00942784,  0.00155242,  0.05995774, -0.11261091, -0.06303023,
        0.02372515,  0.00859607, -0.02200282,  0.02862521, -0.091718  ,
        0.01269631, -0.02191854, -0.09026017,  0.03745283, -0.00393062,
       -0.02468689, -0.08132526, -0.0274496 , -0.09630067,  0.07670791,
        0.01474745, -0.05055737, -0.00122033, -0.07364829,  0.01220732,
       -0.09696812, -0.13338262,  0.06731747, -0.03619792,  0.03923816,
       -0.03797578,  0.0150913 , -0.04379996, -0.01847179,  0.06803966,
        0.0418974 , -0.0373757 ,  0.0374969 ,  0.08460734,  0.02028288,
       -0.0726779 ,  0.04701586,  0.02269063,  0.09565686,  0.02680007],
      dtype=float32)
import pandas as pd

# Load the tab-separated training set, supplying the two column names explicitly.
# Malformed lines are skipped with a warning instead of aborting the load.
# NOTE(review): the skipped "expected 2 fields, saw 3" lines may come from stray
# tabs or quote characters in the text; quoting=csv.QUOTE_NONE might recover
# them — TODO confirm against the raw file.
training_file = pd.read_csv("train/train.tsv", sep='\t', on_bad_lines="warn", names=["class","text_data"])
training_file.head()
C:\Users\Adrian\AppData\Local\Temp\ipykernel_1428\2569974358.py:3: ParserWarning: Skipping line 25706: expected 2 fields, saw 3
Skipping line 58881: expected 2 fields, saw 3
Skipping line 73761: expected 2 fields, saw 3

  training_file = pd.read_csv("train/train.tsv", sep='\t', on_bad_lines="warn", names=["class","text_data"])
class text_data
0 1 Mindaugas Budzinauskas wierzy w odbudowę formy...
1 1 Przyjmujący reprezentacji Polski wrócił do PGE...
2 0 FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...
3 1 Aleksander Filipiak: Czuję się dobrze w nowym ...
4 0 Victoria Carl i Aleksiej Czerwotkin mistrzami ...
# Preview only: shows the text with ASCII punctuation removed. The result is
# not assigned, so training_file itself is left unchanged
# (get_document_vector performs its own punctuation stripping).
training_file["text_data"].apply(lambda row: row.translate(str.maketrans('', '', string.punctuation)))
0        Mindaugas Budzinauskas wierzy w odbudowę formy...
1        Przyjmujący reprezentacji Polski wrócił do PGE...
2        FEN 9 Zapowiedź walki Róża Gumienna vs Katarzy...
3        Aleksander Filipiak Czuję się dobrze w nowym k...
4        Victoria Carl i Aleksiej Czerwotkin mistrzami ...
                               ...                        
98124    Kamil Syprzak zaczyna kolekcjonować trofea FC ...
98125    Holandia dwa gole Piotra Parzyszka Piotr Parzy...
98126    Sparingowo Korona gorsza od Stali Lettieri spr...
98127    Vive  Wisła Ośmiu debiutantów w tegorocznej św...
98128    WTA Miami Timea Bacsinszky pokonana Swietłana ...
Name: text_data, Length: 98129, dtype: object
# Drop rows containing any missing value, then show how many rows remain.
training_file.dropna(inplace=True)
len(training_file.index)
98129
# Embed every document. get_document_vector returns NaN when the embedding
# call fails, and the dropna below removes those rows.
training_file["vectorized"] = training_file["text_data"].apply(get_document_vector)
training_file.dropna(inplace=True)
print(len(training_file.index))
98128
# Preview the first rows, now including the "vectorized" column.
training_file.head()
class text_data vectorized
0 1 Mindaugas Budzinauskas wierzy w odbudowę formy... [-0.010491192, -0.058443062, -0.1072605, 0.068...
1 1 Przyjmujący reprezentacji Polski wrócił do PGE... [0.019159772, -0.03807462, -0.093816765, 0.080...
2 0 FEN 9: Zapowiedź walki Róża Gumienna vs Katarz... [0.019561907, -0.09903135, -0.08141139, 0.0962...
3 1 Aleksander Filipiak: Czuję się dobrze w nowym ... [0.0019692876, -0.040995505, -0.112910554, 0.0...
4 0 Victoria Carl i Aleksiej Czerwotkin mistrzami ... [0.026810315, -0.07052034, -0.12447791, 0.0609...
# Summarize column dtypes and non-null counts after vectorization.
training_file.info()
<class 'pandas.core.frame.DataFrame'>
Index: 98128 entries, 0 to 98128
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   class       98128 non-null  int64 
 1   text_data   98128 non-null  object
 2   vectorized  98128 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.0+ MB
# Inspect the embedding stored for the first row.
training_file["vectorized"].iloc[0]
array([-0.01049119, -0.05844306, -0.1072605 ,  0.0680153 ,  0.01738467,
        0.02759303,  0.02690293, -0.02688588, -0.00051565,  0.04960843,
        0.0267325 , -0.07590238, -0.00112739, -0.02663443, -0.01215785,
       -0.02335822,  0.00596362,  0.03255358,  0.03372947, -0.0320864 ,
        0.06242761,  0.05441704, -0.10440411,  0.02391675, -0.02517564,
       -0.00581436,  0.01041707, -0.02866426, -0.07569201, -0.05691882,
        0.01377875,  0.05586738,  0.02601947, -0.01073826, -0.07011177,
        0.05394488,  0.00468541, -0.0290179 ,  0.12888645,  0.05720428,
       -0.04035591, -0.05646745, -0.00185273,  0.01846331,  0.02260421,
       -0.05327827, -0.0299728 , -0.01459699, -0.01037856, -0.05196216,
       -0.02092045, -0.00421424, -0.0101665 , -0.01815657,  0.03365456,
        0.01784232, -0.01427742, -0.05149295, -0.01840808,  0.08991119,
        0.03609616, -0.03085677, -0.02868558,  0.0879923 , -0.08462378,
       -0.02428374, -0.06649223,  0.05328292,  0.09114845, -0.0074865 ,
        0.07581685,  0.02017863,  0.01063073,  0.02651897,  0.00125264,
       -0.04205399, -0.15118514, -0.01358473, -0.04589266,  0.00465928,
       -0.01037135, -0.0240653 ,  0.01271867, -0.00046581, -0.0062453 ,
       -0.01982017, -0.00213563,  0.0068075 , -0.01338028, -0.01335924,
        0.11551541,  0.01461171, -0.0956174 ,  0.09537749,  0.02394151,
       -0.1085504 ,  0.0310267 ,  0.02344807,  0.01435937,  0.03094357],
      dtype=float32)
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.optimizers import Adam

def create_model():
    """Build and compile the binary classifier used on document vectors.

    The network takes a 100-element input, applies one 64-unit ReLU hidden
    layer and ends in a single sigmoid unit. It is compiled with binary
    cross-entropy loss, a default-configured Adam optimizer and the accuracy
    metric.

    Returns:
        The compiled ``keras.Model``.
    """
    features = keras.Input(shape=(100,))
    hidden = layers.Dense(64, activation="relu")(features)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    classifier = keras.Model(inputs=features, outputs=probability)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer=Adam(),
        metrics=['accuracy'],
    )
    return classifier
from sklearn.model_selection import train_test_split

# Shuffle all rows and renumber the index from 0.
# NOTE(review): train_test_split shuffles by default as well, so this extra
# shuffle is probably redundant — confirm before removing.
training_file = training_file.sample(frac=1).reset_index(drop=True)
# Hold out 20% of the rows for validation.
# NOTE(review): no random_state is passed here or to sample() above, so the
# split (and the metrics below) will differ between runs.
train, valid = train_test_split(training_file, test_size=0.2)
# Stack the per-row vectors into feature matrices and the labels into arrays
# that can be handed to model.fit.
train_x = np.stack(train["vectorized"].values)
train_y = np.stack(train["class"].values)
valid_x = np.stack(valid["vectorized"].values)
valid_y = np.stack(valid["class"].values)
print(train["vectorized"])
11925    [0.00907336, -0.035000063, -0.046387862, 0.107...
29681    [0.029005446, -0.062998086, -0.10763814, 0.059...
39659    [0.0073042903, -0.028519068, -0.07020145, 0.08...
26203    [-0.012138679, -0.036977977, -0.05332508, 0.05...
93611    [0.034259614, -0.06937863, -0.09370455, 0.0414...
                               ...                        
90557    [0.015421399, -0.051549092, -0.118074715, 0.07...
10805    [-0.017966524, -0.07279962, -0.10843535, 0.071...
17336    [0.038043424, -0.024239093, -0.11319029, 0.066...
39497    [0.03166563, -0.061132513, -0.09316901, 0.1028...
87005    [0.02179843, -0.042094912, -0.078197055, 0.084...
Name: vectorized, Length: 78502, dtype: object
# Preview the first rows of the training split.
train.head()
class text_data vectorized
11925 1 Tam było czuć historię. Leo Beenhakker zaurocz... [0.00907336, -0.035000063, -0.046387862, 0.107...
29681 1 Filip Dylewicz: Po raz pierwszy od 20 lat prow... [0.029005446, -0.062998086, -0.10763814, 0.059...
39659 1 Czytaj w "PN". Finaliści MŚ. Piękno i rygor Hi... [0.0073042903, -0.028519068, -0.07020145, 0.08...
26203 1 Novak Djoković podał do sądu władze miasta Rio... [-0.012138679, -0.036977977, -0.05332508, 0.05...
93611 1 Wimbledon: Jelena Ostapenko nie stawiła oporu ... [0.034259614, -0.06937863, -0.09370455, 0.0414...
# Number of rows in the training split.
print(len(train.index))
78502
# Preview the first rows of the validation split.
valid.head()
class text_data vectorized
87636 1 Szok w Hali Mistrzów, Energa Czarni w półfinal... [0.0040451484, -0.034110088, -0.1111216, 0.050...
96088 1 Postępy Kuby Błaszczykowskiego Jakub Błaszczyk... [0.01574161, -0.055649985, -0.077657014, 0.085...
54386 1 Pobici piłkarze odchodzą ze Sportingu Lizbona.... [-0.013020566, -0.076468304, -0.127176, 0.0720...
29418 1 El. LE: polskie kluby znają potencjalnych rywa... [-0.005546203, -0.033757057, -0.10181239, 0.07...
80561 1 Było ofensywnie i efektownie. Polpharma Starog... [0.021034276, -0.06635279, -0.091047965, 0.054...
# Number of rows in the validation split.
print(len(valid.index))
19626
# Early stopping on validation loss: patience of 3 epochs, and the weights from
# the best epoch are restored when training stops early.
callback = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=3, restore_best_weights=True)
model = create_model()
# Train for at most 20 epochs, evaluating on the held-out validation arrays.
history = model.fit(train_x, train_y, validation_data=(valid_x, valid_y), epochs=20, callbacks=[callback])
Epoch 1/20
2454/2454 [==============================] - 3s 1ms/step - loss: 0.3017 - accuracy: 0.8763 - val_loss: 0.2039 - val_accuracy: 0.9214
Epoch 2/20
2454/2454 [==============================] - 2s 946us/step - loss: 0.1920 - accuracy: 0.9256 - val_loss: 0.1896 - val_accuracy: 0.9268
Epoch 3/20
2454/2454 [==============================] - 2s 989us/step - loss: 0.1837 - accuracy: 0.9285 - val_loss: 0.1848 - val_accuracy: 0.9287
Epoch 4/20
2454/2454 [==============================] - 2s 954us/step - loss: 0.1795 - accuracy: 0.9298 - val_loss: 0.1820 - val_accuracy: 0.9301
Epoch 5/20
2454/2454 [==============================] - 3s 1ms/step - loss: 0.1768 - accuracy: 0.9308 - val_loss: 0.1804 - val_accuracy: 0.9304
Epoch 6/20
2454/2454 [==============================] - 3s 1ms/step - loss: 0.1733 - accuracy: 0.9320 - val_loss: 0.1756 - val_accuracy: 0.9324
Epoch 7/20
2454/2454 [==============================] - 2s 982us/step - loss: 0.1692 - accuracy: 0.9336 - val_loss: 0.1721 - val_accuracy: 0.9331
Epoch 8/20
2454/2454 [==============================] - 3s 1ms/step - loss: 0.1636 - accuracy: 0.9359 - val_loss: 0.1699 - val_accuracy: 0.9349
Epoch 9/20
2454/2454 [==============================] - 2s 952us/step - loss: 0.1578 - accuracy: 0.9379 - val_loss: 0.1671 - val_accuracy: 0.9358
Epoch 10/20
2454/2454 [==============================] - 3s 1ms/step - loss: 0.1519 - accuracy: 0.9413 - val_loss: 0.1549 - val_accuracy: 0.9400
Epoch 11/20
2454/2454 [==============================] - 2s 974us/step - loss: 0.1462 - accuracy: 0.9432 - val_loss: 0.1509 - val_accuracy: 0.9420
Epoch 12/20
2454/2454 [==============================] - 2s 967us/step - loss: 0.1413 - accuracy: 0.9450 - val_loss: 0.1460 - val_accuracy: 0.9437
Epoch 13/20
2454/2454 [==============================] - 2s 971us/step - loss: 0.1367 - accuracy: 0.9459 - val_loss: 0.1396 - val_accuracy: 0.9472
Epoch 14/20
2454/2454 [==============================] - 3s 1ms/step - loss: 0.1322 - accuracy: 0.9492 - val_loss: 0.1380 - val_accuracy: 0.9482
Epoch 15/20
2454/2454 [==============================] - 2s 986us/step - loss: 0.1293 - accuracy: 0.9501 - val_loss: 0.1343 - val_accuracy: 0.9495
Epoch 16/20
2454/2454 [==============================] - 2s 975us/step - loss: 0.1262 - accuracy: 0.9516 - val_loss: 0.1320 - val_accuracy: 0.9494
Epoch 17/20
2454/2454 [==============================] - 2s 975us/step - loss: 0.1236 - accuracy: 0.9523 - val_loss: 0.1289 - val_accuracy: 0.9510
Epoch 18/20
2454/2454 [==============================] - 2s 990us/step - loss: 0.1210 - accuracy: 0.9534 - val_loss: 0.1272 - val_accuracy: 0.9514
Epoch 19/20
2454/2454 [==============================] - 3s 1ms/step - loss: 0.1193 - accuracy: 0.9539 - val_loss: 0.1246 - val_accuracy: 0.9530
Epoch 20/20
2454/2454 [==============================] - 3s 1ms/step - loss: 0.1174 - accuracy: 0.9544 - val_loss: 0.1240 - val_accuracy: 0.9518
from matplotlib import pyplot as plt
# Plot training and validation accuracy per epoch from the fit history.
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
# NOTE: the series labeled 'test' is the validation split passed to fit().
plt.legend(['train', 'test'], loc='upper left')
<matplotlib.legend.Legend at 0x1654007c520>
from matplotlib import pyplot as plt
# Plot training and validation loss per epoch from the fit history.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
# The y-axis carries loss values; it was previously mislabeled 'accuracy'.
plt.ylabel('loss')
plt.xlabel('epoch')
# NOTE: the series labeled 'test' is the validation split passed to fit().
plt.legend(['train', 'test'], loc='upper right')
<matplotlib.legend.Legend at 0x16542647be0>
def process_input(directory):
    """Predict with the global ``model`` for every line of ``<directory>/in.tsv``.

    Each line of the file is treated as one document, embedded with
    ``get_document_vector`` and passed to ``model.predict``.

    Args:
        directory: Directory containing ``in.tsv`` (read as UTF-8, one
            document per line).

    Returns:
        The array returned by ``model.predict``, with one row per input line
        in file order.

    Raises:
        ValueError: If no line of the file can be vectorized, which includes
            an empty file.
    """
    with open(directory + "/in.tsv", encoding="utf-8") as data_file:
        # Read the lines in one go; appending to a DataFrame row by row copies
        # the whole frame on every iteration and relied on a private pandas API.
        documents = data_file.readlines()

    # get_document_vector strips punctuation itself, so no separate cleaning
    # pass over the text is needed.
    vectors = [get_document_vector(text) for text in documents]

    # Use the first successfully embedded line as a template for shape/dtype.
    template = next((vec for vec in vectors if isinstance(vec, np.ndarray)), None)
    if template is None:
        raise ValueError(f"no vectorizable lines found in {directory}/in.tsv")

    # get_document_vector returns NaN (not an array) when embedding fails.
    # Substitute a zero vector instead of dropping the line so the output keeps
    # exactly one prediction per input line and np.stack sees uniform shapes.
    fallback = np.zeros_like(template)
    data_x = np.stack(
        [vec if isinstance(vec, np.ndarray) else fallback for vec in vectors]
    )
    return model.predict(data_x)

# Predict on the dev-0 split and keep column 0, the model's single output unit.
predictions = process_input("dev-0")[:,0]
171/171 [==============================] - 0s 591us/step
# Inspect the raw (unrounded) model outputs.
predictions
array([9.9900788e-01, 9.9998349e-01, 2.1855670e-03, ..., 2.0675772e-04,
       9.9930727e-01, 9.8721308e-01], dtype=float32)
# Preview the first 100 outputs rounded to the nearest integer.
print(np.rint(predictions)[:100])
[1. 1. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 1. 0. 1. 1. 0. 0. 1. 1. 1. 1.
 1. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0.
 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0.
 0. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 0.
 1. 0. 0. 1.]
# Round each output to the nearest integer label and write one label per line.
predictions_rounded = np.rint(predictions)
np.savetxt("dev-0/out.tsv",predictions_rounded, fmt='%i')
# Same prediction step for the test-A split (overwrites the dev-0 predictions).
predictions = process_input("test-A")[:,0]
171/171 [==============================] - 0s 619us/step
# Round and write the test-A labels, one per line.
predictions_rounded = np.rint(predictions)
np.savetxt("test-A/out.tsv",predictions_rounded, fmt='%i')
# Persist the trained classifier to disk.
model.save("model.keras")