Word2Vec/Word2Vec2.ipynb

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Prepare the corpus for word2vec training: one token list per line of each file
def prepare_corpus(filepaths):
    corpus = []
    for filepath in filepaths:
        with open(filepath, 'r', encoding="utf8") as file:
            for line in file:
                tokens = simple_preprocess(line)
                corpus.append(tokens)
    return corpus

# Convert a text into a single vector by averaging its word2vec word vectors
def vectorize_text(text, model):
    tokens = simple_preprocess(text)
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)
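
# Minimal sketch of the averaging behaviour on a throwaway toy model
# (hypothetical two-sentence corpus; min_count=1 gives every token a vector):
_toy = Word2Vec(sentences=[["hello", "world"], ["hello", "there"]],
                vector_size=10, min_count=1)
assert vectorize_text("hello world", _toy).shape == (10,)
assert np.allclose(vectorize_text("zzz unseen", _toy), np.zeros(10))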

# Load text data, one stripped line per entry
def load_data(filepath):
    texts = []
    with open(filepath, 'r', encoding="utf8") as file:
        for line in file:
            texts.append(line.strip())
    return texts

# Prepare the corpus and train the word2vec model
corpus = prepare_corpus(['dev-0/in.tsv', 'test-A/in.tsv'])
w2v_model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4)
w2v_model.save("word2vec.model")
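
# Optional sanity check on the trained embeddings (assumes the word "good"
# occurs in the corpus; substitute any word from your data):
if "good" in w2v_model.wv:
    print(w2v_model.wv.most_similar("good", topn=5))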

# Load the texts
dev_texts = load_data('dev-0/in.tsv')
test_texts = load_data('test-A/in.tsv')

# Convert the texts to vectors
dev_vectors = np.array([vectorize_text(text, w2v_model) for text in dev_texts])
test_vectors = np.array([vectorize_text(text, w2v_model) for text in test_texts])

# Load labels for the dev set
dev_labels_df = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None)
dev_labels = dev_labels_df[0].values

# Split the dev data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(dev_vectors, dev_labels, test_size=0.2, random_state=42)
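
# The classes are imbalanced (roughly 64/36, per the report below), so a
# stratified split, a standard scikit-learn option, is worth considering:
#   X_train, X_val, y_train, y_val = train_test_split(
#       dev_vectors, dev_labels, test_size=0.2, random_state=42,
#       stratify=dev_labels)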

# Build the neural network classifier
model_nn = Sequential([
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
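
# The input dimension (100, from vector_size) is inferred by Keras on the
# first call to fit; an explicit Input layer would pin it down up front:
#   model_nn = Sequential([tf.keras.Input(shape=(w2v_model.vector_size,)),
#                          Dense(64, activation='relu'), ...])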

model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model with validation
history = model_nn.fit(X_train, y_train, epochs=1000, batch_size=32, validation_data=(X_val, y_val), verbose=0)
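
# Note: 1000 epochs on roughly 4,400 training vectors can easily overfit; a
# standard Keras alternative is early stopping on the validation loss, e.g.:
#   early_stop = tf.keras.callbacks.EarlyStopping(
#       monitor='val_loss', patience=10, restore_best_weights=True)
#   history = model_nn.fit(X_train, y_train, epochs=1000, batch_size=32,
#                          validation_data=(X_val, y_val), verbose=0,
#                          callbacks=[early_stop])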

# Predictions for the dev and test sets
dev_predictions = model_nn.predict(dev_vectors)
test_predictions = model_nn.predict(test_vectors)

# Convert the predictions to binary classes (0 or 1)
dev_predictions = (dev_predictions > 0.5).astype(int)
test_predictions = (test_predictions > 0.5).astype(int)

# Save the predictions to files
def save_predictions(predictions, filepath):
    with open(filepath, 'w', encoding="utf8") as file:
        for pred in predictions:
            file.write(f"{pred[0]}\n")

save_predictions(dev_predictions, 'dev-0/out.tsv')
save_predictions(test_predictions, 'test-A/out.tsv')

# Compare the results with the "expected" file (note: 80% of the dev set was
# used for training, so this dev accuracy is optimistic)
dev_pred_labels = pd.read_csv('dev-0/out.tsv', header=None).values.flatten()
expected_labels = pd.read_csv('dev-0/expected.tsv', sep='\t', header=None).values.flatten()

# Print the accuracy and the classification report
accuracy = accuracy_score(expected_labels, dev_pred_labels)
report = classification_report(expected_labels, dev_pred_labels)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)
171/171 ━━━━━━━━━━━━━━━━━━━━ 0s 676us/step
171/171 ━━━━━━━━━━━━━━━━━━━━ 0s 541us/step
Accuracy: 0.9394717534849596
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1983
           1       0.95      0.95      0.95      3469

    accuracy                           0.94      5452
   macro avg       0.93      0.93      0.93      5452
weighted avg       0.94      0.94      0.94      5452