word2vec_dl/word2vec.ipynb
2024-09-27 04:17:42 +02:00

8.2 KiB

import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch
import torch.nn as nn
import torch.optim as optim
# Input (feature) files of the sport-text-classification dataset: one document per line.
dev_0_in = "./sport-text-classification-ball-ISI-public/dev-0/in.tsv"
test_A_in = "./sport-text-classification-ball-ISI-public/test-A/in.tsv"

# Destination files for the model's binary (0/1) predictions.
dev_0_out = "./sport-text-classification-ball-ISI-public/dev-0/out.tsv"
test_A_out = "./sport-text-classification-ball-ISI-public/test-A/out.tsv"

# Training split and the gold labels for the dev-0 split.
train = "./sport-text-classification-ball-ISI-public/train/train.tsv"
expected = "./sport-text-classification-ball-ISI-public/dev-0/expected.tsv"
def build_corpus(file_list):
    """Tokenize every line of every file in *file_list*.

    Each line is run through gensim's ``simple_preprocess`` (lowercased,
    de-accented word tokens); the result is a list of token lists suitable
    as the ``sentences`` argument of ``Word2Vec``.
    """
    corpus = []
    for path in file_list:
        with open(path, 'r', encoding="utf8") as handle:
            corpus.extend(simple_preprocess(raw) for raw in handle)
    return corpus
def text_to_vector(text, model):
    """Embed *text* as the mean of its in-vocabulary Word2Vec vectors.

    Tokens not present in ``model.wv`` are skipped; when no token is known,
    a zero vector of length ``model.vector_size`` is returned instead.
    """
    vectors = [model.wv[tok] for tok in simple_preprocess(text) if tok in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)
def read_text(filepath):
    """Return the lines of *filepath* as a list of strings, whitespace-stripped."""
    with open(filepath, 'r', encoding="utf8") as file:
        return [line.strip() for line in file]
def save_predictions(predictions, filepath):
    """Write *predictions* to *filepath*, one value per line.

    Each element is expected to be indexable (e.g. a row of an (N, 1)
    array); only its first entry is written.
    """
    lines = [f"{row[0]}\n" for row in predictions]
    with open(filepath, 'w', encoding="utf8") as file:
        file.writelines(lines)
# Train Word2Vec embeddings on the dev-0 and test-A inputs.
# NOTE(review): the train split (`train` path) is never used anywhere below,
# for either the embeddings or the classifier — confirm this is intentional.
documents = build_corpus([dev_0_in, test_A_in])
w2v_model = Word2Vec(sentences=documents, vector_size=100, window=5, min_count=1, workers=4)
w2v_model.save("word2vec.model")

# Represent each document as the mean of its word vectors (100-dim, see vector_size).
dev_texts = read_text(dev_0_in)
test_texts = read_text(test_A_in)
dev_features = np.array([text_to_vector(text, w2v_model) for text in dev_texts])
test_features = np.array([text_to_vector(text, w2v_model) for text in test_texts])

# Gold labels for dev-0; the classifier is fit on an 80/20 split of the dev set itself.
dev_labels = pd.read_csv(expected, sep='\t', header=None).values.flatten()
X_train, X_valid, y_train, y_valid = train_test_split(dev_features, dev_labels, test_size=0.2, random_state=42)
# Float32 tensors for BCELoss; unsqueeze(1) gives labels shape (N, 1) to match the model output.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_valid_tensor = torch.tensor(X_valid, dtype=torch.float32)
y_valid_tensor = torch.tensor(y_valid, dtype=torch.float32).unsqueeze(1)
dev_features_tensor = torch.tensor(dev_features, dtype=torch.float32)
test_features_tensor = torch.tensor(test_features, dtype=torch.float32)

class SimpleNN(nn.Module):
    """Three-layer MLP (100 -> 64 -> 32 -> 1) for binary classification.

    Hidden layers use ReLU; the output is squashed with a sigmoid, so the
    forward pass yields a probability in [0, 1] per input row.
    """

    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(100, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.sigmoid(self.fc3(hidden))

model = SimpleNN()
criterion = nn.BCELoss()  # expects probabilities; pairs with the model's sigmoid output
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 1000
batch_size = 32  # NOTE(review): unused — the loop below trains full-batch, not in mini-batches
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    
    # Full-batch forward/backward pass over the entire training split.
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    
    loss.backward()
    optimizer.step()
    
    # Every 100 epochs, report training and validation loss.
    if (epoch+1) % 100 == 0:
        model.eval()
        with torch.no_grad():
            valid_outputs = model(X_valid_tensor)
            valid_loss = criterion(valid_outputs, y_valid_tensor)
            print(f'Epoka [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Loss: {valid_loss.item():.4f}')

# Predict probabilities for the full dev and test sets (no gradients needed).
model.eval()
with torch.no_grad():
    dev_predictions_raw = model(dev_features_tensor).numpy()
    test_predictions_raw = model(test_features_tensor).numpy()

# Threshold at 0.5 to get (N, 1) int arrays of 0/1 class labels.
dev_predictions = (dev_predictions_raw > 0.5).astype(int)
test_predictions = (test_predictions_raw > 0.5).astype(int)
Epoka [100/1000], Loss: 0.3149, Validation Loss: 0.3540
Epoka [200/1000], Loss: 0.2778, Validation Loss: 0.3339
Epoka [300/1000], Loss: 0.2638, Validation Loss: 0.3201
Epoka [400/1000], Loss: 0.2511, Validation Loss: 0.3047
Epoka [500/1000], Loss: 0.2408, Validation Loss: 0.2913
Epoka [600/1000], Loss: 0.2321, Validation Loss: 0.2807
Epoka [700/1000], Loss: 0.2243, Validation Loss: 0.2718
Epoka [800/1000], Loss: 0.2182, Validation Loss: 0.2654
Epoka [900/1000], Loss: 0.2136, Validation Loss: 0.2605
Epoka [1000/1000], Loss: 0.2101, Validation Loss: 0.2573
# Persist predictions, then re-read the dev output file and score it
# against the gold labels loaded earlier from `expected`.
save_predictions(dev_predictions, dev_0_out)
save_predictions(test_predictions,test_A_out)
df = pd.read_csv(dev_0_out, header=None).values.flatten()

accuracy = accuracy_score(dev_labels, df)
report = classification_report(dev_labels, df)

print(f"Dokładność: {accuracy:.4f}")  # "Dokładność" = "Accuracy" (Polish)
print(report)
Dokładność: 0.8995
              precision    recall  f1-score   support

           0       0.88      0.84      0.86      1983
           1       0.91      0.93      0.92      3469

    accuracy                           0.90      5452
   macro avg       0.89      0.89      0.89      5452
weighted avg       0.90      0.90      0.90      5452