# sport-text-classification-b.../main.py
from gensim.models import KeyedVectors
import nltk
import numpy as np
import pandas as pd
import torch
import csv
# Load the train/dev/test splits from TSV files
train = pd.read_table('train/train.tsv', error_bad_lines=False,
                      sep='\t', quoting=csv.QUOTE_NONE, header=None)
x_dev = pd.read_table('dev-0/in.tsv', error_bad_lines=False,
                      sep='\t', header=None, quoting=csv.QUOTE_NONE)
y_dev = pd.read_table('dev-0/expected.tsv', error_bad_lines=False,
                      sep='\t', header=None, quoting=csv.QUOTE_NONE)
x_test = pd.read_table('test-A/in.tsv', error_bad_lines=False,
                       sep='\t', header=None, quoting=csv.QUOTE_NONE)
# Split the data into x and y: in train.tsv column 0 is the label and
# column 1 is the text; the dev/test in.tsv files contain only the text.
x_train = train[1].values
y_train = train[0].values
x_dev = x_dev[0].values
x_test = x_test[0].values
# The punkt tokenizer data only needs to be downloaded once:
# nltk.download('punkt')
# Tokenization: keep only alphabetic tokens and lowercase them
def tokenize(data):
    return [[token.lower() for token in nltk.word_tokenize(doc) if token.isalpha()]
            for doc in data]
x_train_tokenized = tokenize(x_train)
x_dev_tokenized = tokenize(x_dev)
x_test_tokenized = tokenize(x_test)
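# Illustrative example (hypothetical input): tokenize(["Lewandowski strzelił 2 gole!"])
# returns [["lewandowski", "strzelił", "gole"]] - digits and punctuation are dropped.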
# Custom embeddings trained on wiki-forms-all-100-skipg-ns.
# Only needed on the first run: download the archive from
# http://dsmodels.nlp.ipipan.waw.pl/dsmodels/wiki-forms-all-100-skipg-ns.txt.gz
# then convert and cache it in gensim's native format:
# word2vec = KeyedVectors.load_word2vec_format(
#     'wiki-forms-all-100-skipg-ns.txt.gz', binary=False)
# word2vec.save("word2vec.bin")
word2vec = KeyedVectors.load("word2vec.bin")
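# Optional sanity check (a minimal sketch, assuming the 100-dimensional
# skip-gram model above): the embedding size must match the input size of
# the classifier defined below.
assert word2vec.vector_size == 100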
# Represent each document as the mean of its word vectors; documents with
# no in-vocabulary words fall back to a zero vector.
def document_vector(docs):
    return np.array([
        np.mean([word2vec[word] for word in doc if word in word2vec]
                or [np.zeros(100)], axis=0)
        for doc in docs
    ], dtype=np.float32)

x_train_vec = document_vector(x_train_tokenized)
x_dev_vec = document_vector(x_dev_tokenized)
x_test_vec = document_vector(x_test_tokenized)
# Simple feed-forward classifier: 100-d document vector -> 200 hidden units -> 1 sigmoid output
class NNModel(torch.nn.Module):
    def __init__(self):
        super(NNModel, self).__init__()
        self.fc1 = torch.nn.Linear(100, 200)
        self.fc2 = torch.nn.Linear(200, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x
model = NNModel()
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
batch_size = 12

for epoch in range(10):
    loss_score = 0
    acc_score = 0
    items_total = 0
    model.train()
    for i in range(0, y_train.shape[0], batch_size):
        X = torch.tensor(x_train_vec[i:i + batch_size])
        Y = torch.tensor(y_train[i:i + batch_size].astype(np.float32)).reshape(-1, 1)
        Y_predictions = model(X)
        acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
        items_total += Y.shape[0]
        optimizer.zero_grad()
        loss = criterion(Y_predictions, Y)
        loss.backward()
        optimizer.step()
        loss_score += loss.item() * Y.shape[0]
    print(f'epoch {epoch}: loss {loss_score / items_total:.4f}, '
          f'accuracy {acc_score / items_total:.4f}')
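# y_dev is loaded above but never checked against; a minimal sketch of a
# dev-set accuracy report (assumes the expected labels are 0/1 integers):
model.eval()
with torch.no_grad():
    dev_probs = model(torch.tensor(x_dev_vec))
dev_labels = torch.tensor(y_dev[0].values.astype(np.float32)).reshape(-1, 1)
dev_accuracy = torch.sum((dev_probs > 0.5) == dev_labels).item() / dev_labels.shape[0]
print(f'dev accuracy: {dev_accuracy:.4f}')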
# Generate predictions for the dev and test sets
model.eval()
with torch.no_grad():
    y_pred_dev = model(torch.tensor(x_dev_vec))
y_pred_dev = (y_pred_dev.cpu().numpy() > 0.5).astype(np.int32)
y_pred_dev.tofile('dev-0/out.tsv', sep='\n')

with torch.no_grad():
    y_pred_test = model(torch.tensor(x_test_vec))
y_pred_test = (y_pred_test.cpu().numpy() > 0.5).astype(np.int32)
y_pred_test.tofile('test-A/out.tsv', sep='\n')