Jakub 2021-07-03 22:21:37 +02:00
parent c6566dd4e8
commit 2fa5ee6636
3 changed files with 2433 additions and 2550 deletions

File diff suppressed because it is too large

277
main.py

@@ -1,222 +1,105 @@
-#!/usr/bin/env python
-# coding: utf-8
-
-# In[1]:
-
+# noinspection PyUnresolvedReferences
 import csv
+import torch
-
-# In[2]:
-
-get_ipython().system('pip install gensim')
-
-# In[17]:
-
-import nltk
-nltk.download('punkt')
-
-# In[9]:
-
-get_ipython().system('pip install nltk')
-
-# In[3]:
-
-get_ipython().system('pip install torch')
-
-# In[4]:
-
-import gensim.downloader
 import numpy as np
 import pandas as pd
-import torch
+from nltk.util import pr
+from gensim import downloader
+from nltk.tokenize import word_tokenize
-
-# In[5]:
-
-import torch.nn as nn
-from nltk import word_tokenize
-
-# In[13]:
-
-header_names = ["content", "id", "label"]
-
-# In[23]:
-
-class FF(nn.Module):
-    def __init__(self, input_dim, hidden_dim, output_dim):
-        super(FF, self).__init__()
-        self.fc1 = nn.Linear(input_dim, hidden_dim)
-        self.relu1 = nn.ReLU()
-        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
-        self.relu2 = nn.ReLU()
-        self.fc3 = nn.Linear(hidden_dim, output_dim)
+
+BATCH_SIZE = 5
+
+
+class NeuralNetworkModel(torch.nn.Module):
+    def __init__(self):
+        dim = 200
+        super(NeuralNetworkModel, self).__init__()
+        self.one = torch.nn.Linear(dim, 500)
+        self.two = torch.nn.Linear(500, 1)
 
     def forward(self, x):
-        out = self.fc1(x)
-        out = self.relu1(out)
-        out = self.relu2(out)
-        out = self.fc3(out)
-        return torch.sigmoid(out)
-
-train_set_labels = pd.read_table(
-    "train/expected.tsv",
-    error_bad_lines=False,
-    quoting=csv.QUOTE_NONE,
-    header=None,
-    names=header_names[2:],
-)
-
-train_set_features = pd.read_table(
-    "train/in.tsv.xz",
-    error_bad_lines=False,
-    quoting=csv.QUOTE_NONE,
-    header=None,
-    names=header_names[:2],
-)
-
-test_set = pd.read_table(
-    "test-A/in.tsv.xz",
-    error_bad_lines=False,
-    header=None,
-    quoting=csv.QUOTE_NONE,
-    names=header_names[:2],
-)
-
-dev_set = pd.read_table(
-    "dev-0/in.tsv.xz",
-    error_bad_lines=False,
-    header=None,
-    quoting=csv.QUOTE_NONE,
-    names=header_names[:2],
-)
-
-X_train = train_set_features["content"].str.lower()
-y_train = train_set_labels["label"]
-X_dev = dev_set["content"].str.lower()
-X_test = test_set["content"].str.lower()
-
-X_train = [word_tokenize(content) for content in X_train]
-X_dev = [word_tokenize(content) for content in X_dev]
-X_test = [word_tokenize(content) for content in X_test]
-
-word2vec = gensim.downloader.load("word2vec-google-news-300")
-
-# In[24]:
-
-X_train = [
-    np.mean(
-        [word2vec[word] for word in content if word in word2vec] or [np.zeros(300)],
-        axis=0,
-    )
-    for content in X_train
-]
-X_dev = [
-    np.mean(
-        [word2vec[word] for word in content if word in word2vec] or [np.zeros(300)],
-        axis=0,
-    )
-    for content in X_dev
-]
-X_test = [
-    np.mean(
-        [word2vec[word] for word in content if word in word2vec] or [np.zeros(300)],
-        axis=0,
-    )
-    for content in X_test
-]
-
-hidden_layer = 650
-epochs = 15
-batch_size = 10
-
-# In[27]:
-
-output_dim = 1
-input_dim = 300
-model = FF(input_dim, hidden_layer, output_dim)
-optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
-criterion = torch.nn.BCELoss()
-
-# In[28]:
-
-for epoch in range(epochs):
-    model.train()
-    for i in range(0, y_train.shape[0], batch_size):
-        X = X_train[i : i + batch_size]
-        X = torch.tensor(X)
-        y = y_train[i : i + batch_size]
-        y = torch.tensor(y.astype(np.float32).to_numpy()).reshape(-1, 1)
-        outputs = model(X.float())
-        loss = criterion(outputs, y)
-        optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-
-test_prediction = []
-dev_prediction = []
-model.eval()
-with torch.no_grad():
-    for i in range(0, len(X_test), batch_size):
-        X = X_test[i : i + batch_size]
-        X = torch.tensor(X)
-        outputs = model(X.float())
-        prediction = outputs > 0.5
-        test_prediction += prediction.tolist()
-    for i in range(0, len(X_dev), batch_size):
-        X = X_dev[i : i + batch_size]
-        X = torch.tensor(X)
-        outputs = model(X.float())
-        prediction = outputs > 0.5
-        dev_prediction += prediction.tolist()
-
-test_prediction = np.asarray(test_prediction, dtype=np.int32)
-dev_prediction = np.asarray(dev_prediction, dtype=np.int32)
-test_prediction.tofile("./test-A/out.tsv", sep="\n")
-dev_prediction.tofile("./dev-0/out.tsv", sep="\n")
-
-# In[ ]:
-
-# In[ ]:
-
-# In[ ]:
+        x = self.one(x)
+        x = torch.relu(x)
+        x = self.two(x)
+        x = torch.sigmoid(x)
+        return x
+
+
+def read_data():
+    x_labels = (pd.read_csv('in-header.tsv', sep='\t')).columns
+    y_labels = (pd.read_csv('out-header.tsv', sep='\t')).columns
+
+    x_train = pd.read_table('train/in.tsv', header=None, quoting=csv.QUOTE_NONE, names=x_labels)
+    y_train = pd.read_table('train/expected.tsv', header=None, quoting=csv.QUOTE_NONE, names=y_labels)
+    x_dev = pd.read_table('dev-0/in.tsv', header=None, quoting=csv.QUOTE_NONE, names=x_labels)
+    x_test = pd.read_table('test-A/in.tsv', header=None, quoting=csv.QUOTE_NONE, names=x_labels)
+
+    # remove some rows for faster development
+    remove_n = 200000
+    drop_indices = np.random.choice(x_train.index, remove_n, replace=False)
+    x_train = x_train.drop(drop_indices)
+    y_train = y_train.drop(drop_indices)
+
+    return x_labels, y_labels, x_train, y_train, x_dev, x_test
+
+
+def process_data(x_labels, y_labels, x_train, y_train, x_dev, x_test):
+    x_train = x_train[x_labels[0]].str.lower()
+    x_dev = x_dev[x_labels[0]].str.lower()
+    x_test = x_test[x_labels[0]].str.lower()
+    y_train = y_train[y_labels[0]]
+
+    x_train = [word_tokenize(x) for x in x_train]
+    x_dev = [word_tokenize(x) for x in x_dev]
+    x_test = [word_tokenize(x) for x in x_test]
+
+    w2v = downloader.load('glove-wiki-gigaword-200')
+    x_train = [np.mean([w2v[w] for w in d if w in w2v] or [np.zeros(200)], axis=0) for d in x_train]
+    x_dev = [np.mean([w2v[w] for w in d if w in w2v] or [np.zeros(200)], axis=0) for d in x_dev]
+    x_test = [np.mean([w2v[w] for w in d if w in w2v] or [np.zeros(200)], axis=0) for d in x_test]
+
+    return x_train, y_train, x_dev, x_test
+
+
+def predict(model, x_data, out_path):
+    y_out = []
+    model.eval()
+    with torch.no_grad():
+        for i in range(0, len(x_data), BATCH_SIZE):
+            x = x_data[i:i + BATCH_SIZE]
+            x = torch.tensor(x)
+            pred = nn_model(x.float())
+            y_pred = (pred > 0.5)
+            y_out.extend(y_pred)
+
+    y_data = np.asarray(y_out, dtype=np.int32)
+    pd.DataFrame(y_data).to_csv(out_path, sep='\t', index=False, header=False)
+
+
+if __name__ == "__main__":
+    x_labels, y_labels, x_train, y_train, x_dev, x_test = read_data()
+    x_train, y_train, x_dev, x_test = process_data(x_labels, y_labels, x_train, y_train, x_dev, x_test)
+
+    nn_model = NeuralNetworkModel()
+    criterion = torch.nn.BCELoss()
+    optimizer = torch.optim.SGD(nn_model.parameters(), lr=0.1)
+
+    for epoch in range(5):
+        nn_model.train()
+        for i in range(0, y_train.shape[0], BATCH_SIZE):
+            X = x_train[i:i + BATCH_SIZE]
+            X = torch.tensor(X)
+            Y = y_train[i:i + BATCH_SIZE]
+            Y = torch.tensor(Y.astype(np.float32).to_numpy()).reshape(-1, 1)
+            Y_predictions = nn_model(X.float())
+            loss = criterion(Y_predictions, Y)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+    predict(nn_model, x_dev, 'dev-0/out.tsv')
+    predict(nn_model, x_test, 'test-A/out.tsv')
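
A note on the new predict() helper: as committed it takes a model parameter but actually calls the global nn_model, so it works only because __main__ happens to define that name, and the from nltk.util import pr line is unused (it looks like an accidental editor auto-import). A minimal corrected sketch, assuming x_data is the list of 200-dim averaged GloVe vectors produced by process_data(); the np.stack call is only an optimization, since torch.tensor on a plain list of NumPy arrays is slow:

def predict(model, x_data, out_path):
    # Batch the averaged-embedding vectors and write 0/1 predictions as one TSV column.
    y_out = []
    model.eval()
    with torch.no_grad():
        for i in range(0, len(x_data), BATCH_SIZE):
            # Stack the batch into a single array before tensor conversion.
            x = torch.tensor(np.stack(x_data[i:i + BATCH_SIZE])).float()
            # Use the parameter, not the global nn_model; threshold the sigmoid output.
            y_out.extend((model(x) > 0.5).int().reshape(-1).tolist())
    pd.DataFrame(y_out).to_csv(out_path, sep='\t', index=False, header=False)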

File diff suppressed because it is too large