A commit
This commit is contained in:
parent
756ef4277a
commit
1f199f4d86
5272
dev-0/in.tsv
Normal file
5272
dev-0/in.tsv
Normal file
File diff suppressed because one or more lines are too long
5272
dev-0/out.tsv
Normal file
5272
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
130
main.py
Normal file
130
main.py
Normal file
@ -0,0 +1,130 @@
|
||||
import numpy as np
|
||||
from regex import W
|
||||
from network import LogisticRegressionModel
|
||||
from gensim.models import Word2Vec
|
||||
import torch
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split
|
||||
import csv
|
||||
import nltk
|
||||
import re
|
||||
|
||||
|
||||
def vectorize(word2vec, documents):
    """Represent each document as the mean of its token embeddings.

    Args:
        word2vec: Trained model whose keyed vectors are exposed as ``.wv``;
            ``.wv`` must support ``token in wv`` and ``wv[token]``.
        documents: Iterable of documents, each an iterable of tokens.

    Returns:
        ``np.ndarray`` holding one averaged vector per document. Tokens
        missing from the vocabulary contribute a zero vector, and a document
        with no tokens at all maps to a zero vector.
    """
    vectors = word2vec.wv
    # Take the dimensionality from the model so non-100-dim models work;
    # fall back to the historical 100 if the attribute is unavailable.
    dim = getattr(vectors, "vector_size", 100)
    unknown = np.zeros(dim, dtype=float)

    vectorized = []
    for document in documents:
        embeddings = [
            vectors[token] if token in vectors else unknown
            for token in document
        ]
        if embeddings:
            vectorized.append(np.mean(embeddings, axis=0))
        else:
            # np.mean([]) would yield NaN (and a ragged result); use zeros.
            vectorized.append(unknown.copy())

    return np.array(vectorized)
|
||||
|
||||
|
||||
def flatten(t):
    """Collapse a sequence of rows into one flat list of integer strings.

    Every item of every row is truncated with ``int`` and rendered with
    ``str``, in row-major order.
    """
    labels = []
    for row in t:
        for value in row:
            labels.append(str(int(value)))
    return labels
|
||||
|
||||
|
||||
def predict(model, testX):
    """Run *model* on *testX* and return the rounded outputs as strings.

    *testX* is cast to float32 and wrapped in a tensor; the forward pass runs
    with gradient tracking disabled. The rounded outputs are flattened into a
    list of integer strings via ``flatten``.
    """
    features = torch.tensor(testX.astype(np.float32))
    with torch.no_grad():
        outputs = model(features)
        rounded = outputs.round().tolist()
        return flatten(rounded)
|
||||
|
||||
|
||||
def trainModel(
    model,
    trainX,
    trainY,
    optimizer,
    criterion=torch.nn.BCELoss(),
    epochs=10,
    batchSize=256,
):
    """Train *model* with mini-batch gradient descent and report progress.

    Args:
        model: Module to optimize; called on float32 feature batches.
        trainX: Array of features, sliced along the first axis per batch.
        trainY: Array of labels; cast to float32 and reshaped to a column.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss applied to ``(predictions, targets)``. The default
            instance is created once at definition time and shared by calls.
        epochs: Number of full passes over the data.
        batchSize: Number of rows per batch.

    After every epoch the loss and accuracy on the training data are
    recomputed with ``getMetrics`` and printed. Batches are visited in the
    given order; no shuffling is performed.
    """
    for epoch in range(epochs):
        model.train()
        for start in range(0, trainY.shape[0], batchSize):
            stop = start + batchSize
            X = torch.tensor(trainX[start:stop].astype(np.float32))
            Y = torch.tensor(trainY[start:stop].astype(np.float32)).reshape(-1, 1)

            Y_predictions = model(X)

            optimizer.zero_grad()
            loss = criterion(Y_predictions, Y)
            loss.backward()
            optimizer.step()

        # Per-batch running totals were never reported; the epoch metrics
        # come from a clean evaluation pass instead.
        print(f"Epoch {epoch+1}/{epochs}")
        loss, accuracy = getMetrics(model, trainX, trainY, criterion, batchSize)
        print(f"Train set\nloss = {loss}, accuracy = {accuracy}")
|
||||
|
||||
|
||||
def getMetrics(model, X_dataset, Y_dataset, criterion, batchSize):
    """Compute the mean loss and accuracy of *model* over a dataset.

    Args:
        model: Module to evaluate; switched to eval mode by this call.
        X_dataset: Features, sliced along the first axis per batch.
        Y_dataset: Labels; must expose ``shape[0]``. Cast to float32 and
            reshaped to a column.
        criterion: Loss applied to ``(predictions, targets)``; its value is
            weighted by the batch size when averaging.
        batchSize: Number of rows per batch.

    Returns:
        ``(loss, accuracy)`` averaged over all items. A prediction counts as
        correct when ``prediction > 0.5`` equals the label.

    Raises:
        ZeroDivisionError: If the dataset is empty.
    """
    loss_score = 0
    acc_score = 0
    items_total = 0
    model.eval()
    # Evaluation never backpropagates, so skip building autograd graphs.
    with torch.no_grad():
        for i in range(0, Y_dataset.shape[0], batchSize):
            X = torch.tensor(
                np.asarray(X_dataset[i : i + batchSize], dtype=np.float32)
            )
            Y = np.array(Y_dataset[i : i + batchSize])
            Y = torch.tensor(Y.astype(np.float32)).reshape(-1, 1)

            Y_predictions = model(X)
            acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
            items_total += Y.shape[0]

            loss = criterion(Y_predictions, Y)
            loss_score += loss.item() * Y.shape[0]
    return (loss_score / items_total), (acc_score / items_total)
|
||||
|
||||
def write(content, dir):
    """Write the rows in *content* to the file at path *dir* as CSV.

    The file is opened for writing (truncating any existing content) with
    newline translation disabled, as the ``csv`` module expects.
    """
    with open(dir, 'w', newline='') as handle:
        csv.writer(handle).writerows(content)
|
||||
|
||||
def read_file(path):
    """Return every line of the text file at *path*, line endings included."""
    with open(path, 'r') as handle:
        lines = list(handle)
    return lines
|
||||
|
||||
|
||||
def _load_documents(path):
    """Read *path* and return one whitespace-tokenized document per line.

    A trailing tab-separated number is stripped from each line before
    tokenizing.
    """
    return [re.sub(r'\t[0-9]+\n', '', line).split() for line in read_file(path)]


# Word2Vec expects every sentence to be a list of tokens. Passing raw strings
# makes it (and vectorize) iterate over single characters, so documents are
# tokenized on whitespace first.
X = _load_documents("train/in.tsv")
y = np.array([re.sub(r'\n', '', x) for x in read_file("train/expected.tsv")])

print(X[0])

w2v = Word2Vec(X, vector_size=100, min_count=2)

X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train = vectorize(w2v, X_train)
X_test = vectorize(w2v, X_test)

print(X_train[0])
nn = LogisticRegressionModel()
optimizer = torch.optim.SGD(nn.parameters(), lr=0.01)

trainModel(nn, X_train, y_train, optimizer)

# The held-out split was previously computed but never used; report it so the
# training metrics can be compared against unseen data.
loss, accuracy = getMetrics(nn, X_test, y_test, torch.nn.BCELoss(), 256)
print(f"Held-out set\nloss = {loss}, accuracy = {accuracy}")

# Produce predictions for both evaluation splits.
for split in ("test-A", "dev-0"):
    X = vectorize(w2v, _load_documents(f"{split}/in.tsv"))
    write(predict(nn, X), f"{split}/out.tsv")
|
14
network.py
Normal file
14
network.py
Normal file
@ -0,0 +1,14 @@
|
||||
import torch
|
||||
|
||||
class LogisticRegressionModel(torch.nn.Module):
    """Binary classifier with one hidden ReLU layer and a sigmoid output.

    Despite the name, this is a two-layer perceptron: a linear layer to
    ``hidden`` units, ReLU, a linear layer to one unit, then a sigmoid.

    Args:
        features: Size of each input vector.
        hidden: Width of the hidden layer.
    """

    def __init__(self, features=100, hidden=400):
        super().__init__()
        self.fc = torch.nn.Linear(features, hidden)
        self.fc2 = torch.nn.Linear(hidden, 1)

    def forward(self, x):
        """Map a batch of feature vectors to one sigmoid output per row."""
        activated = torch.relu(self.fc(x))
        return torch.sigmoid(self.fc2(activated))
|
5152
test-A/in.tsv
Normal file
5152
test-A/in.tsv
Normal file
File diff suppressed because one or more lines are too long
5152
test-A/out.tsv
Normal file
5152
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
289579
train/in.tsv
Normal file
289579
train/in.tsv
Normal file
File diff suppressed because one or more lines are too long
BIN
train/in.tsv.xz
BIN
train/in.tsv.xz
Binary file not shown.
12
train_word2vec.py
Normal file
12
train_word2vec.py
Normal file
@ -0,0 +1,12 @@
|
||||
import csv
|
||||
from gensim.test.utils import common_texts
|
||||
from gensim.models import Word2Vec
|
||||
|
||||
# Build the training corpus from the project's own data. Previously the file
# was opened into a lazy reader that was never consumed, and the model was
# trained on gensim's tiny built-in `common_texts` sample instead.
texts = []
with open("train/in.tsv") as f:
    for line in f:
        fields = line.rstrip("\n").split("\t")
        # NOTE(review): main.py strips a trailing tab-separated number from
        # these lines; the same is done here. Confirm the column layout.
        if len(fields) > 1 and fields[-1].isdigit():
            fields = fields[:-1]
        # Word2Vec expects each sentence as a list of tokens.
        texts.append(" ".join(fields).split())

model = Word2Vec(sentences=texts, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

# Save the keyed vectors separately as well.
word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")
|
BIN
word2vec.model
Normal file
BIN
word2vec.model
Normal file
Binary file not shown.
BIN
word2vec.wordvectors
Normal file
BIN
word2vec.wordvectors
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user