A commit
This commit is contained in:
parent
756ef4277a
commit
1f199f4d86
5272
dev-0/in.tsv
Normal file
5272
dev-0/in.tsv
Normal file
File diff suppressed because one or more lines are too long
5272
dev-0/out.tsv
Normal file
5272
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
130
main.py
Normal file
130
main.py
Normal file
@ -0,0 +1,130 @@
|
|||||||
|
import numpy as np
|
||||||
|
from regex import W
|
||||||
|
from network import LogisticRegressionModel
|
||||||
|
from gensim.models import Word2Vec
|
||||||
|
import torch
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
import csv
|
||||||
|
import nltk
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def vectorize(word2vec, documents):
    """Embed each document as the mean of its word vectors.

    Tokens missing from the model's vocabulary contribute a zero vector to
    the mean (they are not skipped), matching the original behaviour.

    Args:
        word2vec: Object exposing ``wv``, which must support ``token in wv``,
            ``wv[token]`` and ``wv.vector_size`` (as gensim's ``Word2Vec``
            does).
        documents: Iterable of documents, each an iterable of tokens.

    Returns:
        ``np.ndarray`` with one row per document and ``wv.vector_size``
        columns.
    """
    keyed_vectors = word2vec.wv
    # Take the dimensionality from the model instead of hard-coding 100, so
    # the function keeps working when the embedding size changes.
    dim = keyed_vectors.vector_size
    unknown = np.zeros(dim, dtype=float)

    vectorized = []
    for d in documents:
        word_vectors = [
            keyed_vectors[word] if word in keyed_vectors else unknown
            for word in d
        ]
        if word_vectors:
            vectorized.append(np.mean(word_vectors, axis=0))
        else:
            # np.mean([]) yields a NaN scalar, which would make the stacked
            # result ragged; represent an empty document as a zero vector.
            vectorized.append(unknown.copy())

    if not vectorized:
        # Keep a 2-D shape even when there are no documents at all.
        return np.empty((0, dim), dtype=float)
    return np.array(vectorized)
|
||||||
|
|
||||||
|
|
||||||
|
def flatten(t):
    """Flatten a sequence of sequences into a flat list of integer strings.

    Every inner item is truncated with ``int()`` and then rendered with
    ``str()``, e.g. ``[[1.0], [0.0]]`` becomes ``["1", "0"]``.
    """
    flat = []
    for row in t:
        for value in row:
            flat.append(str(int(value)))
    return flat
|
||||||
|
|
||||||
|
|
||||||
|
def predict(model, testX):
    """Run *model* on *testX* and return its rounded outputs as strings.

    Args:
        model: Callable applied to a float32 tensor built from ``testX``.
        testX: NumPy array of features (cast to ``float32`` before use).

    Returns:
        Flat list of strings produced by ``flatten`` from the rounded
        model outputs.
    """
    features = torch.tensor(testX.astype(np.float32))
    # Inference only: no gradients are needed.
    with torch.no_grad():
        rounded = model(features).round()
        return flatten(rounded.tolist())
|
||||||
|
|
||||||
|
|
||||||
|
def trainModel(
    model,
    trainX,
    trainY,
    optimizer,
    criterion=None,
    epochs=10,
    batchSize=256,
):
    """Train *model* with mini-batches and report train-set metrics per epoch.

    Args:
        model: ``torch.nn.Module`` to optimise.
        trainX: NumPy array of features; sliced along the first axis and cast
            to ``float32``.
        trainY: NumPy array of labels; cast to ``float32`` and reshaped to a
            column to line up with the model output.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss callable ``criterion(predictions, targets)``.
            Defaults to a fresh ``torch.nn.BCELoss()``.
        epochs: Number of passes over the training data.
        batchSize: Number of samples per mini-batch.

    Returns:
        None. Progress is printed after every epoch.
    """
    # Create the default loss per call instead of once at definition time.
    if criterion is None:
        criterion = torch.nn.BCELoss()

    for epoch in range(epochs):
        # getMetrics() switches the model to eval mode, so re-enable training
        # mode at the start of every epoch.
        model.train()
        for i in range(0, trainY.shape[0], batchSize):
            X = torch.tensor(trainX[i : i + batchSize].astype(np.float32))
            Y = torch.tensor(
                trainY[i : i + batchSize].astype(np.float32)
            ).reshape(-1, 1)

            Y_predictions = model(X)

            optimizer.zero_grad()
            loss = criterion(Y_predictions, Y)
            loss.backward()
            optimizer.step()

        # The running per-batch accumulators of the original were never read;
        # the reported numbers come from a full pass with getMetrics().
        print(f"Epoch {epoch+1}/{epochs}")
        loss, accuracy = getMetrics(model, trainX, trainY, criterion, batchSize)
        print(f"Train set\nloss = {loss}, accuracy = {accuracy}")
|
||||||
|
|
||||||
|
|
||||||
|
def getMetrics(model, X_dataset, Y_dataset, criterion, batchSize):
    """Compute the mean loss and accuracy of *model* over a dataset.

    The model is put into eval mode and evaluated batch by batch with
    gradient tracking disabled. A prediction counts as positive when the
    model output is strictly greater than 0.5.

    Args:
        model: ``torch.nn.Module`` whose output has one column per sample.
        X_dataset: Features, sliceable along the first axis and convertible
            to a ``float32`` array.
        Y_dataset: Labels with a ``shape`` attribute, convertible to
            ``float32``.
        criterion: Loss callable ``criterion(predictions, targets)`` that
            returns a scalar tensor.
        batchSize: Number of samples per evaluation batch.

    Returns:
        Tuple ``(mean_loss, accuracy)``, both weighted by batch size.

    Raises:
        ZeroDivisionError: If the dataset is empty.
    """
    loss_score = 0
    acc_score = 0
    items_total = 0
    model.eval()
    # Evaluation never calls backward(), so skip building autograd graphs.
    with torch.no_grad():
        for i in range(0, Y_dataset.shape[0], batchSize):
            X = torch.tensor(
                np.asarray(X_dataset[i : i + batchSize], dtype=np.float32)
            )
            Y = np.array(Y_dataset[i : i + batchSize])
            Y = torch.tensor(Y.astype(np.float32)).reshape(-1, 1)

            Y_predictions = model(X)
            acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
            items_total += Y.shape[0]

            # Weight the batch loss by its size so a shorter final batch
            # does not skew the dataset-wide average.
            loss = criterion(Y_predictions, Y)
            loss_score += loss.item() * Y.shape[0]
    return (loss_score / items_total), (acc_score / items_total)
|
||||||
|
|
||||||
|
def write(content, dir):
    """Write *content* to the file at *dir* using ``csv.writer``.

    Args:
        content: Iterable of rows. A row may be a sequence of fields or a
            bare string, which is written as a single-field row.
        dir: Path of the output file (despite the name, a file path passed
            straight to ``open``); the file is overwritten.
    """
    with open(dir, 'w', newline='') as f:
        writer = csv.writer(f)
        # csv treats a bare string as a sequence of one-character fields, so
        # "10" would be emitted as "1,0"; wrap strings into one-field rows.
        writer.writerows(
            [row] if isinstance(row, (str, bytes)) else row for row in content
        )
|
||||||
|
|
||||||
|
def read_file(path):
    """Return all lines of the text file at *path*, newlines included.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's locale encoding.
    NOTE(review): assumes the data files are UTF-8 encoded — confirm.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.readlines()
|
||||||
|
|
||||||
|
|
||||||
|
def _load_documents(path):
    """Read *path* and return one whitespace-tokenized document per line.

    The trailing ``<TAB><digits><NEWLINE>`` column is stripped from each line
    before tokenizing. Word2Vec and ``vectorize`` iterate over the items of
    each document, so a document must be a list of tokens; a raw string
    would be treated as a sequence of single characters.
    """
    return [re.sub(r'\t[0-9]+\n', '', x).split() for x in read_file(path)]


X = _load_documents("train/in.tsv")
y = np.array([re.sub(r'\n', '', x) for x in read_file("train/expected.tsv")])

print(X[0])

# Train the embeddings on the tokenized training corpus.
w2v = Word2Vec(X, vector_size=100, min_count=2)

X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train = vectorize(w2v, X_train)
X_test = vectorize(w2v, X_test)

print(X_train[0])
nn = LogisticRegressionModel()
optimizer = torch.optim.SGD(nn.parameters(), lr=0.01)

trainModel(nn, X_train, y_train, optimizer)

# Write predictions for both evaluation sets.
X = vectorize(w2v, _load_documents("test-A/in.tsv"))
write(predict(nn, X), "test-A/out.tsv")

X = vectorize(w2v, _load_documents("dev-0/in.tsv"))
write(predict(nn, X), "dev-0/out.tsv")
|
14
network.py
Normal file
14
network.py
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
import torch
|
||||||
|
|
||||||
|
class LogisticRegressionModel(torch.nn.Module):
    """Binary classifier: Linear -> ReLU -> Linear -> sigmoid.

    Despite the name, this is a network with one hidden layer rather than a
    plain logistic regression. The output is a single value in (0, 1) per
    sample.

    Args:
        features: Size of each input vector.
        hidden: Width of the hidden layer.
    """

    def __init__(self, features=100, hidden=400):
        super().__init__()
        self.fc = torch.nn.Linear(features, hidden)
        self.fc2 = torch.nn.Linear(hidden, 1)

    def forward(self, x):
        """Map a ``(..., features)`` tensor to ``(..., 1)`` sigmoid outputs."""
        activated = torch.relu(self.fc(x))
        return torch.sigmoid(self.fc2(activated))
|
5152
test-A/in.tsv
Normal file
5152
test-A/in.tsv
Normal file
File diff suppressed because one or more lines are too long
5152
test-A/out.tsv
Normal file
5152
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
289579
train/in.tsv
Normal file
289579
train/in.tsv
Normal file
File diff suppressed because one or more lines are too long
BIN
train/in.tsv.xz
BIN
train/in.tsv.xz
Binary file not shown.
12
train_word2vec.py
Normal file
12
train_word2vec.py
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
import csv
|
||||||
|
from gensim.test.utils import common_texts
|
||||||
|
from gensim.models import Word2Vec
|
||||||
|
|
||||||
|
# Build the training corpus from the project's data. The original opened this
# file but then trained on gensim's bundled toy corpus (common_texts).
with open("train/in.tsv", encoding="utf-8") as f:
    # NOTE(review): assumes the text is the first tab-separated column and
    # that the file is UTF-8 encoded — confirm against the dataset.
    # Each sentence must be a list of tokens, so split on whitespace.
    texts = [line.split("\t", 1)[0].split() for line in f]

model = Word2Vec(sentences=texts, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

# Also store the bare word vectors separately from the full model.
word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")
|
BIN
word2vec.model
Normal file
BIN
word2vec.model
Normal file
Binary file not shown.
BIN
word2vec.wordvectors
Normal file
BIN
word2vec.wordvectors
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user