A commit

parent 756ef4277a
commit 1f199f4d86
@@ -0,0 +1,130 @@
import numpy as np
from network import LogisticRegressionModel
from gensim.models import Word2Vec
import torch
from sklearn.model_selection import train_test_split
import csv
import re


def vectorize(word2vec, documents):
    """Mean-pool the Word2Vec vectors of each document's tokens."""
    vectorized = []

    for d in documents:
        vectorized.append(
            np.mean(
                [
                    # Fall back to a zero vector for out-of-vocabulary tokens.
                    word2vec.wv[word]
                    if word in word2vec.wv
                    else np.zeros(100, dtype=float)
                    for word in d
                ],
                axis=0,
            )
        )

    return np.array(vectorized)


def flatten(t):
    # Flatten the model's (N, 1) predictions into a flat list of "0"/"1" strings.
    return [str(int(item)) for sublist in t for item in sublist]


def predict(model, testX):
    testX = torch.tensor(testX.astype(np.float32))
    with torch.no_grad():
        return flatten(model(testX).round().tolist())


def trainModel(
    model,
    trainX,
    trainY,
    optimizer,
    criterion=torch.nn.BCELoss(),
    epochs=10,
    batchSize=256,
):
    for epoch in range(epochs):
        model.train()
        for i in range(0, trainY.shape[0], batchSize):
            X = trainX[i : i + batchSize]
            X = torch.tensor(X.astype(np.float32))
            Y = trainY[i : i + batchSize]
            Y = torch.tensor(Y.astype(np.float32)).reshape(-1, 1)
            Y_predictions = model(X)

            optimizer.zero_grad()
            loss = criterion(Y_predictions, Y)
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch+1}/{epochs}")
        loss, accuracy = getMetrics(model, trainX, trainY, criterion, batchSize)
        print(f"Train set\nloss = {loss}, accuracy = {accuracy}")


def getMetrics(model, X_dataset, Y_dataset, criterion, batchSize):
    loss_score = 0
    acc_score = 0
    items_total = 0
    model.eval()
    with torch.no_grad():
        for i in range(0, Y_dataset.shape[0], batchSize):
            X = np.array(X_dataset[i : i + batchSize], dtype=object)
            X = torch.tensor(X.astype(np.float32))
            Y = np.array(Y_dataset[i : i + batchSize])
            Y = torch.tensor(Y.astype(np.float32)).reshape(-1, 1)
            Y_predictions = model(X)
            acc_score += torch.sum((Y_predictions > 0.5) == Y).item()
            items_total += Y.shape[0]

            loss = criterion(Y_predictions, Y)

            # Weight each batch loss by its size for a proper dataset average.
            loss_score += loss.item() * Y.shape[0]
    return (loss_score / items_total), (acc_score / items_total)


def write(content, path):
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(content)


def read_file(path):
    with open(path, 'r') as f:
        return f.readlines()


# Strip the trailing tab-separated field and tokenize on whitespace so
# Word2Vec sees word tokens rather than iterating over characters.
X = [re.sub(r'\t[0-9]+\n', '', x).split() for x in read_file("train/in.tsv")]
y = np.array([re.sub(r'\n', '', x) for x in read_file("train/expected.tsv")])

print(X[0])

w2v = Word2Vec(X, vector_size=100, min_count=2)

X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train = vectorize(w2v, X_train)
X_test = vectorize(w2v, X_test)

print(X_train[0])
nn = LogisticRegressionModel()
optimizer = torch.optim.SGD(nn.parameters(), lr=0.01)

trainModel(nn, X_train, y_train, optimizer)

X = [re.sub(r'\t[0-9]+\n', '', x).split() for x in read_file("test-A/in.tsv")]
X = vectorize(w2v, X)
write(predict(nn, X), "test-A/out.tsv")

X = [re.sub(r'\t[0-9]+\n', '', x).split() for x in read_file("dev-0/in.tsv")]
X = vectorize(w2v, X)
write(predict(nn, X), "dev-0/out.tsv")
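As a quick sanity check of the mean-pooling done by vectorize(), a minimal sketch (the toy corpus below is illustrative, not part of the commit):

import numpy as np
from gensim.models import Word2Vec

# Toy documents as token lists, the shape of input Word2Vec expects.
docs = [["the", "cat", "sat"], ["the", "dog", "barked"]]
toy_w2v = Word2Vec(docs, vector_size=100, min_count=1)

# Each document becomes the mean of its token vectors, with zeros for
# out-of-vocabulary tokens, yielding one 100-dimensional feature vector.
features = np.mean(
    [toy_w2v.wv[w] if w in toy_w2v.wv else np.zeros(100) for w in docs[0]],
    axis=0,
)
assert features.shape == (100,)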
@@ -0,0 +1,14 @@
import torch


class LogisticRegressionModel(torch.nn.Module):
    def __init__(self, features=100):
        super(LogisticRegressionModel, self).__init__()
        # Hidden layer: 100 input features -> 400 hidden units.
        self.fc = torch.nn.Linear(features, 400)
        # Output layer: 400 hidden units -> a single logit.
        self.fc2 = torch.nn.Linear(400, 1)

    def forward(self, x):
        x = self.fc(x)
        x = torch.relu(x)
        x = self.fc2(x)
        # Sigmoid squashes the logit to a probability, as BCELoss expects.
        x = torch.sigmoid(x)
        return x
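Despite its name, LogisticRegressionModel is a small MLP with one hidden layer (Linear -> ReLU -> Linear -> sigmoid). A minimal forward-pass sketch, with an illustrative batch size:

import torch
from network import LogisticRegressionModel

model = LogisticRegressionModel()  # defaults to 100 input features
x = torch.randn(8, 100)            # illustrative batch of 8 feature vectors
probs = model(x)                   # sigmoid output, probabilities in (0, 1)
assert probs.shape == (8, 1)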
BIN train/in.tsv.xz (binary file not shown)
@@ -0,0 +1,12 @@
import csv
from gensim.test.utils import common_texts
from gensim.models import Word2Vec


# Read the training sentences; materialize the reader before the file closes.
with open("train/in.tsv") as f:
    texts = list(csv.reader(f, delimiter="\t"))

# NOTE: the model is trained on gensim's bundled common_texts sample corpus,
# not on the texts read above.
model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")
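The saved vectors can be reloaded later without the full model; a minimal sketch using gensim's KeyedVectors (the query word is illustrative, drawn from common_texts):

from gensim.models import KeyedVectors

# Load only the word vectors written by the script above.
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')

# common_texts is a tiny sample corpus, so only its vocabulary is available.
print(wv['computer'].shape)             # (100,)
print(wv.most_similar('computer', topn=3))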