Add solution for full dataset
This commit is contained in:
parent 142f0ca72e
commit 48d4451077
dev-0/out.tsv (574 lines changed)
File diff suppressed because it is too large
solution.py (25 lines changed)
@@ -20,22 +20,23 @@ model = NeuralNetworkModel()
 criterion = torch.nn.BCELoss()
 optimizer = torch.optim.SGD(model.parameters(), lr=0.1)


 def train():
     with open('train/train.tsv') as f:
         docs = [line.rstrip() for line in f]
     docs_preprocessed = []
     y = []
-    for doc in docs[:10000]:
+    for doc in docs:
         y_with_doc = doc.split('\t')
         y.append(y_with_doc[0])
-        doc = y_with_doc[1]
-        doc = preprocess(doc)
-        docs_preprocessed.append(doc)
+        docs_preprocessed.append(preprocess(y_with_doc[1]))
     y = [int(value) for value in y]
     y = np.reshape(y, (len(y), 1))
     tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)]
+    print("Fitting vectorizer...")
     global vectorizer
     vectorizer = Doc2Vec(tagged_documents, min_count=10, epochs=300, dm=0, vector_size=n_features)
+    print("DONE!")
     x = vectorizer.dv.vectors
     for epoch in range(5):
         loss_score = 0
@@ -58,25 +59,21 @@ def train():
         print(epoch)
         print(get_loss_acc(model, x, y, criterion, optimizer))


 def classify(path):
+    print("Predicting for", path)
     with open(path + 'in.tsv') as f:
         docs = [line.rstrip() for line in f]
     docs = [preprocess(doc) for doc in docs]
     test_x = [vectorizer.infer_vector(doc) for doc in docs]
     test_x = np.array(test_x, np.float32)
-    test_x = torch.tensor(test_x)
-    predictions = model(test_x).detach().numpy()[:, 0]
-    p = []
-    for prediction in predictions:
-        if prediction >= 0.5:
-            p.append(1)
-        else:
-            p.append(0)
+    predictions = model(torch.tensor(test_x)).detach().numpy()[:, 0]
+    predictions = [1 if value >= 0.5 else 0 for value in predictions]
     with open(path + 'out.tsv', 'w') as file:
-        for prediction in p:
+        for prediction in predictions:
             file.write("%i\n" % prediction)


 train()
 classify('dev-0/')
-# classify('test-A/')
+classify('test-A/')
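The main functional change above is that train() now iterates over all of train.tsv instead of the first 10,000 lines, preprocessing is folded into a single append, thresholding becomes a list comprehension, and classify() is also run for test-A/, producing the new test-A/out.tsv. The pipeline hinges on gensim's Doc2Vec: each preprocessed document is wrapped in a TaggedDocument whose tag is its index, so after fitting, vectorizer.dv.vectors[i] lines up with label y[i], and unseen documents are embedded with infer_vector(). A small self-contained sketch of that pattern, not part of the commit, with toy documents, toy labels, and deliberately small min_count/epochs/vector_size values (only dm=0 matches the setting in solution.py):

# Standalone illustration of the Doc2Vec pattern used in solution.py.
# Tagging each document with its index keeps dv.vectors aligned with the labels.
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs_preprocessed = [["good", "movie"], ["bad", "plot"], ["great", "acting"]]  # toy tokenized docs
y = np.array([[1], [0], [1]], dtype=np.float32)                                # toy labels

tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs_preprocessed)]
# dm=0 selects PV-DBOW as in solution.py; min_count/epochs/vector_size are toy-sized here.
vectorizer = Doc2Vec(tagged_documents, min_count=1, epochs=10, dm=0, vector_size=8)

x = vectorizer.dv.vectors          # x[i] is the embedding of docs_preprocessed[i]
assert x.shape == (len(y), 8)

# Unseen text goes through the same tokenization, then infer_vector():
new_vec = vectorizer.infer_vector(["decent", "movie"])

These vectors are what the torch model consumes: x during training, and the infer_vector() outputs (cast to float32 and wrapped in torch.tensor) at prediction time, as in the classify() diff above.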
test-A/out.tsv (5447 lines, new file)
File diff suppressed because it is too large