Add scripts and dev prediction
This commit is contained in:
parent
756ef4277a
commit
d0d7934292
5272
dev-0/out.tsv
Normal file
5272
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
116
main.py
Normal file
116
main.py
Normal file
@ -0,0 +1,116 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
import sys
|
||||
from model import Model
|
||||
import csv
|
||||
from gensim import downloader
|
||||
import torch
|
||||
from nltk import word_tokenize
|
||||
|
||||
# Data set layout: file names looked up inside the train / working directories.
TRAIN_PATH = "train"
IN_FILE_NAME = "in.tsv.xz"
EXP_FILE_NAME = "expected.tsv"
OUT_FILE_NAME = "out.tsv"

# Header files that provide the column names of the input / expected files.
IN_HEADER_FILE_NAME = "in-header.tsv"
OUT_HEADER_FILE_NAME = "out-header.tsv"

# Field separator shared by every TSV file read by this script.
FILE_SEP = "\t"

# Name of the pretrained embedding passed to gensim's downloader.
WORD_2_VEC_MODEL_NAME = "word2vec-google-news-300"

# Training / inference settings.
EPOCHS = 10
BATCH_SIZE = 10
# Model outputs strictly above this value are written out as 1, others as 0.
THRESHOLD = 0.5

# Model dimensions
INPUT_D = 300
HIDDEN_D = 600
OUTPUT_D = 1
|
||||
|
||||
|
||||
def main(dirname):
    """Train the classifier and write predictions for the data in *dirname*.

    The column names are read from the two header files, the training
    features and labels are loaded from ``TRAIN_PATH`` and the features to
    classify from *dirname*.  Only the first column of each file is used.
    Texts are lower-cased, tokenized, and turned into one averaged word2vec
    vector per document before the model is trained and applied.

    Args:
        dirname: Directory that contains ``IN_FILE_NAME``; the predictions
            are written to ``OUT_FILE_NAME`` inside the same directory, one
            integer (0 or 1) per line.

    Raises:
        Exception: Propagated from ``check_path`` when one of the required
            files does not exist.
    """
    check_path(IN_HEADER_FILE_NAME)
    in_cols = (pd.read_csv(IN_HEADER_FILE_NAME, sep=FILE_SEP)).columns
    check_path(OUT_HEADER_FILE_NAME)
    out_cols = (pd.read_csv(OUT_HEADER_FILE_NAME, sep=FILE_SEP)).columns

    print("Reading train data...")
    train_set_features = get_tsv_data(os.path.join(
        TRAIN_PATH, IN_FILE_NAME), names=in_cols)
    # The expected labels are stored uncompressed, unlike the feature files.
    train_set_labels = get_tsv_data(os.path.join(
        TRAIN_PATH, EXP_FILE_NAME), names=out_cols, compression=None)

    print("Reading input data...")
    in_set = get_tsv_data(os.path.join(dirname, IN_FILE_NAME), names=in_cols)

    print("Preparing training data...")
    X_train_raw = train_set_features[in_cols[0]].str.lower()
    X_train = [word_tokenize(content) for content in X_train_raw]
    Y_train = train_set_labels[out_cols[0]]

    print("Preparing input data...")
    X_in_raw = in_set[in_cols[0]].str.lower()
    # The input must be tokenized exactly like the training data.  Passing
    # the raw strings to vectorize() would make it iterate over single
    # characters and average character "embeddings" instead of word vectors.
    X_in = [word_tokenize(content) for content in X_in_raw]

    print("Loading word 2 vector model...")
    w2v_model = downloader.load(WORD_2_VEC_MODEL_NAME)

    print("Vectorizing data...")
    X_train = vectorize(X_train, w2v_model)
    X_in = vectorize(X_in, w2v_model)

    model = Model(input_dim=INPUT_D, hidden_dim=HIDDEN_D, output_dim=OUTPUT_D)

    print("Starting model training...")
    model.run_training(X_train, Y_train, BATCH_SIZE, EPOCHS)

    # Switch to inference mode before predicting.
    model.eval()

    predictions = predict(model, X_in)

    out_file_path = os.path.join(dirname, OUT_FILE_NAME)
    print(f"Saving predictions to file: {out_file_path}")
    # predict() yields nested booleans; casting to int32 gives 0/1 and
    # tofile() writes the flattened values separated by newlines.
    np.asarray(predictions, dtype=np.int32).tofile(out_file_path, sep="\n")
|
||||
|
||||
|
||||
def vectorize(set, w2v_model):
    """Return one averaged embedding per document in *set*.

    Each document is iterated item by item; items found in *w2v_model* are
    looked up and their vectors averaged.  A document without any known item
    is represented by a zero vector of length ``INPUT_D``.
    """
    doc_vectors = []
    for doc in set:
        known_vectors = [w2v_model[token] for token in doc if token in w2v_model]
        if not known_vectors:
            # Nothing to average: fall back to a single all-zero vector.
            known_vectors = [np.zeros(INPUT_D)]
        doc_vectors.append(np.mean(known_vectors, axis=0))
    return doc_vectors
|
||||
|
||||
|
||||
def predict(model, X_in):
    """Run *model* over *X_in* in batches and threshold its outputs.

    Args:
        model: Callable applied to a float32 tensor holding one batch.
        X_in: Sliceable sequence of equally sized feature vectors.

    Returns:
        A list with one entry per input row; each entry is whatever
        ``tolist()`` yields for that row of the boolean comparison
        ``model(batch) > THRESHOLD``.
    """
    res = []

    # Inference only: gradients are not needed.
    with torch.no_grad():
        for X in chunks(X_in, BATCH_SIZE):
            # Stack the batch into a single float32 ndarray first; building a
            # tensor straight from a list of ndarrays is much slower.
            Xt = torch.tensor(np.asarray(X, dtype=np.float32))
            res += (model(Xt) > THRESHOLD).tolist()

    return res
|
||||
|
||||
|
||||
def chunks(iterable, n):
    """Lazily produce consecutive slices of *iterable*, each at most *n* long.

    *iterable* must support ``len()`` and slicing; the final slice may be
    shorter than *n*.
    """
    total = len(iterable)
    yield from (iterable[start:start + n] for start in range(0, total, n))
|
||||
|
||||
|
||||
def get_tsv_data(filename: str, names, compression="infer"):
    """Load a header-less TSV file into a DataFrame.

    Malformed lines are skipped (with a warning) instead of aborting the
    read, and quote characters are treated as ordinary text.

    Args:
        filename: Path of the file to read.
        names: Column names to assign, since the file has no header row.
        compression: Forwarded to ``pandas.read_csv``; ``"infer"`` lets
            pandas pick the codec from the file extension.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        Exception: Propagated from ``check_path`` if *filename* is missing.
    """
    check_path(filename)
    options = dict(
        sep=FILE_SEP,
        compression=compression,
        quoting=csv.QUOTE_NONE,
        header=None,
        names=names,
    )
    try:
        # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in
        # 2.0; ``on_bad_lines="warn"`` is its replacement and keeps the
        # skip-and-warn behaviour of ``error_bad_lines=False``.
        return pd.read_csv(filename, on_bad_lines="warn", **options)
    except TypeError as err:
        # Only fall back when the keyword itself is unknown (pandas < 1.3);
        # any other TypeError is a genuine problem and must surface.
        if "on_bad_lines" not in str(err):
            raise
        return pd.read_csv(filename, error_bad_lines=False, **options)
|
||||
|
||||
|
||||
def check_path(filename: str) -> None:
    """Ensure that *filename* exists on disk.

    Args:
        filename: Path of a file or directory to check.

    Raises:
        FileNotFoundError: If the path does not exist.  This is a subclass
            of ``Exception``, so handlers written for the previous generic
            exception keep working.
    """
    if not os.path.exists(filename):
        raise FileNotFoundError(f"Path {filename} does not exist!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The working directory is expected as the first command-line argument.
    cli_args = sys.argv[1:]
    if not cli_args:
        raise Exception("Name of working dir not specified!")
    main(cli_args[0])
|
48
model.py
Normal file
48
model.py
Normal file
@ -0,0 +1,48 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import numpy as np
|
||||
|
||||
|
||||
class Model(nn.Module):
    """Feed-forward binary classifier with one hidden layer.

    The network is ``Linear -> ReLU -> Linear -> Sigmoid``.  It owns its loss
    (binary cross-entropy) and optimizer (SGD, learning rate 0.01), so it can
    be trained through :meth:`run_training` without extra set-up.
    """

    def __init__(self, input_dim=300, hidden_dim=600, output_dim=1):
        """Create the layers, the loss function and the optimizer.

        Args:
            input_dim: Size of each input feature vector.
            hidden_dim: Number of units in the hidden layer.
            output_dim: Number of output units.
        """
        super().__init__()

        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim

        self.fc1 = nn.Linear(self.input_dim, self.hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim, self.output_dim)

        self.criterion = nn.BCELoss()
        # Created after the layers so that self.parameters() includes them.
        self.optimizer = torch.optim.SGD(self.parameters(), lr=0.01)

    def forward(self, x):
        """Return the sigmoid-activated network output for the batch *x*."""
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.sigmoid(x)
        return x

    def run_training(self, X_train, Y_train, batch_size, epochs_count):
        """Train the network with mini-batch SGD.

        Batches are taken in order, without shuffling, and the number of
        batches per epoch is driven by the length of *Y_train*.

        Args:
            X_train: Sliceable sequence of equally sized feature vectors.
            Y_train: pandas Series of labels aligned with *X_train*.
            batch_size: Number of samples per optimisation step.
            epochs_count: Number of passes over the training data.
        """
        sample_count = Y_train.shape[0]

        for _ in range(epochs_count):
            self.train()
            for start in range(0, sample_count, batch_size):
                stop = start + batch_size

                # Stack the batch into one float32 ndarray before handing it
                # to torch; converting a list of ndarrays element by element
                # is far slower.
                X = torch.tensor(
                    np.asarray(X_train[start:stop], dtype=np.float32))
                # Reshape the labels to a column so they match the
                # (batch, 1) shape of the network output.
                y = torch.tensor(
                    Y_train[start:stop].astype(np.float32).to_numpy()
                ).reshape(-1, 1)

                outputs = self(X)
                loss = self.criterion(outputs, y)

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
|
Loading…
Reference in New Issue
Block a user