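"""Binary text classification with averaged word2vec features.

Trains the Model defined in model.py on train/in.tsv.xz against
train/expected.tsv, then writes out.tsv predictions into every data
directory passed on the command line.
"""
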
import csv
import os
import sys

import nltk
import numpy as np
import pandas as pd
import torch
from gensim import downloader
from nltk import word_tokenize

from model import Model

# word_tokenize() needs the NLTK "punkt" tokenizer data; fetch it once
# up front (this is a no-op when the data is already present).
nltk.download("punkt", quiet=True)

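# Model is defined in model.py (not shown here). Judging from its use
# below, it is presumably a small PyTorch binary classifier along the
# lines of this hypothetical sketch:
#
#   class Model(torch.nn.Module):
#       def __init__(self, input_dim, hidden_dim, output_dim):
#           super().__init__()
#           self.net = torch.nn.Sequential(
#               torch.nn.Linear(input_dim, hidden_dim),
#               torch.nn.ReLU(),
#               torch.nn.Linear(hidden_dim, output_dim),
#               torch.nn.Sigmoid(),
#           )
#
#       def forward(self, x):
#           return self.net(x)
#
# plus a run_training(X, Y, batch_size, epochs) method. Only the
# constructor keywords, run_training(), eval(), and forward() outputs
# in [0, 1] are relied upon in this script.
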
# Data file names and paths
IN_FILE_NAME = "in.tsv.xz"
OUT_FILE_NAME = "out.tsv"
TRAIN_PATH = "train"
WORD_2_VEC_MODEL_NAME = "word2vec-google-news-300"
EXP_FILE_NAME = "expected.tsv"
FILE_SEP = "\t"
IN_HEADER_FILE_NAME = "in-header.tsv"
OUT_HEADER_FILE_NAME = "out-header.tsv"

# Model training config
BATCH_SIZE = 5
EPOCHS = 15
THRESHOLD = 0.5

# Model dimensions
INPUT_D = 300
HIDDEN_D = 600
OUTPUT_D = 1


def main(dirnames):
    check_path(IN_HEADER_FILE_NAME)
    in_cols = pd.read_csv(IN_HEADER_FILE_NAME, sep=FILE_SEP).columns
    check_path(OUT_HEADER_FILE_NAME)
    out_cols = pd.read_csv(OUT_HEADER_FILE_NAME, sep=FILE_SEP).columns

    print("Reading train data...")
    train_set_features = get_tsv_data(
        os.path.join(TRAIN_PATH, IN_FILE_NAME), names=in_cols)
    train_set_labels = get_tsv_data(
        os.path.join(TRAIN_PATH, EXP_FILE_NAME), names=out_cols,
        compression=None)

    print("Reading input data...")
    in_sets = []
    for d in dirnames:
        print(f"\tReading dir: {d}...")
        in_sets.append(get_tsv_data(
            os.path.join(d, IN_FILE_NAME), names=in_cols))

    print("Preparing training data...")
    X_train_raw = train_set_features[in_cols[0]].str.lower()
    X_train = [word_tokenize(content) for content in X_train_raw]
    Y_train = train_set_labels[out_cols[0]]

    print("Preparing input data...")
    # Tokenize the input sets the same way as the training set; passing
    # raw strings to vectorize() would iterate over single characters
    # instead of words.
    X_ins_raw = []
    for s in in_sets:
        X_ins_raw.append(
            [word_tokenize(content) for content in s[in_cols[0]].str.lower()])

    print("Loading word2vec model...")
    w2v_model = downloader.load(WORD_2_VEC_MODEL_NAME)

    print("Vectorizing data...")
    X_train = vectorize(X_train, w2v_model)

    X_ins = []
    for r in X_ins_raw:
        X_ins.append(vectorize(r, w2v_model))

    model = Model(input_dim=INPUT_D, hidden_dim=HIDDEN_D, output_dim=OUTPUT_D)

    print("Starting model training...")
    model.run_training(X_train, Y_train, BATCH_SIZE, EPOCHS)

    # Switch to evaluation mode before inference.
    model.eval()

    for i in range(len(X_ins)):
        print(f"\tPredicting for: {os.path.join(dirnames[i], IN_FILE_NAME)}...")
        predictions = predict(model, X_ins[i])

        out_file_path = os.path.join(dirnames[i], OUT_FILE_NAME)
        print(f"Saving predictions to file: {out_file_path}")
        np.asarray(predictions, dtype=np.int32).tofile(out_file_path, sep="\n")


def vectorize(docs, w2v_model):
    """Map each tokenized document to the mean of its word2vec vectors.

    Out-of-vocabulary tokens are skipped; a document with no in-vocabulary
    tokens falls back to the zero vector.
    """
    return [
        np.mean([w2v_model[word] for word in doc if word in w2v_model]
                or [np.zeros(INPUT_D)], axis=0)
        for doc in docs
    ]

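# Example (illustrative tokens, assuming the 300-dim model is loaded):
#   vectorize([["good", "movie"], ["qzxv"]], w2v_model)
# returns the mean of the "good" and "movie" vectors for the first
# document and the 300-dim zero vector for the all-OOV second one.

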
def predict(model, X_in):
    res = []

    with torch.no_grad():
        for X in chunks(X_in, BATCH_SIZE):
            # Stack the batch into a single array before building the
            # tensor (much faster than a list of ndarrays).
            Xt = torch.tensor(np.asarray(X))
            # Binarize the model's probabilities at THRESHOLD.
            res += (model(Xt.float()) > THRESHOLD).tolist()

    return res


def chunks(iterable, n):
    """Yield successive n-sized chunks from iterable."""
    for i in range(0, len(iterable), n):
        yield iterable[i:i + n]

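# Example: list(chunks([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]

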
def get_tsv_data(filename: str, names, compression="infer"):
    """Read a TSV file into a DataFrame, skipping malformed rows."""
    check_path(filename)
    return pd.read_csv(
        filename,
        sep=FILE_SEP,
        compression=compression,
        # error_bad_lines=False was deprecated in pandas 1.3 and removed
        # in 2.0; on_bad_lines="skip" is the modern equivalent.
        on_bad_lines="skip",
        quoting=csv.QUOTE_NONE,
        header=None,
        names=names,
    )


def check_path(filename: str):
    if not os.path.exists(filename):
        raise FileNotFoundError(f"Path {filename} does not exist!")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        raise ValueError("Name of working dir not specified!")
    main(sys.argv[1:])