import pandas as pd
import numpy as np
import os
import sys
from model import Model
import csv
from gensim import downloader
import torch
from nltk import word_tokenize

IN_FILE_NAME = "in.tsv.xz"
OUT_FILE_NAME = "out.tsv"
TRAIN_PATH = "train"
WORD_2_VEC_MODEL_NAME = "word2vec-google-news-300"
EXP_FILE_NAME = "expected.tsv"
FILE_SEP = "\t"
BATCH_SIZE = 10
EPOCHS = 10
IN_HEADER_FILE_NAME = "in-header.tsv"
OUT_HEADER_FILE_NAME = "out-header.tsv"
THRESHOLD = 0.5

# Model dimensions
INPUT_D = 300
HIDDEN_D = 600
OUTPUT_D = 1


def main(dirnames):
    # Column names come from the header files, not from the data files themselves.
    check_path(IN_HEADER_FILE_NAME)
    in_cols = pd.read_csv(IN_HEADER_FILE_NAME, sep=FILE_SEP).columns
    check_path(OUT_HEADER_FILE_NAME)
    out_cols = pd.read_csv(OUT_HEADER_FILE_NAME, sep=FILE_SEP).columns

    print("Reading train data...")
    train_set_features = get_tsv_data(
        os.path.join(TRAIN_PATH, IN_FILE_NAME), names=in_cols)
    train_set_labels = get_tsv_data(
        os.path.join(TRAIN_PATH, EXP_FILE_NAME), names=out_cols, compression=None)

    print("Reading input data...")
    in_sets = []
    for d in dirnames:
        in_sets.append(get_tsv_data(
            os.path.join(d, IN_FILE_NAME), names=in_cols))

    print("Preparing training data...")
    X_train_raw = train_set_features[in_cols[0]].str.lower()
    X_train = [word_tokenize(content) for content in X_train_raw]
    Y_train = train_set_labels[out_cols[0]]

    print("Preparing input data...")
    X_ins_raw = []
    for s in in_sets:
        # Tokenize the evaluation sets the same way as the training set;
        # without this, vectorize() would iterate over the characters of each
        # raw string instead of over its words.
        X_ins_raw.append(
            [word_tokenize(content) for content in s[in_cols[0]].str.lower()])

    print("Loading word2vec model...")
    w2v_model = downloader.load(WORD_2_VEC_MODEL_NAME)

    print("Vectorizing data...")
    X_train = vectorize(X_train, w2v_model)
    X_ins = [vectorize(r, w2v_model) for r in X_ins_raw]

    model = Model(input_dim=INPUT_D, hidden_dim=HIDDEN_D, output_dim=OUTPUT_D)
    print("Starting model training...")
    model.run_training(X_train, Y_train, BATCH_SIZE, EPOCHS)
    model.eval()

    for i in range(len(X_ins)):
        predictions = predict(model, X_ins[i])
        out_file_path = os.path.join(dirnames[i], OUT_FILE_NAME)
        print(f"Saving predictions to file: {out_file_path}")
        np.asarray(predictions, dtype=np.int32).tofile(out_file_path, sep="\n")


def vectorize(docs, w2v_model):
    """Represent each tokenized document as the mean of its word vectors.

    Words missing from the word2vec vocabulary are skipped; a document with
    no known words falls back to a zero vector.
    """
    return [
        np.mean([w2v_model[word] for word in doc if word in w2v_model]
                or [np.zeros(INPUT_D)], axis=0)
        for doc in docs
    ]


def predict(model, X_in):
    res = []
    with torch.no_grad():
        for X in chunks(X_in, BATCH_SIZE):
            # Stack the batch into a single array first; building a tensor
            # from a list of ndarrays is slow and warns on recent PyTorch.
            Xt = torch.tensor(np.asarray(X))
            res += (model(Xt.float()) > THRESHOLD).tolist()
    return res


def chunks(iterable, n):
    """Yield successive n-sized chunks from iterable."""
    for i in range(0, len(iterable), n):
        yield iterable[i:i + n]


def get_tsv_data(filename: str, names, compression="infer"):
    check_path(filename)
    return pd.read_csv(
        filename,
        sep=FILE_SEP,
        compression=compression,
        on_bad_lines="skip",  # error_bad_lines=False was removed in pandas 2.0
        quoting=csv.QUOTE_NONE,
        header=None,
        names=names,
    )


def check_path(filename: str):
    if not os.path.exists(filename):
        raise Exception(f"Path {filename} does not exist!")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        raise Exception("Name of working dir not specified!")
    main(sys.argv[1:])
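# Usage sketch (the entry-point file name and the directory names below are
# assumptions for illustration; the script only requires that each listed
# directory contains in.tsv.xz and is writable):
#
#   python run.py dev-0 test-A
#
# For each directory given on the command line, predictions are written to
# <dir>/out.tsv, one 0/1 label per line. Note that nltk's word_tokenize needs
# the "punkt" tokenizer data, which can be fetched once with:
#
#   python -c "import nltk; nltk.download('punkt')"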