# paranormal-or-skeptic-ISI-p.../main.py
import csv
import os
import sys

import numpy as np
import pandas as pd
import torch
from gensim import downloader
from nltk import word_tokenize

from model import Model

IN_FILE_NAME = "in.tsv.xz"
OUT_FILE_NAME = "out.tsv"
TRAIN_PATH = "train"
WORD_2_VEC_MODEL_NAME = "word2vec-google-news-300"
EXP_FILE_NAME = "expected.tsv"
FILE_SEP = "\t"
BATCH_SIZE = 10
EPOCHS = 10
IN_HEADER_FILE_NAME = "in-header.tsv"
OUT_HEADER_FILE_NAME = "out-header.tsv"
THRESHOLD = 0.5  # decision threshold on the model's sigmoid output

# Model dimensions
INPUT_D = 300  # size of the word2vec-google-news-300 embeddings
HIDDEN_D = 600
OUTPUT_D = 1
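

# `Model` is imported from the local model.py, which is not shown in this
# file. Judging by how it is used in main() (constructor taking the three
# dimensions above, run_training(X, Y, batch_size, epochs), eval(), and a
# forward pass whose output is compared against THRESHOLD), it is presumably
# a small feed-forward binary classifier. The class below is a hypothetical
# sketch of that interface for reference only, not the actual implementation.
class _ModelSketch(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(input_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, output_dim),
            torch.nn.Sigmoid(),  # probability of the positive class
        )

    def forward(self, x):
        return self.layers(x)

    def run_training(self, X, Y, batch_size, epochs):
        # Plain BCE-loss + Adam loop, mirroring the call made in main();
        # assumes Y holds numeric 0/1 labels, as the int output file suggests.
        X = torch.tensor(np.array(X)).float()
        Y = torch.tensor(np.array(Y)).float().view(-1, 1)
        optimizer = torch.optim.Adam(self.parameters())
        criterion = torch.nn.BCELoss()
        self.train()
        for _ in range(epochs):
            for i in range(0, len(X), batch_size):
                optimizer.zero_grad()
                loss = criterion(self(X[i:i + batch_size]), Y[i:i + batch_size])
                loss.backward()
                optimizer.step()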


def main(dirnames):
    # Column names for the input and output TSV files come from the
    # challenge's header files.
    check_path(IN_HEADER_FILE_NAME)
    in_cols = pd.read_csv(IN_HEADER_FILE_NAME, sep=FILE_SEP).columns
    check_path(OUT_HEADER_FILE_NAME)
    out_cols = pd.read_csv(OUT_HEADER_FILE_NAME, sep=FILE_SEP).columns

    print("Reading train data...")
    train_set_features = get_tsv_data(
        os.path.join(TRAIN_PATH, IN_FILE_NAME), names=in_cols)
    train_set_labels = get_tsv_data(
        os.path.join(TRAIN_PATH, EXP_FILE_NAME), names=out_cols,
        compression=None)

    print("Reading input data...")
    in_sets = []
    for d in dirnames:
        in_sets.append(get_tsv_data(
            os.path.join(d, IN_FILE_NAME), names=in_cols))

    print("Preparing training data...")
    X_train_raw = train_set_features[in_cols[0]].str.lower()
    X_train = [word_tokenize(content) for content in X_train_raw]
    Y_train = train_set_labels[out_cols[0]]

    print("Preparing input data...")
    # Tokenize the input sets the same way as the training set, so that
    # vectorize() averages word vectors instead of iterating over the
    # characters of each untokenized string.
    X_ins_raw = []
    for s in in_sets:
        X_ins_raw.append(
            [word_tokenize(content) for content in s[in_cols[0]].str.lower()])

    print("Loading word2vec model...")
    w2v_model = downloader.load(WORD_2_VEC_MODEL_NAME)

    print("Vectorizing data...")
    X_train = vectorize(X_train, w2v_model)
    X_ins = []
    for r in X_ins_raw:
        X_ins.append(vectorize(r, w2v_model))

    model = Model(input_dim=INPUT_D, hidden_dim=HIDDEN_D, output_dim=OUTPUT_D)
    print("Starting model training...")
    model.run_training(X_train, Y_train, BATCH_SIZE, EPOCHS)
    model.eval()  # switch to inference mode before predicting

    for i in range(len(X_ins)):
        predictions = predict(model, X_ins[i])
        out_file_path = os.path.join(dirnames[i], OUT_FILE_NAME)
        print(f"Saving predictions to file: {out_file_path}")
        # Booleans become 0/1 integers, written one prediction per line.
        np.asarray(predictions, dtype=np.int32).tofile(out_file_path, sep="\n")


def vectorize(documents, w2v_model):
    # Mean-pool the word2vec vectors of all in-vocabulary tokens in each
    # document; documents with no known tokens get a zero vector instead.
    return [
        np.mean([w2v_model[word] for word in doc if word in w2v_model]
                or [np.zeros(INPUT_D)], axis=0)
        for doc in documents
    ]
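
# For illustration (hypothetical data): vectorize([["ghost", "story"], []],
# w2v_model) returns one INPUT_D-dimensional vector per document: the mean
# of the two word vectors for the first, and all zeros for the second.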


def predict(model, X_in):
    res = []
    with torch.no_grad():  # inference only, no gradient tracking needed
        for X in chunks(X_in, BATCH_SIZE):
            # Stack the batch into one array first; building a tensor straight
            # from a list of numpy arrays is very slow and warns in PyTorch.
            Xt = torch.tensor(np.array(X))
            res += (model(Xt.float()) > THRESHOLD).tolist()
    return res


def chunks(iterable, n):
    """Yield successive n-sized chunks from iterable."""
    for i in range(0, len(iterable), n):
        yield iterable[i:i + n]


def get_tsv_data(filename: str, names, compression="infer"):
    check_path(filename)
    return pd.read_csv(
        filename,
        sep=FILE_SEP,
        compression=compression,
        on_bad_lines="skip",  # pandas >= 1.3; replaces error_bad_lines=False
        quoting=csv.QUOTE_NONE,
        header=None,
        names=names,
    )


def check_path(filename: str):
    if not os.path.exists(filename):
        raise FileNotFoundError(f"Path {filename} does not exist!")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        raise ValueError("No input directories specified!")
    main(sys.argv[1:])
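
# Example invocation, assuming the usual Gonito-style challenge layout where
# the train/ directory sits alongside per-split input directories (the names
# dev-0 and test-A below are illustrative, not taken from this repository):
#
#   python main.py dev-0 test-A
#
# Each listed directory must contain in.tsv.xz; the predictions for it are
# written to out.tsv in that same directory.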