This commit is contained in:
nlitkowski 2021-05-26 02:57:28 +02:00
parent 53fd98388c
commit 4689a528ad
3 changed files with 5175 additions and 16 deletions

23
main.py
View File

@ -26,7 +26,7 @@ HIDDEN_D = 600
OUTPUT_D = 1 OUTPUT_D = 1
def main(dirname): def main(dirnames):
check_path(IN_HEADER_FILE_NAME) check_path(IN_HEADER_FILE_NAME)
in_cols = (pd.read_csv(IN_HEADER_FILE_NAME, sep=FILE_SEP)).columns in_cols = (pd.read_csv(IN_HEADER_FILE_NAME, sep=FILE_SEP)).columns
check_path(OUT_HEADER_FILE_NAME) check_path(OUT_HEADER_FILE_NAME)
@ -39,7 +39,10 @@ def main(dirname):
TRAIN_PATH, EXP_FILE_NAME), names=out_cols, compression=None) TRAIN_PATH, EXP_FILE_NAME), names=out_cols, compression=None)
print("Reading input data...") print("Reading input data...")
in_set = get_tsv_data(os.path.join(dirname, IN_FILE_NAME), names=in_cols) in_sets = []
for d in dirnames:
in_sets.append(get_tsv_data(
os.path.join(d, IN_FILE_NAME), names=in_cols))
print("Preparing training data...") print("Preparing training data...")
X_train_raw = train_set_features[in_cols[0]].str.lower() X_train_raw = train_set_features[in_cols[0]].str.lower()
@ -47,14 +50,19 @@ def main(dirname):
Y_train = train_set_labels[out_cols[0]] Y_train = train_set_labels[out_cols[0]]
print("Preparing input data...") print("Preparing input data...")
X_in_raw = in_set[in_cols[0]].str.lower() X_ins_raw = []
for s in in_sets:
X_ins_raw.append(s[in_cols[0]].str.lower())
print("Loading word 2 vector model...") print("Loading word 2 vector model...")
w2v_model = downloader.load(WORD_2_VEC_MODEL_NAME) w2v_model = downloader.load(WORD_2_VEC_MODEL_NAME)
print("Vectorizing data...") print("Vectorizing data...")
X_train = vectorize(X_train, w2v_model) X_train = vectorize(X_train, w2v_model)
X_in = vectorize(X_in_raw, w2v_model)
X_ins = []
for r in X_ins_raw:
X_ins.append(vectorize(r, w2v_model))
model = Model(input_dim=INPUT_D, hidden_dim=HIDDEN_D, output_dim=OUTPUT_D) model = Model(input_dim=INPUT_D, hidden_dim=HIDDEN_D, output_dim=OUTPUT_D)
@ -63,9 +71,10 @@ def main(dirname):
model.eval() model.eval()
predictions = predict(model, X_in) for i in range(len(X_ins)):
predictions = predict(model, X_ins[i])
out_file_path = os.path.join(dirname, OUT_FILE_NAME) out_file_path = os.path.join(dirnames[i], OUT_FILE_NAME)
print(f"Saving predictions to file: {out_file_path}") print(f"Saving predictions to file: {out_file_path}")
np.asarray(predictions, dtype=np.int32).tofile(out_file_path, sep="\n") np.asarray(predictions, dtype=np.int32).tofile(out_file_path, sep="\n")
@ -113,4 +122,4 @@ def check_path(filename: str):
if __name__ == "__main__": if __name__ == "__main__":
if len(sys.argv) < 2: if len(sys.argv) < 2:
raise Exception("Name of working dir not specified!") raise Exception("Name of working dir not specified!")
main(sys.argv[1]) main(sys.argv[1:])

View File

@ -19,7 +19,8 @@ class Model(nn.Module):
self.fc2 = nn.Linear(self.hidden_dim, self.hidden_dim) self.fc2 = nn.Linear(self.hidden_dim, self.hidden_dim)
self.fc3 = nn.Linear(self.hidden_dim, self.output_dim) self.fc3 = nn.Linear(self.hidden_dim, self.output_dim)
self.relu = nn.ReLU() self.r1 = nn.ReLU()
self.r2 = nn.ReLU()
self.criterion = nn.BCELoss() self.criterion = nn.BCELoss()
self.optimizer = torch.optim.SGD(self.parameters(), lr=0.01) self.optimizer = torch.optim.SGD(self.parameters(), lr=0.01)
@ -28,19 +29,16 @@ class Model(nn.Module):
"""Step forward learning fn""" """Step forward learning fn"""
x = self.fc1(x) x = self.fc1(x)
x = self.relu(x) x = self.r1(x)
x = self.fc2(x) x = self.r2(x)
x = self.relu(x)
x = self.fc3(x) x = self.fc3(x)
x = torch.sigmoid(x) x = torch.sigmoid(x)
return x return x
def run_training(self, X_train, Y_train, batch_size, epochs_count): def run_training(self, X_train, Y_train, batch_size, epochs_count):
for _ in range(epochs_count): for i in range(epochs_count):
self.train() self.train()
print(f"{Y_train.shape[0]}, {Y_train.shape[0] == self.input_dim}") print(f"Epochs: {i + 1}/{epochs_count}")
print(f"{Y_train.shape[0]}, {Y_train.shape[0] == self.hidden_dim}")
print(f"{Y_train.shape[0]}, {Y_train.shape[0] == self.output_dim}")
for i in range(0, Y_train.shape[0], batch_size): for i in range(0, Y_train.shape[0], batch_size):
X = X_train[i: i + batch_size] X = X_train[i: i + batch_size]
X = torch.tensor(X) X = torch.tensor(X)

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff