88 lines
2.8 KiB
Python
88 lines
2.8 KiB
Python
|
import tensorflow as tf
|
||
|
import pickle
|
||
|
import os
|
||
|
|
||
|
# Turn on memory growth for every GPU that TensorFlow reports, before any
# model is built further down in the script.
gpus = tf.config.experimental.list_physical_devices('GPU')
for device in gpus:
    tf.config.experimental.set_memory_growth(device, True)
|
||
|
|
||
|
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
|
||
|
import numpy as np
|
||
|
from tensorflow.keras.preprocessing.text import Tokenizer
|
||
|
|
||
|
|
||
|
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||
|
from tensorflow.keras.models import Sequential
|
||
|
from tensorflow.keras.layers import Activation, Dropout, Dense
|
||
|
from tensorflow.keras.layers import Embedding
|
||
|
import sys
|
||
|
from tensorflow.keras.layers import MaxPooling1D, GlobalMaxPooling2D, LSTM, Bidirectional
|
||
|
from sklearn.utils import shuffle
|
||
|
|
||
|
|
||
|
# Command-line interface: seven positional arguments, all required.
#   1. vocab_size       int   - first argument of the Embedding layer
#   2. embedding_size   int   - second argument of the Embedding layer
#   3. LSTM_SIZE        int   - units of the LSTM wrapped in Bidirectional
#   4. DROPOUT_LSTM     float - `dropout` argument of the LSTM
#   5. DROPOUT_REGULAR  float - rate of the Dropout layer after the LSTM
#   6. BATCH_SIZE       int   - parsed here; not referenced by the rest of
#                               the script as shown
#   7. checkpoints      str   - directory that is listed to find weights
# Fail early with a usage line instead of a bare IndexError when arguments
# are missing. Extra arguments are still ignored.
if len(sys.argv) < 8:
    sys.exit(
        f'usage: {sys.argv[0]} VOCAB_SIZE EMBEDDING_SIZE LSTM_SIZE '
        'DROPOUT_LSTM DROPOUT_REGULAR BATCH_SIZE CHECKPOINTS_DIR'
    )

vocab_size = int(sys.argv[1])
embedding_size = int(sys.argv[2])
LSTM_SIZE = int(sys.argv[3])
DROPOUT_LSTM = float(sys.argv[4])
DROPOUT_REGULAR = float(sys.argv[5])
BATCH_SIZE = int(sys.argv[6])
checkpoints = sys.argv[7]
|
||
|
|
||
|
# Suffix inserted into the expected/out file names (expected2.tsv, out2.tsv).
FILE = '2'

# Training targets: one float per line. The context manager closes the file
# as soon as the list is built instead of leaving the handle to the GC.
with open(f'../train/expected{FILE}.tsv', 'r') as train_expected:
    train_year = [float(line.rstrip('\n')) for line in train_expected]

# Range of the training targets; predictions are clamped to it further down.
max_year = max(train_year)
min_year = min(train_year)
|
||
|
|
||
|
# Pre-tokenized training sequences.
# NOTE(review): pickle.load runs arbitrary code from the file; only load
# artifacts produced by a trusted pipeline.
with open('train_text_30k_for_keras.pickle', 'rb') as train_pickle:
    train_text_tokenized = pickle.load(train_pickle)

# Length of the longest training sequence; used as the padding length for
# the test input and as the Embedding input_length.
maxlen = max(len(sequence) for sequence in train_text_tokenized)
|
||
|
|
||
|
# Tokenizer object restored from disk (presumably the one fitted at training
# time - confirm against the training script).
# NOTE(review): pickle.load runs arbitrary code from the file; only load
# trusted artifacts.
with open('tokenizer.pickle', 'rb') as tokenizer_pickle:
    tokenizer = pickle.load(tokenizer_pickle)

# Raw test lines -> integer sequences -> post-padded to the training maxlen.
with open('../test-A/in.tsv', 'r') as test_input:
    test_text_tokenized = [line.rstrip('\n') for line in test_input]
test_text_tokenized = tokenizer.texts_to_sequences(test_text_tokenized)
test_text_tokenized = pad_sequences(test_text_tokenized, padding='post', maxlen=maxlen)
|
||
|
|
||
|
# Dev-set targets, one float per line; the file is closed right after reading.
with open(f'../dev-0/expected{FILE}.tsv', 'r') as eval_expected:
    eval_year = [float(line.rstrip()) for line in eval_expected]

# maxlen is not recomputed here: train_text_tokenized has not changed since
# maxlen was first derived from it, so a second pass would give the same value.
|
||
|
|
||
|
|
||
|
# Regression network, in order:
#   Embedding -> Bidirectional(LSTM) -> Dropout -> Dense(1, linear).
# A variant of the recurrent layer with return_sequences=True was left
# disabled in the original script and is not used.
model = Sequential()

embedding_layer = Embedding(
    vocab_size,
    embedding_size,
    input_length=maxlen,
    trainable=True,
)
model.add(embedding_layer)

recurrent_core = LSTM(LSTM_SIZE, dropout=DROPOUT_LSTM)
model.add(Bidirectional(recurrent_core))

model.add(Dropout(DROPOUT_REGULAR))

# Single linear unit: the model emits one unbounded scalar per input.
model.add(Dense(1, activation='linear'))
|
||
|
|
||
|
# Restore weights from the checkpoint directory given on the command line.
# The file that sorts last by name is used. Previously this name was computed
# and then ignored in favour of a hard-coded path, and a stray 'rb' argument
# landed in load_weights' second positional parameter (by_name), silently
# switching it to by-name loading.
checkpoint_files = sorted(os.listdir(checkpoints))
if not checkpoint_files:
    raise FileNotFoundError(f'no checkpoint files found in {checkpoints!r}')
model_checkpoint = checkpoint_files[-1]
model.load_weights(os.path.join(checkpoints, model_checkpoint))
|
||
|
|
||
|
# Scaler whose inverse_transform maps raw model outputs back to target units
# (presumably a fitted MinMaxScaler, going by the file name - confirm).
# NOTE(review): pickle.load runs arbitrary code from the file; only load
# trusted artifacts.
with open('minmaxscaler.pickle', 'rb') as scaler_pickle:
    scaler = pickle.load(scaler_pickle)
|
||
|
|
||
|
# EVAL PREDS

# Dev-set model input, loaded ready-made from disk.
# NOTE(review): pickle.load runs arbitrary code from the file; only load
# trusted artifacts.
with open('eval_text_30k_for_keras.pickle', 'rb') as eval_pickle:
    eval_text_tokenized = pickle.load(eval_pickle)

# Predict, undo the target scaling, then clamp to the range of the training
# targets.
eval_preds = scaler.inverse_transform(model.predict(eval_text_tokenized, batch_size=20))
eval_preds = np.clip(eval_preds, min_year, max_year)

# One prediction per line; the context manager closes the file even if a
# write fails part-way.
with open(f'../dev-0/out{FILE}.tsv', 'w') as out_file:
    out_file.writelines(str(row[0]) + '\n' for row in eval_preds)
|
||
|
|
||
|
# TEST PREDS

# Predict on the padded test sequences, undo the target scaling, then clamp
# to the range of the training targets.
test_preds = scaler.inverse_transform(model.predict(test_text_tokenized, batch_size=20))
test_preds = np.clip(test_preds, min_year, max_year)

# One prediction per line; the context manager closes the file even if a
# write fails part-way.
with open(f'../test-A/out{FILE}.tsv', 'w') as out_file:
    out_file.writelines(str(row[0]) + '\n' for row in test_preds)
|
||
|
|