"""Train a bidirectional-LSTM regressor on tokenized text.

Reads one text per line from ``../train/in.tsv`` and ``../dev-0/in.tsv`` and
one float target per line from the matching ``expected{FILE}.tsv`` files,
min-max scales the targets, and fits an Embedding -> BiLSTM -> Dropout ->
Dense(1) Keras model with early stopping and best-only checkpointing.

Usage:
    python SCRIPT VOCAB_SIZE EMBEDDING_SIZE LSTM_SIZE DROPOUT_LSTM \
        DROPOUT_REGULAR BATCH_SIZE CHECKPOINT_DIR

Files written to the current directory: ``train_text_30k_for_keras.pickle``,
``eval_text_30k_for_keras.pickle``, ``tokenizer.pickle`` and
``minmaxscaler.pickle``. Model checkpoints go to ``./CHECKPOINT_DIR/``.
"""
import pickle
import sys

import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Activation, Dropout, Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import MaxPooling1D, GlobalMaxPooling2D, LSTM, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


def _read_lines(path):
    """Return the lines of *path* with the trailing newline removed."""
    with open(path, 'r') as handle:
        return [line.rstrip('\n') for line in handle]


def _read_floats(path):
    """Return one float per line of *path* (float() ignores edge whitespace)."""
    with open(path, 'r') as handle:
        return [float(line) for line in handle]


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, closing the file afterwards."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def _load_pickle(path):
    """Unpickle and return the object stored at *path*.

    NOTE: only used on files this script has just written; never point it at
    untrusted data, since unpickling can execute arbitrary code.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Memory growth has to be configured before anything initializes the GPUs,
# so this runs before any model or tensor is created.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# --- Command-line hyperparameters -------------------------------------------
if len(sys.argv) < 8:
    sys.exit(
        f"usage: {sys.argv[0]} VOCAB_SIZE EMBEDDING_SIZE LSTM_SIZE "
        "DROPOUT_LSTM DROPOUT_REGULAR BATCH_SIZE CHECKPOINT_DIR"
    )

vocab_size = int(sys.argv[1])
embedding_size = int(sys.argv[2])
LSTM_SIZE = int(sys.argv[3])
DROPOUT_LSTM = float(sys.argv[4])
DROPOUT_REGULAR = float(sys.argv[5])
BATCH_SIZE = int(sys.argv[6])
checkpoints = sys.argv[7]

# Suffix selecting which expected<FILE>.tsv target file is used.
FILE = '2'

# --- Training data ----------------------------------------------------------
train_text = _read_lines('../train/in.tsv')
train_year = _read_floats(f'../train/expected{FILE}.tsv')

# The tokenizer is fitted on the training text only and keeps at most
# vocab_size words.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_text)
train_text_tokenized = tokenizer.texts_to_sequences(train_text)

# Every sequence is padded (at the end) or cut to this many tokens.
maxlen = 500
train_text_tokenized = pad_sequences(train_text_tokenized, padding='post', maxlen=maxlen)

# Cache the padded sequences and the tokenizer on disk. The immediate reload
# is kept from the original script so the cached file is what gets trained on.
_dump_pickle(train_text_tokenized, 'train_text_30k_for_keras.pickle')
_dump_pickle(tokenizer, 'tokenizer.pickle')
train_text_tokenized = _load_pickle('train_text_30k_for_keras.pickle')

# --- Validation data --------------------------------------------------------
eval_text_tokenized = _read_lines('../dev-0/in.tsv')
eval_text_tokenized = tokenizer.texts_to_sequences(eval_text_tokenized)
eval_text_tokenized = pad_sequences(eval_text_tokenized, padding='post', maxlen=maxlen)
_dump_pickle(eval_text_tokenized, 'eval_text_30k_for_keras.pickle')
eval_text_tokenized = _load_pickle('eval_text_30k_for_keras.pickle')

eval_year = _read_floats(f'../dev-0/expected{FILE}.tsv')

# --- Model ------------------------------------------------------------------
model = Sequential()
embedding_layer = Embedding(vocab_size, embedding_size, input_length=maxlen, trainable=True)
model.add(embedding_layer)
# A stacked variant would put this layer first:
# model.add(Bidirectional(LSTM(LSTM_SIZE, dropout=DROPOUT_LSTM, return_sequences=True)))
model.add(Bidirectional(LSTM(LSTM_SIZE, dropout=DROPOUT_LSTM)))
model.add(Dropout(DROPOUT_REGULAR))
model.add(Dense(1, activation='linear'))

# --- Targets ----------------------------------------------------------------
# Shuffle inputs and targets together so the pairs stay aligned.
train_text_tokenized, train_year = shuffle(train_text_tokenized, train_year)

# MinMaxScaler expects 2-D input, hence the single-column reshape.
train_year = np.array(train_year).reshape(-1, 1)
eval_year = np.array(eval_year).reshape(-1, 1)

# The scaler is fitted on the training targets only and reused for validation;
# it is saved so predictions can be mapped back to the original scale.
scaler = MinMaxScaler().fit(train_year)
_dump_pickle(scaler, 'minmaxscaler.pickle')
train_year_scaled = scaler.transform(train_year)
eval_year_scaled = scaler.transform(eval_year)

# --- Training ---------------------------------------------------------------
# {epoch} and {val_loss} are filled in by ModelCheckpoint at save time.
filepath = "./" + checkpoints + "/saved-model-{epoch:06d}-{val_loss:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True)
es = EarlyStopping(monitor='val_loss', patience=70)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss='mse',
    metrics=['mse'],
)

history = model.fit(
    train_text_tokenized,
    train_year_scaled,
    batch_size=BATCH_SIZE,
    epochs=5000,
    verbose=1,
    validation_data=(eval_text_tokenized, eval_year_scaled),
    callbacks=[es, checkpoint],
)