challenging-america-year-pr.../keras_lstm/train.py

import tensorflow as tf
import pickle

# Turn on memory growth for every visible GPU so TensorFlow allocates device
# memory on demand instead of reserving it all up front. This must happen
# before any GPU has been initialized.
gpus = tf.config.experimental.list_physical_devices('GPU')
for device in gpus:
    tf.config.experimental.set_memory_growth(device, True)

# Report how many GPUs TensorFlow can see (0 means training runs on CPU).
visible_gpu_count = len(tf.config.list_physical_devices('GPU'))
print("Num GPUs Available: ", visible_gpu_count)

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.preprocessing import MinMaxScaler


from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dropout, Dense
from tensorflow.keras.layers import Embedding
import sys
from tensorflow.keras.layers import MaxPooling1D, GlobalMaxPooling2D, LSTM, Bidirectional
from sklearn.utils import shuffle


# Command-line hyperparameters. All seven are positional and required:
#   1. vocab_size       - first argument of the Embedding layer (input dim)
#   2. embedding_size   - second argument of the Embedding layer (output dim)
#   3. LSTM_SIZE        - units of the LSTM wrapped by Bidirectional
#   4. DROPOUT_LSTM     - `dropout` argument of the LSTM layer
#   5. DROPOUT_REGULAR  - rate of the Dropout layer before the output
#   6. BATCH_SIZE       - batch size passed to model.fit
#   7. checkpoints      - directory name (relative to ./) for saved models
# Extra trailing arguments are ignored, as before.
if len(sys.argv) < 8:
    # Fail with a readable usage line instead of a bare IndexError.
    sys.exit(
        "usage: train.py VOCAB_SIZE EMBEDDING_SIZE LSTM_SIZE DROPOUT_LSTM "
        "DROPOUT_REGULAR BATCH_SIZE CHECKPOINT_DIR"
    )

vocab_size = int(sys.argv[1])
embedding_size = int(sys.argv[2])
LSTM_SIZE = int(sys.argv[3])
DROPOUT_LSTM = float(sys.argv[4])
DROPOUT_REGULAR = float(sys.argv[5])
BATCH_SIZE = int(sys.argv[6])
checkpoints = sys.argv[7]


# Raw training documents (one per line) and their target years.
# NOTE: train_text is only consumed by the commented-out tokenization recipe
# below; the active code path uses the cached, pre-tokenized pickle instead.
with open('../train/in.tsv', 'r') as train_in_file:
    train_text = [line.rstrip('\n') for line in train_in_file]
with open('../train/expected.tsv', 'r') as train_expected_file:
    train_year = [float(line.rstrip('\n')) for line in train_expected_file]

# Sequence length used when padding token sequences; also passed to the
# Embedding layer as input_length.
maxlen = 500

# Recipe kept for reference: it tokenizes/pads the training text and writes
# the pickles loaded below (presumably how the cached files were produced --
# TODO confirm the vocab_size it was run with).
#tokenizer = Tokenizer(num_words=vocab_size)
#tokenizer.fit_on_texts(train_text)
#train_text_tokenized = tokenizer.texts_to_sequences(train_text)
#train_text_tokenized = pad_sequences(train_text_tokenized, padding='post', maxlen=maxlen)
#pickle.dump(train_text_tokenized, open('train_text_30k_for_keras.pickle', 'wb'))
#pickle.dump(tokenizer, open('tokenizer.pickle', 'wb'))

# SECURITY NOTE: pickle.load can execute arbitrary code. These files are
# expected to be locally generated caches; never point this at untrusted data.
with open('train_text_30k_for_keras.pickle', 'rb') as train_cache_file:
    train_text_tokenized = pickle.load(train_cache_file)


# Same recipe for the dev-0 (validation) split, reusing the fitted tokenizer.
#eval_text_tokenized = [a.rstrip('\n') for a in open('../dev-0/in.tsv', 'r')]
#eval_text_tokenized = tokenizer.texts_to_sequences(eval_text_tokenized)
#eval_text_tokenized = pad_sequences(eval_text_tokenized, padding='post', maxlen=maxlen)
#pickle.dump(eval_text_tokenized, open('eval_text_30k_for_keras.pickle', 'wb'))
with open('eval_text_30k_for_keras.pickle', 'rb') as eval_cache_file:
    eval_text_tokenized = pickle.load(eval_cache_file)

# Validation targets; rstrip() with no argument strips all trailing whitespace.
with open('../dev-0/expected.tsv', 'r') as eval_expected_file:
    eval_year = [float(line.rstrip()) for line in eval_expected_file]


# Regression network: token ids -> trainable embeddings -> bidirectional LSTM
# -> dropout -> a single linear output unit (the scaled year).
embedding_layer = Embedding(
    vocab_size,
    embedding_size,
    input_length=maxlen,
    trainable=True,
)

# The LSTM returns only its final state; a variant with return_sequences=True
# was tried previously and is not used here.
recurrent_layer = Bidirectional(LSTM(LSTM_SIZE, dropout=DROPOUT_LSTM))

model = Sequential([
    embedding_layer,
    recurrent_layer,
    Dropout(DROPOUT_REGULAR),
    Dense(1, activation='linear'),
])

# Shuffle inputs and targets together so the pairs stay aligned.
train_text_tokenized, train_year = shuffle(train_text_tokenized, train_year)

# MinMaxScaler expects 2-D input, so turn the targets into column vectors.
train_year = np.array(train_year).reshape(-1, 1)
eval_year = np.array(eval_year).reshape(-1, 1)

# Fit the scaler on the training targets only and persist it; presumably the
# prediction script loads it to map outputs back to years -- TODO confirm.
scaler = MinMaxScaler().fit(train_year)
with open('minmaxscaler.pickle', 'wb') as scaler_file:
    # The context manager guarantees the pickle is flushed and closed.
    pickle.dump(scaler, scaler_file)

# Apply the training-set scaling to both splits.
train_year_scaled = scaler.transform(train_year)
eval_year_scaled = scaler.transform(eval_year)


# Checkpoint file pattern; Keras fills in {epoch} and {val_loss} on each save.
filepath = "./" + checkpoints + "/saved-model-{epoch:06d}-{val_loss:.2f}.hdf5"

# Save a checkpoint only when validation loss improves.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
)

# Stop once validation loss has not improved for 70 consecutive epochs.
es = EarlyStopping(monitor='val_loss', patience=70)

adam = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=adam, loss='mse', metrics=['mse'])

# epochs=5000 is an upper bound; early stopping normally ends training first.
history = model.fit(
    train_text_tokenized,
    train_year_scaled,
    batch_size=BATCH_SIZE,
    epochs=5000,
    verbose=1,
    validation_data=(eval_text_tokenized, eval_year_scaled),
    callbacks=[es, checkpoint],
)
lstm 2021-06-10 08:45:46 +02:00			`import tensorflow as tf`
			`import pickle`

			`gpus = tf.config.experimental.list_physical_devices('GPU')`
			`for gpu in gpus:`
			`    tf.config.experimental.set_memory_growth(gpu, True)`

			`print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))`

			`from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint`
			`import numpy as np`
			`from tensorflow.keras.preprocessing.text import Tokenizer`

			`from sklearn.preprocessing import MinMaxScaler`


			`from tensorflow.keras.preprocessing.sequence import pad_sequences`
			`from tensorflow.keras.models import Sequential`
			`from tensorflow.keras.layers import Activation, Dropout, Dense`
			`from tensorflow.keras.layers import Embedding`
			`import sys`
			`from tensorflow.keras.layers import MaxPooling1D, GlobalMaxPooling2D, LSTM, Bidirectional`
			`from sklearn.utils import shuffle`


			`vocab_size = int(sys.argv[1])`
			`embedding_size = int(sys.argv[2])`
			`LSTM_SIZE = int(sys.argv[3])`
			`DROPOUT_LSTM = float(sys.argv[4])`
			`DROPOUT_REGULAR = float(sys.argv[5])`
			`BATCH_SIZE = int(sys.argv[6])`
			`checkpoints = sys.argv[7]`
			`#p1 = float(sys.argv[3])`
			`#p2 = float(sys.argv[4])`


			`train_text = [a.rstrip('\n') for a in open('../train/in.tsv','r')]`
			`train_year = [float(a.rstrip('\n')) for a in open('../train/expected.tsv','r')]`


			`#tokenizer = Tokenizer(num_words=vocab_size)`
			`#tokenizer.fit_on_texts(train_text)`
			`#train_text_tokenized = tokenizer.texts_to_sequences(train_text)`
			`maxlen = 500`
			`#train_text_tokenized = pad_sequences(train_text_tokenized, padding='post', maxlen=maxlen)`
			`#pickle.dump(train_text_tokenized, open('train_text_30k_for_keras.pickle', 'wb'))`
			`#pickle.dump(tokenizer, open('tokenizer.pickle', 'wb'))`
			`train_text_tokenized= pickle.load(open('train_text_30k_for_keras.pickle', 'rb'))`



			`#eval_text_tokenized = [a.rstrip('\n') for a in open('../dev-0/in.tsv', 'r')]`
			`#eval_text_tokenized = tokenizer.texts_to_sequences(eval_text_tokenized)`
			`#eval_text_tokenized = pad_sequences(eval_text_tokenized, padding='post', maxlen=maxlen)`
			`#pickle.dump(eval_text_tokenized, open('eval_text_30k_for_keras.pickle', 'wb'))`
			`eval_text_tokenized = pickle.load(open('eval_text_30k_for_keras.pickle', 'rb'))`

			`eval_year = [float(a.rstrip()) for a in open('../dev-0/expected.tsv','r')]`


			`model = Sequential()`
			`embedding_layer = Embedding(vocab_size, embedding_size, input_length=maxlen , trainable=True)`
			`model.add(embedding_layer)`
			`#model.add(Bidirectional(LSTM(LSTM_SIZE, dropout = DROPOUT_LSTM, return_sequences=True)))`
			`model.add(Bidirectional(LSTM(LSTM_SIZE, dropout = DROPOUT_LSTM)))`
			`model.add(Dropout(DROPOUT_REGULAR))`
			`model.add(Dense(1, activation='linear'))`

			`train_text_tokenized, train_year = shuffle(train_text_tokenized, train_year)`

			`train_year = np.array(train_year).reshape(-1,1)`
			`eval_year = np.array(eval_year).reshape(-1,1)`

			`scaler = MinMaxScaler().fit(train_year)`
			`pickle.dump(scaler, open('minmaxscaler.pickle', 'wb'))`

			`train_year_scaled = scaler.transform(train_year)`
			`eval_year_scaled = scaler.transform(eval_year)`



			`filepath = "./" + checkpoints + "/saved-model-{epoch:06d}-{val_loss:.2f}.hdf5"`
			`checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True)`
			`es = EarlyStopping(monitor='val_loss', patience = 70)`
			`model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mse', metrics=['mse'])`
			`history = model.fit(train_text_tokenized, train_year_scaled, batch_size=BATCH_SIZE, epochs=5000, verbose=1, validation_data = (eval_text_tokenized, eval_year_scaled), callbacks = [es, checkpoint])`