Compare commits

1 commit

SHA1        Message                             Date
cb907d93b4  Fix dev-0 files, add gonito.yaml    2022-04-28 22:13:37 +02:00

7 changed files with 17779 additions and 21055 deletions

File diff suppressed because it is too large

Binary file not shown.

File diff suppressed because it is too large

gonito.yaml (new file, 5 lines added)

@@ -0,0 +1,5 @@
+description: roberta base with regression layer on top lr=1e-8 4 epochs
+tags:
+- transformer
+- roberta
+- roberta-base
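The description above refers to a roberta-base model with a regression layer on top, trained at lr=1e-8 for 4 epochs; that training code is not part of this compare. A minimal sketch of such a setup, assuming the Hugging Face transformers API (illustrative only, not the committed code):

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('roberta-base')
# num_labels=1 with problem_type='regression' gives a single linear output and an MSE loss
model = AutoModelForSequenceClassification.from_pretrained(
    'roberta-base', num_labels=1, problem_type='regression')

batch = tokenizer(['example document text'], return_tensors='pt',
                  truncation=True, padding=True)
with torch.no_grad():
    year_pred = model(**batch).logits.squeeze(-1)  # one scalar prediction per document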


@@ -57,8 +57,8 @@ model.add(Bidirectional(LSTM(LSTM_SIZE, dropout = DROPOUT_LSTM)))
 model.add(Dropout(DROPOUT_REGULAR))
 model.add(Dense(1, activation='linear'))
-#model_checkpoint = sorted(os.listdir(checkpoints))[-1]
-model.load_weights('checkpoints/saved-model-000018-0.03.hdf5','rb')
+model_checkpoint = sorted(os.listdir(checkpoints))[-1]
+model.load_weights('checkpoints/saved-model-000006-0.02.hdf5','rb')
 scaler = pickle.load(open('minmaxscaler.pickle', 'rb'))
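The new lines compute the most recent checkpoint name from the directory listing while still passing a hard-coded path to load_weights. A minimal sketch of loading whichever checkpoint is newest, assuming the standard Keras load_weights(filepath) signature and the model built earlier in the script:

import os

checkpoints = 'checkpoints'  # directory name used by the script above
model_checkpoint = sorted(os.listdir(checkpoints))[-1]  # zero-padded epoch number puts the latest checkpoint last
model.load_weights(os.path.join(checkpoints, model_checkpoint))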


@@ -38,22 +38,22 @@ FILE='2'
 train_text = [a.rstrip('\n') for a in open('../train/in.tsv','r')]
 train_year = [float(a.rstrip('\n')) for a in open(f'../train/expected{FILE}.tsv','r')]
 maxlen = 500
-#tokenizer = Tokenizer(num_words=vocab_size)
-#tokenizer.fit_on_texts(train_text)
-#train_text_tokenized = tokenizer.texts_to_sequences(train_text)
-#train_text_tokenized = pad_sequences(train_text_tokenized, padding='post', maxlen=maxlen)
-#pickle.dump(train_text_tokenized, open('train_text_30k_for_keras.pickle', 'wb'))
-#pickle.dump(tokenizer, open('tokenizer.pickle', 'wb'))
+tokenizer = Tokenizer(num_words=vocab_size)
+tokenizer.fit_on_texts(train_text)
+train_text_tokenized = tokenizer.texts_to_sequences(train_text)
+maxlen = 500
+train_text_tokenized = pad_sequences(train_text_tokenized, padding='post', maxlen=maxlen)
+pickle.dump(train_text_tokenized, open('train_text_30k_for_keras.pickle', 'wb'))
+pickle.dump(tokenizer, open('tokenizer.pickle', 'wb'))
 train_text_tokenized= pickle.load(open('train_text_30k_for_keras.pickle', 'rb'))
-#eval_text_tokenized = [a.rstrip('\n') for a in open('../dev-0/in.tsv', 'r')]
-#eval_text_tokenized = tokenizer.texts_to_sequences(eval_text_tokenized)
-#eval_text_tokenized = pad_sequences(eval_text_tokenized, padding='post', maxlen=maxlen)
-#pickle.dump(eval_text_tokenized, open('eval_text_30k_for_keras.pickle', 'wb'))
+eval_text_tokenized = [a.rstrip('\n') for a in open('../dev-0/in.tsv', 'r')]
+eval_text_tokenized = tokenizer.texts_to_sequences(eval_text_tokenized)
+eval_text_tokenized = pad_sequences(eval_text_tokenized, padding='post', maxlen=maxlen)
+pickle.dump(eval_text_tokenized, open('eval_text_30k_for_keras.pickle', 'wb'))
 eval_text_tokenized = pickle.load(open('eval_text_30k_for_keras.pickle', 'rb'))
 eval_year = [float(a.rstrip()) for a in open(f'../dev-0/expected{FILE}.tsv','r')]
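Both scripts rely on a pre-fitted scaler (minmaxscaler.pickle) to map the target years into the network's output range, but only loading and applying it appears in this compare. A minimal sketch of how such a scaler could be fitted on the training years, assuming scikit-learn's MinMaxScaler and the train_year/eval_year lists read above:

import pickle
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler expects 2-D input, so reshape the year lists into column vectors
train_year = np.array(train_year).reshape(-1, 1)
eval_year = np.array(eval_year).reshape(-1, 1)

scaler = MinMaxScaler()
train_year_scaled = scaler.fit_transform(train_year)  # fit on the training years only
eval_year_scaled = scaler.transform(eval_year)
pickle.dump(scaler, open('minmaxscaler.pickle', 'wb'))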
@@ -84,5 +84,5 @@ eval_year_scaled = scaler.transform(eval_year)
 filepath = "./" + checkpoints + "/saved-model-{epoch:06d}-{val_loss:.2f}.hdf5"
 checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True)
 es = EarlyStopping(monitor='val_loss', patience = 70)
-model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.000001), loss='mse', metrics=['mse'])
-history = model.fit(train_text_tokenized, train_year_scaled, batch_size=BATCH_SIZE, epochs=100, verbose=1, validation_data = (eval_text_tokenized, eval_year_scaled), callbacks = [es, checkpoint])
+model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mse', metrics=['mse'])
+history = model.fit(train_text_tokenized, train_year_scaled, batch_size=BATCH_SIZE, epochs=5000, verbose=1, validation_data = (eval_text_tokenized, eval_year_scaled), callbacks = [es, checkpoint])
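Because the network is trained on scaled targets, its dev-0 predictions have to be mapped back to years before being written out; that step is not part of this compare. A minimal sketch, reusing the scaler from above and a hypothetical ../dev-0/out.tsv output path (one prediction per line, mirroring the expected{FILE}.tsv layout):

# predict on the tokenized dev-0 input and undo the min-max scaling
preds_scaled = model.predict(eval_text_tokenized)
preds = scaler.inverse_transform(preds_scaled).flatten()

with open('../dev-0/out.tsv', 'w') as f:  # hypothetical output path
    for year in preds:
        f.write(f'{year}\n')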

File diff suppressed because it is too large