ium_434780/train-tensorflow-sacred.py

106 lines
3.2 KiB
Python
Raw Normal View History

2021-05-17 11:32:32 +02:00
import pandas as pd
from sacred.observers import FileStorageObserver
from silence_tensorflow import silence_tensorflow
from tensorflow import keras
silence_tensorflow()
from tensorflow.keras.preprocessing.text import Tokenizer
from collections import Counter
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from sacred import Experiment
# Sacred experiment object used by the decorators below; a file-storage
# observer is attached with 'my_runs' as its base directory.
ex = Experiment(name="file_observer")
ex.observers.append(FileStorageObserver(basedir="my_runs"))
# Sacred configuration scope: the local variables assigned in this function
# are the experiment's configuration entries. Functions in this file that
# declare `epochs` / `batch_size` parameters are decorated with Sacred
# decorators so these values can be supplied to them.
# NOTE(review): the names are the config keys, so do not rename them.
@ex.config
def my_config():
    epochs = 15  # default number of training epochs
    batch_size = 16  # default mini-batch size
def _read_reviews(csv_path):
    """Read a reviews CSV and drop the rows whose 'reviews.text' is missing."""
    frame = pd.read_csv(csv_path)
    frame.dropna(subset=['reviews.text'], inplace=True)
    return frame


@ex.capture
def prepare_model(epochs, batch_size):
    """Train the review classifier and write its predictions for the test set.

    Reads 'data.csv', 'train.csv', 'dev.csv' and 'test.csv' from the current
    directory, tokenizes and pads the review texts, trains an
    Embedding -> LSTM -> Dense(sigmoid) model on 'reviews.doRecommend', and
    then:

    * overwrites 'test.csv' with the padded test sequences,
    * saves the trained model as 'trained_model',
    * writes the list of 0/1 test predictions to 'results.txt'.

    Args:
        epochs: number of training epochs (filled in from the Sacred config
            when the caller does not pass it).
        batch_size: mini-batch size used for training (filled in from the
            Sacred config when the caller does not pass it).
    """
    df = _read_reviews('data.csv')
    train_df = _read_reviews('train.csv')
    val_df = _read_reviews('dev.csv')
    test_df = _read_reviews('test.csv')

    train_sentences = train_df['reviews.text'].to_numpy()
    train_labels = train_df['reviews.doRecommend'].to_numpy()
    val_sentences = val_df['reviews.text'].to_numpy()
    val_labels = val_df['reviews.doRecommend'].to_numpy()
    test_sentences = test_df['reviews.text'].to_numpy()

    # The vocabulary size comes from the whitespace-split words of data.csv;
    # the tokenizer itself is fitted on the training sentences only.
    num_unique_words = len(counter_word(df['reviews.text']))
    tokenizer = Tokenizer(num_words=num_unique_words)
    tokenizer.fit_on_texts(train_sentences)

    max_length = 30

    def encode(sentences):
        """Turn sentences into fixed-length integer sequences."""
        return pad_sequences(
            tokenizer.texts_to_sequences(sentences),
            maxlen=max_length,
            padding="post",
            truncating="post",
        )

    train_padded = encode(train_sentences)
    val_padded = encode(val_sentences)
    test_padded = encode(test_sentences)

    # A 2-D array cannot be assigned to a single DataFrame column, so store
    # one list of token ids per row instead.
    # NOTE(review): this overwrites the input file test.csv (and adds an index
    # column); presumably another script reads the padded data - confirm.
    test_df['reviews.text'] = test_padded.tolist()
    test_df.to_csv('test.csv')

    model = keras.models.Sequential()
    model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))
    model.add(layers.LSTM(64, dropout=0.1))
    model.add(layers.Dense(1, activation="sigmoid"))
    model.summary()

    model.compile(
        loss=keras.losses.BinaryCrossentropy(from_logits=False),
        # `lr` is a deprecated alias of `learning_rate`.
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        metrics=["accuracy"],
    )

    # Use the configured hyperparameters so that the values Sacred records
    # for the run are the ones that were actually used.
    model.fit(
        train_padded,
        train_labels,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(val_padded, val_labels),
        verbose=2,
    )

    # The sigmoid output is thresholded at 0.5 to obtain 0/1 labels.
    probabilities = model.predict(test_padded)
    predicted_labels = [1 if p > 0.5 else 0 for p in probabilities.ravel()]

    model.save('trained_model')

    with open('results.txt', 'w') as results_file:
        results_file.write(str(predicted_labels))
def counter_word(text_col):
    """Count whitespace-separated words over a column of texts.

    Args:
        text_col: object exposing the texts through a ``values`` attribute
            (e.g. a pandas Series of strings).

    Returns:
        A ``collections.Counter`` mapping each word to its number of
        occurrences. Words are case-sensitive and split on whitespace only.
        Entries that are not strings (such as NaN/None for missing texts)
        are skipped instead of raising.
    """
    count = Counter()
    for text in text_col.values:
        # Missing values have no .split(); ignore them.
        if isinstance(text, str):
            count.update(text.split())
    return count
# NOTE: the automain function has to stay the last definition in the file.
# When the file is executed as a script, the decorator itself starts the
# experiment, so no additional ex.run() call is needed (a second call would
# train again and ignore command-line config overrides).
@ex.automain
def my_main(epochs, batch_size):
    """Entry point of the experiment: train the model and store it as artifact.

    Args:
        epochs: number of training epochs, injected from the Sacred config.
        batch_size: training mini-batch size, injected from the Sacred config.
    """
    import os
    import shutil

    prepare_model(epochs=epochs, batch_size=batch_size)

    # Artifacts are attached while the run is active so they belong to it.
    artifact_path = 'trained_model'
    if os.path.isdir(artifact_path):
        # An artifact is a single file; if the model was saved as a
        # directory, pack it into trained_model.zip first.
        artifact_path = shutil.make_archive(
            artifact_path, 'zip', root_dir=artifact_path
        )
    ex.add_artifact(artifact_path)