"""Train an LSTM review classifier and track the run with Sacred.

The script reads ``data.csv``, ``train.csv``, ``dev.csv`` and ``test.csv``
from the working directory, trains a binary classifier that predicts
``reviews.doRecommend`` from ``reviews.text``, and writes the trained model
(``trained_model``), the test predictions (``results.txt``) and the padded
test set (``test.csv``).
"""
import os
import shutil
from collections import Counter

import pandas as pd
from sacred import Experiment
from sacred.observers import FileStorageObserver
from silence_tensorflow import silence_tensorflow
from tensorflow import keras

# Kept at the same point relative to the TensorFlow imports as in the
# original script.
silence_tensorflow()

from tensorflow.keras import layers  # noqa: E402
from tensorflow.keras.preprocessing.sequence import pad_sequences  # noqa: E402
from tensorflow.keras.preprocessing.text import Tokenizer  # noqa: E402

ex = Experiment("file_observer")
ex.observers.append(FileStorageObserver('my_runs'))

TEXT_COLUMN = 'reviews.text'
LABEL_COLUMN = 'reviews.doRecommend'
MAX_LENGTH = 30  # every review is padded / truncated to this many tokens


# Sacred configuration: values defined here are injected into captured
# functions and can be overridden from the command line.
@ex.config
def my_config():
    epochs = 15  # number of training epochs
    batch_size = 16  # mini-batch size used by model.fit


def counter_word(text_col):
    """Count whitespace-separated words over a column of texts.

    Args:
        text_col: pandas Series of strings (rows with missing text must
            already have been dropped).

    Returns:
        A ``collections.Counter`` mapping each word to its frequency.
    """
    count = Counter()
    for text in text_col.values:
        count.update(text.split())
    return count


def _read_reviews(path):
    """Load a CSV file and drop the rows that have no review text."""
    frame = pd.read_csv(path)
    frame.dropna(subset=[TEXT_COLUMN], inplace=True)
    return frame


def _to_padded(tokenizer, sentences):
    """Turn raw sentences into fixed-length integer sequences."""
    sequences = tokenizer.texts_to_sequences(sentences)
    return pad_sequences(sequences, maxlen=MAX_LENGTH,
                         padding="post", truncating="post")


def _as_artifact_file(path):
    """Return a single-file path for *path*, zipping it if it is a directory.

    NOTE(review): this assumes observers store artifacts by copying a regular
    file, so a directory produced by ``model.save`` is archived first.
    Confirm against the Sacred version in use.
    """
    if os.path.isdir(path):
        return shutil.make_archive(path, 'zip', root_dir=path)
    return path


@ex.capture
def prepare_model(epochs, batch_size):
    """Train the classifier, evaluate it on the test split and save outputs.

    Args:
        epochs: number of training epochs (injected from the Sacred config).
        batch_size: mini-batch size (injected from the Sacred config).

    Side effects:
        Overwrites ``test.csv`` with the padded test sequences, saves the
        model to ``trained_model`` and writes the thresholded predictions to
        ``results.txt``.
    """
    df = _read_reviews('data.csv')
    train_df = _read_reviews('train.csv')
    val_df = _read_reviews('dev.csv')
    test_df = _read_reviews('test.csv')

    train_sentences = train_df[TEXT_COLUMN].to_numpy()
    train_labels = train_df[LABEL_COLUMN].to_numpy()
    val_sentences = val_df[TEXT_COLUMN].to_numpy()
    val_labels = val_df[LABEL_COLUMN].to_numpy()
    test_sentences = test_df[TEXT_COLUMN].to_numpy()

    # The vocabulary size comes from the full data set, while the tokenizer
    # itself is fitted on the training sentences only.
    num_unique_words = len(counter_word(df[TEXT_COLUMN]))
    tokenizer = Tokenizer(num_words=num_unique_words)
    tokenizer.fit_on_texts(train_sentences)

    train_padded = _to_padded(tokenizer, train_sentences)
    val_padded = _to_padded(tokenizer, val_sentences)
    test_padded = _to_padded(tokenizer, test_sentences)

    # Store one padded sequence per row. Assigning the 2-D array directly to
    # a single column is expected to be rejected by pandas.
    # NOTE(review): this overwrites the input file test.csv; the path is kept
    # because later pipeline stages may depend on it.
    test_df[TEXT_COLUMN] = test_padded.tolist()
    test_df.to_csv('test.csv')

    model = keras.models.Sequential()
    model.add(layers.Embedding(num_unique_words, 32, input_length=MAX_LENGTH))
    model.add(layers.LSTM(64, dropout=0.1))
    model.add(layers.Dense(1, activation="sigmoid"))
    model.summary()

    loss = keras.losses.BinaryCrossentropy(from_logits=False)
    # `learning_rate` replaces the deprecated `lr` alias.
    optim = keras.optimizers.Adam(learning_rate=0.001)
    model.compile(loss=loss, optimizer=optim, metrics=["accuracy"])

    # Use the configured hyper-parameters instead of a hard-coded epoch count.
    model.fit(train_padded, train_labels,
              epochs=epochs, batch_size=batch_size,
              validation_data=(val_padded, val_labels), verbose=2)

    # Threshold the sigmoid outputs at 0.5 to obtain 0/1 class labels.
    probabilities = model.predict(test_padded)
    predictions = (probabilities.ravel() > 0.5).astype(int).tolist()

    model.save('trained_model')

    with open('results.txt', 'w') as results_file:
        results_file.write(str(predictions))


# `automain` must be the last definition in the file: when the script is
# executed it starts the run immediately, so everything it needs has to be
# defined above. No extra `ex.run()` is required (it would train twice).
@ex.automain
def my_main(epochs, batch_size):
    """Run training and attach the saved model to the current Sacred run."""
    prepare_model()
    # Artifacts have to be registered while the run is active.
    ex.add_artifact(_as_artifact_file('trained_model'))