Improvements
All checks were successful: s434780-training/pipeline/head (This commit looks good)

sadurska@trui.pl 2021-05-08 22:32:28 +02:00
parent b5b9a795fc
commit 2841a76304
8 changed files with 85073 additions and 15084 deletions


data.csv (33332 changes): file diff suppressed because one or more lines are too long

dataset-Amazon.csv (normal file, 28333 changes): file diff suppressed because one or more lines are too long

dev.csv (6667 changes): file diff suppressed because one or more lines are too long

main.py (29 changes):

@@ -1,9 +1,24 @@
import string
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)
stop = set(stopwords.words("english"))
def remove_stopwords(text):
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)
def main():
    data = pd.read_csv('Amazon_Consumer_Reviews.csv', header=0, sep=',')
    data = pd.read_csv('dataset-Amazon.csv')
    columns = ['reviews.date', 'reviews.numHelpful', 'reviews.rating', 'reviews.doRecommend']
    string_columns = ['name', 'categories', 'primaryCategories', 'manufacturer', 'reviews.title',
@@ -13,11 +28,15 @@ def main():
    for c in string_columns:
        data[c] = data[c].str.lower()
        data[c] = data[c].map(remove_punct)
        data[c] = data[c].map(remove_stopwords)
    # print("Empty rows summary:")
    # print(data.isnull().sum())
    # data["reviews.title"].fillna("No title", inplace = True)
    # print(data.isnull().sum())
    print("Empty rows summary:")
    print(data.isnull().sum())
    data.loc[(data["reviews.rating"] > 3), 'reviews.doRecommend'] = True
    data.loc[(data["reviews.rating"] <= 3), 'reviews.doRecommend'] = False
    data["reviews.doRecommend"] = data["reviews.doRecommend"].astype(int)
    print(data.isnull().sum())
    data.to_csv('data.csv')
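
The hunks shown above stop before any splitting step, yet main.py imports train_test_split and the commit also touches train.csv, dev.csv and test.csv, whose change counts (19999/6667/6666) sit in roughly 60/20/20 proportions. A minimal sketch of how such a split could be produced from the cleaned frame follows; the proportions, the random_state and the to_csv arguments are assumptions for illustration, not taken from the commit:

from sklearn.model_selection import train_test_split

# Hypothetical split of the cleaned data into train/dev/test (~60/20/20);
# not shown in the visible hunks of this commit.
train, rest = train_test_split(data, test_size=0.4, random_state=42)
dev, test = train_test_split(rest, test_size=0.5, random_state=42)
train.to_csv('train.csv', index=False)
dev.to_csv('dev.csv', index=False)
test.to_csv('test.csv', index=False)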

main2.py (130 changes):

@@ -1,107 +1,81 @@
import re
import string
import pandas as pd
from silence_tensorflow import silence_tensorflow
from tensorflow import keras
silence_tensorflow()
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from collections import Counter
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label
def counter_word(text_col):
    count = Counter()
    for text in text_col.values:
        for word in text.split():
            count[word] += 1
    return count
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html, '[%s]' % re.escape(string.punctuation), '')
df = pd.read_csv('data.csv')
train_df = pd.read_csv('train.csv')
val_df = pd.read_csv('dev.csv')
test_df = pd.read_csv('test.csv')
batch_size = 32
seed = 42
df.dropna(subset = ['reviews.text'], inplace = True)
train_df.dropna(subset = ['reviews.text'], inplace = True)
val_df.dropna(subset = ['reviews.text'], inplace = True)
test_df.dropna(subset = ['reviews.text'], inplace = True)
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)
train_sentences = train_df['reviews.text'].to_numpy()
train_labels = train_df['reviews.doRecommend'].to_numpy()
val_sentences = val_df['reviews.text'].to_numpy()
val_labels = val_df['reviews.doRecommend'].to_numpy()
test_sentences = test_df['reviews.text'].to_numpy()
test_labels = test_df['reviews.doRecommend'].to_numpy()
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/test',
    batch_size=batch_size)
# print(train_labels.shape)
# print(train_sentences.shape)
max_features = 10000
sequence_length = 250
counter = counter_word(df['reviews.text'])
num_unique_words = len(counter)
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)
tokenizer = Tokenizer(num_words=num_unique_words)
tokenizer.fit_on_texts(train_sentences)
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)
word_index = tokenizer.word_index
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)
AUTOTUNE = tf.data.AUTOTUNE
max_length = 30
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding="post", truncating="post")
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post", truncating="post")
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)
embedding_dim = 16
model = tf.keras.Sequential([
    layers.Embedding(max_features + 1, embedding_dim),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(1)])
model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))
model.add(layers.LSTM(64, dropout=0.1))
model.add(layers.Dense(1, activation="sigmoid"))
model.summary()
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.0))
loss = keras.losses.BinaryCrossentropy(from_logits=False)
optim = keras.optimizers.Adam(lr = 0.001)
metrics = ["accuracy"]
epochs = 10
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)
model.compile(loss = loss, optimizer = optim, metrics = metrics)
model.fit(train_padded, train_labels, epochs = 20, validation_data=(val_padded, val_labels), verbose=2)
loss, accuracy = model.evaluate(test_ds)
print("Loss: ", loss)
print("Accuracy: ", accuracy)
predictions = model.predict(test_padded)
export_model = tf.keras.Sequential([
    vectorize_layer,
    model,
    layers.Activation('sigmoid')
])
export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)
loss, accuracy = export_model.evaluate(raw_test_ds)
print("Loss: ", loss)
print("Accuracy: ", accuracy)
predictions = [1 if p > 0.5 else 0 for p in predictions]
file = open('results.txt', 'w')
file.write('test loss: ' + str(loss) + '\n' + 'test accuracy: ' + str(accuracy))
file.write(str(predictions))
file.close()
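
The visible code never evaluates the new Tokenizer/pad_sequences model on test_padded; the evaluate calls above belong to the aclImdb tf.data pipeline, so the loss and accuracy written to results.txt do not come from the padded Amazon test split. A minimal sketch of how that evaluation could look, reusing names defined earlier in the script; the evaluate call and the verbose setting are assumptions added for illustration, not part of the commit:

# Hypothetical: score the LSTM on the padded Amazon test split so the
# values written to results.txt are actual test metrics.
test_loss, test_accuracy = model.evaluate(test_padded, test_labels, verbose=0)
probs = model.predict(test_padded).flatten()
predictions = [1 if p > 0.5 else 0 for p in probs]
with open('results.txt', 'w') as f:
    f.write('test loss: ' + str(test_loss) + '\n' + 'test accuracy: ' + str(test_accuracy) + '\n')
    f.write(str(predictions))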

test.csv (6666 changes): file diff suppressed because it is too large

train.csv (19999 changes): file diff suppressed because one or more lines are too long