Improvements
All checks were successful: s434780-training/pipeline/head (This commit looks good)

sadurska@trui.pl committed 2021-05-08 22:32:28 +02:00
parent b5b9a795fc, commit 2841a76304

8 changed files with 85073 additions and 15084 deletions

data.csv: 33332 changed lines (diff suppressed because one or more lines are too long)

dataset-Amazon.csv: new file, 28333 lines (diff suppressed because one or more lines are too long)

dev.csv: 6667 changed lines (diff suppressed because one or more lines are too long)

main.py: 29 changed lines

@@ -1,9 +1,24 @@
+import string
 import pandas as pd
 from sklearn.model_selection import train_test_split
+import nltk
+nltk.download('stopwords')
+from nltk.corpus import stopwords
+
+
+def remove_punct(text):
+    translator = str.maketrans("", "", string.punctuation)
+    return text.translate(translator)
+
+
+stop = set(stopwords.words("english"))
+
+
+def remove_stopwords(text):
+    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
+    return " ".join(filtered_words)
+
+
 def main():
-    data = pd.read_csv('Amazon_Consumer_Reviews.csv', header=0, sep=',')
+    data = pd.read_csv('dataset-Amazon.csv')
     columns = ['reviews.date', 'reviews.numHelpful', 'reviews.rating', 'reviews.doRecommend']
     string_columns = ['name', 'categories', 'primaryCategories', 'manufacturer', 'reviews.title',
@@ -13,11 +28,15 @@ def main():
     for c in string_columns:
         data[c] = data[c].str.lower()
+        data[c] = data[c].map(remove_punct)
+        data[c] = data[c].map(remove_stopwords)
-    # print("Empty rows summary:")
-    # print(data.isnull().sum())
-    # data["reviews.title"].fillna("No title", inplace = True)
-    # print(data.isnull().sum())
+    print("Empty rows summary:")
+    print(data.isnull().sum())
+    data.loc[(data["reviews.rating"] > 3), 'reviews.doRecommend'] = True
+    data.loc[(data["reviews.rating"] <= 3), 'reviews.doRecommend'] = False
+    data["reviews.doRecommend"] = data["reviews.doRecommend"].astype(int)
+    print(data.isnull().sum())
     data.to_csv('data.csv')
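main.py now reads dataset-Amazon.csv, lowercases the text columns and strips punctuation and stopwords, derives reviews.doRecommend from reviews.rating, and writes data.csv. The commit also adds train.csv, dev.csv and test.csv, and main.py imports train_test_split, but the hunks shown here do not include the splitting step. Below is a minimal sketch of how such a split could be produced; the 80/10/10 proportions and the fixed random_state are assumptions, not something visible in this diff.

# Hypothetical sketch (not part of this commit): producing train.csv,
# dev.csv and test.csv from the cleaned data.csv with train_test_split.
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('data.csv')

# Assumed 80/10/10 split: hold out 20%, then halve it into dev and test.
train, rest = train_test_split(data, test_size=0.2, random_state=42)
dev, test = train_test_split(rest, test_size=0.5, random_state=42)

train.to_csv('train.csv', index=False)
dev.to_csv('dev.csv', index=False)
test.to_csv('test.csv', index=False)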

main2.py: 130 changed lines

@@ -1,107 +1,81 @@
-import re
-import string
+import pandas as pd
 from silence_tensorflow import silence_tensorflow
+from tensorflow import keras
 silence_tensorflow()
-import tensorflow as tf
+from tensorflow.keras.preprocessing.text import Tokenizer
+from collections import Counter
+from tensorflow.keras.preprocessing.sequence import pad_sequences
 from tensorflow.keras import layers
-from tensorflow.keras import losses
-from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
-
-
-def vectorize_text(text, label):
-    text = tf.expand_dims(text, -1)
-    return vectorize_layer(text), label
-
-
-def custom_standardization(input_data):
-    lowercase = tf.strings.lower(input_data)
-    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
-    return tf.strings.regex_replace(stripped_html, '[%s]' % re.escape(string.punctuation), '')
-
-
-batch_size = 32
-seed = 42
-
-raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
-    'aclImdb/train',
-    batch_size=batch_size,
-    validation_split=0.2,
-    subset='training',
-    seed=seed)
-raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
-    'aclImdb/train',
-    batch_size=batch_size,
-    validation_split=0.2,
-    subset='validation',
-    seed=seed)
-raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
-    'aclImdb/test',
-    batch_size=batch_size)
-
-max_features = 10000
-sequence_length = 250
-
-vectorize_layer = TextVectorization(
-    standardize=custom_standardization,
-    max_tokens=max_features,
-    output_mode='int',
-    output_sequence_length=sequence_length)
-
-train_text = raw_train_ds.map(lambda x, y: x)
-vectorize_layer.adapt(train_text)
-
-train_ds = raw_train_ds.map(vectorize_text)
-val_ds = raw_val_ds.map(vectorize_text)
-test_ds = raw_test_ds.map(vectorize_text)
-
-AUTOTUNE = tf.data.AUTOTUNE
-
-train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
-val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
-test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)
-
-embedding_dim = 16
-
-model = tf.keras.Sequential([
-    layers.Embedding(max_features + 1, embedding_dim),
-    layers.Dropout(0.2),
-    layers.GlobalAveragePooling1D(),
-    layers.Dropout(0.2),
-    layers.Dense(1)])
+
+
+def counter_word(text_col):
+    count = Counter()
+    for text in text_col.values:
+        for word in text.split():
+            count[word] += 1
+    return count
+
+
+df = pd.read_csv('data.csv')
+train_df = pd.read_csv('train.csv')
+val_df = pd.read_csv('dev.csv')
+test_df = pd.read_csv('test.csv')
+
+df.dropna(subset = ['reviews.text'], inplace = True)
+train_df.dropna(subset = ['reviews.text'], inplace = True)
+val_df.dropna(subset = ['reviews.text'], inplace = True)
+test_df.dropna(subset = ['reviews.text'], inplace = True)
+
+train_sentences = train_df['reviews.text'].to_numpy()
+train_labels = train_df['reviews.doRecommend'].to_numpy()
+val_sentences = val_df['reviews.text'].to_numpy()
+val_labels = val_df['reviews.doRecommend'].to_numpy()
+test_sentences = test_df['reviews.text'].to_numpy()
+test_labels = test_df['reviews.doRecommend'].to_numpy()
+
+# print(train_labels.shape)
+# print(train_sentences.shape)
+
+counter = counter_word(df['reviews.text'])
+num_unique_words = len(counter)
+
+tokenizer = Tokenizer(num_words=num_unique_words)
+tokenizer.fit_on_texts(train_sentences)
+
+word_index = tokenizer.word_index
+
+train_sequences = tokenizer.texts_to_sequences(train_sentences)
+val_sequences = tokenizer.texts_to_sequences(val_sentences)
+test_sequences = tokenizer.texts_to_sequences(test_sentences)
+
+max_length = 30
+
+train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
+val_padded = pad_sequences(val_sequences, maxlen=max_length, padding="post", truncating="post")
+test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post", truncating="post")
+
+model = keras.models.Sequential()
+model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))
+model.add(layers.LSTM(64, dropout=0.1))
+model.add(layers.Dense(1, activation="sigmoid"))
 model.summary()
-model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
-              optimizer='adam',
-              metrics=tf.metrics.BinaryAccuracy(threshold=0.0))
-
-epochs = 10
-history = model.fit(
-    train_ds,
-    validation_data=val_ds,
-    epochs=epochs)
-
-loss, accuracy = model.evaluate(test_ds)
-print("Loss: ", loss)
-print("Accuracy: ", accuracy)
-
-export_model = tf.keras.Sequential([
-    vectorize_layer,
-    model,
-    layers.Activation('sigmoid')
-])
-
-export_model.compile(
-    loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
-)
-
-loss, accuracy = export_model.evaluate(raw_test_ds)
-print("Loss: ", loss)
-print("Accuracy: ", accuracy)
+
+loss = keras.losses.BinaryCrossentropy(from_logits=False)
+optim = keras.optimizers.Adam(lr = 0.001)
+metrics = ["accuracy"]
+
+model.compile(loss = loss, optimizer = optim, metrics = metrics)
+model.fit(train_padded, train_labels, epochs = 20, validation_data=(val_padded, val_labels), verbose=2)
+
+predictions = model.predict(test_padded)
+predictions = [1 if p > 0.5 else 0 for p in predictions]
 
 file = open('results.txt', 'w')
-file.write('test loss: ' + loss + '\n' + 'test accuracy: ' + accuracy)
+file.write(predictions.__str__())
 file.close()
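The rewritten main2.py tokenizes and pads the review text, trains an Embedding + LSTM classifier on reviews.doRecommend, and writes the thresholded predictions to results.txt; test_labels is loaded but never scored. Below is a minimal sketch of how the test metrics could be reported, in the spirit of what the previous version attempted. It assumes the model, test_padded and test_labels variables defined in main2.py are in scope and is not part of this commit.

# Hypothetical sketch (not part of this commit): evaluate on the held-out
# test set and append the metrics to results.txt, assuming model,
# test_padded and test_labels from main2.py are in scope.
loss, accuracy = model.evaluate(test_padded, test_labels, verbose=0)

with open('results.txt', 'a') as f:
    # Format the floats explicitly; the previous version concatenated
    # floats with strings, which raises a TypeError.
    f.write('test loss: ' + str(loss) + '\n' + 'test accuracy: ' + str(accuracy) + '\n')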

test.csv: 6666 changed lines (diff suppressed because it is too large)

train.csv: 19999 changed lines (diff suppressed because one or more lines are too long)