parent b5b9a795fc
commit 2841a76304
dataset-Amazon.csv: 28333 lines (Normal file)
File diff suppressed because one or more lines are too long
main.py: 29 changed lines
@@ -1,9 +1,24 @@
+import string
 import pandas as pd
 from sklearn.model_selection import train_test_split
+import nltk
+nltk.download('stopwords')
+from nltk.corpus import stopwords
 
 
+def remove_punct(text):
+    translator = str.maketrans("", "", string.punctuation)
+    return text.translate(translator)
+
+
+stop = set(stopwords.words("english"))
+def remove_stopwords(text):
+    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
+    return " ".join(filtered_words)
+
+
 def main():
-    data = pd.read_csv('Amazon_Consumer_Reviews.csv', header=0, sep=',')
+    data = pd.read_csv('dataset-Amazon.csv')
 
     columns = ['reviews.date', 'reviews.numHelpful', 'reviews.rating', 'reviews.doRecommend']
     string_columns = ['name', 'categories', 'primaryCategories', 'manufacturer', 'reviews.title',
@@ -13,11 +28,15 @@ def main():
 
     for c in string_columns:
         data[c] = data[c].str.lower()
+        data[c] = data[c].map(remove_punct)
+        data[c] = data[c].map(remove_stopwords)
 
-    # print("Empty rows summary:")
-    # print(data.isnull().sum())
-    # data["reviews.title"].fillna("No title", inplace = True)
-    # print(data.isnull().sum())
+    print("Empty rows summary:")
+    print(data.isnull().sum())
+    data.loc[(data["reviews.rating"] > 3), 'reviews.doRecommend'] = True
+    data.loc[(data["reviews.rating"] <= 3), 'reviews.doRecommend'] = False
+    data["reviews.doRecommend"] = data["reviews.doRecommend"].astype(int)
+    print(data.isnull().sum())
 
     data.to_csv('data.csv')
 
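For context, a minimal usage sketch of the cleaning helpers added to main.py above. The helper definitions are copied from the diff; the sample review string is invented for illustration, and the NLTK stopwords corpus is assumed to be downloaded as in the diff.

# Usage sketch only; the sample string below is made up for illustration.
import string

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = set(stopwords.words("english"))


def remove_punct(text):
    # Strip every punctuation character using a translation table.
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)


def remove_stopwords(text):
    # Lower-case tokens and drop English stopwords.
    filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(filtered_words)


sample = "This is a great tablet, and the battery lasts for days!"
print(remove_stopwords(remove_punct(sample)))
# Expected output: "great tablet battery lasts days"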
main2.py: 130 changed lines
@@ -1,107 +1,81 @@
-import re
-import string
+import pandas as pd
 from silence_tensorflow import silence_tensorflow
+from tensorflow import keras
 
 silence_tensorflow()
-import tensorflow as tf
+from tensorflow.keras.preprocessing.text import Tokenizer
+from collections import Counter
+from tensorflow.keras.preprocessing.sequence import pad_sequences
 from tensorflow.keras import layers
-from tensorflow.keras import losses
-from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
 
 
-def vectorize_text(text, label):
-    text = tf.expand_dims(text, -1)
-    return vectorize_layer(text), label
+def counter_word(text_col):
+    count = Counter()
+    for text in text_col.values:
+        for word in text.split():
+            count[word] += 1
+    return count
 
 
-def custom_standardization(input_data):
-    lowercase = tf.strings.lower(input_data)
-    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
-    return tf.strings.regex_replace(stripped_html, '[%s]' % re.escape(string.punctuation), '')
+df = pd.read_csv('data.csv')
+train_df = pd.read_csv('train.csv')
+val_df = pd.read_csv('dev.csv')
+test_df = pd.read_csv('test.csv')
 
 
-batch_size = 32
-seed = 42
+df.dropna(subset = ['reviews.text'], inplace = True)
+train_df.dropna(subset = ['reviews.text'], inplace = True)
+val_df.dropna(subset = ['reviews.text'], inplace = True)
+test_df.dropna(subset = ['reviews.text'], inplace = True)
 
-raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
-    'aclImdb/train',
-    batch_size=batch_size,
-    validation_split=0.2,
-    subset='training',
-    seed=seed)
 
-raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
-    'aclImdb/train',
-    batch_size=batch_size,
-    validation_split=0.2,
-    subset='validation',
-    seed=seed)
+train_sentences = train_df['reviews.text'].to_numpy()
+train_labels = train_df['reviews.doRecommend'].to_numpy()
+val_sentences = val_df['reviews.text'].to_numpy()
+val_labels = val_df['reviews.doRecommend'].to_numpy()
+test_sentences = test_df['reviews.text'].to_numpy()
+test_labels = test_df['reviews.doRecommend'].to_numpy()
 
-raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
-    'aclImdb/test',
-    batch_size=batch_size)
+# print(train_labels.shape)
+# print(train_sentences.shape)
 
-max_features = 10000
-sequence_length = 250
+counter = counter_word(df['reviews.text'])
+num_unique_words = len(counter)
 
-vectorize_layer = TextVectorization(
-    standardize=custom_standardization,
-    max_tokens=max_features,
-    output_mode='int',
-    output_sequence_length=sequence_length)
+tokenizer = Tokenizer(num_words=num_unique_words)
+tokenizer.fit_on_texts(train_sentences)
 
-train_text = raw_train_ds.map(lambda x, y: x)
-vectorize_layer.adapt(train_text)
+word_index = tokenizer.word_index
 
-train_ds = raw_train_ds.map(vectorize_text)
-val_ds = raw_val_ds.map(vectorize_text)
-test_ds = raw_test_ds.map(vectorize_text)
+train_sequences = tokenizer.texts_to_sequences(train_sentences)
+val_sequences = tokenizer.texts_to_sequences(val_sentences)
+test_sequences = tokenizer.texts_to_sequences(test_sentences)
 
-AUTOTUNE = tf.data.AUTOTUNE
+max_length = 30
+train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
+val_padded = pad_sequences(val_sequences, maxlen=max_length, padding="post", truncating="post")
+test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post", truncating="post")
 
-train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
-val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
-test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)
 
-embedding_dim = 16
-
-model = tf.keras.Sequential([
-    layers.Embedding(max_features + 1, embedding_dim),
-    layers.Dropout(0.2),
-    layers.GlobalAveragePooling1D(),
-    layers.Dropout(0.2),
-    layers.Dense(1)])
+model = keras.models.Sequential()
+model.add(layers.Embedding(num_unique_words, 32, input_length=max_length))
+model.add(layers.LSTM(64, dropout=0.1))
+model.add(layers.Dense(1, activation="sigmoid"))
 
 model.summary()
 
-model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
-              optimizer='adam',
-              metrics=tf.metrics.BinaryAccuracy(threshold=0.0))
+loss = keras.losses.BinaryCrossentropy(from_logits=False)
+optim = keras.optimizers.Adam(lr = 0.001)
+metrics = ["accuracy"]
 
-epochs = 10
-history = model.fit(
-    train_ds,
-    validation_data=val_ds,
-    epochs=epochs)
+model.compile(loss = loss, optimizer = optim, metrics = metrics)
+model.fit(train_padded, train_labels, epochs = 20, validation_data=(val_padded, val_labels), verbose=2)
 
-loss, accuracy = model.evaluate(test_ds)
-print("Loss: ", loss)
-print("Accuracy: ", accuracy)
+predictions = model.predict(test_padded)
 
-export_model = tf.keras.Sequential([
-    vectorize_layer,
-    model,
-    layers.Activation('sigmoid')
-])
+predictions = [1 if p > 0.5 else 0 for p in predictions]
 
-export_model.compile(
-    loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
-)
-
-loss, accuracy = export_model.evaluate(raw_test_ds)
-print("Loss: ", loss)
-print("Accuracy: ", accuracy)
-
 file = open('results.txt', 'w')
-file.write('test loss: ' + loss + '\n' + 'test accuracy: ' + accuracy)
+file.write(predictions.__str__())
 file.close()
 
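Note: the new main2.py reads train.csv, dev.csv and test.csv, but this commit does not show how those files are produced. Below is a minimal sketch of one way they could be created from data.csv with the train_test_split that main.py already imports; the output file names, the 80/10/10 split and the random_state are assumptions, not part of the commit.

# Hypothetical split script; not part of this commit.
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('data.csv')

# 80% train, 10% dev, 10% test; the ratio and random_state are assumed.
train_df, rest_df = train_test_split(data, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(rest_df, test_size=0.5, random_state=42)

train_df.to_csv('train.csv', index=False)
val_df.to_csv('dev.csv', index=False)
test_df.to_csv('test.csv', index=False)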