train neural network

AdamOsiowy123 2022-04-24 22:51:20 +02:00
parent f2a5159cfa
commit 801a794e82
4 changed files with 148 additions and 7 deletions

Jenkinsfile

@@ -29,11 +29,11 @@ node {
        withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                 "KAGGLE_KEY=${params.KAGGLE_KEY}", "CUTOFF=${params.CUTOFF}"]) {
            sh "python3 download_dataset.py '.' 'dataset.csv'"
-           sh "ls"
+           sh "python3 train_neural_network.py '.'"
        }
    }
    stage('Archive artifacts') {
-       archiveArtifacts "dataset.csv, train_data.csv, test_data.csv, dev_data.csv"
+       archiveArtifacts "dataset.csv, train_data.csv, test_data.csv, dev_data.csv, neural_network_evaluation.txt"
    }
}
}
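Side note: the KAGGLE_USERNAME / KAGGLE_KEY values injected by withEnv are the credentials the kaggle client reads from the environment. A minimal sketch of what the download side presumably does (the dataset slug below is a placeholder, not taken from this commit; the real one lives in download_dataset.py):

from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()  # reads KAGGLE_USERNAME / KAGGLE_KEY from the environment
# placeholder slug, for illustration only
api.dataset_download_files('some-user/some-dataset', path='.', unzip=True)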

download_dataset.py

@@ -3,9 +3,11 @@ import os.path
import sys
import pandas as pd
+import numpy as np
from kaggle import api
from pandas import read_csv
from sklearn.model_selection import train_test_split
+from nltk.tokenize import RegexpTokenizer

def download_and_save_dataset(data_path, dataset_name):
@@ -17,8 +19,21 @@ def download_and_save_dataset(data_path, dataset_name):
def preprocess_dataset(data):
    # drop columns with many nulls
-    return data.drop(['job_id', 'department', 'salary_range', 'benefits'], axis=1)
+    data = data.replace(np.nan, '', regex=True)
+    data['description'] = data['description'].str.replace(r"\W+", " ", regex=True)
+    data['description'] = data['description'].str.replace(r"url_\w+", " ", regex=True)
+    data['description'] = data['description'].str.replace(r"\s+", " ", regex=True)
+    data['text'] = data[['title', 'department', 'company_profile', 'description', 'requirements', 'benefits']].apply(
+        lambda x: ' '.join(x), axis=1)
+    data['text'] = data['text'].str.lower()
+    tokenizer = RegexpTokenizer(r'\w+')
+    data['tokens'] = data['text'].apply(tokenizer.tokenize)
+    return data.drop(['job_id', 'department', 'company_profile', 'description', 'requirements', 'benefits', 'text'],
+                     axis=1)
def split_dataset(data_path, dataset_name):
@@ -38,7 +53,7 @@ def split_dataset(data_path, dataset_name):
def save_dataset(data_path, data, name):
-    data.to_csv(os.path.join(data_path, name))
+    data.to_csv(os.path.join(data_path, name), index=False)

def main():
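To make the new preprocessing concrete, here is a small self-contained sketch of what the added lines do to one row; the column values are invented for illustration, and only two of the six joined columns are shown:

import pandas as pd
from nltk.tokenize import RegexpTokenizer

row = pd.DataFrame([{'title': 'Data Engineer',
                     'description': 'Build ETL url_ab12 pipelines!!'}])
# the three cleanup passes from the diff, in the same order
row['description'] = (row['description']
                      .str.replace(r"\W+", " ", regex=True)      # punctuation to spaces
                      .str.replace(r"url_\w+", " ", regex=True)  # strip url_ placeholders
                      .str.replace(r"\s+", " ", regex=True))     # collapse whitespace
# concatenate the text columns, lowercase, tokenize
row['text'] = (row['title'] + ' ' + row['description']).str.lower()
row['tokens'] = row['text'].apply(RegexpTokenizer(r'\w+').tokenize)
print(row['tokens'][0])  # ['data', 'engineer', 'build', 'etl', 'pipelines']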

requirements.txt

@@ -1,19 +1,55 @@
+absl-py==1.0.0
+astunparse==1.6.3
+cachetools==5.0.0
certifi==2021.10.8
charset-normalizer==2.0.12
+click==8.1.2
+flatbuffers==2.0
+gast==0.5.3
+google-auth==2.6.6
+google-auth-oauthlib==0.4.6
+google-pasta==0.2.0
+grpcio==1.44.0
+h5py==3.6.0
idna==3.3
+importlib-metadata==4.11.3
joblib==1.1.0
kaggle==1.5.12
+keras==2.8.0
+Keras-Preprocessing==1.1.2
+libclang==14.0.1
+Markdown==3.3.6
+nltk==3.7
numpy==1.22.3
-pandas==1.4.1
+oauthlib==3.2.0
+opt-einsum==3.3.0
+pandas==1.4.2
+protobuf==3.20.1
+pyasn1==0.4.8
+pyasn1-modules==0.2.8
python-dateutil==2.8.2
python-slugify==6.1.1
pytz==2022.1
+regex==2022.3.15
requests==2.27.1
+requests-oauthlib==1.3.1
+rsa==4.8
scikit-learn==1.0.2
scipy==1.8.0
six==1.16.0
sklearn==0.0
+tensorboard==2.8.0
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.8.1
+tensorflow==2.8.0
+tensorflow-io-gcs-filesystem==0.25.0
+termcolor==1.1.0
text-unidecode==1.3
+tf-estimator-nightly==2.8.0.dev2021122109
threadpoolctl==3.1.0
-tqdm==4.63.1
+tqdm==4.64.0
+typing_extensions==4.2.0
urllib3==1.26.9
+Werkzeug==2.1.1
+wrapt==1.14.0
+zipp==3.8.0

train_neural_network.py (new file, 90 lines)

@@ -0,0 +1,90 @@
#!/usr/bin/python
import os
import pprint
import sys

import pandas as pd
from keras.models import Sequential, load_model
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import logging

logging.getLogger("tensorflow").setLevel(logging.ERROR)


def tokenize(x, x_train, x_test, max_len):
    # fit a single tokenizer on the full corpus so train and test share one word index
    tokenizer = Tokenizer(num_words=14000)
    tokenizer.fit_on_texts(x)
    train_x = tokenizer.texts_to_sequences(x_train)
    test_x = tokenizer.texts_to_sequences(x_test)
    vocabulary_length = len(tokenizer.word_index) + 1
    train_x = pad_sequences(train_x, padding='post', maxlen=max_len)
    test_x = pad_sequences(test_x, padding='post', maxlen=max_len)
    return train_x, test_x, vocabulary_length


def evaluate_and_save(model, x, y, abs_path):
    loss, accuracy = model.evaluate(x, y, verbose=False)
    # threshold the sigmoid output at 0.5 to get hard 0/1 labels
    y_predicted = (model.predict(x) >= 0.5).astype(int)
    with open(os.path.join(abs_path, 'neural_network_evaluation.txt'), "w") as log_file:
        for obj in (
                ('Accuracy: ', accuracy), ('Loss: ', loss), ('Precision: ', precision_score(y, y_predicted)),
                ('Recall: ', recall_score(y, y_predicted)), ('F1: ', f1_score(y, y_predicted)),
                ('Accuracy: ', accuracy_score(y, y_predicted))):
            pprint.pprint(obj, log_file)


def load_trained_model(abs_path, model_name):
    return load_model(os.path.join(abs_path, model_name))


def save_model(model, abs_path, model_name):
    model.save(os.path.join(abs_path, model_name))


def train_model(model, x_train, y_train):
    model.fit(x_train, y_train, epochs=1, verbose=False, batch_size=50)


def get_model(output_dim, vocabulary_length):
    model = Sequential()
    model.add(layers.Embedding(input_dim=vocabulary_length,
                               output_dim=output_dim,
                               input_length=100))
    model.add(layers.Flatten())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


def split_data(data):
    x = data['tokens']
    y = data['fraudulent']
    return x, y


def load_data(data_path, filename) -> pd.DataFrame:
    return pd.read_csv(os.path.join(data_path, filename))


def main():
    data_path = sys.argv[1]
    abs_data_path = os.path.abspath(data_path)
    train_data = load_data(abs_data_path, 'train_data.csv')
    test_data = load_data(abs_data_path, 'test_data.csv')
    x_train, y_train = split_data(train_data)
    x_test, y_test = split_data(test_data)
    x_train, x_test, vocab_size = tokenize(pd.concat([x_train, x_test]), x_train, x_test, 100)
    model = get_model(50, vocab_size)
    train_model(model, x_train, y_train)
    # save_model(model, abs_data_path, 'neural_network')
    evaluate_and_save(model, x_test, y_test, abs_data_path)


if __name__ == '__main__':
    main()
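For readers unfamiliar with the Keras text utilities used above: tokenize() fits a single Tokenizer on the combined train and test text so both splits share one word index, and pad_sequences pads or truncates every sequence to max_len=100, which is what lets the Embedding layer declare input_length=100. A toy run with invented inputs, expected outputs in comments:

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

texts = ["fake job offer", "real job"]
tok = Tokenizer(num_words=14000)
tok.fit_on_texts(texts)
seqs = tok.texts_to_sequences(texts)    # [[2, 1, 3], [4, 1]]; 'job' is most frequent, so index 1
padded = pad_sequences(seqs, padding='post', maxlen=5)
print(padded.shape)                     # (2, 5); zeros are appended after each sequence
print(len(tok.word_index) + 1)          # 5; vocabulary size including the reserved 0 index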