train neural network

parent f2a5159cfa
commit 801a794e82
Jenkinsfile
@@ -29,11 +29,11 @@ node {
         withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                  "KAGGLE_KEY=${params.KAGGLE_KEY}","CUTOFF=${params.CUTOFF}"]) {
             sh "python3 download_dataset.py '.' 'dataset.csv'"
             sh "ls"
+            sh "python3 train_neural_network.py '.'"
         }
     }
     stage('Archive artifacts') {
-        archiveArtifacts "dataset.csv, train_data.csv, test_data.csv, dev_data.csv"
+        archiveArtifacts "dataset.csv, train_data.csv, test_data.csv, dev_data.csv, neural_network_evaluation.txt"
     }
 }
}
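For context on the withEnv block above: download_dataset.py imports the kaggle client, which, as far as I know, authenticates at import time from the KAGGLE_USERNAME and KAGGLE_KEY environment variables; that is presumably why the credentials are injected around the sh steps. A minimal sketch with placeholder values (the dataset slug is hypothetical, not taken from this repo):

    import os

    # hypothetical credentials; in CI these come from withEnv/params
    os.environ['KAGGLE_USERNAME'] = 'ci-user'
    os.environ['KAGGLE_KEY'] = 'ci-api-key'

    from kaggle import api  # raises at import if no credentials are found
    api.dataset_download_files('owner/fake-job-postings', path='.', unzip=True)  # hypothetical slug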
download_dataset.py
@@ -3,9 +3,11 @@ import os.path
 import sys
 
 import pandas as pd
+import numpy as np
 from kaggle import api
 from pandas import read_csv
 from sklearn.model_selection import train_test_split
+from nltk.tokenize import RegexpTokenizer
 
 
 def download_and_save_dataset(data_path, dataset_name):
@@ -17,8 +19,21 @@ def download_and_save_dataset(data_path, dataset_name):
 
 
 def preprocess_dataset(data):
-    # drop columns with many nulls
-    return data.drop(['job_id', 'department', 'salary_range', 'benefits'], axis=1)
+    data = data.replace(np.nan, '', regex=True)
+
+    data['description'] = data['description'].str.replace(r"\W+", " ", regex=True)
+    data['description'] = data['description'].str.replace(r"url_\w+", " ", regex=True)
+    data['description'] = data['description'].str.replace(r"\s+", " ", regex=True)
+
+    data['text'] = data[['title', 'department', 'company_profile', 'description', 'requirements', 'benefits']].apply(
+        lambda x: ' '.join(x), axis=1)
+    data['text'] = data['text'].str.lower()
+
+    tokenizer = RegexpTokenizer(r'\w+')
+    data['tokens'] = data['text'].apply(tokenizer.tokenize)
+
+    return data.drop(['job_id', 'department', 'company_profile', 'description', 'requirements', 'benefits', 'text'],
+                     axis=1)
 
 
 def split_dataset(data_path, dataset_name):
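A self-contained illustration of what the new preprocess_dataset() body does, run on one invented row (the column values are made up, not from the Kaggle dataset; the steps below mirror the hunk above):

    import numpy as np
    import pandas as pd
    from nltk.tokenize import RegexpTokenizer

    data = pd.DataFrame([{
        'job_id': 1, 'title': 'Data Entry Clerk', 'department': 'Ops',
        'company_profile': '', 'description': 'Work from home!!! url_abc123 Apply now.',
        'requirements': 'None.', 'benefits': np.nan, 'salary_range': np.nan,
        'fraudulent': 1,
    }])

    data = data.replace(np.nan, '', regex=True)
    data['description'] = data['description'].str.replace(r"\W+", " ", regex=True)
    data['description'] = data['description'].str.replace(r"url_\w+", " ", regex=True)
    data['description'] = data['description'].str.replace(r"\s+", " ", regex=True)

    data['text'] = data[['title', 'department', 'company_profile', 'description',
                         'requirements', 'benefits']].apply(lambda x: ' '.join(x), axis=1)
    data['text'] = data['text'].str.lower()

    tokenizer = RegexpTokenizer(r'\w+')
    print(data['text'].apply(tokenizer.tokenize).iloc[0])
    # ['data', 'entry', 'clerk', 'ops', 'work', 'from', 'home', 'apply', 'now', 'none']

Note that the punctuation and the url_ placeholder are scrubbed from description before the text columns are joined, lowercased, and tokenized.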
@@ -38,7 +53,7 @@ def split_dataset(data_path, dataset_name):
 
 
 def save_dataset(data_path, data, name):
-    data.to_csv(os.path.join(data_path, name))
+    data.to_csv(os.path.join(data_path, name), index=False)
 
 
 def main():
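The index=False change stops pandas from writing the row index as an unnamed leading column, which would otherwise come back as a spurious feature when the CSV is re-read. A quick check (the file name is hypothetical):

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2]})
    df.to_csv('demo.csv')                   # old call: index written as first column
    print(pd.read_csv('demo.csv').columns)  # Index(['Unnamed: 0', 'a'], dtype='object')
    df.to_csv('demo.csv', index=False)      # new call: data columns only
    print(pd.read_csv('demo.csv').columns)  # Index(['a'], dtype='object')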
requirements.txt
@@ -1,19 +1,55 @@
absl-py==1.0.0
astunparse==1.6.3
cachetools==5.0.0
certifi==2021.10.8
charset-normalizer==2.0.12
click==8.1.2
flatbuffers==2.0
gast==0.5.3
google-auth==2.6.6
google-auth-oauthlib==0.4.6
google-pasta==0.2.0
grpcio==1.44.0
h5py==3.6.0
idna==3.3
importlib-metadata==4.11.3
joblib==1.1.0
kaggle==1.5.12
keras==2.8.0
Keras-Preprocessing==1.1.2
libclang==14.0.1
Markdown==3.3.6
nltk==3.7
numpy==1.22.3
-pandas==1.4.1
oauthlib==3.2.0
opt-einsum==3.3.0
+pandas==1.4.2
protobuf==3.20.1
pyasn1==0.4.8
pyasn1-modules==0.2.8
python-dateutil==2.8.2
python-slugify==6.1.1
pytz==2022.1
regex==2022.3.15
requests==2.27.1
requests-oauthlib==1.3.1
rsa==4.8
scikit-learn==1.0.2
scipy==1.8.0
six==1.16.0
sklearn==0.0
tensorboard==2.8.0
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.1
tensorflow==2.8.0
tensorflow-io-gcs-filesystem==0.25.0
termcolor==1.1.0
text-unidecode==1.3
tf-estimator-nightly==2.8.0.dev2021122109
threadpoolctl==3.1.0
-tqdm==4.63.1
+tqdm==4.64.0
typing_extensions==4.2.0
urllib3==1.26.9
Werkzeug==2.1.1
wrapt==1.14.0
zipp==3.8.0
train_neural_network.py (new file, 90 lines)
@@ -0,0 +1,90 @@
#!/usr/bin/python
import os
import pprint
import sys

import pandas as pd
from keras.models import Sequential, load_model
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import logging

logging.getLogger("tensorflow").setLevel(logging.ERROR)


def tokenize(x, x_train, x_test, max_len):
    # fit the vocabulary on the full corpus, then index and pad both splits
    tokenizer = Tokenizer(num_words=14000)
    tokenizer.fit_on_texts(x)
    train_x = tokenizer.texts_to_sequences(x_train)
    test_x = tokenizer.texts_to_sequences(x_test)
    vocabulary_length = len(tokenizer.word_index) + 1

    train_x = pad_sequences(train_x, padding='post', maxlen=max_len)
    test_x = pad_sequences(test_x, padding='post', maxlen=max_len)
    return train_x, test_x, vocabulary_length


def evaluate_and_save(model, x, y, abs_path):
    # threshold the sigmoid output at 0.5 and write the metrics to a text file
    loss, accuracy = model.evaluate(x, y, verbose=False)
    y_predicted = (model.predict(x) >= 0.5).astype(int)
    with open(os.path.join(abs_path, 'neural_network_evaluation.txt'), "w") as log_file:
        for obj in (
                ('Accuracy: ', accuracy), ('Loss: ', loss), ('Precision: ', precision_score(y, y_predicted)),
                ('Recall: ', recall_score(y, y_predicted)), ('F1: ', f1_score(y, y_predicted)),
                ('Accuracy: ', accuracy_score(y, y_predicted))):
            pprint.pprint(obj, log_file)


def load_trained_model(abs_path, model_name):
    return load_model(os.path.join(abs_path, model_name))


def save_model(model, abs_path, model_name):
    model.save(os.path.join(abs_path, model_name))


def train_model(model, x_train, y_train):
    model.fit(x_train, y_train, epochs=1, verbose=False, batch_size=50)


def get_model(output_dim, vocabulary_length):
    # embedding -> flatten -> small dense head for binary classification
    model = Sequential()
    model.add(layers.Embedding(input_dim=vocabulary_length,
                               output_dim=output_dim,
                               input_length=100))
    model.add(layers.Flatten())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


def split_data(data):
    x = data['tokens']
    y = data['fraudulent']
    return x, y


def load_data(data_path, filename) -> pd.DataFrame:
    return pd.read_csv(os.path.join(data_path, filename))


def main():
    data_path = sys.argv[1]
    abs_data_path = os.path.abspath(data_path)
    train_data = load_data(abs_data_path, 'train_data.csv')
    test_data = load_data(abs_data_path, 'test_data.csv')
    x_train, y_train = split_data(train_data)
    x_test, y_test = split_data(test_data)
    x_train, x_test, vocab_size = tokenize(pd.concat([x_train, x_test]), x_train, x_test, 100)
    model = get_model(50, vocab_size)
    train_model(model, x_train, y_train)
    # save_model(model, abs_data_path, 'neural_network')
    evaluate_and_save(model, x_test, y_test, abs_data_path)


if __name__ == '__main__':
    main()
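A quick, hedged sanity check of the tokenize() helper above, run on invented strings rather than the real dataset (assumes train_neural_network.py is importable from the working directory):

    import pandas as pd
    from train_neural_network import tokenize

    x_train = pd.Series(["fake job offer easy money", "legitimate engineering role"])
    x_test = pd.Series(["easy money fast"])
    train_ids, test_ids, vocab = tokenize(pd.concat([x_train, x_test]), x_train, x_test, max_len=6)
    print(train_ids.shape)  # (2, 6): sequences zero-padded on the right ('post')
    print(test_ids)         # 'fast' is indexed too, since the fit saw the combined corpus
    print(vocab)            # len(word_index) + 1

This mirrors how main() fits the tokenizer on the concatenation of the train and test text before converting each split separately.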