train neural network
parent f2a5159cfa
commit 801a794e82
@@ -29,11 +29,11 @@ node {
 withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
          "KAGGLE_KEY=${params.KAGGLE_KEY}","CUTOFF=${params.CUTOFF}"]) {
     sh "python3 download_dataset.py '.' 'dataset.csv'"
-    sh "ls"
+    sh "python3 train_neural_network.py '.'"
 }
 }
 stage('Archive artifacts') {
-    archiveArtifacts "dataset.csv, train_data.csv, test_data.csv, dev_data.csv"
+    archiveArtifacts "dataset.csv, train_data.csv, test_data.csv, dev_data.csv, neural_network_evaluation.txt"
 }
 }
 }
@@ -3,9 +3,11 @@ import os.path
 import sys
 
 import pandas as pd
+import numpy as np
 from kaggle import api
 from pandas import read_csv
 from sklearn.model_selection import train_test_split
+from nltk.tokenize import RegexpTokenizer
 
 
 def download_and_save_dataset(data_path, dataset_name):
@@ -17,8 +19,21 @@ def download_and_save_dataset(data_path, dataset_name):
 
 
 def preprocess_dataset(data):
-    # drop columns with many nulls
-    return data.drop(['job_id', 'department', 'salary_range', 'benefits'], axis=1)
+    data = data.replace(np.nan, '', regex=True)
+
+    data['description'] = data['description'].str.replace(r"\W+", " ", regex=True)
+    data['description'] = data['description'].str.replace(r"url_\w+", " ", regex=True)
+    data['description'] = data['description'].str.replace(r"\s+", " ", regex=True)
+
+    data['text'] = data[['title', 'department', 'company_profile', 'description', 'requirements', 'benefits']].apply(
+        lambda x: ' '.join(x), axis=1)
+    data['text'] = data['text'].str.lower()
+
+    tokenizer = RegexpTokenizer(r'\w+')
+    data['tokens'] = data['text'].apply(tokenizer.tokenize)
+
+    return data.drop(['job_id', 'department', 'company_profile', 'description', 'requirements', 'benefits', 'text'],
+                     axis=1)
 
 
 def split_dataset(data_path, dataset_name):
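As a sanity check on the new preprocessing, the sketch below (not part of the commit; toy values, column names taken from the script itself) walks one made-up posting through the same steps: NaN replacement, description clean-up, text concatenation, lowercasing and tokenization.

import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer

# One made-up posting with the columns preprocess_dataset touches (values are invented).
row = pd.DataFrame([{
    'job_id': 1, 'title': 'Data Engineer', 'department': 'IT',
    'salary_range': np.nan, 'company_profile': 'Example Corp',
    'description': 'Build ETL url_abc123 pipelines!!',
    'requirements': 'Python', 'benefits': np.nan, 'fraudulent': 0,
}])

row = row.replace(np.nan, '', regex=True)                        # NaN -> ''
for pattern in (r"\W+", r"url_\w+", r"\s+"):                     # same order as the commit
    row['description'] = row['description'].str.replace(pattern, " ", regex=True)
row['text'] = row[['title', 'department', 'company_profile', 'description',
                   'requirements', 'benefits']].apply(lambda x: ' '.join(x), axis=1)
row['text'] = row['text'].str.lower()
row['tokens'] = row['text'].apply(RegexpTokenizer(r'\w+').tokenize)
print(row.loc[0, 'tokens'])
# -> ['data', 'engineer', 'it', 'example', 'corp', 'build', 'etl', 'pipelines', 'python']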
@@ -38,7 +53,7 @@ def split_dataset(data_path, dataset_name):
 
 
 def save_dataset(data_path, data, name):
-    data.to_csv(os.path.join(data_path, name))
+    data.to_csv(os.path.join(data_path, name), index=False)
 
 
 def main():
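The added index=False is what keeps the split CSVs clean on round-trip: without it pandas writes the row index as an extra unnamed column, which read_csv later surfaces as 'Unnamed: 0'. A minimal illustration (file names are made up):

import pandas as pd

df = pd.DataFrame({'a': [1, 2]})
df.to_csv('with_index.csv')                  # writes a leading, unnamed index column
df.to_csv('without_index.csv', index=False)  # writes only the data columns

print(pd.read_csv('with_index.csv').columns.tolist())     # ['Unnamed: 0', 'a']
print(pd.read_csv('without_index.csv').columns.tolist())  # ['a']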
@@ -1,19 +1,55 @@
+absl-py==1.0.0
+astunparse==1.6.3
+cachetools==5.0.0
 certifi==2021.10.8
 charset-normalizer==2.0.12
+click==8.1.2
+flatbuffers==2.0
+gast==0.5.3
+google-auth==2.6.6
+google-auth-oauthlib==0.4.6
+google-pasta==0.2.0
+grpcio==1.44.0
+h5py==3.6.0
 idna==3.3
+importlib-metadata==4.11.3
 joblib==1.1.0
 kaggle==1.5.12
+keras==2.8.0
+Keras-Preprocessing==1.1.2
+libclang==14.0.1
+Markdown==3.3.6
+nltk==3.7
 numpy==1.22.3
-pandas==1.4.1
+oauthlib==3.2.0
+opt-einsum==3.3.0
+pandas==1.4.2
+protobuf==3.20.1
+pyasn1==0.4.8
+pyasn1-modules==0.2.8
 python-dateutil==2.8.2
 python-slugify==6.1.1
 pytz==2022.1
+regex==2022.3.15
 requests==2.27.1
+requests-oauthlib==1.3.1
+rsa==4.8
 scikit-learn==1.0.2
 scipy==1.8.0
 six==1.16.0
 sklearn==0.0
+tensorboard==2.8.0
+tensorboard-data-server==0.6.1
+tensorboard-plugin-wit==1.8.1
+tensorflow==2.8.0
+tensorflow-io-gcs-filesystem==0.25.0
+termcolor==1.1.0
 text-unidecode==1.3
+tf-estimator-nightly==2.8.0.dev2021122109
 threadpoolctl==3.1.0
-tqdm==4.63.1
+tqdm==4.64.0
+typing_extensions==4.2.0
 urllib3==1.26.9
+Werkzeug==2.1.1
+wrapt==1.14.0
+zipp==3.8.0

train_neural_network.py (new file, 90 lines)
@@ -0,0 +1,90 @@
+#!/usr/bin/python
+import os
+import pprint
+import sys
+import pandas as pd
+from keras.models import Sequential, load_model
+from keras import layers
+from keras.preprocessing.text import Tokenizer
+from keras.preprocessing.sequence import pad_sequences
+from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
+import logging
+
+logging.getLogger("tensorflow").setLevel(logging.ERROR)
+
+
+def tokenize(x, x_train, x_test, max_len):
+    tokenizer = Tokenizer(num_words=14000)
+    tokenizer.fit_on_texts(x)
+    train_x = tokenizer.texts_to_sequences(x_train)
+    test_x = tokenizer.texts_to_sequences(x_test)
+    vocabulary_length = len(tokenizer.word_index) + 1
+
+    train_x = pad_sequences(train_x, padding='post', maxlen=max_len)
+    test_x = pad_sequences(test_x, padding='post', maxlen=max_len)
+    return train_x, test_x, vocabulary_length
+
+
+def evaluate_and_save(model, x, y, abs_path):
+    loss, accuracy = model.evaluate(x, y, verbose=False)
+    y_predicted = (model.predict(x) >= 0.5).astype(int)
+    with open(os.path.join(abs_path, 'neural_network_evaluation.txt'), "w") as log_file:
+        for obj in (
+                ('Accuracy: ', accuracy), ('Loss: ', loss), ('Precision: ', precision_score(y, y_predicted)),
+                ('Recall: ', recall_score(y, y_predicted)), ('F1: ', f1_score(y, y_predicted)),
+                ('Accuracy: ', accuracy_score(y, y_predicted))):
+            pprint.pprint(obj, log_file)
+
+
+def load_trained_model(abs_path, model_name):
+    return load_model(os.path.join(abs_path, model_name))
+
+
+def save_model(model, abs_path, model_name):
+    model.save(os.path.join(abs_path, model_name))
+
+
+def train_model(model, x_train, y_train):
+    model.fit(x_train, y_train, epochs=1, verbose=False, batch_size=50)
+
+
+def get_model(output_dim, vocabulary_length):
+    model = Sequential()
+    model.add(layers.Embedding(input_dim=vocabulary_length,
+                               output_dim=output_dim,
+                               input_length=100))
+    model.add(layers.Flatten())
+    model.add(layers.Dense(10, activation='relu'))
+    model.add(layers.Dense(1, activation='sigmoid'))
+    model.compile(optimizer='adam',
+                  loss='binary_crossentropy',
+                  metrics=['accuracy'])
+    return model
+
+
+def split_data(data):
+    x = data['tokens']
+    y = data['fraudulent']
+    return x, y
+
+
+def load_data(data_path, filename) -> pd.DataFrame:
+    return pd.read_csv(os.path.join(data_path, filename))
+
+
+def main():
+    data_path = sys.argv[1]
+    abs_data_path = os.path.abspath(data_path)
+    train_data = load_data(abs_data_path, 'train_data.csv')
+    test_data = load_data(abs_data_path, 'test_data.csv')
+    x_train, y_train = split_data(train_data)
+    x_test, y_test = split_data(test_data)
+    x_train, x_test, vocab_size = tokenize(pd.concat([x_train, x_test]), x_train, x_test, 100)
+    model = get_model(50, vocab_size)
+    train_model(model, x_train, y_train)
+    # save_model(model, abs_data_path, 'neural_network')
+    evaluate_and_save(model, x_test, y_test, abs_data_path)
+
+
+if __name__ == '__main__':
+    main()
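For orientation, here is a rough sketch (not part of the commit) of the layer shapes and parameter counts get_model implies, assuming the keras==2.8.0 API pinned in requirements.txt and a placeholder vocabulary size (the real value is len(tokenizer.word_index) + 1):

from keras import layers
from keras.models import Sequential

vocab = 14001  # placeholder; set by tokenize() at runtime
model = Sequential([
    layers.Embedding(input_dim=vocab, output_dim=50, input_length=100),  # (None, 100, 50), 50 * vocab params
    layers.Flatten(),                                                    # (None, 5000)
    layers.Dense(10, activation='relu'),                                 # 5000 * 10 + 10 = 50,010 params
    layers.Dense(1, activation='sigmoid'),                               # 10 + 1 = 11 params
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

evaluate_and_save then writes the metric tuples to neural_network_evaluation.txt via pprint, which is the file the updated 'Archive artifacts' stage collects.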