create Dockerfile; modify Python scripts and Jenkinsfile
parent 0aa380be3b · commit 848d0db422
Dockerfile (new file, 22 lines)
@@ -0,0 +1,22 @@
FROM python:3.11

# set environment variables (KAGGLE_KEY and RUN_TYPE are supplied at run time)
ENV KAGGLE_USERNAME=filippatyk
ENV KAGGLE_KEY=""
ENV RUN_TYPE=""

# create the working directory
WORKDIR /app

# install Python dependencies
COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

COPY dataset.py ./

# make a directory for the data
RUN mkdir -p ./data

CMD kaggle datasets download -p data --unzip clmentbisaillon/fake-and-real-news-dataset && python ./dataset.py "--$RUN_TYPE"
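
For a local smoke test outside Jenkins, the image can be built and run roughly like this (a sketch, not part of the commit; the dataset-create tag mirrors the Jenkinsfile and the key value is a placeholder):

    docker build -t dataset-create .
    docker run -e KAGGLE_KEY="<your-kaggle-api-key>" -e RUN_TYPE=dataset dataset-create

With RUN_TYPE=dataset, the shell-form CMD expands "--$RUN_TYPE" to --dataset, so dataset.py writes the CSVs; RUN_TYPE=stats selects the statistics path instead.
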
Jenkinsfile (vendored, 24 lines changed)
@@ -28,23 +28,37 @@ node {
      checkout([$class: 'GitSCM', branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[credentialsId: 's424714', url: 'https://git.wmi.amu.edu.pl/s424714/ium_424714']]])
  }

- stage('Shell Script') {
+ stage('Dockerfile build') {
      // create global envs for the kaggle CLI
      withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
               "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
-         sh 'echo KAGGLE_USERNAME: $KAGGLE_USERNAME'
+         // sh 'echo KAGGLE_USERNAME: $KAGGLE_USERNAME'
          sh "chmod +x -R ${env.WORKSPACE}"
-         sh "./create-dataset.sh ${CUTOFF}"
+         // assigned without `def` so the handle lands in the script
+         // binding and stays visible to the next stage
+         dockerImage = docker.build("dataset-create", "./dockerfiles/test")
+         // sh "./create-dataset.sh ${CUTOFF}"
      }
  }
+ stage("DOCKER testing") {
+     withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
+              "KAGGLE_KEY=${params.KAGGLE_KEY}"]) {
+         dockerImage.inside {
+             // KAGGLE_USERNAME and KAGGLE_KEY are already injected by withEnv
+             sh "kaggle datasets download -p data --unzip clmentbisaillon/fake-and-real-news-dataset && python ./dataset.py --dataset"
+             sh "cp data/dataset/dataset.csv ${WORKSPACE}"
+         }
+     }
+ }

  stage('Saving artefacts') {
      echo 'Goodbye!'
-     archiveArtifacts 'data/dataset.csv'
+     archiveArtifacts 'dataset.csv'
  }
 }
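
In a scripted pipeline, a variable declared with `def` inside one stage closure is not visible in the next stage, so an image handle shared across stages belongs in the script binding or at node scope. A minimal sketch of the cross-stage pattern (assuming the Docker Pipeline plugin's docker.build/inside API; the --stats invocation is illustrative):

    node {
        def dockerImage
        stage('Dockerfile build') {
            // handle declared at node scope, assigned here
            dockerImage = docker.build("dataset-create", "./dockerfiles/test")
        }
        stage('DOCKER testing') {
            dockerImage.inside {
                sh 'python ./dataset.py --stats'
            }
        }
    }
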
@@ -569,13 +569,6 @@
     "print(f\"y_val count: {y_val.count()}\")\n",
     "print(f\"y_test count: {y_test.count()}\")\n"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
dataset.py (new file, 74 lines)
@@ -0,0 +1,74 @@
import argparse
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

parser = argparse.ArgumentParser(
    prog="Dataset",
    description="create the dataset or compute its statistics",
)
parser.add_argument("--dataset", action="store_true", default=False)
parser.add_argument("--stats", action="store_true", default=False)

args = parser.parse_args()

TRUE_NEWS_PATH = Path("data/True.csv")
FAKE_NEWS_PATH = Path("data/Fake.csv")

DATA_PATH = TRUE_NEWS_PATH.parent
DATASET_PATH = DATA_PATH / "dataset"

# load the datasets
true_news = pd.read_csv(TRUE_NEWS_PATH)
fake_news = pd.read_csv(FAKE_NEWS_PATH)

# drop unused columns
true_news = true_news.drop(columns=["title", "subject", "date"])
fake_news = fake_news.drop(columns=["title", "subject", "date"])

# binary classification labels: 1 = true news, 0 = fake news
true_news["Value"] = 1
fake_news["Value"] = 0

# merge the datasets
dataset = pd.concat([true_news, fake_news], axis=0)
dataset["text"] = dataset["text"].str.strip()
# reassign the result: with inplace=False, dropna returns a new frame
# instead of modifying dataset
dataset = dataset.dropna(axis=0, how="any", subset=["text"])

# split the dataset into train/val/test
X_train, X_val_test, y_train, y_valtest = train_test_split(
    dataset["text"], dataset["Value"], test_size=0.2, shuffle=True, random_state=20
)

X_test, X_val, y_test, y_val = train_test_split(
    X_val_test, y_valtest, test_size=0.5, shuffle=True, random_state=21
)

train_data = pd.concat([X_train, y_train], axis=1)
val_data = pd.concat([X_val, y_val], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

if args.dataset:
    DATASET_PATH.mkdir(parents=True, exist_ok=True)
    dataset.to_csv((DATASET_PATH / "dataset.csv"), index=False)
    train_data.to_csv((DATASET_PATH / "train.csv"), index=False)
    val_data.to_csv((DATASET_PATH / "val.csv"), index=False)
    test_data.to_csv((DATASET_PATH / "test.csv"), index=False)

    print(dataset)

if args.stats:
    std_stats = [y_train.std(), y_val.std(), y_test.std()]
    mean_stats = [y_train.mean(), y_val.mean(), y_test.mean()]
    count_stats = [y_train.count(), y_val.count(), y_test.count()]
    stats = pd.DataFrame(
        data=[std_stats, mean_stats, count_stats],
        index=["std", "mean", "count"],
        columns=["train", "val", "test"],
    )
    stats.to_csv((DATA_PATH / "stats.csv"))
    print(stats)

if not (args.dataset or args.stats):
    print("NO RUN TYPE SPECIFIED")
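
The two chained train_test_split calls give an 80/10/10 train/val/test split, since the second call halves the 20% hold-out. A self-contained check on toy data (not part of the commit):

    import pandas as pd
    from sklearn.model_selection import train_test_split

    # 100 toy documents with alternating labels
    toy = pd.DataFrame({"text": [f"doc {i}" for i in range(100)],
                        "Value": [i % 2 for i in range(100)]})
    X_train, X_val_test, y_train, y_valtest = train_test_split(
        toy["text"], toy["Value"], test_size=0.2, shuffle=True, random_state=20
    )
    X_test, X_val, y_test, y_val = train_test_split(
        X_val_test, y_valtest, test_size=0.5, shuffle=True, random_state=21
    )
    print(len(X_train), len(X_val), len(X_test))  # -> 80 10 10
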
requirements.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
pandas==1.5.3
scikit-learn==1.2.2
kaggle==1.5.13