From 848d0db422f1e4c5bfdcde82826fd3fde20baed9 Mon Sep 17 00:00:00 2001
From: s424714
Date: Mon, 3 Apr 2023 15:31:55 +0200
Subject: [PATCH] create Dockerfile; modify Python scripts and Jenkinsfile

---
 Dockerfile       | 22 ++++++++++++++
 Jenkinsfile      | 22 +++++++++++---
 dane.ipynb       |  7 -----
 dataset.py       | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |  3 ++
 test.sh          |  3 +-
 6 files changed, 119 insertions(+), 12 deletions(-)
 create mode 100644 Dockerfile
 create mode 100644 dataset.py
 create mode 100644 requirements.txt

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..8483205
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,22 @@
+FROM python:3.11
+
+# setting env variables (defaults, can be overridden at run time)
+ENV KAGGLE_USERNAME=filippatyk
+ENV KAGGLE_KEY=""
+ENV RUN_TYPE=""
+
+# create working directory
+WORKDIR /app
+
+# install python dependencies
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY dataset.py ./
+
+# make dir for data
+RUN mkdir -p ./data
+
+
+CMD kaggle datasets download -p data --unzip clmentbisaillon/fake-and-real-news-dataset && python ./dataset.py "--$RUN_TYPE"
+
diff --git a/Jenkinsfile b/Jenkinsfile
index be94d01..04d55c7 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -28,23 +28,37 @@ node {
         checkout([$class: 'GitSCM', branches: [[name: '*/master']], extensions: [], userRemoteConfigs: [[credentialsId: 's424714', url: 'https://git.wmi.amu.edu.pl/s424714/ium_424714']]])
     }
-    stage('Shell Script') {
+    stage('Dockerfile build') {
         // creating global envs for kaggle CLI
         withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
                 "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
-                    sh 'echo KAGGLE_USERNAME: $KAGGLE_USERNAME'
+                    // sh 'echo KAGGLE_USERNAME: $KAGGLE_USERNAME'
                     sh "chmod +x -R ${env.WORKSPACE}"
-                    sh "./create-dataset.sh ${CUTOFF}"
+
+                    dockerImage = docker.build("dataset-create", ".")
+                    // sh "./create-dataset.sh ${CUTOFF}"
                 }
+    }
+    stage('Docker testing') {
+        withEnv(["KAGGLE_USERNAME=${params.KAGGLE_USERNAME}",
+                "KAGGLE_KEY=${params.KAGGLE_KEY}" ]) {
+                    // sh 'echo KAGGLE_USERNAME: $KAGGLE_USERNAME'
+                    dockerImage.inside {
+                        // KAGGLE_USERNAME and KAGGLE_KEY are already exported by withEnv
+                        sh "kaggle datasets download -p data --unzip clmentbisaillon/fake-and-real-news-dataset && python ./dataset.py --dataset"
+                        sh "cp data/dataset/dataset.csv ${WORKSPACE}"
+                    }
+                    // sh "./create-dataset.sh ${CUTOFF}"
+                }
     }
     stage('Saving artefacts') {
         echo 'Goodbye!'
-        archiveArtifacts 'data/dataset.csv'
+        archiveArtifacts 'dataset.csv'
     }
 }
\ No newline at end of file
diff --git a/dane.ipynb b/dane.ipynb
index 53b8a44..d38bcf5 100644
--- a/dane.ipynb
+++ b/dane.ipynb
@@ -569,13 +569,6 @@
     "print(f\"y_val count: {y_val.count()}\")\n",
     "print(f\"y_test count: {y_test.count()}\")\n"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
 "metadata": {
diff --git a/dataset.py b/dataset.py
new file mode 100644
index 0000000..aa170f4
--- /dev/null
+++ b/dataset.py
@@ -0,0 +1,74 @@
+import argparse
+from pathlib import Path
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+parser = argparse.ArgumentParser(
+    prog="Dataset",
+    description="create dataset splits or compute label stats",
+)
+parser.add_argument("--dataset", action="store_true", default=False)
+parser.add_argument("--stats", action="store_true", default=False)
+
+args = parser.parse_args()
+
+TRUE_NEWS_PATH = Path("data/True.csv")
+FAKE_NEWS_PATH = Path("data/Fake.csv")
+
+DATA_PATH = TRUE_NEWS_PATH.parent
+DATASET_PATH = DATA_PATH / "dataset"
+
+# loading datasets
+true_news = pd.read_csv(TRUE_NEWS_PATH)
+fake_news = pd.read_csv(FAKE_NEWS_PATH)
+
+# dropping unused columns
+true_news = true_news.drop(columns=["title", "subject", "date"])
+fake_news = fake_news.drop(columns=["title", "subject", "date"])
+
+# setting binary classification labels
+true_news["Value"] = 1
+fake_news["Value"] = 0
+
+# merging datasets and dropping rows with missing text
+dataset = pd.concat([true_news, fake_news], axis=0)
+dataset["text"] = dataset["text"].str.strip()
+dataset = dataset.dropna(axis=0, how="any", subset=["text"])
+
+# splitting dataset into train, val, test
+X_train, X_val_test, y_train, y_val_test = train_test_split(
+    dataset["text"], dataset["Value"], test_size=0.2, shuffle=True, random_state=20
+)
+
+X_test, X_val, y_test, y_val = train_test_split(
+    X_val_test, y_val_test, test_size=0.5, shuffle=True, random_state=21
+)
+
+train_data = pd.concat([X_train, y_train], axis=1)
+val_data = pd.concat([X_val, y_val], axis=1)
+test_data = pd.concat([X_test, y_test], axis=1)
+
+if args.dataset:
+    DATASET_PATH.mkdir(parents=True, exist_ok=True)
+    dataset.to_csv((DATASET_PATH / "dataset.csv"), index=False)
+    train_data.to_csv((DATASET_PATH / "train.csv"), index=False)
+    val_data.to_csv((DATASET_PATH / "val.csv"), index=False)
+    test_data.to_csv((DATASET_PATH / "test.csv"), index=False)
+
+    print(dataset)
+
+if args.stats:
+    std_stats = [y_train.std(), y_val.std(), y_test.std()]
+    mean_stats = [y_train.mean(), y_val.mean(), y_test.mean()]
+    count_stats = [y_train.count(), y_val.count(), y_test.count()]
+    stats = pd.DataFrame(
+        data=[std_stats, mean_stats, count_stats],
+        index=["std", "mean", "count"],
+        columns=["train", "val", "test"],
+    )
+    stats.to_csv((DATA_PATH / "stats.csv"))
+    print(stats)
+
+if not (args.dataset or args.stats):
+    print("NO RUN TYPE SPECIFIED")
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..426b98b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+pandas==1.5.3
+scikit-learn==1.2.2
+kaggle==1.5.13
\ No newline at end of file
diff --git a/test.sh b/test.sh
index 28a9006..4fbb703 100755
--- a/test.sh
+++ b/test.sh
@@ -1 +1,2 @@
-echo "Witaj $1"
\ No newline at end of file
+KUPA=$1
+echo "Witaj --$KUPA"
\ No newline at end of file
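
--
Usage note (not part of the applied diff): a minimal sketch of how the new image
could be built and run locally, assuming Docker is installed and you have valid
Kaggle credentials. The "dataset-create" tag mirrors the Jenkinsfile; the
credential placeholders are hypothetical; RUN_TYPE=dataset selects dataset.py's
--dataset mode (RUN_TYPE=stats selects --stats):

    # build from the repository root, where the new Dockerfile lives
    docker build -t dataset-create .

    # the CMD downloads the Kaggle dataset, then runs: python ./dataset.py "--$RUN_TYPE"
    docker run --rm \
        -e KAGGLE_USERNAME=<your-kaggle-username> \
        -e KAGGLE_KEY=<your-kaggle-key> \
        -e RUN_TYPE=dataset \
        dataset-create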