Patch summary (free text ahead of the first file header; patch tools skip it):
  * Dockerfile: additionally copy stats.py into the image.
  * process_data.sh: stop deleting column_titles.csv in the final cleanup.
  * stats.py: build data.csv by concatenating column_titles.csv with the
    train/dev/test splits, then load that single file with pandas.
NOTE(review): line breaks, blank lines and the indentation of the Python loop
body were reconstructed from the hunk line counts; confirm against the
original commit before applying.

diff --git a/Dockerfile b/Dockerfile
index 84e702f..f0a8dda 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,6 +11,7 @@ RUN pip3 install -r ./requirements.txt
 # Skopiujmy nasz skrypt do katalogu /app w kontenerze
 COPY ./process_data.sh ./
 COPY ./download_data_and_process.py ./
+COPY ./stats.py ./
 
 # Domyślne polecenie, które zostanie uruchomione w kontenerze po jego starcie
 CMD python -u ./download_data_and_process.py
\ No newline at end of file
diff --git a/process_data.sh b/process_data.sh
index 1c40f64..d618df8 100755
--- a/process_data.sh
+++ b/process_data.sh
@@ -12,4 +12,4 @@ head -n $CUTOFF data_not_cutted.csv > data.csv
 sed -n '1,2500p' data.csv > data_test.csv
 sed -n '2501,5000p' data.csv > data_dev.csv
 tail -n +5001 data.csv > data_train.csv
-rm data.csv real-or-fake-fake-jobposting-prediction.zip column_titles.csv data_not_shuf.csv data_not_cutted.csv
\ No newline at end of file
+rm data.csv real-or-fake-fake-jobposting-prediction.zip data_not_shuf.csv data_not_cutted.csv
\ No newline at end of file
diff --git a/stats.py b/stats.py
index 0ce1f25..84da0d4 100644
--- a/stats.py
+++ b/stats.py
@@ -1,11 +1,21 @@
 import subprocess
 import pandas as pd
 import numpy as np
+import os
 
-data=pd.read_csv('data_train.csv')
-data_2=pd.read_csv('data_dev.csv')
-data_3=pd.read_csv('data_test.csv')
-data = pd.concat([data, data_2, data_3], axis=0)
+
+path = ''
+
+all_files = ['column_titles.csv', 'data_train.csv', 'data_dev.csv', 'data_test.csv']
+
+data_file = open("data.csv", "w")
+for name in all_files:
+    f = open(name, "r")
+    data_file.write(f.read())
+    f.close()
+
+data_file.close()
+data=pd.read_csv('data.csv')
 data = data.replace(np.nan, '', regex=True)
 
 print("="*20)