diff --git a/Jenkinsfile b/Jenkinsfile index 1a783e4..3733864 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,5 +1,7 @@ pipeline{ - agent any + agent { + docker { image 'ubuntu:latest' } + } stages{ stage('checkout: Check out from version control'){ steps{ @@ -8,14 +10,7 @@ pipeline{ } stage('sh: Shell Script'){ steps{ - sh '''#!/bin/bash - wget -c https://git.wmi.amu.edu.pl/s434780/ium_434780/src/branch/master/data.csv - head -n -1 data.csv | shuf > data.csv.shuf - wc -l data.csv - head -n 500 data.csv.shuf > test.csv - head -n 500 data.csv.shuf | tail -n 500 > dev.csv - tail -n +501 data.csv.shuf > train.csv - wc -l *.csv ''' + sh './script.sh' } } stage('Archive artifacts'){ diff --git a/Jenkinsfile-stats b/Jenkinsfile-stats new file mode 100644 index 0000000..306304b --- /dev/null +++ b/Jenkinsfile-stats @@ -0,0 +1,25 @@ +pipeline{ + agent any + stages{ + stage('checkout: Check out from version control'){ + steps{ + checkout([$class: 'GitSCM', branches: [[name: '*/master']], doGenerateSubmoduleConfigurations: false, extensions: [], submoduleCfg: [], userRemoteConfigs: [[credentialsId: '321737ab-1c4d-475f-9667-513cf19ba596', url: 'https://git.wmi.amu.edu.pl/s434780/ium_434780.git']]]) + } + } + stage('Copy artifacts'){ + steps{ + copyArtifacts fingerprintArtifacts: true, projectName: 's434780-create-dataset', selector: buildParameter('BUILD_SELECTOR') + } + } + stage('sh: Shell Script'){ + steps{ + sh './stats.sh' + } + } + stage('Archive artifacts'){ + steps{ + archiveArtifacts 'stats.txt' + } + } + } +} \ No newline at end of file diff --git a/main.py b/main.py index 33e45e6..d0d0798 100644 --- a/main.py +++ b/main.py @@ -3,10 +3,10 @@ from sklearn.model_selection import train_test_split def main(): - data = pd.read_csv('resources/Amazon_Consumer_Reviews.csv', header=0, sep=',') + data = pd.read_csv('Amazon_Consumer_Reviews.csv', header=0, sep=',') columns = ['reviews.date', 'reviews.numHelpful', 'reviews.rating', 'reviews.doRecommend'] - string_columns = ['name', 'brand', 'categories', 'primaryCategories', 'keys', 'manufacturer', 'reviews.title', + string_columns = ['name', 'categories', 'primaryCategories', 'manufacturer', 'reviews.title', 'reviews.username', 'reviews.text'] data = data[string_columns + columns] @@ -14,17 +14,19 @@ def main(): for c in string_columns: data[c] = data[c].str.lower() - print("Empty rows summary:") - print(data.isnull().sum()) - data.dropna() + # print("Empty rows summary:") + # print(data.isnull().sum()) + # data["reviews.title"].fillna("No title", inplace = True) + # print(data.isnull().sum()) - data.to_csv('resources/data.csv') + data.to_csv('data.csv') train, test = train_test_split(data, train_size=0.6, random_state=1) test, dev = train_test_split(test, test_size=0.5, random_state=1) - test.to_csv('resources/test.csv') - train.to_csv('resources/train.csv') - dev.to_csv('resources/dev.csv') + + test.to_csv('test.csv') + train.to_csv('train.csv') + dev.to_csv('dev.csv') print("\n\nMean reviews rating for each primary category: ") print(data[["primaryCategories", "reviews.rating"]].groupby("primaryCategories").mean()) diff --git a/stats.sh b/stats.sh index e69de29..8a8812c 100644 --- a/stats.sh +++ b/stats.sh @@ -0,0 +1,2 @@ +#!/bin/bash +wc -l *.csv > stats.txt \ No newline at end of file