bash script for fetching data

This commit is contained in:
s434765 2021-03-27 17:28:33 +01:00
parent 78a084b498
commit cd7e0040cc
3 changed files with 25 additions and 100028 deletions

100001
data.csv

File diff suppressed because it is too large Load Diff

19
get_data.sh Executable file
View File

@ -0,0 +1,19 @@
#!/bin/bash
# Fetch the "youtube-trend-with-subscriber" dataset from Kaggle, print a short
# summary of the CSV it contains, and split the data rows into shuffled
# test/dev/train subsets. Finally hand the CSV to main.py for statistics.
#
# Requirements: the `kaggle` CLI, `unzip`, GNU coreutils (`head`, `tail`,
# `shuf`, `wc`), `grep`, and `python` with main.py in the current directory.
#
# Files written to the current directory:
#   data_shuf   all data rows (header line excluded) in random order
#   data_test   the first SUBSET_SIZE rows of data_shuf
#   data_dev    the next SUBSET_SIZE rows of data_shuf
#   data_train  every remaining row of data_shuf
#
# Exit status: non-zero when the download, the extraction, or any later step
# fails.
set -euo pipefail

readonly DATASET="sgonkaggle/youtube-trend-with-subscriber"
readonly ARCHIVE="youtube-trend-with-subscriber.zip"
readonly CSV="USvideos_modified.csv"
readonly SUBSET_SIZE=544 # rows in each of the test and dev subsets

# unzip -o: overwrite files left by an earlier run instead of prompting, so
# the script can be re-run non-interactively.
if kaggle datasets download -d "${DATASET}" && unzip -o "${ARCHIVE}"; then
  # Show the header line and the first data row.
  head -n 2 "${CSV}"

  # Print the file with blank lines removed. Only the CSV is passed as input:
  # an extra "-" operand would make grep read stdin as well (blocking on a
  # terminal) and prefix every output line with a file name.
  grep -v -e "^$" "${CSV}"

  COUNT=$(wc -l "${CSV}")
  echo "${COUNT}"

  # Shuffle the data rows only. `tail -n +2` skips the header line so that it
  # cannot end up in the middle of one of the subsets.
  tail -n +2 "${CSV}" | shuf > "data_shuf"

  # Disjoint partition of data_shuf:
  #   test  = rows 1 .. SUBSET_SIZE
  #   dev   = rows SUBSET_SIZE+1 .. 2*SUBSET_SIZE
  #   train = rows 2*SUBSET_SIZE+1 .. end
  head -n "${SUBSET_SIZE}" data_shuf > data_test
  head -n "$((2 * SUBSET_SIZE))" data_shuf | tail -n "${SUBSET_SIZE}" > data_dev
  tail -n "+$((2 * SUBSET_SIZE + 1))" data_shuf > data_train

  echo "Shuffled dataset"
  wc -l data_shuf
  echo "Test dataset"
  wc -l data_test
  echo "Dev dataset"
  wc -l data_dev
  echo "Train dataset"
  wc -l data_train

  python main.py "${CSV}"
else
  # Report the failure instead of silently exiting with status 0.
  echo "Failed to download or unpack ${DATASET}" >&2
  exit 1
fi

33
main.py
View File

@ -1,29 +1,8 @@
# NOTE(review): this listing comes from a rendered diff ("@ -1,29 +1,8 @")
# whose +/- markers are not visible, so statements from the previous and the
# current version of main.py appear interleaved below. Confirm against the
# repository which lines belong to the current file before running it.
import subprocess
# NOTE(review): a "#!" line only acts as a shebang when it is the very first
# line of the file; here it is an ordinary comment.
#!/usr/bin/python
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
# Download a dataset archive by running the kaggle CLI through the shell and
# wait for the command to finish.
# NOTE(review): stdout is piped but never read; the subprocess documentation
# warns that wait() can deadlock in that case if the child fills the pipe.
process = subprocess.Popen("kaggle datasets download -d alwinjoseph/stress-detection-of-medical-partitioners",
shell=True, stdout=subprocess.PIPE)
process.wait()
# Unpack the downloaded archive with `tar -xf` and wait for it to finish.
process = subprocess.Popen("tar -xf stress-detection-of-medical-partitioners.zip", shell=True, stdout=subprocess.PIPE)
process.wait()
# Load data.csv from the current directory.
data = pd.read_csv('data.csv')
# Split into train/test with 50000 rows in the test part, a fixed random seed,
# and stratification on the "Alcohol_usage" column.
data_train, data_test = train_test_split(data, test_size=50000, random_state=1, stratify=data["Alcohol_usage"])
# Print row and column counts for the full data set and for each split.
print("All data")
print("Number of rows ", data.shape[0])
print("Number of columns ", data.shape[1])
print("\nTrain data")
print("Number of rows ", data_train.shape[0])
print("Number of columns ", data_train.shape[1])
print("\nTest data")
print("Number of rows ", data_test.shape[0])
print("Number of columns ", data_test.shape[1])
print("\n")
# Summary statistics over all columns, then the value frequencies of
# "Alcohol_usage".
print(data.describe(include='all'))
print("\n")
print(data["Alcohol_usage"].value_counts())
# Load the CSV whose path is given as the first command-line argument; this
# rebinds `data`.
data = pd.read_csv(sys.argv[1])
# Summary statistics over all columns, then the value frequencies of the
# "likes" column.
print(data.describe(include="all"))
print(data['likes'].value_counts())