bash script for fetching data
This commit is contained in:
parent
78a084b498
commit
cd7e0040cc
19
get_data.sh
Executable file
19
get_data.sh
Executable file
@ -0,0 +1,19 @@
|
||||
#!/bin/bash
#
# Fetch the Kaggle "youtube-trend-with-subscriber" dataset, split it into
# shuffled test/dev/train subsets and run main.py on the full CSV.
#
# Usage:    ./get_data.sh
# Requires: kaggle, unzip, shuf, sed, python (with whatever main.py imports).
# Outputs:  data_shuf, data_test, data_dev, data_train in the current directory.

# No `set -e`: every critical step below is checked explicitly instead.
set -uo pipefail

readonly DATASET='sgonkaggle/youtube-trend-with-subscriber'
readonly ARCHIVE='youtube-trend-with-subscriber.zip'
readonly CSV_FILE='USvideos_modified.csv'
# Number of rows in the test subset and in the dev subset.
readonly SUBSET_SIZE=544

#######################################
# Print the data rows of a CSV file: the first line (header) and all
# empty lines are dropped, everything else is passed through unchanged.
# Arguments: $1 - path to the CSV file
# Outputs:   data rows on stdout
# Returns:   sed's exit status
#######################################
data_rows() {
  sed -e '1d' -e '/^$/d' -- "$1"
}

#######################################
# Split a (shuffled) file into three disjoint subsets, by line number:
#   data_test  - lines 1 .. size
#   data_dev   - lines size+1 .. 2*size
#   data_train - lines 2*size+1 .. end
# Arguments: $1 - path to the file to split
#            $2 - size of the test and dev subsets (default: SUBSET_SIZE)
# Outputs:   writes data_test, data_dev, data_train in the current directory
# Returns:   0 on success, 1 if any subset could not be written
#######################################
split_dataset() {
  local shuffled=$1
  local size=${2:-$SUBSET_SIZE}

  head -n "${size}" -- "${shuffled}" > data_test || return 1
  # Address the dev range by absolute line numbers so it can never overlap
  # the test range, even when the input is shorter than 2*size lines.
  sed -n "$((size + 1)),$((2 * size))p" -- "${shuffled}" > data_dev || return 1
  # tail -n +K starts AT line K; head -n would re-emit the test/dev rows.
  tail -n "+$((2 * size + 1))" -- "${shuffled}" > data_train || return 1
}

#######################################
# Download and unpack the dataset, build the subsets, report their sizes
# and run main.py on the full CSV.
# Returns: 0 on success, non-zero if any step fails
#######################################
main() {
  # -o: overwrite without prompting so that re-runs stay non-interactive.
  if ! { kaggle datasets download -d "${DATASET}" && unzip -o "${ARCHIVE}"; }; then
    printf 'error: could not download or unpack %s\n' "${DATASET}" >&2
    return 1
  fi

  # Preview: header plus the first data row.
  head -n 2 -- "${CSV_FILE}"

  local count
  # Redirect into wc so only the number (no file name) is captured.
  count=$(wc -l < "${CSV_FILE}") || return 1
  echo "${count}"

  data_rows "${CSV_FILE}" | shuf > data_shuf || return 1
  split_dataset data_shuf "${SUBSET_SIZE}" || return 1

  echo "Shuffled dataset"
  wc -l data_shuf
  echo "Test dataset"
  wc -l data_test
  echo "Dev dataset"
  wc -l data_dev
  echo "Train dataset"
  wc -l data_train

  python main.py "${CSV_FILE}"
}

main "$@"
|
33
main.py
33
main.py
@ -1,29 +1,8 @@
|
||||
import subprocess
|
||||
#!/usr/bin/python
|
||||
|
||||
import sys
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
process = subprocess.Popen("kaggle datasets download -d alwinjoseph/stress-detection-of-medical-partitioners",
|
||||
shell=True, stdout=subprocess.PIPE)
|
||||
process.wait()
|
||||
process = subprocess.Popen("tar -xf stress-detection-of-medical-partitioners.zip", shell=True, stdout=subprocess.PIPE)
|
||||
process.wait()
|
||||
|
||||
data = pd.read_csv('data.csv')
|
||||
data_train, data_test = train_test_split(data, test_size=50000, random_state=1, stratify=data["Alcohol_usage"])
|
||||
print("All data")
|
||||
print("Number of rows ", data.shape[0])
|
||||
print("Number of columns ", data.shape[1])
|
||||
|
||||
print("\nTrain data")
|
||||
print("Number of rows ", data_train.shape[0])
|
||||
print("Number of columns ", data_train.shape[1])
|
||||
|
||||
print("\nTest data")
|
||||
print("Number of rows ", data_test.shape[0])
|
||||
print("Number of columns ", data_test.shape[1])
|
||||
|
||||
print("\n")
|
||||
print(data.describe(include='all'))
|
||||
|
||||
print("\n")
|
||||
print(data["Alcohol_usage"].value_counts())
|
||||
data = pd.read_csv(sys.argv[1])
|
||||
print(data.describe(include="all"))
|
||||
print(data['likes'].value_counts())
|
||||
|
Loading…
Reference in New Issue
Block a user