bash script for fetching data

2021-03-27 17:28:33 +01:00 · 2021-03-27 17:28:33 +01:00 · cd7e0040cc
commit cd7e0040cc
parent 78a084b498
3 changed files with 25 additions and 100028 deletions
--- a/data.csv
+++ b/data.csv
--- a/get_data.sh
+++ b/get_data.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Download the Kaggle "youtube-trend-with-subscriber" dataset, print a short
+# preview and line count, split the rows into shuffled test/dev/train files,
+# then run main.py on the full CSV. Nothing runs if the download or unzip fails.
+if kaggle datasets download -d sgonkaggle/youtube-trend-with-subscriber && unzip youtube-trend-with-subscriber.zip; then
+    head -n 2 USvideos_modified.csv
+    # Redirect the file into wc so COUNT holds only the number, not "N filename".
+    COUNT=$(wc -l < "USvideos_modified.csv")
+    echo "${COUNT}"
+    # Skip the CSV header line (tail -n +2) and drop empty lines before shuffling,
+    # so neither can end up inside a split.
+    tail -n +2 "USvideos_modified.csv" | grep -v -e "^$" | shuf > "data_shuf"
+    # First 544 shuffled rows -> test, next 544 -> dev, everything after -> train.
+    head -n 544 data_shuf > data_test
+    head -n 1088 data_shuf | tail -n 544 > data_dev
+    # tail (not head) so train starts at row 1089 and shares no rows with test/dev.
+    tail -n +1089 data_shuf > data_train
+    echo "Shuffled dataset"
+    wc -l data_shuf
+    echo "Test dataset"
+    wc -l data_test
+    echo "Dev dataset"
+    wc -l data_dev
+    echo "Train dataset"
+    wc -l data_train
+    python main.py USvideos_modified.csv
+fi
--- a/main.py
+++ b/main.py
@@ -1,29 +1,8 @@
-import subprocess
+#!/usr/bin/python
+
+import sys
 import pandas as pd
-from sklearn.model_selection import train_test_split

-process = subprocess.Popen("kaggle datasets download -d alwinjoseph/stress-detection-of-medical-partitioners",
-                           shell=True, stdout=subprocess.PIPE)
-process.wait()
-process = subprocess.Popen("tar -xf stress-detection-of-medical-partitioners.zip", shell=True, stdout=subprocess.PIPE)
-process.wait()
-
-data = pd.read_csv('data.csv')
-data_train, data_test = train_test_split(data, test_size=50000, random_state=1, stratify=data["Alcohol_usage"])
-print("All data")
-print("Number of rows ", data.shape[0])
-print("Number of columns ", data.shape[1])
-
-print("\nTrain data")
-print("Number of rows ", data_train.shape[0])
-print("Number of columns ", data_train.shape[1])
-
-print("\nTest data")
-print("Number of rows ", data_test.shape[0])
-print("Number of columns ", data_test.shape[1])
-
-print("\n")
-print(data.describe(include='all'))
-
-print("\n")
-print(data["Alcohol_usage"].value_counts())
+data = pd.read_csv(sys.argv[1])  # CSV path is taken from the first command-line argument
+print(data.describe(include="all"))  # summary statistics for every column, not only numeric ones
+print(data['likes'].value_counts())  # how often each distinct value occurs in the 'likes' column