187 lines
3.4 KiB
Plaintext
187 lines
3.4 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "5d8abefd",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"!pip install kaggle\n",
|
|
"!pip install pandas\n",
|
|
"!pip install seaborn\n",
|
|
"!pip install opendatasets --upgrade\n",
|
|
"import opendatasets as od\n",
|
|
"import pandas as pd"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "5911d036",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"cd ~/.kaggle"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "8d07bb31",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"!kaggle datasets download -d shree1992/housedata"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "9c04f525",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"!unzip -o housedata.zip"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b407a064",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"!wc -l data.csv\n",
|
|
"data = pd.read_csv('data.csv')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "17d29b04",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "5fa21ef9",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"!tail -n +2 data.csv | shuf > data.csv.shuf"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "baa9b34e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"!head -n 460 data.csv.shuf > data.csv.test\n",
|
|
"!head -n 920 data.csv.shuf | tail -n 460 > data.csv.dev\n",
|
|
"!tail -n +921 data.csv.shuf > data.csv.train"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "30b520c0",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"!rm data.csv.shuf\n",
|
|
"!wc -l data.csv*"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b7652c69",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data_test = pd.read_csv('data.csv.test', header=None, names=data.columns)\n",
|
|
"data_dev = pd.read_csv('data.csv.dev', header=None, names=data.columns)\n",
|
|
"data_train = pd.read_csv('data.csv.train', header=None, names=data.columns)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "c4670def",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data.describe(include='all')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b18d32a1",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data_train.describe(include='all')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "6b8d6568",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data_dev.describe(include='all')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "dd366750",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data_test.describe(include='all')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2801c205",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"!grep -P \"^$\" -n data.csv"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.10"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|