ium_495719/data_processing.py

16 lines
636 B
Python
Raw Normal View History

2024-04-02 23:18:46 +02:00
from sklearn.model_selection import train_test_split
2024-04-02 20:40:42 +02:00
import pandas as pd
2024-04-02 23:25:58 +02:00
import subprocess
2024-04-02 20:40:42 +02:00
2024-04-02 23:25:58 +02:00
# Fetch and unzip the housing-price dataset with the Kaggle CLI.
# check=True turns a non-zero exit status of the CLI into a
# subprocess.CalledProcessError, so a failed download stops the script here
# instead of showing up later as a missing (or stale) CSV file.
subprocess.run(
    ["kaggle", "datasets", "download", "muhammadbinimran/housing-price-prediction-data", "--unzip"],
    check=True,
)
2024-04-02 20:40:42 +02:00
# Load the raw dataset into a DataFrame; the path is relative to the current
# working directory (presumably where the download step above unpacked it).
housing_price_dataset = pd.read_csv(
    filepath_or_buffer='housing_price_dataset.csv',
)
2024-04-27 18:00:53 +02:00
# One-hot encode the categorical 'Neighborhood' column: it is replaced by one
# indicator column per distinct value; all other columns are kept as they are.
housing_price_dataset = pd.get_dummies(
    data=housing_price_dataset,
    columns=['Neighborhood'],
)
2024-04-02 23:18:46 +02:00
# Split the data into train / dev / test subsets.
# A fixed random_state makes both shuffles deterministic, so repeated runs
# produce the same three files and downstream results stay comparable.
# The seed value itself is arbitrary.
# 1) Hold out 10% of all rows as the dev set.
hp_train_test, hp_dev = train_test_split(
    housing_price_dataset, test_size=0.1, random_state=42
)
# 2) From the remaining rows take exactly 1000 as the test set (an integer
#    test_size is an absolute row count); everything else is training data.
hp_train, hp_test = train_test_split(
    hp_train_test, test_size=1000, random_state=42
)
2024-04-02 20:40:42 +02:00
# Write each subset to its own CSV file in the current working directory.
# index=False keeps the DataFrame index out of the files.
for _split_frame, _csv_name in (
    (hp_train, 'hp_train.csv'),
    (hp_dev, 'hp_dev.csv'),
    (hp_test, 'hp_test.csv'),
):
    _split_frame.to_csv(_csv_name, index=False)