Projekt_ML/scraper.py
2024-02-25 17:26:20 +01:00

35 lines
1.0 KiB
Python

import pandas as pd
# Load the four monthly apartment snapshots; the file names suggest they cover
# November 2023 through February 2024. Each path is read with the default
# `pd.read_csv` settings, in the order listed.
df_2311, df_2312, df_2401, df_2402 = map(
    pd.read_csv,
    (
        'archive/apartments_pl_2023_11.csv',
        'archive/apartments_pl_2023_12.csv',
        'archive/apartments_pl_2024_01.csv',
        'archive/apartments_pl_2024_02.csv',
    ),
)
def pull_krakow(df: pd.DataFrame, city: str = 'krakow') -> pd.DataFrame:
    """Return the rows of *df* whose ``city`` column equals *city*.

    Args:
        df: DataFrame that contains a ``city`` column.
        city: Value to match in the ``city`` column. Defaults to
            ``'krakow'``, so existing one-argument calls behave as before.

    Returns:
        A DataFrame holding only the matching rows. The rows keep their
        original index labels; the input frame is not modified.

    Raises:
        KeyError: If *df* has no ``city`` column.
    """
    # Boolean-mask selection: exact, case-sensitive string comparison.
    return df[df["city"] == city]
# Narrow every monthly snapshot to the rows selected by pull_krakow
# (city == 'krakow').
df_2311 = pull_krakow(df_2311)
df_2312 = pull_krakow(df_2312)
df_2401 = pull_krakow(df_2401)
df_2402 = pull_krakow(df_2402)

# Stack the filtered snapshots into one frame; ignore_index=True discards the
# per-file index labels and renumbers the rows 0..n-1.
df_concatenated = pd.concat([df_2311, df_2312, df_2401, df_2402], ignore_index=True)

# Row count before de-duplication. The concatenated frame has exactly as many
# rows as the four inputs combined, so len() gives the same number as adding
# the individual lengths. The name `total_rows` is used instead of `sum` so the
# builtin sum() is not shadowed for the rest of the module.
total_rows = len(df_concatenated)
print(total_rows)

# Rows that agree on all five of these columns are treated as duplicates and
# only the first occurrence is kept (presumably the same listing appearing in
# several monthly snapshots -- confirm against the data set).
df_no_duplicates = df_concatenated.drop_duplicates(
    subset=["squareMeters", "rooms", "floor", "centreDistance", "price"]
)
print(len(df_no_duplicates))

# Keep only the columns that are exported, then drop every row that has a
# missing value in any of them.
df_selected_columns = df_no_duplicates[
    ["squareMeters", "rooms", "floor", "buildYear", "centreDistance", "poiCount", "price"]
]
df_na_dropped = df_selected_columns.dropna()
print(len(df_na_dropped))

# Write the cleaned table without the index column.
df_na_dropped.to_csv('data.csv', index=False)
# User-facing message (Polish): "Data saved to data.csv."
print("Dane zapisane do data.csv.")