"""Build data.csv from four monthly apartment CSV snapshots, keeping Krakow listings only."""
import pandas as pd
|
|
|
|
# Read the four monthly snapshot files from the archive directory, oldest
# first. Each resulting frame is named after its year/month (YYMM).
df_2311, df_2312, df_2401, df_2402 = map(
    pd.read_csv,
    (
        'archive/apartments_pl_2023_11.csv',
        'archive/apartments_pl_2023_12.csv',
        'archive/apartments_pl_2024_01.csv',
        'archive/apartments_pl_2024_02.csv',
    ),
)
|
|
|
|
|
|
def pull_krakow(df: pd.DataFrame, city: str = "krakow") -> pd.DataFrame:
    """Return the rows of *df* whose ``city`` column equals *city*.

    Args:
        df: Frame that contains a ``city`` column.
        city: Value to match in the ``city`` column. Defaults to
            ``"krakow"``, so existing one-argument calls behave exactly
            as before.

    Returns:
        A frame with only the matching rows. All columns are kept and the
        index labels of the surviving rows are left untouched (not reset).

    Raises:
        KeyError: If *df* has no ``city`` column.
    """
    # Exact, case-sensitive comparison against the requested city label.
    city_mask = df["city"] == city
    return df[city_mask]
|
|
|
|
|
|
# Narrow every monthly snapshot down to its Krakow rows.
df_2311 = pull_krakow(df_2311)
df_2312 = pull_krakow(df_2312)
df_2401 = pull_krakow(df_2401)
df_2402 = pull_krakow(df_2402)

monthly_frames = [df_2311, df_2312, df_2401, df_2402]

# Stack the months into a single frame with a fresh 0..n-1 index.
df_concatenated = pd.concat(monthly_frames, ignore_index=True)

# Row count before de-duplication. Kept under its own name so the built-in
# sum() is not shadowed at module level.
total_rows = sum(len(frame) for frame in monthly_frames)
print(total_rows)

# Rows that agree on all five of these columns are collapsed to their first
# occurrence (the drop_duplicates default).
df_no_duplicates = df_concatenated.drop_duplicates(
    subset=["squareMeters", "rooms", "floor", "centreDistance", "price"]
)
print(len(df_no_duplicates))

# Keep only the columns that are written to the output file.
df_selected_columns = df_no_duplicates[
    ["squareMeters", "rooms", "floor", "buildYear", "centreDistance", "poiCount", "price"]
]

# Discard every row that has a missing value in any of the kept columns.
df_na_dropped = df_selected_columns.dropna()
print(len(df_na_dropped))

# Write the result without the index column.
df_na_dropped.to_csv('data.csv', index=False)

# User-facing confirmation (Polish: "Data saved to data.csv.").
print("Dane zapisane do data.csv.")