mieszkania5/mieszkania.ipynb
2023-11-05 14:36:16 +01:00

54 KiB

Trenowanie i sprawdzanie modelu na podstawie danych

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Load the tab-separated training set. The file has no header row, so pandas
# labels the columns with consecutive integers starting at 0.
data = pd.read_csv('train/train.tsv', sep='\t', header=None)

def extract_number(text):
    """Return the first number found in *text* as a float, or None.

    The value is converted with ``str()`` first, so numeric inputs are
    accepted as well as free text such as ``"390 zł"``.

    Parsing rules:
    * only the first numeric token is used (digits from unrelated parts of
      the text are not glued together);
    * spaces (including non-breaking ones) followed by exactly three digits
      are treated as thousands separators: ``"1 102"`` -> ``1102.0``;
    * ``,`` or ``.`` followed by digits is treated as the decimal separator:
      ``"1 102,05"`` -> ``1102.05``.

    Returns None when *text* is None or contains no ASCII digit (this covers
    NaN, whose string form is ``"nan"``).
    """
    import re  # local import keeps this helper self-contained

    if text is None:
        return None

    # Normalise the non-breaking spaces commonly used as thousands separators.
    cleaned = str(text).replace('\u00a0', ' ').replace('\u202f', ' ')

    # [0-9] rather than str.isdigit(): isdigit() also accepts characters such
    # as the superscript two, which float() cannot parse.
    match = re.search(
        r'[0-9]+(?: [0-9]{3}(?![0-9]))*(?:[.,][0-9]+)?', cleaned
    )
    if match is None:
        return None

    token = match.group(0).replace(' ', '').replace(',', '.')
    return float(token)


# --- Feature preparation on the training frame -------------------------------
# Column 2 is free text; keep only its numeric value, impute missing entries
# with the column mean and store the result as integers.
data[2] = data[2].apply(extract_number)
mean_value = data[2].mean()
data[2] = data[2].fillna(mean_value).astype(int)

# Column 9: binary encoding of the two market labels.
market_codes = {"pierwotny": 1, "wtórny": 0}
data[9] = data[9].map(market_codes)

# Column 1: ordinal encoding of the condition labels; anything unmapped or
# missing falls back to the middle level (1).
condition_codes = {"do zamieszkania": 2, "do wykończenia": 1,"do remontu":0}
data[1] = data[1].map(condition_codes).fillna(1)

# Column 11: ordinal encoding of the building-type labels; anything unmapped
# or missing falls back to 0 ("blok").
building_codes = {"dom wolnostojący":4,"apartamentowiec":3,"kamienica": 2, "szeregowiec": 1,"blok":0}
data[11] = data[11].map(building_codes).fillna(0)

# --- Feature matrix ----------------------------------------------------------
feature_columns = [1, 2, 4, 6, 8, 9, 10, 11]
X = data.iloc[:, feature_columns]

# Force every feature to a numeric dtype; unparsable cells become NaN.
X = X.apply(pd.to_numeric, errors='coerce')

# Column 6 gets a fixed default; the remaining numeric columns get their mean.
X[6] = X[6].fillna(11)
for mean_filled_column in (2, 4, 8, 10):
    X[mean_filled_column] = X[mean_filled_column].fillna(
        np.mean(X[mean_filled_column])
    )

print(X)

# --- Target, scaling and fit -------------------------------------------------
# Column 0 is the regression target.
y = data.iloc[:, 0]

# Scale every feature to [0, 1] using the training min/max.
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

model = LinearRegression()
model.fit(X_normalized, y)



       1     2     4     6       8   9     10   11
0     2.0   390  7113   2.0   43.44   0   4.0  0.0
1     1.0  1423  7392   2.0   42.60   1   2.0  0.0
2     0.0   300  5621   2.0   44.30   0   4.0  0.0
3     2.0   490  4761   4.0   88.00   0   3.0  1.0
4     1.0   850  6481   3.0   77.00   0  16.0  0.0
...   ...   ...   ...   ...     ...  ..   ...  ...
2542  1.0     1  5400   4.0   94.00   0   4.0  2.0
2543  1.0  1423  6400   2.0   53.50   0   4.0  0.0
2544  1.0   280  6063   3.0   55.25   0   4.0  2.0
2545  1.0  1423  4194   3.0   62.00   0   3.0  2.0
2546  2.0  1423  5077  11.0  392.00   0   4.0  2.0

[2547 rows x 8 columns]
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()

Predykcja modelu:


def _predict_to_tsv(in_path, out_path):
    """Predict targets for the rows in *in_path* and write them to *out_path*.

    The input is a headerless TSV with the same layout as the training file
    minus the target column, so every feature sits one position to the left
    of its training counterpart. The rows are encoded the same way as the
    training data, scaled with the scaler that was fitted on the training
    features, and passed to the fitted ``model``.

    Relies on the module-level ``extract_number``, ``scaler`` and ``model``
    defined by the training code.

    Returns a tuple ``(raw_frame, features, predictions)``.
    """
    frame = pd.read_csv(in_path, delimiter='\t', header=None)

    # Column 1 is free text; keep its numeric value, impute with the mean.
    frame[1] = frame[1].apply(extract_number)
    frame[1] = frame[1].fillna(frame[1].mean()).astype(int)

    frame[8] = frame[8].map({"pierwotny": 1, "wtórny": 0})

    frame[0] = frame[0].map(
        {"do zamieszkania": 2, "do wykończenia": 1, "do remontu": 0}
    ).fillna(1)

    frame[10] = frame[10].map(
        {"dom wolnostojący": 4, "apartamentowiec": 3, "kamienica": 2,
         "szeregowiec": 1, "blok": 0}
    ).fillna(0)

    features = frame[[0, 1, 3, 5, 7, 8, 9, 10]]
    features = features.apply(pd.to_numeric, errors='coerce')

    # Columns 0, 1 and 10 were already imputed above, so only the remaining
    # columns can still contain NaN here.
    # NOTE(review): the training code fills the counterpart of column 5 with
    # 11, not 1 -- confirm which default is intended.
    for column, default in ((5, 1), (8, 1)):
        features[column] = features[column].fillna(default)
    for column in (3, 7, 9):
        features[column] = features[column].fillna(features[column].mean())

    print(features)

    # Reuse the scaler fitted on the training features. Fitting a new scaler
    # on the prediction set would map the features onto a different scale
    # than the one the model coefficients were learned on.
    scaled = scaler.transform(features.to_numpy())
    predictions = model.predict(scaled)

    # One prediction per line, no header and no index column.
    pd.DataFrame(predictions).to_csv(
        out_path, sep='\t', index=False, header=False
    )
    return frame, features, predictions


dane, X1, _ = _predict_to_tsv('dev-0/in.tsv', 'dev-0/out.tsv')

#####################################

dane2, X2, wynik = _predict_to_tsv('test-A/in.tsv', 'test-A/out.tsv')



      0       1      3   5      7   8     9    10
0    2.0     250   6311   3  59.10   0   4.0  0.0
1    2.0    1681   7868   2  38.00   0  12.0  0.0
2    2.0     650   5717   3  63.84   0   4.0  0.0
3    2.0     359   7380   4  50.00   0  10.0  0.0
4    1.0    1681   7373   3  65.62   1   3.0  0.0
..   ...     ...    ...  ..    ...  ..   ...  ...
457  1.0    1681   9007   3  72.78   1   5.0  0.0
458  1.0    1681   9202   2  51.23   1   5.0  0.0
459  1.0    1681   5723   3  54.16   1   3.0  0.0
460  2.0  110205   7758   3  90.10   0   5.0  0.0
461  1.0    1681  11822   2  71.90   0   3.0  0.0

[462 rows x 8 columns]
      0     1     3    5      7   8     9    10
0    1.0  3443  6938  3.0  61.99   1   7.0  0.0
1    2.0  3443  6078  4.0  64.00   0   4.0  0.0
2    1.0  3443  6150  3.0  51.15   1   5.0  0.0
3    2.0   350  9373  2.0  45.77   0   7.0  3.0
4    1.0  3443  7200  2.0  44.36   1  13.0  0.0
..   ...   ...   ...  ...    ...  ..   ...  ...
413  1.0  3443  7150  1.0  34.97   1   8.0  0.0
414  1.0  3443  6499  3.0  49.06   1   3.0  0.0
415  1.0  3443  9451  3.0  76.71   1   5.0  0.0
416  1.0  3443  9322  3.0  72.63   1   5.0  0.0
417  1.0  3443  6500  2.0  65.84   0  10.0  0.0

[418 rows x 8 columns]
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predictions are made on X_normalized, i.e. the same rows the model was
# fitted on, so the figures below describe the fit on the training data.
y_pred = model.predict(X_normalized)

# Compute all three scores first, then report them.
mse = mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
r2 = r2_score(y, y_pred)

print(f"Średni Błąd Kwadratowy (MSE): {mse:.2f}")
print(f"Średni Błąd Bezwzględny (MAE): {mae:.2f}")
print(f"R-kwadrat (R2): {r2:.2f}")

# Scatter of actual vs. predicted target values.
plt.scatter(y, y_pred, label="Dane")
plt.title("Wykres predykcji vs. rzeczywistość")
plt.xlabel("Rzeczywiste wartości")
plt.ylabel("Przewidywane wartości")
plt.legend()
plt.show()
Średni Błąd Kwadratowy (MSE): 8598093191.55
Średni Błąd Bezwzględny (MAE): 37536.91
R-kwadrat (R2): 0.88