54 KiB
54 KiB
Trenowanie i sprawdzanie modelu na podstawie danych
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
# Load the training split. The file is tab-separated and has no header row,
# so pandas labels the columns with their integer positions (0, 1, 2, ...).
data = pd.read_csv(
    'train/train.tsv',
    sep='\t',
    header=None,
)
def extract_number(text):
    """Return the decimal digits found in *text*, concatenated, as a float.

    The argument is converted with ``str()`` first, so any value is accepted.
    Every non-digit character is discarded, which means separators and signs
    are dropped too (``"12,5"`` yields ``125.0``).

    Returns:
        The parsed ``float``, or ``None`` when *text* contains no digits
        (this includes ``None`` and NaN inputs).
    """
    # str.isdecimal (not str.isdigit) is used on purpose: isdigit also accepts
    # characters such as the superscript in "m²", which float() cannot parse
    # and which would turn an otherwise valid value into None.
    digits = ''.join(filter(str.isdecimal, str(text)))
    try:
        return float(digits)
    except ValueError:
        # Empty string: the input held no digits at all.
        return None
# --- Training-set preprocessing and model fitting ---------------------------

# Column 2 is reduced to the number formed by its digits; rows without any
# digits are imputed with the column mean, then everything is cast to int.
data[2] = data[2].apply(extract_number)
mean_value = data[2].mean()
data[2] = data[2].fillna(mean_value)
data[2] = data[2].astype(int)

# Encode the categorical text columns as integers. Values missing from a
# mapping become NaN; columns 1 and 11 get a default right away.
data[9] = data[9].map({"pierwotny": 1, "wtórny": 0})
data[1] = data[1].map({"do zamieszkania": 2, "do wykończenia": 1, "do remontu": 0})
data[1] = data[1].fillna(1)
data[11] = data[11].map({"dom wolnostojący": 4, "apartamentowiec": 3, "kamienica": 2, "szeregowiec": 1, "blok": 0})
data[11] = data[11].fillna(0)

# Feature matrix: selected columns, coerced to numbers (unparseable -> NaN).
X = data.iloc[:, [1, 2, 4, 6, 8, 9, 10, 11]]
X = X.apply(pd.to_numeric, errors='coerce')

# NOTE(review): the prediction cells fill the corresponding column with 1,
# not 11 - confirm which default is intended.
X[6] = X[6].fillna(11)
# Column 9 was mapped above but never imputed, so an unmapped or missing
# value would reach LinearRegression.fit as NaN and raise. Use the same
# default (1) that the prediction cells apply to this feature.
X[9] = X[9].fillna(1)
# Remaining numeric columns: impute with the column mean.
X[2] = X[2].fillna(X[2].mean())
X[4] = X[4].fillna(X[4].mean())
X[8] = X[8].fillna(X[8].mean())
X[10] = X[10].fillna(X[10].mean())
print(X)

# Target is column 0.
y = data.iloc[:, 0]

# Scale every feature to [0, 1] and fit an ordinary least-squares model.
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)
model = LinearRegression()
model.fit(X_normalized, y)
1 2 4 6 8 9 10 11 0 2.0 390 7113 2.0 43.44 0 4.0 0.0 1 1.0 1423 7392 2.0 42.60 1 2.0 0.0 2 0.0 300 5621 2.0 44.30 0 4.0 0.0 3 2.0 490 4761 4.0 88.00 0 3.0 1.0 4 1.0 850 6481 3.0 77.00 0 16.0 0.0 ... ... ... ... ... ... .. ... ... 2542 1.0 1 5400 4.0 94.00 0 4.0 2.0 2543 1.0 1423 6400 2.0 53.50 0 4.0 0.0 2544 1.0 280 6063 3.0 55.25 0 4.0 2.0 2545 1.0 1423 4194 3.0 62.00 0 3.0 2.0 2546 2.0 1423 5077 11.0 392.00 0 4.0 2.0 [2547 rows x 8 columns]
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
Predykcja modelu:
# --- Predictions for the dev-0 split -----------------------------------------
# The input file has no target column, so every feature sits one position to
# the left of where it is in the training file (train column k -> column k-1).
dane = pd.read_csv('dev-0/in.tsv', delimiter='\t', header=None)

# Digits-only numeric column, imputed with this split's own mean.
dane[1] = dane[1].apply(extract_number)
mean_value = dane[1].mean()
dane[1] = dane[1].fillna(mean_value)
dane[1] = dane[1].astype(int)

# Same categorical encodings as used for training.
dane[8] = dane[8].map({"pierwotny": 1, "wtórny": 0})
dane[0] = dane[0].map({"do zamieszkania": 2, "do wykończenia": 1, "do remontu": 0})
dane[0] = dane[0].fillna(1)
dane[10] = dane[10].map({"dom wolnostojący": 4, "apartamentowiec": 3, "kamienica": 2, "szeregowiec": 1, "blok": 0})
dane[10] = dane[10].fillna(0)

X1 = dane[[0, 1, 3, 5, 7, 8, 9, 10]]
X1 = X1.apply(pd.to_numeric, errors='coerce')
X1[0] = X1[0].fillna(1)
X1[1] = X1[1].fillna(X1[1].mean())
X1[3] = X1[3].fillna(X1[3].mean())
# NOTE(review): training fills the corresponding column (6) with 11 - confirm
# which default is intended.
X1[5] = X1[5].fillna(1)
X1[7] = X1[7].fillna(X1[7].mean())
X1[8] = X1[8].fillna(1)
X1[9] = X1[9].fillna(X1[9].mean())
X1[10] = X1[10].fillna(X1[10].mean())
print(X1)

# The model was trained on features scaled with the TRAINING min/max, so the
# same scaling must be applied here. Fitting a scaler on this split instead
# would map its values onto a different [0, 1] range than the one the
# coefficients were learned on. The scaler is therefore fitted on X, the
# imputed training feature frame. Plain arrays are passed because the train
# and dev frames carry different integer column labels.
scaler = MinMaxScaler()
scaler.fit(X.to_numpy())
dane_normalized = scaler.transform(X1.to_numpy())

wynik = model.predict(dane_normalized)
wynik_df = pd.DataFrame(wynik)
# One prediction per line, without index or header row.
wynik_df.to_csv('dev-0/out.tsv', sep='\t', index=False, header=False)
#####################################
# --- Predictions for the test-A split ----------------------------------------
# The input file has no target column, so every feature sits one position to
# the left of where it is in the training file (train column k -> column k-1).
dane2 = pd.read_csv('test-A/in.tsv', delimiter='\t', header=None)

# Digits-only numeric column, imputed with this split's own mean.
dane2[1] = dane2[1].apply(extract_number)
mean_value = dane2[1].mean()
dane2[1] = dane2[1].fillna(mean_value)
dane2[1] = dane2[1].astype(int)

# Same categorical encodings as used for training.
dane2[8] = dane2[8].map({"pierwotny": 1, "wtórny": 0})
dane2[0] = dane2[0].map({"do zamieszkania": 2, "do wykończenia": 1, "do remontu": 0})
dane2[0] = dane2[0].fillna(1)
dane2[10] = dane2[10].map({"dom wolnostojący": 4, "apartamentowiec": 3, "kamienica": 2, "szeregowiec": 1, "blok": 0})
dane2[10] = dane2[10].fillna(0)

X2 = dane2[[0, 1, 3, 5, 7, 8, 9, 10]]
X2 = X2.apply(pd.to_numeric, errors='coerce')
X2[0] = X2[0].fillna(1)
X2[1] = X2[1].fillna(X2[1].mean())
X2[3] = X2[3].fillna(X2[3].mean())
# NOTE(review): training fills the corresponding column (6) with 11 - confirm
# which default is intended.
X2[5] = X2[5].fillna(1)
X2[7] = X2[7].fillna(X2[7].mean())
X2[8] = X2[8].fillna(1)
X2[9] = X2[9].fillna(X2[9].mean())
X2[10] = X2[10].fillna(X2[10].mean())
print(X2)

# The model was trained on features scaled with the TRAINING min/max, so the
# same scaling must be applied here. Fitting a scaler on this split instead
# would map its values onto a different [0, 1] range than the one the
# coefficients were learned on. The scaler is therefore fitted on X, the
# imputed training feature frame. Plain arrays are passed because the train
# and test frames carry different integer column labels.
scaler = MinMaxScaler()
scaler.fit(X.to_numpy())
dane_normalized = scaler.transform(X2.to_numpy())

wynik = model.predict(dane_normalized)
wynik_df = pd.DataFrame(wynik)
# One prediction per line, without index or header row.
wynik_df.to_csv('test-A/out.tsv', sep='\t', index=False, header=False)
0 1 3 5 7 8 9 10 0 2.0 250 6311 3 59.10 0 4.0 0.0 1 2.0 1681 7868 2 38.00 0 12.0 0.0 2 2.0 650 5717 3 63.84 0 4.0 0.0 3 2.0 359 7380 4 50.00 0 10.0 0.0 4 1.0 1681 7373 3 65.62 1 3.0 0.0 .. ... ... ... .. ... .. ... ... 457 1.0 1681 9007 3 72.78 1 5.0 0.0 458 1.0 1681 9202 2 51.23 1 5.0 0.0 459 1.0 1681 5723 3 54.16 1 3.0 0.0 460 2.0 110205 7758 3 90.10 0 5.0 0.0 461 1.0 1681 11822 2 71.90 0 3.0 0.0 [462 rows x 8 columns] 0 1 3 5 7 8 9 10 0 1.0 3443 6938 3.0 61.99 1 7.0 0.0 1 2.0 3443 6078 4.0 64.00 0 4.0 0.0 2 1.0 3443 6150 3.0 51.15 1 5.0 0.0 3 2.0 350 9373 2.0 45.77 0 7.0 3.0 4 1.0 3443 7200 2.0 44.36 1 13.0 0.0 .. ... ... ... ... ... .. ... ... 413 1.0 3443 7150 1.0 34.97 1 8.0 0.0 414 1.0 3443 6499 3.0 49.06 1 3.0 0.0 415 1.0 3443 9451 3.0 76.71 1 5.0 0.0 416 1.0 3443 9322 3.0 72.63 1 5.0 0.0 417 1.0 3443 6500 2.0 65.84 0 10.0 0.0 [418 rows x 8 columns]
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# In-sample evaluation: predictions are made on the same scaled training
# features the model was fitted on, then compared against the training target.
y_pred = model.predict(X_normalized)

mse = mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
r2 = r2_score(y, y_pred)

print(f"Średni Błąd Kwadratowy (MSE): {mse:.2f}")
print(f"Średni Błąd Bezwzględny (MAE): {mae:.2f}")
print(f"R-kwadrat (R2): {r2:.2f}")

# Scatter plot of actual vs. predicted target values.
axes = plt.gca()
axes.scatter(y, y_pred, label="Dane")
axes.set_xlabel("Rzeczywiste wartości")
axes.set_ylabel("Przewidywane wartości")
axes.set_title("Wykres predykcji vs. rzeczywistość")
axes.legend()
plt.show()
Średni Błąd Kwadratowy (MSE): 8598093191.55 Średni Błąd Bezwzględny (MAE): 37536.91 R-kwadrat (R2): 0.88