Metody ewaluacji i reprezentacji danych
This commit is contained in:
parent
63eeb5e7ff
commit
96378653dc
@ -8,9 +8,9 @@
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Uczenie maszynowe 2019/2020 – laboratoria\n",
|
||||
"### 27/28 kwietnia 2020\n",
|
||||
"# 7. Korzystanie z gotowych implementacji algorytmów na przykładzie pakietu *scikit-learn*"
|
||||
"## Uczenie maszynowe – zastosowania\n",
|
||||
"### Laboratoria\n",
|
||||
"# 4. Korzystanie z gotowych implementacji algorytmów na przykładzie pakietu *scikit-learn*"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -31,21 +31,31 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[289411.43360715]\n",
|
||||
" [285930.72623304]\n",
|
||||
" [229893.92602325]\n",
|
||||
" [823267.1750005 ]\n",
|
||||
" [821038.18583152]\n",
|
||||
" [356875.19267371]\n",
|
||||
" [409340.86981766]\n",
|
||||
" [278401.700237 ]\n",
|
||||
" [301680.27997255]\n",
|
||||
" [281051.71865054]]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#! /usr/bin/env python3\n",
|
||||
"# -*- coding: utf-8 -*-\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"# Regresja liniowa wielu zmiennych\n",
|
||||
"\n",
|
||||
"import csv\n",
|
||||
"import numpy\n",
|
||||
"import pandas\n",
|
||||
"import sys\n",
|
||||
"\n",
|
||||
"from sklearn import linear_model # Model regresji liniowej z biblioteki scikit-learn\n",
|
||||
"from sklearn.linear_model import LinearRegression # Model regresji liniowej z biblioteki scikit-learn\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"FEATURES = [\n",
|
||||
@ -60,32 +70,42 @@
|
||||
"def preprocess(data):\n",
|
||||
" \"\"\"Wstępne przetworzenie danych\"\"\"\n",
|
||||
" data = data.replace({'parter': 0, 'poddasze': 0}, regex=True)\n",
|
||||
" data = data.applymap(numpy.nan_to_num) # Zamienia \"NaN\" na liczby\n",
|
||||
" data = data.applymap(np.nan_to_num) # Zamienia \"NaN\" na liczby\n",
|
||||
" return data\n",
|
||||
"\n",
|
||||
"# Nazwy plików\n",
|
||||
"input_filename = 'flats-test.tsv'\n",
|
||||
"output_filename = 'flats-predicted.tsv'\n",
|
||||
"trainset_filename = 'flats-train.tsv'\n",
|
||||
"dataset_filename = 'flats.tsv'\n",
|
||||
"\n",
|
||||
"# Wczytanie danych uczących\n",
|
||||
"data = pandas.read_csv(trainset_filename, header=0, sep='\\t')\n",
|
||||
"# Wczytanie danych\n",
|
||||
"data = pd.read_csv(dataset_filename, header=0, sep='\\t')\n",
|
||||
"columns = data.columns[1:] # wszystkie kolumny oprócz pierwszej (\"cena\")\n",
|
||||
"data = data[FEATURES + ['cena']] # wybór cech\n",
|
||||
"data = preprocess(data) # wstępne przetworzenie danych\n",
|
||||
"y = pandas.DataFrame(data['cena'])\n",
|
||||
"x = pandas.DataFrame(data[FEATURES])\n",
|
||||
"model = linear_model.LinearRegression() # definicja modelu\n",
|
||||
"model.fit(x, y) # dopasowanie modelu\n",
|
||||
"\n",
|
||||
"# Wczytanie danych testowych\n",
|
||||
"data = pandas.read_csv(input_filename, header=None, sep='\\t', names=columns)\n",
|
||||
"x = pandas.DataFrame(data[FEATURES]) # wybór cech\n",
|
||||
"x = preprocess(x) # wstępne przetworzenie danych\n",
|
||||
"y = model.predict(x) # przewidywania modelu\n",
|
||||
"# Podział danych na zbiory uczący i testowy\n",
|
||||
"split_point = int(0.8 * len(data))\n",
|
||||
"data_train = data[:split_point]\n",
|
||||
"data_test = data[split_point:]\n",
|
||||
"\n",
|
||||
"# Zapis wyników do pliku\n",
|
||||
"pandas.DataFrame(y).to_csv(output_filename, index=None, header=None, sep='\\t')"
|
||||
"# Uczenie modelu\n",
|
||||
"y_train = pd.DataFrame(data_train['cena'])\n",
|
||||
"x_train = pd.DataFrame(data_train[FEATURES])\n",
|
||||
"model = LinearRegression() # definicja modelu\n",
|
||||
"model.fit(x_train, y_train) # dopasowanie modelu\n",
|
||||
"\n",
|
||||
"# Predykcja wyników dla danych testowych\n",
|
||||
"y_expected = pd.DataFrame(data_test['cena'])\n",
|
||||
"x_test = pd.DataFrame(data_test[FEATURES])\n",
|
||||
"y_predicted = model.predict(x_test) # predykcja wyników na podstawie modelu\n",
|
||||
"\n",
|
||||
"print(y_predicted[:10]) # Pierwsze 10 wyników"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Biblioteka *scikit-learn* dostarcza również narzędzi do wstępnego przetwarzania danych, np. skalowania i normalizacji: https://scikit-learn.org/stable/modules/preprocessing.html"
|
||||
]
|
||||
}
|
||||
],
|
4939
lab/flats.tsv
Normal file
4939
lab/flats.tsv
Normal file
File diff suppressed because one or more lines are too long
@ -293,7 +293,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 24,
|
||||
"metadata": {
|
||||
"slideshow": {
|
||||
"slide_type": "notes"
|
||||
@ -315,7 +315,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 25,
|
||||
"metadata": {
|
||||
"slideshow": {
|
||||
"slide_type": "notes"
|
||||
@ -334,7 +334,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 26,
|
||||
"metadata": {
|
||||
"slideshow": {
|
||||
"slide_type": "notes"
|
||||
@ -406,7 +406,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 27,
|
||||
"metadata": {
|
||||
"slideshow": {
|
||||
"slide_type": "notes"
|
||||
@ -430,7 +430,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 28,
|
||||
"metadata": {
|
||||
"slideshow": {
|
||||
"slide_type": "subslide"
|
||||
@ -456,7 +456,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 29,
|
||||
"metadata": {
|
||||
"slideshow": {
|
||||
"slide_type": "notes"
|
||||
@ -503,7 +503,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 30,
|
||||
"metadata": {
|
||||
"slideshow": {
|
||||
"slide_type": "notes"
|
||||
@ -531,7 +531,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 31,
|
||||
"metadata": {
|
||||
"slideshow": {
|
||||
"slide_type": "subslide"
|
||||
@ -561,7 +561,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 32,
|
||||
"metadata": {
|
||||
"slideshow": {
|
||||
"slide_type": "notes"
|
||||
@ -583,7 +583,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 33,
|
||||
"metadata": {
|
||||
"slideshow": {
|
||||
"slide_type": "subslide"
|
||||
@ -597,7 +597,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 34,
|
||||
"metadata": {
|
||||
"slideshow": {
|
||||
"slide_type": "notes"
|
||||
@ -617,7 +617,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 35,
|
||||
"metadata": {
|
||||
"slideshow": {
|
||||
"slide_type": "subslide"
|
||||
@ -627,7 +627,7 @@
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "d6c3033020f54951a9a8c766ef776d17",
|
||||
"model_id": "32929ab5e3024128bd39a6c165e50196",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
@ -644,7 +644,7 @@
|
||||
"<function __main__.interactive_classification(highlight)>"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -8,9 +8,8 @@
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"## Uczenie maszynowe UMZ 2019/2020\n",
|
||||
"### 28 kwietnia 2020\n",
|
||||
"# 7a. Reprezentacja danych"
|
||||
"## Uczenie maszynowe – zastosowania\n",
|
||||
"# 4a. Reprezentacja danych"
|
||||
]
|
||||
},
|
||||
{
|
Loading…
Reference in New Issue
Block a user