ium_z487175/s487175-create-dataset-script.py


#!/usr/bin/env python

import os
import pandas as pd
# Load the raw diamonds data set from the current working directory.
diamonds = pd.read_csv('diamonds.csv')

# Optional row limit taken from the CUTOFF environment variable.
# An unset or blank variable, or the sentinel value -1, means "keep every row".
# The variable is parsed once; a non-numeric value still raises ValueError.
cutoff_raw = os.environ.get('CUTOFF', '').strip()
cutoff = int(cutoff_raw) if cutoff_raw else -1
if cutoff == -1:
    print("CUTOFF parameter not set.")
else:
    # Plain slice on the frame: keeps the first `cutoff` rows (a negative value
    # other than -1 drops rows from the end, as with any Python slice).
    diamonds = diamonds[:cutoff]

# %%
# Give the first column of the frame the name 'id'.
_first_column = diamonds.columns[0]
diamonds = diamonds.rename(columns={_first_column: 'id'})
diamonds

# %%
# Lower-case the labels stored in the 'cut' column.
_cut_lowercase = diamonds['cut'].str.lower()
diamonds['cut'] = _cut_lowercase
diamonds

# %%
import sklearn
from sklearn.model_selection import train_test_split

# %%
# Split the data into train/test/dev in an 8:1:1 ratio (80% / 10% / 10%).
# random_state is fixed at 10 for both calls so the split is repeatable.

# 1. Hold out 20% of the rows; the remaining 80% form the training set.
diamonds_train, diamonds_test_dev = train_test_split(diamonds, test_size=0.2, random_state=10)

# 2. Halve the held-out rows into a test set and a dev (validation) set,
#    each 10% of the original data.
diamonds_test, diamonds_dev = train_test_split(diamonds_test_dev, test_size=0.5, random_state=10)


# %%
# Report the shape of the full data set and of each split.
_size_report = (
    ("Rozmiar diamonds: ", diamonds),
    ("Rozmiar diamonds_train: ", diamonds_train),
    ("Rozmiar diamonds_test: ", diamonds_test),
    ("Rozmiar diamonds_dev: ", diamonds_dev),
)
for _caption, _frame in _size_report:
    print(_caption, _frame.shape)

# %%
# Summary statistics (mean, minimum, maximum, standard deviation, median, ...)
# for the full data set first, then for the train, test and dev splits.
for _frame in (diamonds, diamonds_train, diamonds_test, diamonds_dev):
    print(_frame.describe())

# %%
# Show how many examples of each cut class every split contains.
# The counts are printed explicitly: a bare expression is only echoed by an
# interactive cell and produces no output when the file runs as a script.
print(diamonds_train["cut"].value_counts())

# %%
print(diamonds_test["cut"].value_counts())

# %%
print(diamonds_dev["cut"].value_counts())

# %%
import importlib
import subprocess
import sys

# Import matplotlib, installing it only when it is actually missing.
try:
    import matplotlib.pyplot as plt
except ImportError:
    # Install through the interpreter that is running this script; a bare
    # "pip" on PATH may belong to a different Python installation.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib"])
    # Make the import system notice the freshly installed package.
    importlib.invalidate_caches()
    import matplotlib.pyplot as plt

# Bar chart of the cut-class frequencies, drawn once for the full data set
# and once for each of the train, test and dev splits. Every chart uses the
# same figure size and axis labels; only the data and the title differ.
_cut_charts = (
    (diamonds, 'Rozkład częstości dla szlifów diamentów dla zbioru diamonds'),
    (diamonds_train, 'Rozkład częstości dla szlifów diamentów dla zbioru diamonds tranującego'),
    (diamonds_test, 'Rozkład częstości dla szlifów diamentów dla zbioru diamonds testowego'),
    (diamonds_dev, 'Rozkład częstości dla szlifów diamentów dla zbioru diamonds walidacyjnego'),
)
for _frame, _chart_title in _cut_charts:
    plt.figure(figsize=(8, 6))
    _frame['cut'].value_counts().plot(kind='bar')
    plt.title(_chart_title)
    plt.xlabel('Szlif')
    plt.ylabel('Liczba wystąpień')
    plt.show()

# %%
# Standard deviation of carat within each cut class. Printed explicitly so
# the table is also visible when the file runs as a plain script.
print(diamonds[["cut","carat"]].groupby("cut").std())

# %%
# Mean carat per cut class as a bar chart; shown explicitly like the
# frequency charts earlier in the script.
diamonds[["cut","carat"]].groupby("cut").mean().plot(kind="bar")
plt.show()

# %%
# Clean-up and normalisation of the full `diamonds` frame.
# (The string values of 'cut' were already lower-cased earlier in the script.)
# NOTE(review): the train/test/dev frames were split off earlier in this
# script, so they keep the raw values - confirm whether they should be
# cleaned and normalised as well.

from sklearn.preprocessing import MinMaxScaler

# Remove artefacts first: drop every row holding at least one NULL/NaN.
# Doing this before scaling means the scaler's min/max come only from rows
# that are kept, so the remaining data really spans 0.0 - 1.0.
diamonds = diamonds.dropna()

# Scale the numeric columns to the range 0.0 - 1.0.
numeric_columns = ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']
scaler = MinMaxScaler()
if not diamonds.empty:  # the scaler cannot be fitted on zero rows
    diamonds[numeric_columns] = scaler.fit_transform(diamonds[numeric_columns])

# Display the resulting frame (only echoed in an interactive cell).
diamonds
update task: s487175-create-dataset 2023-04-11 22:01:05 +02:00
update 2023-04-15 11:27:21 +02:00			`#!/usr/bin/env python`
update 2023-04-15 12:30:13 +02:00
add cutoff 2023-04-15 12:22:55 +02:00			`import os`
update task: s487175-create-dataset 2023-04-11 22:01:05 +02:00			`import pandas as pd`
			`diamonds = pd.read_csv('diamonds.csv')`

update 2023-04-15 12:30:13 +02:00			`#Wyświetlenie zbioru danych`
			`cutoff = int(os.environ.get('CUTOFF', -1))`
			`if cutoff == -1:`
			`print("CUTOFF parameter not set.")`
			`diamonds`
			`else:`
			`cutoff = int(os.environ['CUTOFF'])`
			`diamonds = diamonds[:cutoff]`
add cutoff 2023-04-15 12:22:55 +02:00
update task: s487175-create-dataset 2023-04-11 22:01:05 +02:00			`# %%`
			`#przydzielanie nazwy kolumny z id`
			`diamonds = diamonds.rename(columns={diamonds.columns[0]: 'id'})`
			`diamonds`

			`# %%`
			`#Convert to lowerCase`

			`diamonds['cut'] = diamonds['cut'].str.lower()`
			`diamonds`

			`# %%`
			`import sklearn`
			`from sklearn.model_selection import train_test_split`

			`# %%`
			`#podział danych na train/test/dev w proporcji 4:1:1`
			`#losować ustawiona na 10`

			`#1. Dzielimy na zbiór treningowy 80 % i resztę danych`
			`diamonds_train, diamonds_test_dev = sklearn.model_selection.train_test_split(diamonds, test_size=0.2, random_state=10)`

			`#2. Podział reszty danych na zbiór testowy 10% i walidacyjny 10%`
			`diamonds_test, diamonds_dev = train_test_split(diamonds_test_dev, test_size=0.5, random_state=10)`


			`# %%`
			`#Wyświetlenie rozmiarów zbiorów danych train/test/dev`
			`print("Rozmiar diamonds: ", diamonds.shape)`
			`print("Rozmiar diamonds_train: ", diamonds_train.shape)`
			`print("Rozmiar diamonds_test: ", diamonds_test.shape)`
			`print("Rozmiar diamonds_dev: ", diamonds_dev.shape)`

			`# %%`
			`# średnią, minimum, maksimum, odchylenia standardowe, medianę wartości poszczególnych parametrów)`
			`print(diamonds.describe())`

			`# %%`
			`print(diamonds_train.describe())`

			`# %%`
			`print(diamonds_test.describe())`

			`# %%`
			`print(diamonds_dev.describe())`

			`# %%`
			`#Wyświetlenie częstości przykładów dla poszczególnych klas diamentów`
			`diamonds_train["cut"].value_counts()`

			`# %%`
			`diamonds_test["cut"].value_counts()`

			`# %%`
			`diamonds_dev["cut"].value_counts()`

			`# %%`
fix 2023-04-15 11:59:21 +02:00			`import subprocess`
			`subprocess.check_call(["pip", "install", "matplotlib"])`
update task: s487175-create-dataset 2023-04-11 22:01:05 +02:00			`import matplotlib.pyplot as plt`

			`plt.figure(figsize=(8, 6))`
			`diamonds['cut'].value_counts().plot(kind='bar')`
			`plt.title('Rozkład częstości dla szlifów diamentów dla zbioru diamonds')`
			`plt.xlabel('Szlif')`
			`plt.ylabel('Liczba wystąpień')`
			`plt.show()`

			`# %%`

			`plt.figure(figsize=(8, 6))`
			`diamonds_train['cut'].value_counts().plot(kind='bar')`
			`plt.title('Rozkład częstości dla szlifów diamentów dla zbioru diamonds tranującego')`
			`plt.xlabel('Szlif')`
			`plt.ylabel('Liczba wystąpień')`
			`plt.show()`

			`# %%`
			`import matplotlib.pyplot as plt`

			`plt.figure(figsize=(8, 6))`
			`diamonds_test['cut'].value_counts().plot(kind='bar')`
			`plt.title('Rozkład częstości dla szlifów diamentów dla zbioru diamonds testowego')`
			`plt.xlabel('Szlif')`
			`plt.ylabel('Liczba wystąpień')`
			`plt.show()`

			`# %%`
			`import matplotlib.pyplot as plt`

			`plt.figure(figsize=(8, 6))`
			`diamonds_dev['cut'].value_counts().plot(kind='bar')`
			`plt.title('Rozkład częstości dla szlifów diamentów dla zbioru diamonds walidacyjnego')`
			`plt.xlabel('Szlif')`
			`plt.ylabel('Liczba wystąpień')`
			`plt.show()`

			`# %%`
			`diamonds[["cut","carat"]].groupby("cut").std()`

			`# %%`
			`diamonds[["cut","carat"]].groupby("cut").mean().plot(kind="bar")`

			`# %%`
			`#normalizacja wartości typu float do zakrsu 0.0 - 1.0`
			`#Powyżej wykonano jeszcze konwersję danych typu string na lowerCase`

			`from sklearn.preprocessing import MinMaxScaler`
			`scaler = MinMaxScaler()`
			`diamonds[['carat', 'depth', 'table', 'price', 'x', 'y', 'z']] = scaler.fit_transform(diamonds[['carat', 'depth', 'table', 'price', 'x', 'y', 'z']])`

			`#wyświetlenie zbioru`
			`diamonds`

			`# %%`
			`# Usuwanie artefaktów`
			`diamonds = diamonds.dropna() ## usuwanie pustych wierszy, które posiadają przynajmniej jedno wystąpienie NULL or NaN`
			`diamonds`