319 KiB
319 KiB
Analiza Danych w Pythonie: Pandas
Tomasz Dwojak
16 grudnia 2017
Dlaczego Python?
# Render our plots inline
%matplotlib inline
from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
# Use the 'ggplot' style sheet for every figure drawn below.
matplotlib.style.use('ggplot')
# Default figure size given as (width, height).
plt.rcParams['figure.figsize'] = (15, 5)
import pandas as pd
# Draw 26 random integers from 1 to 19 (the upper bound 20 is exclusive).
losowe = np.random.randint(low=1, high=20, size=26)
print(losowe)
[ 7 7 8 2 18 12 17 1 12 4 8 6 18 3 5 10 3 17 2 9 8 8 12 1 9 16]
Series, czyli szereg
# Wrap the random integers in a Series; with no index given, the values are
# labelled 0, 1, 2, ...
dane = pd.Series(losowe)
print(dane)
0 7 1 7 2 8 3 2 4 18 5 12 6 17 7 1 8 12 9 4 10 8 11 6 12 18 13 3 14 5 15 10 16 3 17 17 18 2 19 9 20 8 21 8 22 12 23 1 24 9 25 16 dtype: int64
# A Series with explicit string labels instead of the default integer index.
dane2 = pd.Series(data=[1, 2, 3, 4, 5], index=list('abcde'))
print(dane2)
a 1 b 2 c 3 d 4 e 5 dtype: int64
# Number of elements, first via len() and then as the one-element shape tuple.
print(len(dane))
print(dane.shape)
26 (26,)
# Peek at both ends of the Series: head() and tail() each show five rows by default.
print(dane.head())
print(dane.tail())
0 7 1 7 2 8 3 2 4 18 dtype: int64 21 8 22 12 23 1 24 9 25 16 dtype: int64
# Basic summary statistics: arithmetic mean and median.
print("Średnia:", dane.mean())
print("Mediana:", dane.median())
Średnia: 8.57692307692 Mediana: 8.0
# Distinct values, in order of first appearance.
print("Zbiór wartości:", dane.unique())
# How many times each value occurs, most frequent first.
print("Zliczanie", dane.value_counts())
# Only the five most frequent values.
print(dane.value_counts().head())
Zbiór wartości: [ 7 8 2 18 12 17 1 4 6 3 5 10 9 16] Zliczanie 8 4 12 3 18 2 17 2 9 2 7 2 3 2 2 2 1 2 16 1 10 1 6 1 5 1 4 1 dtype: int64 8 4 12 3 18 2 17 2 9 2 dtype: int64
# Summary statistics in one call: count, mean, std, min, quartiles and max.
print(dane.describe())
count 26.000000 mean 8.576923 std 5.375300 min 1.000000 25% 4.250000 50% 8.000000 75% 12.000000 max 18.000000 dtype: float64
# Draw a histogram of the values.
dane.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7fb55af72d10>
Indeksowanie
import string
# Label the random values with the uppercase letters A-Z instead of integers.
litery = list(string.ascii_uppercase)
dane3 = pd.Series(losowe, index=litery)
print(dane3.head())
A 7 B 7 C 8 D 2 E 18 dtype: int64
# A single label returns one value.
print(dane3['E'])
# A list of labels returns a Series with those entries, in the order requested.
print(dane3[['P', 'Y', 'T']])
# Label slices include both endpoints, so this yields B, C, D and E.
print(dane3['B':'E'])
18 P 10 Y 9 T 9 dtype: int64 B 7 C 8 D 2 E 18 dtype: int64
Mapowanie
def cube(x):
    """Return ``x`` raised to the third power."""
    return x ** 3
# Apply cube to every element; the result keeps the A-Z index.
print(dane3.map(cube))
A 343 B 343 C 512 D 8 E 5832 F 1728 G 4913 H 1 I 1728 J 64 K 512 L 216 M 5832 N 27 O 125 P 1000 Q 27 R 4913 S 8 T 729 U 512 V 512 W 1728 X 1 Y 729 Z 4096 dtype: int64
DataFrame (ramka danych)
# Build the alphabet twice: once in lowercase, once in uppercase.
male = list(string.ascii_lowercase)
wielkie = list(string.ascii_uppercase)
# Pair the letters position by position: ('a', 'A'), ('b', 'B'), ...
surowe = list(zip(string.ascii_lowercase, string.ascii_uppercase))
print(surowe)
# Each tuple becomes one row; the columns get the default labels 0 and 1.
dane = pd.DataFrame(data=surowe)
print(dane)
[('a', 'A'), ('b', 'B'), ('c', 'C'), ('d', 'D'), ('e', 'E'), ('f', 'F'), ('g', 'G'), ('h', 'H'), ('i', 'I'), ('j', 'J'), ('k', 'K'), ('l', 'L'), ('m', 'M'), ('n', 'N'), ('o', 'O'), ('p', 'P'), ('q', 'Q'), ('r', 'R'), ('s', 'S'), ('t', 'T'), ('u', 'U'), ('v', 'V'), ('w', 'W'), ('x', 'X'), ('y', 'Y'), ('z', 'Z')] 0 1 0 a A 1 b B 2 c C 3 d D 4 e E 5 f F 6 g G 7 h H 8 i I 9 j J 10 k K 11 l L 12 m M 13 n N 14 o O 15 p P 16 q Q 17 r R 18 s S 19 t T 20 u U 21 v V 22 w W 23 x X 24 y Y 25 z Z
# Replace the default 0/1 column labels with descriptive names.
dane.columns = ["małe", "wielkie"]
print(dane.head())
małe wielkie 0 a A 1 b B 2 c C 3 d D 4 e E
# Add a column of random integers from 1 to 19, one per row. The size comes
# from the frame itself rather than a hard-coded 26, so the assignment stays
# valid if the number of rows ever changes.
dane['losowe'] = np.random.randint(1, 20, len(dane))
Wczytywanie danych
# Load bikes.csv; the Date column is parsed as dates and becomes the index.
bike_data = pd.read_csv('bikes.csv', # path to the file
sep=';', # field separator
encoding='latin1', # file encoding
parse_dates=['Date'], # columns that contain dates
dayfirst=True, # dates are written day - month - year
index_col='Date') # use the Date column as the index
# Show the column names and the first five rows.
print(bike_data.columns)
print(bike_data.head())
Index([u'Berri 1', u'Brébeuf (données non disponibles)', u'Côte-Sainte-Catherine', u'Maisonneuve 1', u'Maisonneuve 2', u'du Parc', u'Pierre-Dupuy', u'Rachel1', u'St-Urbain (données non disponibles)'], dtype='object') Berri 1 Brébeuf (données non disponibles) Côte-Sainte-Catherine \ Date 2012-01-01 35 NaN 0 2012-01-02 83 NaN 1 2012-01-03 135 NaN 2 2012-01-04 144 NaN 1 2012-01-05 197 NaN 2 Maisonneuve 1 Maisonneuve 2 du Parc Pierre-Dupuy Rachel1 \ Date 2012-01-01 38 51 26 10 16 2012-01-02 68 153 53 6 43 2012-01-03 104 248 89 3 58 2012-01-04 116 318 111 8 61 2012-01-05 124 330 97 13 95 St-Urbain (données non disponibles) Date 2012-01-01 NaN 2012-01-02 NaN 2012-01-03 NaN 2012-01-04 NaN 2012-01-05 NaN
Odwoływanie się do kolumn
# Selecting one column by name returns it as a Series indexed by Date.
bike_data['Berri 1']
Date 2012-01-01 35 2012-01-02 83 2012-01-03 135 2012-01-04 144 2012-01-05 197 2012-01-06 146 2012-01-07 98 2012-01-08 95 2012-01-09 244 2012-01-10 397 2012-01-11 273 2012-01-12 157 2012-01-13 75 2012-01-14 32 2012-01-15 54 2012-01-16 168 2012-01-17 155 2012-01-18 139 2012-01-19 191 2012-01-20 161 2012-01-21 53 2012-01-22 71 2012-01-23 210 2012-01-24 299 2012-01-25 334 2012-01-26 306 2012-01-27 91 2012-01-28 80 2012-01-29 87 2012-01-30 219 ... 2012-10-07 1580 2012-10-08 1854 2012-10-09 4787 2012-10-10 3115 2012-10-11 3746 2012-10-12 3169 2012-10-13 1783 2012-10-14 587 2012-10-15 3292 2012-10-16 3739 2012-10-17 4098 2012-10-18 4671 2012-10-19 1313 2012-10-20 2011 2012-10-21 1277 2012-10-22 3650 2012-10-23 4177 2012-10-24 3744 2012-10-25 3735 2012-10-26 4290 2012-10-27 1857 2012-10-28 1310 2012-10-29 2919 2012-10-30 2887 2012-10-31 2634 2012-11-01 2405 2012-11-02 1582 2012-11-03 844 2012-11-04 966 2012-11-05 2247 Name: Berri 1, Length: 310, dtype: int64
# Plot a single column over the Date index.
bike_data["Berri 1"].plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7fb576c24c50>
# Plot every column of the frame on one set of axes.
bike_data.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7fb576a9b550>