ium_444380/Dane.ipynb

189 KiB
Raw Blame History

!pip install --user kaggle pandas seaborn
Requirement already satisfied: kaggle in /home/students/s444380/.local/lib/python3.7/site-packages (1.5.12)
Requirement already satisfied: pandas in /usr/lib/python3/dist-packages (0.23.3+dfsg)
Requirement already satisfied: seaborn in /usr/local/lib/python3.7/dist-packages (0.11.2)
Requirement already satisfied: python-dateutil in /usr/local/lib/python3.7/dist-packages (from kaggle) (2.8.1)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.7/dist-packages (from kaggle) (1.26.3)
Requirement already satisfied: certifi in /usr/local/lib/python3.7/dist-packages (from kaggle) (2020.12.5)
Requirement already satisfied: python-slugify in /home/students/s444380/.local/lib/python3.7/site-packages (from kaggle) (6.1.1)
Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from kaggle) (4.59.0)
Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.7/dist-packages (from kaggle) (1.15.0)
Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from kaggle) (2.25.1)
Requirement already satisfied: scipy>=1.0 in /usr/local/lib/python3.7/dist-packages (from seaborn) (1.7.3)
Requirement already satisfied: numpy>=1.15 in /usr/local/lib/python3.7/dist-packages (from seaborn) (1.21.5)
Requirement already satisfied: matplotlib>=2.2 in /usr/lib/python3/dist-packages (from seaborn) (3.0.2)
Requirement already satisfied: text-unidecode>=1.3 in /home/students/s444380/.local/lib/python3.7/site-packages (from python-slugify->kaggle) (1.3)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->kaggle) (2.10)
Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->kaggle) (4.0.0)
# The faculty Jupyter server does not recognize the `kaggle` command for some reason, so the data was downloaded manually.
# NOTE(review): possibly because `pip install --user` puts the `kaggle` script in ~/.local/bin, which may not be on PATH — TODO confirm.
# !kaggle datasets download -d AnalyzeBoston/crimes-in-boston
# Extract the manually downloaded archive; -o overwrites existing files without prompting.
!unzip -o archive.zip
Archive:  archive.zip
  inflating: crime.csv               
  inflating: offense_codes.csv       
# Re-encode the raw CSV from Windows-1252 to UTF-8 and write the result to crime_conv.csv.
!iconv -f "windows-1252" -t "UTF-8" crime.csv > crime_conv.csv
# Preview the first five lines of the converted file.
!head -n 5 crime_conv.csv
INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
I182070945,00619,Larceny,LARCENY ALL OTHERS,D14,808,,2018-09-02 13:00:00,2018,9,Sunday,13,Part One,LINCOLN ST,42.35779134,-71.13937053,"(42.35779134, -71.13937053)"
I182070943,01402,Vandalism,VANDALISM,C11,347,,2018-08-21 00:00:00,2018,8,Tuesday,0,Part Two,HECLA ST,42.30682138,-71.06030035,"(42.30682138, -71.06030035)"
I182070941,03410,Towed,TOWED MOTOR VEHICLE,D4,151,,2018-09-03 19:27:00,2018,9,Monday,19,Part Three,CAZENOVE ST,42.34658879,-71.07242943,"(42.34658879, -71.07242943)"
I182070940,03114,Investigate Property,INVESTIGATE PROPERTY,D4,272,,2018-09-03 21:16:00,2018,9,Monday,21,Part Three,NEWCOMB ST,42.33418175,-71.07866441,"(42.33418175, -71.07866441)"
import pandas as pd

# Load the UTF-8 re-encoded incident file into a DataFrame and display it.
crime = pd.read_csv(filepath_or_buffer="crime_conv.csv")
crime
INCIDENT_NUMBER OFFENSE_CODE OFFENSE_CODE_GROUP OFFENSE_DESCRIPTION DISTRICT REPORTING_AREA SHOOTING OCCURRED_ON_DATE YEAR MONTH DAY_OF_WEEK HOUR UCR_PART STREET Lat Long Location
0 I182070945 619 Larceny LARCENY ALL OTHERS D14 808 NaN 2018-09-02 13:00:00 2018 9 Sunday 13 Part One LINCOLN ST 42.357791 -71.139371 (42.35779134, -71.13937053)
1 I182070943 1402 Vandalism VANDALISM C11 347 NaN 2018-08-21 00:00:00 2018 8 Tuesday 0 Part Two HECLA ST 42.306821 -71.060300 (42.30682138, -71.06030035)
2 I182070941 3410 Towed TOWED MOTOR VEHICLE D4 151 NaN 2018-09-03 19:27:00 2018 9 Monday 19 Part Three CAZENOVE ST 42.346589 -71.072429 (42.34658879, -71.07242943)
3 I182070940 3114 Investigate Property INVESTIGATE PROPERTY D4 272 NaN 2018-09-03 21:16:00 2018 9 Monday 21 Part Three NEWCOMB ST 42.334182 -71.078664 (42.33418175, -71.07866441)
4 I182070938 3114 Investigate Property INVESTIGATE PROPERTY B3 421 NaN 2018-09-03 21:05:00 2018 9 Monday 21 Part Three DELHI ST 42.275365 -71.090361 (42.27536542, -71.09036101)
5 I182070936 3820 Motor Vehicle Accident Response M/V ACCIDENT INVOLVING PEDESTRIAN - INJURY C11 398 NaN 2018-09-03 21:09:00 2018 9 Monday 21 Part Three TALBOT AVE 42.290196 -71.071590 (42.29019621, -71.07159012)
6 I182070933 724 Auto Theft AUTO THEFT B2 330 NaN 2018-09-03 21:25:00 2018 9 Monday 21 Part One NORMANDY ST 42.306072 -71.082733 (42.30607218, -71.08273260)
7 I182070932 3301 Verbal Disputes VERBAL DISPUTE B2 584 NaN 2018-09-03 20:39:37 2018 9 Monday 20 Part Three LAWN ST 42.327016 -71.105551 (42.32701648, -71.10555088)
8 I182070931 301 Robbery ROBBERY - STREET C6 177 NaN 2018-09-03 20:48:00 2018 9 Monday 20 Part One MASSACHUSETTS AVE 42.331521 -71.070853 (42.33152148, -71.07085307)
9 I182070929 3301 Verbal Disputes VERBAL DISPUTE C11 364 NaN 2018-09-03 20:38:00 2018 9 Monday 20 Part Three LESLIE ST 42.295147 -71.058608 (42.29514664, -71.05860832)
10 I182070928 3301 Verbal Disputes VERBAL DISPUTE C6 913 NaN 2018-09-03 19:55:00 2018 9 Monday 19 Part Three OCEAN VIEW DR 42.319579 -71.040328 (42.31957856, -71.04032766)
11 I182070927 3114 Investigate Property INVESTIGATE PROPERTY C6 936 NaN 2018-09-03 20:19:00 2018 9 Monday 20 Part Three DALESSIO CT 42.340115 -71.053390 (42.34011469, -71.05339029)
12 I182070923 3108 Fire Related Reports FIRE REPORT - HOUSE, BUILDING, ETC. D4 139 NaN 2018-09-03 19:58:00 2018 9 Monday 19 Part Three MARLBOROUGH ST 42.350388 -71.087853 (42.35038760, -71.08785290)
13 I182070922 2647 Other THREATS TO DO BODILY HARM B3 429 NaN 2018-09-03 20:39:00 2018 9 Monday 20 Part Two WOODROW AVE 42.286470 -71.087147 (42.28647012, -71.08714661)
14 I182070921 3201 Property Lost PROPERTY - LOST B3 469 NaN 2018-09-02 14:00:00 2018 9 Sunday 14 Part Three MULVEY ST 42.279241 -71.096674 (42.27924052, -71.09667382)
15 I182070920 3006 Medical Assistance SICK/INJURED/MEDICAL - PERSON NaN NaN 2018-09-03 19:43:00 2018 9 Monday 19 Part Three NaN 42.352875 -71.073830 (42.35287456, -71.07382970)
16 I182070919 3301 Verbal Disputes VERBAL DISPUTE C11 341 NaN 2018-09-03 18:52:00 2018 9 Monday 18 Part Three STONEHURST ST 42.305264 -71.066838 (42.30526428, -71.06683755)
17 I182070918 3305 Assembly or Gathering Violations DEMONSTRATIONS/RIOT D4 130 NaN 2018-09-03 17:00:00 2018 9 Monday 17 Part Three HUNTINGTON AVE 42.348577 -71.077720 (42.34857652, -71.07772012)
18 I182070917 2647 Other THREATS TO DO BODILY HARM B2 901 NaN 2018-09-03 19:52:00 2018 9 Monday 19 Part Two HORADAN WAY 42.333717 -71.096658 (42.33371742, -71.09665806)
19 I182070915 614 Larceny From Motor Vehicle LARCENY THEFT FROM MV - NON-ACCESSORY B2 181 NaN 2018-09-02 18:00:00 2018 9 Sunday 18 Part One SHIRLEY ST 42.325695 -71.068168 (42.32569490, -71.06816778)
20 I182070913 3006 Medical Assistance SICK/INJURED/MEDICAL - PERSON NaN NaN 2018-09-03 18:46:00 2018 9 Monday 18 Part Three WOLCOTT -1.000000 -1.000000 (-1.00000000, -1.00000000)
21 I182070911 3801 Motor Vehicle Accident Response M/V ACCIDENT - OTHER A1 69 NaN 2018-09-03 18:30:00 2018 9 Monday 18 Part Three BEACON ST 42.355644 -71.071681 (42.35564426, -71.07168077)
22 I182070910 3006 Medical Assistance SICK/INJURED/MEDICAL - PERSON B3 434 NaN 2018-09-03 18:42:00 2018 9 Monday 18 Part Three CAPEN ST 42.283402 -71.080797 (42.28340243, -71.08079740)
23 I182070909 3803 Motor Vehicle Accident Response M/V ACCIDENT - PERSONAL INJURY E5 550 NaN 2018-09-03 18:33:00 2018 9 Monday 18 Part Three WASHINGTON ST 42.275818 -71.139913 (42.27581799, -71.13991259)
24 I182070908 522 Residential Burglary BURGLARY - RESIDENTIAL - NO FORCE B2 911 NaN 2018-09-03 18:38:00 2018 9 Monday 18 Part One ANNUNCIATION RD 42.335062 -71.093168 (42.33506218, -71.09316781)
25 I182070906 3831 Motor Vehicle Accident Response M/V - LEAVING SCENE - PROPERTY DAMAGE NaN NaN 2018-09-03 18:20:00 2018 9 Monday 18 Part Three NaN 42.283593 -71.055657 (42.28359328, -71.05565683)
26 I182070905 3006 Medical Assistance SICK/INJURED/MEDICAL - PERSON D4 172 NaN 2018-09-03 18:50:00 2018 9 Monday 18 Part Three MASSACHUSETTS AVE 42.333112 -71.072764 (42.33311189, -71.07276370)
27 I182070904 802 Simple Assault ASSAULT SIMPLE - BATTERY C11 242 NaN 2018-09-03 18:34:00 2018 9 Monday 18 Part Two ANNAPOLIS ST 42.317319 -71.061509 (42.31731905, -71.06150882)
28 I182070904 2007 Restraining Order Violations VIOL. OF RESTRAINING ORDER W NO ARREST C11 242 NaN 2018-09-03 18:34:00 2018 9 Monday 18 Part Two ANNAPOLIS ST 42.317319 -71.061509 (42.31731905, -71.06150882)
29 I182070903 2900 Other VAL - VIOLATION OF AUTO LAW - OTHER B3 463 NaN 2018-09-03 18:55:00 2018 9 Monday 18 Part Two BLUE HILL AVE 42.295904 -71.087733 (42.29590385, -71.08773294)
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
319043 I110551302-00 3125 Warrant Arrests WARRANT ARREST D4 171 NaN 2015-07-22 22:00:00 2015 7 Wednesday 22 Part Three HARRISON AVE 42.335560 -71.074364 (42.33555954, -71.07436364)
319044 I110551302-00 623 Larceny LARCENY SHOPLIFTING $50 TO $199 D4 171 NaN 2015-07-22 22:00:00 2015 7 Wednesday 22 Part One HARRISON AVE 42.335560 -71.074364 (42.33555954, -71.07436364)
319045 I110372326-00 403 Aggravated Assault ASSAULT & BATTERY D/W - OTHER A1 97 NaN 2016-06-14 09:40:00 2016 6 Tuesday 9 Part One SCHOOL ST 42.357428 -71.058326 (42.35742837, -71.05832551)
319046 I110372326-00 3125 Warrant Arrests WARRANT ARREST A1 97 NaN 2016-06-14 09:40:00 2016 6 Tuesday 9 Part Three SCHOOL ST 42.357428 -71.058326 (42.35742837, -71.05832551)
319047 I110261417-00 3125 Warrant Arrests WARRANT ARREST B2 324 NaN 2016-07-29 00:00:00 2016 7 Friday 0 Part Three BOWDOIN ST 42.307038 -71.066153 (42.30703835, -71.06615319)
319048 I110261417-00 619 Larceny LARCENY OTHER $200 & OVER B2 324 NaN 2016-07-29 00:00:00 2016 7 Friday 0 Part One BOWDOIN ST 42.307038 -71.066153 (42.30703835, -71.06615319)
319049 I110177502-00 3125 Warrant Arrests WARRANT ARREST B2 318 NaN 2015-10-02 21:00:00 2015 10 Friday 21 Part Three HOMESTEAD ST 42.311277 -71.089093 (42.31127726, -71.08909334)
319050 I110177502-00 802 Simple Assault ASSAULT & BATTERY B2 318 NaN 2015-10-02 21:00:00 2015 10 Friday 21 Part Two HOMESTEAD ST 42.311277 -71.089093 (42.31127726, -71.08909334)
319051 I110177502-00 3125 Warrant Arrests WARRANT ARREST B2 318 NaN 2015-10-02 21:00:00 2015 10 Friday 21 Part Three HOMESTEAD ST 42.311277 -71.089093 (42.31127726, -71.08909334)
319052 I100636670-00 629 Larceny LARCENY OTHER $50 TO $199 D4 285 NaN 2016-06-05 17:23:00 2016 6 Sunday 17 Part One COVENTRY ST 42.336951 -71.085748 (42.33695098, -71.08574813)
319053 I100636670-00 3125 Warrant Arrests WARRANT ARREST D4 285 NaN 2016-06-05 17:23:00 2016 6 Sunday 17 Part Three COVENTRY ST 42.336951 -71.085748 (42.33695098, -71.08574813)
319054 I100340225-00 3125 Warrant Arrests WARRANT ARREST A1 77 NaN 2015-07-27 10:47:00 2015 7 Monday 10 Part Three BOWDOIN SQ 42.361645 -71.062299 (42.36164502, -71.06229949)
319055 I100340225-00 339 Robbery ROBBERY - UNARMED - STREET A1 77 NaN 2015-07-27 10:47:00 2015 7 Monday 10 Part One BOWDOIN SQ 42.361645 -71.062299 (42.36164502, -71.06229949)
319056 I100222105-02 3125 Warrant Arrests WARRANT ARREST E13 572 NaN 2015-08-03 16:22:00 2015 8 Monday 16 Part Three COLUMBUS AVE 42.313628 -71.095603 (42.31362799, -71.09560307)
319057 I100033064-00 2907 Violations VAL - OPERATING AFTER REV/SUSP. B2 304 NaN 2016-07-29 18:20:00 2016 7 Friday 18 Part Two SLAYTON WAY 42.321770 -71.097798 (42.32177032, -71.09779774)
319058 I100033064-00 2910 Violations VAL - OPERATING AFTER REV/SUSP. B2 304 NaN 2016-07-29 18:20:00 2016 7 Friday 18 Part Two SLAYTON WAY 42.321770 -71.097798 (42.32177032, -71.09779774)
319059 I090321958-00 3125 Warrant Arrests WARRANT ARREST C11 355 NaN 2016-02-01 01:43:00 2016 2 Monday 1 Part Three GENEVA AVE NaN NaN (0.00000000, 0.00000000)
319060 I090321958-00 3125 Warrant Arrests WARRANT ARREST C11 355 NaN 2016-02-01 01:43:00 2016 2 Monday 1 Part Three GENEVA AVE NaN NaN (0.00000000, 0.00000000)
319061 I090317057-00 403 Aggravated Assault ASSAULT & BATTERY D/W - OTHER B3 458 NaN 2015-11-20 11:15:00 2015 11 Friday 11 Part One BLUE HILL AVE 42.301897 -71.085549 (42.30189690, -71.08554944)
319062 I090317057-00 3125 Warrant Arrests WARRANT ARREST B3 458 NaN 2015-11-20 11:15:00 2015 11 Friday 11 Part Three BLUE HILL AVE 42.301897 -71.085549 (42.30189690, -71.08554944)
319063 I080542626-00 3125 Warrant Arrests WARRANT ARREST A1 111 NaN 2015-08-12 12:00:00 2015 8 Wednesday 12 Part Three BOYLSTON ST 42.352312 -71.063705 (42.35231190, -71.06370510)
319064 I080542626-00 1848 Drug Violation DRUGS - POSS CLASS B - INTENT TO MFR DIST DISP A1 111 NaN 2015-08-12 12:00:00 2015 8 Wednesday 12 Part Two BOYLSTON ST 42.352312 -71.063705 (42.35231190, -71.06370510)
319065 I080542626-00 1849 Drug Violation DRUGS - POSS CLASS B - COCAINE, ETC. A1 111 NaN 2015-08-12 12:00:00 2015 8 Wednesday 12 Part Two BOYLSTON ST 42.352312 -71.063705 (42.35231190, -71.06370510)
319066 I060168073-00 1864 Drug Violation DRUGS - POSS CLASS D - INTENT MFR DIST DISP E13 912 NaN 2018-01-27 14:01:00 2018 1 Saturday 14 Part Two CENTRE ST 42.322838 -71.100967 (42.32283759, -71.10096723)
319067 I060168073-00 3125 Warrant Arrests WARRANT ARREST E13 912 NaN 2018-01-27 14:01:00 2018 1 Saturday 14 Part Three CENTRE ST 42.322838 -71.100967 (42.32283759, -71.10096723)
319068 I050310906-00 3125 Warrant Arrests WARRANT ARREST D4 285 NaN 2016-06-05 17:25:00 2016 6 Sunday 17 Part Three COVENTRY ST 42.336951 -71.085748 (42.33695098, -71.08574813)
319069 I030217815-08 111 Homicide MURDER, NON-NEGLIGIENT MANSLAUGHTER E18 520 NaN 2015-07-09 13:38:00 2015 7 Thursday 13 Part One RIVER ST 42.255926 -71.123172 (42.25592648, -71.12317207)
319070 I030217815-08 3125 Warrant Arrests WARRANT ARREST E18 520 NaN 2015-07-09 13:38:00 2015 7 Thursday 13 Part Three RIVER ST 42.255926 -71.123172 (42.25592648, -71.12317207)
319071 I010370257-00 3125 Warrant Arrests WARRANT ARREST E13 569 NaN 2016-05-31 19:35:00 2016 5 Tuesday 19 Part Three NEW WASHINGTON ST 42.302333 -71.111565 (42.30233307, -71.11156487)
319072 142052550 3125 Warrant Arrests WARRANT ARREST D4 903 NaN 2015-06-22 00:12:00 2015 6 Monday 0 Part Three WASHINGTON ST 42.333839 -71.080290 (42.33383935, -71.08029038)

319073 rows × 17 columns

# Summary statistics for every column, numeric and non-numeric alike (include="all").
crime.describe(include="all")
INCIDENT_NUMBER OFFENSE_CODE OFFENSE_CODE_GROUP OFFENSE_DESCRIPTION DISTRICT REPORTING_AREA SHOOTING OCCURRED_ON_DATE YEAR MONTH DAY_OF_WEEK HOUR UCR_PART STREET Lat Long Location
count 319073 319073.000000 319073 319073 317308 319073 1019 319073 319073.000000 319073.000000 319073 319073.000000 318983 308202 299074.000000 299074.000000 319073
unique 282517 NaN 67 244 12 879 1 233229 NaN NaN 7 NaN 4 4657 NaN NaN 18194
top I162030584 NaN Motor Vehicle Accident Response SICK/INJURED/MEDICAL - PERSON B2 Y 2017-06-01 00:00:00 NaN NaN Friday NaN Part Three WASHINGTON ST NaN NaN (0.00000000, 0.00000000)
freq 13 NaN 37132 18783 49945 20250 1019 29 NaN NaN 48495 NaN 158553 14194 NaN NaN 19999
mean NaN 2317.546956 NaN NaN NaN NaN NaN NaN 2016.560586 6.609719 NaN 13.118205 NaN NaN 42.214381 -70.908272 NaN
std NaN 1185.285543 NaN NaN NaN NaN NaN NaN 0.996344 3.273691 NaN 6.294205 NaN NaN 2.159766 3.493618 NaN
min NaN 111.000000 NaN NaN NaN NaN NaN NaN 2015.000000 1.000000 NaN 0.000000 NaN NaN -1.000000 -71.178674 NaN
25% NaN 1001.000000 NaN NaN NaN NaN NaN NaN 2016.000000 4.000000 NaN 9.000000 NaN NaN 42.297442 -71.097135 NaN
50% NaN 2907.000000 NaN NaN NaN NaN NaN NaN 2017.000000 7.000000 NaN 14.000000 NaN NaN 42.325538 -71.077524 NaN
75% NaN 3201.000000 NaN NaN NaN NaN NaN NaN 2017.000000 9.000000 NaN 18.000000 NaN NaN 42.348624 -71.062467 NaN
max NaN 3831.000000 NaN NaN NaN NaN NaN NaN 2018.000000 12.000000 NaN 23.000000 NaN NaN 42.395042 -1.000000 NaN
# Most SHOOTING values are missing, so a missing value is assumed to mean
# "no shooting". Assign the result back instead of calling an in-place fillna
# on a column selection, which may operate on a temporary copy.
crime["SHOOTING"] = crime["SHOOTING"].fillna("N")

# The Location column only repeats the values already present in Lat and Long.
crime = crime.drop(columns=["Location"])

# Drop rows with invalid or missing coordinates: the -1 placeholder values and
# NaN both fail these comparisons. .copy() makes the filtered result an
# independent frame, so the column assignments below do not trigger
# SettingWithCopyWarning.
crime = crime[(crime["Lat"] > 35) & (crime["Long"] < -65)].copy()

# Lowercase the text columns.
crime["OFFENSE_CODE_GROUP"] = crime["OFFENSE_CODE_GROUP"].str.lower()
crime["OFFENSE_DESCRIPTION"] = crime["OFFENSE_DESCRIPTION"].str.lower()
crime["DAY_OF_WEEK"] = crime["DAY_OF_WEEK"].str.lower()
crime["UCR_PART"] = crime["UCR_PART"].str.lower()
crime["STREET"] = crime["STREET"].str.lower()

# Drop the remaining rows that contain nulls. dropna() returns a new frame, so
# the result must be assigned back; otherwise the null rows stay in `crime`
# and end up in the train/dev/test splits.
crime = crime.dropna()
crime
INCIDENT_NUMBER OFFENSE_CODE OFFENSE_CODE_GROUP OFFENSE_DESCRIPTION DISTRICT REPORTING_AREA SHOOTING OCCURRED_ON_DATE YEAR MONTH DAY_OF_WEEK HOUR UCR_PART STREET Lat Long
0 I182070945 619 larceny larceny all others D14 808 N 2018-09-02 13:00:00 2018 9 sunday 13 part one lincoln st 42.357791 -71.139371
1 I182070943 1402 vandalism vandalism C11 347 N 2018-08-21 00:00:00 2018 8 tuesday 0 part two hecla st 42.306821 -71.060300
2 I182070941 3410 towed towed motor vehicle D4 151 N 2018-09-03 19:27:00 2018 9 monday 19 part three cazenove st 42.346589 -71.072429
3 I182070940 3114 investigate property investigate property D4 272 N 2018-09-03 21:16:00 2018 9 monday 21 part three newcomb st 42.334182 -71.078664
4 I182070938 3114 investigate property investigate property B3 421 N 2018-09-03 21:05:00 2018 9 monday 21 part three delhi st 42.275365 -71.090361
5 I182070936 3820 motor vehicle accident response m/v accident involving pedestrian - injury C11 398 N 2018-09-03 21:09:00 2018 9 monday 21 part three talbot ave 42.290196 -71.071590
6 I182070933 724 auto theft auto theft B2 330 N 2018-09-03 21:25:00 2018 9 monday 21 part one normandy st 42.306072 -71.082733
7 I182070932 3301 verbal disputes verbal dispute B2 584 N 2018-09-03 20:39:37 2018 9 monday 20 part three lawn st 42.327016 -71.105551
8 I182070931 301 robbery robbery - street C6 177 N 2018-09-03 20:48:00 2018 9 monday 20 part one massachusetts ave 42.331521 -71.070853
9 I182070929 3301 verbal disputes verbal dispute C11 364 N 2018-09-03 20:38:00 2018 9 monday 20 part three leslie st 42.295147 -71.058608
10 I182070928 3301 verbal disputes verbal dispute C6 913 N 2018-09-03 19:55:00 2018 9 monday 19 part three ocean view dr 42.319579 -71.040328
11 I182070927 3114 investigate property investigate property C6 936 N 2018-09-03 20:19:00 2018 9 monday 20 part three dalessio ct 42.340115 -71.053390
12 I182070923 3108 fire related reports fire report - house, building, etc. D4 139 N 2018-09-03 19:58:00 2018 9 monday 19 part three marlborough st 42.350388 -71.087853
13 I182070922 2647 other threats to do bodily harm B3 429 N 2018-09-03 20:39:00 2018 9 monday 20 part two woodrow ave 42.286470 -71.087147
14 I182070921 3201 property lost property - lost B3 469 N 2018-09-02 14:00:00 2018 9 sunday 14 part three mulvey st 42.279241 -71.096674
16 I182070919 3301 verbal disputes verbal dispute C11 341 N 2018-09-03 18:52:00 2018 9 monday 18 part three stonehurst st 42.305264 -71.066838
17 I182070918 3305 assembly or gathering violations demonstrations/riot D4 130 N 2018-09-03 17:00:00 2018 9 monday 17 part three huntington ave 42.348577 -71.077720
18 I182070917 2647 other threats to do bodily harm B2 901 N 2018-09-03 19:52:00 2018 9 monday 19 part two horadan way 42.333717 -71.096658
19 I182070915 614 larceny from motor vehicle larceny theft from mv - non-accessory B2 181 N 2018-09-02 18:00:00 2018 9 sunday 18 part one shirley st 42.325695 -71.068168
21 I182070911 3801 motor vehicle accident response m/v accident - other A1 69 N 2018-09-03 18:30:00 2018 9 monday 18 part three beacon st 42.355644 -71.071681
22 I182070910 3006 medical assistance sick/injured/medical - person B3 434 N 2018-09-03 18:42:00 2018 9 monday 18 part three capen st 42.283402 -71.080797
23 I182070909 3803 motor vehicle accident response m/v accident - personal injury E5 550 N 2018-09-03 18:33:00 2018 9 monday 18 part three washington st 42.275818 -71.139913
24 I182070908 522 residential burglary burglary - residential - no force B2 911 N 2018-09-03 18:38:00 2018 9 monday 18 part one annunciation rd 42.335062 -71.093168
26 I182070905 3006 medical assistance sick/injured/medical - person D4 172 N 2018-09-03 18:50:00 2018 9 monday 18 part three massachusetts ave 42.333112 -71.072764
27 I182070904 802 simple assault assault simple - battery C11 242 N 2018-09-03 18:34:00 2018 9 monday 18 part two annapolis st 42.317319 -71.061509
28 I182070904 2007 restraining order violations viol. of restraining order w no arrest C11 242 N 2018-09-03 18:34:00 2018 9 monday 18 part two annapolis st 42.317319 -71.061509
29 I182070903 2900 other val - violation of auto law - other B3 463 N 2018-09-03 18:55:00 2018 9 monday 18 part two blue hill ave 42.295904 -71.087733
30 I182070901 2907 violations val - operating after rev/susp. B3 428 N 2018-09-03 18:41:00 2018 9 monday 18 part two clarkwood st 42.280137 -71.090798
31 I182070900 2629 harassment harassment B3 464 N 2018-09-03 18:17:00 2018 9 monday 18 part two hansborough st 42.288104 -71.091533
32 I182070898 802 simple assault assault simple - battery C11 351 N 2018-09-03 19:11:00 2018 9 monday 19 part two salisbury park 42.299284 -71.059172
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
319040 I110694557-00 3125 warrant arrests warrant arrest B3 436 N 2016-01-22 09:45:00 2016 1 friday 9 part three withington st 42.288767 -71.072897
319041 I110694557-00 3115 investigate person investigate person B3 436 N 2016-01-22 09:45:00 2016 1 friday 9 part three withington st 42.288767 -71.072897
319043 I110551302-00 3125 warrant arrests warrant arrest D4 171 N 2015-07-22 22:00:00 2015 7 wednesday 22 part three harrison ave 42.335560 -71.074364
319044 I110551302-00 623 larceny larceny shoplifting $50 to $199 D4 171 N 2015-07-22 22:00:00 2015 7 wednesday 22 part one harrison ave 42.335560 -71.074364
319045 I110372326-00 403 aggravated assault assault & battery d/w - other A1 97 N 2016-06-14 09:40:00 2016 6 tuesday 9 part one school st 42.357428 -71.058326
319046 I110372326-00 3125 warrant arrests warrant arrest A1 97 N 2016-06-14 09:40:00 2016 6 tuesday 9 part three school st 42.357428 -71.058326
319047 I110261417-00 3125 warrant arrests warrant arrest B2 324 N 2016-07-29 00:00:00 2016 7 friday 0 part three bowdoin st 42.307038 -71.066153
319048 I110261417-00 619 larceny larceny other $200 & over B2 324 N 2016-07-29 00:00:00 2016 7 friday 0 part one bowdoin st 42.307038 -71.066153
319049 I110177502-00 3125 warrant arrests warrant arrest B2 318 N 2015-10-02 21:00:00 2015 10 friday 21 part three homestead st 42.311277 -71.089093
319050 I110177502-00 802 simple assault assault & battery B2 318 N 2015-10-02 21:00:00 2015 10 friday 21 part two homestead st 42.311277 -71.089093
319051 I110177502-00 3125 warrant arrests warrant arrest B2 318 N 2015-10-02 21:00:00 2015 10 friday 21 part three homestead st 42.311277 -71.089093
319052 I100636670-00 629 larceny larceny other $50 to $199 D4 285 N 2016-06-05 17:23:00 2016 6 sunday 17 part one coventry st 42.336951 -71.085748
319053 I100636670-00 3125 warrant arrests warrant arrest D4 285 N 2016-06-05 17:23:00 2016 6 sunday 17 part three coventry st 42.336951 -71.085748
319054 I100340225-00 3125 warrant arrests warrant arrest A1 77 N 2015-07-27 10:47:00 2015 7 monday 10 part three bowdoin sq 42.361645 -71.062299
319055 I100340225-00 339 robbery robbery - unarmed - street A1 77 N 2015-07-27 10:47:00 2015 7 monday 10 part one bowdoin sq 42.361645 -71.062299
319056 I100222105-02 3125 warrant arrests warrant arrest E13 572 N 2015-08-03 16:22:00 2015 8 monday 16 part three columbus ave 42.313628 -71.095603
319057 I100033064-00 2907 violations val - operating after rev/susp. B2 304 N 2016-07-29 18:20:00 2016 7 friday 18 part two slayton way 42.321770 -71.097798
319058 I100033064-00 2910 violations val - operating after rev/susp. B2 304 N 2016-07-29 18:20:00 2016 7 friday 18 part two slayton way 42.321770 -71.097798
319061 I090317057-00 403 aggravated assault assault & battery d/w - other B3 458 N 2015-11-20 11:15:00 2015 11 friday 11 part one blue hill ave 42.301897 -71.085549
319062 I090317057-00 3125 warrant arrests warrant arrest B3 458 N 2015-11-20 11:15:00 2015 11 friday 11 part three blue hill ave 42.301897 -71.085549
319063 I080542626-00 3125 warrant arrests warrant arrest A1 111 N 2015-08-12 12:00:00 2015 8 wednesday 12 part three boylston st 42.352312 -71.063705
319064 I080542626-00 1848 drug violation drugs - poss class b - intent to mfr dist disp A1 111 N 2015-08-12 12:00:00 2015 8 wednesday 12 part two boylston st 42.352312 -71.063705
319065 I080542626-00 1849 drug violation drugs - poss class b - cocaine, etc. A1 111 N 2015-08-12 12:00:00 2015 8 wednesday 12 part two boylston st 42.352312 -71.063705
319066 I060168073-00 1864 drug violation drugs - poss class d - intent mfr dist disp E13 912 N 2018-01-27 14:01:00 2018 1 saturday 14 part two centre st 42.322838 -71.100967
319067 I060168073-00 3125 warrant arrests warrant arrest E13 912 N 2018-01-27 14:01:00 2018 1 saturday 14 part three centre st 42.322838 -71.100967
319068 I050310906-00 3125 warrant arrests warrant arrest D4 285 N 2016-06-05 17:25:00 2016 6 sunday 17 part three coventry st 42.336951 -71.085748
319069 I030217815-08 111 homicide murder, non-negligient manslaughter E18 520 N 2015-07-09 13:38:00 2015 7 thursday 13 part one river st 42.255926 -71.123172
319070 I030217815-08 3125 warrant arrests warrant arrest E18 520 N 2015-07-09 13:38:00 2015 7 thursday 13 part three river st 42.255926 -71.123172
319071 I010370257-00 3125 warrant arrests warrant arrest E13 569 N 2016-05-31 19:35:00 2016 5 tuesday 19 part three new washington st 42.302333 -71.111565
319072 142052550 3125 warrant arrests warrant arrest D4 903 N 2015-06-22 00:12:00 2015 6 monday 0 part three washington st 42.333839 -71.080290

296421 rows × 16 columns

pip install --user scikit-learn
Requirement already satisfied: scikit-learn in /usr/lib/python3/dist-packages (0.20.2)
Note: you may need to restart the kernel to use updated packages.
from sklearn.model_selection import train_test_split

# The cleaned data set is fairly large (about 300k rows), so an 8:1:1 split is
# used, i.e. roughly 30k rows each for dev and test.
# Step 1: hold out 60k rows; step 2: split the hold-out into test and dev (30k each).
crime_train, crime_test = train_test_split(crime, random_state=1, test_size=60000)
crime_test, crime_dev = train_test_split(crime_test, random_state=1, test_size=30000)
print(crime_train.shape, crime_test.shape, crime_dev.shape, sep="\n")
(238329, 16)
(30000, 16)
(30000, 16)
# Summary statistics of all columns in the training split.
crime_train.describe(include="all")
INCIDENT_NUMBER OFFENSE_CODE OFFENSE_CODE_GROUP OFFENSE_DESCRIPTION DISTRICT REPORTING_AREA SHOOTING OCCURRED_ON_DATE YEAR MONTH DAY_OF_WEEK HOUR UCR_PART STREET Lat Long
count 238329 238329.000000 238329 238329 237585 238329 238329 238329 238329.000000 238329.000000 238329 238329.000000 238253 236876 238329.000000 238329.000000
unique 215646 NaN 66 233 12 878 2 183782 NaN NaN 7 NaN 4 3760 NaN NaN
top I162030584 NaN motor vehicle accident response sick/injured/medical - person B2 111 N 2016-08-01 00:00:00 NaN NaN friday NaN part three washington st NaN NaN
freq 11 NaN 24783 14380 36780 1851 237575 22 NaN NaN 36273 NaN 117492 11305 NaN NaN
mean NaN 2296.198717 NaN NaN NaN NaN NaN NaN 2016.551754 6.615175 NaN 13.129107 NaN NaN 42.322310 -71.082836
std NaN 1182.831284 NaN NaN NaN NaN NaN NaN 1.001031 3.277604 NaN 6.278188 NaN NaN 0.031891 0.029754
min NaN 111.000000 NaN NaN NaN NaN NaN NaN 2015.000000 1.000000 NaN 0.000000 NaN NaN 42.232413 -71.178674
25% NaN 802.000000 NaN NaN NaN NaN NaN NaN 2016.000000 4.000000 NaN 9.000000 NaN NaN 42.297555 -71.097193
50% NaN 2907.000000 NaN NaN NaN NaN NaN NaN 2017.000000 7.000000 NaN 14.000000 NaN NaN 42.325629 -71.077551
75% NaN 3201.000000 NaN NaN NaN NaN NaN NaN 2017.000000 9.000000 NaN 18.000000 NaN NaN 42.348624 -71.062563
max NaN 3831.000000 NaN NaN NaN NaN NaN NaN 2018.000000 12.000000 NaN 23.000000 NaN NaN 42.395042 -70.963676
# Summary statistics of all columns in the test split.
crime_test.describe(include="all")
INCIDENT_NUMBER OFFENSE_CODE OFFENSE_CODE_GROUP OFFENSE_DESCRIPTION DISTRICT REPORTING_AREA SHOOTING OCCURRED_ON_DATE YEAR MONTH DAY_OF_WEEK HOUR UCR_PART STREET Lat Long
count 30000 30000.000000 30000 30000 29925 30000 30000 30000 30000.000000 30000.000000 30000 30000.000000 29995 29836 30000.000000 30000.000000
unique 29560 NaN 62 183 12 869 2 28717 NaN NaN 7 NaN 4 2646 NaN NaN
top I162050329 NaN motor vehicle accident response sick/injured/medical - person B2 111 N 2015-12-16 20:00:00 NaN NaN friday NaN part three washington st NaN NaN
freq 4 NaN 3087 1800 4659 229 29889 6 NaN NaN 4667 NaN 14698 1374 NaN NaN
mean NaN 2289.794500 NaN NaN NaN NaN NaN NaN 2016.560633 6.584300 NaN 13.063833 NaN NaN 42.322205 -71.082903
std NaN 1182.877414 NaN NaN NaN NaN NaN NaN 0.996171 3.295642 NaN 6.281078 NaN NaN 0.031885 0.029818
min NaN 111.000000 NaN NaN NaN NaN NaN NaN 2015.000000 1.000000 NaN 0.000000 NaN NaN 42.233157 -71.176805
25% NaN 802.000000 NaN NaN NaN NaN NaN NaN 2016.000000 4.000000 NaN 9.000000 NaN NaN 42.297344 -71.097374
50% NaN 2906.000000 NaN NaN NaN NaN NaN NaN 2017.000000 7.000000 NaN 14.000000 NaN NaN 42.325474 -71.077720
75% NaN 3201.000000 NaN NaN NaN NaN NaN NaN 2017.000000 9.000000 NaN 18.000000 NaN NaN 42.348610 -71.062570
max NaN 3831.000000 NaN NaN NaN NaN NaN NaN 2018.000000 12.000000 NaN 23.000000 NaN NaN 42.395042 -70.996769
# Summary statistics of all columns in the dev split.
crime_dev.describe(include="all")
INCIDENT_NUMBER OFFENSE_CODE OFFENSE_CODE_GROUP OFFENSE_DESCRIPTION DISTRICT REPORTING_AREA SHOOTING OCCURRED_ON_DATE YEAR MONTH DAY_OF_WEEK HOUR UCR_PART STREET Lat Long
count 30000 30000.000000 30000 30000 29922 30000 30000 30000 30000.000000 30000.000000 30000 30000.000000 29991 29836 30000.000000 30000.000000
unique 29561 NaN 61 189 12 868 2 28778 NaN NaN 7 NaN 4 2689 NaN NaN
top I162010747 NaN motor vehicle accident response investigate person B2 111 N 2017-06-01 00:00:00 NaN NaN thursday NaN part three washington st NaN NaN
freq 4 NaN 3145 1908 4762 258 29882 6 NaN NaN 4425 NaN 14910 1434 NaN NaN
mean NaN 2303.541933 NaN NaN NaN NaN NaN NaN 2016.560700 6.626700 NaN 13.157033 NaN NaN 42.322292 -71.082911
std NaN 1185.561127 NaN NaN NaN NaN NaN NaN 0.998874 3.264563 NaN 6.282363 NaN NaN 0.031804 0.029857
min NaN 111.000000 NaN NaN NaN NaN NaN NaN 2015.000000 1.000000 NaN 0.000000 NaN NaN 42.232656 -71.178674
25% NaN 802.000000 NaN NaN NaN NaN NaN NaN 2016.000000 4.000000 NaN 9.000000 NaN NaN 42.297555 -71.097193
50% NaN 2907.000000 NaN NaN NaN NaN NaN NaN 2017.000000 7.000000 NaN 14.000000 NaN NaN 42.325834 -71.077564
75% NaN 3201.000000 NaN NaN NaN NaN NaN NaN 2017.000000 9.000000 NaN 18.000000 NaN NaN 42.348610 -71.062607
max NaN 3831.000000 NaN NaN NaN NaN NaN NaN 2018.000000 12.000000 NaN 23.000000 NaN NaN 42.395042 -70.996769
# Number of training rows per offense code, most frequent first.
crime_train["OFFENSE_CODE"].value_counts()
3006    14380
3115    14336
1402    11677
3831    11635
802     11229
3301    10373
3410     8577
3114     8462
617      7114
2647     7010
3201     6734
614      6614
613      6250
3125     6060
619      4490
3802     4308
413      3625
3502     3470
1102     3461
2629     3159
3803     3123
3501     2975
3207     2763
724      2676
1106     2466
2610     2461
301      2182
423      2162
520      2039
2900     1973
        ...  
1302        2
2910        2
1002        2
803         2
2672        2
629         2
1866        2
633         2
123         2
770         2
1807        2
627         1
349         1
624         1
112         1
402         1
527         1
637         1
530         1
1620        1
2609        1
404         1
1105        1
547         1
335         1
315         1
714         1
1864        1
1863        1
639         1
Name: OFFENSE_CODE, Length: 215, dtype: int64
# Number of training rows per district, most frequent first.
crime_train["DISTRICT"].value_counts()
B2     36780
C11    32723
D4     30659
B3     26864
A1     25879
C6     17285
D14    15259
E18    13366
E13    13297
A7     10412
E5     10149
A15     4912
Name: DISTRICT, dtype: int64
# Number of training rows per year, most frequent first.
crime_train["YEAR"].value_counts()
2017    74671
2016    73720
2018    48922
2015    41016
Name: YEAR, dtype: int64
# Write each split to its own UTF-8 CSV file, leaving out the DataFrame index column.
crime_train.to_csv("crime_train.csv", index=False, encoding="utf-8")
crime_dev.to_csv("crime_dev.csv", index=False, encoding="utf-8")
crime_test.to_csv("crime_test.csv", index=False, encoding="utf-8")