Docker update (almost working)
This commit is contained in:
parent
0ab7172af9
commit
a3c433f999
@@ -0,0 +1,202 @@
from typing import List, Tuple

import pandas as pd
from functools import lru_cache
import os.path
from collections import OrderedDict


_XLSX_FILENAME = "PL/wyniki_gl_na_kand_po_obwodach.xlsx"
_AREA_CODE_COLNAME = "Kod terytorialny gminy"
_VALID_VOTES_COLNAME = "Liczba kart ważnych"
_VOTERS_ELIGIBLE_TO_VOTE_COLNAME = "Liczba wyborców uprawnionych do głosowania"


@lru_cache(maxsize=1)
def get_xlsx(filename: str) -> pd.ExcelFile:
    return pd.ExcelFile(filename)


def get_data_sheets() -> List[pd.DataFrame]:
    """
    Obtains the data from the constituent sheets of the Excel workbook. Sheet
    data are slow to extract (in this way, at least), so they are saved into a
    set of CSV files first. When these are already available, they are simply
    loaded.

    :return: List of data frames matching the sheets of the Excel file.
    """
    sheet_list_csv_filename = "PL/wyniki_gl_na_kand_po_obwodach_sheet_list.csv"
    sheet_csv_filename = "PL/wyniki_gl_na_kand_po_obwodach_sheet_%s.csv"

    if os.path.exists(sheet_list_csv_filename):
        try:
            df_sheets = pd.read_csv(sheet_list_csv_filename, encoding="utf-8")
            dfs = []
            for sheet_id in df_sheets.id:
                dfs.append(pd.read_csv(sheet_csv_filename % sheet_id))
            return dfs
        except Exception as e:
            print("Error reading saved cleaned data, attempting to "
                  "recreate ...")
            print("  details:", str(e))

    print("Cleaning data...")

    xls = get_xlsx(_XLSX_FILENAME)

    # Save each sheet as a plain CSV for now; a more elegant format can be
    # chosen later, depending on the scenario.
    dfs = []
    sheet_ids = []
    sheet_names = []
    sheet_id = 0
    for sheet_name in xls.sheet_names:
        print("Reading sheet", sheet_name)
        df = pd.read_excel(_XLSX_FILENAME, sheet_name=sheet_name)
        sheet_id += 1
        df.to_csv(sheet_csv_filename % sheet_id, index=False, encoding="utf-8")
        sheet_ids.append(sheet_id)
        sheet_names.append(sheet_name)
        dfs.append(df)

    df_sheets = pd.DataFrame(dict(id=sheet_ids, sheet_names=sheet_names))
    df_sheets.to_csv(sheet_list_csv_filename, index=False, encoding="utf-8")

    return dfs


def get_lista_col_idx(col_name: str) -> int:
    """ Extracts the lista (ballot list) number from a column name, e.g.
    returns 4 for "Lista nr 4 - KW PRAWO I SPRAWIEDLIWOŚĆ". """
    return int(col_name.split(" ")[2])


class MergedDataInfo:

    def __init__(self, area_column, valid_votes_column,
                 nr_of_registered_voters_column, lista_columns):
        self._area_column = area_column
        self._valid_votes_column = valid_votes_column
        self._nr_of_registered_voters_column = nr_of_registered_voters_column
        self._lista_columns = lista_columns

    def __repr__(self):
        return ("MergedDataInfo(\n"
                "  area_column: %s,\n"
                "  valid_votes_column: %s,\n"
                "  nr_of_registered_voters_column: %s,\n"
                "  lista_columns: %s\n"
                ")"
                % (self.area_column, self.valid_votes_column,
                   self.nr_of_registered_voters_column,
                   ", ".join(self.lista_columns))
                )

    area_column = \
        property(lambda self: self._area_column)  # type: str

    valid_votes_column = \
        property(lambda self: self._valid_votes_column)  # type: str

    nr_of_registered_voters_column = \
        property(lambda self: self._nr_of_registered_voters_column)  # type: str

    # Totals per lista.
    lista_columns = \
        property(lambda self: self._lista_columns)  # type: List[str]

    @lru_cache()
    def get_lista_column(self, index: int) -> str:
        for lista_column in self.lista_columns:
            if get_lista_col_idx(lista_column) == index:
                return lista_column
        raise KeyError("No lista column for index %d." % index)


# TODO: add another, per-CSV auto-selected column: that of the most popular
# candidate per lista
def merge_lista_results(dfs,
                        # Listas 7, 8 and 9 looked too sparse to be essential
                        # anyway; TODO: lista 8 currently fails with an error,
                        # hence the exclusion
                        lista_idxs_to_exclude=(8,),
                        return_overview_cols=False):
    """
    Merges the lista results scattered over the sheets into a unified data
    frame.

    :param dfs: The data frames to merge, consisting of standard and "lista"
        (candidate preference) columns.
    :param lista_idxs_to_exclude: (Ballot) numbers (1-based) of the listas to
        exclude.
    :param return_overview_cols: Instructs to additionally return the overview
        columns. These are described by the second member of the returned
        tuple.

    :return: A single data frame if return_overview_cols is False, with the
        basic contents: area code, then lista total columns. Otherwise a
        tuple of a more verbose data frame and a MergedDataInfo to help with
        navigating among its columns.
    """
    print("Merging \"lista\" columns...")
    lista_col_names = set()

    for df in dfs:
        for col in df.columns:
            if col.startswith("Lista nr"):
                lista_col_names.add(col)

    lista_col_names = sorted(lista_col_names, key=get_lista_col_idx)
    lista_col_names = [col for col in lista_col_names
                       if get_lista_col_idx(col)
                       not in lista_idxs_to_exclude]
    cols_to_keep = [_AREA_CODE_COLNAME] + lista_col_names

    if return_overview_cols:
        cols_to_keep += [
            _VALID_VOTES_COLNAME,
            _VOTERS_ELIGIBLE_TO_VOTE_COLNAME,
        ]

    dfs_to_merge = []
    for df in dfs:
        cols_dict = OrderedDict([
            (col_name,
             df[col_name]
             if col_name in df.columns else None)
            for col_name in cols_to_keep
        ])
        dfs_to_merge.append(pd.DataFrame(cols_dict))
    merged = pd.concat(dfs_to_merge)

    # Add the "last digit" columns.
    for col in merged.columns:
        if col.startswith("Lista"):
            merged["ld_" + col] = merged[col] % 10

    if not return_overview_cols:
        return merged
    else:
        info = MergedDataInfo(
            _AREA_CODE_COLNAME,
            _VALID_VOTES_COLNAME,
            # TODO: verify that this is the right column
            _VOTERS_ELIGIBLE_TO_VOTE_COLNAME,
            lista_columns=lista_col_names,
        )
        return merged, info


def get_preprocessed_data() -> Tuple[pd.DataFrame, MergedDataInfo]:
    """ Returns a data frame with the 2019 Polish EP election data, and some
    metadata about it, as the columns are left at their original names.

    Remark: Lista 8 is currently excluded due to a likely preprocessing bug
    causing an exception.

    Expect the following columns:
    - area code
    - total votes per lista (except for Lista 8)
    - total valid votes
    - voters eligible to vote (registered)
    - ld_...: one "last digit" column per lista column, for convenience

    Currently likely the most convenient starting point for analyzing the
    data.
    """
    dfs = get_data_sheets()
    return merge_lista_results(dfs, return_overview_cols=True)
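For orientation, a minimal usage sketch follows; the module's file name is not visible in this diff, so "data_processing" below is an assumed name. The ld_ columns support quick last-digit checks, where roughly uniform digit frequencies are expected for genuine counts:

# Hypothetical usage; the module name "data_processing" is an assumption.
import data_processing

merged, info = data_processing.get_preprocessed_data()

# Look up the totals column of lista 4 via the metadata object ...
lista_4_col = info.get_lista_column(4)

# ... and inspect the relative frequencies of its last digits.
print(merged["ld_" + lista_4_col].value_counts(normalize=True).sort_index())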
13 file diffs suppressed because they are too large
@@ -0,0 +1,14 @@
id,sheet_names
1,Okręg Wyborczy Nr 1
2,Okręg Wyborczy Nr 2
3,Okręg Wyborczy Nr 3
4,Okręg Wyborczy Nr 4
5,Okręg Wyborczy Nr 5
6,Okręg Wyborczy Nr 6
7,Okręg Wyborczy Nr 7
8,Okręg Wyborczy Nr 8
9,Okręg Wyborczy Nr 9
10,Okręg Wyborczy Nr 10
11,Okręg Wyborczy Nr 11
12,Okręg Wyborczy Nr 12
13,Okręg Wyborczy Nr 13
Binary file not shown.
Dockerfile
@@ -1,6 +1,22 @@
# Our image inherits from the latest Ubuntu image
FROM ubuntu:latest

# Install the required dependencies. Note the "-y" (assume yes) flag.
# (pandas and kaggle are not apt packages; they are installed via pip below.)
RUN apt-get update
RUN apt-get install -y python3
RUN apt-get install -y python3-pip
RUN apt-get install -y unzip

RUN pip3 install pandas
RUN pip3 install kaggle
RUN pip3 install scikit-learn

ENV KAGGLE_USERNAME="grzegorzgapiski"
# The API key is intentionally left empty here; supply it at run time.
ENV KAGGLE_KEY=""

WORKDIR /app

COPY ./data_preparation.py ./

# Copy the data set directory itself (not just its contents), so that the
# paths used in data_preparation.py resolve.
COPY ./2019-european-parliament-election-in-poland-data ./2019-european-parliament-election-in-poland-data

# Invoke via python3 explicitly; the script then need not be executable.
CMD python3 ./data_preparation.py
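To try the image out, something along these lines should work; the tag "election-data" is an arbitrary assumption, and the Kaggle API key is better passed at run time than baked into the image:

docker build -t election-data .
docker run --rm -e KAGGLE_KEY="<your-kaggle-api-key>" election-data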
create_dataset_s407409.sh
Normal file
@@ -0,0 +1,66 @@
!pip install --user kaggle
!pip install --user pandas
!pip install --user scikit-learn

!kaggle datasets download -d brezniczky/2019-european-parliament-election-in-poland-data

# Unpack the data
!unzip -o 2019-european-parliament-election-in-poland-data.zip

# Test-load a single data sheet (1 of 13)
import pandas as pd
data_district_1 = pd.read_csv('2019-european-parliament-election-in-poland-data/wyniki_gl_na_kand_po_obwodach_sheet_1.csv')

# Load and process the full data

big_dataset = pd.DataFrame()

# Each of the 13 sheets, holding the data of one electoral district, has to
# be loaded
for i in range(1, 14):
    filename = ('2019-european-parliament-election-in-poland-data/'
                'wyniki_gl_na_kand_po_obwodach_sheet_' + str(i) + '.csv')
    data_district = pd.read_csv(filename)
    df = data_district.copy()
    # Cut away the redundant data: the detailed per-candidate vote counts at
    # the individual polling stations
    cols_to_remove = list(range(35, len(df.columns)))
    df.drop(df.columns[cols_to_remove], axis=1, inplace=True)
    # Include the per-station results of the electoral committees in the
    # aggregate data
    df['Konfederacja'] = data_district['Lista nr 1 - KWW KONFEDERACJA KORWIN BRAUN LIROY NARODOWCY']
    df['Wiosna'] = data_district['Lista nr 2 - KW WIOSNA ROBERTA BIEDRONIA']
    df['Koalicja Europejska'] = data_district['Lista nr 3 - KKW KOALICJA EUROPEJSKA PO PSL SLD .N ZIELONI']
    df['Prawo i Sprawiedliwość'] = data_district['Lista nr 4 - KW PRAWO I SPRAWIEDLIWOŚĆ']
    df['Lewica Razem'] = data_district['Lista nr 5 - KKW LEWICA RAZEM - RAZEM, UNIA PRACY, RSS']
    df['Kukiz15'] = data_district['Lista nr 6 - KWW KUKIZ\'15']
    # Below: three committees that were registered only in some of the
    # electoral districts
    if 'Lista nr 7 - KWW POLSKA FAIR PLAY BEZPARTYJNI GWIAZDOWSKI' in data_district.columns:
        df['Polska Fair Play'] = data_district['Lista nr 7 - KWW POLSKA FAIR PLAY BEZPARTYJNI GWIAZDOWSKI']
    else:
        df['Polska Fair Play'] = 0
    if 'Lista nr 9 - KKW POLEXIT - KOALICJA' in data_district.columns:
        df['POLEXIT'] = data_district['Lista nr 9 - KKW POLEXIT - KOALICJA']
    else:
        df['POLEXIT'] = 0
    if 'Lista nr 10 - KW JEDNOŚĆ NARODU' in data_district.columns:
        df['Jedność Narodu'] = data_district['Lista nr 10 - KW JEDNOŚĆ NARODU']
    else:
        df['Jedność Narodu'] = 0
    # Add a column recording which district a given polling station belongs to
    df['obwod'] = i

    # Append the district's processed data to the main data set
    big_dataset = pd.concat([big_dataset, df])

big_dataset.describe(include='all')

# Split the test set off the data
from sklearn.model_selection import train_test_split
data_train, data_test = train_test_split(big_dataset, test_size=2700, random_state=1)
data_train['obwod'].value_counts()

# Split the validation set off the remaining training data
data_train, data_val = train_test_split(data_train, test_size=2700, random_state=1)
data_val['obwod'].value_counts()
data_preparation.py
Normal file
@@ -0,0 +1,67 @@
#!/usr/bin/env python3

import pandas as pd

# Downloading and unpacking the data is handled outside of this script (the
# Docker image copies the data set in at build time):
# kaggle datasets download -d brezniczky/2019-european-parliament-election-in-poland-data
# unzip -o 2019-european-parliament-election-in-poland-data.zip

# Test-load a single data sheet (1 of 13)
data_district_1 = pd.read_csv('2019-european-parliament-election-in-poland-data/wyniki_gl_na_kand_po_obwodach_sheet_1.csv')

# Load and process the full data

big_dataset = pd.DataFrame()

# Each of the 13 sheets, holding the data of one electoral district, has to
# be loaded
for i in range(1, 14):
    filename = ('2019-european-parliament-election-in-poland-data/'
                'wyniki_gl_na_kand_po_obwodach_sheet_' + str(i) + '.csv')
    data_district = pd.read_csv(filename)
    df = data_district.copy()
    # Cut away the redundant data: the detailed per-candidate vote counts at
    # the individual polling stations
    cols_to_remove = list(range(35, len(df.columns)))
    df.drop(df.columns[cols_to_remove], axis=1, inplace=True)
    # Include the per-station results of the electoral committees in the
    # aggregate data
    df['Konfederacja'] = data_district['Lista nr 1 - KWW KONFEDERACJA KORWIN BRAUN LIROY NARODOWCY']
    df['Wiosna'] = data_district['Lista nr 2 - KW WIOSNA ROBERTA BIEDRONIA']
    df['Koalicja Europejska'] = data_district['Lista nr 3 - KKW KOALICJA EUROPEJSKA PO PSL SLD .N ZIELONI']
    df['Prawo i Sprawiedliwość'] = data_district['Lista nr 4 - KW PRAWO I SPRAWIEDLIWOŚĆ']
    df['Lewica Razem'] = data_district['Lista nr 5 - KKW LEWICA RAZEM - RAZEM, UNIA PRACY, RSS']
    df['Kukiz15'] = data_district['Lista nr 6 - KWW KUKIZ\'15']
    # Below: three committees that were registered only in some of the
    # electoral districts
    if 'Lista nr 7 - KWW POLSKA FAIR PLAY BEZPARTYJNI GWIAZDOWSKI' in data_district.columns:
        df['Polska Fair Play'] = data_district['Lista nr 7 - KWW POLSKA FAIR PLAY BEZPARTYJNI GWIAZDOWSKI']
    else:
        df['Polska Fair Play'] = 0
    if 'Lista nr 9 - KKW POLEXIT - KOALICJA' in data_district.columns:
        df['POLEXIT'] = data_district['Lista nr 9 - KKW POLEXIT - KOALICJA']
    else:
        df['POLEXIT'] = 0
    if 'Lista nr 10 - KW JEDNOŚĆ NARODU' in data_district.columns:
        df['Jedność Narodu'] = data_district['Lista nr 10 - KW JEDNOŚĆ NARODU']
    else:
        df['Jedność Narodu'] = 0
    # Add a column recording which district a given polling station belongs to
    df['obwod'] = i

    # Append the district's processed data to the main data set
    big_dataset = pd.concat([big_dataset, df])

print(big_dataset.describe(include='all'))

# Split the test set off the data
from sklearn.model_selection import train_test_split
data_train, data_test = train_test_split(big_dataset, test_size=2700, random_state=1)
print(data_train['obwod'].value_counts())

# Split the validation set off the remaining training data
data_train, data_val = train_test_split(data_train, test_size=2700, random_state=1)
print(data_val['obwod'].value_counts())
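The script computes the splits but does not yet write them anywhere, so the container run leaves no artifacts behind. A possible continuation; the output file names below are assumptions:

# Hypothetical: persist the splits so the Docker run produces output files.
data_train.to_csv('train.csv', index=False, encoding='utf-8')
data_val.to_csv('val.csv', index=False, encoding='utf-8')
data_test.to_csv('test.csv', index=False, encoding='utf-8')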
formating.py
Normal file
@@ -0,0 +1,5 @@
# Strip carriage returns from data_preparation.py in place, turning Windows
# CRLF line endings into Unix LF ones.
with open('data_preparation.py', 'rb+') as f:
    content = f.read()
    f.seek(0)
    f.write(content.replace(b'\r', b''))
    f.truncate()
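This is presumably needed because data_preparation.py picked up CRLF line endings on Windows: when the script is executed directly, a shebang line ending in a stray carriage return asks the kernel for "python3\r", which does not exist in the Ubuntu container. Running the fix once before building the image should suffice:

python3 formating.py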