Add script for lab2

Wojciech Jarmosz 2021-03-20 20:55:55 +01:00
commit d8b9b6f6db
4 changed files with 230 additions and 0 deletions

.gitignore (new file, 152 lines)

@@ -0,0 +1,152 @@
# Created by https://www.toptal.com/developers/gitignore/api/python
# Edit at https://www.toptal.com/developers/gitignore?templates=python
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
pytestdebug.log
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
doc/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
#poetry.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
# .env
.env/
.venv/
env/
venv/
ENV/
env.bak/
venv.bak/
pythonenv*
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# operating system-related files
# file properties cache/storage on macOS
*.DS_Store
# thumbnail cache on Windows
Thumbs.db
# profiling data
.prof
env
# End of https://www.toptal.com/developers/gitignore/api/python

README (new file, 7 lines)

@@ -0,0 +1,7 @@
# Installing the script:
```
python3 -m venv env
source ./env/bin/activate
pip install -r requirements.txt
python3 script.py
```
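
The script downloads its dataset through the Kaggle API, which reads credentials from `~/.kaggle/kaggle.json` (or from the `KAGGLE_USERNAME` and `KAGGLE_KEY` environment variables). Before the first run, put an API token generated in your Kaggle account settings in place, for example (the token path below is a placeholder):
```
mkdir -p ~/.kaggle
cp /path/to/kaggle.json ~/.kaggle/
chmod 600 ~/.kaggle/kaggle.json
```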

requirements.txt (new file, 19 lines)

@@ -0,0 +1,19 @@
certifi==2020.12.5
chardet==4.0.0
idna==2.10
joblib==1.0.1
kaggle==1.5.12
numpy==1.20.1
pandas==1.2.3
python-dateutil==2.8.1
python-slugify==4.0.1
pytz==2021.1
requests==2.25.1
scikit-learn==0.24.1
scipy==1.6.1
six==1.15.0
sklearn==0.0
text-unidecode==1.3
threadpoolctl==2.1.0
tqdm==4.59.0
urllib3==1.26.4

script.py (new file, 52 lines)

@@ -0,0 +1,52 @@
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import kaggle

# Authenticate with the Kaggle API and download the dataset,
# unzipping it into the current directory.
kaggle.api.authenticate()
kaggle.api.dataset_download_files('ruchi798/movies-on-netflix-prime-video-hulu-and-disney', path='.', unzip=True)
# Read the data.
film_data = pd.read_csv('MoviesOnStreamingPlatforms_updated.csv')

# Drop rows with missing values.
film_data.dropna(inplace=True)
# Drop the unneeded leading columns (drop() returns a new frame,
# so the result has to be assigned back for the call to take effect).
film_data = film_data.drop(film_data.columns[[0, 1]], axis=1)
# Normalization: lowercase the text columns, scale the float columns to [0, 1],
# and sort the comma-separated values inside each cell.
for col_name in ['Title', 'Directors', 'Genres', 'Country', 'Language']:
    film_data[col_name] = film_data[col_name].str.lower()
for col_name in ['Directors', 'Genres', 'Country', 'Language']:
    film_data[col_name] = film_data[col_name].str.split(',').map(lambda x: ','.join(sorted(x)))
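# MinMaxScaler rescales each column independently via
# x' = (x - x.min()) / (x.max() - x.min()),
# mapping the IMDb scores and runtimes onto [0, 1].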
scaler = preprocessing.MinMaxScaler()
film_data[['IMDb', 'Runtime']] = scaler.fit_transform(film_data[['IMDb', 'Runtime']])
# Split the dataset into train, dev and test sets with an 8:1:1 ratio.
train_ratio = 0.8
validation_ratio = 0.1
test_ratio = 0.1
film_train, film_test = train_test_split(film_data, test_size=1 - train_ratio)
film_valid, film_test = train_test_split(film_test, test_size=test_ratio / (test_ratio + validation_ratio))
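# The first call holds out 20% of the rows; the second splits that holdout in
# half, since test_ratio / (test_ratio + validation_ratio) = 0.5, giving the
# 8:1:1 proportion. Passing a fixed random_state to both calls would make the
# split reproducible (an optional tweak, not part of the original script).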
# Statistics for the full dataset and each subset.
subsets = [("Full dataset", film_data), ("Training set", film_train),
           ("Validation set", film_valid), ("Test set", film_test)]
for label, data_set in subsets:
    print(label)
    print(len(data_set))
    print(data_set.describe().loc[['count', 'mean', 'max', 'min', 'std', '50%']])
    for name in data_set:
        print(data_set[name].value_counts())