commit d8b9b6f6db84566fe958c1c6f4ee3a700438346c Author: Wojciech Jarmosz Date: Sat Mar 20 20:55:55 2021 +0100 Add script for lab2 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fc64530 --- /dev/null +++ b/.gitignore @@ -0,0 +1,152 @@ + +# Created by https://www.toptal.com/developers/gitignore/api/python +# Edit at https://www.toptal.com/developers/gitignore?templates=python + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +pytestdebug.log + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +doc/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +#poetry.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +# .env +.env/ +.venv/ +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pythonenv* + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# operating system-related files +*.DS_Store #file properties cache/storage on macOS +Thumbs.db #thumbnail cache on Windows + +# profiling data +.prof + +env +# End of https://www.toptal.com/developers/gitignore/api/python \ No newline at end of file diff --git a/README b/README new file mode 100644 index 0000000..db2dcf9 --- /dev/null +++ b/README @@ -0,0 +1,7 @@ +# Instalacja skryptu: +``` +python3 -m venv env +source ./env/bin/activate +pip install -r requirements.txt +python3 script.py +``` \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..00585a5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,19 @@ +certifi==2020.12.5 +chardet==4.0.0 +idna==2.10 +joblib==1.0.1 +kaggle==1.5.12 +numpy==1.20.1 +pandas==1.2.3 +python-dateutil==2.8.1 +python-slugify==4.0.1 +pytz==2021.1 +requests==2.25.1 +scikit-learn==0.24.1 +scipy==1.6.1 +six==1.15.0 +sklearn==0.0 +text-unidecode==1.3 +threadpoolctl==2.1.0 +tqdm==4.59.0 +urllib3==1.26.4 diff --git a/script.py b/script.py new file mode 100644 index 0000000..f03ec67 --- /dev/null +++ b/script.py @@ -0,0 +1,52 @@ +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn import preprocessing +import kaggle + +kaggle.api.authenticate() + +kaggle.api.dataset_download_files('ruchi798/movies-on-netflix-prime-video-hulu-and-disney', path='.', unzip=True) + +# odczyt danych +film_data = pd.read_csv('MoviesOnStreamingPlatforms_updated.csv') + +# Czyszczenie 
# Drop every row that contains at least one missing value.
film_data.dropna(inplace=True)

# Remove the first two columns (presumably index/ID-like columns -- confirm
# against the CSV header). DataFrame.drop returns a new frame unless
# inplace=True is passed; the earlier bare call threw its result away, so the
# columns were never actually removed.
film_data.drop(columns=film_data.columns[[0, 1]], inplace=True)

# Normalisation: lowercase the text columns, sort the comma-separated items
# inside multi-valued cells, and min-max scale the float columns.
for col_name in ['Title', 'Directors', 'Genres', 'Country', 'Language']:
    film_data[col_name] = film_data[col_name].str.lower()

# Sorting the items makes cells that list the same values in a different
# order compare equal.
for col_name in ['Directors', 'Genres', 'Country', 'Language']:
    film_data[col_name] = (
        film_data[col_name]
        .str.split(',')
        .map(lambda items: ','.join(sorted(items)))
    )

# NOTE(review): the scaler is fitted on the full data set before the split,
# so the train/dev/test subsets share the same min/max statistics.
scaler = preprocessing.MinMaxScaler()
film_data[['IMDb', 'Runtime']] = scaler.fit_transform(film_data[['IMDb', 'Runtime']])

# Split the data set into train, dev and test subsets in an 8:1:1 ratio.
train_ratio = 0.8
validation_ratio = 0.1
test_ratio = 0.1

# First carve off the training part; the remainder is then divided between
# validation and test according to their relative ratios.
# NOTE(review): no random_state is passed, so the split differs between runs.
film_train, film_holdout = train_test_split(film_data, test_size=1 - train_ratio)

film_valid, film_test = train_test_split(
    film_holdout,
    test_size=test_ratio / (test_ratio + validation_ratio),
)

# Statistics of the full data set and of each subset: row count, selected
# describe() rows, and the value counts of every column.
labelled_sets = [
    ("Główny zbiór danych", film_data),
    ("Zbiór trenujący", film_train),
    ("Zbiór walidujący", film_valid),
    ("Zbiór testowy", film_test),
]

for label, data_set in labelled_sets:
    print(label)
    print(len(data_set))
    print(data_set.describe().loc[['count', 'mean', 'max', 'min', 'std', '50%']])
    # Iterating a DataFrame yields its column labels.
    for name in data_set:
        print(data_set[name].value_counts())