Add script for lab2

Wojciech Jarmosz 2021-03-20 20:55:55 +01:00
commit d8b9b6f6db
4 changed files with 230 additions and 0 deletions

.gitignore (new file, 152 lines)

@@ -0,0 +1,152 @@
# Created by https://www.toptal.com/developers/gitignore/api/python
# Edit at https://www.toptal.com/developers/gitignore?templates=python
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
pytestdebug.log
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
doc/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
#poetry.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
# .env
.env/
.venv/
env/
venv/
ENV/
env.bak/
venv.bak/
pythonenv*
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# operating system-related files
# file properties cache/storage on macOS
*.DS_Store
# thumbnail cache on Windows
Thumbs.db
# profiling data
.prof
env
# End of https://www.toptal.com/developers/gitignore/api/python

README (new file, 7 lines)

@@ -0,0 +1,7 @@
# Installing the script:
```
python3 -m venv env
source ./env/bin/activate
pip install -r requirements.txt
python3 script.py
```
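
The script downloads its dataset through the Kaggle API, which reads credentials from `~/.kaggle/kaggle.json` (or from the `KAGGLE_USERNAME` and `KAGGLE_KEY` environment variables). Before the first run, put an API token generated in your Kaggle account settings in place, for example (the token path below is a placeholder):
```
mkdir -p ~/.kaggle
cp /path/to/kaggle.json ~/.kaggle/
chmod 600 ~/.kaggle/kaggle.json
```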

requirements.txt (new file, 19 lines)

@@ -0,0 +1,19 @@
certifi==2020.12.5
chardet==4.0.0
idna==2.10
joblib==1.0.1
kaggle==1.5.12
numpy==1.20.1
pandas==1.2.3
python-dateutil==2.8.1
python-slugify==4.0.1
pytz==2021.1
requests==2.25.1
scikit-learn==0.24.1
scipy==1.6.1
six==1.15.0
sklearn==0.0
text-unidecode==1.3
threadpoolctl==2.1.0
tqdm==4.59.0
urllib3==1.26.4

script.py (new file, 52 lines)

@@ -0,0 +1,52 @@
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import kaggle

# Authenticate with the Kaggle API and download the dataset,
# unzipping it into the current directory.
kaggle.api.authenticate()
kaggle.api.dataset_download_files('ruchi798/movies-on-netflix-prime-video-hulu-and-disney', path='.', unzip=True)
# Read the data.
film_data = pd.read_csv('MoviesOnStreamingPlatforms_updated.csv')

# Drop rows with missing values.
film_data.dropna(inplace=True)
# Drop the unneeded leading columns (drop() returns a new frame,
# so the result has to be assigned back for the call to take effect).
film_data = film_data.drop(film_data.columns[[0, 1]], axis=1)
# Normalization: lowercase the text columns, scale the float columns to [0, 1],
# and sort the comma-separated values inside each cell.
for col_name in ['Title', 'Directors', 'Genres', 'Country', 'Language']:
    film_data[col_name] = film_data[col_name].str.lower()
for col_name in ['Directors', 'Genres', 'Country', 'Language']:
    film_data[col_name] = film_data[col_name].str.split(',').map(lambda x: ','.join(sorted(x)))
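# MinMaxScaler rescales each column independently via
# x' = (x - x.min()) / (x.max() - x.min()),
# mapping the IMDb scores and runtimes onto [0, 1].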
scaler = preprocessing.MinMaxScaler()
film_data[['IMDb', 'Runtime']] = scaler.fit_transform(film_data[['IMDb', 'Runtime']])
# Split the dataset into train, dev and test sets with an 8:1:1 ratio.
train_ratio = 0.8
validation_ratio = 0.1
test_ratio = 0.1
film_train, film_test = train_test_split(film_data, test_size=1 - train_ratio)
film_valid, film_test = train_test_split(film_test, test_size=test_ratio / (test_ratio + validation_ratio))
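# The first call holds out 20% of the rows; the second splits that holdout in
# half, since test_ratio / (test_ratio + validation_ratio) = 0.5, giving the
# 8:1:1 proportion. Passing a fixed random_state to both calls would make the
# split reproducible (an optional tweak, not part of the original script).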
# Statistics for the full dataset and each subset.
subsets = [("Full dataset", film_data), ("Training set", film_train),
           ("Validation set", film_valid), ("Test set", film_test)]
for label, data_set in subsets:
    print(label)
    print(len(data_set))
    print(data_set.describe().loc[['count', 'mean', 'max', 'min', 'std', '50%']])
    for name in data_set:
        print(data_set[name].value_counts())