Add script for lab2
This commit is contained in:
commit
d8b9b6f6db
152
.gitignore
vendored
Normal file
152
.gitignore
vendored
Normal file
@ -0,0 +1,152 @@
|
|||||||
|
|
||||||
|
# Created by https://www.toptal.com/developers/gitignore/api/python
|
||||||
|
# Edit at https://www.toptal.com/developers/gitignore?templates=python
|
||||||
|
|
||||||
|
### Python ###
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
pip-wheel-metadata/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
pytestdebug.log
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
doc/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
.python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
#poetry.lock
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
# .env
|
||||||
|
.env/
|
||||||
|
.venv/
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
pythonenv*
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# operating system-related files
|
||||||
|
*.DS_Store #file properties cache/storage on macOS
|
||||||
|
Thumbs.db #thumbnail cache on Windows
|
||||||
|
|
||||||
|
# profiling data
|
||||||
|
.prof
|
||||||
|
|
||||||
|
env
|
||||||
|
# End of https://www.toptal.com/developers/gitignore/api/python
|
7
README
Normal file
7
README
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
# Instalacja skryptu:
|
||||||
|
```
|
||||||
|
python3 -m venv env
|
||||||
|
source ./env/bin/activate
|
||||||
|
pip install -r requirements.txt
|
||||||
|
python3 script.py
|
||||||
|
```
|
19
requirements.txt
Normal file
19
requirements.txt
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
certifi==2020.12.5
|
||||||
|
chardet==4.0.0
|
||||||
|
idna==2.10
|
||||||
|
joblib==1.0.1
|
||||||
|
kaggle==1.5.12
|
||||||
|
numpy==1.20.1
|
||||||
|
pandas==1.2.3
|
||||||
|
python-dateutil==2.8.1
|
||||||
|
python-slugify==4.0.1
|
||||||
|
pytz==2021.1
|
||||||
|
requests==2.25.1
|
||||||
|
scikit-learn==0.24.1
|
||||||
|
scipy==1.6.1
|
||||||
|
six==1.15.0
|
||||||
|
sklearn==0.0
|
||||||
|
text-unidecode==1.3
|
||||||
|
threadpoolctl==2.1.0
|
||||||
|
tqdm==4.59.0
|
||||||
|
urllib3==1.26.4
|
52
script.py
Normal file
52
script.py
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import kaggle

# Download the movies dataset from Kaggle.
# Requires valid credentials in ~/.kaggle/kaggle.json.
kaggle.api.authenticate()
kaggle.api.dataset_download_files(
    'ruchi798/movies-on-netflix-prime-video-hulu-and-disney',
    path='.', unzip=True)

# Read the data.
film_data = pd.read_csv('MoviesOnStreamingPlatforms_updated.csv')

# Drop rows with any missing values.
film_data.dropna(inplace=True)

# Remove the redundant leading columns (positions 0 and 1).
# BUG FIX: the original called drop() without inplace=True and discarded
# the returned frame, so the columns were never actually removed.
film_data.drop(film_data.columns[[0, 1]], axis=1, inplace=True)

# Normalisation: lowercase the text columns, min-max scale the float
# columns to 0..1, and sort comma-separated multi-value cells.
for col_name in ['Title', 'Directors', 'Genres', 'Country', 'Language']:
    film_data[col_name] = film_data[col_name].str.lower()

for col_name in ['Directors', 'Genres', 'Country', 'Language']:
    film_data[col_name] = film_data[col_name].str.split(',').map(
        lambda parts: ','.join(sorted(parts)))

scaler = preprocessing.MinMaxScaler()
film_data[['IMDb', 'Runtime']] = scaler.fit_transform(
    film_data[['IMDb', 'Runtime']])

# Split the dataset into train, dev, test with an 8:1:1 ratio.
train_ratio = 0.8
validation_ratio = 0.1
test_ratio = 0.1

film_train, film_test = train_test_split(
    film_data, test_size=1 - train_ratio)
film_valid, film_test = train_test_split(
    film_test, test_size=test_ratio / (test_ratio + validation_ratio))

# Statistics for the full dataset and each subset.
# (Replaces the original if/elif chain — whose final branch was a bare
# `if` instead of `elif` — with a label list zipped to the datasets, and
# the side-effect list comprehension with a plain loop.)
subset_labels = [
    "Główny zbiór danych",
    "Zbiór trenujący",
    "Zbiór walidujący",
    "Zbiór testowy",
]
for label, data_set in zip(subset_labels,
                           [film_data, film_train, film_valid, film_test]):
    print(label)
    print(len(data_set))
    print(data_set.describe().loc[['count', 'mean', 'max', 'min', 'std', '50%']])
    for column in data_set:
        print(data_set[column].value_counts())
|
||||||
|
|
Loading…
Reference in New Issue
Block a user