final
This commit is contained in:
commit
17e9c634d6
23007
imdb_movies.csv
Normal file
23007
imdb_movies.csv
Normal file
File diff suppressed because one or more lines are too long
27
requirements.txt
Normal file
27
requirements.txt
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
appdirs==1.4.4
|
||||||
|
black==20.8b1
|
||||||
|
certifi==2020.12.5
|
||||||
|
chardet==4.0.0
|
||||||
|
click==7.1.2
|
||||||
|
idna==2.10
|
||||||
|
joblib==1.0.1
|
||||||
|
kaggle==1.5.12
|
||||||
|
mypy-extensions==0.4.3
|
||||||
|
numpy==1.20.1
|
||||||
|
pandas==1.2.3
|
||||||
|
pathspec==0.8.1
|
||||||
|
python-dateutil==2.8.1
|
||||||
|
python-slugify==4.0.1
|
||||||
|
pytz==2021.1
|
||||||
|
regex==2021.3.17
|
||||||
|
requests==2.25.1
|
||||||
|
scikit-learn==0.24.1
|
||||||
|
scipy==1.6.1
|
||||||
|
six==1.15.0
|
||||||
|
text-unidecode==1.3
|
||||||
|
threadpoolctl==2.1.0
|
||||||
|
toml==0.10.2
|
||||||
|
tqdm==4.59.0
|
||||||
|
typed-ast==1.4.2
|
||||||
|
typing-extensions==3.7.4.3
|
||||||
|
urllib3==1.26.4
|
61
script.py
Normal file
61
script.py
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
"""Download the IMDb movies dataset from Kaggle, clean and normalize it,
split it into train/dev/test sets (6:2:2), save them to CSV, and print
basic descriptive statistics for the numeric columns.

Requires a configured Kaggle API token (~/.kaggle/kaggle.json).
"""

import string

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import kaggle

# Authenticate against the Kaggle API and fetch the dataset archive,
# unpacking it into the current directory (produces imdb_movies.csv).
kaggle.api.authenticate()
kaggle.api.dataset_download_files("pustola/9900-imdb-movies", path=".", unzip=True)

movies_data = pd.read_csv("imdb_movies.csv")

# Drop rows with missing values
movies_data.dropna(inplace=True)

# Remove not interesting columns
drop_columns = ["title_id", "certificate", "title", "plot"]
movies_data.drop(labels=drop_columns, axis=1, inplace=True)

# Normalize text data: strip all punctuation in one pass (str.translate)
# and lowercase each string column.
for column_name in ["original_title", "countries", "genres", "director", "cast"]:
    movies_data[column_name] = (
        movies_data[column_name]
        .str.translate(str.maketrans("", "", string.punctuation))
        .str.lower()
    )

# Remove ',' from votes number and change type to int
movies_data["votes_number"] = (movies_data["votes_number"].str.replace(",", "")).astype(
    int
)

# Normalize number values into [0, 1] with min-max scaling.
scaler = preprocessing.MinMaxScaler()
movies_data[["rating", "votes_number", "year"]] = scaler.fit_transform(
    movies_data[["rating", "votes_number", "year"]]
)

# Split set to train/dev/test 6:2:2 ratio and save to .csv file.
# First cut 60/40, then halve the 40% remainder into dev and test.
train, dev = train_test_split(movies_data, train_size=0.6, test_size=0.4, shuffle=True)
dev, test = train_test_split(dev, train_size=0.5, test_size=0.5, shuffle=True)

train.to_csv("train.csv")
dev.to_csv("dev.csv")
test.to_csv("test.csv")

# Get length of given sets
print(f"Test dataset length: {len(test)}")
print(f"Dev dataset length: {len(dev)}")
print(f"Train dataset length: {len(train)}")
print(f"Whole dataset length: {len(movies_data)}, \n")

# Print summary statistics for each numeric column.
for column in ["year", "rating", "runtime", "votes_number"]:
    column_data = movies_data[column]
    print(f"Information on {column}")
    print(f"Min: {column_data.min()}")
    # Fixed label: was misspelled "Mak" in the original.
    print(f"Max: {column_data.max()}")
    print(f"Mean: {column_data.mean()}")
    print(f"Median: {column_data.median()}")
    print(f"Standard deviation: {column_data.std()}, \n")
|
Loading…
Reference in New Issue
Block a user