Add dvc yaml solution lab10
This commit is contained in:
parent
0094ad0815
commit
e34a25e476
3
.gitignore
vendored
3
.gitignore
vendored
@ -1 +1,4 @@
|
|||||||
/imdb_movies.csv
|
/imdb_movies.csv
|
||||||
|
/train.csv
|
||||||
|
/test.csv
|
||||||
|
/results.csv
|
||||||
|
34
dvc.lock
Normal file
34
dvc.lock
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
schema: '2.0'
|
||||||
|
stages:
|
||||||
|
split:
|
||||||
|
cmd: python lab_10_prepare.py
|
||||||
|
deps:
|
||||||
|
- path: imdb_movies.csv
|
||||||
|
md5: cf6471460161d4e0a85271c467845d7c
|
||||||
|
size: 50492646
|
||||||
|
- path: lab_10_prepare.py
|
||||||
|
md5: e0f6e525730ab3d991b5e5777ffa2ae0
|
||||||
|
size: 1324
|
||||||
|
outs:
|
||||||
|
- path: test.csv
|
||||||
|
md5: 9bd42fac150dd8a33d32b6326921d984
|
||||||
|
size: 68005
|
||||||
|
- path: train.csv
|
||||||
|
md5: 7ba1b2b4673781406812f35569cb1ed0
|
||||||
|
size: 204232
|
||||||
|
train:
|
||||||
|
cmd: python3 lab_10_train.py
|
||||||
|
deps:
|
||||||
|
- path: lab_10_train.py
|
||||||
|
md5: 7717f393a6f1c6aea2b145ea1f2f6dd3
|
||||||
|
size: 1285
|
||||||
|
- path: test.csv
|
||||||
|
md5: 9bd42fac150dd8a33d32b6326921d984
|
||||||
|
size: 68005
|
||||||
|
- path: train.csv
|
||||||
|
md5: 7ba1b2b4673781406812f35569cb1ed0
|
||||||
|
size: 204232
|
||||||
|
outs:
|
||||||
|
- path: results.csv
|
||||||
|
md5: a52750b686aaeadd7cf4436cbe6904b5
|
||||||
|
size: 16046
|
17
dvc.yaml
Normal file
17
dvc.yaml
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
# Two-stage pipeline: "split" prepares the dataset splits, "train" fits the
# model on them and writes its predictions.
stages:
  split:
    # Same interpreter name as the train stage, so both stages run under the
    # same Python.
    cmd: python3 lab_10_prepare.py
    deps:
    - imdb_movies.csv
    - lab_10_prepare.py
    outs:
    # The prepare script writes dev.csv next to train.csv/test.csv; declaring
    # it lets DVC cache and restore it together with the other splits.
    - dev.csv
    - test.csv
    - train.csv
  train:
    cmd: python3 lab_10_train.py
    deps:
    - lab_10_train.py
    - test.csv
    - train.csv
    outs:
    - results.csv
|
@ -0,0 +1,50 @@
|
|||||||
|
"""
|
||||||
|
Load the IMDb movies dataset (imdb_movies.csv) and clean it,
|
||||||
|
Split it into train/dev/test
|
||||||
|
and save each split to its own .csv file.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import string
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn import preprocessing
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
# Fixed seed: an unseeded shuffle yields different splits on every run, so
# the files written by this script would change each time it is executed.
RANDOM_STATE = 42

# Columns removed from the dataset before it is split and saved.
DROP_COLUMNS = [
    "title_id",
    "certificate",
    "title",
    "plot",
    "original_title",
    "countries",
    "genres",
    "director",
    "cast",
    "release_date",
]

# Numeric columns that are min-max normalized.
NUMERIC_COLUMNS = ["votes_number", "year", "runtime"]


def load_movies(path):
    """Read the movies CSV at *path* and return the cleaned frame.

    Rows with a missing value in any column are dropped first, then the
    columns in ``DROP_COLUMNS`` are removed, and ``votes_number`` is turned
    from a string with ``,`` separators into an integer column.
    """
    movies = pd.read_csv(path)
    # Drop incomplete rows before dropping columns, so that a missing value
    # in a soon-to-be-removed column still discards the row.
    movies = movies.dropna()
    movies = movies.drop(columns=DROP_COLUMNS)
    votes = movies["votes_number"].str.replace(",", "", regex=False)
    movies["votes_number"] = votes.astype(int)
    return movies


def split_movies(movies):
    """Shuffle and split *movies* 6:2:2 into (train, dev, test) frames."""
    train, rest = train_test_split(
        movies,
        train_size=0.6,
        test_size=0.4,
        shuffle=True,
        random_state=RANDOM_STATE,
    )
    dev, test = train_test_split(
        rest,
        train_size=0.5,
        test_size=0.5,
        shuffle=True,
        random_state=RANDOM_STATE,
    )
    # Independent copies, so the in-place scaling below never writes into a
    # view of the source frame.
    return train.copy(), dev.copy(), test.copy()


def normalize(train, dev, test):
    """Min-max scale ``NUMERIC_COLUMNS`` of the three frames in place.

    The scaler is fit on *train* only and then applied to *dev* and *test*,
    so statistics of the held-out rows do not leak into the training data.
    """
    scaler = preprocessing.MinMaxScaler()
    train[NUMERIC_COLUMNS] = scaler.fit_transform(train[NUMERIC_COLUMNS])
    for held_out in (dev, test):
        held_out[NUMERIC_COLUMNS] = scaler.transform(held_out[NUMERIC_COLUMNS])


def main():
    """Build train.csv, dev.csv and test.csv from imdb_movies.csv."""
    movies = load_movies("imdb_movies.csv")
    train, dev, test = split_movies(movies)
    normalize(train, dev, test)

    # index=False: the shuffled row index is not data; writing it would add
    # an "Unnamed: 0" column when the files are read back.
    train.to_csv("train.csv", index=False)
    dev.to_csv("dev.csv", index=False)
    test.to_csv("test.csv", index=False)


if __name__ == "__main__":
    main()
|
@ -0,0 +1,48 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from sklearn.metrics import mean_absolute_error
|
||||||
|
from tensorflow.keras.callbacks import EarlyStopping
|
||||||
|
from tensorflow.keras.layers import Dense, Dropout
|
||||||
|
from tensorflow.keras.models import Sequential
|
||||||
|
|
||||||
|
def load_split(path):
    """Read one dataset split and return ``(features, ratings)``.

    ``rating`` is the regression target; every other column is a feature.
    A CSV that was saved together with its row index comes back with an
    extra ``Unnamed: 0`` column. That column is dropped here so the row
    number is never fed to the model as a feature. Files saved without an
    index are unaffected.
    """
    frame = pd.read_csv(path)
    stray = [name for name in frame.columns if str(name).startswith("Unnamed")]
    frame = frame.drop(columns=stray)
    return frame.drop("rating", axis=1), frame["rating"]


def build_model():
    """Return the compiled network: Dense(8) -> Dense(3) -> Dense(1).

    Each hidden layer uses ReLU and is followed by 50% dropout; the model is
    compiled with the Adam optimizer and mean squared error loss.
    """
    model = Sequential()
    model.add(Dense(8, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.compile(optimizer="adam", loss="mse")
    return model


def main():
    """Train on train.csv, predict test.csv and write results.csv."""
    X_train, Y_train = load_split("train.csv")
    X_test, Y_test = load_split("test.csv")

    model = build_model()

    # NOTE(review): the test split is also the validation set that drives
    # early stopping, so the error printed at the end is measured on data
    # that influenced training. Consider validating on a separate dev split.
    early_stop = EarlyStopping(
        monitor="val_loss", mode="min", verbose=1, patience=10
    )

    model.fit(
        x=X_train,
        y=Y_train.values,
        validation_data=(X_test, Y_test.values),
        batch_size=128,
        epochs=400,
        callbacks=[early_stop],
    )

    # Predict movie ratings
    predictions = model.predict(X_test)

    pd.DataFrame(predictions).to_csv("results.csv")

    # Compare outputs
    for i, score in enumerate(predictions):
        print(f"Original score: {Y_test.iloc[i]} Predicted score: {score} \n")
        print(f"Difference is : {Y_test.iloc[i] - score}")

    # Evaluate
    print(mean_absolute_error(Y_test, predictions))


if __name__ == "__main__":
    main()
|
@ -11,4 +11,5 @@ gast==0.3.3
|
|||||||
sacred==0.8.2
|
sacred==0.8.2
|
||||||
GitPython==3.1.14
|
GitPython==3.1.14
|
||||||
matplotlib==3.3.4
|
matplotlib==3.3.4
|
||||||
mlflow==1.17.0
|
mlflow==1.17.0
|
||||||
|
dvc==2.3.0
|
||||||
|
Loading…
Reference in New Issue
Block a user