From ed0399be0cc7db7f797d2e5d8a746780032c8e30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Szymon=20Parafin=CC=81ski?=
Date: Mon, 25 Apr 2022 05:02:20 +0200
Subject: [PATCH] add model training and dockerfile for model running

---
 Dockerfile             |   7 ++-
 biblioteka_DL/dllib.py | 130 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 135 insertions(+), 2 deletions(-)
 create mode 100644 biblioteka_DL/dllib.py

diff --git a/Dockerfile b/Dockerfile
index dcc74a3..721e074 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -9,6 +9,9 @@ RUN pip3 install pandas
 RUN pip3 install sklearn
 RUN pip3 install numpy
 
+RUN pip3 install matplotlib
+RUN pip3 install torch
+
 ARG CUTOFF
 ARG KAGGLE_USERNAME
 ARG KAGGLE_KEY
@@ -19,8 +22,8 @@ ENV KAGGLE_KEY=${KAGGLE_KEY}
 
 WORKDIR /app
 
 COPY lab2/download.sh .
-COPY lab2/main.py .
+COPY biblioteka_DL/dllib.py .
 RUN chmod +x ./download.sh
 RUN ./download.sh
-RUN python3 ./main.py
\ No newline at end of file
+#CMD python3 ./dllib.py
diff --git a/biblioteka_DL/dllib.py b/biblioteka_DL/dllib.py
new file mode 100644
index 0000000..fa5ab62
--- /dev/null
+++ b/biblioteka_DL/dllib.py
@@ -0,0 +1,130 @@
+import torch
+import torch.nn as nn
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score
+
+
+def drop_relevant_columns(imbd_data):
+    imbd_data.drop(columns=["Poster_Link"], inplace=True)
+    imbd_data.drop(columns=["Overview"], inplace=True)
+    imbd_data.drop(columns=["Certificate"], inplace=True)
+    return imbd_data
+
+
+def lowercase_columns_names(imbd_data):
+    imbd_data["Series_Title"] = imbd_data["Series_Title"].str.lower()
+    imbd_data["Genre"] = imbd_data["Genre"].str.lower()
+    imbd_data["Director"] = imbd_data["Director"].str.lower()
+    imbd_data["Star1"] = imbd_data["Star1"].str.lower()
+    imbd_data["Star2"] = imbd_data["Star2"].str.lower()
+    imbd_data["Star3"] = imbd_data["Star3"].str.lower()
+    imbd_data["Star4"] = imbd_data["Star4"].str.lower()
+    return imbd_data
+
+
+def data_to_numeric(imbd_data):
+    imbd_data = imbd_data.replace(np.nan, '', regex=True)
+    imbd_data["Gross"] = imbd_data["Gross"].str.replace(',', '')
+    imbd_data["Gross"] = pd.to_numeric(imbd_data["Gross"], errors='coerce')
+    imbd_data["Runtime"] = imbd_data["Runtime"].str.replace(' min', '')
+    imbd_data["Runtime"] = pd.to_numeric(imbd_data["Runtime"], errors='coerce')
+    imbd_data["IMDB_Rating"] = pd.to_numeric(imbd_data["IMDB_Rating"], errors='coerce')
+    imbd_data["Meta_score"] = pd.to_numeric(imbd_data["Meta_score"], errors='coerce')
+    imbd_data["Released_Year"] = pd.to_numeric(imbd_data["Released_Year"], errors='coerce')
+    imbd_data = imbd_data.dropna()
+    imbd_data = imbd_data.reset_index()
+    imbd_data.drop(columns=["index"], inplace=True)
+    return imbd_data
+
+
+def create_train_dev_test(imbd_data):
+    data_train, data_test = train_test_split(imbd_data, test_size=230, random_state=1, shuffle=True)
+    data_test, data_dev = train_test_split(data_test, test_size=115, random_state=1, shuffle=True)
+    data_test.to_csv("data_test.csv", encoding="utf-8", index=False)
+    data_dev.to_csv("data_dev.csv", encoding="utf-8", index=False)
+    data_train.to_csv("data_train.csv", encoding="utf-8", index=False)
+
+
+def normalize_gross(imbd_data):
+    imbd_data[["Gross"]] = imbd_data[["Gross"]] / 10000000
+    return imbd_data
+
+
+def prepare_dataset():
+    df = pd.read_csv('../imdb_top_1000.csv')
+    df = drop_relevant_columns(df)
+    df_lowercase = lowercase_columns_names(df)
+    df = data_to_numeric(df_lowercase)
+    df = normalize_gross(df)
+    return df
+
+
+class LinearRegressionModel(torch.nn.Module):
+
+    def __init__(self):
+        super(LinearRegressionModel, self).__init__()
+        self.linear = torch.nn.Linear(1, 1)  # One in and one out
+
+    def forward(self, x):
+        y_pred = self.linear(x)
+        return y_pred
+
+
+df = prepare_dataset()
+data_train, data_test = train_test_split(df, random_state=1, shuffle=True)
+
+X_train = pd.DataFrame(data_train["Meta_score"], dtype=np.float64)
+X_train = X_train.to_numpy()
+
+y_train = pd.DataFrame(data_train["Gross"], dtype=np.float64)
+y_train = y_train.to_numpy()
+
+X_train = X_train.reshape(-1, 1)
+y_train = y_train.reshape(-1, 1)
+
+X_train = torch.from_numpy(X_train.astype(np.float32)).view(-1, 1)
+y_train = torch.from_numpy(y_train.astype(np.float32)).view(-1, 1)
+
+input_size = 1
+output_size = 1
+
+model = nn.Linear(input_size, output_size)
+
+learning_rate = 0.0001
+l = nn.MSELoss()
+optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
+
+
+num_epochs = 1000
+for epoch in range(num_epochs):
+    # forward feed
+    y_pred = model(X_train.requires_grad_())
+
+    # calculate the loss
+    loss = l(y_pred, y_train)
+
+    # backward propagation: calculate gradients
+    loss.backward()
+
+    # update the weights
+    optimizer.step()
+
+    # clear out the gradients from the last step loss.backward()
+    optimizer.zero_grad()
+
+    if epoch % 100 == 0:
+        print('epoch {}, loss {}'.format(epoch, loss.item()))
+
+predicted = model(X_train).detach().numpy()
+
+pred = pd.DataFrame(predicted)
+pred.to_csv('result.csv')
+
+# plt.scatter(X_train.detach().numpy() , y_train.detach().numpy())
+# plt.plot(X_train.detach().numpy() , predicted , "red")
+# plt.xlabel("Meta_score")
+# plt.ylabel("Gross")
+# plt.show()