diff --git a/Dockerfile b/Dockerfile
index 976cf4c..42993ca 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,11 +8,13 @@ RUN pip3 install pandas
 RUN pip3 install matplotlib
 RUN pip3 install sklearn
 RUN pip3 install kaggle
+RUN pip3 install torch
 
 WORKDIR /ium
 
 COPY ./ium-data.py ./
 COPY ./download.sh ./
+COPY ./biblioteki_ml.py ./
 
 ARG KAGGLE_KEY
 ARG KAGGLE_USERNAME
diff --git a/biblioteki_ml.py b/biblioteki_ml.py
new file mode 100644
index 0000000..2c5fbfb
--- /dev/null
+++ b/biblioteki_ml.py
@@ -0,0 +1,109 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from sklearn.preprocessing import LabelEncoder
+import pandas as pd
+
+
+# Model
+class Model(nn.Module):
+    """Feed-forward classifier with two ReLU hidden layers.
+
+    The default sizes fit this script: two input features and three
+    output classes.
+    """
+
+    def __init__(self, input_features=2, hidden_layer1=60, hidden_layer2=90, output_features=3):
+        super().__init__()
+        self.fc1 = nn.Linear(input_features, hidden_layer1)
+        self.fc2 = nn.Linear(hidden_layer1, hidden_layer2)
+        self.out = nn.Linear(hidden_layer2, output_features)
+
+    def forward(self, x):
+        """Return the unnormalised class scores for input tensor ``x``."""
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.out(x)
+        return x
+
+
+# Load the data
+# .copy() makes each column subset an independent frame, so the column
+# assignments below do not raise pandas' SettingWithCopyWarning.
+train_set = pd.read_csv('d_train.csv', encoding='latin-1')
+train_set = train_set[['Rating', 'Branch', 'Reviewer_Location']].copy()
+
+test_set = pd.read_csv('d_test.csv', encoding='latin-1')
+test_set = test_set[['Rating', 'Branch', 'Reviewer_Location']].copy()
+
+
+# Encode the 'Reviewer_Location' column as integers
+# The encoder is fitted on train and test together so that every location
+# found in either file gets a code.
+le = LabelEncoder()
+le.fit(pd.concat([train_set['Reviewer_Location'], test_set['Reviewer_Location']]))
+train_set['Reviewer_Location'] = le.transform(train_set['Reviewer_Location'])
+test_set['Reviewer_Location'] = le.transform(test_set['Reviewer_Location'])
+
+
+# Encode the 'Branch' column with an explicit mapping
+mappings = {
+    'Disneyland_California': 0,
+    'Disneyland_Paris': 1,
+    'Disneyland_HongKong': 2
+}
+train_set['Branch'] = train_set['Branch'].apply(lambda x: mappings[x])
+test_set['Branch'] = test_set['Branch'].apply(lambda x: mappings[x])
+
+
+# Convert the data to tensors
+X_train = train_set[['Rating', 'Reviewer_Location']].to_numpy()
+X_test = test_set[['Rating', 'Reviewer_Location']].to_numpy()
+y_train = train_set['Branch'].to_numpy()
+y_test = test_set['Branch'].to_numpy()
+
+X_train = torch.FloatTensor(X_train)
+X_test = torch.FloatTensor(X_test)
+y_train = torch.LongTensor(y_train)
+y_test = torch.LongTensor(y_test)
+
+
+# Hyperparameters
+model = Model()
+criterion = nn.CrossEntropyLoss()
+optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
+
+
+# Training
+epochs = 100
+losses = []
+for i in range(epochs):
+    # Call the module itself rather than .forward() so registered hooks run.
+    y_pred = model(X_train)
+    loss = criterion(y_pred, y_train)
+    # Keep a plain float; storing the tensor would hold on to its autograd
+    # history for every epoch.
+    losses.append(loss.item())
+    print(f'epoch: {i:2} loss: {loss.item():10.8f}')
+
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+
+
+# Evaluation
+model.eval()
+with torch.no_grad():
+    # One batched forward pass instead of one call per test row.
+    preds = model(X_test).argmax(dim=1).tolist()
+
+# Plain Python ints keep the CSV independent of how pandas converts tensors.
+df = pd.DataFrame({'Testing Y': y_test.tolist(), 'Predicted Y': preds})
+df['Correct'] = (df['Testing Y'] == df['Predicted Y']).astype(int)
+# The mean of the 0/1 column is a fraction, so scale it to a percentage.
+accuracy = df['Correct'].mean() * 100
+print(f"{accuracy:.2f} percent of predictions correct")
+
+
+# Save to file
+df.to_csv('neural_network_prediction_results.csv', index=False)