"""Train and evaluate a small classifier on a Kaggle mobile-price dataset.

The script downloads the dataset with the ``kaggle`` CLI, merges the
provided train/test CSV files, re-splits them into train/dev/test sets,
trains a two-layer network while logging to MLflow, and writes the test-set
predictions to ``predictions.csv``.
"""

import os
import re
import subprocess
import zipfile

import mlflow
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split


def download_kaggle_dataset(dataset_id, destination_folder):
    """Download a Kaggle dataset archive using the ``kaggle`` CLI.

    Args:
        dataset_id: Kaggle dataset identifier passed to ``-d``.
        destination_folder: Directory passed to ``-p`` as download target.

    Returns:
        ``destination_folder`` joined with the first ``*.zip`` token found in
        the CLI's stdout, or ``None`` if the command fails, the ``kaggle``
        executable is missing, or no ``.zip`` name appears in the output.
    """
    command = [
        "kaggle", "datasets", "download",
        "-d", dataset_id,
        "-p", destination_folder,
    ]
    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True)
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        # FileNotFoundError is raised when the kaggle executable is not on PATH.
        print(f"Error downloading dataset {dataset_id}: {e}")
        return None

    # re.search returns None when nothing matches; calling .group() on it
    # would raise AttributeError, so check before using the match.
    match = re.search(r"(\S+\.zip)", result.stdout)
    if match is None:
        print(f"Error downloading dataset {dataset_id}: "
              "no .zip file name found in kaggle output")
        return None

    print(f"Dataset {dataset_id} successfully downloaded.")
    return os.path.join(destination_folder, match.group(1))


def unzip_file(zip_filepath, destination_folder):
    """Extract every member of a zip archive into ``destination_folder``.

    Errors are reported on stdout rather than raised (best-effort).

    Returns:
        ``True`` if extraction succeeded, ``False`` otherwise.
    """
    try:
        with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
            zip_ref.extractall(destination_folder)
    except Exception as e:  # deliberately broad: report and let the caller decide
        print(f"Error unzipping file {zip_filepath}: {e}")
        return False
    print(f"Files extracted to {destination_folder}.")
    return True


def combine_csv_files(train_file, test_file, output_file):
    """Concatenate two CSV files row-wise and write the result to a new CSV.

    Rows of ``train_file`` come first; the index is renumbered and not written
    to the output. Errors are reported on stdout rather than raised.

    Returns:
        ``True`` if the combined file was written, ``False`` otherwise.
    """
    try:
        train_df = pd.read_csv(train_file)
        test_df = pd.read_csv(test_file)
        combined_df = pd.concat([train_df, test_df], ignore_index=True)
        combined_df.to_csv(output_file, index=False)
    except Exception as e:  # deliberately broad: report and let the caller decide
        print(f"Error combining CSV files: {e}")
        return False
    print(f"Combined CSV files saved to {output_file}.")
    return True


def split_data(data, train_ratio, dev_ratio, random_seed=42):
    """Split ``data`` into train, dev and test subsets.

    Args:
        data: Dataset accepted by ``sklearn.model_selection.train_test_split``.
        train_ratio: Fraction of ``data`` assigned to the training set.
        dev_ratio: Fraction of ``data`` (not of the remainder) assigned to the
            dev set; whatever is left after train and dev becomes the test set.
        random_seed: Seed used for both splits.

    Returns:
        Tuple ``(train_data, dev_data, test_data)``.
    """
    train_data, temp_data = train_test_split(
        data, train_size=train_ratio, random_state=random_seed
    )
    # dev_ratio is relative to the full dataset, so rescale it to the share of
    # the rows that remain after the training rows were taken out.
    dev_data, test_data = train_test_split(
        temp_data,
        train_size=dev_ratio / (1 - train_ratio),
        random_state=random_seed,
    )
    return train_data, dev_data, test_data


class SimpleNN(nn.Module):
    """Two-layer fully connected classifier with a ReLU hidden layer."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        # Not applied in forward(); used by predict_proba() only.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return raw class scores (logits) for a batch ``x``.

        Logits are returned instead of probabilities because
        ``nn.CrossEntropyLoss`` applies log-softmax itself; feeding it
        already-normalized probabilities squashes the gradients. ``argmax``
        over logits yields the same class as ``argmax`` over probabilities.
        """
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

    def predict_proba(self, x):
        """Return class probabilities (softmax over dim 1 of the logits)."""
        return self.softmax(self(x))


def _to_feature_tensor(frame):
    """Return the model inputs of ``frame`` as a float32 tensor.

    The ``price_range`` and ``id`` columns are dropped; every remaining
    column is used as a feature.
    """
    features = frame.drop(['price_range', 'id'], axis=1)
    return torch.tensor(features.values, dtype=torch.float32)


def main():
    """Run the full pipeline: download, prepare, train, evaluate.

    Stops early (after the helper has printed its error) if the dataset cannot
    be downloaded, extracted or combined.

    Raises:
        ValueError: If the training labels contain a value outside
            ``{0, 1, 2, 3}``.
    """
    dataset_id = "iabhishekofficial/mobile-price-classification"
    destination_folder = "/app/data"
    output_dir = "/app/output"
    valid_values = {0.0, 1.0, 2.0, 3.0}
    hidden_size = 50
    epochs = 15

    with mlflow.start_run():
        zip_filepath = download_kaggle_dataset(dataset_id, destination_folder)
        if zip_filepath is None:
            return
        if not unzip_file(zip_filepath, destination_folder):
            return

        train_file = os.path.join(destination_folder, "train.csv")
        test_file = os.path.join(destination_folder, "test.csv")
        output_file = os.path.join(destination_folder, "combined.csv")
        # Without this check a failed merge could go unnoticed and a stale
        # combined.csv from an earlier run would be used for training.
        if not combine_csv_files(train_file, test_file, output_file):
            return

        data = pd.read_csv(output_file)
        train_data, dev_data, test_data = split_data(data, train_ratio=0.6, dev_ratio=0.2)

        os.makedirs(output_dir, exist_ok=True)
        train_data.to_csv(os.path.join(output_dir, 'Train1.csv'), index=False)
        dev_data.to_csv(os.path.join(output_dir, 'Dev1.csv'), index=False)
        test_data.to_csv(os.path.join(output_dir, 'Test1.csv'), index=False)

        print(f"Liczba wierszy w pliku Train1.csv: {len(train_data)}")
        print(f"Liczba wierszy w pliku Dev1.csv: {len(dev_data)}")
        print(f"Liczba wierszy w pliku Test1.csv: {len(test_data)}")

        # Reload the persisted training split and keep only labelled rows.
        train_file_path = os.path.join(output_dir, 'Train1.csv')
        train_data = pd.read_csv(train_file_path)
        train_data = train_data.dropna(subset=['price_range'])

        # Explicit raise instead of assert: asserts are stripped under -O.
        if not set(train_data['price_range'].unique()) <= valid_values:
            raise ValueError("Unexpected values in price_range")

        # The training tensors do not change between epochs, so build them once.
        inputs = _to_feature_tensor(train_data)
        labels = torch.tensor(train_data['price_range'].values, dtype=torch.long)

        input_size = inputs.shape[1]
        output_size = len(valid_values)

        # Log hyper-parameters
        mlflow.log_param("input_size", input_size)
        mlflow.log_param("hidden_size", hidden_size)
        mlflow.log_param("output_size", output_size)

        model = SimpleNN(input_size, hidden_size, output_size)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        # Full-batch training: one optimizer step per epoch.
        for epoch in range(epochs):
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Log metrics
            loss_value = loss.item()
            mlflow.log_metric("loss", loss_value, step=epoch)

            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss_value}")

        save_path = "model.pth"
        torch.save(model.state_dict(), save_path)

        model.load_state_dict(torch.load(save_path))
        model.eval()

        test_file_path = os.path.join(output_dir, 'Test1.csv')
        test_data = pd.read_csv(test_file_path)

        test_inputs = _to_feature_tensor(test_data)
        with torch.no_grad():
            predictions = model(test_inputs)
            predicted_classes = torch.argmax(predictions, dim=1)

        predicted_classes_df = pd.DataFrame(
            predicted_classes.numpy(), columns=['Predicted_Price_Range']
        )
        # NOTE(review): unlike the training split, test rows are not filtered
        # with dropna, so this column may contain NaN — confirm this is intended.
        predicted_classes_df['Actual_Price_Range'] = test_data['price_range'].values

        output_path = 'predictions.csv'
        predicted_classes_df.to_csv(output_path, index=False)


if __name__ == "__main__":
    main()