import os
import re
import subprocess
import zipfile

import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
import mlflow


def download_kaggle_dataset(dataset_id, destination_folder):
    """Download a Kaggle dataset archive with the ``kaggle`` command-line tool.

    Runs ``kaggle datasets download -d <dataset_id> -p <destination_folder>``
    and takes the archive name from the first ``*.zip`` token found in the
    tool's standard output.

    Args:
        dataset_id: Dataset identifier passed to ``kaggle ... -d``.
        destination_folder: Directory passed to ``kaggle ... -p``.

    Returns:
        ``destination_folder`` joined with the archive name, or ``None`` when
        the tool is missing, exits with a non-zero status, or prints no
        ``.zip`` name.
    """
    command = [
        "kaggle", "datasets", "download",
        "-d", dataset_id,
        "-p", destination_folder,
    ]
    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True)
    except FileNotFoundError as e:
        # The kaggle executable is not on PATH; report it like any other
        # download failure instead of crashing with a traceback.
        print(f"Error downloading dataset {dataset_id}: {e}")
        return None
    except subprocess.CalledProcessError as e:
        print(f"Error downloading dataset {dataset_id}: {e}")
        # stderr was captured, so it would otherwise never reach the user.
        if e.stderr:
            print(e.stderr.strip())
        return None

    match = re.search(r"(\S+\.zip)", result.stdout)
    if match is None:
        # Without this check a missing match raised AttributeError on None.
        print(
            f"Error downloading dataset {dataset_id}: "
            "no .zip archive name found in kaggle output"
        )
        return None

    zip_filename = match.group(1)
    print(f"Dataset {dataset_id} successfully downloaded.")
    return os.path.join(destination_folder, zip_filename)


def unzip_file(zip_filepath, destination_folder):
    """Extract every member of ``zip_filepath`` into ``destination_folder``.

    Best-effort: any failure is printed and swallowed, nothing is raised.
    """
    try:
        with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
            zip_ref.extractall(destination_folder)
        print(f"Files extracted to {destination_folder}.")
    except Exception as e:
        print(f"Error unzipping file {zip_filepath}: {e}")


def combine_csv_files(train_file, test_file, output_file):
    """Concatenate two CSV files row-wise and write the result to a new CSV.

    The rows of ``train_file`` come first, followed by the rows of
    ``test_file``; the index is renumbered and not written to the output.
    Best-effort: any failure is printed and swallowed, nothing is raised.
    """
    try:
        train_df = pd.read_csv(train_file)
        test_df = pd.read_csv(test_file)
        combined_df = pd.concat([train_df, test_df], ignore_index=True)
        combined_df.to_csv(output_file, index=False)
        print(f"Combined CSV files saved to {output_file}.")
    except Exception as e:
        print(f"Error combining CSV files: {e}")


def split_data(data, train_ratio, dev_ratio, random_seed=42):
    """Split ``data`` into train, dev and test parts.

    Args:
        data: Dataset accepted by ``train_test_split``.
        train_ratio: Fraction of the whole dataset used for training.
        dev_ratio: Fraction of the whole dataset used for the dev part.
        random_seed: Seed passed to both splits.

    Returns:
        ``(train_data, dev_data, test_data)``; the test part is whatever
        remains after the train and dev parts are taken.
    """
    train_data, temp_data = train_test_split(
        data, train_size=train_ratio, random_state=random_seed
    )
    # dev_ratio refers to the whole dataset, so rescale it to the remainder.
    dev_data, test_data = train_test_split(
        temp_data,
        train_size=dev_ratio / (1 - train_ratio),
        random_state=random_seed,
    )
    return train_data, dev_data, test_data


class SimpleNN(nn.Module):
    """Two-layer fully connected classifier that outputs raw class logits."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        # Not applied in forward(); kept so callers can still obtain
        # probabilities via ``model.softmax(model(x))``. It has no parameters,
        # so the state_dict layout is unchanged.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return unnormalized logits for a batch ``x``.

        nn.CrossEntropyLoss applies log-softmax itself, so returning softmax
        output here would normalize twice and flatten the gradients.
        """
        x = torch.relu(self.fc1(x))
        return self.fc2(x)


def _to_feature_tensor(frame):
    """Return ``frame`` without 'price_range' and 'id' as a float32 tensor."""
    features = frame.drop(['price_range', 'id'], axis=1)
    return torch.tensor(features.values, dtype=torch.float32)


def main():
    """Download the data, split it, train the classifier and save predictions.

    Parameters and the per-epoch loss are logged to an MLflow run. The trained
    weights are written to ``model.pth`` and the test-set predictions to
    ``predictions.csv``.
    """
    with mlflow.start_run():
        dataset_id = "iabhishekofficial/mobile-price-classification"
        destination_folder = "/app/data"

        zip_filepath = download_kaggle_dataset(dataset_id, destination_folder)
        if zip_filepath is None:
            # The failure was already reported; stop rather than train on
            # missing or stale files.
            return
        unzip_file(zip_filepath, destination_folder)

        train_file = os.path.join(destination_folder, "train.csv")
        test_file = os.path.join(destination_folder, "test.csv")
        output_file = os.path.join(destination_folder, "combined.csv")
        combine_csv_files(train_file, test_file, output_file)

        data = pd.read_csv(output_file)
        train_data, dev_data, test_data = split_data(
            data, train_ratio=0.6, dev_ratio=0.2
        )

        output_dir = "/app/output"
        os.makedirs(output_dir, exist_ok=True)
        train_data.to_csv(os.path.join(output_dir, 'Train1.csv'), index=False)
        dev_data.to_csv(os.path.join(output_dir, 'Dev1.csv'), index=False)
        test_data.to_csv(os.path.join(output_dir, 'Test1.csv'), index=False)

        print(f"Liczba wierszy w pliku Train1.csv: {len(train_data)}")
        print(f"Liczba wierszy w pliku Dev1.csv: {len(dev_data)}")
        print(f"Liczba wierszy w pliku Test1.csv: {len(test_data)}")

        train_file_path = os.path.join(output_dir, 'Train1.csv')
        train_data = pd.read_csv(train_file_path)
        # Rows without a label cannot be used for supervised training.
        train_data = train_data.dropna(subset=['price_range'])

        valid_values = {0.0, 1.0, 2.0, 3.0}
        # Raised explicitly: an assert would be stripped under ``python -O``.
        if not set(train_data['price_range'].unique()) <= valid_values:
            raise ValueError("Unexpected values in price_range")

        # Every column except 'price_range' and 'id' is a feature.
        input_size = len(train_data.columns) - 2
        hidden_size = 50
        output_size = len(valid_values)

        # Log parameters
        mlflow.log_param("input_size", input_size)
        mlflow.log_param("hidden_size", hidden_size)
        mlflow.log_param("output_size", output_size)

        model = SimpleNN(input_size, hidden_size, output_size)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        # The training tensors do not change between epochs; build them once.
        inputs = _to_feature_tensor(train_data)
        labels = torch.tensor(train_data['price_range'].values, dtype=torch.long)

        epochs = 15
        for epoch in range(epochs):
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Log metrics
            mlflow.log_metric("loss", loss.item(), step=epoch)
            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

        save_path = "model.pth"
        torch.save(model.state_dict(), save_path)

        # Reload from disk so the predictions come from the saved weights.
        model.load_state_dict(torch.load(save_path))
        model.eval()

        test_file_path = os.path.join(output_dir, 'Test1.csv')
        test_data = pd.read_csv(test_file_path)
        inputs = _to_feature_tensor(test_data)

        with torch.no_grad():
            predictions = model(inputs)
            # argmax over logits selects the same class as argmax over softmax.
            predicted_classes = torch.argmax(predictions, dim=1)

        predicted_classes_df = pd.DataFrame(
            predicted_classes.numpy(), columns=['Predicted_Price_Range']
        )
        predicted_classes_df['Actual_Price_Range'] = test_data['price_range'].values

        output_path = 'predictions.csv'
        predicted_classes_df.to_csv(output_path, index=False)


if __name__ == "__main__":
    main()