Upload files to 'MLflow'
This commit is contained in:
parent
638b62a4e1
commit
a1a266d5cd
141
MLflow/full.py
Normal file
141
MLflow/full.py
Normal file
@ -0,0 +1,141 @@
|
||||
import subprocess
|
||||
import zipfile
|
||||
import os
|
||||
import pandas as pd
|
||||
import re
|
||||
from sklearn.model_selection import train_test_split
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
import mlflow
|
||||
|
||||
def download_kaggle_dataset(dataset_id, destination_folder):
    """Download a Kaggle dataset archive using the ``kaggle`` command-line tool.

    Args:
        dataset_id: Kaggle dataset identifier in ``owner/slug`` form.
        destination_folder: Directory handed to the CLI's ``-p`` option.

    Returns:
        The path of the downloaded ``.zip`` archive joined onto
        ``destination_folder``, or ``None`` when the CLI cannot be started
        or exits with a non-zero status.
    """
    command = [
        "kaggle", "datasets", "download",
        "-d", dataset_id,
        "-p", destination_folder,
    ]
    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable ``kaggle`` binary, which
        # would otherwise escape as an unhandled exception.
        print(f"Error downloading dataset {dataset_id}: {e}")
        return None

    match = re.search(r"(\S+\.zip)", result.stdout)
    if match is not None:
        zip_filename = match.group(1)
    else:
        # The CLI output did not mention an archive name; fall back to the
        # dataset slug, which is how the Kaggle CLI names the archive.
        zip_filename = f"{dataset_id.rsplit('/', 1)[-1]}.zip"

    print(f"Dataset {dataset_id} successfully downloaded.")
    return os.path.join(destination_folder, zip_filename)
|
||||
|
||||
def unzip_file(zip_filepath, destination_folder):
    """Extract every member of a zip archive into ``destination_folder``.

    Any failure (missing file, corrupt archive, write error) is reported on
    stdout and swallowed; the function always returns ``None``.
    """
    try:
        archive = zipfile.ZipFile(zip_filepath, mode='r')
        with archive:
            archive.extractall(path=destination_folder)
        print(f"Files extracted to {destination_folder}.")
    except Exception as e:
        # Best-effort: the caller is not told about the failure.
        print(f"Error unzipping file {zip_filepath}: {e}")
|
||||
|
||||
def combine_csv_files(train_file, test_file, output_file):
    """Concatenate two CSV files row-wise and write the result to ``output_file``.

    Rows of ``train_file`` come first, followed by rows of ``test_file``; the
    index is renumbered and not written out. Errors are printed and
    swallowed, so the function always returns ``None``.
    """
    try:
        frames = [pd.read_csv(path) for path in (train_file, test_file)]
        merged = pd.concat(frames, ignore_index=True)
        merged.to_csv(output_file, index=False)
        print(f"Combined CSV files saved to {output_file}.")
    except Exception as e:
        # Best-effort: a failure leaves ``output_file`` unwritten or partial.
        print(f"Error combining CSV files: {e}")
|
||||
|
||||
def split_data(data, train_ratio, dev_ratio, random_seed=42):
    """Split ``data`` into train, dev and test parts.

    Args:
        data: Collection accepted by ``train_test_split``.
        train_ratio: Share of ``data`` that goes to the training part.
        dev_ratio: Share of ``data`` (not of the remainder) that goes to the
            dev part; whatever is left becomes the test part.
        random_seed: Seed used for both shuffles.

    Returns:
        A ``(train, dev, test)`` tuple.
    """
    train_part, remainder = train_test_split(
        data, train_size=train_ratio, random_state=random_seed
    )
    # Re-express dev_ratio relative to what is left after removing the
    # training part, because the second split only sees the remainder.
    dev_share = dev_ratio / (1 - train_ratio)
    dev_part, test_part = train_test_split(
        remainder, train_size=dev_share, random_state=random_seed
    )
    return train_part, dev_part, test_part
|
||||
|
||||
class SimpleNN(nn.Module):
    """Two-layer fully connected classifier (Linear -> ReLU -> Linear).

    ``forward`` returns raw, unnormalized class scores (logits). This is the
    input ``nn.CrossEntropyLoss`` expects, since that loss applies
    log-softmax itself; applying softmax in ``forward`` as well would squash
    the scores twice and weaken the gradients. Use ``predict_proba`` when
    class probabilities are needed. ``torch.argmax`` over logits and over
    probabilities selects the same class.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: Number of input features per sample.
            hidden_size: Width of the hidden layer.
            output_size: Number of classes.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        # Kept as an attribute for compatibility; it holds no parameters, so
        # saved state dicts are unaffected.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class logits for a ``(batch, input_size)`` tensor."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

    def predict_proba(self, x):
        """Return per-class probabilities (each row sums to 1)."""
        return self.softmax(self.forward(x))
|
||||
|
||||
def _to_feature_tensor(frame):
    """Return the model inputs of ``frame`` as a float32 tensor.

    The ``price_range`` and ``id`` columns are dropped; every remaining
    column is used as a feature.
    """
    features = frame.drop(['price_range', 'id'], axis=1)
    return torch.tensor(features.values, dtype=torch.float32)


def main():
    """Run the whole pipeline inside a single MLflow run.

    Steps: download and unpack the Kaggle dataset, merge its two CSV files,
    split the result 60/20/20 into train/dev/test files, train ``SimpleNN``
    on the training split while logging parameters and the per-epoch loss to
    MLflow, save the weights to ``model.pth`` and write test-split
    predictions to ``predictions.csv``.

    Raises:
        RuntimeError: If the dataset archive could not be downloaded.
        ValueError: If the training labels contain a value outside 0-3.
    """
    with mlflow.start_run():
        dataset_id = "iabhishekofficial/mobile-price-classification"
        destination_folder = "/app/data"
        zip_filepath = download_kaggle_dataset(dataset_id, destination_folder)

        if zip_filepath is None:
            # Without the archive nothing below can run; fail with a clear
            # message instead of a NameError on the unbound output path.
            raise RuntimeError(f"Dataset {dataset_id} could not be downloaded.")

        unzip_file(zip_filepath, destination_folder)
        train_file = os.path.join(destination_folder, "train.csv")
        test_file = os.path.join(destination_folder, "test.csv")
        output_file = os.path.join(destination_folder, "combined.csv")
        combine_csv_files(train_file, test_file, output_file)

        data = pd.read_csv(output_file)
        train_data, dev_data, test_data = split_data(data, train_ratio=0.6, dev_ratio=0.2)

        output_dir = "/app/output"
        os.makedirs(output_dir, exist_ok=True)
        train_data.to_csv(os.path.join(output_dir, 'Train1.csv'), index=False)
        dev_data.to_csv(os.path.join(output_dir, 'Dev1.csv'), index=False)
        test_data.to_csv(os.path.join(output_dir, 'Test1.csv'), index=False)

        print(f"Liczba wierszy w pliku Train1.csv: {len(train_data)}")
        print(f"Liczba wierszy w pliku Dev1.csv: {len(dev_data)}")
        print(f"Liczba wierszy w pliku Test1.csv: {len(test_data)}")

        # Train from the file that was just written, keeping only rows that
        # actually carry a label.
        train_file_path = os.path.join(output_dir, 'Train1.csv')
        train_data = pd.read_csv(train_file_path)
        train_data = train_data.dropna(subset=['price_range'])

        valid_values = {0.0, 1.0, 2.0, 3.0}
        # Explicit raise rather than ``assert`` so the check survives ``-O``.
        if not set(train_data['price_range'].unique()) <= valid_values:
            raise ValueError("Unexpected values in price_range")

        # Every column except 'price_range' and 'id' is a feature.
        input_size = len(train_data.columns) - 2
        hidden_size = 50
        output_size = len(valid_values)

        # Log parameters
        mlflow.log_param("input_size", input_size)
        mlflow.log_param("hidden_size", hidden_size)
        mlflow.log_param("output_size", output_size)

        model = SimpleNN(input_size, hidden_size, output_size)

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        # The training tensors never change between epochs, so build them once.
        inputs = _to_feature_tensor(train_data)
        labels = torch.tensor(train_data['price_range'].values, dtype=torch.long)

        epochs = 15
        for epoch in range(epochs):
            # Full-batch training: one optimizer step per epoch.
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            loss_value = loss.item()

            # Log metrics
            mlflow.log_metric("loss", loss_value, step=epoch)

            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss_value}")

        save_path = "model.pth"
        torch.save(model.state_dict(), save_path)

        # Reload the saved weights so evaluation uses exactly what is on disk.
        model.load_state_dict(torch.load(save_path))
        model.eval()

        test_file_path = os.path.join(output_dir, 'Test1.csv')
        test_data = pd.read_csv(test_file_path)

        # NOTE(review): unlike the training split, rows with a missing
        # price_range are not dropped here, so Actual_Price_Range may contain
        # NaN -- confirm this is intended.
        test_inputs = _to_feature_tensor(test_data)
        with torch.no_grad():
            predictions = model(test_inputs)
            predicted_classes = torch.argmax(predictions, dim=1)

        predicted_classes_df = pd.DataFrame(predicted_classes.numpy(), columns=['Predicted_Price_Range'])
        predicted_classes_df['Actual_Price_Range'] = test_data['price_range'].values

        output_path = 'predictions.csv'
        predicted_classes_df.to_csv(output_path, index=False)
|
||||
|
||||
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user