32 lines
1.5 KiB
Plaintext
32 lines
1.5 KiB
Plaintext
import pandas as pd
|
|
import numpy as np
|
|
import os
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
def split_data(data, train_ratio, dev_ratio, random_seed=42):
    """Split ``data`` into train, dev and test subsets.

    The split is done in two passes with ``train_test_split``: the first
    pass separates the training portion from the remainder, the second pass
    divides that remainder into the dev and test portions.

    Args:
        data: Collection to split; passed straight to ``train_test_split``.
        train_ratio: Fraction of ``data`` assigned to the training subset.
            Must be strictly between 0 and 1.
        dev_ratio: Fraction of the *whole* ``data`` assigned to the dev
            subset. Must be positive and smaller than ``1 - train_ratio``
            so that something is left for the test subset.
        random_seed: Value passed as ``random_state`` to both splits.

    Returns:
        A tuple ``(train_data, dev_data, test_data)``.

    Raises:
        ValueError: If ``train_ratio`` is not in the open interval (0, 1) or
            ``dev_ratio`` is not in the open interval (0, 1 - train_ratio).
    """
    # Validate up front: without this check ``train_ratio == 1`` ends in a
    # ZeroDivisionError below, and other bad ratios surface as an error about
    # a derived ``train_size`` value the caller never passed.
    if not 0 < train_ratio < 1:
        raise ValueError(
            f"train_ratio must be strictly between 0 and 1, got {train_ratio!r}"
        )

    # dev_ratio refers to the whole data set, while the second split only sees
    # the remainder, so rescale it to a share of that remainder.
    dev_share = dev_ratio / (1 - train_ratio)
    if not 0 < dev_share < 1:
        raise ValueError(
            "dev_ratio must be positive and smaller than 1 - train_ratio, "
            f"got dev_ratio={dev_ratio!r} with train_ratio={train_ratio!r}"
        )

    train_data, temp_data = train_test_split(
        data, train_size=train_ratio, random_state=random_seed
    )
    dev_data, test_data = train_test_split(
        temp_data, train_size=dev_share, random_state=random_seed
    )
    return train_data, dev_data, test_data
|
|
|
|
def main():
    """Read combined.csv, split it 6:2:2 and write the three subsets to CSV."""
    # The input file and the three output files share one directory.
    base_dir = os.path.join(
        "C:", os.sep, "Users", "reyva", "OneDrive", "Pulpit", "studia",
        "InżynieriaUczeniaMaszynowego",
    )

    # Load the data from the CSV file.
    frame = pd.read_csv(os.path.join(base_dir, "combined.csv"))

    # Split the data into train, dev and test sets in 6:2:2 proportions.
    parts = split_data(frame, train_ratio=0.6, dev_ratio=0.2)
    named_parts = list(zip(("Train1.csv", "Dev1.csv", "Test1.csv"), parts))

    # Save each subset to its own CSV file.
    for file_name, subset in named_parts:
        subset.to_csv(os.path.join(base_dir, file_name), index=False)

    # Print the number of rows written to each file.
    for file_name, subset in named_parts:
        print(f"Liczba wierszy w pliku {file_name}: {len(subset)}")
|
|
|
|
# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|