"""Train a random-forest classifier on a whitespace-delimited table and save predictions.

Reads 'starclusters-global-parameters2.dat' (first line skipped, no header
row), shuffles the rows with a fixed seed, and splits them 80/20 into train
and test partitions. A RandomForestClassifier is fitted on the training
partition, and the test partition is written to 'prediction.dat' with the
predicted label appended as column '8'.
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR
from sklearn import preprocessing
from sklearn import utils

# sep=r'\s+' is the non-deprecated equivalent of delim_whitespace=True.
data = pd.read_csv(
    'starclusters-global-parameters2.dat',
    skiprows=1,
    sep=r'\s+',
    header=None,
)

# Shuffle deterministically, then take the first 80% of rows for training.
shuffled_data = data.sample(frac=1, random_state=42)
n = int(0.8 * len(shuffled_data))
data_train = shuffled_data.iloc[:n]
# Explicit copy: a column is added to data_test below, and assigning into a
# slice of shuffled_data would raise SettingWithCopyWarning.
data_test = shuffled_data.iloc[n:].copy()

# Features are every column except the first and the last; the last column
# is the target. NOTE(review): column 0 is presumably an identifier -- confirm.
X_train = data_train.iloc[:, 1:-1].values
y_train = data_train.iloc[:, -1].values
X_test = data_test.iloc[:, 1:-1].values

# Encode the target values as integer class indices for the classifier.
lab = preprocessing.LabelEncoder()
y_transformed = lab.fit_transform(y_train)

# Seeded like the shuffle above so that repeated runs give the same model.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_transformed)

# The classifier predicts encoded class indices; map them back to the
# original label values so the output file is in the same units as the input.
y_pred = lab.inverse_transform(classifier.predict(X_test))

data_test['8'] = y_pred
data_test.to_csv('prediction.dat', sep=' ', index=False)