from joblib import dump, load
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer

# Training script: fit a shallow decision tree on dictionary-shaped samples,
# save the fitted classifier to disk and print a text rendering of the tree.

# X is a list of dictionaries with samples, Y is a list of samples' results
# (Y[i] is the expected result for X[i]).
X = []
Y = []

# TODO: load training data

# Fail early with a readable message instead of an error raised from deep
# inside scikit-learn when the data was not loaded or is misaligned.
if not X:
    raise ValueError("no training data loaded: X is empty")
if len(X) != len(Y):
    raise ValueError(
        f"X and Y must have the same length (got {len(X)} and {len(Y)})"
    )

# vec transforms X (a list of dictionaries of string-string pairs) to binary
# arrays for the tree to work on; sparse=False makes it return a dense array
# directly, so no separate .toarray() conversion is needed.
vec = DictVectorizer(sparse=False)

# create and run Tree Classifier upon provided data
clf = tree.DecisionTreeClassifier(max_depth=3)
clf = clf.fit(vec.fit_transform(X), Y)

# save decision tree to file
# NOTE(review): only the classifier is saved; a consumer of this file would
# presumably also need the fitted `vec` to encode new samples the same way --
# confirm how the model is loaded and used.
dump(clf, 'decision_tree.joblib')

# print a tree (not necessary)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# get_feature_names_out() and fall back to the old name on pre-1.0 releases.
# list(...) because older export_text versions do not accept a numpy array.
_get_names = getattr(vec, "get_feature_names_out", None) or vec.get_feature_names
print(tree.export_text(clf, feature_names=list(_get_names())))