diff --git a/naive_bayes.ipynb b/naive_bayes.ipynb index c7ff2f0..d4e2472 100644 --- a/naive_bayes.ipynb +++ b/naive_bayes.ipynb @@ -113,8 +113,7 @@ "data = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")\n", "data = NormalizeData(data)\n", "\n", - "\n", - "# Rozdzielenie etykiet i cech\n", + "# Wybranie etykiet\n", "data = data[['gender', 'age', 'bmi','smoking_status','hypertension','heart_disease','avg_glucose_level','stroke']]\n", "data = data[data.gender != 'other']\n", "\n", @@ -122,65 +121,74 @@ "# Dane wejściowe - zbiór danych, wektor etykiet, wektor prawdopodobieństw a priori dla klas.\n", "# Wygenerowanie wektora prawdopodobieństw a priori dla klas.\n", "a_priori_prob = count_a_priori_prob(data)\n", - "labels = separate_labels_from_properties(data.iloc[:,:-1])\n", - "\n", + "features = separate_labels_from_properties(data.iloc[:,:-1])\n", "class NaiveBayes():\n", - " def __init__(self, dataset, a_priori_prob):\n", - " self.dataset = dataset\n", - " self.a_priori_prob = a_priori_prob\n", - " self.a_priori_features = {}\n", " \n", + " def __init__(self, dataset, features, a_priori_prob):\n", + " self.dataset = dataset\n", + " self.features = features\n", + " self.a_priori_class = a_priori_prob\n", + " self.a_priori_features = {}\n", + "\n", " def fit(self):\n", - " # init dict\n", - " for feature in list(set(data.iloc[:,:-1])):\n", - " self.a_priori_features[feature] = {}\n", - " \n", + " # Inicjalizacja pustego słownika dla każdej cechy: age, heart_disease, hypertension itd.\n", + " for subdict in self.features:\n", + " for feature_key in subdict.keys():\n", + " self.a_priori_features[feature_key] = {}\n", " \n", + " # Wyliczenie prawdopodobieństw\n", " for feature in list(set(data.iloc[:,:-1])):\n", " for feature_value in np.unique(self.dataset[feature]):\n", - " # Oblicz ilość występowania danej cechy w zbiorze danych np. 
heart_disease.yes\n", - "\n", - " amount_label_value_yes_class = len(self.dataset.loc[(self.dataset['stroke'] == 'yes') & (self.dataset[feature] == feature_value)])\n", - " amount_label_value_no_class = len(self.dataset.loc[(self.dataset['stroke'] == 'no') & (self.dataset[feature] == feature_value)])\n", - " amount_yes_class = len(self.dataset.loc[(self.dataset['stroke'] == 'yes')])\n", - " amount_no_class = len(self.dataset.loc[(self.dataset['stroke'] == 'no')]) \n", + " \n", + " # Oblicz ilość występowania danej cechy w zbiorze danych np. heart_disease.yes i heart_disease.no itd.\n", + " amount_of_feature_value_for_class_yes = len(self.dataset.loc[(self.dataset['stroke'] == 'yes') & (self.dataset[feature] == feature_value)])\n", + " amount_of_feature_value_for_class_no = len(self.dataset.loc[(self.dataset['stroke'] == 'no') & (self.dataset[feature] == feature_value)])\n", + " amount_class_yes = len(self.dataset.loc[(self.dataset['stroke'] == 'yes')])\n", + " amount_class_no = len(self.dataset.loc[(self.dataset['stroke'] == 'no')]) \n", + " \n", " # Obliczenie P(heart_disease.yes|'stroke'|), P(heart_disease.yes|'no stroke') itd. dla kazdej cechy.\n", - " # Zapisujemy do listy w formacie (cecha.wartość: prob stroke, cecha.wartość: prob no stroke)\n", - " self.a_priori_features[feature][feature_value + '.' + 'yes'] = amount_label_value_yes_class/amount_yes_class\n", - " self.a_priori_features[feature][feature_value + '.' + 'no'] = amount_label_value_no_class/amount_no_class\n", + " # Zapisujemy do słownika w formacie {cecha.wartość: prawdopodobieństwo}\n", + " self.a_priori_features[feature][feature_value + '.' + 'yes'] = amount_of_feature_value_for_class_yes/amount_class_yes \n", + " self.a_priori_features[feature][feature_value + '.' 
+ 'no'] = amount_of_feature_value_for_class_no/amount_class_no \n", " \n", " def count_bayes(self,labels):\n", - " label_probs_return = []\n", - " posteriori_return = []\n", + " input_features_probs = []\n", + " posteriori_class_probs = []\n", " final_probs = {'top_yes': 0.0, 'top_no': 0.0, 'total': 0.0}\n", " \n", - " # self.labels - Wartości etykiet które nas interesują, opcjonalnie podane sa wszystkie.\n", - " # [{'gender': {'female', 'male', 'other'}}, {'age': {'50-59', '40-49', '60-69', '70+', '18-29', '30-39'}}, {'ever_married': {'no', 'yes'}}, {'Residence_type': {'rural', 'urban'}}, {'bmi': {'high', 'mid', 'low'}}, {'smoking_status': {'unknown', 'smokes', 'never_smoked', 'formerly_smoked'}}, {'work_type': {'self_employed', 'private', 'never_worked', 'govt_job'}}, {'hypertension': {'no', 'yes'}}, {'heart_disease': {'no', 'yes'}}]\n", - " # Dla kazdej z klas - 'yes', 'no'\n", + " # labels - Lista słowników z wartościami etykiet na wejściu\n", + " # [ {'gender': {'female', 'male'}},\n", + " # {'age': {'0-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89'}}, \n", + " # {'bmi': {'correct','overweight','obesity_1','obesity_2','extreme'}},\n", + " # {'smoking_status': {'unknown', 'smokes', 'never_smoked', 'formerly_smoked'}},\n", + " # {'avg_glucose_level': {'50-90', '90-130','130-170','170-210','210-250','250-290'}}, \n", + " # {'hypertension': {'no', 'yes'}}, \n", + " # {'heart_disease': {'no', 'yes'}}]\n", + " # Dla kazdej z klas - 'yes', 'no'\n", " for idx, cls in enumerate(list(set(self.dataset['stroke']))):\n", " label_probs = []\n", " for label in labels:\n", " label_name = list(label.keys())[0]\n", " for label_value in label[label_name]:\n", - " # Oblicz ilość występowania danej cechy w zbiorze danych np. heart_disease.yes\n", + " # Pobranie z self.a_priori_features prawdopodobieństwa danej cechy w zbiorze danych np. heart_disease.yes\n", " label_probs.append({str(label_name + \".\" + label_value):(self.a_priori_features[label_name][label_value + '.' 
+ 'yes'], self.a_priori_features[label_name][label_value + '.' + 'no'])})\n", "\n", - " label_probs_return.append(label_probs)\n", + " input_features_probs.append(label_probs)\n", " # Obliczanie licznika wzoru Bayesa (mnozymy wartosci prob cech z prawdop apriori danej klasy):\n", - " top = 1\n", + " numerator = 1\n", " for label_prob in label_probs:\n", - " top *= list(label_prob.values())[0][idx]\n", - " top *= self.a_priori_prob[cls]\n", + " numerator *= list(label_prob.values())[0][idx]\n", + " numerator *= self.a_priori_class[cls]\n", "\n", - " final_probs[cls] = top\n", - " final_probs['total'] += top\n", + " final_probs[cls] = numerator\n", + " final_probs['total'] += numerator\n", " \n", - " posteriori_return.append(final_probs['yes']/final_probs['total'])\n", - " posteriori_return.append(final_probs['no']/final_probs['total'])\n", - " return posteriori_return, label_probs_return\n", + " posteriori_class_probs.append(final_probs['yes']/final_probs['total'])\n", + " posteriori_class_probs.append(final_probs['no']/final_probs['total'])\n", + " return posteriori_class_probs, input_features_probs\n", "\n", - "labels = [{'age': {'70-79'}},{'hypertension': {'yes'}},{'heart_disease': {'yes'}},{'bmi': {'correct'}},{'gender': {'male'}},{'smoking_status': {'smokes'}}]\n", - "naive_bayes = NaiveBayes(data, a_priori_prob)\n", + "\n", + "naive_bayes = NaiveBayes(data,features, a_priori_prob)\n", "naive_bayes.fit()" ] }, @@ -264,30 +272,20 @@ } ], "source": [ - "labels = [[{'age': {'60-69'}},{'hypertension': {'no'}},{'heart_disease': {'yes'}},{'bmi': {'obesity_1'}},{'gender': {'female'}},{'smoking_status': {'smokes'}}],\n", + "queries = [[{'age': {'60-69'}},{'hypertension': {'no'}},{'heart_disease': {'yes'}},{'bmi': {'obesity_1'}},{'gender': {'female'}},{'smoking_status': {'smokes'}}],\n", " [{'age': {'70-79'}},{'hypertension': {'yes'}},{'heart_disease': {'yes'}},{'bmi': {'obesity_2'}},{'gender': {'male'}},{'smoking_status': {'smokes'}}],\n", " [{'age': 
{'70-79'}},{'hypertension': {'yes'}},{'heart_disease': {'yes'}},{'bmi': {'correct'}},{'gender': {'female'}},{'smoking_status': {'never_smoked'}}],\n", " [{'age': {'60-69'}},{'hypertension': {'no'}},{'heart_disease': {'yes'}},{'bmi': {'obesity_2'}},{'avg_glucose_level': {'210-250'}},{'gender': {'female'}},{'smoking_status': {'smokes'}}],\n", " [{'age': {'0-29'}},{'hypertension': {'no'}},{'heart_disease': {'no'}},{'bmi': {'correct'}},{'avg_glucose_level': {'130-170'}},{'gender': {'male'}},{'smoking_status': {'never_smoked'}}],\n", " [{'age': {'80-89'}},{'hypertension': {'yes'}},{'heart_disease': {'yes'}},{'bmi': {'extreme'}},{'avg_glucose_level': {'210-250'}},{'gender': {'male'}},{'smoking_status': {'smokes'}}]\n", - " \n", - " \n", " ]\n", "\n", - "name = 1\n", - "for i in labels:\n", - " posteriori, labels = naive_bayes.count_bayes(i)\n", - " plot_priori(labels,posteriori, str(name))\n", - " name = name + 1" + "png_name = 1\n", + "for i in queries:\n", + " posteriori_class, features_probs = naive_bayes.count_bayes(i)\n", + " plot_priori(features_probs, posteriori_class, str(png_name))\n", + " png_name += 1" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aware-kuwait", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {