This commit is contained in:
s434766 2021-05-31 21:48:50 +02:00
parent 0c39f9e721
commit db32c1a27b

View File

@ -113,8 +113,7 @@
"data = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")\n", "data = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")\n",
"data = NormalizeData(data)\n", "data = NormalizeData(data)\n",
"\n", "\n",
"\n", "# Wybranie etykiet\n",
"# Rozdzielenie etykiet i cech\n",
"data = data[['gender', 'age', 'bmi','smoking_status','hypertension','heart_disease','avg_glucose_level','stroke']]\n", "data = data[['gender', 'age', 'bmi','smoking_status','hypertension','heart_disease','avg_glucose_level','stroke']]\n",
"data = data[data.gender != 'other']\n", "data = data[data.gender != 'other']\n",
"\n", "\n",
@ -122,65 +121,74 @@
"# Dane wejściowe - zbiór danych, wektor etykiet, wektor prawdopodobieństw a priori dla klas.\n", "# Dane wejściowe - zbiór danych, wektor etykiet, wektor prawdopodobieństw a priori dla klas.\n",
"# Wygenerowanie wektora prawdopodobieństw a priori dla klas.\n", "# Wygenerowanie wektora prawdopodobieństw a priori dla klas.\n",
"a_priori_prob = count_a_priori_prob(data)\n", "a_priori_prob = count_a_priori_prob(data)\n",
"labels = separate_labels_from_properties(data.iloc[:,:-1])\n", "features = separate_labels_from_properties(data.iloc[:,:-1])\n",
"\n",
"class NaiveBayes():\n", "class NaiveBayes():\n",
" def __init__(self, dataset, a_priori_prob):\n", " \n",
" def __init__(self, dataset, features, a_priori_prob):\n",
" self.dataset = dataset\n", " self.dataset = dataset\n",
" self.a_priori_prob = a_priori_prob\n", " self.features = features\n",
" self.a_priori_class = a_priori_prob\n",
" self.a_priori_features = {}\n", " self.a_priori_features = {}\n",
"\n", "\n",
" def fit(self):\n", " def fit(self):\n",
" # init dict\n", " # Inicjalizacja pustego słownika dla każdej cechy: age, heart_disease, hypertension itd.\n",
" for feature in list(set(data.iloc[:,:-1])):\n", " for subdict in self.features:\n",
" self.a_priori_features[feature] = {}\n", " for feature_key in subdict.keys():\n",
" \n", " self.a_priori_features[feature_key] = {}\n",
" \n", " \n",
" # Wyliczenie prawdopodobieństw\n",
" for feature in list(set(data.iloc[:,:-1])):\n", " for feature in list(set(data.iloc[:,:-1])):\n",
" for feature_value in np.unique(self.dataset[feature]):\n", " for feature_value in np.unique(self.dataset[feature]):\n",
" # Oblicz ilość występowania danej cechy w zbiorze danych np. heart_disease.yes\n",
" \n", " \n",
" amount_label_value_yes_class = len(self.dataset.loc[(self.dataset['stroke'] == 'yes') & (self.dataset[feature] == feature_value)])\n", " # Oblicz ilość występowania danej cechy w zbiorze danych np. heart_disease.yes i heart_disease.no itd.\n",
" amount_label_value_no_class = len(self.dataset.loc[(self.dataset['stroke'] == 'no') & (self.dataset[feature] == feature_value)])\n", " amount_of_feature_value_for_class_yes = len(self.dataset.loc[(self.dataset['stroke'] == 'yes') & (self.dataset[feature] == feature_value)])\n",
" amount_yes_class = len(self.dataset.loc[(self.dataset['stroke'] == 'yes')])\n", " amount_of_feature_value_for_class_no = len(self.dataset.loc[(self.dataset['stroke'] == 'no') & (self.dataset[feature] == feature_value)])\n",
" amount_no_class = len(self.dataset.loc[(self.dataset['stroke'] == 'no')]) \n", " amount_class_yes = len(self.dataset.loc[(self.dataset['stroke'] == 'yes')])\n",
" amount_class_no = len(self.dataset.loc[(self.dataset['stroke'] == 'no')]) \n",
" \n",
" # Obliczenie P(heart_disease.yes|'stroke'), P(heart_disease.yes|'no stroke') itd. dla każdej cechy.\n", " # Obliczenie P(heart_disease.yes|'stroke'), P(heart_disease.yes|'no stroke') itd. dla każdej cechy.\n",
" # Zapisujemy do listy w formacie (cecha.wartość: prob stroke, cecha.wartość: prob no stroke)\n", " # Zapisujemy do słownika w formacie {cecha.wartość: prawdopodobieństwo}\n",
" self.a_priori_features[feature][feature_value + '.' + 'yes'] = amount_label_value_yes_class/amount_yes_class\n", " self.a_priori_features[feature][feature_value + '.' + 'yes'] = amount_of_feature_value_for_class_yes/amount_class_yes \n",
" self.a_priori_features[feature][feature_value + '.' + 'no'] = amount_label_value_no_class/amount_no_class\n", " self.a_priori_features[feature][feature_value + '.' + 'no'] = amount_of_feature_value_for_class_no/amount_class_no \n",
" \n", " \n",
" def count_bayes(self,labels):\n", " def count_bayes(self,labels):\n",
" label_probs_return = []\n", " input_features_probs = []\n",
" posteriori_return = []\n", " posteriori_class_probs = []\n",
" final_probs = {'top_yes': 0.0, 'top_no': 0.0, 'total': 0.0}\n", " final_probs = {'top_yes': 0.0, 'top_no': 0.0, 'total': 0.0}\n",
" \n", " \n",
" # self.labels - Wartości etykiet które nas interesują, opcjonalnie podane sa wszystkie.\n", " # labels - Lista słowników z wartościami etykiet na wejściu\n",
" # [{'gender': {'female', 'male', 'other'}}, {'age': {'50-59', '40-49', '60-69', '70+', '18-29', '30-39'}}, {'ever_married': {'no', 'yes'}}, {'Residence_type': {'rural', 'urban'}}, {'bmi': {'high', 'mid', 'low'}}, {'smoking_status': {'unknown', 'smokes', 'never_smoked', 'formerly_smoked'}}, {'work_type': {'self_employed', 'private', 'never_worked', 'govt_job'}}, {'hypertension': {'no', 'yes'}}, {'heart_disease': {'no', 'yes'}}]\n", " # [ {'gender': {'female', 'male'}},\n",
" # {'age': {'0-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89'}}, \n",
" # {'bmi': {'correct','overweight','obesity_1','obesity_2','extreme'}},\n",
" # {'smoking_status': {'unknown', 'smokes', 'never_smoked', 'formerly_smoked'}},\n",
" # {'avg_glucose_level': {'50-90', '90-130','130-170','170-210','210-250','250-290'}}, \n",
" # {'hypertension': {'no', 'yes'}}, \n",
" # {'heart_disease': {'no', 'yes'}}]\n",
" # Dla kazdej z klas - 'yes', 'no'\n", " # Dla kazdej z klas - 'yes', 'no'\n",
" for idx, cls in enumerate(list(set(self.dataset['stroke']))):\n", " for idx, cls in enumerate(list(set(self.dataset['stroke']))):\n",
" label_probs = []\n", " label_probs = []\n",
" for label in labels:\n", " for label in labels:\n",
" label_name = list(label.keys())[0]\n", " label_name = list(label.keys())[0]\n",
" for label_value in label[label_name]:\n", " for label_value in label[label_name]:\n",
" # Oblicz ilość występowania danej cechy w zbiorze danych np. heart_disease.yes\n", " # Pobranie z self.a_priori_features prawdopodobieństwa danej cechy w zbiorze danych np. heart_disease.yes\n",
" label_probs.append({str(label_name + \".\" + label_value):(self.a_priori_features[label_name][label_value + '.' + 'yes'], self.a_priori_features[label_name][label_value + '.' + 'no'])})\n", " label_probs.append({str(label_name + \".\" + label_value):(self.a_priori_features[label_name][label_value + '.' + 'yes'], self.a_priori_features[label_name][label_value + '.' + 'no'])})\n",
"\n", "\n",
" label_probs_return.append(label_probs)\n", " input_features_probs.append(label_probs)\n",
" # Obliczanie licznika wzoru Bayesa (mnozymy wartosci prob cech z prawdop apriori danej klasy):\n", " # Obliczanie licznika wzoru Bayesa (mnozymy wartosci prob cech z prawdop apriori danej klasy):\n",
" top = 1\n", " numerator = 1\n",
" for label_prob in label_probs:\n", " for label_prob in label_probs:\n",
" top *= list(label_prob.values())[0][idx]\n", " numerator *= list(label_prob.values())[0][idx]\n",
" top *= self.a_priori_prob[cls]\n", " numerator *= self.a_priori_class[cls]\n",
"\n", "\n",
" final_probs[cls] = top\n", " final_probs[cls] = numerator\n",
" final_probs['total'] += top\n", " final_probs['total'] += numerator\n",
" \n", " \n",
" posteriori_return.append(final_probs['yes']/final_probs['total'])\n", " posteriori_class_probs.append(final_probs['yes']/final_probs['total'])\n",
" posteriori_return.append(final_probs['no']/final_probs['total'])\n", " posteriori_class_probs.append(final_probs['no']/final_probs['total'])\n",
" return posteriori_return, label_probs_return\n", " return posteriori_class_probs, input_features_probs\n",
"\n", "\n",
"labels = [{'age': {'70-79'}},{'hypertension': {'yes'}},{'heart_disease': {'yes'}},{'bmi': {'correct'}},{'gender': {'male'}},{'smoking_status': {'smokes'}}]\n", "\n",
"naive_bayes = NaiveBayes(data, a_priori_prob)\n", "naive_bayes = NaiveBayes(data,features, a_priori_prob)\n",
"naive_bayes.fit()" "naive_bayes.fit()"
] ]
}, },
@ -264,30 +272,20 @@
} }
], ],
"source": [ "source": [
"labels = [[{'age': {'60-69'}},{'hypertension': {'no'}},{'heart_disease': {'yes'}},{'bmi': {'obesity_1'}},{'gender': {'female'}},{'smoking_status': {'smokes'}}],\n", "queries = [[{'age': {'60-69'}},{'hypertension': {'no'}},{'heart_disease': {'yes'}},{'bmi': {'obesity_1'}},{'gender': {'female'}},{'smoking_status': {'smokes'}}],\n",
" [{'age': {'70-79'}},{'hypertension': {'yes'}},{'heart_disease': {'yes'}},{'bmi': {'obesity_2'}},{'gender': {'male'}},{'smoking_status': {'smokes'}}],\n", " [{'age': {'70-79'}},{'hypertension': {'yes'}},{'heart_disease': {'yes'}},{'bmi': {'obesity_2'}},{'gender': {'male'}},{'smoking_status': {'smokes'}}],\n",
" [{'age': {'70-79'}},{'hypertension': {'yes'}},{'heart_disease': {'yes'}},{'bmi': {'correct'}},{'gender': {'female'}},{'smoking_status': {'never_smoked'}}],\n", " [{'age': {'70-79'}},{'hypertension': {'yes'}},{'heart_disease': {'yes'}},{'bmi': {'correct'}},{'gender': {'female'}},{'smoking_status': {'never_smoked'}}],\n",
" [{'age': {'60-69'}},{'hypertension': {'no'}},{'heart_disease': {'yes'}},{'bmi': {'obesity_2'}},{'avg_glucose_level': {'210-250'}},{'gender': {'female'}},{'smoking_status': {'smokes'}}],\n", " [{'age': {'60-69'}},{'hypertension': {'no'}},{'heart_disease': {'yes'}},{'bmi': {'obesity_2'}},{'avg_glucose_level': {'210-250'}},{'gender': {'female'}},{'smoking_status': {'smokes'}}],\n",
" [{'age': {'0-29'}},{'hypertension': {'no'}},{'heart_disease': {'no'}},{'bmi': {'correct'}},{'avg_glucose_level': {'130-170'}},{'gender': {'male'}},{'smoking_status': {'never_smoked'}}],\n", " [{'age': {'0-29'}},{'hypertension': {'no'}},{'heart_disease': {'no'}},{'bmi': {'correct'}},{'avg_glucose_level': {'130-170'}},{'gender': {'male'}},{'smoking_status': {'never_smoked'}}],\n",
" [{'age': {'80-89'}},{'hypertension': {'yes'}},{'heart_disease': {'yes'}},{'bmi': {'extreme'}},{'avg_glucose_level': {'210-250'}},{'gender': {'male'}},{'smoking_status': {'smokes'}}]\n", " [{'age': {'80-89'}},{'hypertension': {'yes'}},{'heart_disease': {'yes'}},{'bmi': {'extreme'}},{'avg_glucose_level': {'210-250'}},{'gender': {'male'}},{'smoking_status': {'smokes'}}]\n",
" \n",
" \n",
" ]\n", " ]\n",
"\n", "\n",
"name = 1\n", "png_name = 1\n",
"for i in labels:\n", "for i in queries:\n",
" posteriori, labels = naive_bayes.count_bayes(i)\n", " posteriori_class, features_probs = naive_bayes.count_bayes(i)\n",
" plot_priori(labels,posteriori, str(name))\n", " plot_priori(features_probs, posteriori_class, str(png_name))\n",
" name = name + 1" " png_name += 1"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aware-kuwait",
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {