ns
This commit is contained in:
parent
0c39f9e721
commit
db32c1a27b
@ -113,8 +113,7 @@
|
||||
"data = pd.read_csv(\"healthcare-dataset-stroke-data.csv\")\n",
|
||||
"data = NormalizeData(data)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Rozdzielenie etykiet i cech\n",
|
||||
"# Wybranie etykiet\n",
|
||||
"data = data[['gender', 'age', 'bmi','smoking_status','hypertension','heart_disease','avg_glucose_level','stroke']]\n",
|
||||
"data = data[data.gender != 'other']\n",
|
||||
"\n",
|
||||
@ -122,65 +121,74 @@
|
||||
"# Dane wejściowe - zbiór danych, wektor etykiet, wektor prawdopodobieństw a priori dla klas.\n",
|
||||
"# Wygenerowanie wektora prawdopodobieństw a priori dla klas.\n",
|
||||
"a_priori_prob = count_a_priori_prob(data)\n",
|
||||
"labels = separate_labels_from_properties(data.iloc[:,:-1])\n",
|
||||
"\n",
|
||||
"features = separate_labels_from_properties(data.iloc[:,:-1])\n",
|
||||
"class NaiveBayes():\n",
|
||||
" def __init__(self, dataset, a_priori_prob):\n",
|
||||
" self.dataset = dataset\n",
|
||||
" self.a_priori_prob = a_priori_prob\n",
|
||||
" self.a_priori_features = {}\n",
|
||||
" \n",
|
||||
" def __init__(self, dataset, features, a_priori_prob):\n",
|
||||
" self.dataset = dataset\n",
|
||||
" self.features = features\n",
|
||||
" self.a_priori_class = a_priori_prob\n",
|
||||
" self.a_priori_features = {}\n",
|
||||
"\n",
|
||||
" def fit(self):\n",
|
||||
" # init dict\n",
|
||||
" for feature in list(set(data.iloc[:,:-1])):\n",
|
||||
" self.a_priori_features[feature] = {}\n",
|
||||
" \n",
|
||||
" # Inicjalizacja pustego słownika dla każdej cechy: age, heart_disease, hypertension itd.\n",
|
||||
" for subdict in self.features:\n",
|
||||
" for feature_key in subdict.keys():\n",
|
||||
" self.a_priori_features[feature_key] = {}\n",
|
||||
" \n",
|
||||
" # Wyliczenie prawdopodobieństw\n",
|
||||
" for feature in list(set(data.iloc[:,:-1])):\n",
|
||||
" for feature_value in np.unique(self.dataset[feature]):\n",
|
||||
" # Oblicz ilość występowania danej cechy w zbiorze danych np. heart_disease.yes\n",
|
||||
"\n",
|
||||
" amount_label_value_yes_class = len(self.dataset.loc[(self.dataset['stroke'] == 'yes') & (self.dataset[feature] == feature_value)])\n",
|
||||
" amount_label_value_no_class = len(self.dataset.loc[(self.dataset['stroke'] == 'no') & (self.dataset[feature] == feature_value)])\n",
|
||||
" amount_yes_class = len(self.dataset.loc[(self.dataset['stroke'] == 'yes')])\n",
|
||||
" amount_no_class = len(self.dataset.loc[(self.dataset['stroke'] == 'no')]) \n",
|
||||
" \n",
|
||||
" # Oblicz ilość występowania danej cechy w zbiorze danych np. heart_disease.yes i heart_disease.no itd.\n",
|
||||
" amount_of_feature_value_for_class_yes = len(self.dataset.loc[(self.dataset['stroke'] == 'yes') & (self.dataset[feature] == feature_value)])\n",
|
||||
" amount_of_feature_value_for_class_no = len(self.dataset.loc[(self.dataset['stroke'] == 'no') & (self.dataset[feature] == feature_value)])\n",
|
||||
" amount_class_yes = len(self.dataset.loc[(self.dataset['stroke'] == 'yes')])\n",
|
||||
" amount_class_no = len(self.dataset.loc[(self.dataset['stroke'] == 'no')]) \n",
|
||||
" \n",
|
||||
"                # Obliczenie P(heart_disease.yes|'stroke'), P(heart_disease.yes|'no stroke') itd. dla każdej cechy.\n",
|
||||
" # Zapisujemy do listy w formacie (cecha.wartość: prob stroke, cecha.wartość: prob no stroke)\n",
|
||||
" self.a_priori_features[feature][feature_value + '.' + 'yes'] = amount_label_value_yes_class/amount_yes_class\n",
|
||||
" self.a_priori_features[feature][feature_value + '.' + 'no'] = amount_label_value_no_class/amount_no_class\n",
|
||||
" # Zapisujemy do słownika w formacie {cecha.wartość: prawdopodobieństwo}\n",
|
||||
" self.a_priori_features[feature][feature_value + '.' + 'yes'] = amount_of_feature_value_for_class_yes/amount_class_yes \n",
|
||||
" self.a_priori_features[feature][feature_value + '.' + 'no'] = amount_of_feature_value_for_class_no/amount_class_no \n",
|
||||
" \n",
|
||||
" def count_bayes(self,labels):\n",
|
||||
" label_probs_return = []\n",
|
||||
" posteriori_return = []\n",
|
||||
" input_features_probs = []\n",
|
||||
" posteriori_class_probs = []\n",
|
||||
" final_probs = {'top_yes': 0.0, 'top_no': 0.0, 'total': 0.0}\n",
|
||||
" \n",
|
||||
" # self.labels - Wartości etykiet które nas interesują, opcjonalnie podane sa wszystkie.\n",
|
||||
" # [{'gender': {'female', 'male', 'other'}}, {'age': {'50-59', '40-49', '60-69', '70+', '18-29', '30-39'}}, {'ever_married': {'no', 'yes'}}, {'Residence_type': {'rural', 'urban'}}, {'bmi': {'high', 'mid', 'low'}}, {'smoking_status': {'unknown', 'smokes', 'never_smoked', 'formerly_smoked'}}, {'work_type': {'self_employed', 'private', 'never_worked', 'govt_job'}}, {'hypertension': {'no', 'yes'}}, {'heart_disease': {'no', 'yes'}}]\n",
|
||||
" # Dla kazdej z klas - 'yes', 'no'\n",
|
||||
" # labels - Lista słowników z wartościami etykiet na wejściu\n",
|
||||
" # [ {'gender': {'female', 'male'}},\n",
|
||||
"        #   {'age': {'0-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89'}}, \n",
|
||||
" # {'bmi': {'correct','overweight','obesity_1','obesity_2','extreme'}},\n",
|
||||
" # {'smoking_status': {'unknown', 'smokes', 'never_smoked', 'formerly_smoked'}},\n",
|
||||
"        #   {'avg_glucose_level': {'50-90', '90-130','130-170','170-210','210-250','250-290'}}, \n",
|
||||
" # {'hypertension': {'no', 'yes'}}, \n",
|
||||
" # {'heart_disease': {'no', 'yes'}}]\n",
|
||||
" # Dla kazdej z klas - 'yes', 'no'\n",
|
||||
" for idx, cls in enumerate(list(set(self.dataset['stroke']))):\n",
|
||||
" label_probs = []\n",
|
||||
" for label in labels:\n",
|
||||
" label_name = list(label.keys())[0]\n",
|
||||
" for label_value in label[label_name]:\n",
|
||||
" # Oblicz ilość występowania danej cechy w zbiorze danych np. heart_disease.yes\n",
|
||||
"                    # Pobranie z self.a_priori_features prawdopodobieństwa danej cechy w zbiorze danych np. heart_disease.yes\n",
|
||||
" label_probs.append({str(label_name + \".\" + label_value):(self.a_priori_features[label_name][label_value + '.' + 'yes'], self.a_priori_features[label_name][label_value + '.' + 'no'])})\n",
|
||||
"\n",
|
||||
" label_probs_return.append(label_probs)\n",
|
||||
" input_features_probs.append(label_probs)\n",
|
||||
" # Obliczanie licznika wzoru Bayesa (mnozymy wartosci prob cech z prawdop apriori danej klasy):\n",
|
||||
" top = 1\n",
|
||||
" numerator = 1\n",
|
||||
" for label_prob in label_probs:\n",
|
||||
" top *= list(label_prob.values())[0][idx]\n",
|
||||
" top *= self.a_priori_prob[cls]\n",
|
||||
" numerator *= list(label_prob.values())[0][idx]\n",
|
||||
" numerator *= self.a_priori_class[cls]\n",
|
||||
"\n",
|
||||
" final_probs[cls] = top\n",
|
||||
" final_probs['total'] += top\n",
|
||||
" final_probs[cls] = numerator\n",
|
||||
" final_probs['total'] += numerator\n",
|
||||
" \n",
|
||||
" posteriori_return.append(final_probs['yes']/final_probs['total'])\n",
|
||||
" posteriori_return.append(final_probs['no']/final_probs['total'])\n",
|
||||
" return posteriori_return, label_probs_return\n",
|
||||
" posteriori_class_probs.append(final_probs['yes']/final_probs['total'])\n",
|
||||
" posteriori_class_probs.append(final_probs['no']/final_probs['total'])\n",
|
||||
" return posteriori_class_probs, input_features_probs\n",
|
||||
"\n",
|
||||
"labels = [{'age': {'70-79'}},{'hypertension': {'yes'}},{'heart_disease': {'yes'}},{'bmi': {'correct'}},{'gender': {'male'}},{'smoking_status': {'smokes'}}]\n",
|
||||
"naive_bayes = NaiveBayes(data, a_priori_prob)\n",
|
||||
"\n",
|
||||
"naive_bayes = NaiveBayes(data,features, a_priori_prob)\n",
|
||||
"naive_bayes.fit()"
|
||||
]
|
||||
},
|
||||
@ -264,30 +272,20 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"labels = [[{'age': {'60-69'}},{'hypertension': {'no'}},{'heart_disease': {'yes'}},{'bmi': {'obesity_1'}},{'gender': {'female'}},{'smoking_status': {'smokes'}}],\n",
|
||||
"queries = [[{'age': {'60-69'}},{'hypertension': {'no'}},{'heart_disease': {'yes'}},{'bmi': {'obesity_1'}},{'gender': {'female'}},{'smoking_status': {'smokes'}}],\n",
|
||||
" [{'age': {'70-79'}},{'hypertension': {'yes'}},{'heart_disease': {'yes'}},{'bmi': {'obesity_2'}},{'gender': {'male'}},{'smoking_status': {'smokes'}}],\n",
|
||||
" [{'age': {'70-79'}},{'hypertension': {'yes'}},{'heart_disease': {'yes'}},{'bmi': {'correct'}},{'gender': {'female'}},{'smoking_status': {'never_smoked'}}],\n",
|
||||
" [{'age': {'60-69'}},{'hypertension': {'no'}},{'heart_disease': {'yes'}},{'bmi': {'obesity_2'}},{'avg_glucose_level': {'210-250'}},{'gender': {'female'}},{'smoking_status': {'smokes'}}],\n",
|
||||
" [{'age': {'0-29'}},{'hypertension': {'no'}},{'heart_disease': {'no'}},{'bmi': {'correct'}},{'avg_glucose_level': {'130-170'}},{'gender': {'male'}},{'smoking_status': {'never_smoked'}}],\n",
|
||||
" [{'age': {'80-89'}},{'hypertension': {'yes'}},{'heart_disease': {'yes'}},{'bmi': {'extreme'}},{'avg_glucose_level': {'210-250'}},{'gender': {'male'}},{'smoking_status': {'smokes'}}]\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
"name = 1\n",
|
||||
"for i in labels:\n",
|
||||
" posteriori, labels = naive_bayes.count_bayes(i)\n",
|
||||
" plot_priori(labels,posteriori, str(name))\n",
|
||||
" name = name + 1"
|
||||
"png_name = 1\n",
|
||||
"for i in queries:\n",
|
||||
" posteriori_class, features_probs = naive_bayes.count_bayes(i)\n",
|
||||
" plot_priori(features_probs, posteriori_class, str(png_name))\n",
|
||||
" png_name += 1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "aware-kuwait",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
Loading…
Reference in New Issue
Block a user