Some unimportant code changes

MatOgr 2022-05-17 14:52:06 +02:00
parent 06f06c68f2
commit f30d8b8712
2 changed files with 42 additions and 35 deletions

Changed file 1 of 2:

@@ -20,18 +20,6 @@ nltk.download("stopwords")
 ps = PorterStemmer() # *To perform stemming
-def to_dictionary(stop_words, category):
-    vocab = set()
-    sentences = category
-    for i in sentences:
-        for word in i:
-            word_lower = word.lower()
-            if word_lower not in stop_words and word_lower.isalpha():
-                vocab.add(ps.stem(word_lower))
-    word_dic = Counter(vocab)
-    return word_dic
 # *For tokenizing the words and putting it into the word list
 def return_word_list(stop_words, sentence):
     word_list = []
@@ -42,8 +30,9 @@ def return_word_list(stop_words, sentence):
 # *For finding the conditional probability
-def return_category_probability_dictionary(dict_category_wise_probability,
-                                            word_list, probab, prob_df, pro):
+def return_category_probability_dictionary(
+        dict_category_wise_probability: dict, word_list, probab: int,
+        prob_df: int, pro: int):
     help_dict = {}
     for i, _ in probab.iterrows():
         for word in word_list:
@@ -62,19 +51,20 @@ class NaiveBayes:
         self.features = features

     def fit(self):
         pass # TODO

     def transform(self):
         pass # TODO

     def predict(self):
         pass # TODO

     def evaluate(self, test_data):
         pass # TODO


-def read_data(data_path, prepare_data=False):
+def read_data(data_path: str, prepare_data: bool = False) -> pd.DataFrame:
+    """Read data from given path - if @prepare_data is True, data is also preprocessed and cleaned"""
     if prepare_data:
         data = preprocess_dataset(data_path)
     else:
@@ -82,7 +72,22 @@ def read_data(data_path, prepare_data=False):
     return data["tokens"], data["fraudulent"]


-def build_master_dict(data, classes, stop_words):
+def to_dictionary(stop_words: set, category: int) -> dict:
+    """Create and return a dictionary containing (word: occurrence_count) pairs for words not being stop words"""
+    vocab = set()
+    sentences = category
+    for i in sentences:
+        for word in i:
+            word_lower = word.lower()
+            if word_lower not in stop_words and word_lower.isalpha():
+                vocab.add(ps.stem(word_lower))
+    word_dic = Counter(vocab)
+    return word_dic
+
+
+def build_master_dict(data: pd.DataFrame, classes: list,
+                      stop_words: set) -> dict:
     """Create the master dictionary containing each word's frequency"""
     master_dict = {}
     for category in classes:
@@ -93,7 +98,10 @@ def build_master_dict(data, classes, stop_words):
     return master_dict


-def build_category_probs_dicts(word_frequency_df, categories_to_iterate):
+def build_category_probs_dicts(
+        word_frequency_df: pd.DataFrame,
+        categories_to_iterate: list) -> tuple[dict, dict]:
+    """Create the dictionary holding category-wise sums and word-wise probabilities"""
     category_sum = []
     for category in categories_to_iterate:
         # *Prepared category sum for zip
@@ -113,6 +121,7 @@ def build_category_probs_dicts(word_frequency_df, categories_to_iterate):
 def build_word_probs(word_freqs, categories_to_iterate, dict_category_sum):
+    """Calculate word probability with smoothing application"""
     prob_df = word_freqs
     for category in categories_to_iterate:
         for index, row in prob_df.iterrows():
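For readers following the probability helpers in this file: build_category_probs_dicts and build_word_probs accumulate per-category word counts and then smooth them. A minimal, self-contained sketch of that idea, assuming add-one (Laplace) smoothing; the toy data and the names docs, counts and word_prob below are illustrative, not taken from the repository:

from collections import Counter

# Toy per-category token lists (illustrative only, not from the dataset)
docs = {
    0: ["great", "team", "benefits", "team"],  # e.g. legitimate postings
    1: ["wire", "money", "fee", "money"],      # e.g. fraudulent postings
}

counts = {c: Counter(tokens) for c, tokens in docs.items()}
vocab = set().union(*(counter.keys() for counter in counts.values()))

def word_prob(word: str, category: int, alpha: int = 1) -> float:
    """Laplace-smoothed P(word | category): (count + alpha) / (total + alpha * |V|)."""
    total = sum(counts[category].values())
    return (counts[category][word] + alpha) / (total + alpha * len(vocab))

print(word_prob("money", 1))  # 0.3 -- frequent in the 'fraudulent' toy class
print(word_prob("money", 0))  # 0.1 -- unseen there, but non-zero thanks to smoothing

A classifier would then multiply these per-word probabilities (or sum their logs) over a posting's tokens together with the class prior, which appears to be what return_category_probability_dictionary assembles for each category.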

Changed file 2 of 2:

@@ -27,21 +27,19 @@ def download_data(data_path, dataset_name):
 def preprocess_dataset(data_path):
     data = pd.read_csv(data_path).replace(np.nan, "", regex=True)
-    data["description"] = data["description"].str.replace(r"\W+", " ", regex=True)
-    data["description"] = data["description"].str.replace(r"url_\w+", " ", regex=True)
-    data["description"] = data["description"].str.replace(r"\s+", " ", regex=True)
+    data["description"] = data["description"].str.replace(
+        r"(\W+)|(url_\w+)|(\s+)", " ", regex=True)
+    # data["description"] = data["description"].str.replace(r"url_\w+", " ", regex=True)
+    # data["description"] = data["description"].str.replace(r"\s+", " ", regex=True)
-    data["text"] = data[
-        [
-            "title",
-            "department",
-            "company_profile",
-            "description",
-            "requirements",
-            "benefits",
-        ]
-    ].apply(lambda x: " ".join(x).lower(), axis=1)
-    # data['text'] = data['text'].str.lower()
+    data["text"] = data[[
+        "title",
+        "department",
+        "company_profile",
+        "description",
+        "requirements",
+        "benefits",
+    ]].apply(lambda x: " ".join(x).lower(), axis=1)
     tokenizer = RegexpTokenizer(r"\w+")
     data["tokens"] = data["text"].apply(tokenizer.tokenize)