Some unimportant code changes

MatOgr 2022-05-17 14:52:06 +02:00
parent 06f06c68f2
commit f30d8b8712
2 changed files with 42 additions and 35 deletions

Changed file 1 of 2:

@@ -20,18 +20,6 @@ nltk.download("stopwords")
 ps = PorterStemmer() # *To perform stemming
-def to_dictionary(stop_words, category):
-    vocab = set()
-    sentences = category
-    for i in sentences:
-        for word in i:
-            word_lower = word.lower()
-            if word_lower not in stop_words and word_lower.isalpha():
-                vocab.add(ps.stem(word_lower))
-    word_dic = Counter(vocab)
-    return word_dic
 # *For tokenizing the words and putting it into the word list
 def return_word_list(stop_words, sentence):
     word_list = []
@@ -42,8 +30,9 @@ def return_word_list(stop_words, sentence):
 # *For finding the conditional probability
-def return_category_probability_dictionary(dict_category_wise_probability,
-                                            word_list, probab, prob_df, pro):
+def return_category_probability_dictionary(
+        dict_category_wise_probability: dict, word_list, probab: int,
+        prob_df: int, pro: int):
     help_dict = {}
     for i, _ in probab.iterrows():
         for word in word_list:
@@ -62,19 +51,20 @@ class NaiveBayes:
         self.features = features

     def fit(self):
         pass # TODO

     def transform(self):
         pass # TODO

     def predict(self):
         pass # TODO

     def evaluate(self, test_data):
         pass # TODO


-def read_data(data_path, prepare_data=False):
+def read_data(data_path: str, prepare_data: bool = False) -> pd.DataFrame:
+    """Read data from given path - if @prepare_data is True, data is also preprocessed and cleaned"""
     if prepare_data:
         data = preprocess_dataset(data_path)
     else:
@@ -82,7 +72,22 @@ def read_data(data_path, prepare_data=False):
     return data["tokens"], data["fraudulent"]


-def build_master_dict(data, classes, stop_words):
+def to_dictionary(stop_words: set, category: int) -> dict:
+    """Create and return a dictionary containing (word: occurrence_count) pairs for words not being stop words"""
+    vocab = set()
+    sentences = category
+    for i in sentences:
+        for word in i:
+            word_lower = word.lower()
+            if word_lower not in stop_words and word_lower.isalpha():
+                vocab.add(ps.stem(word_lower))
+    word_dic = Counter(vocab)
+    return word_dic
+
+
+def build_master_dict(data: pd.DataFrame, classes: list,
+                      stop_words: set) -> dict:
     """Create the master dictionary containing each word's frequency"""
     master_dict = {}
     for category in classes:
@@ -93,7 +98,10 @@ def build_master_dict(data, classes, stop_words):
     return master_dict


-def build_category_probs_dicts(word_frequency_df, categories_to_iterate):
+def build_category_probs_dicts(
+        word_frequency_df: pd.DataFrame,
+        categories_to_iterate: list) -> tuple[dict, dict]:
+    """Create the dictionary holding category-wise sums and word-wise probabilities"""
     category_sum = []
     for category in categories_to_iterate:
         # *Prepared category sum for zip
@@ -113,6 +121,7 @@ def build_category_probs_dicts(word_frequency_df, categories_to_iterate):
 def build_word_probs(word_freqs, categories_to_iterate, dict_category_sum):
+    """Calculate word probability with smoothing application"""
     prob_df = word_freqs
     for category in categories_to_iterate:
         for index, row in prob_df.iterrows():
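For readers following the probability helpers in this file: build_category_probs_dicts and build_word_probs accumulate per-category word counts and then smooth them. A minimal, self-contained sketch of that idea, assuming add-one (Laplace) smoothing; the toy data and the names docs, counts and word_prob below are illustrative, not taken from the repository:

from collections import Counter

# Toy per-category token lists (illustrative only, not from the dataset)
docs = {
    0: ["great", "team", "benefits", "team"],  # e.g. legitimate postings
    1: ["wire", "money", "fee", "money"],      # e.g. fraudulent postings
}

counts = {c: Counter(tokens) for c, tokens in docs.items()}
vocab = set().union(*(counter.keys() for counter in counts.values()))

def word_prob(word: str, category: int, alpha: int = 1) -> float:
    """Laplace-smoothed P(word | category): (count + alpha) / (total + alpha * |V|)."""
    total = sum(counts[category].values())
    return (counts[category][word] + alpha) / (total + alpha * len(vocab))

print(word_prob("money", 1))  # 0.3 -- frequent in the 'fraudulent' toy class
print(word_prob("money", 0))  # 0.1 -- unseen there, but non-zero thanks to smoothing

A classifier would then multiply these per-word probabilities (or sum their logs) over a posting's tokens together with the class prior, which appears to be what return_category_probability_dictionary assembles for each category.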

Changed file 2 of 2:

@@ -27,21 +27,19 @@ def download_data(data_path, dataset_name):
 def preprocess_dataset(data_path):
     data = pd.read_csv(data_path).replace(np.nan, "", regex=True)
-    data["description"] = data["description"].str.replace(r"\W+", " ", regex=True)
-    data["description"] = data["description"].str.replace(r"url_\w+", " ", regex=True)
-    data["description"] = data["description"].str.replace(r"\s+", " ", regex=True)
+    data["description"] = data["description"].str.replace(
+        r"(\W+)|(url_\w+)|(\s+)", " ", regex=True)
+    # data["description"] = data["description"].str.replace(r"url_\w+", " ", regex=True)
+    # data["description"] = data["description"].str.replace(r"\s+", " ", regex=True)
-    data["text"] = data[
-        [
-            "title",
-            "department",
-            "company_profile",
-            "description",
-            "requirements",
-            "benefits",
-        ]
-    ].apply(lambda x: " ".join(x).lower(), axis=1)
-    # data['text'] = data['text'].str.lower()
+    data["text"] = data[[
+        "title",
+        "department",
+        "company_profile",
+        "description",
+        "requirements",
+        "benefits",
+    ]].apply(lambda x: " ".join(x).lower(), axis=1)
     tokenizer = RegexpTokenizer(r"\w+")
     data["tokens"] = data["text"].apply(tokenizer.tokenize)