Some unimportant code changes
parent 06f06c68f2
commit f30d8b8712
@@ -20,18 +20,6 @@ nltk.download("stopwords")
 ps = PorterStemmer()  # *To perform stemming
 
 
-def to_dictionary(stop_words, category):
-    vocab = set()
-    sentences = category
-    for i in sentences:
-        for word in i:
-            word_lower = word.lower()
-            if word_lower not in stop_words and word_lower.isalpha():
-                vocab.add(ps.stem(word_lower))
-    word_dic = Counter(vocab)
-    return word_dic
-
-
 # *For tokenizing the words and putting it into the word list
 def return_word_list(stop_words, sentence):
     word_list = []
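For context, the PorterStemmer set up above maps inflected forms to a shared stem. A quick illustration (not part of the commit):

from nltk.stem import PorterStemmer

ps = PorterStemmer()
print(ps.stem("connected"))  # connect
print(ps.stem("running"))    # run
print(ps.stem("stemming"))   # stem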
@@ -42,8 +30,9 @@ def return_word_list(stop_words, sentence):
 
 
 # *For finding the conditional probability
-def return_category_probability_dictionary(dict_category_wise_probability,
-                                           word_list, probab, prob_df, pro):
+def return_category_probability_dictionary(
+        dict_category_wise_probability: dict, word_list, probab: int,
+        prob_df: int, pro: int):
     help_dict = {}
     for i, _ in probab.iterrows():
         for word in word_list:
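Only the head of return_category_probability_dictionary is visible in this hunk, and the new probab/prob_df/pro annotations say int even though probab is iterated with .iterrows(). As a rough sketch of the per-category naive Bayes score the comment describes, assuming (purely for illustration) a {category: {word: P(word | category)}} mapping in place of the DataFrames the real code iterates:

def category_scores(dict_category_wise_probability, word_list, word_probs):
    # word_probs layout is an assumption: {category: {word: P(word | category)}}
    help_dict = {}
    for category, prior in dict_category_wise_probability.items():
        score = prior
        for word in word_list:
            score *= word_probs[category].get(word, 1e-9)  # floor for unseen words
        help_dict[category] = score
    return help_dict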
@@ -62,19 +51,20 @@ class NaiveBayes:
         self.features = features
 
     def fit(self):
-        pass # TODO
+        pass  # TODO
 
     def transform(self):
-        pass # TODO
+        pass  # TODO
 
     def predict(self):
-        pass # TODO
+        pass  # TODO
 
     def evaluate(self, test_data):
-        pass # TODO
+        pass  # TODO
 
 
-def read_data(data_path, prepare_data=False):
+def read_data(data_path: str, prepare_data: bool = False) -> pd.DataFrame:
     """Read data from given path - if @prepare_data is True, data is also preprocessed and cleaned"""
     if prepare_data:
         data = preprocess_dataset(data_path)
     else:
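All four NaiveBayes methods are still stubs in this commit. A minimal sketch of how the fit and predict TODOs could eventually be filled in, assuming token lists as input and Laplace smoothing (one possible implementation, not the author's):

from collections import Counter, defaultdict
import math


class NaiveBayesSketch:
    def fit(self, X, y):
        # X: iterable of token lists, y: iterable of labels (assumed)
        self.class_counts = Counter(y)
        self.word_counts = defaultdict(Counter)
        for tokens, label in zip(X, y):
            self.word_counts[label].update(tokens)
        self.vocab = {w for counts in self.word_counts.values() for w in counts}
        return self

    def predict(self, tokens):
        best_label, best_score = None, -math.inf
        total = sum(self.class_counts.values())
        for label, count in self.class_counts.items():
            score = math.log(count / total)  # log prior
            denom = sum(self.word_counts[label].values()) + len(self.vocab)
            for word in tokens:
                # add-one (Laplace) smoothed log likelihood
                score += math.log((self.word_counts[label][word] + 1) / denom)
            if score > best_score:
                best_label, best_score = label, score
        return best_label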
@@ -82,7 +72,22 @@ def read_data(data_path, prepare_data=False):
     return data["tokens"], data["fraudulent"]
 
 
-def build_master_dict(data, classes, stop_words):
+def to_dictionary(stop_words: set, category: int) -> dict:
+    """Create and return a dictionary containing (word: occurrence_count) pairs for words not being stop words"""
+    vocab = set()
+    sentences = category
+    for i in sentences:
+        for word in i:
+            word_lower = word.lower()
+            if word_lower not in stop_words and word_lower.isalpha():
+                vocab.add(ps.stem(word_lower))
+    word_dic = Counter(vocab)
+    return word_dic
+
+
+def build_master_dict(data: pd.DataFrame, classes: list,
+                      stop_words: set) -> dict:
     """Create the master dictionary containing each word's frequency"""
     master_dict = {}
 
     for category in classes:
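A small usage sketch of the relocated to_dictionary (the token lists are invented). Because vocab is a set, the returned Counter maps every stemmed word to 1; it records vocabulary membership rather than true occurrence counts:

from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))
category = [["earn", "money", "fast"], ["hiring", "to", "earn"]]
print(to_dictionary(stop_words, category))
# e.g. Counter({'earn': 1, 'money': 1, 'fast': 1, 'hire': 1}) - 'to' is a stop word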
@@ -93,7 +98,10 @@ def build_master_dict(data, classes, stop_words):
     return master_dict
 
 
-def build_category_probs_dicts(word_frequency_df, categories_to_iterate):
+def build_category_probs_dicts(
+        word_frequency_df: pd.DataFrame,
+        categories_to_iterate: list) -> tuple[dict, dict]:
     """Create the dictionary holding category-wise sums and word-wise probabilities"""
     category_sum = []
     for category in categories_to_iterate:
         # *Prepared category sum for zip
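The hunk ends before the body, but the docstring's category-wise sums over a word-frequency DataFrame of the assumed shape (one row per word, one column per category) could be computed like this:

import pandas as pd

word_freqs = pd.DataFrame(
    {"fraudulent": [3, 0, 5], "genuine": [10, 7, 1]},  # invented counts
    index=["earn", "benefit", "fast"],
)
dict_category_sum = {c: int(word_freqs[c].sum()) for c in word_freqs.columns}
# {'fraudulent': 8, 'genuine': 18}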
@@ -113,6 +121,7 @@ def build_category_probs_dicts(word_frequency_df, categories_to_iterate):
 
 
 def build_word_probs(word_freqs, categories_to_iterate, dict_category_sum):
     """Calculate word probability with smoothing application"""
     prob_df = word_freqs
     for category in categories_to_iterate:
         for index, row in prob_df.iterrows():
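The docstring of build_word_probs mentions smoothing. Add-one (Laplace) smoothing over the same assumed layout, written vectorised instead of the row-by-row iterrows() loop the diff shows:

# continues the word_freqs / dict_category_sum sketch above
vocab_size = len(word_freqs)
for category in ["fraudulent", "genuine"]:
    denom = dict_category_sum[category] + vocab_size
    word_freqs[category + "_prob"] = (word_freqs[category] + 1) / denom
# e.g. P('earn' | fraudulent) = (3 + 1) / (8 + 3) ≈ 0.364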
@@ -27,21 +27,19 @@ def download_data(data_path, dataset_name):
 def preprocess_dataset(data_path):
     data = pd.read_csv(data_path).replace(np.nan, "", regex=True)
 
-    data["description"] = data["description"].str.replace(r"\W+", " ", regex=True)
-    data["description"] = data["description"].str.replace(r"url_\w+", " ", regex=True)
-    data["description"] = data["description"].str.replace(r"\s+", " ", regex=True)
+    data["description"] = data["description"].str.replace(
+        r"(\W+)|(url_\w+)|(\s+)", " ", regex=True)
+    # data["description"] = data["description"].str.replace(r"url_\w+", " ", regex=True)
+    # data["description"] = data["description"].str.replace(r"\s+", " ", regex=True)
 
-    data["text"] = data[
-        [
-            "title",
-            "department",
-            "company_profile",
-            "description",
-            "requirements",
-            "benefits",
-        ]
-    ].apply(lambda x: " ".join(x).lower(), axis=1)
-    # data['text'] = data['text'].str.lower()
+    data["text"] = data[[
+        "title",
+        "department",
+        "company_profile",
+        "description",
+        "requirements",
+        "benefits",
+    ]].apply(lambda x: " ".join(x).lower(), axis=1)
 
     tokenizer = RegexpTokenizer(r"\w+")
     data["tokens"] = data["text"].apply(tokenizer.tokenize)
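The collapsed pattern handles all three replacements in one pass. A quick check on an invented description string (each match is replaced independently, so a url_ token flanked by spaces leaves a run of spaces behind, which the dropped \s+ pass used to collapse):

import pandas as pd

s = pd.Series(["Great pay!! Visit url_a1b2 for   details."])
print(s.str.replace(r"(\W+)|(url_\w+)|(\s+)", " ", regex=True)[0])
# 'Great pay Visit   for details '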