Added model training script and prediction script

Marcin Armacki 2020-06-12 21:59:30 +02:00
parent 9fd6529e2b
commit 3d150a4b03
3 changed files with 221 additions and 0 deletions

148 backend/webapp/model.py Normal file

@@ -0,0 +1,148 @@
import re
import string

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import LinearSVC  # alternative model, see commented line below
import featuretools as ft  # used only by the commented-out feature engineering block
# Importing data and selecting desired columns
df = pd.read_csv('corp.tsv', sep='\t', encoding='utf-8')
print(df['label'].value_counts())
df['index'] = df.index
df = df[['index', 'body_text', 'label']]
df = df[pd.notnull(df['body_text'])]
# Rows with duplicated text, kept for inspection
duplicateDFRow = df[df.duplicated(['body_text'])]
# Factorizing labels for integer values
df['label_id'] = df['label'].factorize()[0]
label_id_df = df[['label', 'label_id']].drop_duplicates().sort_values('label_id')
label_to_id = dict(label_id_df.values)
id_to_label = dict(label_id_df[['label_id', 'label']].values)
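# e.g. for a two-class corpus (labels illustrative): label_to_id == {'ham': 0, 'spam': 1}
# and id_to_label == {0: 'ham', 1: 'spam'}, depending on first-seen order in the data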
# Sampling data
#from imblearn.over_sampling import RandomOverSampler
#from imblearn.under_sampling import RandomUnderSampler
#def resample(df, method):
# """Resamples df using method with .fit_resample()
#
# Args:
# df (DataFrame): Fraud data
# method (object): Resampler with .fit_resample() method
#    Returns:
# resampled_df (DataFrame): Resampled DataFrame
# """
# target = df.pop('label_id')
#
# processed_x, processed_y = method.fit_resample(df, target)
#
# cols = list(df.columns) + ["label_id"]
#
# pdf_x = pd.DataFrame(processed_x, columns=df.columns)
# pdf_y = pd.DataFrame(processed_y, columns=['label_id'])
# resampled_df = pd.concat([pdf_x, pdf_y], axis=1)
#
# return resampled_df
#RUS = RandomUnderSampler(sampling_strategy={0: 650}, random_state=42)
#rus_resampled = resample(df, RUS)
#df = rus_resampled
# Feature engineering
def count_punct(text):
    # Percentage of non-space characters that are punctuation; guard against empty text
    non_space = len(text) - text.count(" ")
    count = sum(1 for char in text if char in string.punctuation)
    return round(count / non_space, 3) * 100 if non_space else 0.0
df['body_len'] = df['body_text'].apply(lambda x: len(x) - x.count(" "))
df['punct%'] = df['body_text'].apply(count_punct)
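# e.g. count_punct("Hi, there!") ≈ 22.2 (2 punctuation chars / 9 non-space chars)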
#es = ft.EntitySet(id="text_data")
#es = es.entity_from_dataframe(entity_id="data",
# index='index',
# dataframe=df)
#from nlp_primitives import (
# DiversityScore,
# LSA,
# MeanCharactersPerWord,
# PartOfSpeechCount,
# PolarityScore,
# PunctuationCount,
# StopwordCount,
# TitleWordCount,
# UniversalSentenceEncoder,
# UpperCaseCount)
#trans = [DiversityScore,
# MeanCharactersPerWord,
# TitleWordCount,
# LSA,
# PartOfSpeechCount,
# UniversalSentenceEncoder,
# UpperCaseCount]
#feature_matrix, feature_defs = ft.dfs(entityset=es,
# target_entity='data',
# verbose=True,
# trans_primitives=trans,
# max_depth=4)
#feature_matrix.drop(["body_len"], axis=1, inplace=True)
#feature_matrix.drop(["punct%"], axis=1, inplace=True)
# Vectorizing data
def clean_text(text):
    # Lowercase, drop punctuation characters, then tokenize on non-word characters
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    tokens = re.split(r'\W+', text)
    return tokens
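# e.g. clean_text("Hello, world!") == ['hello', 'world']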
#tfidf = TfidfVectorizer(analyzer=clean_text)
# note: ngram_range is ignored when analyzer is a callable such as clean_text
tfidf = TfidfVectorizer(analyzer=clean_text, sublinear_tf=True, min_df=10,
                        max_features=None, norm='l2', encoding='utf-8',
                        ngram_range=(1, 2))
transformed = tfidf.fit_transform(df.body_text)
joblib.dump(tfidf.vocabulary_, 'vocabulary.pkl')
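# Note: the vocabulary alone does not carry the learned IDF weights, so
# predict.py cannot reproduce the exact training-time features from it.
# One option (not used here) would be to persist the whole fitted vectorizer:
#joblib.dump(tfidf, 'tfidf.pkl')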
#features = tfidf.fit_transform(df.body_text).toarray()
features = pd.concat([df[['body_len', 'punct%']].reset_index(drop=True),
pd.DataFrame(transformed.toarray()).reset_index(drop=True)], axis=1)
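# The final feature matrix concatenates the two hand-crafted columns with the
# dense tfidf term matrix; the tfidf columns are named by plain integer indices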
#dataset = pd.concat([features,feature_matrix.reset_index(drop=True)], axis=1, sort=False)
labels = df.label_id
# Teaching model
model = LogisticRegression(solver='lbfgs', max_iter=7000)
#model = LinearSVC(dual=False)
#model = joblib.load('model.pkl')
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    features, labels, df.index, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
joblib.dump(model, 'model.pkl')
joblib.dump(id_to_label, 'labels.pkl')  # predict.py expects this mapping
print("Accuracy of the model: " + str(accuracy_score(y_test, y_pred)*100) + "%")
# Generating confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',
xticklabels=label_id_df.label.values,
yticklabels=label_id_df.label.values)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
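# Optional sanity check with the cross_val_score import above, e.g. 5-fold CV:
#print("CV accuracy: " + str(cross_val_score(model, features, labels, cv=5).mean()))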

73 backend/webapp/predict.py Normal file

@@ -0,0 +1,73 @@
import re
import string

import pandas as pd
from joblib import load
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import featuretools as ft  # used only by the commented-out feature engineering block
id_to_labels = load('labels.pkl')
data = open('testdata.txt', encoding='utf-8').read().splitlines()
df = pd.DataFrame(data, columns=["body_text"])
df['index'] = df.index
df = df[['index', 'body_text']]
model = load('model.pkl')
def count_punct(text):
    # Same helper as in model.py; guard against empty/whitespace-only lines
    non_space = len(text) - text.count(" ")
    count = sum(1 for char in text if char in string.punctuation)
    return round(count / non_space, 3) * 100 if non_space else 0.0
df['body_len'] = df['body_text'].apply(lambda x: len(x) - x.count(" "))
df['punct%'] = df['body_text'].apply(count_punct)
#es = ft.EntitySet(id="text_data")
#es = es.entity_from_dataframe(entity_id="data",
# index='index',
# dataframe=df)
#from nlp_primitives import (
# DiversityScore,
# LSA,
# MeanCharactersPerWord,
# TitleWordCount,
# UpperCaseCount)
#trans = [DiversityScore,
# MeanCharactersPerWord,
# TitleWordCount,
# LSA,
# UpperCaseCount]
#feature_matrix, feature_defs = ft.dfs(entityset=es,
# target_entity='data',
# verbose=True,
# trans_primitives=trans,
# max_depth=4)
#feature_matrix.drop(["body_len"], axis=1, inplace=True)
#feature_matrix.drop(["punct%"], axis=1, inplace=True)
# Vectorizing data
def clean_text(text):
    # Must match the analyzer used when the vocabulary was built in model.py
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    return re.split(r'\W+', text)
transformer = TfidfTransformer()
loaded_vec = CountVectorizer(analyzer=clean_text, decode_error="replace",
                             vocabulary=load('vocabulary.pkl'))
# Note: fitting the transformer here re-estimates IDF weights on the new data,
# so they can differ from the training-time weights (see note in model.py)
transformed = transformer.fit_transform(loaded_vec.fit_transform(df.body_text))
features = pd.concat([df[['body_len', 'punct%']].reset_index(drop=True),
pd.DataFrame(transformed.toarray()).reset_index(drop=True)], axis=1)
#dataset = pd.concat([features,feature_matrix.reset_index(drop=True)], axis=1, sort=False)
pred = model.predict(features)
labels = list(map(id_to_labels.get, pred))
df['label'] = labels
df = df.drop(columns=['body_len', 'punct%'])
df.to_csv('result.csv', encoding='utf-8')
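# Usage note: run model.py first so that model.pkl, vocabulary.pkl and
# labels.pkl exist, then:
#   python predict.py
# testdata.txt is expected to contain one document per line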

Binary file not shown.