# PRI_2020-FE/backend/webapp/model.py

import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn.metrics import confusion_matrix
import string
import re
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support as score
import featuretools as ft
import joblib

# Importing data and selecting desired columns
# corp.tsv is read as a tab-separated, UTF-8 encoded table.
df = pd.read_csv('corp.tsv', sep='\t', encoding='utf-8')
print(df['label'].value_counts())  # class distribution of the raw file

# Keep the original row position as an explicit column.
df['index'] = df.index
columns_titles = ["index", "id", "body_text", "label"]
df = df.reindex(columns=columns_titles)

# Columns actually used further down ('id' is dropped here).
col = ['index', 'body_text', 'label']
# Drop rows without text. The explicit .copy() gives an independent frame,
# so the column assignments made later in this script do not operate on a
# filtered slice (which pandas may flag with SettingWithCopyWarning).
df = df.loc[df['body_text'].notna(), col].copy()

# Rows whose body_text repeats an earlier row; collected only, not removed.
duplicateDFRow = df[df.duplicated(['body_text'])]

# Factorizing labels for integer values
# factorize() returns (integer codes, unique labels); only the codes are kept.
_label_codes, _unique_labels = df['label'].factorize()
df['label_id'] = _label_codes

# One row per distinct (label, label_id) pair, ordered by the integer id.
_label_pairs = df[['label', 'label_id']].drop_duplicates()
label_id_df = _label_pairs.sort_values('label_id')

# Lookup tables in both directions.
label_to_id = dict(label_id_df.values)
id_to_label = dict(label_id_df[['label_id', 'label']].values)

# Sampling data
#from imblearn.over_sampling import RandomOverSampler
#from imblearn.under_sampling import RandomUnderSampler

#def resample(df, method):
#    """Resamples df using method with .fit_resample()
#
#    Args:
#        df (DataFrame): Fraud data
#        method (object): Resampler with .fit_resample() method
#    Returns:
#        resampled_df (DataFrame): Resampled DataFrame
#    """
#    target = df.pop('label_id')
#
#    processed_x, processed_y = method.fit_resample(df, target)
#
#    cols = list(df.columns) + ["label_id"]
#
#    pdf_x = pd.DataFrame(processed_x, columns=df.columns)
#    pdf_y = pd.DataFrame(processed_y, columns=['label_id'])
#    resampled_df = pd.concat([pdf_x, pdf_y], axis=1)
#
#    return resampled_df
#RUS = RandomUnderSampler(sampling_strategy={0: 650}, random_state=42)
#rus_resampled = resample(df, RUS)
#df = rus_resampled

# Feature engineering
def count_punct(text):
    """Return the percentage of non-space characters in *text* that are punctuation.

    Args:
        text (str): Text to inspect.

    Returns:
        float: Number of ``string.punctuation`` characters divided by the
        number of characters that are not a plain space (" "), rounded to
        three decimals and then scaled to percent. ``0.0`` when *text*
        contains no non-space characters.
    """
    non_space_len = len(text) - text.count(" ")
    if non_space_len == 0:
        # Empty or all-space text: nothing to measure, and dividing would
        # raise ZeroDivisionError.
        return 0.0
    count = sum(1 for char in text if char in string.punctuation)
    return round(count / non_space_len, 3) * 100

def _non_space_len(text):
    """Return the number of characters in *text* that are not a plain space."""
    return len(text) - text.count(" ")


# Per-document hand-made features: text length without spaces and the
# punctuation percentage computed by count_punct.
df['body_len'] = df['body_text'].apply(_non_space_len)
df['punct%'] = df['body_text'].apply(count_punct)

#es = ft.EntitySet(id="text_data")
#es = es.entity_from_dataframe(entity_id="data",
#                              index='index',
#                              dataframe=df)

#from nlp_primitives import (
#    DiversityScore,
#    LSA,
#    MeanCharactersPerWord,
#    PartOfSpeechCount,
#    PolarityScore,
#    PunctuationCount,
#    StopwordCount,
#    TitleWordCount,
#    UniversalSentenceEncoder,
#    UpperCaseCount)


#trans = [DiversityScore,
#         MeanCharactersPerWord,
#         TitleWordCount,
#         LSA,
#         PartOfSpeechCount,
#         UniversalSentenceEncoder,
#         UpperCaseCount]

#feature_matrix, feature_defs = ft.dfs(entityset=es,
#                                     target_entity='data',
#                                     verbose=True,
#                                     trans_primitives=trans,
#                                     max_depth=4)

#feature_matrix.drop(["body_len"], axis=1, inplace=True)
#feature_matrix.drop(["punct%"], axis=1, inplace=True)

# Vectorizing data
def clean_text(text):
    """Lower-case *text*, strip punctuation and split it into tokens.

    Args:
        text (str): Raw document text.

    Returns:
        list[str]: The pieces obtained by splitting the cleaned text on runs
        of non-word characters. Empty strings can occur (for empty input or
        leading/trailing whitespace) and are returned as-is.
    """
    # Characters are lower-cased one by one and punctuation is dropped.
    no_punct = "".join(char.lower() for char in text if char not in string.punctuation)
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    return re.split(r'\W+', no_punct)

#tfidf = TfidfVectorizer(analyzer=clean_text)
# NOTE: `analyzer` is a callable, so scikit-learn uses clean_text to extract
# the features directly and does not generate n-grams itself. The former
# ngram_range=(1,2) argument therefore had no effect and is dropped; bigrams
# would have to be produced inside clean_text.
tfidf = TfidfVectorizer(analyzer=clean_text, sublinear_tf=True, min_df=10,
                        max_features=None, norm='l2', encoding='utf-8')
transformed = tfidf.fit_transform(df.body_text)
# Only the learned vocabulary is persisted, not the fitted vectorizer.
joblib.dump(tfidf.vocabulary_, 'vocabulary.pkl')
#features = tfidf.fit_transform(df.body_text).toarray()
# Hand-made features followed by the dense tf-idf matrix; both sides get a
# fresh 0..n-1 index so they are concatenated row by row.
features = pd.concat([df[['body_len', 'punct%']].reset_index(drop=True),
                      pd.DataFrame(transformed.toarray()).reset_index(drop=True)], axis=1)
# The tf-idf columns are named 0, 1, 2, ... (ints) while the hand-made ones
# are strings. Recent scikit-learn releases reject DataFrames whose column
# names have mixed types, so normalise every name to str.
features.columns = features.columns.astype(str)
#dataset = pd.concat([features,feature_matrix.reset_index(drop=True)], axis=1, sort=False)
labels = df.label_id

# Teaching model
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible. The original row labels are split alongside the data.
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    features,
    labels,
    df.index,
    test_size=0.2,
    random_state=42,
)

# Alternatives previously tried here: LinearSVC(dual=False), or reloading a
# saved estimator with joblib.load('model.pkl').
model = LogisticRegression(solver='lbfgs', max_iter=7000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Persist the fitted estimator, then report hold-out accuracy in percent.
joblib.dump(model, 'model.pkl')
_accuracy_pct = accuracy_score(y_test, y_pred) * 100
print("Accuracy of the model: " + str(_accuracy_pct) + "%")

# Generating confusion matrix
# Pin the class order to label_id_df (sorted by label_id) so the matrix rows
# and columns always line up with the tick labels below, even when a class
# happens to be absent from the test split.
conf_mat = confusion_matrix(y_test, y_pred,
                            labels=label_id_df['label_id'].values)
fig, ax = plt.subplots(figsize=(10,10))
# Draw explicitly on the axes created above instead of relying on the
# "current axes" state.
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=label_id_df.label.values,
            yticklabels=label_id_df.label.values,
            ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()