Merge remote-tracking branch 'origin/master' into graph-styling-and-mapping-data
Commit 192c52f587
backend/edumaticParser.py (new file, 50 lines)
@@ -0,0 +1,50 @@
#!/usr/bin/env python3

import argparse
from bs4 import BeautifulSoup
from postmarkup import render_bbcode
import html
import json
import re


# arguments
parser = argparse.ArgumentParser(description='Process some edumatic xml files.')
parser.add_argument('filename', help='xml forum file')
args = parser.parse_args()

# make a soup
with open(args.filename, 'rb') as forum:
    soup = BeautifulSoup(forum, "xml")

# put json together
out = {}
out['id'] = re.search(r'ID: (\d+)', soup.group.title.text).group(1)
out['name'] = soup.group.table.find('string').text
out['discussions'] = []
did = 50
pid = did + 1
# we ignore first table, and then rules
for d in soup.group.find_all('table')[4::2]:
    posts = []
    for p in d.find_all('row'):
        text = html.unescape(p.find_all('string')[1].text)
        paragraphs = [render_bbcode(x) for x in text.splitlines()]
        posts.append({
            'id': pid,
            'parent': pid - 1,
            'author': p.find_all('string')[2].text,
            'message': [x for x in paragraphs if x]
        })
        pid = pid + 1
    out['discussions'].append({
        'id': did,
        'title': d.row.find('string').text,
        'first_post': did + 1,
        'posts': posts
    })
    did = did + 50
    pid = did + 1

with open('parsed.json', 'w', encoding='utf-8') as outfile:
    json.dump(out, outfile, ensure_ascii=False, indent=2)
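For context, a minimal usage sketch of the new parser. The input file name forum.xml is an assumption; parsed.json and the keys it contains come from the script above.

# Hypothetical invocation (run next to an exported edumatic XML file):
#     python3 backend/edumaticParser.py forum.xml
# The script writes parsed.json, which can then be inspected:
import json

with open('parsed.json', encoding='utf-8') as f:
    parsed = json.load(f)

print(parsed['id'], parsed['name'])
for disc in parsed['discussions']:
    print(disc['id'], disc['title'], 'posts:', len(disc['posts']))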
backend/webapp/model.py (new file, 148 lines)
@@ -0,0 +1,148 @@
import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn.metrics import confusion_matrix
import string
import re
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support as score
import featuretools as ft
import joblib

# Importing data and selecting desired columns
df = pd.read_csv('corp.tsv', sep='\t', encoding='utf-8')
print(df['label'].value_counts())
df['index'] = df.index
columns_titles = ["index", "id", "body_text", "label"]
df = df.reindex(columns=columns_titles)
col = ['index', 'body_text', 'label']
df = df[col]
df = df[pd.notnull(df['body_text'])]
df.columns = ['index', 'body_text', 'label']
duplicateDFRow = df[df.duplicated(['body_text'])]

# Factorizing labels for integer values
df['label_id'] = df['label'].factorize()[0]
label_id_df = df[['label', 'label_id']].drop_duplicates().sort_values('label_id')
label_to_id = dict(label_id_df.values)
id_to_label = dict(label_id_df[['label_id', 'label']].values)

# Sampling data
#from imblearn.over_sampling import RandomOverSampler
#from imblearn.under_sampling import RandomUnderSampler

#def resample(df, method):
#    """Resamples df using method with .fit_resample()
#
#    Args:
#        df (DataFrame): Fraud data
#        method (object): Resampler with .fit_resample() method
#
#    Returns:
#        resampled_df (DataFrame): Resampled DataFrame
#    """
#    target = df.pop('label_id')
#
#    processed_x, processed_y = method.fit_resample(df, target)
#
#    cols = list(df.columns) + ["label_id"]
#
#    pdf_x = pd.DataFrame(processed_x, columns=df.columns)
#    pdf_y = pd.DataFrame(processed_y, columns=['label_id'])
#    resampled_df = pd.concat([pdf_x, pdf_y], axis=1)
#
#    return resampled_df

#RUS = RandomUnderSampler(sampling_strategy={0: 650}, random_state=42)
#rus_resampled = resample(df, RUS)
#df = rus_resampled

# Feature engineering
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count / (len(text) - text.count(" ")), 3) * 100

df['body_len'] = df['body_text'].apply(lambda x: len(x) - x.count(" "))
df['punct%'] = df['body_text'].apply(lambda x: count_punct(x))

#es = ft.EntitySet(id="text_data")
#es = es.entity_from_dataframe(entity_id="data",
#                              index='index',
#                              dataframe=df)

#from nlp_primitives import (
#    DiversityScore,
#    LSA,
#    MeanCharactersPerWord,
#    PartOfSpeechCount,
#    PolarityScore,
#    PunctuationCount,
#    StopwordCount,
#    TitleWordCount,
#    UniversalSentenceEncoder,
#    UpperCaseCount)

#trans = [DiversityScore,
#         MeanCharactersPerWord,
#         TitleWordCount,
#         LSA,
#         PartOfSpeechCount,
#         UniversalSentenceEncoder,
#         UpperCaseCount]

#feature_matrix, feature_defs = ft.dfs(entityset=es,
#                                      target_entity='data',
#                                      verbose=True,
#                                      trans_primitives=trans,
#                                      max_depth=4)

#feature_matrix.drop(["body_len"], axis=1, inplace=True)
#feature_matrix.drop(["punct%"], axis=1, inplace=True)

# Vectorizing data
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split(r'\W+', text)
    text = [word for word in tokens]
    return text

#tfidf = TfidfVectorizer(analyzer=clean_text)
tfidf = TfidfVectorizer(analyzer=clean_text, sublinear_tf=True, min_df=10, max_features=None, norm='l2', encoding='utf-8', ngram_range=(1, 2))
transformed = tfidf.fit_transform(df.body_text)
joblib.dump(tfidf.vocabulary_, 'vocabulary.pkl')
#features = tfidf.fit_transform(df.body_text).toarray()
features = pd.concat([df[['body_len', 'punct%']].reset_index(drop=True),
                      pd.DataFrame(transformed.toarray()).reset_index(drop=True)], axis=1)
#dataset = pd.concat([features, feature_matrix.reset_index(drop=True)], axis=1, sort=False)
labels = df.label_id

# Teaching model
model = LogisticRegression(solver='lbfgs', max_iter=7000)
#model = LinearSVC(dual=False)
#model = joblib.load('model.pkl')

X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df.index, test_size=0.2, random_state=42)
#model = joblib.load('model.pkl')

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
joblib.dump(model, 'model.pkl')
print("Accuracy of the model: " + str(accuracy_score(y_test, y_pred) * 100) + "%")

# Generating confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=label_id_df.label.values,
            yticklabels=label_id_df.label.values)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
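cross_val_score is imported in model.py but never called. A minimal sketch of how it could complement the single train/test split above; the 5-fold choice is an assumption, while features and labels are the objects built by the script.

# Sketch only (not part of the commit): 5-fold cross-validation on the same data.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

cv_model = LogisticRegression(solver='lbfgs', max_iter=7000)
cv_scores = cross_val_score(cv_model, features, labels, cv=5)  # cv=5 is an assumed choice
print("CV accuracy: %.3f +/- %.3f" % (cv_scores.mean(), cv_scores.std()))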
backend/webapp/predict.py (new file, 73 lines)
@@ -0,0 +1,73 @@
import pandas as pd
from joblib import load
import string
import re
import featuretools as ft
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

id_to_labels = load('labels.pkl')
data = open('testdata.txt').read().splitlines()
df = pd.DataFrame(data, columns=["body_text"])
df['index'] = df.index
columns_titles = ["index", "body_text"]
df = df.reindex(columns=columns_titles)
col = ['index', 'body_text']
df = df[col]
df.columns = ['index', 'body_text']

model = load('model.pkl')

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count / (len(text) - text.count(" ")), 3) * 100

df['body_len'] = df['body_text'].apply(lambda x: len(x) - x.count(" "))
df['punct%'] = df['body_text'].apply(lambda x: count_punct(x))

#es = ft.EntitySet(id="text_data")
#es = es.entity_from_dataframe(entity_id="data",
#                              index='index',
#                              dataframe=df)
#from nlp_primitives import (
#    DiversityScore,
#    LSA,
#    MeanCharactersPerWord,
#    TitleWordCount,
#    UpperCaseCount)

#trans = [DiversityScore,
#         MeanCharactersPerWord,
#         TitleWordCount,
#         LSA,
#         UpperCaseCount]
#feature_matrix, feature_defs = ft.dfs(entityset=es,
#                                      target_entity='data',
#                                      verbose=True,
#                                      trans_primitives=trans,
#                                      max_depth=4)
#feature_matrix.drop(["body_len"], axis=1, inplace=True)
#feature_matrix.drop(["punct%"], axis=1, inplace=True)


# Vectorizing data
#def clean_text(text):
#    text = "".join([word.lower() for word in text if word not in string.punctuation])
#    tokens = re.split('\W+', text)
#    text = [word for word in tokens]
#    return text
transformer = TfidfTransformer()
loaded_vec = CountVectorizer(decode_error="replace", vocabulary=load('vocabulary.pkl'))
transformed = transformer.fit_transform(loaded_vec.fit_transform(df.body_text).toarray())

features = pd.concat([df[['body_len', 'punct%']].reset_index(drop=True),
                      pd.DataFrame(transformed.toarray()).reset_index(drop=True)], axis=1)
#dataset = pd.concat([features, feature_matrix.reset_index(drop=True)], axis=1, sort=False)

pred = model.predict(features)
labels = list(map(id_to_labels.get, pred))
df['label'] = labels
del df['body_len']
del df['punct%']
df.to_csv('result.csv', encoding='utf-8')
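A short end-to-end sketch of how predict.py is driven. testdata.txt and result.csv are the file names the script itself uses; the two sample lines are placeholders.

# Sketch: write one post per line, run the script, then read the labelled output.
import pandas as pd

with open('testdata.txt', 'w', encoding='utf-8') as f:
    f.write("Placeholder forum post number one\n")
    f.write("Placeholder forum post number two\n")

# after running:  python3 backend/webapp/predict.py
result = pd.read_csv('result.csv', encoding='utf-8')
print(result[['body_text', 'label']])

Note that predict.py expects labels.pkl, model.pkl and vocabulary.pkl in its working directory; model.py in this commit saves model.pkl and vocabulary.pkl, while labels.pkl is assumed to hold the id_to_label mapping.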
backend/webapp/prototype/filehandler/forms.py (path inferred from the imports below)
@@ -4,4 +4,4 @@ from prototype.filehandler.models import Document
 class DocumentForm(forms.ModelForm):
     class Meta:
         model = Document
-        fields = ('description', 'file', )
+        fields = ('file', )
BIN backend/webapp/prototype/filehandler/model.pkl (new file; binary file not shown)
backend/webapp/prototype/filehandler/models.py (path inferred from the imports below)
@@ -2,6 +2,5 @@ from __future__ import unicode_literals
 from django.db import models

 class Document(models.Model):
-    description = models.CharField(max_length=255, blank=True)
     file = models.FileField(upload_to='documents/')
     uploaded_at = models.DateTimeField(auto_now_add=True)
(views module, file path not shown in the extracted page)
@@ -7,7 +7,6 @@ from django.http import JsonResponse, HttpResponse
 from prototype.filehandler.models import Document
 from prototype.filehandler.forms import DocumentForm
-from prototype.filehandler.functions import isValidXML
 from prototype.filehandler.xmlParser import parseData


 def home(request):
@@ -17,16 +16,15 @@ def home(request):
 @csrf_exempt
 def model_form_upload(request):
     if request.method == 'POST':
-        if not isValidXML(request.FILES['file']):
-            return HttpResponse('Niepoprawny format XML', status=406)
-
         form = DocumentForm(request.POST, request.FILES)
-        print("POST: " + str(request.POST))
-        print("FILES: " + str(request.FILES))
         if form.is_valid():
-            form.save()
-            data = parseData(request.FILES['file'])
-            return JsonResponse(data, safe=False)
+            try:
+                data = parseData(request.FILES['file'])
+                print(data)
+                form.save()
+                return JsonResponse(data, safe=False)
+            except:
+                return HttpResponse('Niepoprawny format XML', status=406)
     else:
         form = DocumentForm()
     return render(request, 'core/model_form_upload.html', {
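In the new flow, parseData() both validates and parses the upload; 'Niepoprawny format XML' is Polish for "invalid XML format". A minimal client-side sketch of exercising this view follows; the endpoint URL is an assumption (the URL configuration is not part of this diff), and only the 'file' field name comes from DocumentForm.

# Sketch only: upload a forum XML export and read back the parsed JSON.
import requests

with open('forum.xml', 'rb') as f:  # 'forum.xml' is a placeholder input file
    resp = requests.post('http://localhost:8000/upload/', files={'file': f})  # URL assumed

if resp.status_code == 200:
    data = resp.json()                   # the structure returned by parseData()
    print(data['name'])
else:
    print(resp.status_code, resp.text)   # 406 with the Polish message on invalid XML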
BIN backend/webapp/prototype/filehandler/vocabulary.pkl (new file; binary file not shown)
backend/webapp/prototype/filehandler/xmlParser.py (path inferred from the imports above)
@@ -2,40 +2,50 @@

 import argparse
 from bs4 import BeautifulSoup
+from postmarkup import render_bbcode
 import json
+import html
+import re
+import tempfile


 def parseData(file):
     # arguments
     parser = argparse.ArgumentParser(description='Process some xml files.')
     parser.add_argument('filename', help='xml forum file')
     args = parser.parse_args()

     # write file first
-    with open('temp.xml', 'wb+') as destination:
-        for chunk in file.chunks():
-            destination.write(chunk)
-
-    # make a soup:
-    with open('temp.xml') as forum:
-        soup = BeautifulSoup(forum, "xml")
+    fd = tempfile.NamedTemporaryFile()
+    f = open(fd.name, "wb+")
+    for chunk in file.chunks():
+        f.write(chunk)
+    f.close()
+
+    # make a soup:
+    with open(fd.name) as forum:
+        soup = BeautifulSoup(forum, "xml")

     # put json together
     out = {}
     out['id'] = soup.forum.get('id')
     out['name'] = soup.forum.find('name').text
     out['discussions'] = []
     for d in soup.forum.find_all('discussion'):
+        posts = []
+        for p in d.find_all('post'):
+            post_soup = BeautifulSoup(html.unescape(str(p.message)), "lxml")
+            paragraphs = [render_bbcode(x.text) for x in post_soup.find_all('p')]
+            posts.append({
+                'id': p.get('id'),
+                'parent': p.find('parent').text,
+                'author': p.userid.text,
+                'message': [x for x in paragraphs if x]
+            })
         out['discussions'].append({
             'id': d.get('id'),
             'title': d.find('name').text,
             'first_post': d.firstpost.text,
-            'posts': [
-                {
-                    'id': p.get('id'),
-                    'parent': p.find('parent').text,
-                    'author': p.userid.text,
-                    'message': p.message.get_text()
-                } for p in d.find_all('post')]
+            'posts': posts
         })
+    fd.close()
     return(out)
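For reference, the dictionary parseData() returns (and the standalone script below writes to parsed.json) has roughly this shape; every value here is a made-up placeholder.

# Illustrative only: the keys match the code above, the values are invented.
example_output = {
    'id': '3',
    'name': 'Example forum',
    'discussions': [
        {
            'id': '17',
            'title': 'Example discussion',
            'first_post': '42',
            'posts': [
                {
                    'id': '42',
                    'parent': '0',
                    'author': '7',
                    'message': ['First rendered paragraph.', 'Second rendered paragraph.']
                }
            ]
        }
    ]
}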
BIN backend/webapp/vocabulary.pkl (new file; binary file not shown)
(standalone parser script, file path not shown in the extracted page)
@@ -1,40 +1,43 @@
 #!/usr/bin/env python3

 import argparse
 from bs4 import BeautifulSoup
+from postmarkup import render_bbcode
+import html
 import json

 # arguments
 parser = argparse.ArgumentParser(description='Process some xml files.')
 parser.add_argument('filename', help='xml forum file')
 args = parser.parse_args()

 # make a soup
 with open(args.filename) as forum:
     soup = BeautifulSoup(forum, "xml")

 # put json together
 out = {}
 out['id'] = soup.forum.get('id')
 out['name'] = soup.forum.find('name').text
 out['discussions'] = []
 for d in soup.forum.find_all('discussion'):
     posts = []
     for p in d.find_all('post'):
-        message_soup = BeautifulSoup(p.message.get_text(), "xml")
+        post_soup = BeautifulSoup(html.unescape(str(p.message)), "lxml")
+        paragraphs = [render_bbcode(x.text) for x in post_soup.find_all('p')]
         posts.append({
             'id': p.get('id'),
             'parent': p.find('parent').text,
             'author': p.userid.text,
-            'message': message_soup.get_text()
+            'message': [x for x in paragraphs if x]
         })
     out['discussions'].append({
         'id': d.get('id'),
         'title': d.find('name').text,
         'first_post': d.firstpost.text,
         'posts': posts
     })


 with open('parsed.json', 'w') as outfile:
     json.dump(out, outfile, ensure_ascii=False, indent=2)
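Both parsers now unescape the message HTML and pass each paragraph through postmarkup's render_bbcode, so BBCode markup inside posts ends up as HTML. A small illustration; the sample string is made up and the exact output may differ between postmarkup versions.

# Sketch: what render_bbcode does to a BBCode fragment.
from postmarkup import render_bbcode

print(render_bbcode('[b]hello[/b] visit [url]http://example.com[/url]'))
# e.g. '<strong>hello</strong> visit <a href="http://example.com">http://example.com</a>'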