Merge remote-tracking branch 'origin/master' into graph-styling-and-mapping-data

Michał Romaszkin 2020-06-13 14:47:10 +02:00
commit 192c52f587
11 changed files with 349 additions and 68 deletions

backend/edumaticParser.py (Normal file, +50)

@@ -0,0 +1,50 @@
#!/usr/bin/env python3
import argparse
from bs4 import BeautifulSoup
from postmarkup import render_bbcode
import html
import json
import re

# arguments
parser = argparse.ArgumentParser(description='Process some edumatic xml files.')
parser.add_argument('filename', help='xml forum file')
args = parser.parse_args()

# make a soup
with open(args.filename, 'rb') as forum:
    soup = BeautifulSoup(forum, "xml")

# put json together
out = {}
out['id'] = re.search(r'ID: (\d+)', soup.group.title.text).group(1)
out['name'] = soup.group.table.find('string').text
out['discussions'] = []
# synthetic ids: discussions get 50, 100, ..., and their posts fill the ids in between
did = 50
pid = did + 1
# ignore the first table and the rules, then take every second table
for d in soup.group.find_all('table')[4::2]:
    posts = []
    for p in d.find_all('row'):
        text = html.unescape(p.find_all('string')[1].text)
        paragraphs = [render_bbcode(x) for x in text.splitlines()]
        posts.append({
            'id': pid,
            'parent': pid - 1,
            'author': p.find_all('string')[2].text,
            'message': [x for x in paragraphs if x]
        })
        pid = pid + 1
    out['discussions'].append({
        'id': did,
        'title': d.row.find('string').text,
        'first_post': did + 1,
        'posts': posts
    })
    did = did + 50
    pid = did + 1

with open('parsed.json', 'w', encoding='utf-8') as outfile:
    json.dump(out, outfile, ensure_ascii=False, indent=2)
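
The script writes everything to parsed.json. As a quick sanity check, a hypothetical snippet like the one below (an editor's sketch, not part of the commit) can reload the file and walk the schema the parser emits:

import json

with open('parsed.json', encoding='utf-8') as f:
    data = json.load(f)

print(data['id'], data['name'])
for disc in data['discussions']:
    # each discussion carries 'id', 'title', 'first_post' and a list of posts
    print(disc['id'], disc['title'], len(disc['posts']))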

backend/webapp/model.py (Normal file, +148)

@@ -0,0 +1,148 @@
import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn.metrics import confusion_matrix
import string
import re
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support as score
import featuretools as ft
import joblib

# Importing data and selecting desired columns
df = pd.read_csv('corp.tsv', sep='\t', encoding='utf-8')
print(df['label'].value_counts())
df['index'] = df.index
columns_titles = ["index", "id", "body_text", "label"]
df = df.reindex(columns=columns_titles)
col = ['index', 'body_text', 'label']
df = df[col]
df = df[pd.notnull(df['body_text'])]
df.columns = ['index', 'body_text', 'label']
duplicateDFRow = df[df.duplicated(['body_text'])]

# Factorizing labels for integer values
df['label_id'] = df['label'].factorize()[0]
label_id_df = df[['label', 'label_id']].drop_duplicates().sort_values('label_id')
label_to_id = dict(label_id_df.values)
id_to_label = dict(label_id_df[['label_id', 'label']].values)

# Sampling data
#from imblearn.over_sampling import RandomOverSampler
#from imblearn.under_sampling import RandomUnderSampler
#def resample(df, method):
#    """Resamples df using method with .fit_resample()
#
#    Args:
#        df (DataFrame): Fraud data
#        method (object): Resampler with .fit_resample() method
#    Returns:
#        resampled_df (DataFrame): Resampled DataFrame
#    """
#    target = df.pop('label_id')
#
#    processed_x, processed_y = method.fit_resample(df, target)
#
#    cols = list(df.columns) + ["label_id"]
#
#    pdf_x = pd.DataFrame(processed_x, columns=df.columns)
#    pdf_y = pd.DataFrame(processed_y, columns=['label_id'])
#    resampled_df = pd.concat([pdf_x, pdf_y], axis=1)
#
#    return resampled_df
#RUS = RandomUnderSampler(sampling_strategy={0: 650}, random_state=42)
#rus_resampled = resample(df, RUS)
#df = rus_resampled

# Feature engineering
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

df['body_len'] = df['body_text'].apply(lambda x: len(x) - x.count(" "))
df['punct%'] = df['body_text'].apply(lambda x: count_punct(x))
#es = ft.EntitySet(id="text_data")
#es = es.entity_from_dataframe(entity_id="data",
#                              index='index',
#                              dataframe=df)
#from nlp_primitives import (
#    DiversityScore,
#    LSA,
#    MeanCharactersPerWord,
#    PartOfSpeechCount,
#    PolarityScore,
#    PunctuationCount,
#    StopwordCount,
#    TitleWordCount,
#    UniversalSentenceEncoder,
#    UpperCaseCount)
#trans = [DiversityScore,
#         MeanCharactersPerWord,
#         TitleWordCount,
#         LSA,
#         PartOfSpeechCount,
#         UniversalSentenceEncoder,
#         UpperCaseCount]
#feature_matrix, feature_defs = ft.dfs(entityset=es,
#                                      target_entity='data',
#                                      verbose=True,
#                                      trans_primitives=trans,
#                                      max_depth=4)
#feature_matrix.drop(["body_len"], axis=1, inplace=True)
#feature_matrix.drop(["punct%"], axis=1, inplace=True)

# Vectorizing data
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split(r'\W+', text)  # raw string avoids an invalid-escape warning
    text = [word for word in tokens]
    return text

#tfidf = TfidfVectorizer(analyzer=clean_text)
tfidf = TfidfVectorizer(analyzer=clean_text, sublinear_tf=True, min_df=10,
                        max_features=None, norm='l2', encoding='utf-8',
                        ngram_range=(1, 2))
transformed = tfidf.fit_transform(df.body_text)
joblib.dump(tfidf.vocabulary_, 'vocabulary.pkl')
#features = tfidf.fit_transform(df.body_text).toarray()
features = pd.concat([df[['body_len', 'punct%']].reset_index(drop=True),
                      pd.DataFrame(transformed.toarray()).reset_index(drop=True)], axis=1)
#dataset = pd.concat([features, feature_matrix.reset_index(drop=True)], axis=1, sort=False)
labels = df.label_id

# Training the model
model = LogisticRegression(solver='lbfgs', max_iter=7000)
#model = LinearSVC(dual=False)
#model = joblib.load('model.pkl')
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    features, labels, df.index, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
joblib.dump(model, 'model.pkl')
joblib.dump(id_to_label, 'labels.pkl')  # the id -> label mapping that predict.py reloads
print("Accuracy of the model: " + str(accuracy_score(y_test, y_pred)*100) + "%")

# Generating confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=label_id_df.label.values,
            yticklabels=label_id_df.label.values)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
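
Note that model.py persists only the fitted vocabulary, and predict.py below rebuilds features with a freshly fitted TfidfTransformer, whose idf weighting is computed on the prediction data and can therefore drift from what the model saw in training. A hedged alternative (an editor's sketch, not what this commit does) is to dump the fitted vectorizer whole; clean_text must then be importable wherever the pickle is loaded:

import joblib

# persist the whole fitted vectorizer: keeps the learned idf weights and
# the custom clean_text analyzer together with the vocabulary
joblib.dump(tfidf, 'tfidf.pkl')

# later, at prediction time, just transform; no refitting involved
tfidf_loaded = joblib.load('tfidf.pkl')
new_features = tfidf_loaded.transform(df.body_text)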

backend/webapp/predict.py (Normal file, +73)

@@ -0,0 +1,73 @@
import pandas as pd
from joblib import load
import string
import re
import featuretools as ft
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

id_to_labels = load('labels.pkl')
data = open('testdata.txt').read().splitlines()
df = pd.DataFrame(data, columns=["body_text"])
df['index'] = df.index
columns_titles = ["index", "body_text"]
df = df.reindex(columns=columns_titles)
col = ['index', 'body_text']
df = df[col]
df.columns = ['index', 'body_text']
model = load('model.pkl')

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

df['body_len'] = df['body_text'].apply(lambda x: len(x) - x.count(" "))
df['punct%'] = df['body_text'].apply(lambda x: count_punct(x))
#es = ft.EntitySet(id="text_data")
#es = es.entity_from_dataframe(entity_id="data",
#                              index='index',
#                              dataframe=df)
#from nlp_primitives import (
#    DiversityScore,
#    LSA,
#    MeanCharactersPerWord,
#    TitleWordCount,
#    UpperCaseCount)
#trans = [DiversityScore,
#         MeanCharactersPerWord,
#         TitleWordCount,
#         LSA,
#         UpperCaseCount]
#feature_matrix, feature_defs = ft.dfs(entityset=es,
#                                      target_entity='data',
#                                      verbose=True,
#                                      trans_primitives=trans,
#                                      max_depth=4)
#feature_matrix.drop(["body_len"], axis=1, inplace=True)
#feature_matrix.drop(["punct%"], axis=1, inplace=True)

# Vectorizing data
#def clean_text(text):
#    text = "".join([word.lower() for word in text if word not in string.punctuation])
#    tokens = re.split(r'\W+', text)
#    text = [word for word in tokens]
#    return text
transformer = TfidfTransformer()
loaded_vec = CountVectorizer(decode_error="replace", vocabulary=load('vocabulary.pkl'))
transformed = transformer.fit_transform(loaded_vec.fit_transform(df.body_text).toarray())
features = pd.concat([df[['body_len', 'punct%']].reset_index(drop=True),
                      pd.DataFrame(transformed.toarray()).reset_index(drop=True)], axis=1)
#dataset = pd.concat([features, feature_matrix.reset_index(drop=True)], axis=1, sort=False)
pred = model.predict(features)
labels = list(map(id_to_labels.get, pred))
df['label'] = labels
del df['body_len']
del df['punct%']
df.to_csv('result.csv', encoding='utf-8')
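
For a quick end-to-end check, a hypothetical snippet (not in the commit) can inspect what the script wrote; result.csv keeps the index, body_text, and predicted label columns once body_len and punct% are dropped:

import pandas as pd

result = pd.read_csv('result.csv', encoding='utf-8')
print(result[['body_text', 'label']].head())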

prototype/filehandler/forms.py

@@ -4,4 +4,4 @@ from prototype.filehandler.models import Document
 class DocumentForm(forms.ModelForm):
     class Meta:
         model = Document
-        fields = ('description', 'file', )
+        fields = ('file', )

Binary file not shown.

prototype/filehandler/models.py

@@ -2,6 +2,5 @@ from __future__ import unicode_literals
 from django.db import models

 class Document(models.Model):
-    description = models.CharField(max_length=255, blank=True)
     file = models.FileField(upload_to='documents/')
     uploaded_at = models.DateTimeField(auto_now_add=True)

prototype/filehandler/views.py

@@ -7,7 +7,6 @@ from django.http import JsonResponse, HttpResponse
 from prototype.filehandler.models import Document
 from prototype.filehandler.forms import DocumentForm
-from prototype.filehandler.functions import isValidXML
 from prototype.filehandler.xmlParser import parseData

 def home(request):
@@ -17,16 +16,15 @@ def home(request):
 @csrf_exempt
 def model_form_upload(request):
     if request.method == 'POST':
-        if not isValidXML(request.FILES['file']):
-            return HttpResponse('Niepoprawny format XML', status=406)
         form = DocumentForm(request.POST, request.FILES)
-        print("POST: " + str(request.POST))
-        print("FILES: " + str(request.FILES))
         if form.is_valid():
-            form.save()
-            data = parseData(request.FILES['file'])
-            return JsonResponse(data, safe=False)
+            try:
+                data = parseData(request.FILES['file'])
+                print(data)
+                form.save()
+                return JsonResponse(data, safe=False)
+            except:
+                return HttpResponse('Niepoprawny format XML', status=406)
     else:
         form = DocumentForm()
     return render(request, 'core/model_form_upload.html', {
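
The refactor drops the separate isValidXML gate and validates by attempting the parse: if parseData raises, the view answers 406 with 'Niepoprawny format XML' (Polish for "invalid XML format"). One trade-off is the bare except, which also swallows unrelated errors. A hypothetical exercise of the endpoint with Django's test client, assuming the form view is routed at /upload/ (the URL and the test file name are assumptions, not shown in this diff):

from django.test import Client

client = Client()
with open('forum.xml', 'rb') as fh:
    response = client.post('/upload/', {'file': fh})

# a parseable forum export yields the JSON payload; anything else, a 406
print(response.status_code)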

Binary file not shown.

prototype/filehandler/xmlParser.py

@@ -2,40 +2,50 @@
 import argparse
 from bs4 import BeautifulSoup
+from postmarkup import render_bbcode
 import json
+import html
+import re
+import tempfile

 def parseData(file):
     # arguments
     parser = argparse.ArgumentParser(description='Process some xml files.')
     parser.add_argument('filename', help='xml forum file')
     args = parser.parse_args()

     # write file first
-    with open('temp.xml', 'wb+') as destination:
-        for chunk in file.chunks():
-            destination.write(chunk)
+    fd = tempfile.NamedTemporaryFile()
+    f = open(fd.name, "wb+")
+    for chunk in file.chunks():
+        f.write(chunk)
+    f.close()

     # make a soup:
-    with open('temp.xml') as forum:
+    with open(fd.name) as forum:
         soup = BeautifulSoup(forum, "xml")

     # put json together
     out = {}
     out['id'] = soup.forum.get('id')
     out['name'] = soup.forum.find('name').text
     out['discussions'] = []
     for d in soup.forum.find_all('discussion'):
+        posts = []
+        for p in d.find_all('post'):
+            post_soup = BeautifulSoup(html.unescape(str(p.message)), "lxml")
+            paragraphs = [render_bbcode(x.text) for x in post_soup.find_all('p')]
+            posts.append({
+                'id': p.get('id'),
+                'parent': p.find('parent').text,
+                'author': p.userid.text,
+                'message': [x for x in paragraphs if x]
+            })
         out['discussions'].append({
             'id': d.get('id'),
             'title': d.find('name').text,
             'first_post': d.firstpost.text,
-            'posts': [
-                {
-                    'id': p.get('id'),
-                    'parent': p.find('parent').text,
-                    'author': p.userid.text,
-                    'message': p.message.get_text()
-                } for p in d.find_all('post')]
+            'posts': posts
         })
+    fd.close()
     return(out)
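
A side note on the temp-file handling: NamedTemporaryFile already returns an open, writable handle, so opening fd.name a second time is redundant (and fails on Windows while the original handle is held). A hedged, editor-supplied variant that writes and reads through the one handle (parse_upload is a hypothetical name; file is the uploaded Django file object, as above):

import tempfile
from bs4 import BeautifulSoup

def parse_upload(file):
    # write the uploaded chunks through the temp-file handle itself,
    # then rewind and hand the same handle to BeautifulSoup
    with tempfile.NamedTemporaryFile() as fd:
        for chunk in file.chunks():
            fd.write(chunk)
        fd.flush()
        fd.seek(0)
        return BeautifulSoup(fd, "xml")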

Binary file not shown.

@@ -1,40 +1,43 @@
 #!/usr/bin/env python3
 import argparse
 from bs4 import BeautifulSoup
+from postmarkup import render_bbcode
+import html
 import json

 # arguments
 parser = argparse.ArgumentParser(description='Process some xml files.')
 parser.add_argument('filename', help='xml forum file')
 args = parser.parse_args()

 # make a soup
 with open(args.filename) as forum:
     soup = BeautifulSoup(forum, "xml")

 # put json together
 out = {}
 out['id'] = soup.forum.get('id')
 out['name'] = soup.forum.find('name').text
 out['discussions'] = []
 for d in soup.forum.find_all('discussion'):
     posts = []
     for p in d.find_all('post'):
-        message_soup = BeautifulSoup(p.message.get_text(), "xml")
+        post_soup = BeautifulSoup(html.unescape(str(p.message)), "lxml")
+        paragraphs = [render_bbcode(x.text) for x in post_soup.find_all('p')]
         posts.append({
             'id': p.get('id'),
             'parent': p.find('parent').text,
             'author': p.userid.text,
-            'message': message_soup.get_text()
+            'message': [x for x in paragraphs if x]
         })
     out['discussions'].append({
         'id': d.get('id'),
         'title': d.find('name').text,
         'first_post': d.firstpost.text,
         'posts': posts
     })

 with open('parsed.json', 'w') as outfile:
     json.dump(out, outfile, ensure_ascii=False, indent=2)
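
The message change in both parsers swaps a flat get_text() for per-paragraph BBCode rendering: entities are unescaped, each HTML paragraph is run through postmarkup's render_bbcode, and empty paragraphs are filtered out, so 'message' becomes a list of rendered HTML strings. Roughly, for a toy input (an illustration, not data from the commit):

from postmarkup import render_bbcode

paragraphs = [render_bbcode(x) for x in ('[b]hello[/b]', '', 'plain text')]
print([x for x in paragraphs if x])
# expected: ['<strong>hello</strong>', 'plain text']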