Implemented model in backend

This commit is contained in:
Marcin Armacki 2020-06-14 15:21:13 +02:00
parent 16e7c414a4
commit 5902c72f2e
3 changed files with 42 additions and 4 deletions

View File

@ -1,7 +1,41 @@
from pandas import DataFrame, concat
from joblib import load
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from prototype.filehandler.models import Forum, Discussion, Post, Paragraph from prototype.filehandler.models import Forum, Discussion, Post, Paragraph
def count_punct(text):
    """Return the share of punctuation among the non-space characters of ``text``.

    The result is a percentage: the ratio of characters found in
    ``string.punctuation`` to the number of characters that are not the
    space character ``" "``, rounded to three decimals and multiplied by 100.
    Only ``" "`` is excluded from the denominator; other whitespace such as
    tabs or newlines still counts as a character.

    Args:
        text: The string to inspect.

    Returns:
        The punctuation percentage as a float, or ``0.0`` when ``text`` has
        no non-space characters (empty or spaces only).
    """
    non_space = len(text) - text.count(" ")
    if non_space == 0:
        # Empty or all-space input: avoid dividing by zero.
        return 0.0
    punct = sum(1 for char in text if char in string.punctuation)
    # Rounding happens before scaling, exactly as in the original formula,
    # so the values fed to the model stay unchanged.
    return round(punct / non_space, 3) * 100
def createLabels(data):
    """Predict a label for each paragraph in ``data`` and store it on its row.

    Builds a feature frame from the paragraph texts (non-space length,
    punctuation percentage and TF-IDF weights over a pickled vocabulary),
    runs the pickled model on it, maps the predictions through the pickled
    id-to-label mapping and writes each label to the ``Paragraph`` with the
    matching primary key.

    Args:
        data: Mapping with ``'messages'`` (paragraph texts) and ``'para_id'``
            (primary keys of the corresponding ``Paragraph`` rows); the two
            sequences are paired positionally.

    Returns:
        ``True`` when all labels have been written, or immediately when there
        are no messages to label.
    """
    messages = data['messages']
    if len(messages) == 0:
        # Nothing to classify. Passing a zero-row batch on to scikit-learn
        # is expected to raise, so finish successfully before loading anything.
        return True

    id_to_labels = load('prototype/filehandler/labels.pkl')
    model = load('prototype/filehandler/model.pkl')
    vocabulary = load('prototype/filehandler/vocabulary.pkl')

    df = DataFrame(messages, columns=['body_text'])
    df['body_len'] = df['body_text'].apply(lambda x: len(x) - x.count(" "))
    df['punct%'] = df['body_text'].apply(count_punct)

    # The vocabulary is fixed, so the count columns always follow the pickled
    # vocabulary regardless of which words occur in this batch.
    loaded_vec = CountVectorizer(decode_error="replace", vocabulary=vocabulary)
    counts = loaded_vec.fit_transform(df.body_text)

    # NOTE(review): the TF-IDF weights are fitted on this batch, not loaded
    # from a fitted transformer, so they depend on the uploaded file's
    # contents -- confirm this matches how the model was trained.
    # The sparse count matrix is passed straight through; only the final
    # TF-IDF matrix is densified for the feature frame.
    transformed = TfidfTransformer().fit_transform(counts)

    features = concat(
        [df[['body_len', 'punct%']], DataFrame(transformed.toarray())],
        axis=1,
    )
    pred = model.predict(features)
    labels = list(map(id_to_labels.get, pred))

    for para_pk, label in zip(data['para_id'], labels):
        Paragraph.objects.filter(pk=para_pk).update(label=label)
    return True
def addToDatabase(data, file_id): def addToDatabase(data, file_id):
out = [] out = {}
para_id = []
messages = []
forum = Forum(forum_id = data['id'], name = data['name'], document_id = file_id) forum = Forum(forum_id = data['id'], name = data['name'], document_id = file_id)
forum.save() forum.save()
for discussion_ in data['discussions']: for discussion_ in data['discussions']:
@ -13,7 +47,10 @@ def addToDatabase(data, file_id):
for paragraph_ in post_['message']: for paragraph_ in post_['message']:
paragraph = Paragraph(message = paragraph_, label = '', post_id = post.pk) paragraph = Paragraph(message = paragraph_, label = '', post_id = post.pk)
paragraph.save() paragraph.save()
out.append(paragraph.pk) para_id.append(paragraph.pk)
messages.append(paragraph_)
out['para_id'] = para_id
out['messages'] = messages
return(out) return(out)
def listDiscussionsFromFile(id): def listDiscussionsFromFile(id):

Binary file not shown.

View File

@ -7,7 +7,7 @@ from django.http import JsonResponse, HttpResponse
from prototype.filehandler.models import Document, Forum from prototype.filehandler.models import Document, Forum
from prototype.filehandler.forms import DocumentForm from prototype.filehandler.forms import DocumentForm
from prototype.filehandler.xmlParser import parseData from prototype.filehandler.xmlParser import parseData
from prototype.filehandler.functions import addToDatabase, listDiscussionsFromFile from prototype.filehandler.functions import addToDatabase, listDiscussionsFromFile, createLabels
def home(request): def home(request):
@ -21,7 +21,8 @@ def model_form_upload(request):
if form.is_valid(): if form.is_valid():
data = parseData(request.FILES['file']) data = parseData(request.FILES['file'])
file_id = (form.save()).pk file_id = (form.save()).pk
addToDatabase(data, file_id) if not (createLabels(addToDatabase(data, file_id))):
return HttpResponse('Błąd przy dodawaniu informacji do bazy danych/tworzeniu etykiet', status=406)
output = listDiscussionsFromFile(file_id) output = listDiscussionsFromFile(file_id)
return JsonResponse(output, safe=False) return JsonResponse(output, safe=False)
else: else: