Merge remote-tracking branch 'origin/master' into graph-styling-and-mapping-data
Commit 192c52f587
backend/edumaticParser.py (new file, 50 lines)
@@ -0,0 +1,50 @@
#!/usr/bin/env python3

import argparse
from bs4 import BeautifulSoup
from postmarkup import render_bbcode
import html
import json
import re


# arguments
parser = argparse.ArgumentParser(description='Process some edumatic xml files.')
parser.add_argument('filename', help='xml forum file')
args = parser.parse_args()

# make a soup
with open(args.filename, 'rb') as forum:
    soup = BeautifulSoup(forum, "xml")

# put json together
out = {}
out['id'] = re.search(r'ID: (\d+)', soup.group.title.text).group(1)
out['name'] = soup.group.table.find('string').text
out['discussions'] = []
did = 50
pid = did + 1
# we ignore first table, and then rules
for d in soup.group.find_all('table')[4::2]:
    posts = []
    for p in d.find_all('row'):
        text = html.unescape(p.find_all('string')[1].text)
        paragraphs = [render_bbcode(x) for x in text.splitlines()]
        posts.append({
            'id': pid,
            'parent': pid - 1,
            'author': p.find_all('string')[2].text,
            'message': [x for x in paragraphs if x]
        })
        pid = pid + 1
    out['discussions'].append({
        'id': did,
        'title': d.row.find('string').text,
        'first_post': did + 1,
        'posts': posts
    })
    did = did + 50
    pid = did + 1

with open('parsed.json', 'w', encoding='utf-8') as outfile:
    json.dump(out, outfile, ensure_ascii=False, indent=2)
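For context, a minimal usage sketch of the new parser. The input file name forum.xml is an assumption; parsed.json and the keys it contains come from the script above.

# Hypothetical invocation (run next to an exported edumatic XML file):
#     python3 backend/edumaticParser.py forum.xml
# The script writes parsed.json, which can then be inspected:
import json

with open('parsed.json', encoding='utf-8') as f:
    parsed = json.load(f)

print(parsed['id'], parsed['name'])
for disc in parsed['discussions']:
    print(disc['id'], disc['title'], 'posts:', len(disc['posts']))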
backend/webapp/model.py (new file, 148 lines)
@@ -0,0 +1,148 @@
import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn.metrics import confusion_matrix
import string
import re
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support as score
import featuretools as ft
import joblib

# Importing data and selecting desired columns
df = pd.read_csv('corp.tsv', sep='\t', encoding='utf-8')
print(df['label'].value_counts())
df['index'] = df.index
columns_titles = ["index", "id", "body_text", "label"]
df = df.reindex(columns=columns_titles)
col = ['index', 'body_text', 'label']
df = df[col]
df = df[pd.notnull(df['body_text'])]
df.columns = ['index', 'body_text', 'label']
duplicateDFRow = df[df.duplicated(['body_text'])]

# Factorizing labels for integer values
df['label_id'] = df['label'].factorize()[0]
label_id_df = df[['label', 'label_id']].drop_duplicates().sort_values('label_id')
label_to_id = dict(label_id_df.values)
id_to_label = dict(label_id_df[['label_id', 'label']].values)

# Sampling data
#from imblearn.over_sampling import RandomOverSampler
#from imblearn.under_sampling import RandomUnderSampler

#def resample(df, method):
#    """Resamples df using method with .fit_resample()
#
#    Args:
#        df (DataFrame): Fraud data
#        method (object): Resampler with .fit_resample() method
#
#    Returns:
#        resampled_df (DataFrame): Resampled DataFrame
#    """
#    target = df.pop('label_id')
#
#    processed_x, processed_y = method.fit_resample(df, target)
#
#    cols = list(df.columns) + ["label_id"]
#
#    pdf_x = pd.DataFrame(processed_x, columns=df.columns)
#    pdf_y = pd.DataFrame(processed_y, columns=['label_id'])
#    resampled_df = pd.concat([pdf_x, pdf_y], axis=1)
#
#    return resampled_df

#RUS = RandomUnderSampler(sampling_strategy={0: 650}, random_state=42)
#rus_resampled = resample(df, RUS)
#df = rus_resampled

# Feature engineering
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count / (len(text) - text.count(" ")), 3) * 100

df['body_len'] = df['body_text'].apply(lambda x: len(x) - x.count(" "))
df['punct%'] = df['body_text'].apply(lambda x: count_punct(x))

#es = ft.EntitySet(id="text_data")
#es = es.entity_from_dataframe(entity_id="data",
#                              index='index',
#                              dataframe=df)

#from nlp_primitives import (
#    DiversityScore,
#    LSA,
#    MeanCharactersPerWord,
#    PartOfSpeechCount,
#    PolarityScore,
#    PunctuationCount,
#    StopwordCount,
#    TitleWordCount,
#    UniversalSentenceEncoder,
#    UpperCaseCount)

#trans = [DiversityScore,
#         MeanCharactersPerWord,
#         TitleWordCount,
#         LSA,
#         PartOfSpeechCount,
#         UniversalSentenceEncoder,
#         UpperCaseCount]

#feature_matrix, feature_defs = ft.dfs(entityset=es,
#                                      target_entity='data',
#                                      verbose=True,
#                                      trans_primitives=trans,
#                                      max_depth=4)

#feature_matrix.drop(["body_len"], axis=1, inplace=True)
#feature_matrix.drop(["punct%"], axis=1, inplace=True)

# Vectorizing data
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split(r'\W+', text)
    text = [word for word in tokens]
    return text

#tfidf = TfidfVectorizer(analyzer=clean_text)
tfidf = TfidfVectorizer(analyzer=clean_text, sublinear_tf=True, min_df=10, max_features=None, norm='l2', encoding='utf-8', ngram_range=(1, 2))
transformed = tfidf.fit_transform(df.body_text)
joblib.dump(tfidf.vocabulary_, 'vocabulary.pkl')
#features = tfidf.fit_transform(df.body_text).toarray()
features = pd.concat([df[['body_len', 'punct%']].reset_index(drop=True),
                      pd.DataFrame(transformed.toarray()).reset_index(drop=True)], axis=1)
#dataset = pd.concat([features, feature_matrix.reset_index(drop=True)], axis=1, sort=False)
labels = df.label_id

# Teaching model
model = LogisticRegression(solver='lbfgs', max_iter=7000)
#model = LinearSVC(dual=False)
#model = joblib.load('model.pkl')

X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df.index, test_size=0.2, random_state=42)
#model = joblib.load('model.pkl')

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
joblib.dump(model, 'model.pkl')
print("Accuracy of the model: " + str(accuracy_score(y_test, y_pred) * 100) + "%")

# Generating confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=label_id_df.label.values,
            yticklabels=label_id_df.label.values)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
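cross_val_score is imported in model.py but never called. A minimal sketch of how it could complement the single train/test split above; the 5-fold choice is an assumption, while features and labels are the objects built by the script.

# Sketch only (not part of the commit): 5-fold cross-validation on the same data.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

cv_model = LogisticRegression(solver='lbfgs', max_iter=7000)
cv_scores = cross_val_score(cv_model, features, labels, cv=5)  # cv=5 is an assumed choice
print("CV accuracy: %.3f +/- %.3f" % (cv_scores.mean(), cv_scores.std()))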
backend/webapp/predict.py (new file, 73 lines)
@@ -0,0 +1,73 @@
import pandas as pd
from joblib import load
import string
import re
import featuretools as ft
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

id_to_labels = load('labels.pkl')
data = open('testdata.txt').read().splitlines()
df = pd.DataFrame(data, columns=["body_text"])
df['index'] = df.index
columns_titles = ["index", "body_text"]
df = df.reindex(columns=columns_titles)
col = ['index', 'body_text']
df = df[col]
df.columns = ['index', 'body_text']

model = load('model.pkl')

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count / (len(text) - text.count(" ")), 3) * 100

df['body_len'] = df['body_text'].apply(lambda x: len(x) - x.count(" "))
df['punct%'] = df['body_text'].apply(lambda x: count_punct(x))

#es = ft.EntitySet(id="text_data")
#es = es.entity_from_dataframe(entity_id="data",
#                              index='index',
#                              dataframe=df)
#from nlp_primitives import (
#    DiversityScore,
#    LSA,
#    MeanCharactersPerWord,
#    TitleWordCount,
#    UpperCaseCount)

#trans = [DiversityScore,
#         MeanCharactersPerWord,
#         TitleWordCount,
#         LSA,
#         UpperCaseCount]
#feature_matrix, feature_defs = ft.dfs(entityset=es,
#                                      target_entity='data',
#                                      verbose=True,
#                                      trans_primitives=trans,
#                                      max_depth=4)
#feature_matrix.drop(["body_len"], axis=1, inplace=True)
#feature_matrix.drop(["punct%"], axis=1, inplace=True)


# Vectorizing data
#def clean_text(text):
#    text = "".join([word.lower() for word in text if word not in string.punctuation])
#    tokens = re.split('\W+', text)
#    text = [word for word in tokens]
#    return text
transformer = TfidfTransformer()
loaded_vec = CountVectorizer(decode_error="replace", vocabulary=load('vocabulary.pkl'))
transformed = transformer.fit_transform(loaded_vec.fit_transform(df.body_text).toarray())

features = pd.concat([df[['body_len', 'punct%']].reset_index(drop=True),
                      pd.DataFrame(transformed.toarray()).reset_index(drop=True)], axis=1)
#dataset = pd.concat([features, feature_matrix.reset_index(drop=True)], axis=1, sort=False)

pred = model.predict(features)
labels = list(map(id_to_labels.get, pred))
df['label'] = labels
del df['body_len']
del df['punct%']
df.to_csv('result.csv', encoding='utf-8')
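A short end-to-end sketch of how predict.py is driven. testdata.txt and result.csv are the file names the script itself uses; the two sample lines are placeholders.

# Sketch: write one post per line, run the script, then read the labelled output.
import pandas as pd

with open('testdata.txt', 'w', encoding='utf-8') as f:
    f.write("Placeholder forum post number one\n")
    f.write("Placeholder forum post number two\n")

# after running:  python3 backend/webapp/predict.py
result = pd.read_csv('result.csv', encoding='utf-8')
print(result[['body_text', 'label']])

Note that predict.py expects labels.pkl, model.pkl and vocabulary.pkl in its working directory; model.py in this commit saves model.pkl and vocabulary.pkl, while labels.pkl is assumed to hold the id_to_label mapping.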
backend/webapp/prototype/filehandler/forms.py (path inferred from the imports below)
@@ -4,4 +4,4 @@ from prototype.filehandler.models import Document
 class DocumentForm(forms.ModelForm):
     class Meta:
         model = Document
-        fields = ('description', 'file', )
+        fields = ('file', )
BIN backend/webapp/prototype/filehandler/model.pkl (new file; binary file not shown)
backend/webapp/prototype/filehandler/models.py (path inferred from the imports below)
@@ -2,6 +2,5 @@ from __future__ import unicode_literals
 from django.db import models

 class Document(models.Model):
-    description = models.CharField(max_length=255, blank=True)
     file = models.FileField(upload_to='documents/')
     uploaded_at = models.DateTimeField(auto_now_add=True)
(views module, file path not shown in the extracted page)
@@ -7,7 +7,6 @@ from django.http import JsonResponse, HttpResponse
 from prototype.filehandler.models import Document
 from prototype.filehandler.forms import DocumentForm
-from prototype.filehandler.functions import isValidXML
 from prototype.filehandler.xmlParser import parseData


 def home(request):
@@ -17,16 +16,15 @@ def home(request):
 @csrf_exempt
 def model_form_upload(request):
     if request.method == 'POST':
-        if not isValidXML(request.FILES['file']):
-            return HttpResponse('Niepoprawny format XML', status=406)
-
         form = DocumentForm(request.POST, request.FILES)
-        print("POST: " + str(request.POST))
-        print("FILES: " + str(request.FILES))
         if form.is_valid():
-            form.save()
-            data = parseData(request.FILES['file'])
-            return JsonResponse(data, safe=False)
+            try:
+                data = parseData(request.FILES['file'])
+                print(data)
+                form.save()
+                return JsonResponse(data, safe=False)
+            except:
+                return HttpResponse('Niepoprawny format XML', status=406)
     else:
         form = DocumentForm()
     return render(request, 'core/model_form_upload.html', {
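In the new flow, parseData() both validates and parses the upload; 'Niepoprawny format XML' is Polish for "invalid XML format". A minimal client-side sketch of exercising this view follows; the endpoint URL is an assumption (the URL configuration is not part of this diff), and only the 'file' field name comes from DocumentForm.

# Sketch only: upload a forum XML export and read back the parsed JSON.
import requests

with open('forum.xml', 'rb') as f:  # 'forum.xml' is a placeholder input file
    resp = requests.post('http://localhost:8000/upload/', files={'file': f})  # URL assumed

if resp.status_code == 200:
    data = resp.json()                   # the structure returned by parseData()
    print(data['name'])
else:
    print(resp.status_code, resp.text)   # 406 with the Polish message on invalid XML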
BIN backend/webapp/prototype/filehandler/vocabulary.pkl (new file; binary file not shown)
backend/webapp/prototype/filehandler/xmlParser.py (path inferred from the imports above)
@@ -2,40 +2,50 @@

 import argparse
 from bs4 import BeautifulSoup
+from postmarkup import render_bbcode
 import json
+import html
+import re
+import tempfile


 def parseData(file):
     # arguments
     parser = argparse.ArgumentParser(description='Process some xml files.')
     parser.add_argument('filename', help='xml forum file')
     args = parser.parse_args()

     # write file first
-    with open('temp.xml', 'wb+') as destination:
-        for chunk in file.chunks():
-            destination.write(chunk)
-
-    # make a soup:
-    with open('temp.xml') as forum:
-        soup = BeautifulSoup(forum, "xml")
+    fd = tempfile.NamedTemporaryFile()
+    f = open(fd.name, "wb+")
+    for chunk in file.chunks():
+        f.write(chunk)
+    f.close()
+
+    # make a soup:
+    with open(fd.name) as forum:
+        soup = BeautifulSoup(forum, "xml")

     # put json together
     out = {}
     out['id'] = soup.forum.get('id')
     out['name'] = soup.forum.find('name').text
     out['discussions'] = []
     for d in soup.forum.find_all('discussion'):
+        posts = []
+        for p in d.find_all('post'):
+            post_soup = BeautifulSoup(html.unescape(str(p.message)), "lxml")
+            paragraphs = [render_bbcode(x.text) for x in post_soup.find_all('p')]
+            posts.append({
+                'id': p.get('id'),
+                'parent': p.find('parent').text,
+                'author': p.userid.text,
+                'message': [x for x in paragraphs if x]
+            })
         out['discussions'].append({
             'id': d.get('id'),
             'title': d.find('name').text,
             'first_post': d.firstpost.text,
-            'posts': [
-                {
-                    'id': p.get('id'),
-                    'parent': p.find('parent').text,
-                    'author': p.userid.text,
-                    'message': p.message.get_text()
-                } for p in d.find_all('post')]
+            'posts': posts
         })
+    fd.close()
     return(out)
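For reference, the dictionary parseData() returns (and the standalone script below writes to parsed.json) has roughly this shape; every value here is a made-up placeholder.

# Illustrative only: the keys match the code above, the values are invented.
example_output = {
    'id': '3',
    'name': 'Example forum',
    'discussions': [
        {
            'id': '17',
            'title': 'Example discussion',
            'first_post': '42',
            'posts': [
                {
                    'id': '42',
                    'parent': '0',
                    'author': '7',
                    'message': ['First rendered paragraph.', 'Second rendered paragraph.']
                }
            ]
        }
    ]
}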
BIN backend/webapp/vocabulary.pkl (new file; binary file not shown)
(standalone parser script, file path not shown in the extracted page)
@@ -1,40 +1,43 @@
 #!/usr/bin/env python3

 import argparse
 from bs4 import BeautifulSoup
+from postmarkup import render_bbcode
+import html
 import json

 # arguments
 parser = argparse.ArgumentParser(description='Process some xml files.')
 parser.add_argument('filename', help='xml forum file')
 args = parser.parse_args()

 # make a soup
 with open(args.filename) as forum:
     soup = BeautifulSoup(forum, "xml")

 # put json together
 out = {}
 out['id'] = soup.forum.get('id')
 out['name'] = soup.forum.find('name').text
 out['discussions'] = []
 for d in soup.forum.find_all('discussion'):
     posts = []
     for p in d.find_all('post'):
-        message_soup = BeautifulSoup(p.message.get_text(), "xml")
+        post_soup = BeautifulSoup(html.unescape(str(p.message)), "lxml")
+        paragraphs = [render_bbcode(x.text) for x in post_soup.find_all('p')]
         posts.append({
             'id': p.get('id'),
             'parent': p.find('parent').text,
             'author': p.userid.text,
-            'message': message_soup.get_text()
+            'message': [x for x in paragraphs if x]
         })
     out['discussions'].append({
         'id': d.get('id'),
         'title': d.find('name').text,
         'first_post': d.firstpost.text,
         'posts': posts
     })


 with open('parsed.json', 'w') as outfile:
     json.dump(out, outfile, ensure_ascii=False, indent=2)
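Both parsers now unescape the message HTML and pass each paragraph through postmarkup's render_bbcode, so BBCode markup inside posts ends up as HTML. A small illustration; the sample string is made up and the exact output may differ between postmarkup versions.

# Sketch: what render_bbcode does to a BBCode fragment.
from postmarkup import render_bbcode

print(render_bbcode('[b]hello[/b] visit [url]http://example.com[/url]'))
# e.g. '<strong>hello</strong> visit <a href="http://example.com">http://example.com</a>'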