diff --git a/backend/edumaticParser.py b/backend/edumaticParser.py
new file mode 100644
index 0000000..f2e2775
--- /dev/null
+++ b/backend/edumaticParser.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+
+import argparse
+from bs4 import BeautifulSoup
+from postmarkup import render_bbcode
+import html
+import json
+import re
+
+
+# arguments
+parser = argparse.ArgumentParser(description='Process some edumatic xml files.')
+parser.add_argument('filename', help='xml forum file')
+args = parser.parse_args()
+
+# make a soup
+with open(args.filename, 'rb') as forum:
+    soup = BeautifulSoup(forum, "xml")
+
+# put json together
+out = {}
+out['id'] = re.search(r'ID: (\d+)', soup.group.title.text).group(1)
+out['name'] = soup.group.table.find('string').text
+out['discussions'] = []
+did = 50
+pid = did + 1
+# we ignore first table, and then rules
+for d in soup.group.find_all('table')[4::2]:
+    posts = []
+    for p in d.find_all('row'):
+        text = html.unescape(p.find_all('string')[1].text)
+        paragraphs = [render_bbcode(x) for x in text.splitlines()]
+        posts.append({
+            'id': pid,
+            'parent': pid - 1,
+            'author': p.find_all('string')[2].text,
+            'message': [x for x in paragraphs if x]
+        })
+        pid = pid + 1
+    out['discussions'].append({
+        'id' : did,
+        'title': d.row.find('string').text,
+        'first_post': did + 1,
+        'posts': posts
+    })
+    did = did + 50
+    pid = did + 1
+
+with open('parsed.json', 'w', encoding='utf-8') as outfile:
+    json.dump(out, outfile, ensure_ascii=False, indent=2)
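
Note that the numeric ids written by edumaticParser.py are synthetic, since the edumatic export does not carry them: each discussion gets 50, 100, 150, and so on, its posts are numbered consecutively from did + 1, and every post's parent points at the previous id (so the first post of a discussion points back at the discussion id itself). A small illustration of that scheme, not part of the committed script:

# Illustration only: how did/pid evolve in edumaticParser.py above.
for n, did in enumerate((50, 100, 150), start=1):
    print(f"discussion {n}: id={did}, first_post={did + 1}, first post's parent={did}")
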
diff --git a/backend/webapp/model.py b/backend/webapp/model.py
new file mode 100644
index 0000000..f157da6
--- /dev/null
+++ b/backend/webapp/model.py
@@ -0,0 +1,148 @@
+import pandas as pd
+from io import StringIO
+import matplotlib.pyplot as plt
+from sklearn.feature_extraction.text import TfidfVectorizer
+import numpy as np
+from sklearn.svm import LinearSVC
+from sklearn.model_selection import cross_val_score
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.metrics import accuracy_score
+import seaborn as sns
+from sklearn.metrics import confusion_matrix
+import string
+import re
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import precision_recall_fscore_support as score
+import featuretools as ft
+import joblib
+
+# Importing data and selecting desired columns
+df = pd.read_csv('corp.tsv', sep='\t', encoding='utf-8')
+print(df['label'].value_counts())
+df['index'] = df.index
+columns_titles = ["index", "id", "body_text", "label"]
+df=df.reindex(columns=columns_titles)
+col = ['index','body_text', 'label']
+df = df[col]
+df = df[pd.notnull(df['body_text'])]
+df.columns = ['index','body_text', 'label']
+duplicateDFRow = df[df.duplicated(['body_text'])]
+
+# Factorizing labels for integer values
+df['label_id'] = df['label'].factorize()[0]
+label_id_df = df[['label', 'label_id']].drop_duplicates().sort_values('label_id')
+label_to_id = dict(label_id_df.values)
+id_to_label = dict(label_id_df[['label_id', 'label']].values)
+
+# Sampling data
+#from imblearn.over_sampling import RandomOverSampler
+#from imblearn.under_sampling import RandomUnderSampler
+
+#def resample(df, method):
+#    """Resamples df using method with .fit_resample()
+#
+#    Args:
+#        df (DataFrame): Fraud data
+#        method (object): Resampler with .fit_resample() method
+#    Returns:
+#        resampled_df (DataFrame): Resampled DataFrame
+#    """
+#    target = df.pop('label_id')
+#
+#    processed_x, processed_y = method.fit_resample(df, target)
+#
+#    cols = list(df.columns) + ["label_id"]
+#
+#    pdf_x = pd.DataFrame(processed_x, columns=df.columns)
+#    pdf_y = pd.DataFrame(processed_y, columns=['label_id'])
+#    resampled_df = pd.concat([pdf_x, pdf_y], axis=1)
+#
+#    return resampled_df
+#RUS = RandomUnderSampler(sampling_strategy={0: 650}, random_state=42)
+#rus_resampled = resample(df, RUS)
+#df = rus_resampled
+
+# Feature engineering
+def count_punct(text):
+    count = sum([1 for char in text if char in string.punctuation])
+    return round(count/(len(text) - text.count(" ")), 3)*100
+
+df['body_len'] = df['body_text'].apply(lambda x: len(x) - x.count(" "))
+df['punct%'] = df['body_text'].apply(lambda x: count_punct(x))
+
+#es = ft.EntitySet(id="text_data")
+#es = es.entity_from_dataframe(entity_id="data",
+#                              index='index',
+#                              dataframe=df)
+
+#from nlp_primitives import (
+#    DiversityScore,
+#    LSA,
+#    MeanCharactersPerWord,
+#    PartOfSpeechCount,
+#    PolarityScore,
+#    PunctuationCount,
+#    StopwordCount,
+#    TitleWordCount,
+#    UniversalSentenceEncoder,
+#    UpperCaseCount)
+
+
+#trans = [DiversityScore,
+#         MeanCharactersPerWord,
+#         TitleWordCount,
+#         LSA,
+#         PartOfSpeechCount,
+#         UniversalSentenceEncoder,
+#         UpperCaseCount]
+
+#feature_matrix, feature_defs = ft.dfs(entityset=es,
+#                                      target_entity='data',
+#                                      verbose=True,
+#                                      trans_primitives=trans,
+#                                      max_depth=4)
+
+#feature_matrix.drop(["body_len"], axis=1, inplace=True)
+#feature_matrix.drop(["punct%"], axis=1, inplace=True)
+
+# Vectorizing data
+def clean_text(text):
+    text = "".join([word.lower() for word in text if word not in string.punctuation])
+    tokens = re.split(r'\W+', text)
+    text = [word for word in tokens]
+    return text
+
+#tfidf = TfidfVectorizer(analyzer=clean_text)
+tfidf = TfidfVectorizer(analyzer=clean_text,sublinear_tf=True, min_df=10, max_features=None, norm='l2', encoding='utf-8', ngram_range=(1,2))
+transformed = tfidf.fit_transform(df.body_text)
+joblib.dump(tfidf.vocabulary_, 'vocabulary.pkl')
+#features = tfidf.fit_transform(df.body_text).toarray()
+features = pd.concat([df[['body_len', 'punct%']].reset_index(drop=True),
+                      pd.DataFrame(transformed.toarray()).reset_index(drop=True)], axis=1)
+#dataset = pd.concat([features,feature_matrix.reset_index(drop=True)], axis=1, sort=False)
+labels = df.label_id
+
+# Teaching model
+model = LogisticRegression(solver='lbfgs', max_iter=7000)
+#model = LinearSVC(dual=False)
+#model = joblib.load('model.pkl')
+
+X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df.index, test_size=0.2, random_state=42)
+#model = joblib.load('model.pkl')
+
+model.fit(X_train, y_train)
+y_pred = model.predict(X_test)
+joblib.dump(model, 'model.pkl')
+print("Accuracy of the model: " + str(accuracy_score(y_test, y_pred)*100) + "%")
+
+# Generating confusion matrix
+conf_mat = confusion_matrix(y_test, y_pred)
+fig, ax = plt.subplots(figsize=(10,10))
+sns.heatmap(conf_mat, annot=True, fmt='d',
+            xticklabels=label_id_df.label.values,
+            yticklabels=label_id_df.label.values)
+plt.ylabel('Actual')
+plt.xlabel('Predicted')
+plt.show()
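
predict.py below loads the id-to-label mapping from labels.pkl, but nothing in this diff appears to write that file. A minimal sketch of the presumably missing dump, mirroring how model.py already persists model.pkl and vocabulary.pkl with joblib (the file name is taken from predict.py; id_to_label is the dict built above):

# Sketch only, not part of the committed model.py: persist the id -> label map
# so that predict.py can translate numeric predictions back into label names.
joblib.dump(id_to_label, 'labels.pkl')
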
diff --git a/backend/webapp/predict.py b/backend/webapp/predict.py
new file mode 100644
index 0000000..d5297b8
--- /dev/null
+++ b/backend/webapp/predict.py
@@ -0,0 +1,73 @@
+import pandas as pd
+from joblib import load
+import string
+import re
+import featuretools as ft
+from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
+
+id_to_labels = load('labels.pkl')
+data = open('testdata.txt').read().splitlines()
+df = pd.DataFrame(data, columns=["body_text"])
+df['index'] = df.index
+columns_titles = ["index", "body_text"]
+df=df.reindex(columns=columns_titles)
+col = ['index','body_text']
+df = df[col]
+df.columns = ['index','body_text']
+
+model = load('model.pkl')
+
+def count_punct(text):
+    count = sum([1 for char in text if char in string.punctuation])
+    return round(count/(len(text) - text.count(" ")), 3)*100
+
+df['body_len'] = df['body_text'].apply(lambda x: len(x) - x.count(" "))
+df['punct%'] = df['body_text'].apply(lambda x: count_punct(x))
+
+#es = ft.EntitySet(id="text_data")
+#es = es.entity_from_dataframe(entity_id="data",
+#                              index='index',
+#                              dataframe=df)
+#from nlp_primitives import (
+#    DiversityScore,
+#    LSA,
+#    MeanCharactersPerWord,
+#    TitleWordCount,
+#    UpperCaseCount)
+
+
+#trans = [DiversityScore,
+#         MeanCharactersPerWord,
+#         TitleWordCount,
+#         LSA,
+#         UpperCaseCount]
+#feature_matrix, feature_defs = ft.dfs(entityset=es,
+#                                      target_entity='data',
+#                                      verbose=True,
+#                                      trans_primitives=trans,
+#                                      max_depth=4)
+#feature_matrix.drop(["body_len"], axis=1, inplace=True)
+#feature_matrix.drop(["punct%"], axis=1, inplace=True)
+
+
+# Vectorizing data
+#def clean_text(text):
+#    text = "".join([word.lower() for word in text if word not in string.punctuation])
+#    tokens = re.split('\W+', text)
+#    text = [word for word in tokens]
+#    return text
+transformer = TfidfTransformer()
+loaded_vec = CountVectorizer(decode_error="replace",vocabulary=load('vocabulary.pkl'))
+transformed = transformer.fit_transform(loaded_vec.fit_transform(df.body_text).toarray())
+
+features = pd.concat([df[['body_len', 'punct%']].reset_index(drop=True),
+                      pd.DataFrame(transformed.toarray()).reset_index(drop=True)], axis=1)
+#dataset = pd.concat([features,feature_matrix.reset_index(drop=True)], axis=1, sort=False)
+
+pred = model.predict(features)
+labels = list(map(id_to_labels.get, pred))
+df['label'] = labels
+del df['body_len']
+del df['punct%']
+df.to_csv('result.csv', encoding='utf-8')
+
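
One thing worth noting about predict.py above: the TF-IDF step is rebuilt from vocabulary.pkl with a fresh CountVectorizer plus TfidfTransformer and then fit_transform-ed on testdata.txt, so the IDF weights are re-estimated from the prediction data rather than reused from training. A minimal sketch of the alternative, assuming the fitted vectorizer itself were serialized at training time (tfidf.pkl is a hypothetical file name, and the custom clean_text analyzer would have to be importable when the pickle is loaded):

# Sketch only -- an alternative to rebuilding the vectorizer by hand.
# Training side (model.py): joblib.dump(tfidf, 'tfidf.pkl')   # hypothetical artifact
# Prediction side:
from joblib import load
tfidf = load('tfidf.pkl')                    # not part of this diff
transformed = tfidf.transform(df.body_text)  # reuses the training-time IDF weights
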
diff --git a/backend/webapp/prototype/filehandler/forms.py b/backend/webapp/prototype/filehandler/forms.py
index a8d8888..d433614 100644
--- a/backend/webapp/prototype/filehandler/forms.py
+++ b/backend/webapp/prototype/filehandler/forms.py
@@ -4,4 +4,4 @@ from prototype.filehandler.models import Document
 class DocumentForm(forms.ModelForm):
     class Meta:
         model = Document
-        fields = ('description', 'file', )
+        fields = ('file', )
diff --git a/backend/webapp/prototype/filehandler/model.pkl b/backend/webapp/prototype/filehandler/model.pkl
new file mode 100644
index 0000000..074eb0b
Binary files /dev/null and b/backend/webapp/prototype/filehandler/model.pkl differ
diff --git a/backend/webapp/prototype/filehandler/models.py b/backend/webapp/prototype/filehandler/models.py
index 5ccb64b..84cc772 100644
--- a/backend/webapp/prototype/filehandler/models.py
+++ b/backend/webapp/prototype/filehandler/models.py
@@ -2,6 +2,5 @@ from __future__ import unicode_literals
 from django.db import models
 
 class Document(models.Model):
-    description = models.CharField(max_length=255, blank=True)
     file = models.FileField(upload_to='documents/')
     uploaded_at = models.DateTimeField(auto_now_add=True)
diff --git a/backend/webapp/prototype/filehandler/views.py b/backend/webapp/prototype/filehandler/views.py
index 5d9c3b8..42942dc 100644
--- a/backend/webapp/prototype/filehandler/views.py
+++ b/backend/webapp/prototype/filehandler/views.py
@@ -7,7 +7,6 @@
 from django.http import JsonResponse, HttpResponse
 from prototype.filehandler.models import Document
 from prototype.filehandler.forms import DocumentForm
-from prototype.filehandler.functions import isValidXML
 from prototype.filehandler.xmlParser import parseData
 
 def home(request):
@@ -17,16 +16,15 @@ def model_form_upload(request):
     if request.method == 'POST':
-        if not isValidXML(request.FILES['file']):
-            return HttpResponse('Niepoprawny format XML', status=406)
-
         form = DocumentForm(request.POST, request.FILES)
-        print("POST: " + str(request.POST))
-        print("FILES: " + str(request.FILES))
         if form.is_valid():
-            form.save()
-            data = parseData(request.FILES['file'])
-            return JsonResponse(data, safe=False)
+            try:
+                data = parseData(request.FILES['file'])
+                print(data)
+                form.save()
+                return JsonResponse(data, safe=False)
+            except Exception:
+                return HttpResponse('Niepoprawny format XML', status=406)
     else:
         form = DocumentForm()
     return render(request, 'core/model_form_upload.html', {
diff --git a/backend/webapp/prototype/filehandler/vocabulary.pkl b/backend/webapp/prototype/filehandler/vocabulary.pkl
new file mode 100644
index 0000000..5d34426
Binary files /dev/null and b/backend/webapp/prototype/filehandler/vocabulary.pkl differ
diff --git a/backend/webapp/prototype/filehandler/xmlParser.py b/backend/webapp/prototype/filehandler/xmlParser.py
index e515403..7597140 100644
--- a/backend/webapp/prototype/filehandler/xmlParser.py
+++ b/backend/webapp/prototype/filehandler/xmlParser.py
@@ -2,40 +2,50 @@
 import argparse
 from bs4 import BeautifulSoup
+from postmarkup import render_bbcode
 import json
+import html
+import re
+import tempfile
 
 
-def parseData(file):
+def parseData(file):
     # arguments
     parser = argparse.ArgumentParser(description='Process some xml files.')
     parser.add_argument('filename', help='xml forum file')
     args = parser.parse_args()
-
+
     # write file first
-    with open('temp.xml', 'wb+') as destination:
-        for chunk in file.chunks():
-            destination.write(chunk)
-
-    # make a soup:
-    with open('temp.xml') as forum:
-        soup = BeautifulSoup(forum, "xml")
+    fd = tempfile.NamedTemporaryFile()
+    f = open(fd.name, "wb+")
+    for chunk in file.chunks():
+        f.write(chunk)
+    f.close()
+    # make a soup:
+    with open(fd.name) as forum:
+        soup = BeautifulSoup(forum, "xml")
+
     # put json together
     out = {}
     out['id'] = soup.forum.get('id')
     out['name'] = soup.forum.find('name').text
     out['discussions'] = []
     for d in soup.forum.find_all('discussion'):
+        posts = []
+        for p in d.find_all('post'):
+            post_soup = BeautifulSoup(html.unescape(str(p.message)), "lxml")
+            paragraphs = [render_bbcode(x.text) for x in post_soup.find_all('p')]
+            posts.append({
+                'id': p.get('id'),
+                'parent': p.find('parent').text,
+                'author': p.userid.text,
+                'message': [x for x in paragraphs if x]
+            })
         out['discussions'].append({
             'id': d.get('id'),
            'title': d.find('name').text,
             'first_post': d.firstpost.text,
-            'posts': [
-                {
-                    'id': p.get('id'),
-                    'parent': p.find('parent').text,
-                    'author': p.userid.text,
-                    'message': p.message.get_text()
-                } for p in d.find_all('post')]
+            'posts': posts
         })
-
+    fd.close()
     return(out)
diff --git a/backend/webapp/vocabulary.pkl b/backend/webapp/vocabulary.pkl
new file mode 100644
index 0000000..5d34426
Binary files /dev/null and b/backend/webapp/vocabulary.pkl differ
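
With the description field gone from both Document and DocumentForm, the upload view above only expects a single file part named file and answers either with the parsed JSON or with a 406 on parse failure. A minimal sketch of exercising it from Python (the URL path, host and input file name are assumptions, they are not defined anywhere in this diff):

# Sketch only: POST a forum XML export to the upload view.
# Only the 'file' field name, the JSON response and the 406 error come from
# the code in this diff; the endpoint URL below is an assumption.
import requests

with open('forum.xml', 'rb') as fh:  # hypothetical input file
    resp = requests.post('http://localhost:8000/upload/', files={'file': fh})

if resp.status_code == 200:
    parsed = resp.json()  # same structure as parsed.json from the standalone parsers
else:
    print(resp.status_code, resp.text)  # 406 'Niepoprawny format XML' on parse failure
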
diff --git a/backend/xmlParser.py b/backend/xmlParser.py
index 1cdde2b..deb404f 100644
--- a/backend/xmlParser.py
+++ b/backend/xmlParser.py
@@ -1,40 +1,43 @@
-#!/usr/bin/env python3
-
-import argparse
-from bs4 import BeautifulSoup
-import json
-
-
-# arguments
-parser = argparse.ArgumentParser(description='Process some xml files.')
-parser.add_argument('filename', help='xml forum file')
-args = parser.parse_args()
-
-# make a soup
-with open(args.filename) as forum:
-    soup = BeautifulSoup(forum, "xml")
-
-# put json together
-out = {}
-out['id'] = soup.forum.get('id')
-out['name'] = soup.forum.find('name').text
-out['discussions'] = []
-for d in soup.forum.find_all('discussion'):
-    posts = []
-    for p in d.find_all('post'):
-        message_soup = BeautifulSoup(p.message.get_text(), "xml")
-        posts.append({
-            'id': p.get('id'),
-            'parent': p.find('parent').text,
-            'author': p.userid.text,
-            'message': message_soup.get_text()
-        })
-    out['discussions'].append({
-        'id': d.get('id'),
-        'title': d.find('name').text,
-        'first_post': d.firstpost.text,
-        'posts': posts
-    })
-
-with open('parsed.json', 'w') as outfile:
-    json.dump(out, outfile, ensure_ascii=False, indent=2)
+#!/usr/bin/env python3
+
+import argparse
+from bs4 import BeautifulSoup
+from postmarkup import render_bbcode
+import html
+import json
+
+
+# arguments
+parser = argparse.ArgumentParser(description='Process some xml files.')
+parser.add_argument('filename', help='xml forum file')
+args = parser.parse_args()
+
+# make a soup
+with open(args.filename) as forum:
+    soup = BeautifulSoup(forum, "xml")
+
+# put json together
+out = {}
+out['id'] = soup.forum.get('id')
+out['name'] = soup.forum.find('name').text
+out['discussions'] = []
+for d in soup.forum.find_all('discussion'):
+    posts = []
+    for p in d.find_all('post'):
+        post_soup = BeautifulSoup(html.unescape(str(p.message)), "lxml")
+        paragraphs = [render_bbcode(x.text) for x in post_soup.find_all('p')]
+        posts.append({
+            'id': p.get('id'),
+            'parent': p.find('parent').text,
+            'author': p.userid.text,
+            'message': [x for x in paragraphs if x]
+        })
+    out['discussions'].append({
+        'id': d.get('id'),
+        'title': d.find('name').text,
+        'first_post': d.firstpost.text,
+        'posts': posts
+    })
+
+with open('parsed.json', 'w') as outfile:
+    json.dump(out, outfile, ensure_ascii=False, indent=2)
\ No newline at end of file
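
For reference, edumaticParser.py, the Django parseData() helper and xmlParser.py all emit the same structure, either written to parsed.json or returned as the JSON response. The sketch below only illustrates that shape; every value is an invented placeholder, not data from a real export:

# Illustrative shape of the generated JSON; field names come from the parsers
# above, all values here are placeholders.
parsed = {
    'id': '123',                      # forum/group id
    'name': 'Example forum',
    'discussions': [{
        'id': '456',                  # synthetic 50, 100, 150, ... in edumaticParser.py
        'title': 'Example discussion',
        'first_post': '457',
        'posts': [{
            'id': '457',
            'parent': '456',
            'author': '89',           # user id
            'message': ['first rendered paragraph', 'second paragraph']  # empty ones dropped
        }]
    }]
}
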