Merge remote-tracking branch 'origin/master' into backend-mysql
This commit is contained in:
commit
f3860b9ac5
50
backend/edumaticParser.py
Normal file
50
backend/edumaticParser.py
Normal file
@ -0,0 +1,50 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
from bs4 import BeautifulSoup
|
||||
from postmarkup import render_bbcode
|
||||
import html
|
||||
import json
|
||||
import re
|
||||
|
||||
|
||||
# Convert an edumatic XML forum export into parsed.json.
#
# The output has the forum 'id' and 'name' plus a list of 'discussions';
# each discussion carries synthetic numeric ids for itself and its posts.

# arguments
parser = argparse.ArgumentParser(description='Process some edumatic xml files.')
parser.add_argument('filename', help='xml forum file')
args = parser.parse_args()

# make a soup
# The file is opened in binary mode so the parser receives the raw bytes.
with open(args.filename, 'rb') as forum:
    soup = BeautifulSoup(forum, "xml")

# put json together
group_title = soup.group.title.text
id_match = re.search(r'ID: (\d+)', group_title)
if id_match is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise ValueError('no "ID: <number>" found in group title: %r' % group_title)

out = {}
out['id'] = id_match.group(1)
out['name'] = soup.group.table.find('string').text
out['discussions'] = []

# Ids are synthetic: every discussion gets an id that is a multiple of
# ID_STEP, and its posts are numbered consecutively right after it.
ID_STEP = 50
did = ID_STEP

# we ignore first table, and then rules
for d in soup.group.find_all('table')[4::2]:
    pid = did + 1
    posts = []
    for row in d.find_all('row'):
        strings = row.find_all('string')
        text = html.unescape(strings[1].text)
        # One rendered paragraph per input line; empty results are dropped.
        paragraphs = [render_bbcode(line) for line in text.splitlines()]
        posts.append({
            'id': pid,
            # Each post is chained to the id just before it.
            'parent': pid - 1,
            'author': strings[2].text,
            'message': [x for x in paragraphs if x]
        })
        pid += 1
    out['discussions'].append({
        'id': did,
        'title': d.row.find('string').text,
        'first_post': did + 1,
        'posts': posts
    })
    did += ID_STEP
    # A discussion with more than ID_STEP posts used ids beyond its window;
    # skip ahead so the next discussion's post ids do not repeat them.
    while did + 1 < pid:
        did += ID_STEP

# ensure_ascii=False writes non-ASCII characters as-is, so the file
# encoding is fixed to UTF-8 explicitly.
with open('parsed.json', 'w', encoding='utf-8') as outfile:
    json.dump(out, outfile, ensure_ascii=False, indent=2)
|
@ -1,40 +1,43 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
|
||||
|
||||
# Convert an XML forum export (<forum>/<discussion>/<post> elements)
# into parsed.json.

# arguments
parser = argparse.ArgumentParser(description='Process some xml files.')
parser.add_argument('filename', help='xml forum file')
args = parser.parse_args()

# make a soup
# Binary mode hands the parser the raw bytes instead of decoding them with
# the platform's default text encoding first.
with open(args.filename, 'rb') as forum:
    soup = BeautifulSoup(forum, "xml")

# put json together
out = {}
out['id'] = soup.forum.get('id')
out['name'] = soup.forum.find('name').text
out['discussions'] = []
for discussion in soup.forum.find_all('discussion'):
    posts = []
    for post in discussion.find_all('post'):
        # The message text is itself markup; parse it again and keep
        # only its text content.
        message_soup = BeautifulSoup(post.message.get_text(), "xml")
        posts.append({
            'id': post.get('id'),
            'parent': post.find('parent').text,
            'author': post.userid.text,
            'message': message_soup.get_text()
        })
    out['discussions'].append({
        'id': discussion.get('id'),
        'title': discussion.find('name').text,
        'first_post': discussion.firstpost.text,
        'posts': posts
    })

# ensure_ascii=False writes non-ASCII characters as-is, so the output
# encoding must be explicit; the locale default may not be able to
# represent them.
with open('parsed.json', 'w', encoding='utf-8') as outfile:
    json.dump(out, outfile, ensure_ascii=False, indent=2)
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
from bs4 import BeautifulSoup
|
||||
from postmarkup import render_bbcode
|
||||
import html
|
||||
import json
|
||||
|
||||
|
||||
# Convert an XML forum export (<forum>/<discussion>/<post> elements)
# into parsed.json, rendering the bbcode of each message paragraph.

# arguments
parser = argparse.ArgumentParser(description='Process some xml files.')
parser.add_argument('filename', help='xml forum file')
args = parser.parse_args()

# make a soup
# Binary mode hands the parser the raw bytes instead of decoding them with
# the platform's default text encoding first.
with open(args.filename, 'rb') as forum:
    soup = BeautifulSoup(forum, "xml")

# put json together
out = {}
out['id'] = soup.forum.get('id')
out['name'] = soup.forum.find('name').text
out['discussions'] = []
for discussion in soup.forum.find_all('discussion'):
    posts = []
    for post in discussion.find_all('post'):
        # The <message> element holds entity-escaped markup: unescape it,
        # parse the result, and render each <p> paragraph separately.
        post_soup = BeautifulSoup(html.unescape(str(post.message)), "lxml")
        paragraphs = [render_bbcode(x.text) for x in post_soup.find_all('p')]
        posts.append({
            'id': post.get('id'),
            'parent': post.find('parent').text,
            'author': post.userid.text,
            # Paragraphs that render to an empty string are dropped.
            'message': [x for x in paragraphs if x]
        })
    out['discussions'].append({
        'id': discussion.get('id'),
        'title': discussion.find('name').text,
        'first_post': discussion.firstpost.text,
        'posts': posts
    })

# ensure_ascii=False writes non-ASCII characters as-is, so the output
# encoding must be explicit; the locale default may not be able to
# represent them.
with open('parsed.json', 'w', encoding='utf-8') as outfile:
    json.dump(out, outfile, ensure_ascii=False, indent=2)
|
Loading…
Reference in New Issue
Block a user