Add prototype basic crawl

siulkilulki 2017-11-21 22:51:09 +01:00
parent 9f1423b362
commit 7161193169
4 changed files with 72 additions and 0 deletions

plan.org Normal file (12 additions)

@@ -0,0 +1,12 @@
* Master's thesis work plan
** TODO Improve extraction of valid links from parish websites
DEADLINE: <2017-11-19 Sun>
** TODO Collect data from parish websites
DEADLINE: <2017-12-03 Sun>
** TODO [0/2] Extraction of Mass times
DEADLINE: <2018-01-14 Sun>
- [ ] Develop the method
- [ ] Evaluation
** TODO Revisions
DEADLINE: <2018-01-21 Sun>
** TODO Web interface

scraper/Makefile Normal file (3 additions)

@@ -0,0 +1,3 @@
SHELL := /bin/bash
# Run the environment setup script once and pull its exported variables into make.
PREPARE_ENVIRONMENT := $(shell ./prepare-enironment.sh > /tmp/makeenv)
include /tmp/makeenv

scraper/crawl_deon.py Executable file (47 additions)

@@ -0,0 +1,47 @@
#!/usr/bin/env python3
import re
import unicodedata
from string import Template

import requests
from bs4 import BeautifulSoup


def process_parish(url, parish_name):
    """Fetch a parish detail page and print its website URL, if one is listed."""
    page = requests.get(url, timeout=10)
    soup = BeautifulSoup(page.text, 'html.parser')
    # address = soup.find(class_='adres adres2')
    # description = soup.find(class_='tytul5 clear').find_next(class_='row')
    match = re.search('<b>www:</b> (.*?)<br', str(soup))
    if match:
        parish_url = match.group(1)
        print('\t'.join([url, parish_name, parish_url]))
    else:
        if re.search('www:', str(soup)):
            print(url)  # TODO: extract the URL with regexes or BeautifulSoup


def process_page(url):
    """Process one listing page: visit every parish linked from it."""
    page = requests.get(url, timeout=10)
    soup = BeautifulSoup(page.text, 'html.parser')
    for td in soup.find_all('td', class_='temat'):
        href = td.a['href']
        parish_name = td.a['title']
        # Normalize unicode and collapse whitespace in the parish name.
        parish_name = ' '.join(
            unicodedata.normalize("NFKD", parish_name).split())
        process_parish(href, parish_name)


def main():
    base_url = 'https://www.deon.pl/parafie-koscioly/'
    suffix = Template('strona,${page}.html')
    process_page(base_url)
    for i in range(2, 1014):  # TODO: detect the last page number on deon
        url = base_url + suffix.substitute(page=str(i))
        process_page(url)


if __name__ == '__main__':
    main()
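
The loop in main() hardcodes 1014 listing pages, and the TODO asks for detecting the last page automatically. A minimal sketch of one way to do that, assuming the pagination links on the listing page follow the same strona,<n>.html pattern used above (the pattern and the link scan are assumptions, not verified against deon.pl):

import re
import requests
from bs4 import BeautifulSoup

def find_last_page(base_url):
    """Return the highest page number linked from the parish listing."""
    # Assumption: pagination anchors end with "strona,<n>.html" as in main().
    page = requests.get(base_url, timeout=10)
    soup = BeautifulSoup(page.text, 'html.parser')
    numbers = []
    for a in soup.find_all('a', href=True):
        match = re.search(r'strona,(\d+)\.html', a['href'])
        if match:
            numbers.append(int(match.group(1)))
    return max(numbers) if numbers else 1

main() could then iterate with range(2, find_last_page(base_url) + 1) instead of the fixed 1014.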

scraper/prepare-enironment.sh Executable file (10 additions)

@@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Create the conda environment on first run, then activate it.
if conda info --envs | grep -q "env-name"; then
    (>&2 echo "Environment exists. Ready to process.")
else
    (>&2 conda env create -f environment.yml)
fi
source activate env-name
export PYTHONIOENCODING=utf8
# Emit the resulting environment as make assignments ("export VAR:=value") for /tmp/makeenv.
env | sed 's/=/:=/' | sed 's/^/export /'