Add prototype basic crawl
This commit is contained in:
parent
9f1423b362
commit
7161193169
12
plan.org
Normal file
12
plan.org
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
* Plan pracy magisterskiej
|
||||||
|
** TODO Ulepszenie wydobywania poprawnych linków ze stron parafialnych
|
||||||
|
DEADLINE: <2017-11-19 nie>
|
||||||
|
** TODO Zebranie danych ze stron parafialnych
|
||||||
|
DEADLINE: <2017-12-03 nie>
|
||||||
|
** TODO [0/2] Ekstrakcja godzin mszy świętych
|
||||||
|
DEADLINE: <2018-01-14 nie>
|
||||||
|
- [ ] Opracowanie metody
|
||||||
|
- [ ] Ewaluacja
|
||||||
|
** TODO Poprawki
|
||||||
|
DEADLINE: <2018-01-21 nie>
|
||||||
|
** TODO Interfejs webowy
|
3
scraper/Makefile
Normal file
3
scraper/Makefile
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
SHELL := /bin/bash

# Run the environment-preparation script once at parse time; it prints
# `export NAME:=VALUE` lines which are captured into /tmp/makeenv and
# then pulled into make's own variable space via `include` below.
# FIX: the script committed alongside this Makefile is named
# "prepare-enviroment.sh" (sic); the original rule called
# "./prepare-enironment.sh", which does not exist, so the shell call
# failed and /tmp/makeenv stayed empty.
PREPARE_ENVIRONMENT := $(shell ./prepare-enviroment.sh > /tmp/makeenv)
include /tmp/makeenv
|
47
scraper/crawl_deon.py
Executable file
47
scraper/crawl_deon.py
Executable file
@ -0,0 +1,47 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import requests
|
||||||
|
from string import Template
|
||||||
|
import re
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
|
||||||
|
def process_parish(url, parish_name):
    """Fetch a single parish detail page and print its website, if any.

    Writes one TSV line ``url<TAB>parish_name<TAB>website`` to stdout
    when a ``www:`` entry with an extractable link is present.  When
    the page mentions ``www:`` but the link pattern does not match,
    only the listing URL is printed (for later manual inspection).
    """
    response = requests.get(url, timeout=10)
    html = str(BeautifulSoup(response.text, 'html.parser'))
    found = re.search('<b>www:</b> (.*?)<br', html)
    if found is not None:
        print('\t'.join([url, parish_name, found.group(1)]))
    elif re.search('www:', html):
        # The page advertises a website but the simple pattern above
        # failed to capture it — emit the URL alone so it can be
        # revisited.
        print(url)


# TODO: regexes or soup
|
||||||
|
|
||||||
|
|
||||||
|
def process_page(url):
    """Fetch one directory listing page and process each parish on it."""
    response = requests.get(url, timeout=10)
    listing = BeautifulSoup(response.text, 'html.parser')
    for cell in listing.find_all('td', class_='temat'):
        link = cell.a
        # NFKD-normalize and collapse all runs of whitespace in the
        # parish title to a single space.
        name = ' '.join(unicodedata.normalize("NFKD", link['title']).split())
        process_parish(link['href'], name)
|
||||||
|
|
||||||
|
|
||||||
|
def main(last_page=1013):
    """Crawl the Deon parish directory, printing parish website links.

    Args:
        last_page: Number of the final listing page to crawl.  Pages
            2..last_page use the ``strona,N.html`` suffix; page 1 is
            the bare base URL.  Defaults to 1013, the last page
            observed at the time of writing.
            TODO: discover the last page number from the site instead
            of relying on this constant.
    """
    base_url = 'https://www.deon.pl/parafie-koscioly/'
    suffix = Template('strona,${page}.html')

    # The first listing page has no "strona,N" suffix.
    process_page(base_url)
    for page_nr in range(2, last_page + 1):
        process_page(base_url + suffix.substitute(page=str(page_nr)))
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: crawl the whole Deon parish directory.
if __name__ == '__main__':
    main()
|
10
scraper/prepare-enviroment.sh
Normal file
10
scraper/prepare-enviroment.sh
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
#!/usr/bin/env bash
# Prepare the conda environment for the scraper and emit the resulting
# shell environment in Makefile syntax (`export NAME:=VALUE`) on
# stdout.  The Makefile captures this output into /tmp/makeenv and
# `include`s it, so all diagnostics are deliberately sent to stderr
# to keep stdout machine-readable.

# Create the conda env only when it does not already exist.
# NOTE(review): grep -q "env-name" matches substrings of other env
# names too — presumably acceptable for this prototype; verify.
if conda info --envs | grep -q "env-name"; then
    (>&2 echo "Environment exist. Ready to process.")
else
    (>&2 conda env create -f environment.yml)
fi

# Activate the env so its variables land in this shell's environment.
source activate env-name
export PYTHONIOENCODING=utf8

# Rewrite each `NAME=VALUE` line as `export NAME:=VALUE` (only the
# first '=' is replaced, so values containing '=' survive) so that
# make can ingest the activated environment via `include`.
env | sed 's/=/:=/' | sed 's/^/export /'
|
Loading…
Reference in New Issue
Block a user