Add prototype basic crawl
This commit is contained in:
parent
9f1423b362
commit
7161193169
12
plan.org
Normal file
12
plan.org
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
* Plan pracy magisterskiej
|
||||||
|
** TODO Ulepszenie wydobywania poprawnych linków ze stron parafialnych
|
||||||
|
DEADLINE: <2017-11-19 nie>
|
||||||
|
** TODO Zebranie danych ze stron parafialnych
|
||||||
|
DEADLINE: <2017-12-03 nie>
|
||||||
|
** TODO [0/2] Ekstrakcja godzin mszy świętych
|
||||||
|
DEADLINE: <2018-01-14 nie>
|
||||||
|
- [ ] Opracowanie metody
|
||||||
|
- [ ] Ewaluacja
|
||||||
|
** TODO Poprawki
|
||||||
|
DEADLINE: <2018-01-21 nie>
|
||||||
|
** TODO Interfejs webowy
|
3
scraper/Makefile
Normal file
3
scraper/Makefile
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
SHELL := /bin/bash

# Run the environment-preparation script once at parse time; it prints
# `export NAME:=VALUE` lines which are captured into /tmp/makeenv and
# then pulled into make's own variable space via `include` below.
# FIX: the script committed alongside this Makefile is named
# "prepare-enviroment.sh" (sic); the original rule called
# "./prepare-enironment.sh", which does not exist, so the shell call
# failed and /tmp/makeenv stayed empty.
PREPARE_ENVIRONMENT := $(shell ./prepare-enviroment.sh > /tmp/makeenv)
include /tmp/makeenv
|
47
scraper/crawl_deon.py
Executable file
47
scraper/crawl_deon.py
Executable file
@ -0,0 +1,47 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import requests
|
||||||
|
from string import Template
|
||||||
|
import re
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
|
|
||||||
|
def process_parish(url, parish_name):
    """Fetch a single parish detail page and print its website, if any.

    Writes one TSV line ``url<TAB>parish_name<TAB>website`` to stdout
    when a ``www:`` entry with an extractable link is present.  When
    the page mentions ``www:`` but the link pattern does not match,
    only the listing URL is printed (for later manual inspection).
    """
    response = requests.get(url, timeout=10)
    html = str(BeautifulSoup(response.text, 'html.parser'))
    found = re.search('<b>www:</b> (.*?)<br', html)
    if found is not None:
        print('\t'.join([url, parish_name, found.group(1)]))
    elif re.search('www:', html):
        # The page advertises a website but the simple pattern above
        # failed to capture it — emit the URL alone so it can be
        # revisited.
        print(url)


# TODO: regexes or soup
|
||||||
|
|
||||||
|
|
||||||
|
def process_page(url):
    """Fetch one directory listing page and process each parish on it."""
    response = requests.get(url, timeout=10)
    listing = BeautifulSoup(response.text, 'html.parser')
    for cell in listing.find_all('td', class_='temat'):
        link = cell.a
        # NFKD-normalize and collapse all runs of whitespace in the
        # parish title to a single space.
        name = ' '.join(unicodedata.normalize("NFKD", link['title']).split())
        process_parish(link['href'], name)
|
||||||
|
|
||||||
|
|
||||||
|
def main(last_page=1013):
    """Crawl the Deon parish directory, printing parish website links.

    Args:
        last_page: Number of the final listing page to crawl.  Pages
            2..last_page use the ``strona,N.html`` suffix; page 1 is
            the bare base URL.  Defaults to 1013, the last page
            observed at the time of writing.
            TODO: discover the last page number from the site instead
            of relying on this constant.
    """
    base_url = 'https://www.deon.pl/parafie-koscioly/'
    suffix = Template('strona,${page}.html')

    # The first listing page has no "strona,N" suffix.
    process_page(base_url)
    for page_nr in range(2, last_page + 1):
        process_page(base_url + suffix.substitute(page=str(page_nr)))
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: crawl the whole Deon parish directory.
if __name__ == '__main__':
    main()
|
10
scraper/prepare-enviroment.sh
Normal file
10
scraper/prepare-enviroment.sh
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
#!/usr/bin/env bash
# Prepare the conda environment for the scraper and emit the resulting
# shell environment in Makefile syntax (`export NAME:=VALUE`) on
# stdout.  The Makefile captures this output into /tmp/makeenv and
# `include`s it, so all diagnostics are deliberately sent to stderr
# to keep stdout machine-readable.

# Create the conda env only when it does not already exist.
# NOTE(review): grep -q "env-name" matches substrings of other env
# names too — presumably acceptable for this prototype; verify.
if conda info --envs | grep -q "env-name"; then
    (>&2 echo "Environment exist. Ready to process.")
else
    (>&2 conda env create -f environment.yml)
fi

# Activate the env so its variables land in this shell's environment.
source activate env-name
export PYTHONIOENCODING=utf8

# Rewrite each `NAME=VALUE` line as `export NAME:=VALUE` (only the
# first '=' is replaced, so values containing '=' survive) so that
# make can ingest the activated environment via `include`.
env | sed 's/=/:=/' | sed 's/^/export /'
|
Loading…
Reference in New Issue
Block a user