Add prototype basic crawl
parent 9f1423b362
commit 7161193169
12 plan.org Normal file
@@ -0,0 +1,12 @@
* Master's thesis work plan
** TODO Improve extraction of valid links from parish websites
DEADLINE: <2017-11-19 Sun>
** TODO Collect data from parish websites
DEADLINE: <2017-12-03 Sun>
** TODO [0/2] Extract Holy Mass times
DEADLINE: <2018-01-14 Sun>
- [ ] Develop a method
- [ ] Evaluation
** TODO Revisions
DEADLINE: <2018-01-21 Sun>
** TODO Web interface
3 scraper/Makefile Normal file
@@ -0,0 +1,3 @@
SHELL := /bin/bash
PREPARE_ENVIRONMENT := $(shell ./prepare-enviroment.sh > /tmp/makeenv)
include /tmp/makeenv
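A note on the pattern above (the script name is fixed here to match the file actually committed below, scraper/prepare-enviroment.sh): the $(shell ...) assignment runs the script once, when make parses the Makefile; the script activates the conda environment and re-emits the resulting variables to /tmp/makeenv in make syntax (a line such as LANG=C becomes export LANG:=C), which the include then loads so every recipe runs inside the activated environment.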
47 scraper/crawl_deon.py Executable file
@@ -0,0 +1,47 @@
#!/usr/bin/env python3
import requests
from string import Template
import re
from bs4 import BeautifulSoup
import unicodedata


def process_parish(url, parish_name):
    page = requests.get(url, timeout=10)
    soup = BeautifulSoup(page.text, 'html.parser')
    # address = soup.find(class_='adres adres2')
    # description = soup.find(class_='tytul5 clear').find_next(class_='row')
    match = re.search('<b>www:</b> (.*?)<br', str(soup))
    if match:
        parish_url = match.group(1)
        print('\t'.join([url, parish_name, parish_url]))
    elif re.search('www:', str(soup)):
        print(url)

# TODO: regexes or soup

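# A soup-based route for the TODO above; a sketch, not part of this commit.
# It assumes only the markup the regex already implies ('<b>www:</b> URL<br'),
# and extract_parish_url is a hypothetical helper name.
def extract_parish_url(soup):
    # Locate the '<b>www:</b>' label and read the text node right after it.
    label = soup.find('b', string='www:')
    if label and label.next_sibling:
        return str(label.next_sibling).strip()
    return None
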
def process_page(url):
    page = requests.get(url, timeout=10)
    soup = BeautifulSoup(page.text, 'html.parser')
    for td in soup.find_all('td', class_='temat'):
        href = td.a['href']
        parish_name = td.a['title']
        parish_name = ' '.join(
            unicodedata.normalize("NFKD", parish_name).split())
        process_parish(href, parish_name)


def main():
    base_url = 'https://www.deon.pl/parafie-koscioly/'
    suffix = Template('strona,${page}.html')

    process_page(base_url)
    for i in range(2, 1014):  # TODO: add search for last page nr on deon
        url = base_url + suffix.substitute(page=str(i))
        process_page(url)


if __name__ == '__main__':
    main()
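For the TODO in main(), the hardcoded upper bound of 1014 could be discovered instead of guessed; a minimal sketch, assuming the listing pages link to one another with the same 'strona,N.html' pattern as the suffix template (find_last_page is a hypothetical helper):

def find_last_page(base_url):
    # Collect every page number mentioned in a 'strona,N.html' link and take
    # the largest; fall back to 1 if no pagination links are found.
    page = requests.get(base_url, timeout=10)
    numbers = [int(n) for n in re.findall(r'strona,(\d+)\.html', page.text)]
    return max(numbers) if numbers else 1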
10 scraper/prepare-enviroment.sh Normal file
@@ -0,0 +1,10 @@
#!/usr/bin/env bash
if conda info --envs | grep -q "env-name"; then
    (>&2 echo "Environment exists. Ready to process.")
else
    (>&2 conda env create -f environment.yml)
fi

source activate env-name
export PYTHONIOENCODING=utf8
env | sed 's/=/:=/' | sed 's/^/export /'