mass-scraper/scraper/crawl_warszawa.py
Dawid Jurkiewicz 8b72d0b351 Prototype rule based masses extractor.
Added spider.
Started working on testsets.
2018-03-01 14:40:13 +01:00

#!/usr/bin/env python3
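"""Prototype crawler for mass schedules of parishes of the Archdiocese of
Warsaw (archwwa.pl).

Walks the paginated parish listing at http://archwwa.pl/parafie/strona/,
visits each linked parish page and prints one tab-separated row per parish
(name, website, Sunday masses, weekday masses, debug URL) to stdout.
"""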
import logging

import requests
from bs4 import BeautifulSoup

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


def process_page(url):
    """Print one TSV row with the mass schedule scraped from a parish page."""
    logging.info(url)
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    # The parish website link is optional on these pages.
    parish_url = soup.find('a', class_='website')
    parish_url = parish_url['href'] if parish_url else ''

    name = soup.find('title').text

    masses = soup.find('div', class_='masses')
    if not masses:
        return

    # The Sunday schedule sits in the second 'column' div after the masses block.
    sunday_masses = masses.find_next(
        'div', class_='column').find_next(
            'div', class_='column')
    # 'powszednie' (weekday masses) means a second schedule follows,
    # two more 'column' divs further on.
    if 'powszednie' in masses.text:
        everyday_masses = sunday_masses.find_next(
            'div', class_='column').find_next(
                'div', class_='column').text
    else:
        everyday_masses = ''
    sunday_masses = sunday_masses.text

    print('\t'.join(
        [name, parish_url, sunday_masses, everyday_masses, page.url]))


def get_all_urls():
    """Yield parish page URLs from the paginated listing (pages 1-22)."""
    base_url = 'http://archwwa.pl/parafie/strona/'
    for i in range(1, 23):
        page = requests.get(base_url + str(i), timeout=30)
        soup = BeautifulSoup(page.text, 'html.parser')
        for el in soup.find_all('a', class_='more-link'):
            yield el['href']


def main():
    urls = get_all_urls()
    print(
        '\t'.join(['Parafia', 'url', 'niedzielne', 'powszednie', 'debug_url']))
    i = 0
    for url in urls:
        # Skip the template page, which is not a real parish.
        if url == 'http://archwwa.pl/parafie/strona-parafii-propozycja/':
            continue
        process_page(url)
        i += 1
        logging.info(i)  # running count of processed parish pages


if __name__ == '__main__':
    main()
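

# Example invocation: the script writes tab-separated rows to stdout, so the
# output file name below is only illustrative.
#
#   ./crawl_warszawa.py > warszawa_masses.tsv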