8b72d0b351
Added spider. Started working on testsets.
#!/usr/bin/env python3
"""Spider that scrapes Mass times for the parishes listed on archwwa.pl
and prints them to stdout as tab-separated values."""

import logging

import requests
from bs4 import BeautifulSoup

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


def process_page(url):
    """Fetch a single parish page and print one TSV row for it."""
    logging.info(url)
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')
    parish_url = soup.find('a', class_='website')
    parish_url = parish_url['href'] if parish_url else ''
    name = soup.find('title').text
    masses = soup.find('div', class_='masses')
    if not masses:
        return
    # The schedule is assumed to sit in consecutive 'column' divs after the
    # 'masses' div: the second holds Sunday Masses ('niedzielne') and, when
    # the word 'powszednie' (weekday) appears, the fourth holds weekday ones.
    sunday_masses = masses.find_next(
        'div', class_='column').find_next(
        'div', class_='column')
    if 'powszednie' in masses.text:
        everyday_masses = sunday_masses.find_next(
            'div', class_='column').find_next(
            'div', class_='column').text
    else:
        everyday_masses = ''
    sunday_masses = sunday_masses.text
    print('\t'.join(
        [name, parish_url, sunday_masses, everyday_masses, page.url]))


def get_all_urls():
    """Yield the URL of every parish page from the 22-page paginated index."""
    base_url = 'http://archwwa.pl/parafie/strona/'
    for i in range(1, 23):
        page = requests.get(base_url + str(i), timeout=30)
        soup = BeautifulSoup(page.text, 'html.parser')
        for el in soup.find_all('a', class_='more-link'):
            yield el['href']


def main():
    urls = get_all_urls()
    # Header row: parish name, website, Sunday Masses, weekday Masses, debug URL.
    print(
        '\t'.join(['Parafia', 'url', 'niedzielne', 'powszednie', 'debug_url']))
    i = 0
    for url in urls:
        # Skip the sample/template parish page.
        if url == 'http://archwwa.pl/parafie/strona-parafii-propozycja/':
            continue
        process_page(url)
        i += 1
    logging.info(i)


if __name__ == '__main__':
    main()
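# Usage note (illustrative sketch, not part of the spider): the script prints
# one tab-separated row per parish to stdout. Assuming that output is
# redirected to a hypothetical file such as 'parishes.tsv', it can be read
# back with the standard csv module:
#
#   import csv
#
#   with open('parishes.tsv', newline='', encoding='utf-8') as fh:
#       for row in csv.DictReader(fh, delimiter='\t'):
#           print(row['Parafia'], row['niedzielne'])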