From 716119316968e896095db0f53625f875da9b88f5 Mon Sep 17 00:00:00 2001 From: siulkilulki Date: Tue, 21 Nov 2017 22:51:09 +0100 Subject: [PATCH] Add prototype basic crawl --- plan.org | 12 +++++++++ scraper/Makefile | 3 +++ scraper/crawl_deon.py | 47 +++++++++++++++++++++++++++++++++++ scraper/prepare-enviroment.sh | 10 ++++++++ 4 files changed, 72 insertions(+) create mode 100644 plan.org create mode 100644 scraper/Makefile create mode 100755 scraper/crawl_deon.py create mode 100644 scraper/prepare-enviroment.sh diff --git a/plan.org b/plan.org new file mode 100644 index 0000000..c80928c --- /dev/null +++ b/plan.org @@ -0,0 +1,12 @@ +* Plan pracy magisterskiej +** TODO Ulepszenie wydobywania poprawnych linków ze stron parafialnych + DEADLINE: <2017-11-19 nie> +** TODO Zebranie danych ze stron parafialnych + DEADLINE: <2017-12-03 nie> +** TODO [0/2] Ekstrakcja godzin mszy świętych + DEADLINE: <2018-01-14 nie> + - [ ] Opracowanie metody + - [ ] Ewaluacja +** TODO Poprawki + DEADLINE: <2018-01-21 nie> +** TODO Interfejs webowy diff --git a/scraper/Makefile b/scraper/Makefile new file mode 100644 index 0000000..51c7109 --- /dev/null +++ b/scraper/Makefile @@ -0,0 +1,3 @@ +SHELL := /bin/bash +PREPARE_ENVIRONMENT := $(shell ./prepare-enviroment.sh > /tmp/makeenv) +include /tmp/makeenv diff --git a/scraper/crawl_deon.py b/scraper/crawl_deon.py new file mode 100755 index 0000000..c32cf83 --- /dev/null +++ b/scraper/crawl_deon.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +import requests +from string import Template +import re +from bs4 import BeautifulSoup +import unicodedata + + +def process_parish(url, parish_name): + page = requests.get(url, timeout=10) + soup = BeautifulSoup(page.text, 'html.parser') + # address = soup.find(class_='adres adres2') + # description = soup.find(class_='tytul5 clear').find_next(class_='row') + match = re.search('www: (.*?)&2 echo "Environment exist. 
Ready to process.") +else + (>&2 conda env create -f environment.yml) +fi + +source activate env-name +export PYTHONIOENCODING=utf8 +env | sed 's/=/:=/' | sed 's/^/export /'