From c006103bc1c3a44b17ae99cb1d5af0825a31da04 Mon Sep 17 00:00:00 2001
From: Smolak
Date: Sat, 16 Nov 2019 21:24:28 +0100
Subject: [PATCH] new files with scrappy spiders

---
 src/spiders/fina_test1.py | 41 +++++++++++++++++++++++++++++++++++++++++
 src/spiders/fina_test2.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
 src/spiders/items.py      |  9 +++++++++
 3 files changed, 94 insertions(+)
 create mode 100644 src/spiders/fina_test1.py
 create mode 100644 src/spiders/fina_test2.py
 create mode 100644 src/spiders/items.py

diff --git a/src/spiders/fina_test1.py b/src/spiders/fina_test1.py
new file mode 100644
index 00000000..cfa0220d
--- /dev/null
+++ b/src/spiders/fina_test1.py
@@ -0,0 +1,41 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+import re
+
+
+class ScraperWithLimit(scrapy.Spider):
+    """Crawl the film index one level deep and yield title/mp4 dicts."""
+
+    name = "ScraperWithLimit"
+    download_delay = 0.1
+    start_urls = [
+        'http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a'
+    ]
+
+    custom_settings = {
+        'DEPTH_LIMIT': 1,
+        'FEED_EXPORT_ENCODING': 'utf-8'
+    }
+
+    def parse(self, response):
+        """Follow each listed link and yield one dict per film block."""
+        for next_page in response.xpath('//*[@id="content"]/div/div[4]/div'):
+            href = next_page.xpath('div/div/a/@href').get()
+            # .get() returns None when the entry has no link; skip it.
+            if href is not None:
+                yield response.follow(href, self.parse)
+
+        for movie in response.xpath('//*[@id="block-fnfilm-fnfilm"]/div'):
+            yield {
+                'title': movie.xpath('div[1]/div/div/span/text()').get(),
+                'mp4': getMp4(movie.xpath('div[2]/div/script/text()').get())
+            }
+
+
+def getMp4(text):
+    """Return the first .mp4 path passed to encodeURI() in text, or None."""
+    # .get() returns None when the <script> node is missing.
+    if not text:
+        return None
+    x = re.findall(r'file: encodeURI\("(.+?\.mp4)"\)', text)
+    return x[0] if x else None
diff --git a/src/spiders/fina_test2.py b/src/spiders/fina_test2.py
new file mode 100644
index 00000000..a0aec9fe
--- /dev/null
+++ b/src/spiders/fina_test2.py
@@ -0,0 +1,44 @@
+import scrapy
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy.item import Item, Field
+import re
+
+from fina_scrap.items import MovieItem
+
+
+class MySpider(CrawlSpider):
+    """Crawl the film index via link rules and build MovieItem objects."""
+
+    name = 'repozytorium.fn.org.pl'
+    start_urls = ['http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a']
+
+    rules = (
+        # Extract link from index of titles
+        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[2]')),
+
+        # Extract links with movie titles
+        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[4]/div', deny=r'\?q=pl\/search\/site'), callback='parse_item'),
+
+        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[4]/div', allow=r'\?q=pl\/search\/site')),
+
+        Rule(LinkExtractor(restrict_xpaths='//*[@id="fntiles_page_search_0"]'), callback='parse_item')
+    )
+    custom_settings = {
+        'FEED_EXPORT_ENCODING': 'utf-8'
+    }
+
+    def getMp4(self, text):
+        """Return a list of every .mp4 path passed to encodeURI() in text."""
+        # .get() returns None when the <script> node is missing.
+        if not text:
+            return []
+        return re.findall(r'file: encodeURI\("(.+?\.mp4)"\)', text)
+
+    def parse_item(self, response):
+        """Build a MovieItem with the title, mp4 list and url of a page."""
+        item = MovieItem()
+        item['title'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()').get()
+        item['mp4'] = self.getMp4(response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get())
+        item['url'] = response.url
+        return item
diff --git a/src/spiders/items.py b/src/spiders/items.py
new file mode 100644
index 00000000..407b4307
--- /dev/null
+++ b/src/spiders/items.py
@@ -0,0 +1,9 @@
+from scrapy.item import Item, Field
+
+
+class MovieItem(Item):
+    """Item with url, title and mp4 fields."""
+
+    url = Field()
+    title = Field()
+    mp4 = Field()