2019-11-16 21:24:28 +01:00
|
|
|
import scrapy
|
|
|
|
from scrapy.spiders import CrawlSpider, Rule
|
|
|
|
from scrapy.linkextractors import LinkExtractor
|
|
|
|
import re
|
2019-11-16 22:06:05 +01:00
|
|
|
from items import MovieItem
|
2019-11-16 21:24:28 +01:00
|
|
|
|
|
|
|
|
2019-11-16 22:06:05 +01:00
|
|
|
class FinaSpider(CrawlSpider):
    """Crawl repozytorium.fn.org.pl and build one ``MovieItem`` per parsed page.

    The crawl starts at the ``film_index/a`` URL in ``start_urls`` and is
    driven by ``rules``. Pages handed to ``parse_item`` produce an item with
    the fields ``title``, ``mp4`` and ``url``.
    """

    name = 'repozytorium.fn.org.pl'
    start_urls = ['http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a']

    rules = (
        # Extract link from index of titles
        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[2]')),

        # Extract links with movie titles: every link in the listing
        # container except site-search links is parsed as an item page.
        Rule(
            LinkExtractor(
                restrict_xpaths='//*[@id="content"]/div/div[4]/div',
                deny=r'\?q=pl\/search\/site',
            ),
            callback='parse_item',
        ),

        # Site-search links from the same container have no callback; they
        # are only requested so the rules can be applied to their responses.
        Rule(
            LinkExtractor(
                restrict_xpaths='//*[@id="content"]/div/div[4]/div',
                allow=r'\?q=pl\/search\/site',
            ),
        ),

        # Links inside the element with id "fntiles_page_search_0" are
        # parsed as item pages.
        Rule(
            LinkExtractor(restrict_xpaths='//*[@id="fntiles_page_search_0"]'),
            callback='parse_item',
        ),
    )

    custom_settings = {
        # Write exported feeds as UTF-8.
        'FEED_EXPORT_ENCODING': 'utf-8',
    }

    # Matches ``file: encodeURI("<...>.mp4")`` in the inline script and
    # captures the quoted value ending in ``.mp4``. Compiled once at class
    # creation instead of on every callback.
    _MP4_PATTERN = re.compile(r'file: encodeURI\("(.+?\.mp4)"\)')

    @staticmethod
    def _extract_mp4(text):
        """Return every ``.mp4`` value referenced in *text*.

        Args:
            text: Script source to search, or ``None`` when the page had no
                matching script node.

        Returns:
            A list of captured strings; empty when *text* is ``None`` or
            contains no match.
        """
        if text is None:
            # ``.get()`` yields None when the XPath matches nothing; calling
            # ``findall`` on None would raise TypeError and drop the item.
            return []
        return FinaSpider._MP4_PATTERN.findall(text)

    def parse_item(self, response):
        """Build a ``MovieItem`` from a page selected by ``rules``.

        Args:
            response: The downloaded page.

        Returns:
            A ``MovieItem`` with ``title`` (first text node matched by the
            title XPath, or ``None``), ``mp4`` (list of matched ``.mp4``
            values, possibly empty) and ``url`` (the response URL).
        """
        script_text = response.xpath(
            '//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()'
        ).get()

        item = MovieItem()
        item['title'] = response.xpath(
            '//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()'
        ).get()
        item['mp4'] = self._extract_mp4(script_text)
        item['url'] = response.url
        return item
|