import re

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from items import MovieItem, Description

# Patterns are compiled once at import time instead of on every callback.
# Matches any HTML tag (non-greedy), used to reduce a serialized node to text.
_TAG_RE = re.compile(r'<.*?>')
# Matches a trailing ": " left at the end of a detail label.
_TRAILING_COLON_RE = re.compile(r': $')
# Captures the .mp4 path passed to encodeURI("...") in the player script.
_MP4_RE = re.compile(r'file: encodeURI\("(.+?\.mp4)"\)')


def _extract_mp4_urls(script_text):
    """Return every .mp4 path found in *script_text* as a list.

    Returns an empty list when *script_text* is None (the script node was
    not found on the page) so the caller does not crash inside ``re``.
    """
    if script_text is None:
        return []
    return _MP4_RE.findall(script_text)


def _strip_tags(markup):
    """Return *markup* with HTML tags removed, or None if *markup* is None."""
    if markup is None:
        return None
    return _TAG_RE.sub('', markup)


class FinaSpider(CrawlSpider):
    """Crawl the repozytorium.fn.org.pl film index and emit one item per film page."""

    name = 'repozytorium.fn.org.pl'
    start_urls = ['http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a']

    rules = (
        # Extract link from index of titles
        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[2]')),
        # Extract links with movie titles (everything that is not a
        # site-search link is handed to parse_item)
        Rule(
            LinkExtractor(
                restrict_xpaths='//*[@id="content"]/div/div[4]/div',
                deny=r'\?q=pl\/search\/site',
            ),
            callback='parse_item',
        ),
        # Site-search links from the same container are extracted without
        # a callback
        Rule(
            LinkExtractor(
                restrict_xpaths='//*[@id="content"]/div/div[4]/div',
                allow=r'\?q=pl\/search\/site',
            ),
        ),
        # Links inside the search-result tiles are parsed as film pages
        Rule(
            LinkExtractor(restrict_xpaths='//*[@id="fntiles_page_search_0"]'),
            callback='parse_item',
        ),
    )

    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8'
    }

    def parse_item(self, response):
        """Build a MovieItem from a single film page.

        The returned item carries:
          * ``title`` - text of the title span (None if absent),
          * ``mp4`` - list of .mp4 paths found in the player script
            (empty list if the script is absent or has no match),
          * ``url`` - the URL of the response,
          * ``description`` - a plain dict with ``fullTitle``, ``desc``,
            ``date``, ``sequence`` and ``details``.
        """
        item = MovieItem()
        item['title'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()').get()
        item['mp4'] = _extract_mp4_urls(
            response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get()
        )
        item['url'] = response.url

        desc = Description()
        desc['fullTitle'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[2]/span[2]/text()').get()
        desc['desc'] = response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[2]/span/p/text()').get()
        desc['date'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[3]/span[2]/text()').get()

        # Each table row maps the text of its first cell to the text of its
        # second cell; a missing cell yields None, as .get() does.
        rows = response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[2]/div/div/table/tr')
        desc['sequence'] = {
            row.xpath('td[1]/span/text()').get(): row.xpath('td[2]/span/text()').get()
            for row in rows
        }

        details = {}
        fields = response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[4]/div[contains(@class,"fncustom_field")]')
        for field in fields:
            label = _strip_tags(field.xpath('span[1]').get())
            if label is None:
                # No label span: there is nothing to key this entry on, and
                # passing None to the regex would raise TypeError.
                continue
            label = _TRAILING_COLON_RE.sub('', label)
            # A missing value span is recorded as None rather than crashing.
            details[label] = _strip_tags(field.xpath('span[2]').get())
        desc['details'] = details

        item['description'] = dict(desc)
        return item