import re

import scrapy
from scrapy.crawler import CrawlerProcess

# Pre-compiled pattern that pulls the .mp4 URL out of the inline
# `file: encodeURI("...")` JavaScript player snippet.  A raw string is
# used so the '\(' / '\)' escapes are valid regex, not invalid string
# escapes as in the original.
_MP4_RE = re.compile(r'file: encodeURI\("(.+\.mp4)"\)')


def getMp4(text):
    """Return the first .mp4 URL embedded in *text*, or None.

    The original crashed with TypeError when *text* was None (film page
    without the expected <script> node) and with IndexError when the
    pattern did not match; both cases now return None so the item is
    still emitted with an empty 'mp4' field.
    """
    if not text:
        return None
    match = _MP4_RE.search(text)
    return match.group(1) if match else None


class ScraperWithLimit(scrapy.Spider):
    """Crawl the FN repository film index and yield title/mp4 items.

    Starting from the 'a' letter index page, follows one level of film
    links (DEPTH_LIMIT = 1) and scrapes each film page for its title and
    the .mp4 URL hidden in the player's inline JavaScript.
    """

    name = "ScraperWithLimit"
    download_delay = 0.1  # be polite: 100 ms between requests
    start_urls = [
        'http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a'
    ]
    custom_settings = {
        'DEPTH_LIMIT': 1,
        'FEED_EXPORT_ENCODING': 'utf-8'
    }

    def parse(self, response):
        """Follow film links on index pages; yield items on film pages."""
        # Index page: queue every film link.  Skip missing hrefs so
        # response.follow(None, ...) does not raise ValueError.
        for link_div in response.xpath('//*[@id="content"]/div/div[4]/div'):
            href = link_div.xpath('div/div/a/@href').get()
            if href:
                yield response.follow(href, self.parse)
        # Film page: extract the title and the mp4 URL from the player
        # script block.
        for movie in response.xpath('//*[@id="block-fnfilm-fnfilm"]/div'):
            yield {
                'title': movie.xpath('div[1]/div/div/span/text()').get(),
                'mp4': getMp4(movie.xpath('div[2]/div/script/text()').get())
            }