# archSpeechReco/src/spiders/fina_test1.py

import scrapy
from scrapy.crawler import CrawlerProcess
import re

class ScraperWithLimit(scrapy.Spider):
    """Spider collecting film titles and MP4 URLs from repozytorium.fn.org.pl.

    Crawling starts at the ``film_index/a`` page. Links found in the page's
    listing are followed (bounded by ``DEPTH_LIMIT``), and every film block
    found on a visited page is emitted as a ``{'title', 'mp4'}`` item.
    """

    name = "ScraperWithLimit"
    # Scrapy's per-request download delay, in seconds.
    download_delay = 0.1
    start_urls = [
        'http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a'
    ]

    custom_settings = {
        # Do not follow links deeper than one hop from the start URL.
        'DEPTH_LIMIT': 1,
        # Write exported feeds as UTF-8.
        'FEED_EXPORT_ENCODING': 'utf-8'
    }

    def parse(self, response):
        """Follow listing links and yield one item per film block.

        Args:
            response: The Scrapy response for the page being parsed.

        Yields:
            Requests for each link found in the listing (handled by this
            same callback), followed by dicts with the keys ``'title'`` and
            ``'mp4'``. ``'mp4'`` is ``None`` when the film block carries no
            inline script text.
        """
        for next_page in response.xpath('//*[@id="content"]/div/div[4]/div'):
            href = next_page.xpath('div/div/a/@href').get()
            # A listing div without a link yields None here, which
            # response.follow() cannot turn into a request; skip it instead
            # of aborting the whole page.
            if href is None:
                continue
            yield response.follow(href, self.parse)

        for movie in response.xpath('//*[@id="block-fnfilm-fnfilm"]/div'):
            script_text = movie.xpath('div[2]/div/script/text()').get()
            yield {
                'title': movie.xpath('div[1]/div/div/span/text()').get(),
                # Only hand real text to getMp4 so a block without a script
                # does not stop the remaining items from being emitted.
                'mp4': getMp4(script_text) if script_text is not None else None
            }

def getMp4(text):
    """Extract the MP4 URL from a player script's source text.

    Searches *text* for the first ``file: encodeURI("<url>.mp4")`` occurrence
    and returns the quoted URL.

    Args:
        text: Script source to search; may be ``None`` or empty.

    Returns:
        The first captured string ending in ``.mp4``, or ``None`` when *text*
        is ``None``/empty or contains no such occurrence.
    """
    if not text:
        return None
    # Raw string avoids invalid-escape warnings; the lazy ``.+?`` stops at the
    # first ``.mp4")`` so two calls on one line are not merged into one match.
    match = re.search(r'file: encodeURI\("(.+?\.mp4)"\)', text)
    return match.group(1) if match else None