29 lines
921 B
Python
29 lines
921 B
Python
import scrapy
|
|
from scrapy.crawler import CrawlerProcess
|
|
import re
|
|
|
|
class ScraperWithLimit(scrapy.Spider):
    """Spider that yields ``title``/``mp4`` items from repozytorium.fn.org.pl.

    Crawling starts at the single index page in ``start_urls`` and follows
    the links found there; ``DEPTH_LIMIT`` of 1 keeps the crawl from going
    further than one hop away from the start page.
    """

    name = "ScraperWithLimit"
    # Scrapy waits this many seconds between consecutive requests.
    download_delay = 0.1
    start_urls = [
        'http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a'
    ]

    custom_settings = {
        # Only follow links one level below the start URL.
        'DEPTH_LIMIT': 1,
        # Write exported feeds as UTF-8 instead of escaped ASCII.
        'FEED_EXPORT_ENCODING': 'utf-8'
    }

    def parse(self, response):
        """Follow the links in the content listing and emit movie items.

        Args:
            response: The downloaded page handed over by Scrapy.

        Yields:
            Follow-up requests (handled by this same callback) for every
            listing entry that carries a link, then one dict per movie block
            with the keys ``'title'`` and ``'mp4'``. Either value is ``None``
            when the corresponding node is missing from the page.
        """
        for next_page in response.xpath('//*[@id="content"]/div/div[4]/div'):
            href = next_page.xpath('div/div/a/@href').get()
            # response.follow() raises ValueError for a None URL, which would
            # abort the whole generator; skip entries without a link instead.
            if href is None:
                continue
            yield response.follow(href, self.parse)

        for movie in response.xpath('//*[@id="block-fnfilm-fnfilm"]/div'):
            script_text = movie.xpath('div[2]/div/script/text()').get()
            yield {
                'title': movie.xpath('div[1]/div/div/span/text()').get(),
                # getMp4 runs a regex over the script body, so it is only
                # called when a script body is actually present.
                'mp4': getMp4(script_text) if script_text is not None else None
            }
|
|
|
|
def getMp4(text):
    """Extract the first MP4 URL passed to ``encodeURI`` in a script body.

    Looks for a fragment of the form ``file: encodeURI("<something>.mp4")``
    and returns the quoted part.

    Args:
        text: Script source to search; may be ``None`` or empty.

    Returns:
        The first matching URL as a string, or ``None`` when ``text`` is
        empty/``None`` or contains no such fragment.
    """
    if not text:
        return None
    # Raw string keeps the regex escapes intact; [^"]+ stops the capture at
    # the closing quote so several encodeURI(...) calls on one line do not
    # get merged into a single match.
    match = re.search(r'file: encodeURI\("([^"]+\.mp4)"\)', text)
    return match.group(1) if match else None