archSpeechReco/src/spiders/finaSpider.py

65 lines
2.6 KiB
Python
Raw Normal View History

2019-11-16 21:24:28 +01:00
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import re
2019-12-13 22:32:09 +01:00
from items import MovieItem, Description
2019-11-16 21:24:28 +01:00
2019-11-16 22:06:05 +01:00
class FinaSpider(CrawlSpider):
    """Crawl repozytorium.fn.org.pl and build one ``MovieItem`` per film page.

    The crawl starts at the film index, follows the index and search-result
    links selected by ``rules`` and hands film pages to ``parse_item``.
    """

    name = 'repozytorium.fn.org.pl'
    start_urls = ['http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a']
    rules = (
        # Extract link from index of titles
        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[2]')),
        # Extract links with movie titles
        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[4]/div', deny=r'\?q=pl\/search\/site'), callback='parse_item'),
        # Site-search links in the same container have no callback: they are
        # only crawled further, and their result tiles are parsed by the next rule.
        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[4]/div', allow=r'\?q=pl\/search\/site')),
        Rule(LinkExtractor(restrict_xpaths='//*[@id="fntiles_page_search_0"]'), callback='parse_item'),
    )
    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8'
    }

    # Compiled once for the whole crawl instead of on every parsed page.
    _MP4_RE = re.compile(r'file: encodeURI\("(.+?\.mp4)"\)')
    _TAG_RE = re.compile(r'<.*?>')
    _TRAILING_COLON_RE = re.compile(r': $')

    def _find_mp4(self, script_text):
        """Return every ``.mp4`` path passed to ``encodeURI`` in *script_text*.

        Returns an empty list when *script_text* is ``None`` (the page has no
        matching script node) rather than letting ``re`` raise ``TypeError``.
        """
        if script_text is None:
            return []
        return self._MP4_RE.findall(script_text)

    def _strip_tags(self, markup):
        """Return *markup* with ``<...>`` tags removed, or ``None`` if it is ``None``."""
        if markup is None:
            return None
        return self._TAG_RE.sub('', markup)

    def parse_item(self, response):
        """Build a ``MovieItem`` from a film page.

        Args:
            response: the Scrapy response for the film page.

        Returns:
            A ``MovieItem`` with ``title``, ``mp4`` (list of matched paths,
            possibly empty), ``url`` and ``description`` (a plain dict with
            ``fullTitle``, ``desc``, ``date``, ``sequence`` and ``details``).
            Fields whose node is absent from the page are ``None``.
        """
        item = MovieItem()
        item['title'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()').get()
        item['mp4'] = self._find_mp4(
            response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get()
        )
        item['url'] = response.url

        desc = Description()
        desc['fullTitle'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[2]/span[2]/text()').get()
        desc['desc'] = response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[2]/span/p/text()').get()
        desc['date'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[3]/span[2]/text()').get()

        # Two-column table: first cell's span text -> second cell's span text.
        rows = response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[2]/div/div/table/tr')
        desc['sequence'] = {
            row.xpath('td[1]/span/text()').get(): row.xpath('td[2]/span/text()').get()
            for row in rows
        }

        # Label/value pairs; both spans may contain nested markup, so the whole
        # element is serialised and the tags are stripped afterwards.
        details = {}
        for field in response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[4]/div[contains(@class,"fncustom_field")]'):
            label = self._strip_tags(field.xpath('span[1]').get())
            if label is None:
                # A field without a label span cannot be keyed; skip it rather
                # than abort the whole item.
                continue
            # Labels end with ': ' on the page; drop that suffix from the key.
            label = self._TRAILING_COLON_RE.sub('', label)
            details[label] = self._strip_tags(field.xpath('span[2]').get())
        desc['details'] = details

        item['description'] = dict(desc)
        return item