spider on steroids

This commit is contained in:
Wojtek 2019-12-13 21:32:09 +00:00
parent 52241b5d66
commit 35ab9b8013
2 changed files with 15 additions and 6 deletions

View File

@@ -2,7 +2,7 @@ import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import re
from items import MovieItem, Description, Sequence
from items import MovieItem, Description
class FinaSpider(CrawlSpider):
@@ -31,12 +31,14 @@ class FinaSpider(CrawlSpider):
x = re.findall('file: encodeURI\("(.+?\.mp4)"\)',text)
return x
cleanHtml = re.compile('<.*?>')
cleanPuncSpace = re.compile(': $')
item = MovieItem()
item['title'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()').get()
item['mp4'] = getMp4(response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get())
item['url'] = response.url
#TODO
#description:
desc = Description()
desc['fullTitle'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[2]/span[2]/text()').get()
desc['desc'] = response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[2]/span/p/text()').get()
@@ -48,6 +50,15 @@ class FinaSpider(CrawlSpider):
desc['sequence'] = dict(seq)
det = {}
for div in response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[4]/div[contains(@class,"fncustom_field")]'):
key = re.sub(cleanHtml, '', div.xpath('span[1]').extract_first())
key = re.sub(cleanPuncSpace, '', key)
val = re.sub(cleanHtml, '', div.xpath('span[2]').extract_first())
det[ key ] = val
desc['details'] = dict(det)
item['description'] = dict(desc)
return item

View File

@@ -1,14 +1,12 @@
from scrapy.item import Item, Field
# Scrapy item that declares two fields: seqTime and seqVal.
# NOTE(review): the names suggest one (time, value) entry of a movie's
# sequence list -- confirm against the spider; the visible spider code stores
# the sequence as a plain dict and does not instantiate this class.
class Sequence(Item):
seqTime = Field()
seqVal = Field()
# Scrapy item holding the descriptive metadata scraped from one movie page.
# The spider fills it in, converts it with dict() and stores the result under
# the 'description' key of the MovieItem it returns.
class Description(Item):
# Text of a span inside the "block-fnfilm-fnfilm" element of the page.
fullTitle = Field()
# Assigned by the spider as dict(seq).
sequence = Field()
# NOTE(review): not assigned anywhere in the visible spider code -- confirm
# where (or whether) this field is populated.
date = Field()
# Text of a paragraph taken from the page's "content" element.
desc = Field()
# Dict built by the spider from the "fncustom_field" divs: the first span
# (HTML tags and a trailing ": " removed) is the key, the second span
# (HTML tags removed) is the value.
details = Field()
class MovieItem(Item):