spider on steroids
This commit is contained in:
parent
52241b5d66
commit
35ab9b8013
@ -2,7 +2,7 @@ import scrapy
|
||||
from scrapy.spiders import CrawlSpider, Rule
|
||||
from scrapy.linkextractors import LinkExtractor
|
||||
import re
|
||||
from items import MovieItem, Description, Sequence
|
||||
from items import MovieItem, Description
|
||||
|
||||
|
||||
class FinaSpider(CrawlSpider):
|
||||
@ -31,12 +31,14 @@ class FinaSpider(CrawlSpider):
|
||||
x = re.findall('file: encodeURI\("(.+?\.mp4)"\)',text)
|
||||
return x
|
||||
|
||||
cleanHtml = re.compile('<.*?>')
|
||||
cleanPuncSpace = re.compile(': $')
|
||||
|
||||
item = MovieItem()
|
||||
item['title'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()').get()
|
||||
item['mp4'] = getMp4(response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get())
|
||||
item['url'] = response.url
|
||||
#TODO
|
||||
#description:
|
||||
|
||||
desc = Description()
|
||||
desc['fullTitle'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[2]/span[2]/text()').get()
|
||||
desc['desc'] = response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[2]/span/p/text()').get()
|
||||
@ -48,6 +50,15 @@ class FinaSpider(CrawlSpider):
|
||||
|
||||
desc['sequence'] = dict(seq)
|
||||
|
||||
det = {}
|
||||
for div in response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[4]/div[contains(@class,"fncustom_field")]'):
|
||||
key = re.sub(cleanHtml, '', div.xpath('span[1]').extract_first())
|
||||
key = re.sub(cleanPuncSpace, '', key)
|
||||
val = re.sub(cleanHtml, '', div.xpath('span[2]').extract_first())
|
||||
det[ key ] = val
|
||||
|
||||
desc['details'] = dict(det)
|
||||
|
||||
item['description'] = dict(desc)
|
||||
|
||||
return item
|
||||
|
@ -1,14 +1,12 @@
|
||||
from scrapy.item import Item, Field
|
||||
|
||||
class Sequence(Item):
    """Scrapy item declaring two fields: ``seqTime`` and ``seqVal``.

    NOTE(review): presumably one timed entry of a film's sequence listing
    (a time position and the text attached to it) -- confirm against the
    spider code that builds the sequence data.
    """

    # NOTE(review): looks like the time position of the entry -- confirm.
    seqTime = Field()
    # NOTE(review): looks like the value/text shown for that time -- confirm.
    seqVal = Field()
|
||||
|
||||
class Description(Item):
    """Scrapy item holding the descriptive metadata scraped for one movie.

    The spider fills an instance of this item field by field and then
    stores it, converted to a plain ``dict``, under the movie item's
    ``description`` key.
    """

    # Text of the full-title <span> taken from the film block of the page.
    fullTitle = Field()
    # Plain dict built by the spider from the page's sequence data.
    sequence = Field()
    # NOTE(review): not populated by the spider code visible here;
    # presumably a production or release date -- confirm.
    date = Field()
    # Text of the description paragraph of the article.
    desc = Field()
    # Plain dict mapping each custom-field label (HTML tags and the
    # trailing ": " stripped) to its value with HTML tags stripped.
    details = Field()
|
||||
|
||||
|
||||
class MovieItem(Item):
|
||||
|
Loading…
Reference in New Issue
Block a user