spider on steroids
This commit is contained in:
parent
52241b5d66
commit
35ab9b8013
@@ -2,7 +2,7 @@ import scrapy
|
|||||||
from scrapy.spiders import CrawlSpider, Rule
|
from scrapy.spiders import CrawlSpider, Rule
|
||||||
from scrapy.linkextractors import LinkExtractor
|
from scrapy.linkextractors import LinkExtractor
|
||||||
import re
|
import re
|
||||||
from items import MovieItem, Description, Sequence
|
from items import MovieItem, Description
|
||||||
|
|
||||||
|
|
||||||
class FinaSpider(CrawlSpider):
|
class FinaSpider(CrawlSpider):
|
||||||
@@ -31,12 +31,14 @@ class FinaSpider(CrawlSpider):
|
|||||||
x = re.findall('file: encodeURI\("(.+?\.mp4)"\)',text)
|
x = re.findall('file: encodeURI\("(.+?\.mp4)"\)',text)
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
cleanHtml = re.compile('<.*?>')
|
||||||
|
cleanPuncSpace = re.compile(': $')
|
||||||
|
|
||||||
item = MovieItem()
|
item = MovieItem()
|
||||||
item['title'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()').get()
|
item['title'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()').get()
|
||||||
item['mp4'] = getMp4(response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get())
|
item['mp4'] = getMp4(response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get())
|
||||||
item['url'] = response.url
|
item['url'] = response.url
|
||||||
#TODO
|
|
||||||
#description:
|
|
||||||
desc = Description()
|
desc = Description()
|
||||||
desc['fullTitle'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[2]/span[2]/text()').get()
|
desc['fullTitle'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[2]/span[2]/text()').get()
|
||||||
desc['desc'] = response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[2]/span/p/text()').get()
|
desc['desc'] = response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[2]/span/p/text()').get()
|
||||||
@@ -48,6 +50,15 @@ class FinaSpider(CrawlSpider):
|
|||||||
|
|
||||||
desc['sequence'] = dict(seq)
|
desc['sequence'] = dict(seq)
|
||||||
|
|
||||||
|
det = {}
|
||||||
|
for div in response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[4]/div[contains(@class,"fncustom_field")]'):
|
||||||
|
key = re.sub(cleanHtml, '', div.xpath('span[1]').extract_first())
|
||||||
|
key = re.sub(cleanPuncSpace, '', key)
|
||||||
|
val = re.sub(cleanHtml, '', div.xpath('span[2]').extract_first())
|
||||||
|
det[ key ] = val
|
||||||
|
|
||||||
|
desc['details'] = dict(det)
|
||||||
|
|
||||||
item['description'] = dict(desc)
|
item['description'] = dict(desc)
|
||||||
|
|
||||||
return item
|
return item
|
||||||
|
@@ -1,14 +1,12 @@
|
|||||||
from scrapy.item import Item, Field
|
from scrapy.item import Item, Field
|
||||||
|
|
||||||
class Sequence(Item):
|
|
||||||
seqTime = Field()
|
|
||||||
seqVal = Field()
|
|
||||||
|
|
||||||
class Description(Item):
|
class Description(Item):
|
||||||
fullTitle = Field()
|
fullTitle = Field()
|
||||||
sequence = Field()
|
sequence = Field()
|
||||||
date = Field()
|
date = Field()
|
||||||
desc = Field()
|
desc = Field()
|
||||||
|
details = Field()
|
||||||
|
|
||||||
|
|
||||||
class MovieItem(Item):
|
class MovieItem(Item):
|
||||||
|
Loading…
Reference in New Issue
Block a user