spider on steroids

This commit is contained in:
Wojtek 2019-12-13 21:32:09 +00:00
parent 52241b5d66
commit 35ab9b8013
2 changed files with 15 additions and 6 deletions

View File

@@ -2,7 +2,7 @@ import scrapy
from scrapy.spiders import CrawlSpider, Rule from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor from scrapy.linkextractors import LinkExtractor
import re import re
from items import MovieItem, Description, Sequence from items import MovieItem, Description
class FinaSpider(CrawlSpider): class FinaSpider(CrawlSpider):
@@ -31,12 +31,14 @@ class FinaSpider(CrawlSpider):
x = re.findall('file: encodeURI\("(.+?\.mp4)"\)',text) x = re.findall('file: encodeURI\("(.+?\.mp4)"\)',text)
return x return x
cleanHtml = re.compile('<.*?>')
cleanPuncSpace = re.compile(': $')
item = MovieItem() item = MovieItem()
item['title'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()').get() item['title'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()').get()
item['mp4'] = getMp4(response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get()) item['mp4'] = getMp4(response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get())
item['url'] = response.url item['url'] = response.url
#TODO
#description:
desc = Description() desc = Description()
desc['fullTitle'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[2]/span[2]/text()').get() desc['fullTitle'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[2]/span[2]/text()').get()
desc['desc'] = response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[2]/span/p/text()').get() desc['desc'] = response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[2]/span/p/text()').get()
@@ -48,6 +50,15 @@ class FinaSpider(CrawlSpider):
desc['sequence'] = dict(seq) desc['sequence'] = dict(seq)
det = {}
for div in response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[4]/div[contains(@class,"fncustom_field")]'):
key = re.sub(cleanHtml, '', div.xpath('span[1]').extract_first())
key = re.sub(cleanPuncSpace, '', key)
val = re.sub(cleanHtml, '', div.xpath('span[2]').extract_first())
det[ key ] = val
desc['details'] = dict(det)
item['description'] = dict(desc) item['description'] = dict(desc)
return item return item

View File

@@ -1,14 +1,12 @@
from scrapy.item import Item, Field from scrapy.item import Item, Field
class Sequence(Item):
seqTime = Field()
seqVal = Field()
class Description(Item): class Description(Item):
fullTitle = Field() fullTitle = Field()
sequence = Field() sequence = Field()
date = Field() date = Field()
desc = Field() desc = Field()
details = Field()
class MovieItem(Item): class MovieItem(Item):