From 35ab9b801312e3361b807c98d0b7286f99f40347 Mon Sep 17 00:00:00 2001 From: Wojtek Date: Fri, 13 Dec 2019 21:32:09 +0000 Subject: [PATCH] spider on steroids --- src/spiders/finaSpider.py | 17 ++++++++++++++--- src/spiders/items.py | 4 +--- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/spiders/finaSpider.py b/src/spiders/finaSpider.py index 0955da7a..c52ca415 100644 --- a/src/spiders/finaSpider.py +++ b/src/spiders/finaSpider.py @@ -2,7 +2,7 @@ import scrapy from scrapy.spiders import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor import re -from items import MovieItem, Description, Sequence +from items import MovieItem, Description class FinaSpider(CrawlSpider): @@ -31,12 +31,14 @@ class FinaSpider(CrawlSpider): x = re.findall('file: encodeURI\("(.+?\.mp4)"\)',text) return x + cleanHtml = re.compile('<.*?>') + cleanPuncSpace = re.compile(': $') + item = MovieItem() item['title'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()').get() item['mp4'] = getMp4(response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get()) item['url'] = response.url - #TODO - #description: + desc = Description() desc['fullTitle'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[2]/span[2]/text()').get() desc['desc'] = response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[2]/span/p/text()').get() @@ -48,6 +50,15 @@ class FinaSpider(CrawlSpider): desc['sequence'] = dict(seq) + det = {} + for div in response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[4]/div[contains(@class,"fncustom_field")]'): + key = re.sub(cleanHtml, '', div.xpath('span[1]').extract_first()) + key = re.sub(cleanPuncSpace, '', key) + val = re.sub(cleanHtml, '', div.xpath('span[2]').extract_first()) + det[ key ] = val + + desc['details'] = dict(det) + item['description'] = dict(desc) return item diff --git a/src/spiders/items.py b/src/spiders/items.py index f1fb9c0a..b99b0557 100644 --- a/src/spiders/items.py +++ b/src/spiders/items.py @@ -1,14 +1,12 @@ from scrapy.item import Item, Field -class Sequence(Item): - seqTime = Field() - seqVal = Field() class Description(Item): fullTitle = Field() sequence = Field() date = Field() desc = Field() + details = Field() class MovieItem(Item):