new files with Scrapy spiders
This commit is contained in:
parent
5861f57d7f
commit
c006103bc1
29
src/spiders/fina_test1.py
Normal file
29
src/spiders/fina_test1.py
Normal file
@ -0,0 +1,29 @@
|
||||
import scrapy
|
||||
from scrapy.crawler import CrawlerProcess
|
||||
import re
|
||||
|
||||
class ScraperWithLimit(scrapy.Spider):
    """Spider that starts at the ``film_index/a`` page and crawls one level deep.

    For every response it (1) follows the first link of each entry under the
    ``content`` element and (2) yields one ``{'title', 'mp4'}`` dict per child
    ``div`` of the ``block-fnfilm-fnfilm`` element, if that element is present.
    """

    name = "ScraperWithLimit"
    download_delay = 0.1
    start_urls = [
        'http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a'
    ]

    custom_settings = {
        # Only follow links found on the start page; do not recurse further.
        'DEPTH_LIMIT': 1,
        'FEED_EXPORT_ENCODING': 'utf-8'
    }

    def parse(self, response):
        """Yield follow-up requests and scraped movie dicts for *response*.

        Yields:
            ``scrapy.Request`` objects (callback: this method) for each entry
            that has a link, then dicts with the keys ``title`` and ``mp4``.
            Either value is ``None`` when the corresponding node is missing.
        """
        for entry in response.xpath('//*[@id="content"]/div/div[4]/div'):
            href = entry.xpath('div/div/a/@href').get()
            # response.follow() raises ValueError for a None URL, which would
            # abort the whole callback; skip entries that carry no link.
            if href is None:
                continue
            yield response.follow(href, self.parse)

        for movie in response.xpath('//*[@id="block-fnfilm-fnfilm"]/div'):
            script = movie.xpath('div[2]/div/script/text()').get()
            yield {
                'title': movie.xpath('div[1]/div/div/span/text()').get(),
                # Do not hand None to the regex helper when there is no
                # inline script under this node.
                'mp4': getMp4(script) if script is not None else None
            }
|
||||
|
||||
def getMp4(text):
    """Return the first ``.mp4`` URL passed to ``encodeURI("...")`` in *text*.

    The pattern looked for is ``file: encodeURI("<something>.mp4")``.

    Args:
        text: Script source to search; may be ``None`` or empty.

    Returns:
        The captured URL string, or ``None`` when *text* is empty/``None`` or
        contains no matching ``encodeURI`` call.
    """
    if not text:
        return None
    # Raw string avoids invalid-escape warnings; the non-greedy ``.+?`` stops
    # at the first ``.mp4")`` so two calls on one line are not merged.
    found = re.search(r'file: encodeURI\("(.+?\.mp4)"\)', text)
    return found.group(1) if found else None
|
38
src/spiders/fina_test2.py
Normal file
38
src/spiders/fina_test2.py
Normal file
@ -0,0 +1,38 @@
|
||||
import scrapy
|
||||
from scrapy.spiders import CrawlSpider, Rule
|
||||
from scrapy.linkextractors import LinkExtractor
|
||||
from scrapy.item import Item, Field
|
||||
import re
|
||||
|
||||
from fina_scrap.items import MovieItem
|
||||
|
||||
class MySpider(CrawlSpider):
    """CrawlSpider that walks the title index and scrapes movie pages.

    Link-following is driven entirely by ``rules``; pages reached through a
    rule with ``callback='parse_item'`` are turned into ``MovieItem`` objects.
    """

    name = 'repozytorium.fn.org.pl'
    start_urls = ['http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a']

    rules = (
        # Extract link from index of titles
        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[2]')),

        # Extract links with movie titles
        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[4]/div', deny=r'\?q=pl\/search\/site'), callback='parse_item'),

        # Same area, but only the search/site links; no callback is given, so
        # these pages are used solely as a source of further links.
        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[4]/div', allow=r'\?q=pl\/search\/site')),

        Rule(LinkExtractor(restrict_xpaths='//*[@id="fntiles_page_search_0"]'), callback='parse_item')
    )
    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8'
    }

    def getMp4(self, text):
        """Return every ``.mp4`` URL passed to ``encodeURI("...")`` in *text*.

        Args:
            text: Script source to search; may be ``None`` or empty.

        Returns:
            A list of captured URL strings (empty when nothing matches or
            *text* is ``None``/empty).
        """
        if not text:
            return []
        return re.findall(r'file: encodeURI\("(.+?\.mp4)"\)', text)

    def parse_item(self, response):
        """Build a ``MovieItem`` from a movie page.

        Sets ``title`` (first matching span text or ``None``), ``mp4`` (list
        of URLs found in the inline script) and ``url`` (the response URL).
        """
        item = MovieItem()
        item['title'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()').get()
        # getMp4 is a real method here so that the self.getMp4 call resolves.
        item['mp4'] = self.getMp4(response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get())
        item['url'] = response.url
        return item
|
6
src/spiders/items.py
Normal file
6
src/spiders/items.py
Normal file
@ -0,0 +1,6 @@
|
||||
from scrapy.item import Item, Field
|
||||
|
||||
class MovieItem(Item):
    """Scrapy item with three fields: ``url``, ``title`` and ``mp4``."""

    # Declared fields; each is a plain Field() with no metadata.
    url = Field()
    title = Field()
    mp4 = Field()
|
Loading…
Reference in New Issue
Block a user