Merge branch 'include_spiders' of s333949/archSpeechReco into master
This commit is contained in:
commit
82632e17e0
64
src/spiders/finaSpider.py
Normal file
64
src/spiders/finaSpider.py
Normal file
@ -0,0 +1,64 @@
|
||||
import scrapy
|
||||
from scrapy.spiders import CrawlSpider, Rule
|
||||
from scrapy.linkextractors import LinkExtractor
|
||||
import re
|
||||
from items import MovieItem, Description
|
||||
|
||||
|
||||
class FinaSpider(CrawlSpider):
    """Crawl repozytorium.fn.org.pl starting at the film index and emit one MovieItem per film page."""

    name = 'repozytorium.fn.org.pl'
    start_urls = ['http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a']

    rules = (
        # Extract link from index of titles
        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[2]')),

        # Extract links with movie titles.
        # Raw strings: the plain '\?' / '\/' escapes are invalid string escapes and
        # raise DeprecationWarning (SyntaxWarning on newer Pythons).
        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[4]/div', deny=r'\?q=pl\/search\/site'), callback='parse_item'),

        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[4]/div', allow=r'\?q=pl\/search\/site')),

        Rule(LinkExtractor(restrict_xpaths='//*[@id="fntiles_page_search_0"]'), callback='parse_item')
    )
    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8'
    }

    def parse_item(self, response):
        """Scrape one film page into a MovieItem (title, mp4 path(s), url, nested description)."""

        def getMp4(text):
            # The player <script> embeds: file: encodeURI("<path>.mp4")
            if not text:
                return []  # page without a player script — re.findall(None) used to raise TypeError
            return re.findall(r'file: encodeURI\("(.+?\.mp4)"\)', text)

        cleanHtml = re.compile('<.*?>')      # strips any markup from field spans
        cleanPuncSpace = re.compile(': $')   # drops the trailing ": " of a field label

        item = MovieItem()
        item['title'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()').get()
        item['mp4'] = getMp4(response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get())
        item['url'] = response.url

        desc = Description()
        desc['fullTitle'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[2]/span[2]/text()').get()
        desc['desc'] = response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[2]/span/p/text()').get()
        desc['date'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[3]/span[2]/text()').get()

        # Sequence table: first cell holds the label, second cell the value.
        seq = {}
        for row in response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[2]/div/div/table/tr'):
            seq[row.xpath('td[1]/span/text()').get()] = row.xpath('td[2]/span/text()').get()

        desc['sequence'] = dict(seq)

        # "Custom field" divs: span[1] is "<label>: ", span[2] the value; both may contain markup.
        det = {}
        for div in response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[4]/div[contains(@class,"fncustom_field")]'):
            key = re.sub(cleanHtml, '', div.xpath('span[1]').extract_first())
            key = re.sub(cleanPuncSpace, '', key)
            val = re.sub(cleanHtml, '', div.xpath('span[2]').extract_first())
            det[key] = val

        desc['details'] = dict(det)

        item['description'] = dict(desc)

        return item
|
29
src/spiders/fina_test1.py
Normal file
29
src/spiders/fina_test1.py
Normal file
@ -0,0 +1,29 @@
|
||||
import scrapy
|
||||
from scrapy.crawler import CrawlerProcess
|
||||
import re
|
||||
|
||||
class ScraperWithLimit(scrapy.Spider):
    """Depth-limited test spider: follows index tiles one level deep and yields title/mp4 pairs."""

    name = "ScraperWithLimit"
    download_delay = 0.1
    start_urls = [
        'http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a'
    ]

    custom_settings = {
        'DEPTH_LIMIT': 1,
        'FEED_EXPORT_ENCODING': 'utf-8'
    }

    def parse(self, response):
        # Follow every tile on the index page; DEPTH_LIMIT stops the recursion.
        for tile in response.xpath('//*[@id="content"]/div/div[4]/div'):
            href = tile.xpath('div/div/a/@href').get()
            yield response.follow(href, self.parse)

        # On a film page, pair the displayed title with the mp4 path from the player script.
        for block in response.xpath('//*[@id="block-fnfilm-fnfilm"]/div'):
            title = block.xpath('div[1]/div/div/span/text()').get()
            script_text = block.xpath('div[2]/div/script/text()').get()
            yield {
                'title': title,
                'mp4': getMp4(script_text)
            }
|
||||
|
||||
def getMp4(text):
    """Return the first .mp4 path embedded in a player <script>, or None.

    The page inlines ``file: encodeURI("<path>.mp4")`` in the player script;
    *text* is that script's text, or None when the page has no player.

    Fixes over the original: None input no longer raises TypeError, a page
    with no match no longer raises IndexError, and the non-greedy ``(.+?)``
    (matching finaSpider.py) prevents a greedy match from spanning across
    multiple player entries.
    """
    if not text:
        return None
    match = re.search(r'file: encodeURI\("(.+?\.mp4)"\)', text)
    return match.group(1) if match else None
|
16
src/spiders/items.py
Normal file
16
src/spiders/items.py
Normal file
@ -0,0 +1,16 @@
|
||||
from scrapy.item import Item, Field
|
||||
|
||||
|
||||
class Description(Item):
    """Nested film metadata; stored as a plain dict under ``MovieItem.description``."""

    fullTitle = Field()  # full title text from the film metadata pane
    sequence = Field()   # dict built from the sequence table rows (label -> text)
    date = Field()       # date string as shown on the page
    desc = Field()       # free-text description paragraph
    details = Field()    # dict of "custom field" label -> value pairs
|
||||
|
||||
|
||||
class MovieItem(Item):
    """One scraped film page from repozytorium.fn.org.pl."""

    url = Field()          # page URL the item was scraped from (response.url)
    title = Field()        # displayed film title
    mp4 = Field()          # mp4 path(s) extracted from the player script
    description = Field()  # dict form of a Description item
|
162
src/storageUpload.py
Normal file
162
src/storageUpload.py
Normal file
@ -0,0 +1,162 @@
|
||||
import argparse
import datetime
import os
import sys
import urllib
import urllib.request  # NOTE: bare "import urllib" does NOT load the request submodule
from subprocess import run, DEVNULL

from google.cloud import storage
from pymongo import MongoClient
from bson.objectid import ObjectId
|
||||
|
||||
def main(args):
    """Entry point: upload movies (mp4) or extracted audio (wav) to GCS.

    args: parsed argparse namespace with a ``format`` attribute ('mp4' or 'wav').
    """
    # NOTE(review): credentials are hard-coded in the URI — consider moving
    # them to environment variables / a config file.
    uri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
    dbName = "archSpeechReco"
    colName = "moviesMeta"
    bucket = 'archspeechreco'

    col = getMongoCollection(colName, dbName, uri)
    fileFormat = args.format

    if fileFormat == 'mp4':
        uploadMp4(col, bucket)
    elif fileFormat == 'wav':
        uploadWave(col, bucket)
    else:
        # Previously an unrecognized --format value was silently ignored.
        print("unknown format: {} (expected mp4 or wav)".format(fileFormat))
|
||||
|
||||
def uploadMp4(col, bucket):
    """Download every movie not yet in GCS and upload its mp4, cleaning up locally.

    col: pymongo collection with movie metadata documents.
    bucket: name of the target GCS bucket.
    """
    toUpload = getUploadList(col)

    for i in toUpload:
        fileName = ObjectId(i['_id'])
        localFile = "{}.mp4".format(fileName)
        getVid(i['url'], fileName)
        upload_blob(bucket, localFile, "mp4/{}".format(localFile), col, "Mp4")
        try:
            os.remove(localFile)
        except OSError:  # narrowed from bare except (which also hid KeyboardInterrupt)
            print("{}.mp4 has NOT been removed".format(fileName))
        else:
            print("{}.mp4 has been removed".format(fileName))
|
||||
|
||||
|
||||
def uploadWave(col, bucket):
    """For every movie without a wav in GCS: download the mp4, extract audio, upload the wav.

    col: pymongo collection with movie metadata documents.
    bucket: name of the target GCS bucket. Intermediate files are removed locally.
    """
    toUpload = getWavUploadList(col)

    for i in toUpload:
        fileName = ObjectId(i['_id'])
        wavFile = "{}.wav".format(fileName)
        getVid(i['url'], fileName)
        getWave("{}.mp4".format(fileName))  # converts to wav and removes the local mp4
        upload_blob(bucket, wavFile, "wave/{}".format(wavFile), col, "Wav")
        try:
            os.remove(wavFile)
        except OSError:  # narrowed from bare except (which also hid KeyboardInterrupt)
            print("{}.wav has NOT been removed".format(fileName))
        else:
            print("{}.wav has been removed".format(fileName))
|
||||
|
||||
|
||||
def upload_blob(bucket_name, source_file_name, destination_blob_name, col, fileFormat):
    """Uploads a file to the bucket and records the upload in Mongo.

    On a successful upload, sets ``gcs<fileFormat>`` = {location, uploadDate}
    on the document whose _id is the file's basename. Failures are printed
    and swallowed so a batch run continues.
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    try:
        blob.upload_from_filename(source_file_name)
    except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt/SystemExit
        print("gcs upload failed")
    else:
        print('File {}.{} uploaded to {}.'.format(
            source_file_name,
            fileFormat,
            destination_blob_name))
        # Only record metadata for uploads that actually succeeded.
        now = datetime.datetime.now()
        try:
            col.update_one(
                {"_id": ObjectId(source_file_name.split('.')[0])},
                {"$set": {
                    "gcs{}".format(fileFormat): {
                        "location": destination_blob_name,
                        "uploadDate": now.strftime("%Y-%m-%d %H:%M:%S")
                    }
                }
                }
            )
        except Exception:  # narrowed from bare except
            print("mongo update failed")
        else:
            print("mongo update OK")
|
||||
|
||||
|
||||
def getMongoCollection(colName, dbName, uri):
    """Connect to MongoDB at *uri* and return the collection *dbName*.*colName*."""
    client = MongoClient(uri)
    return client[dbName][colName]
|
||||
|
||||
|
||||
def getUploadList(col):
    """Return a cursor over movies that have no mp4 in GCS yet (projected to _id + url)."""
    pipeline = [
        # $match phase: keep documents lacking the gcsMp4 field, i.e. not yet uploaded.
        {"$match": {
            "gcsMp4": {"$exists": False}
        }},
        # $project phase: expose _id plus a full download url built from the first mp4 path.
        {"$project": {
            "url": {"$concat": ["http://repozytorium.fn.org.pl/", {"$arrayElemAt": ["$mp4", 0]}]}
        }},
        # Batch-tuning stages kept from the original, disabled:
        # {"$skip": 362},
        # {"$limit": 20},
    ]
    return col.aggregate(pipeline)
|
||||
|
||||
def getWavUploadList(col):
    """Return a cursor over movies that have no wav in GCS yet (projected to _id + url)."""
    pipeline = [
        # $match phase: keep documents lacking the gcsWav field, i.e. audio not yet uploaded.
        {"$match": {
            "gcsWav": {"$exists": False}
        }},
        # $project phase: expose _id plus a full download url built from the first mp4 path.
        {"$project": {
            "url": {"$concat": ["http://repozytorium.fn.org.pl/", {"$arrayElemAt": ["$mp4", 0]}]}
        }},
        # Batch-tuning stages kept from the original, disabled:
        # {"$skip": 362},
        # {"$limit": 500},
    ]
    return col.aggregate(pipeline)
|
||||
|
||||
|
||||
def getVid(url, out):
    """Download *url* into "<out>.mp4" in the current directory.

    Failures are reported and swallowed so one bad URL doesn't abort a batch.
    Requires ``import urllib.request`` at the top of the file — the original
    bare ``import urllib`` does not expose the ``request`` submodule and made
    this call raise AttributeError.
    """
    try:
        urllib.request.urlretrieve(url, "{}.mp4".format(out))
    except Exception:  # narrowed from bare except; covers URLError/ValueError on bad URLs
        print("wrong URL, can't download")
|
||||
|
||||
|
||||
def getWave(filename):
    """Extract mono 44.1 kHz 16-bit PCM audio from *filename* into a sibling .wav.

    The source file is removed only after a SUCCESSFUL conversion.
    """
    # splitext replaces only the extension; the original str.replace("mp4","wav")
    # would rewrite any "mp4" occurrence anywhere in the name.
    wavName = os.path.splitext(filename)[0] + ".wav"
    try:
        # check=True: a failed ffmpeg run (nonzero exit) used to go unnoticed
        # and the source mp4 was deleted anyway; now failure raises and the
        # source file is kept.
        run(['ffmpeg', '-i', filename, '-vn', '-acodec', 'pcm_s16le', '-ar', '44100', '-ac', '1', wavName],
            stdout=DEVNULL, check=True)
    except Exception:  # narrowed from bare except; covers missing ffmpeg and failed runs
        print("problem with ffmpeg")
    else:
        try:
            os.remove(filename)
        except OSError:
            print("{} has NOT been removed".format(filename))
        else:
            print("{} has been removed".format(filename))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: choose which artifact format to fetch and upload.
    cli = argparse.ArgumentParser(description='GCS uploader')
    cli.add_argument("--format", default='mp4', help="format to fetch and upload, [mp4, wav]")
    main(cli.parse_args())
|
||||
|
Loading…
Reference in New Issue
Block a user