Merge branch 'include_spiders' of s333949/archSpeechReco into master

This commit is contained in:
Wojciech Smolak 2020-01-06 11:57:11 +00:00 committed by Gogs
commit 82632e17e0
4 changed files with 271 additions and 0 deletions

64
src/spiders/finaSpider.py Normal file
View File

@ -0,0 +1,64 @@
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import re
from items import MovieItem, Description
class FinaSpider(CrawlSpider):
    """Crawl repozytorium.fn.org.pl's film index and scrape movie metadata.

    Walks the alphabetical title index, follows links to individual movie
    pages, and returns one MovieItem per movie page.
    """

    name = 'repozytorium.fn.org.pl'
    start_urls = ['http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a']
    rules = (
        # Extract link from index of titles
        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[2]')),
        # Extract links with movie titles.
        # FIX: allow/deny patterns are raw strings now — the originals were
        # plain strings with invalid escape sequences (W605).
        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[4]/div', deny=r'\?q=pl\/search\/site'), callback='parse_item'),
        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[4]/div', allow=r'\?q=pl\/search\/site')),
        Rule(LinkExtractor(restrict_xpaths='//*[@id="fntiles_page_search_0"]'), callback='parse_item')
    )
    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8'
    }

    def parse_item(self, response):
        """Build a MovieItem (title, mp4 url list, description) from a movie page."""
        def getMp4(text):
            # Pull every .mp4 path passed to encodeURI() in the inline player
            # script (raw string fixes the W605 invalid escape sequences).
            return re.findall(r'file: encodeURI\("(.+?\.mp4)"\)', text)

        cleanHtml = re.compile('<.*?>')      # strips HTML tags from extracted markup
        cleanPuncSpace = re.compile(': $')   # strips trailing ": " from field labels
        item = MovieItem()
        item['title'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()').get()
        item['mp4'] = getMp4(response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get())
        item['url'] = response.url
        desc = Description()
        desc['fullTitle'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[2]/span[2]/text()').get()
        desc['desc'] = response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[2]/span/p/text()').get()
        desc['date'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[3]/span[2]/text()').get()
        # Sequence table: first cell -> second cell, row by row.
        seq = {}
        for row in response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[2]/div/div/table/tr'):
            seq[row.xpath('td[1]/span/text()').get()] = row.xpath('td[2]/span/text()').get()
        desc['sequence'] = dict(seq)
        # Custom-field divs rendered as <span>label</span><span>value</span>.
        det = {}
        for div in response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[4]/div[contains(@class,"fncustom_field")]'):
            key = re.sub(cleanHtml, '', div.xpath('span[1]').extract_first())
            key = re.sub(cleanPuncSpace, '', key)
            val = re.sub(cleanHtml, '', div.xpath('span[2]').extract_first())
            det[key] = val
        desc['details'] = dict(det)
        item['description'] = dict(desc)
        return item

29
src/spiders/fina_test1.py Normal file
View File

@ -0,0 +1,29 @@
import scrapy
from scrapy.crawler import CrawlerProcess
import re
class ScraperWithLimit(scrapy.Spider):
    """Depth-limited test spider for repozytorium.fn.org.pl.

    Follows movie links one level from the film index and yields a dict
    with the movie title and its mp4 url for each movie page.
    """

    name = "ScraperWithLimit"
    download_delay = 0.1  # throttle: wait 0.1 s between requests
    start_urls = [
        'http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a'
    ]
    custom_settings = {
        'DEPTH_LIMIT': 1,  # only follow links one hop from the start page
        'FEED_EXPORT_ENCODING': 'utf-8'
    }

    def parse(self, response):
        # Follow each tile on the index page to its detail page.
        for next_page in response.xpath('//*[@id="content"]/div/div[4]/div'):
            yield response.follow(next_page.xpath('div/div/a/@href').get(), self.parse)
        # On a movie page, emit title + first mp4 url.
        # NOTE(review): `getMp4` resolves to the module-level helper defined
        # after this class — confirm it was not intended to be a method
        # (calling it unqualified would fail if it were).
        for movie in response.xpath('//*[@id="block-fnfilm-fnfilm"]/div'):
            yield {
                'title': movie.xpath('div[1]/div/div/span/text()').get(),
                'mp4': getMp4(movie.xpath('div[2]/div/script/text()').get())
            }
def getMp4(text):
    """Return the first .mp4 url passed to ``encodeURI()`` inside *text*.

    Raises IndexError when no match is found (unchanged behaviour).
    FIX: pattern is a raw string — the original was a plain string with
    invalid escape sequences (W605).
    """
    matches = re.findall(r'file: encodeURI\("(.+\.mp4)"\)', text)
    return matches[0]

16
src/spiders/items.py Normal file
View File

@ -0,0 +1,16 @@
from scrapy.item import Item, Field
class Description(Item):
    """Detailed movie metadata scraped from a film's detail page."""
    fullTitle = Field()  # full title text from the page header
    sequence = Field()   # dict built from the sequence table rows
    date = Field()       # date string as scraped from the page
    desc = Field()       # free-text description paragraph
    details = Field()    # dict of custom-field label -> value pairs
class MovieItem(Item):
    """Top-level item yielded for each scraped movie page."""
    url = Field()          # source page url
    title = Field()        # movie title
    mp4 = Field()          # list of .mp4 paths found in the player script
    description = Field()  # dict built from a Description item

162
src/storageUpload.py Normal file
View File

@ -0,0 +1,162 @@
# Standard library
import argparse
import datetime
import os
import sys
import urllib
import urllib.request  # `import urllib` alone does not expose urllib.request
from subprocess import run, DEVNULL, CalledProcessError

# Third-party
from google.cloud import storage
from pymongo import MongoClient
from bson.objectid import ObjectId
def main(args):
    """Entry point: upload movies (mp4) or their audio tracks (wav) to GCS.

    ``args.format`` selects the pipeline: 'mp4' or 'wav'.
    """
    # SECURITY NOTE(review): credentials are hard-coded in the connection
    # string — move them to an environment variable or secret store.
    uri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
    dbName = "archSpeechReco"
    colName = "moviesMeta"
    bucket = 'archspeechreco'
    col = getMongoCollection(colName, dbName, uri)
    fileFormat = args.format
    if fileFormat == 'mp4':
        uploadMp4(col, bucket)
    elif fileFormat == 'wav':
        uploadWave(col, bucket)
    else:
        # FIX: unknown formats were silently ignored; report them.
        print("unsupported format: {} (expected mp4 or wav)".format(fileFormat))
def uploadMp4(col, bucket):
    """Download each not-yet-uploaded movie and push it to GCS.

    For every document missing the gcsMp4 field: fetch the video, upload it
    as mp4/<id>.mp4, then delete the local temp file.
    """
    toUpload = getUploadList(col)
    for i in toUpload:
        fileName = ObjectId(i['_id'])
        getVid(i['url'], ObjectId(i['_id']))
        upload_blob(bucket, "{}.mp4".format(fileName), "mp4/{}.mp4".format(fileName), col, "Mp4")
        try:
            os.remove("{}.mp4".format(fileName))
        except OSError:  # FIX: narrowed from bare `except:` (was swallowing everything)
            print("{}.mp4 has NOT been removed".format(fileName))
        else:
            print("{}.mp4 has been removed".format(fileName))
def uploadWave(col, bucket):
    """Download each not-yet-uploaded movie, extract its audio and push the
    wav to GCS as wave/<id>.wav, then delete the local wav file.
    """
    toUpload = getWavUploadList(col)
    for i in toUpload:
        fileName = ObjectId(i['_id'])
        getVid(i['url'], ObjectId(i['_id']))
        getWave("{}.mp4".format(fileName))  # converts to wav; removes the mp4 on success
        upload_blob(bucket, "{}.wav".format(fileName), "wave/{}.wav".format(fileName), col, "Wav")
        try:
            os.remove("{}.wav".format(fileName))
        except OSError:  # FIX: narrowed from bare `except:` (was swallowing everything)
            print("{}.wav has NOT been removed".format(fileName))
        else:
            print("{}.wav has been removed".format(fileName))
def upload_blob(bucket_name, source_file_name, destination_blob_name, col, fileFormat):
    """Upload a local file to the GCS bucket and record the upload in Mongo.

    On success, sets ``gcs<fileFormat>`` = {location, uploadDate} on the
    movie document whose _id is the file name stem. Best-effort: failures
    are printed, not raised.
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    try:
        blob.upload_from_filename(source_file_name)
    except Exception as e:  # FIX: was a bare `except:` that hid the reason
        print("gcs upload failed: {}".format(e))
    else:
        print('File {}.{} uploaded to {}.'.format(
            source_file_name,
            fileFormat,
            destination_blob_name))
        # FIX: only mark the document uploaded when the GCS upload succeeded;
        # previously the Mongo update ran even after a failed upload.
        now = datetime.datetime.now()
        try:
            col.update_one(
                {"_id": ObjectId(source_file_name.split('.')[0])},
                {"$set": {
                    "gcs{}".format(fileFormat): {
                        "location": destination_blob_name,
                        "uploadDate": now.strftime("%Y-%m-%d %H:%M:%S")
                    }
                }
                }
            )
        except Exception as e:  # FIX: was a bare `except:` that hid the reason
            print("mongo update failed: {}".format(e))
        else:
            print("mongo update OK")
def getMongoCollection(colName, dbName, uri):
    """Connect to *uri* and return collection *colName* of database *dbName*."""
    return MongoClient(uri)[dbName][colName]
def _pendingUploadPipeline(markerField):
    """Build the aggregation pipeline that lists movies whose *markerField*
    (e.g. "gcsMp4" or "gcsWav") is absent, projecting _id plus a full url.

    FIX: the two list functions below were near-identical copies differing
    only in this field name; the shared pipeline now lives here.
    """
    return [
        # $match phase: filter documents without the marker field,
        # i.e. movies not yet uploaded to GCS.
        {"$match": {markerField: {"$exists": False}}},
        # $project phase: keep only _id and build an absolute url from the
        # first entry of the scraped mp4 list.
        {"$project": {
            "url": {"$concat": ["http://repozytorium.fn.org.pl/",
                                {"$arrayElemAt": ["$mp4", 0]}]}
        }},
        # Uncomment to resume or cap a batch:
        # {"$skip": 362},
        # {"$limit": 20},
    ]

def getUploadList(col):
    """Return a cursor over movies that still need their mp4 uploaded."""
    return col.aggregate(_pendingUploadPipeline("gcsMp4"))

def getWavUploadList(col):
    """Return a cursor over movies that still need their wav uploaded."""
    return col.aggregate(_pendingUploadPipeline("gcsWav"))
def getVid(url, out):
    """Download *url* to "<out>.mp4" in the current directory.

    Best-effort: prints a message on failure instead of raising.

    BUG FIX: the file previously did only ``import urllib``, which does NOT
    import the ``urllib.request`` submodule — this call raised
    AttributeError, which the bare ``except:`` silently swallowed as
    "wrong URL", so every download failed. The import block now imports
    ``urllib.request`` and the except is narrowed to real download errors
    (URLError is an OSError subclass; ValueError covers malformed urls).
    """
    try:
        urllib.request.urlretrieve(url, "{}.mp4".format(out))
    except (OSError, ValueError) as e:
        print("wrong URL, can't download: {}".format(e))
def getWave(filename):
    """Convert *filename* (an .mp4) to a mono 44.1 kHz 16-bit PCM .wav with
    the same stem, then delete the source mp4 on success.

    Best-effort: prints a message on failure instead of raising.
    """
    try:
        # FIX: check=True makes a non-zero ffmpeg exit raise
        # CalledProcessError. Previously run() never raised on conversion
        # failure, so the source mp4 was deleted even when no wav was made.
        run(['ffmpeg', '-i', filename, '-vn', '-acodec', 'pcm_s16le',
             '-ar', '44100', '-ac', '1', filename.replace("mp4", "wav")],
            stdout=DEVNULL, check=True)
    except (OSError, CalledProcessError):  # ffmpeg missing or conversion failed
        print("problem with ffmpeg")
    else:
        try:
            os.remove(filename)
        except OSError:  # FIX: narrowed from bare `except:`
            print("{} has NOT been removed".format(filename))
        else:
            print("{} has been removed".format(filename))
# Script entry point: parse the CLI flags and run the uploader.
if __name__ == '__main__':
    cli = argparse.ArgumentParser(description='GCS uploader')
    cli.add_argument("--format", default='mp4', help="format to fetch and upload, [mp4, wav]")
    main(cli.parse_args())