From c006103bc1c3a44b17ae99cb1d5af0825a31da04 Mon Sep 17 00:00:00 2001 From: Smolak Date: Sat, 16 Nov 2019 21:24:28 +0100 Subject: [PATCH 1/9] new files with scrapy spiders --- src/spiders/fina_test1.py | 29 +++++++++++++++++++++++++++++ src/spiders/fina_test2.py | 38 ++++++++++++++++++++++++++++++++++++++ src/spiders/items.py | 6 ++++++ 3 files changed, 73 insertions(+) create mode 100644 src/spiders/fina_test1.py create mode 100644 src/spiders/fina_test2.py create mode 100644 src/spiders/items.py diff --git a/src/spiders/fina_test1.py b/src/spiders/fina_test1.py new file mode 100644 index 00000000..cfa0220d --- /dev/null +++ b/src/spiders/fina_test1.py @@ -0,0 +1,29 @@ +import scrapy +from scrapy.crawler import CrawlerProcess +import re + +class ScraperWithLimit(scrapy.Spider): + name = "ScraperWithLimit" + download_delay = 0.1 + start_urls = [ + 'http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a' + ] + + custom_settings = { + 'DEPTH_LIMIT': 1, + 'FEED_EXPORT_ENCODING': 'utf-8' + } + + def parse(self, response): + for next_page in response.xpath('//*[@id="content"]/div/div[4]/div'): + yield response.follow(next_page.xpath('div/div/a/@href').get(), self.parse) + + for movie in response.xpath('//*[@id="block-fnfilm-fnfilm"]/div'): + yield { + 'title': movie.xpath('div[1]/div/div/span/text()').get(), + 'mp4': getMp4(movie.xpath('div[2]/div/script/text()').get()) + } + +def getMp4(text): + x = re.findall('file: encodeURI\("(.+\.mp4)"\)',text) + return x[0] \ No newline at end of file diff --git a/src/spiders/fina_test2.py b/src/spiders/fina_test2.py new file mode 100644 index 00000000..a0aec9fe --- /dev/null +++ b/src/spiders/fina_test2.py @@ -0,0 +1,38 @@ +import scrapy +from scrapy.spiders import CrawlSpider, Rule +from scrapy.linkextractors import LinkExtractor +from scrapy.item import Item, Field +import re + +from fina_scrap.items import MovieItem + +class MySpider(CrawlSpider): + name = 'repozytorium.fn.org.pl' + start_urls = 
['http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a'] + + rules = ( + # Extract link from index of titles + Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[2]')), + + # Extract links with movie titles + Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[4]/div', deny='\?q=pl\/search\/site'), callback='parse_item'), + + Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[4]/div', allow='\?q=pl\/search\/site')), + + Rule(LinkExtractor(restrict_xpaths='//*[@id="fntiles_page_search_0"]'), callback='parse_item') + ) + custom_settings = { + 'FEED_EXPORT_ENCODING': 'utf-8' + } + + def parse_item(self, response): + + def getMp4(self, text): + x = re.findall('file: encodeURI\("(.+?\.mp4)"\)',text) + return x + + item = MovieItem() + item['title'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()').get() + item['mp4'] = self.getMp4(response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get()) + item['url'] = response.url + return item diff --git a/src/spiders/items.py b/src/spiders/items.py new file mode 100644 index 00000000..407b4307 --- /dev/null +++ b/src/spiders/items.py @@ -0,0 +1,6 @@ +from scrapy.item import Item, Field + +class MovieItem(Item): + url = Field() + title = Field() + mp4 = Field() \ No newline at end of file From d20baf5bd8e95c0ea12927352fba111cea882e96 Mon Sep 17 00:00:00 2001 From: Smolak Date: Sat, 16 Nov 2019 22:06:05 +0100 Subject: [PATCH 2/9] small refactoring and cleanup --- src/spiders/{fina_test2.py => finaSpider.py} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename src/spiders/{fina_test2.py => finaSpider.py} (95%) diff --git a/src/spiders/fina_test2.py b/src/spiders/finaSpider.py similarity index 95% rename from src/spiders/fina_test2.py rename to src/spiders/finaSpider.py index a0aec9fe..4151cfbe 100644 --- a/src/spiders/fina_test2.py +++ b/src/spiders/finaSpider.py @@ -1,12 +1,12 @@ import scrapy from scrapy.spiders import 
CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor -from scrapy.item import Item, Field import re +from items import MovieItem from fina_scrap.items import MovieItem -class MySpider(CrawlSpider): +class FinaSpider(CrawlSpider): name = 'repozytorium.fn.org.pl' start_urls = ['http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a'] From 4f05c132fd9ba2af44d9b74c7d908918d5ccd8f2 Mon Sep 17 00:00:00 2001 From: Wojtek Date: Wed, 11 Dec 2019 18:55:46 +0000 Subject: [PATCH 3/9] spiders --- src/spiders/finaSpider.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/spiders/finaSpider.py b/src/spiders/finaSpider.py index 4151cfbe..a64be0aa 100644 --- a/src/spiders/finaSpider.py +++ b/src/spiders/finaSpider.py @@ -4,7 +4,6 @@ from scrapy.linkextractors import LinkExtractor import re from items import MovieItem -from fina_scrap.items import MovieItem class FinaSpider(CrawlSpider): name = 'repozytorium.fn.org.pl' @@ -27,12 +26,12 @@ class FinaSpider(CrawlSpider): def parse_item(self, response): - def getMp4(self, text): + def getMp4(text): x = re.findall('file: encodeURI\("(.+?\.mp4)"\)',text) return x item = MovieItem() item['title'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()').get() - item['mp4'] = self.getMp4(response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get()) + item['mp4'] = getMp4(response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get()) item['url'] = response.url return item From 993ec57cf1920a3ee4f31aa77cce474d5390d3df Mon Sep 17 00:00:00 2001 From: Wojtek Date: Wed, 11 Dec 2019 19:06:26 +0000 Subject: [PATCH 4/9] spiders --- src/spiders/finaSpider.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/spiders/finaSpider.py b/src/spiders/finaSpider.py index a64be0aa..8d903041 100644 --- a/src/spiders/finaSpider.py +++ b/src/spiders/finaSpider.py @@ -24,6 +24,7 @@ class FinaSpider(CrawlSpider): 'FEED_EXPORT_ENCODING': 'utf-8' } + 
def parse_item(self, response): def getMp4(text): From f6a9702b3a60554c0a175778c26b37a73a0e5554 Mon Sep 17 00:00:00 2001 From: Wojtek Date: Wed, 11 Dec 2019 19:13:33 +0000 Subject: [PATCH 5/9] TODOs --- src/spiders/finaSpider.py | 3 +++ src/spiders/items.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/spiders/finaSpider.py b/src/spiders/finaSpider.py index 8d903041..72acd44e 100644 --- a/src/spiders/finaSpider.py +++ b/src/spiders/finaSpider.py @@ -35,4 +35,7 @@ class FinaSpider(CrawlSpider): item['title'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()').get() item['mp4'] = getMp4(response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get()) item['url'] = response.url + #TODO + #description: + return item diff --git a/src/spiders/items.py b/src/spiders/items.py index 407b4307..88464203 100644 --- a/src/spiders/items.py +++ b/src/spiders/items.py @@ -3,4 +3,5 @@ from scrapy.item import Item, Field class MovieItem(Item): url = Field() title = Field() - mp4 = Field() \ No newline at end of file + mp4 = Field() + #description = Field() From 52241b5d667ceee9c452613bfd7720b88794e90a Mon Sep 17 00:00:00 2001 From: Wojtek Date: Wed, 11 Dec 2019 22:49:43 +0000 Subject: [PATCH 6/9] sequences added --- src/spiders/finaSpider.py | 14 +++++++++++++- src/spiders/items.py | 13 ++++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/spiders/finaSpider.py b/src/spiders/finaSpider.py index 72acd44e..0955da7a 100644 --- a/src/spiders/finaSpider.py +++ b/src/spiders/finaSpider.py @@ -2,7 +2,7 @@ import scrapy from scrapy.spiders import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor import re -from items import MovieItem +from items import MovieItem, Description, Sequence class FinaSpider(CrawlSpider): @@ -37,5 +37,17 @@ class FinaSpider(CrawlSpider): item['url'] = response.url #TODO #description: + desc = Description() + desc['fullTitle'] = 
response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[2]/span[2]/text()').get() + desc['desc'] = response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[2]/span/p/text()').get() + desc['date'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[3]/span[2]/text()').get() + + seq = {} + for row in response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[2]/div/div/table/tr'): + seq[ row.xpath('td[1]/span/text()').get() ] = row.xpath('td[2]/span/text()').get() + + desc['sequence'] = dict(seq) + + item['description'] = dict(desc) return item diff --git a/src/spiders/items.py b/src/spiders/items.py index 88464203..f1fb9c0a 100644 --- a/src/spiders/items.py +++ b/src/spiders/items.py @@ -1,7 +1,18 @@ from scrapy.item import Item, Field +class Sequence(Item): + seqTime = Field() + seqVal = Field() + +class Description(Item): + fullTitle = Field() + sequence = Field() + date = Field() + desc = Field() + + class MovieItem(Item): url = Field() title = Field() mp4 = Field() - #description = Field() + description = Field() From 35ab9b801312e3361b807c98d0b7286f99f40347 Mon Sep 17 00:00:00 2001 From: Wojtek Date: Fri, 13 Dec 2019 21:32:09 +0000 Subject: [PATCH 7/9] spider on steroids --- src/spiders/finaSpider.py | 17 ++++++++++++++--- src/spiders/items.py | 4 +--- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/spiders/finaSpider.py b/src/spiders/finaSpider.py index 0955da7a..c52ca415 100644 --- a/src/spiders/finaSpider.py +++ b/src/spiders/finaSpider.py @@ -2,7 +2,7 @@ import scrapy from scrapy.spiders import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor import re -from items import MovieItem, Description, Sequence +from items import MovieItem, Description class FinaSpider(CrawlSpider): @@ -31,12 +31,14 @@ class FinaSpider(CrawlSpider): x = re.findall('file: encodeURI\("(.+?\.mp4)"\)',text) return x + cleanHtml = re.compile('<.*?>') + cleanPuncSpace = re.compile(': $') 
+ item = MovieItem() item['title'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()').get() item['mp4'] = getMp4(response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get()) item['url'] = response.url - #TODO - #description: + desc = Description() desc['fullTitle'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[2]/span[2]/text()').get() desc['desc'] = response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[2]/span/p/text()').get() @@ -48,6 +50,15 @@ class FinaSpider(CrawlSpider): desc['sequence'] = dict(seq) + det = {} + for div in response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[4]/div[contains(@class,"fncustom_field")]'): + key = re.sub(cleanHtml, '', div.xpath('span[1]').extract_first()) + key = re.sub(cleanPuncSpace, '', key) + val = re.sub(cleanHtml, '', div.xpath('span[2]').extract_first()) + det[ key ] = val + + desc['details'] = dict(det) + item['description'] = dict(desc) return item diff --git a/src/spiders/items.py b/src/spiders/items.py index f1fb9c0a..b99b0557 100644 --- a/src/spiders/items.py +++ b/src/spiders/items.py @@ -1,14 +1,12 @@ from scrapy.item import Item, Field -class Sequence(Item): - seqTime = Field() - seqVal = Field() class Description(Item): fullTitle = Field() sequence = Field() date = Field() desc = Field() + details = Field() class MovieItem(Item): From 40b9efeaab3817870a7b5f0d949c07ed0d2e045b Mon Sep 17 00:00:00 2001 From: Wojtek Date: Sun, 15 Dec 2019 19:56:51 +0000 Subject: [PATCH 8/9] gcs file uploader, mp4 only --- src/storageUpload.py | 101 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 src/storageUpload.py diff --git a/src/storageUpload.py b/src/storageUpload.py new file mode 100644 index 00000000..ccad59f2 --- /dev/null +++ b/src/storageUpload.py @@ -0,0 +1,101 @@ +from google.cloud import storage +import sys +import urllib +from pymongo 
import MongoClient +from bson.objectid import ObjectId +import os +import datetime + + +def main(): + uri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco" + dbName = "archSpeechReco" + colName = "moviesMeta" + bucket = 'archspeechreco' + + col = getMongoCollection(colName,dbName,uri) + + toUpload = getUploadList(col) + + for i in toUpload: + fileName = ObjectId(i['_id']) + getVid( i['url'], ObjectId( i['_id'] ) ) + upload_blob(bucket, fileName, "mp4/{}.mp4".format(fileName),col) + try: + os.remove("{}.mp4".format(fileName)) + except: + print("{}.mp4 has NOT been removed".format(fileName)) + else: + print("{}.mp4 has been removed".format(fileName)) + + +def upload_blob(bucket_name, source_file_name, destination_blob_name,col): + """Uploads a file to the bucket.""" + storage_client = storage.Client() + bucket = storage_client.get_bucket(bucket_name) + blob = bucket.blob(destination_blob_name) + + try: + blob.upload_from_filename("{}.mp4".format(source_file_name)) + except: + print("gcs upload failed") + else: + print('File {}.mp4 uploaded to {}.'.format( + source_file_name, + destination_blob_name)) + now = datetime.datetime.now() + try: + col.update_one( + {"_id": ObjectId(source_file_name)}, + {"$set":{ + "gcs":{ + "location":destination_blob_name, + "uploadDate":now.strftime("%Y-%m-%d %H:%M:%S") + } + } + } + ) + except: + print("mongo update failed") + else: + print("mongo update OK") + + +def getMongoCollection(colName,dbName,uri): + client = MongoClient(uri) + db = client[dbName] + col = db[colName] + + return col + + +def getUploadList(col): + pipeline = [] + #$match phase, filetr documents withour gcs field - movies not uploaded to gcs + pipeline.append({"$match": { + "gcs": {"$exists": False} + } + }) + #project phase, show only url and _id keys + pipeline.append({"$project": { + "url": { "$concat": [ "http://repozytorium.fn.org.pl/",{"$arrayElemAt": [ "$mp4",0 ]}] } + } + }) + #skip first N documents + #pipeline.append({"$skip":362}) + 
#fetch only N documents + #pipeline.append({"$limit":20}) + + return col.aggregate(pipeline) + + +def getVid(url,out): + try: + urllib.request.urlretrieve(url, "{}.mp4".format(out)) + except: + print("wrong URL, can't download") + + +if __name__ == '__main__': + main() + From f793e2e82ab043cfe7e8868d4f94441d060dcec1 Mon Sep 17 00:00:00 2001 From: Wojtek Date: Mon, 6 Jan 2020 11:50:46 +0000 Subject: [PATCH 9/9] wav upload --- src/storageUpload.py | 83 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 72 insertions(+), 11 deletions(-) diff --git a/src/storageUpload.py b/src/storageUpload.py index ccad59f2..6f7048d7 100644 --- a/src/storageUpload.py +++ b/src/storageUpload.py @@ -5,22 +5,30 @@ from pymongo import MongoClient from bson.objectid import ObjectId import os import datetime +from subprocess import run,DEVNULL +import argparse - -def main(): +def main(args): uri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco" dbName = "archSpeechReco" colName = "moviesMeta" bucket = 'archspeechreco' col = getMongoCollection(colName,dbName,uri) + fileFormat = args.format + if (fileFormat == 'mp4'): + uploadMp4(col,bucket) + elif (fileFormat == 'wav'): + uploadWave(col,bucket) + +def uploadMp4(col,bucket): toUpload = getUploadList(col) for i in toUpload: fileName = ObjectId(i['_id']) getVid( i['url'], ObjectId( i['_id'] ) ) - upload_blob(bucket, fileName, "mp4/{}.mp4".format(fileName),col) + upload_blob(bucket, "{}.mp4".format(fileName), "mp4/{}.mp4".format(fileName),col,"Mp4") try: os.remove("{}.mp4".format(fileName)) except: @@ -29,26 +37,43 @@ def main(): print("{}.mp4 has been removed".format(fileName)) -def upload_blob(bucket_name, source_file_name, destination_blob_name,col): +def uploadWave(col,bucket): + toUpload = getWavUploadList(col) + + for i in toUpload: + fileName = ObjectId(i['_id']) + getVid( i['url'], ObjectId( i['_id'] ) ) + getWave("{}.mp4".format(fileName)) + upload_blob(bucket, "{}.wav".format(fileName), 
"wave/{}.wav".format(fileName),col,"Wav") + try: + os.remove("{}.wav".format(fileName)) + except: + print("{}.wav has NOT been removed".format(fileName)) + else: + print("{}.wav has been removed".format(fileName)) + + +def upload_blob(bucket_name, source_file_name, destination_blob_name,col,fileFormat): """Uploads a file to the bucket.""" storage_client = storage.Client() bucket = storage_client.get_bucket(bucket_name) blob = bucket.blob(destination_blob_name) try: - blob.upload_from_filename("{}.mp4".format(source_file_name)) + blob.upload_from_filename(source_file_name) except: print("gcs upload failed") else: - print('File {}.mp4 uploaded to {}.'.format( + print('File {}.{} uploaded to {}.'.format( source_file_name, + fileFormat, destination_blob_name)) now = datetime.datetime.now() try: col.update_one( - {"_id": ObjectId(source_file_name)}, + {"_id": ObjectId(source_file_name.split('.')[0])}, {"$set":{ - "gcs":{ + "gcs{}".format(fileFormat):{ "location":destination_blob_name, "uploadDate":now.strftime("%Y-%m-%d %H:%M:%S") } @@ -73,7 +98,7 @@ def getUploadList(col): pipeline = [] #$match phase, filetr documents withour gcs field - movies not uploaded to gcs pipeline.append({"$match": { - "gcs": {"$exists": False} + "gcsMp4": {"$exists": False} } }) #project phase, show only url and _id keys @@ -88,6 +113,25 @@ def getUploadList(col): return col.aggregate(pipeline) +def getWavUploadList(col): + pipeline = [] + #$match phase, filetr documents withour gcs field - movies not uploaded to gcs + pipeline.append({"$match": { + "gcsWav": {"$exists": False} + } + }) + #project phase, show only url and _id keys + pipeline.append({"$project": { + "url": { "$concat": [ "http://repozytorium.fn.org.pl/",{"$arrayElemAt": [ "$mp4",0 ]}] } + } + }) + #skip first N documents + #pipeline.append({"$skip":362}) + #fetch only N documents + #pipeline.append({"$limit":500}) + + return col.aggregate(pipeline) + def getVid(url,out): try: @@ -96,6 +140,23 @@ def getVid(url,out): 
print("wrong URL, can't download") -if __name__ == '__main__': - main() +def getWave(filename): + try: + run(['ffmpeg','-i', filename, '-vn', '-acodec', 'pcm_s16le', '-ar', '44100', '-ac', '1', filename.replace("mp4","wav")],stdout=DEVNULL) + except: + print("problem with ffmpeg") + else: + try: + os.remove(filename) + except: + print("{} has NOT been removed".format(filename)) + else: + print("{} has been removed".format(filename)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='GCS uploader') + parser.add_argument("--format", default='mp4', help="format to fetch and upload, [mp4, wav]") + args = parser.parse_args() + main(args)