Merge branch 'include_spiders' of s333949/archSpeechReco into master

This commit is contained in:
Wojciech Smolak 2020-01-06 11:57:11 +00:00 committed by Gogs
commit 82632e17e0
4 changed files with 271 additions and 0 deletions

64
src/spiders/finaSpider.py Normal file
View File

@ -0,0 +1,64 @@
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import re
from items import MovieItem, Description
class FinaSpider(CrawlSpider):
    """Crawl repozytorium.fn.org.pl's film index and scrape movie metadata.

    Walks the alphabetical title index, follows links to individual movie
    pages, and returns one MovieItem per movie page.
    """

    name = 'repozytorium.fn.org.pl'
    start_urls = ['http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a']
    rules = (
        # Extract link from index of titles
        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[2]')),
        # Extract links with movie titles.
        # FIX: allow/deny patterns are raw strings now — the originals were
        # plain strings with invalid escape sequences (W605).
        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[4]/div', deny=r'\?q=pl\/search\/site'), callback='parse_item'),
        Rule(LinkExtractor(restrict_xpaths='//*[@id="content"]/div/div[4]/div', allow=r'\?q=pl\/search\/site')),
        Rule(LinkExtractor(restrict_xpaths='//*[@id="fntiles_page_search_0"]'), callback='parse_item')
    )
    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8'
    }

    def parse_item(self, response):
        """Build a MovieItem (title, mp4 url list, description) from a movie page."""
        def getMp4(text):
            # Pull every .mp4 path passed to encodeURI() in the inline player
            # script (raw string fixes the W605 invalid escape sequences).
            return re.findall(r'file: encodeURI\("(.+?\.mp4)"\)', text)

        cleanHtml = re.compile('<.*?>')      # strips HTML tags from extracted markup
        cleanPuncSpace = re.compile(': $')   # strips trailing ": " from field labels
        item = MovieItem()
        item['title'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[1]/div/div/span/text()').get()
        item['mp4'] = getMp4(response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[2]/div/script/text()').get())
        item['url'] = response.url
        desc = Description()
        desc['fullTitle'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[2]/span[2]/text()').get()
        desc['desc'] = response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[2]/span/p/text()').get()
        desc['date'] = response.xpath('//*[@id="block-fnfilm-fnfilm"]/div/div[3]/div/div[3]/span[2]/text()').get()
        # Sequence table: first cell -> second cell, row by row.
        seq = {}
        for row in response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[2]/div/div/table/tr'):
            seq[row.xpath('td[1]/span/text()').get()] = row.xpath('td[2]/span/text()').get()
        desc['sequence'] = dict(seq)
        # Custom-field divs rendered as <span>label</span><span>value</span>.
        det = {}
        for div in response.xpath('//*[@id="content"]/div[3]/article/div[1]/div/div[2]/div[1]/div/div[4]/div[contains(@class,"fncustom_field")]'):
            key = re.sub(cleanHtml, '', div.xpath('span[1]').extract_first())
            key = re.sub(cleanPuncSpace, '', key)
            val = re.sub(cleanHtml, '', div.xpath('span[2]').extract_first())
            det[key] = val
        desc['details'] = dict(det)
        item['description'] = dict(desc)
        return item

29
src/spiders/fina_test1.py Normal file
View File

@ -0,0 +1,29 @@
import scrapy
from scrapy.crawler import CrawlerProcess
import re
class ScraperWithLimit(scrapy.Spider):
    """Depth-limited test spider for repozytorium.fn.org.pl.

    Follows movie links one level from the film index and yields a dict
    with the movie title and its mp4 url for each movie page.
    """

    name = "ScraperWithLimit"
    download_delay = 0.1  # throttle: wait 0.1 s between requests
    start_urls = [
        'http://repozytorium.fn.org.pl/?q=pl/fnsearch/film_index/a'
    ]
    custom_settings = {
        'DEPTH_LIMIT': 1,  # only follow links one hop from the start page
        'FEED_EXPORT_ENCODING': 'utf-8'
    }

    def parse(self, response):
        # Follow each tile on the index page to its detail page.
        for next_page in response.xpath('//*[@id="content"]/div/div[4]/div'):
            yield response.follow(next_page.xpath('div/div/a/@href').get(), self.parse)
        # On a movie page, emit title + first mp4 url.
        # NOTE(review): `getMp4` resolves to the module-level helper defined
        # after this class — confirm it was not intended to be a method
        # (calling it unqualified would fail if it were).
        for movie in response.xpath('//*[@id="block-fnfilm-fnfilm"]/div'):
            yield {
                'title': movie.xpath('div[1]/div/div/span/text()').get(),
                'mp4': getMp4(movie.xpath('div[2]/div/script/text()').get())
            }
def getMp4(text):
    """Return the first .mp4 url passed to ``encodeURI()`` inside *text*.

    Raises IndexError when no match is found (unchanged behaviour).
    FIX: pattern is a raw string — the original was a plain string with
    invalid escape sequences (W605).
    """
    matches = re.findall(r'file: encodeURI\("(.+\.mp4)"\)', text)
    return matches[0]

16
src/spiders/items.py Normal file
View File

@ -0,0 +1,16 @@
from scrapy.item import Item, Field
class Description(Item):
    """Detailed movie metadata scraped from a film's detail page."""
    fullTitle = Field()  # full title text from the page header
    sequence = Field()   # dict built from the sequence table rows
    date = Field()       # date string as scraped from the page
    desc = Field()       # free-text description paragraph
    details = Field()    # dict of custom-field label -> value pairs
class MovieItem(Item):
    """Top-level item yielded for each scraped movie page."""
    url = Field()          # source page url
    title = Field()        # movie title
    mp4 = Field()          # list of .mp4 paths found in the player script
    description = Field()  # dict built from a Description item

162
src/storageUpload.py Normal file
View File

@ -0,0 +1,162 @@
# Standard library
import argparse
import datetime
import os
import sys
import urllib
import urllib.request  # `import urllib` alone does not expose urllib.request
from subprocess import run, DEVNULL, CalledProcessError

# Third-party
from google.cloud import storage
from pymongo import MongoClient
from bson.objectid import ObjectId
def main(args):
    """Entry point: upload movies (mp4) or their audio tracks (wav) to GCS.

    ``args.format`` selects the pipeline: 'mp4' or 'wav'.
    """
    # SECURITY NOTE(review): credentials are hard-coded in the connection
    # string — move them to an environment variable or secret store.
    uri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
    dbName = "archSpeechReco"
    colName = "moviesMeta"
    bucket = 'archspeechreco'
    col = getMongoCollection(colName, dbName, uri)
    fileFormat = args.format
    if fileFormat == 'mp4':
        uploadMp4(col, bucket)
    elif fileFormat == 'wav':
        uploadWave(col, bucket)
    else:
        # FIX: unknown formats were silently ignored; report them.
        print("unsupported format: {} (expected mp4 or wav)".format(fileFormat))
def uploadMp4(col, bucket):
    """Download each not-yet-uploaded movie and push it to GCS.

    For every document missing the gcsMp4 field: fetch the video, upload it
    as mp4/<id>.mp4, then delete the local temp file.
    """
    toUpload = getUploadList(col)
    for i in toUpload:
        fileName = ObjectId(i['_id'])
        getVid(i['url'], ObjectId(i['_id']))
        upload_blob(bucket, "{}.mp4".format(fileName), "mp4/{}.mp4".format(fileName), col, "Mp4")
        try:
            os.remove("{}.mp4".format(fileName))
        except OSError:  # FIX: narrowed from bare `except:` (was swallowing everything)
            print("{}.mp4 has NOT been removed".format(fileName))
        else:
            print("{}.mp4 has been removed".format(fileName))
def uploadWave(col, bucket):
    """Download each not-yet-uploaded movie, extract its audio and push the
    wav to GCS as wave/<id>.wav, then delete the local wav file.
    """
    toUpload = getWavUploadList(col)
    for i in toUpload:
        fileName = ObjectId(i['_id'])
        getVid(i['url'], ObjectId(i['_id']))
        getWave("{}.mp4".format(fileName))  # converts to wav; removes the mp4 on success
        upload_blob(bucket, "{}.wav".format(fileName), "wave/{}.wav".format(fileName), col, "Wav")
        try:
            os.remove("{}.wav".format(fileName))
        except OSError:  # FIX: narrowed from bare `except:` (was swallowing everything)
            print("{}.wav has NOT been removed".format(fileName))
        else:
            print("{}.wav has been removed".format(fileName))
def upload_blob(bucket_name, source_file_name, destination_blob_name, col, fileFormat):
    """Upload a local file to the GCS bucket and record the upload in Mongo.

    On success, sets ``gcs<fileFormat>`` = {location, uploadDate} on the
    movie document whose _id is the file name stem. Best-effort: failures
    are printed, not raised.
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    try:
        blob.upload_from_filename(source_file_name)
    except Exception as e:  # FIX: was a bare `except:` that hid the reason
        print("gcs upload failed: {}".format(e))
    else:
        print('File {}.{} uploaded to {}.'.format(
            source_file_name,
            fileFormat,
            destination_blob_name))
        # FIX: only mark the document uploaded when the GCS upload succeeded;
        # previously the Mongo update ran even after a failed upload.
        now = datetime.datetime.now()
        try:
            col.update_one(
                {"_id": ObjectId(source_file_name.split('.')[0])},
                {"$set": {
                    "gcs{}".format(fileFormat): {
                        "location": destination_blob_name,
                        "uploadDate": now.strftime("%Y-%m-%d %H:%M:%S")
                    }
                }
                }
            )
        except Exception as e:  # FIX: was a bare `except:` that hid the reason
            print("mongo update failed: {}".format(e))
        else:
            print("mongo update OK")
def getMongoCollection(colName, dbName, uri):
    """Connect to *uri* and return collection *colName* of database *dbName*."""
    return MongoClient(uri)[dbName][colName]
def _pendingUploadPipeline(markerField):
    """Build the aggregation pipeline that lists movies whose *markerField*
    (e.g. "gcsMp4" or "gcsWav") is absent, projecting _id plus a full url.

    FIX: the two list functions below were near-identical copies differing
    only in this field name; the shared pipeline now lives here.
    """
    return [
        # $match phase: filter documents without the marker field,
        # i.e. movies not yet uploaded to GCS.
        {"$match": {markerField: {"$exists": False}}},
        # $project phase: keep only _id and build an absolute url from the
        # first entry of the scraped mp4 list.
        {"$project": {
            "url": {"$concat": ["http://repozytorium.fn.org.pl/",
                                {"$arrayElemAt": ["$mp4", 0]}]}
        }},
        # Uncomment to resume or cap a batch:
        # {"$skip": 362},
        # {"$limit": 20},
    ]

def getUploadList(col):
    """Return a cursor over movies that still need their mp4 uploaded."""
    return col.aggregate(_pendingUploadPipeline("gcsMp4"))

def getWavUploadList(col):
    """Return a cursor over movies that still need their wav uploaded."""
    return col.aggregate(_pendingUploadPipeline("gcsWav"))
def getVid(url, out):
    """Download *url* to "<out>.mp4" in the current directory.

    Best-effort: prints a message on failure instead of raising.

    BUG FIX: the file previously did only ``import urllib``, which does NOT
    import the ``urllib.request`` submodule — this call raised
    AttributeError, which the bare ``except:`` silently swallowed as
    "wrong URL", so every download failed. The import block now imports
    ``urllib.request`` and the except is narrowed to real download errors
    (URLError is an OSError subclass; ValueError covers malformed urls).
    """
    try:
        urllib.request.urlretrieve(url, "{}.mp4".format(out))
    except (OSError, ValueError) as e:
        print("wrong URL, can't download: {}".format(e))
def getWave(filename):
    """Convert *filename* (an .mp4) to a mono 44.1 kHz 16-bit PCM .wav with
    the same stem, then delete the source mp4 on success.

    Best-effort: prints a message on failure instead of raising.
    """
    try:
        # FIX: check=True makes a non-zero ffmpeg exit raise
        # CalledProcessError. Previously run() never raised on conversion
        # failure, so the source mp4 was deleted even when no wav was made.
        run(['ffmpeg', '-i', filename, '-vn', '-acodec', 'pcm_s16le',
             '-ar', '44100', '-ac', '1', filename.replace("mp4", "wav")],
            stdout=DEVNULL, check=True)
    except (OSError, CalledProcessError):  # ffmpeg missing or conversion failed
        print("problem with ffmpeg")
    else:
        try:
            os.remove(filename)
        except OSError:  # FIX: narrowed from bare `except:`
            print("{} has NOT been removed".format(filename))
        else:
            print("{} has been removed".format(filename))
# Script entry point: parse the CLI flags and run the uploader.
if __name__ == '__main__':
    cli = argparse.ArgumentParser(description='GCS uploader')
    cli.add_argument("--format", default='mp4', help="format to fetch and upload, [mp4, wav]")
    main(cli.parse_args())