2020-06-05 20:10:50 +02:00
7 changed files with 6028 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
 __pycache__
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,43 @@
 attrs==19.3.0
 Automat==0.8.0
 cachetools==3.1.1
 certifi==2019.11.28
 cffi==1.13.2
 chardet==3.0.4
 constantly==15.1.0
 cryptography==2.8
 cssselect==1.1.0
 google-api-core==1.14.3
 google-auth==1.8.2
 google-cloud==0.34.0
 google-cloud-core==1.1.0
 google-cloud-speech==1.3.1
 google-cloud-storage==1.23.0
 google-resumable-media==0.5.0
 googleapis-common-protos==1.6.0
 grpcio==1.26.0
 hyperlink==19.0.0
 idna==2.8
 incremental==17.5.0
 lxml==4.4.1
 parsel==1.5.2
 Protego==0.1.15
 protobuf==3.11.1
 pyasn1==0.4.8
 pyasn1-modules==0.2.7
 pycparser==2.19
 PyDispatcher==2.0.5
 PyHamcrest==1.9.0
 pymongo==3.10.0
 pyOpenSSL==19.0.0
 pytz==2019.3
 queuelib==1.5.0
 requests==2.22.0
 rsa==4.0
 Scrapy==1.8.0
 service-identity==18.1.0
 six==1.13.0
 Twisted==19.10.0
 urllib3==1.25.7
 w3lib==1.21.0
 zope.interface==4.7.1
--- a/resources/dbSamples/archSpeechReco.json
+++ b/resources/dbSamples/archSpeechReco.json
--- a/resources/dbSamples/archSpeechRecoArray.json
+++ b/resources/dbSamples/archSpeechRecoArray.json
--- a/src/fix_transcript.py
+++ b/src/fix_transcript.py
@ -0,0 +1,28 @@
 from storageUpload import getMongoCollection
 # Rebuild the plain-text transcript of every recognised movie from its
 # word-level recognition output and store it as gcTextReco.transcript_fix.
 #
 # NOTE(review): the MongoDB user name and password are embedded in the URI
 # below; consider loading them from the environment or an untracked config.
 mongoUri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
 dbName = "archSpeechReco"
 colName = "moviesMeta"
 col = getMongoCollection(colName,dbName,mongoUri)
 # Keep only documents that have word-level output and fetch just that field
 # (plus _id, which $project returns by default).
 pipeline = [
    {'$match': {'gcTextReco.words': {'$exists': True}}},
    {'$project': {'gcTextReco.words': 1}},
 ]
 # Run the aggregation exactly once; the cursor is consumed just below.
 words = col.aggregate(pipeline)
 # Map each document _id to its words joined with single spaces.
 words_dict = {
    doc['_id']: " ".join(entry['word'] for entry in doc['gcTextReco']['words'])
    for doc in words
 }
 for key, value in words_dict.items():
    try:
        col.update_one({"_id": key},{"$set":{"gcTextReco.transcript_fix":value}})
    except Exception as e:
        # Best effort: report the failure and continue with the next document.
        print(e)
    else:
        print(f"mongo update OK {key}")
--- a/src/gcs2scp.sh
+++ b/src/gcs2scp.sh
@ -0,0 +1,8 @@
 #!/bin/bash
 # Copy the matching wave files from the GCS bucket to the remote host one
 # object at a time, staging each download in ./temp before sending it on.
 # Make sure the staging directory exists before the first download.
 mkdir -p ./temp
 # The wildcard is quoted so it reaches gsutil instead of being globbed locally.
 for obj in $(gsutil ls 'gs://archspeechreco/wave/5df3e63c4c0402698d782e*')
 do
 	# "cp" is a gsutil sub-command; "-cp" is not one of its documented options.
 	if gsutil cp "$obj" ./temp/
 	then
 		scp -P 2222 -i .ssh/id_rsa_mongo ./temp/* wojciechs@gonito.net:~/
 	else
 		# Skip the upload when the download failed, but keep processing the rest.
 		echo "gsutil cp failed for $obj" >&2
 	fi
 	# Empty the staging directory so the next iteration uploads only its own file.
 	rm -f ./temp/*
 done
--- a/src/temp
+++ b/src/temp
@ -0,0 +1,44 @@
 # Connection settings passed to getMongoCollection: URI, database name, collection name.
 # NOTE(review): the URI embeds a user name and password in plain text —
 # consider keeping credentials out of version control.
 mongoUri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
 dbName = "archSpeechReco"
 colName = "moviesMeta"
 def getMongoCollection(colName,dbName,uri):
    """Return collection ``colName`` of database ``dbName`` on the server at ``uri``.

    Every call builds a new ``MongoClient`` with ``maxPoolSize=512``.
    """
    return MongoClient(uri,maxPoolSize=512)[dbName][colName]
 # Trial run of the transcript rebuild, limited to 10 documents.
 col = getMongoCollection(colName,dbName,mongoUri)
 # The pipeline has to exist before it is handed to aggregate(). Matching on
 # gcTextReco.words guarantees that the ['words'] lookup below finds the key.
 pipeline = [{'$match': {'gcTextReco.words': {'$exists': True}}}, {'$project': {'gcTextReco.words': 1}}, {'$limit': 10}]
 words = col.aggregate(pipeline)
 # Map each document _id to its words joined with single spaces.
 words_dict = dict()
 for w in words:
    ww = w['gcTextReco']['words']
    words_dict[w['_id']] = " ".join([ e['word'] for e in ww ])
 for key, value in words_dict.items():
    try:
        col.update_one({"_id": key},{"$set":{"gcTextReco.transcript_fix":value}})
    except Exception as e:
        # Best effort: report the failure and continue with the next document.
        print(e)
    else:
        # Report the document key; no `uri` variable is defined in this script.
        print(f"mongo update OK {key}")
 // mongo shell: successive drafts of the export pipeline for moviesMeta.
 // Each `var pipeline` replaces the previous one, so only the last draft is
 // the one passed to aggregate() at the bottom.
 //
 // Draft 1: "wid" built from the tail of the stringified _id; output goes to "export".
 // NOTE(review): parseInt(...) is evaluated by the shell on an object literal
 // while the pipeline is being built, not on the server-side value —
 // presumably not what was intended; confirm before reusing this draft.
 var pipeline = [
     {"$match": {"gcTextReco": {"$exists": true}}},
     {"$project": {
         "_id": 0,
         "durationStart": {"$concat": ["$description.date", "T00:00:00Z"]},
         "wid": {"$toString": parseInt({"$substr": [{"$toString": "$_id"}, 20, -1]}, 16)},
         "creator": {$ifNull: ["$description.details.Produkcja", "null"]},
         "originalDate": "$description.date",
         "contents": ["$gcTextReco.transcript"  + "_fix"],
         "url": "$url",
         "title": "$title",
         "durationEnd": {"$concat": ["$description.date", "T23:59:59Z"]}
     }},
     {$out: "export"}
 ];
 // Draft 2: "wid" prefixed with "Filmoteka_", raw transcript, first 2 results only.
 // NOTE(review): same shell-side parseInt(...) concern as in draft 1.
 var pipeline = [
     {"$match": {"gcTextReco": {"$exists": true}}},
     {"$project": {
         "_id": 0,
         "durationStart": {"$concat": ["$description.date", "T00:00:00Z"]},
         "wid": {"$concat": ["Filmoteka_", {"$toString": parseInt({"$substr": [{"$toString": "$_id"}, 20, -1]}, 16)}]},
         "creator": {$ifNull: ["$description.details.Produkcja", "null"]},
         "originalDate": "$description.date",
         "contents": ["$gcTextReco.transcript"],
         "url": "$url",
         "title": "$title",
         "durationEnd": {"$concat": ["$description.date", "T23:59:59Z"]}
     }},
     {$limit: 2}
 ];
 // Draft 3 (the one in effect): "wid" is "Filmoteka_" + the stringified _id;
 // output goes to the "export3" collection.
 var pipeline = [
     {"$match": {"gcTextReco": {"$exists": true}}},
     {"$project": {
         "_id": 0,
         "durationStart": {"$concat": ["$description.date", "T00:00:00Z"]},
         "wid": {"$concat": ["Filmoteka_", {"$toString": "$_id"}]},
         "creator": {$ifNull: ["$description.details.Produkcja", "null"]},
         "originalDate": "$description.date",
         "contents": ["$gcTextReco.transcript_fix"],
         "url": "$url",
         "title": "$title",
         "durationEnd": {"$concat": ["$description.date", "T23:59:59Z"]}
     }},
     {$out: "export3"}
 ];
 db.moviesMeta.aggregate(pipeline);