some cleanups

2020-06-05 18:08:03 +00:00 · 2020-06-05 18:08:03 +00:00 · d7434248d2
commit d7434248d2
parent a9f6b1f406
7 changed files with 6028 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,43 @@
+attrs==19.3.0
+Automat==0.8.0
+cachetools==3.1.1
+certifi==2019.11.28
+cffi==1.13.2
+chardet==3.0.4
+constantly==15.1.0
+cryptography==2.8
+cssselect==1.1.0
+google-api-core==1.14.3
+google-auth==1.8.2
+google-cloud==0.34.0
+google-cloud-core==1.1.0
+google-cloud-speech==1.3.1
+google-cloud-storage==1.23.0
+google-resumable-media==0.5.0
+googleapis-common-protos==1.6.0
+grpcio==1.26.0
+hyperlink==19.0.0
+idna==2.8
+incremental==17.5.0
+lxml==4.4.1
+parsel==1.5.2
+Protego==0.1.15
+protobuf==3.11.1
+pyasn1==0.4.8
+pyasn1-modules==0.2.7
+pycparser==2.19
+PyDispatcher==2.0.5
+PyHamcrest==1.9.0
+pymongo==3.10.0
+pyOpenSSL==19.0.0
+pytz==2019.3
+queuelib==1.5.0
+requests==2.22.0
+rsa==4.0
+Scrapy==1.8.0
+service-identity==18.1.0
+six==1.13.0
+Twisted==19.10.0
+urllib3==1.25.7
+w3lib==1.21.0
+zope.interface==4.7.1
--- a/resources/dbSamples/archSpeechReco.json
+++ b/resources/dbSamples/archSpeechReco.json
--- a/resources/dbSamples/archSpeechRecoArray.json
+++ b/resources/dbSamples/archSpeechRecoArray.json
--- a/src/fix_transcript.py
+++ b/src/fix_transcript.py
@@ -0,0 +1,28 @@
+from storageUpload import getMongoCollection
+
+
+mongoUri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
+dbName = "archSpeechReco"
+colName = "moviesMeta"
+
+
+col = getMongoCollection(colName,dbName,mongoUri)
+
+pipeline = [{'$match': {'gcTextReco.words': {'$exists': True}}}, {'$project': {'gcTextReco.words': 1}}]
+col.aggregate(pipeline)
+
+
+words = col.aggregate(pipeline)
+words_dict = dict()
+for w in words:
+    ww = w['gcTextReco']['words']
+    words_dict[w['_id']] = " ".join([ e['word'] for e in ww ])
+
+
+for key, value in words_dict.items():
+    try:
+        col.update_one({"_id": key},{"$set":{"gcTextReco.transcript_fix":value}})
+    except Exception as e:
+        print(e)
+    else:
+        print(f"mongo update OK {key}")
--- a/src/gcs2scp.sh
+++ b/src/gcs2scp.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+for i in `gsutil ls gs://archspeechreco/wave/5df3e63c4c0402698d782e*`
+do 
+	gsutil -cp $i ./temp/
+	scp -P 2222 -i .ssh/id_rsa_mongo ./temp/* wojciechs@gonito.net:~/
+	rm -f ./temp/*
+done
--- a/src/temp
+++ b/src/temp
@@ -0,0 +1,44 @@
+mongoUri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
+dbName = "archSpeechReco"
+colName = "moviesMeta"
+# Return the collection db[colName] from a MongoClient created with maxPoolSize=512.
+def getMongoCollection(colName,dbName,uri):
+    client = MongoClient(uri,maxPoolSize=512)
+    db = client[dbName]
+    col = db[colName]
+
+    return col
+# NOTE(review): scratch draft; MongoClient is never imported in this file.
+col = getMongoCollection(colName,dbName,mongoUri)
+# NOTE(review): pipeline is used on the next line before it is assigned further down.
+col.aggregate(pipeline)
+
+
+# Match documents that have gcTextReco, keep only gcTextReco.words, cap at 10 results.
+pipeline = [{'$match': {'gcTextReco': {'$exists': True}}}, {'$project': {'gcTextReco.words': 1}}, {'$limit': 10}]
+# NOTE(review): words_dict is filled below but never initialised in this fragment.
+words = col.aggregate(pipeline)
+for w in words:
+	ww = w['gcTextReco']['words']
+	words_dict[w['_id']] = " ".join([ e['word'] for e in ww ])
+
+# Write each space-joined transcript back as gcTextReco.transcript_fix.
+for key, value in words_dict.items():
+	try:
+        	col.update_one(
+			col.update_one({"_id": key},{"$set":{"gcTextReco.transcript_fix":value}})
+# NOTE(review): the outer update_one( above is never closed, except/else are not aligned with try, and uri is undefined here.
+except Exception as e: print(e)
+else:
+        print(f"mongo update OK {uri.split('/')[4].split('.')[0]}")
+
+// Mongo-shell drafts of the export pipeline. Each "var pipeline" overwrites the previous one, so only the last is passed to aggregate() at the bottom.
+// Draft 1: wid built with parseInt(..., 16) around a $substr of the stringified _id; contents from transcript_fix; $out to "export".
+var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}},                 {"$project":                   {"_id":0,                    "durationStart": { "$concat": ["$description.date", "T00:00:00Z"] },                    "wid": {"$toString": parseInt({"$substr": [ {"$toString": "$_id"}, 20, -1]},16) },                    "creator": { $ifNull: [ "$description.details.Produkcja", "null" ]},                    "originalDate": "$description.date",                     "contents": ["$gcTextReco.transcript_fix"],                    "url":"$url",                    "title":"$title",                    "durationEnd": { "$concat": ["$description.date", "T23:59:59Z"] }}},                 {$out: "export"}                  ]
+// NOTE(review): parseInt({...},16) is evaluated by the shell on an object literal, not by the server; presumably it yields NaN. Verify.
+// Draft 2: same parseInt-based wid prefixed with "Filmoteka_"; contents from gcTextReco.transcript; $limit 2 and no $out.
+var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}},                 {"$project":                   {"_id":0,                    "durationStart": { "$concat": ["$description.date", "T00:00:00Z"] },                    "wid":{ "$concat": ["Filmoteka_", {"$toString": parseInt({"$substr": [ {"$toString": "$_id"}, 20, -1]},16) } ] },                    "creator": { $ifNull: [ "$description.details.Produkcja", "null" ]},                    "originalDate": "$description.date",                     "contents": ["$gcTextReco.transcript"],                    "url":"$url",                    "title":"$title",                    "durationEnd": { "$concat": ["$description.date", "T23:59:59Z"] }}},                 {$limit: 2}                  ]
+
+// Draft 3 (the definition in effect when aggregate() runs): wid is "Filmoteka_" plus the full stringified _id; contents from transcript_fix; $out to "export3".
+var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}},                 {"$project":                   {"_id":0,                    "durationStart": { "$concat": ["$description.date", "T00:00:00Z"] },                    "wid":{ "$concat": ["Filmoteka_", {"$toString": "$_id"} ] },                    "creator": { $ifNull: [ "$description.details.Produkcja", "null" ]},                    "originalDate": "$description.date",                     "contents": ["$gcTextReco.transcript_fix"],                    "url":"$url",                    "title":"$title",                    "durationEnd": { "$concat": ["$description.date", "T23:59:59Z"] }}},                 {$out: "export3"}                  ]
+db.moviesMeta.aggregate(pipeline)