some cleanups

This commit is contained in:
Wojtek 2020-06-05 18:08:03 +00:00
parent a9f6b1f406
commit d7434248d2
7 changed files with 6028 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
__pycache__

43
requirements.txt Normal file
View File

@ -0,0 +1,43 @@
attrs==19.3.0
Automat==0.8.0
cachetools==3.1.1
certifi==2019.11.28
cffi==1.13.2
chardet==3.0.4
constantly==15.1.0
cryptography==2.8
cssselect==1.1.0
google-api-core==1.14.3
google-auth==1.8.2
google-cloud==0.34.0
google-cloud-core==1.1.0
google-cloud-speech==1.3.1
google-cloud-storage==1.23.0
google-resumable-media==0.5.0
googleapis-common-protos==1.6.0
grpcio==1.26.0
hyperlink==19.0.0
idna==2.8
incremental==17.5.0
lxml==4.4.1
parsel==1.5.2
Protego==0.1.15
protobuf==3.11.1
pyasn1==0.4.8
pyasn1-modules==0.2.7
pycparser==2.19
PyDispatcher==2.0.5
PyHamcrest==1.9.0
pymongo==3.10.0
pyOpenSSL==19.0.0
pytz==2019.3
queuelib==1.5.0
requests==2.22.0
rsa==4.0
Scrapy==1.8.0
service-identity==18.1.0
six==1.13.0
Twisted==19.10.0
urllib3==1.25.7
w3lib==1.21.0
zope.interface==4.7.1

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

28
src/fix_transcript.py Normal file
View File

@ -0,0 +1,28 @@
from storageUpload import getMongoCollection
# One-off maintenance script: rebuilds a plain-text transcript for every
# document that has per-word recognition output and stores it under
# gcTextReco.transcript_fix.
#
# NOTE(review): the connection string embeds a username and password in
# source control; consider reading it from the environment instead.
mongoUri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
dbName = "archSpeechReco"
colName = "moviesMeta"
col = getMongoCollection(colName, dbName, mongoUri)

# Only documents that actually contain gcTextReco.words are selected, and
# only that field (plus _id) is returned.
pipeline = [
    {'$match': {'gcTextReco.words': {'$exists': True}}},
    {'$project': {'gcTextReco.words': 1}},
]

# Run the aggregation exactly once; a second, discarded call would only
# repeat the same query against the server.
words = col.aggregate(pipeline)

# First pass: collect _id -> transcript, joining the 'word' entries with
# single spaces in the order they are stored.
words_dict = dict()
for w in words:
    ww = w['gcTextReco']['words']
    words_dict[w['_id']] = " ".join(e['word'] for e in ww)

# Second pass: write each transcript back. A failed update is printed and
# the loop carries on with the remaining documents.
for key, value in words_dict.items():
    try:
        col.update_one({"_id": key}, {"$set": {"gcTextReco.transcript_fix": value}})
    except Exception as e:
        print(e)
    else:
        print(f"mongo update OK {key}")

8
src/gcs2scp.sh Normal file
View File

@ -0,0 +1,8 @@
#!/bin/bash
# Copies every object matching the wave/5df3e63c4c0402698d782e* prefix from
# the archspeechreco bucket to the remote host, one object at a time, using
# ./temp as a local staging directory that is emptied after each transfer.

# The staging directory must exist before gsutil can copy into it.
mkdir -p ./temp

# The URL is quoted so the local shell does not try to expand the wildcard;
# gsutil resolves it against the bucket.
for i in $(gsutil ls 'gs://archspeechreco/wave/5df3e63c4c0402698d782e*')
do
    # "cp" is the gsutil subcommand; "-cp" is not a valid gsutil invocation.
    if ! gsutil cp "$i" ./temp/; then
        echo "download failed: $i" >&2
        continue
    fi
    scp -P 2222 -i .ssh/id_rsa_mongo ./temp/* wojciechs@gonito.net:~/
    rm -f ./temp/*
done

44
src/temp Normal file
View File

@ -0,0 +1,44 @@
# Connection settings for the MongoDB instance on localhost that the
# snippets below operate on.
# NOTE(review): the URI contains a username and password in plain text --
# confirm that this scratch file is meant to be committed.
mongoUri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
# Database name; it is also the last path component of the URI above.
dbName = "archSpeechReco"
# Collection that the aggregation and update snippets read and write.
colName = "moviesMeta"
def getMongoCollection(colName, dbName, uri):
    """Return the collection ``colName`` of database ``dbName`` at ``uri``.

    A new ``MongoClient`` is created on every call, with ``maxPoolSize``
    set to 512; the database and collection are then looked up by name.
    """
    return MongoClient(uri, maxPoolSize=512)[dbName][colName]
# Scratch version of the transcript rebuild, restricted to 10 documents.
col = getMongoCollection(colName, dbName, mongoUri)

# The pipeline has to be defined before it is handed to aggregate().
# Matching on gcTextReco.words (rather than gcTextReco alone) keeps out
# documents that have no 'words' array, which would otherwise raise
# KeyError in the loop below.
pipeline = [
    {'$match': {'gcTextReco.words': {'$exists': True}}},
    {'$project': {'gcTextReco.words': 1}},
    {'$limit': 10},
]
words = col.aggregate(pipeline)

# Collect _id -> transcript, joining the 'word' entries with single spaces.
words_dict = {}
for w in words:
    ww = w['gcTextReco']['words']
    words_dict[w['_id']] = " ".join(e['word'] for e in ww)

# Write each transcript back; a failed update is printed and the loop
# continues with the next document.
for key, value in words_dict.items():
    try:
        col.update_one({"_id": key}, {"$set": {"gcTextReco.transcript_fix": value}})
    except Exception as e:
        print(e)
    else:
        # Report the document id; no 'uri' variable exists at this scope.
        print(f"mongo update OK {key}")
// Mongo shell snippets. Each `var pipeline = ...` below overwrites the
// previous one, so the aggregate() call at the end runs only the last
// definition.
//
// Variant 1: for documents that have gcTextReco, projects the export fields
// (contents taken from gcTextReco.transcript_fix) and writes the result to
// the "export" collection.
// NOTE(review): parseInt(...) is a shell-side JavaScript call applied to an
// object literal, not an aggregation operator -- it looks like it is
// evaluated before the pipeline is sent to the server; confirm what "wid"
// ends up containing.
var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}}, {"$project": {"_id":0, "durationStart": { "$concat": ["$description.date", "T00:00:00Z"] }, "wid": {"$toString": parseInt({"$substr": [ {"$toString": "$_id"}, 20, -1]},16) }, "creator": { $ifNull: [ "$description.details.Produkcja", "null" ]}, "originalDate": "$description.date", "contents": ["$gcTextReco.transcript_fix"], "url":"$url", "title":"$title", "durationEnd": { "$concat": ["$description.date", "T23:59:59Z"] }}}, {$out: "export"} ]
// Variant 2: same projection, but "wid" is prefixed with "Filmoteka_",
// contents comes from gcTextReco.transcript, and the output is limited to
// 2 documents instead of being written to a collection. The same parseInt
// caveat applies.
var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}}, {"$project": {"_id":0, "durationStart": { "$concat": ["$description.date", "T00:00:00Z"] }, "wid":{ "$concat": ["Filmoteka_", {"$toString": parseInt({"$substr": [ {"$toString": "$_id"}, 20, -1]},16) } ] }, "creator": { $ifNull: [ "$description.details.Produkcja", "null" ]}, "originalDate": "$description.date", "contents": ["$gcTextReco.transcript"], "url":"$url", "title":"$title", "durationEnd": { "$concat": ["$description.date", "T23:59:59Z"] }}}, {$limit: 2} ]
// Variant 3 (the one that actually runs): "wid" is "Filmoteka_" followed by
// the string form of _id, contents comes from gcTextReco.transcript_fix,
// and the result is written to the "export3" collection.
var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}}, {"$project": {"_id":0, "durationStart": { "$concat": ["$description.date", "T00:00:00Z"] }, "wid":{ "$concat": ["Filmoteka_", {"$toString": "$_id"} ] }, "creator": { $ifNull: [ "$description.details.Produkcja", "null" ]}, "originalDate": "$description.date", "contents": ["$gcTextReco.transcript_fix"], "url":"$url", "title":"$title", "durationEnd": { "$concat": ["$description.date", "T23:59:59Z"] }}}, {$out: "export3"} ]
// Execute the current pipeline against the moviesMeta collection.
db.moviesMeta.aggregate(pipeline)