some cleanups
This commit is contained in:
parent
a9f6b1f406
commit
d7434248d2
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
__pycache__
|
43
requirements.txt
Normal file
43
requirements.txt
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
attrs==19.3.0
|
||||||
|
Automat==0.8.0
|
||||||
|
cachetools==3.1.1
|
||||||
|
certifi==2019.11.28
|
||||||
|
cffi==1.13.2
|
||||||
|
chardet==3.0.4
|
||||||
|
constantly==15.1.0
|
||||||
|
cryptography==2.8
|
||||||
|
cssselect==1.1.0
|
||||||
|
google-api-core==1.14.3
|
||||||
|
google-auth==1.8.2
|
||||||
|
google-cloud==0.34.0
|
||||||
|
google-cloud-core==1.1.0
|
||||||
|
google-cloud-speech==1.3.1
|
||||||
|
google-cloud-storage==1.23.0
|
||||||
|
google-resumable-media==0.5.0
|
||||||
|
googleapis-common-protos==1.6.0
|
||||||
|
grpcio==1.26.0
|
||||||
|
hyperlink==19.0.0
|
||||||
|
idna==2.8
|
||||||
|
incremental==17.5.0
|
||||||
|
lxml==4.4.1
|
||||||
|
parsel==1.5.2
|
||||||
|
Protego==0.1.15
|
||||||
|
protobuf==3.11.1
|
||||||
|
pyasn1==0.4.8
|
||||||
|
pyasn1-modules==0.2.7
|
||||||
|
pycparser==2.19
|
||||||
|
PyDispatcher==2.0.5
|
||||||
|
PyHamcrest==1.9.0
|
||||||
|
pymongo==3.10.0
|
||||||
|
pyOpenSSL==19.0.0
|
||||||
|
pytz==2019.3
|
||||||
|
queuelib==1.5.0
|
||||||
|
requests==2.22.0
|
||||||
|
rsa==4.0
|
||||||
|
Scrapy==1.8.0
|
||||||
|
service-identity==18.1.0
|
||||||
|
six==1.13.0
|
||||||
|
Twisted==19.10.0
|
||||||
|
urllib3==1.25.7
|
||||||
|
w3lib==1.21.0
|
||||||
|
zope.interface==4.7.1
|
5903
resources/dbSamples/archSpeechReco.json
Normal file
5903
resources/dbSamples/archSpeechReco.json
Normal file
File diff suppressed because one or more lines are too long
1
resources/dbSamples/archSpeechRecoArray.json
Normal file
1
resources/dbSamples/archSpeechRecoArray.json
Normal file
File diff suppressed because one or more lines are too long
28
src/fix_transcript.py
Normal file
28
src/fix_transcript.py
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
from storageUpload import getMongoCollection
|
||||||
|
|
||||||
|
|
||||||
|
# Rebuild a plain-text transcript for every document that has a
# ``gcTextReco.words`` list and store it under ``gcTextReco.transcript_fix``.

# NOTE(review): the credentials are hard-coded in the connection URI; consider
# reading them from the environment or from a config file kept out of git.
mongoUri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
dbName = "archSpeechReco"
colName = "moviesMeta"

col = getMongoCollection(colName, dbName, mongoUri)

# Keep only documents that have a ``gcTextReco.words`` field and project just
# that field.
pipeline = [
    {'$match': {'gcTextReco.words': {'$exists': True}}},
    {'$project': {'gcTextReco.words': 1}},
]

# Run the aggregation exactly once; an extra call whose result is discarded
# would only make the server execute the whole pipeline a second time.
words = col.aggregate(pipeline)

# Collect {_id: transcript} for every returned document before issuing any
# updates. Each entry of ``words`` is expected to carry a 'word' key.
words_dict = dict()
for w in words:
    ww = w['gcTextReco']['words']
    words_dict[w['_id']] = " ".join(e['word'] for e in ww)

for key, value in words_dict.items():
    try:
        col.update_one({"_id": key}, {"$set": {"gcTextReco.transcript_fix": value}})
    except Exception as e:
        # Best-effort: report the failure and continue with the next document.
        print(e)
    else:
        print(f"mongo update OK {key}")
|
8
src/gcs2scp.sh
Normal file
8
src/gcs2scp.sh
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
#!/bin/bash
# Copy the matching WAV objects from the GCS bucket to the remote host one at
# a time: each object is downloaded into ./temp, sent with scp, then removed
# locally before the next one is fetched.

# The staging directory has to exist before gsutil can copy into it.
mkdir -p ./temp

# The wildcard is quoted so it is expanded by gsutil, not by the local shell.
for i in $(gsutil ls 'gs://archspeechreco/wave/5df3e63c4c0402698d782e*')
do
    # "cp" is a gsutil sub-command; written as "-cp" it is parsed as option
    # flags and no copy is performed.
    gsutil cp "$i" ./temp/
    # NOTE(review): the key path is relative, so this presumably has to be run
    # from the directory that contains .ssh/ -- confirm.
    scp -P 2222 -i .ssh/id_rsa_mongo ./temp/* wojciechs@gonito.net:~/
    rm -f ./temp/*
done
|
44
src/temp
Normal file
44
src/temp
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
mongoUri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
|
||||||
|
dbName = "archSpeechReco"
|
||||||
|
colName = "moviesMeta"
|
||||||
|
|
||||||
|
def getMongoCollection(colName,dbName,uri):
    """Return the collection ``colName`` from database ``dbName`` at ``uri``.

    A new ``MongoClient`` is created on every call, with ``maxPoolSize=512``.
    """
    return MongoClient(uri, maxPoolSize=512)[dbName][colName]
|
||||||
|
|
||||||
|
col = getMongoCollection(colName,dbName,mongoUri)
|
||||||
|
|
||||||
|
col.aggregate(pipeline)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
pipeline = [{'$match': {'gcTextReco': {'$exists': True}}}, {'$project': {'gcTextReco.words': 1}}, {'$limit': 10}]
|
||||||
|
|
||||||
|
words = col.aggregate(pipeline)
|
||||||
|
for w in words:
|
||||||
|
ww = w['gcTextReco']['words']
|
||||||
|
words_dict[w['_id']] = " ".join([ e['word'] for e in ww ])
|
||||||
|
|
||||||
|
|
||||||
|
for key, value in words_dict.items():
|
||||||
|
try:
|
||||||
|
col.update_one(
|
||||||
|
col.update_one({"_id": key},{"$set":{"gcTextReco.transcript_fix":value}})
|
||||||
|
|
||||||
|
except Exception as e: print(e)
|
||||||
|
else:
|
||||||
|
print(f"mongo update OK {uri.split('/')[4].split('.')[0]}")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}}, {"$project": {"_id":0, "durationStart": { "$concat": ["$description.date", "T00:00:00Z"] }, "wid": {"$toString": parseInt({"$substr": [ {"$toString": "$_id"}, 20, -1]},16) }, "creator": { $ifNull: [ "$description.details.Produkcja", "null" ]}, "originalDate": "$description.date", "contents": ["$gcTextReco.transcript_fix"], "url":"$url", "title":"$title", "durationEnd": { "$concat": ["$description.date", "T23:59:59Z"] }}}, {$out: "export"} ]
|
||||||
|
|
||||||
|
|
||||||
|
var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}}, {"$project": {"_id":0, "durationStart": { "$concat": ["$description.date", "T00:00:00Z"] }, "wid":{ "$concat": ["Filmoteka_", {"$toString": parseInt({"$substr": [ {"$toString": "$_id"}, 20, -1]},16) } ] }, "creator": { $ifNull: [ "$description.details.Produkcja", "null" ]}, "originalDate": "$description.date", "contents": ["$gcTextReco.transcript"], "url":"$url", "title":"$title", "durationEnd": { "$concat": ["$description.date", "T23:59:59Z"] }}}, {$limit: 2} ]
|
||||||
|
|
||||||
|
|
||||||
|
var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}}, {"$project": {"_id":0, "durationStart": { "$concat": ["$description.date", "T00:00:00Z"] }, "wid":{ "$concat": ["Filmoteka_", {"$toString": "$_id"} ] }, "creator": { $ifNull: [ "$description.details.Produkcja", "null" ]}, "originalDate": "$description.date", "contents": ["$gcTextReco.transcript_fix"], "url":"$url", "title":"$title", "durationEnd": { "$concat": ["$description.date", "T23:59:59Z"] }}}, {$out: "export3"} ]
|
||||||
|
db.moviesMeta.aggregate(pipeline)
|
Loading…
Reference in New Issue
Block a user