cleanups #3
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
__pycache__
|
43
requirements.txt
Normal file
43
requirements.txt
Normal file
@ -0,0 +1,43 @@
|
||||
attrs==19.3.0
|
||||
Automat==0.8.0
|
||||
cachetools==3.1.1
|
||||
certifi==2019.11.28
|
||||
cffi==1.13.2
|
||||
chardet==3.0.4
|
||||
constantly==15.1.0
|
||||
cryptography==2.8
|
||||
cssselect==1.1.0
|
||||
google-api-core==1.14.3
|
||||
google-auth==1.8.2
|
||||
google-cloud==0.34.0
|
||||
google-cloud-core==1.1.0
|
||||
google-cloud-speech==1.3.1
|
||||
google-cloud-storage==1.23.0
|
||||
google-resumable-media==0.5.0
|
||||
googleapis-common-protos==1.6.0
|
||||
grpcio==1.26.0
|
||||
hyperlink==19.0.0
|
||||
idna==2.8
|
||||
incremental==17.5.0
|
||||
lxml==4.4.1
|
||||
parsel==1.5.2
|
||||
Protego==0.1.15
|
||||
protobuf==3.11.1
|
||||
pyasn1==0.4.8
|
||||
pyasn1-modules==0.2.7
|
||||
pycparser==2.19
|
||||
PyDispatcher==2.0.5
|
||||
PyHamcrest==1.9.0
|
||||
pymongo==3.10.0
|
||||
pyOpenSSL==19.0.0
|
||||
pytz==2019.3
|
||||
queuelib==1.5.0
|
||||
requests==2.22.0
|
||||
rsa==4.0
|
||||
Scrapy==1.8.0
|
||||
service-identity==18.1.0
|
||||
six==1.13.0
|
||||
Twisted==19.10.0
|
||||
urllib3==1.25.7
|
||||
w3lib==1.21.0
|
||||
zope.interface==4.7.1
|
5903
resources/dbSamples/archSpeechReco.json
Normal file
5903
resources/dbSamples/archSpeechReco.json
Normal file
File diff suppressed because one or more lines are too long
1
resources/dbSamples/archSpeechRecoArray.json
Normal file
1
resources/dbSamples/archSpeechRecoArray.json
Normal file
File diff suppressed because one or more lines are too long
28
src/fix_transcript.py
Normal file
28
src/fix_transcript.py
Normal file
@ -0,0 +1,28 @@
|
||||
from storageUpload import getMongoCollection
|
||||
|
||||
|
||||
# Rebuild a plain-text transcript for every document that has word-level
# recognition results and store it under ``gcTextReco.transcript_fix``.

# NOTE(review): the credentials are hard-coded in the URI; consider reading
# them from the environment or an untracked config file.
mongoUri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
dbName = "archSpeechReco"
colName = "moviesMeta"

col = getMongoCollection(colName, dbName, mongoUri)

# Keep only documents that carry ``gcTextReco.words`` and fetch just that
# field.
pipeline = [
    {'$match': {'gcTextReco.words': {'$exists': True}}},
    {'$project': {'gcTextReco.words': 1}},
]

# Run the aggregation exactly once; a second call whose cursor is never read
# only costs an extra round trip to the server.
words = col.aggregate(pipeline)

# Maps each document ``_id`` to its words joined with single spaces.
words_dict = {
    w['_id']: " ".join(e['word'] for e in w['gcTextReco']['words'])
    for w in words
}

for key, value in words_dict.items():
    try:
        col.update_one({"_id": key}, {"$set": {"gcTextReco.transcript_fix": value}})
    except Exception as e:
        # Best-effort: report the failure and carry on with the next document.
        print(e)
    else:
        print(f"mongo update OK {key}")
|
8
src/gcs2scp.sh
Normal file
8
src/gcs2scp.sh
Normal file
@ -0,0 +1,8 @@
|
||||
#!/bin/bash
# For every object under the given GCS prefix: download it into ./temp,
# scp the contents of ./temp to the remote host, then empty ./temp again.

# Make sure the staging directory exists before the first download.
mkdir -p ./temp

# The pattern is quoted so gsutil, not the local shell, expands the wildcard.
for i in $(gsutil ls 'gs://archspeechreco/wave/5df3e63c4c0402698d782e*')
do
    # "cp" is a gsutil subcommand, not an option ("-cp" is rejected).
    gsutil cp "$i" ./temp/
    scp -P 2222 -i .ssh/id_rsa_mongo ./temp/* wojciechs@gonito.net:~/
    rm -f ./temp/*
done
|
44
src/temp
Normal file
44
src/temp
Normal file
@ -0,0 +1,44 @@
|
||||
mongoUri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
|
||||
dbName = "archSpeechReco"
|
||||
colName = "moviesMeta"
|
||||
|
||||
def getMongoCollection(colName,dbName,uri):
    """Return collection ``colName`` from database ``dbName`` reached via ``uri``.

    A new ``MongoClient`` is created on every call with ``maxPoolSize=512``.
    """
    database = MongoClient(uri, maxPoolSize=512)[dbName]
    return database[colName]
|
||||
|
||||
col = getMongoCollection(colName, dbName, mongoUri)

# The pipeline must exist before it is passed to ``aggregate``. Matching on
# ``gcTextReco.words`` (as src/fix_transcript.py does) keeps out documents
# that would raise KeyError on the ``['words']`` lookup further down.
pipeline = [
    {'$match': {'gcTextReco.words': {'$exists': True}}},
    {'$project': {'gcTextReco.words': 1}},
    {'$limit': 10},
]

words = col.aggregate(pipeline)

# Maps each document ``_id`` to its words joined with single spaces.
words_dict = {}
for w in words:
    ww = w['gcTextReco']['words']
    words_dict[w['_id']] = " ".join(e['word'] for e in ww)

for key, value in words_dict.items():
    try:
        col.update_one({"_id": key}, {"$set": {"gcTextReco.transcript_fix": value}})
    except Exception as e:
        # Best-effort: report the failure and carry on with the next document.
        print(e)
    else:
        # ``uri`` is not defined at this level; report the document id instead.
        print(f"mongo update OK {key}")
|
||||
|
||||
|
||||
|
||||
// mongo-shell export pipelines. Each `var pipeline` rebinds the same name, so
// when the snippet is run top to bottom only the last definition reaches the
// aggregate() call at the end.

// Variant 1: numeric-looking "wid", transcript_fix as contents, written to
// the "export" collection.
// NOTE(review): parseInt(...) is a shell-side call on an object literal, not
// an aggregation operator; confirm it yields the intended value.
var pipeline = [
    {"$match": {"gcTextReco": {"$exists": true}}},
    {"$project": {
        "_id": 0,
        "durationStart": {"$concat": ["$description.date", "T00:00:00Z"]},
        "wid": {"$toString": parseInt({"$substr": [{"$toString": "$_id"}, 20, -1]}, 16)},
        "creator": {$ifNull: ["$description.details.Produkcja", "null"]},
        "originalDate": "$description.date",
        "contents": ["$gcTextReco.transcript"+"_fix"],
        "url": "$url",
        "title": "$title",
        "durationEnd": {"$concat": ["$description.date", "T23:59:59Z"]}
    }},
    {$out: "export"}
]

// Variant 2: "wid" prefixed with "Filmoteka_", original transcript as
// contents, limited to two documents and not written anywhere.
var pipeline = [
    {"$match": {"gcTextReco": {"$exists": true}}},
    {"$project": {
        "_id": 0,
        "durationStart": {"$concat": ["$description.date", "T00:00:00Z"]},
        "wid": {"$concat": ["Filmoteka_", {"$toString": parseInt({"$substr": [{"$toString": "$_id"}, 20, -1]}, 16)}]},
        "creator": {$ifNull: ["$description.details.Produkcja", "null"]},
        "originalDate": "$description.date",
        "contents": ["$gcTextReco.transcript"],
        "url": "$url",
        "title": "$title",
        "durationEnd": {"$concat": ["$description.date", "T23:59:59Z"]}
    }},
    {$limit: 2}
]

// Variant 3: "wid" is "Filmoteka_" plus the stringified _id, transcript_fix
// as contents, written to the "export3" collection.
var pipeline = [
    {"$match": {"gcTextReco": {"$exists": true}}},
    {"$project": {
        "_id": 0,
        "durationStart": {"$concat": ["$description.date", "T00:00:00Z"]},
        "wid": {"$concat": ["Filmoteka_", {"$toString": "$_id"}]},
        "creator": {$ifNull: ["$description.details.Produkcja", "null"]},
        "originalDate": "$description.date",
        "contents": ["$gcTextReco.transcript_fix"],
        "url": "$url",
        "title": "$title",
        "durationEnd": {"$concat": ["$description.date", "T23:59:59Z"]}
    }},
    {$out: "export3"}
]

db.moviesMeta.aggregate(pipeline)
|
Loading…
Reference in New Issue
Block a user