45 lines
3.0 KiB
Plaintext
45 lines
3.0 KiB
Plaintext
import os

# Connection settings for the speech-recognition metadata store.
# The URI embeds credentials, so it can be overridden through the MONGO_URI
# environment variable instead of editing this file; the literal below is
# only the fallback used when that variable is not set.
mongoUri = os.environ.get(
    "MONGO_URI",
    "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco",
)
# Database and collection names passed to getMongoCollection().
dbName = "archSpeechReco"
colName = "moviesMeta"
|
|
|
|
def getMongoCollection(colName,dbName,uri):
    """Return a handle to collection ``colName`` in database ``dbName``.

    A new ``MongoClient`` is created on every call, configured with
    ``maxPoolSize=512``.

    Args:
        colName: Name of the collection to look up.
        dbName: Name of the database that holds the collection.
        uri: Connection URI handed to ``MongoClient``.

    Returns:
        The collection object obtained via ``client[dbName][colName]``.
    """
    client = MongoClient(uri, maxPoolSize=512)
    db = client[dbName]
    return db[colName]
|
|
|
|
# Collection handle used by the aggregation and update steps that follow.
# No aggregation is issued at this point: `pipeline` is only assigned further
# down, so calling col.aggregate(pipeline) here raised NameError (and its
# result was discarded anyway).
col = getMongoCollection(colName, dbName, mongoUri)
|
|
|
|
|
|
|
|
# Select documents that have a gcTextReco field, keep only the
# gcTextReco.words sub-field (plus _id), and stop after 10 documents.
pipeline = [
    {'$match': {'gcTextReco': {'$exists': True}}},
    {'$project': {'gcTextReco.words': 1}},
    {'$limit': 10},
]

words = col.aggregate(pipeline)

# Map each document _id to a transcript rebuilt by joining the 'word' value
# of every entry in gcTextReco.words with single spaces.
# The dict is created here; it was previously assigned into without ever
# being initialised.
words_dict = {}
for w in words:
    ww = w['gcTextReco']['words']
    words_dict[w['_id']] = " ".join(e['word'] for e in ww)
|
|
|
|
|
|
# Write every rebuilt transcript back to its document under
# gcTextReco.transcript_fix, one update per _id.
for key, value in words_dict.items():
    try:
        col.update_one(
            {"_id": key},
            {"$set": {"gcTextReco.transcript_fix": value}},
        )
    except Exception as e:
        # Best effort: report the failure and carry on with the next document.
        print(e)
    else:
        # Report the document id; the previous message read an undefined
        # `uri` name and indexed past the end of its '/'-split parts.
        print(f"mongo update OK {key}")
|
|
|
|
|
|
|
|
// Aggregation pipeline (mongo shell): keeps documents that have a gcTextReco
// field, projects durationStart/durationEnd (description.date concatenated
// with a start-of-day / end-of-day time suffix), wid, creator (falls back to
// the string "null" when description.details.Produkcja is null or missing),
// originalDate, contents (gcTextReco.transcript_fix wrapped in an array), url
// and title, and writes the result to the "export" collection via $out.
// NOTE(review): parseInt(...) is called here with an object literal as its
// argument while this array is being built, so it does not operate on each
// document's _id; it most likely evaluates to NaN, which would make "wid" the
// same for every document — confirm before relying on this variant.
var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}}, {"$project": {"_id":0, "durationStart": { "$concat": ["$description.date", "T00:00:00Z"] }, "wid": {"$toString": parseInt({"$substr": [ {"$toString": "$_id"}, 20, -1]},16) }, "creator": { $ifNull: [ "$description.details.Produkcja", "null" ]}, "originalDate": "$description.date", "contents": ["$gcTextReco.transcript_fix"], "url":"$url", "title":"$title", "durationEnd": { "$concat": ["$description.date", "T23:59:59Z"] }}}, {$out: "export"} ]
|
|
|
|
|
|
// Aggregation pipeline (mongo shell): same $match and projected field names
// as an export pipeline, but "wid" is the prefix "Filmoteka_" concatenated
// with a stringified value, "contents" is taken from gcTextReco.transcript,
// and the final stage is $limit 2 (no $out stage, so nothing is written).
// NOTE(review): parseInt(...) is called here with an object literal as its
// argument while this array is being built, so it does not operate on each
// document's _id; it most likely evaluates to NaN — confirm before relying on
// the "wid" value produced by this variant.
var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}}, {"$project": {"_id":0, "durationStart": { "$concat": ["$description.date", "T00:00:00Z"] }, "wid":{ "$concat": ["Filmoteka_", {"$toString": parseInt({"$substr": [ {"$toString": "$_id"}, 20, -1]},16) } ] }, "creator": { $ifNull: [ "$description.details.Produkcja", "null" ]}, "originalDate": "$description.date", "contents": ["$gcTextReco.transcript"], "url":"$url", "title":"$title", "durationEnd": { "$concat": ["$description.date", "T23:59:59Z"] }}}, {$limit: 2} ]
|
|
|
|
|
|
// Export pipeline (mongo shell), run against moviesMeta:
//   1. keep documents that have a gcTextReco field;
//   2. project the export fields, dropping _id; "wid" is "Filmoteka_"
//      followed by the stringified _id, and "creator" falls back to the
//      string "null" when description.details.Produkcja is null or missing;
//   3. write the result to the "export3" collection.
var pipeline = [
    {
        "$match": {
            "gcTextReco": {"$exists": true}
        }
    },
    {
        "$project": {
            "_id": 0,
            "durationStart": {"$concat": ["$description.date", "T00:00:00Z"]},
            "wid": {"$concat": ["Filmoteka_", {"$toString": "$_id"}]},
            "creator": {"$ifNull": ["$description.details.Produkcja", "null"]},
            "originalDate": "$description.date",
            "contents": ["$gcTextReco.transcript_fix"],
            "url": "$url",
            "title": "$title",
            "durationEnd": {"$concat": ["$description.date", "T23:59:59Z"]}
        }
    },
    {
        "$out": "export3"
    }
]

db.moviesMeta.aggregate(pipeline)
|