archSpeechReco/src/yt_toMongo.py

72 lines
2.6 KiB
Python
Raw Normal View History

2020-08-19 20:10:27 +02:00
from mongo.helpers import get_mongo_collection
import argparse
import logging
from glob import glob
from os import path
2020-08-20 20:03:54 +02:00
import re
from datetime import datetime
2020-08-19 20:10:27 +02:00
def _extract_video_date(video_title):
    """Return the date mentioned in *video_title*, as a string.

    Tries a full ``dd.mm.19yy`` date first and returns it reformatted as
    ``YYYY-MM-DD``. If there is no such date, or the digits do not form a
    real calendar date, falls back to a bare ``19yy`` year. Returns the
    placeholder ``"brak daty"`` when neither is present.
    """
    m = re.match(r".*(\d\d\.\d\d\.19\d\d).*", video_title)
    if m is not None:
        try:
            return datetime.strptime(m[1], '%d.%m.%Y').strftime('%Y-%m-%d')
        except ValueError:
            # Looks like a date but is not one (e.g. "32.13.1975");
            # fall through to the year-only match instead of crashing.
            pass
    m = re.match(r".*(19\d\d).*", video_title)
    return m[1] if m is not None else "brak daty"


def main(args):
    """Insert one metadata document per downloaded YouTube video into MongoDB.

    For every ``*.description`` file in ``args.input`` the sibling ``.title``
    file is read, the video id is taken from the file name, and a document
    with the title, the YouTube URL, the source and the expected mp4/wav
    locations is inserted into the ``moviesMeta`` collection of the
    ``archSpeechReco`` database.

    Args:
        args: namespace with the attributes ``loglevel`` (level name, may be
            ``None``), ``input`` (directory holding the ``.description`` and
            ``.title`` files) and ``source`` (stored verbatim in the document).
    """
    # --loglevel may be omitted (None); treat it like an unknown name, which
    # falls back to DEBUG (10).
    loglevel = args.loglevel or ''
    input_path = args.input
    input_source = args.source

    numeric_level = getattr(logging, loglevel.upper(), 10)
    if not isinstance(numeric_level, int):
        # Some upper-case attributes of `logging` are not levels
        # (e.g. BASIC_FORMAT is a string).
        numeric_level = 10
    logging.basicConfig(format='%(asctime)s [%(levelname)s] - %(message)s', level=numeric_level)

    # NOTE(review): credentials are hard-coded in the connection string;
    # consider moving them to configuration or the environment.
    uri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
    db_name = "archSpeechReco"
    col_name = "moviesMeta"
    col = get_mongo_collection(col_name, db_name, uri)

    logging.info("let's start")
    logging.info(input_path)

    for descr_file in glob(f'{input_path}/*.description'):
        # Swap only the extension: a plain str.replace would also rewrite
        # "description" occurring in a directory name or in the file stem.
        title_file = path.splitext(descr_file)[0] + '.title'
        video_id = path.basename(descr_file).split('.')[0]
        video_url = f'https://www.youtube.com/watch?v={video_id}'
        logging.debug('YT URL: %s', video_url)

        # NOTE(review): files are read with the locale's default encoding;
        # confirm whether they are always UTF-8.
        try:
            with open(descr_file) as f:
                video_descr = f.read()
            with open(title_file) as f:
                video_title = f.read()
        except OSError:
            # One unreadable or missing file must not abort the whole batch.
            logging.exception('cannot read metadata files for %s', video_id)
            continue
        logging.debug('Desc: %s', video_descr)
        logging.debug('Title: %s', video_title)

        # NOTE(review): the description and the extracted date are only
        # logged, not stored in the document; confirm this is intended.
        video_date = _extract_video_date(video_title)
        logging.debug('Video Date: %s', video_date)

        # A single timestamp keeps both upload dates identical.
        upload_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        to_mongo = {
            'title': video_title,
            'url': video_url,
            'source': input_source,
            'gcsMp4': {
                'location': f'mp4/{video_id}.mp4',
                'uploadDate': upload_date,
            },
            'gcsWav': {
                'location': f'wave/{video_id}.wav',
                'uploadDate': upload_date,
            },
        }
        try:
            col.insert_one(to_mongo)
        except Exception:
            # Keep going with the remaining files, but record the traceback
            # and let KeyboardInterrupt/SystemExit propagate.
            logging.exception('mongo insert failed')
        else:
            logging.info('mongo insert OK')
            logging.debug('inserted: %s', to_mongo)
2020-08-19 20:10:27 +02:00
if __name__ == '__main__':
    # Command-line entry point: parse the options and hand them to main().
    parser = argparse.ArgumentParser(description='YouTube description to mongo')
    # Without --input the script would silently glob "None/*.description".
    parser.add_argument("--input", required=True,
                        help="input path for .description files")
    parser.add_argument("--source",
                        help="source of media files [dtv, sonda, kronikiprl]")
    # An explicit default keeps main() from receiving None; DEBUG matches the
    # level main() falls back to for unrecognised names.
    parser.add_argument("--loglevel", default="DEBUG",
                        help="log level: DEBUG INFO WARNING ERROR")
    args = parser.parse_args()
    main(args)