youtube to mongo
This commit is contained in:
parent
d8040377b5
commit
07c10c4bf9
48
src/temp
48
src/temp
@ -64,4 +64,50 @@ https://www.youtube.com/playlist?list=PLE6CBDC963E1806AD
|
|||||||
youtube-dl -f mp4 -i --id -x --audio-format wav --add-metadata --write-description \
|
youtube-dl -f mp4 -i --id -x --audio-format wav --add-metadata --write-description \
|
||||||
--proxy socks5://localhost:9999/ -k \
|
--proxy socks5://localhost:9999/ -k \
|
||||||
--exec 'gsutil cp {} gs://archspeechreco/wave && rm {} && gsutil cp `echo {} | cut -d "." -f1`.mp4 gs://archspeechreco/mp4 && rm `echo {} | cut -d "." -f1`.mp4' \
|
--exec 'gsutil cp {} gs://archspeechreco/wave && rm {} && gsutil cp `echo {} | cut -d "." -f1`.mp4 gs://archspeechreco/mp4 && rm `echo {} | cut -d "." -f1`.mp4' \
|
||||||
https://www.youtube.com/channel/UCy91ke1yYCZiFdnZ3vTdY_Q/videos
|
https://www.youtube.com/channel/UCy91ke1yYCZiFdnZ3vTdY_Q/videos
|
||||||
|
|
||||||
|
for f in `ls sonda/ | cut -d "." -f1`; do echo gs://archspeechreco/wave/$f.wav; done | gsutil -m cp -I ./sonda_wave/
|
||||||
|
|
||||||
|
for f in `ls | cut -d "." -f1`; do youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=$f > $f.title; done
|
||||||
|
|
||||||
|
|
||||||
|
for f in `ls | cut -d "." -f1`; do youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=$f > $f.title; done
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=-7tezSQBZhg > -7tezSQBZhg.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=0fr0vQfZeKE > 0fr0vQfZeKE.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=1FhejGVNFuI > 1FhejGVNFuI.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=3aWb4Te6F84 > 3aWb4Te6F84.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=3xj4AjWgZr0 > 3xj4AjWgZr0.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=4kKl_iiMjj4 > 4kKl_iiMjj4.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=56I3zYf316s > 56I3zYf316s.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=6oUId6Jx1OM > 6oUId6Jx1OM.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=75yM0jdkBrs > 75yM0jdkBrs.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=8wg0XEIwYV4 > 8wg0XEIwYV4.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=91WKUB2BXBU > 91WKUB2BXBU.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=EBhJzIF1t3M > EBhJzIF1t3M.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=GFwW49KqoW4 > GFwW49KqoW4.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=QV0BLvTjAYg > QV0BLvTjAYg.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=S3lk1ZcrsH0 > S3lk1ZcrsH0.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=U6yoI0yBLQk > U6yoI0yBLQk.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=UPTtFoqySeY > UPTtFoqySeY.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=U_t7y_ktmLE > U_t7y_ktmLE.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=WZclHUbylVs > WZclHUbylVs.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=Wl-n6VHXAJ4 > Wl-n6VHXAJ4.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=YLr7pwIMW8g > YLr7pwIMW8g.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=Ymo0WUJc7T0 > Ymo0WUJc7T0.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=aPHXeR8VdHM > aPHXeR8VdHM.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=cNzas0WtnrU > cNzas0WtnrU.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=eCWPEy3sriM > eCWPEy3sriM.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=e_EoPQObDvY > e_EoPQObDvY.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=fOHdDZg1jQ0 > fOHdDZg1jQ0.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=gMRdK0rt8yg > gMRdK0rt8yg.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=nU9y_E3zysc > nU9y_E3zysc.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=pHoFRQViBg4 > pHoFRQViBg4.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=tFB6tcVsLQY > tFB6tcVsLQY.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=tWzIfplDy0s > tWzIfplDy0s.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=uosd3_3KwnY > uosd3_3KwnY.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=uuFmYozhoNM > uuFmYozhoNM.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=vFQflTQV-f0 > vFQflTQV-f0.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=weGOS1cw2BM > weGOS1cw2BM.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=xzBw5MCjf2U > xzBw5MCjf2U.title
|
||||||
|
youtube-dl -q -e --proxy socks5://localhost:9999/ https://www.youtube.com/watch?v=zJeTPTjkcOc > zJeTPTjkcOc.title
|
||||||
|
|
||||||
|
@ -3,11 +3,14 @@ import argparse
|
|||||||
import logging
|
import logging
|
||||||
from glob import glob
|
from glob import glob
|
||||||
from os import path
|
from os import path
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
def main(args):
|
def main(args):
|
||||||
loglevel = args.loglevel
|
loglevel = args.loglevel
|
||||||
input_path = args.input
|
input_path = args.input
|
||||||
|
input_source = args.source
|
||||||
numeric_level = getattr(logging, loglevel.upper(), 10)
|
numeric_level = getattr(logging, loglevel.upper(), 10)
|
||||||
logging.basicConfig(format='%(asctime)s [%(levelname)s] - %(message)s', level=numeric_level)
|
logging.basicConfig(format='%(asctime)s [%(levelname)s] - %(message)s', level=numeric_level)
|
||||||
uri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
|
uri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
|
||||||
@ -20,17 +23,49 @@ def main(args):
|
|||||||
logging.info(input_path)
|
logging.info(input_path)
|
||||||
|
|
||||||
for file in glob(f'{input_path}/*.description'):
|
for file in glob(f'{input_path}/*.description'):
|
||||||
|
file_title = file.replace('description', 'title')
|
||||||
video_id = path.basename(file).split('.')[0]
|
video_id = path.basename(file).split('.')[0]
|
||||||
video_url = f'https://www.youtube.com/watch?v={video_id}'
|
video_url = f'https://www.youtube.com/watch?v={video_id}'
|
||||||
logging.debug(f'YT URL: {video_url}')
|
logging.debug(f'YT URL: {video_url}')
|
||||||
with open(file) as f:
|
with open(file) as f:
|
||||||
video_descr = f.read()
|
video_descr = f.read()
|
||||||
logging.debug(f'Desc: {video_descr}')
|
logging.debug(f'Desc: {video_descr}')
|
||||||
|
with open(file_title) as f:
|
||||||
|
video_title = f.read()
|
||||||
|
logging.debug(f'Title: {video_title}')
|
||||||
|
m = re.match(r".*(\d\d\.\d\d\.19\d\d).*", video_title)
|
||||||
|
if m is not None:
|
||||||
|
video_date = datetime.strptime(m[1], '%d.%m.%Y').strftime('%Y-%m-%d')
|
||||||
|
else:
|
||||||
|
m = re.match(r".*(19\d\d).*", video_title)
|
||||||
|
video_date = m[1] if m is not None else "brak daty"
|
||||||
|
logging.debug(f'Video Date: {video_date}')
|
||||||
|
to_mongo = {
|
||||||
|
'title': video_title,
|
||||||
|
'url': video_url,
|
||||||
|
'source': input_source,
|
||||||
|
'gcsMp4': {
|
||||||
|
'location': f'mp4/{video_id}.mp4',
|
||||||
|
'uploadDate': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
|
},
|
||||||
|
'gcsWav': {
|
||||||
|
'location': f'wave/{video_id}.wav',
|
||||||
|
'uploadDate': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
col.insert_one(to_mongo)
|
||||||
|
except:
|
||||||
|
logging.error('mongo update failed')
|
||||||
|
else:
|
||||||
|
logging.info('mongo insert OK')
|
||||||
|
logging.debug(f'inserted: {to_mongo}')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Command-line entry point: parse the options and hand them to main().
    #   --input    directory that is globbed for *.description files
    #   --source   label stored in the 'source' field of each inserted document
    #   --loglevel name of a logging level (DEBUG, INFO, WARNING, ERROR)
    parser = argparse.ArgumentParser(description='YouTube description to mongo')
    parser.add_argument("--input", help="input path for .description files")
    parser.add_argument("--source", help="source of media files [dtv, sonda, kronikiprl]")
    # main() calls loglevel.upper(); without a default, omitting --loglevel
    # passes None and the script dies with AttributeError before any work is done.
    parser.add_argument("--loglevel", default="INFO", help="log level: DEBUG INFO WARNING ERROR")
    args = parser.parse_args()
    main(args)
|
||||||
|
Loading…
Reference in New Issue
Block a user