diff --git a/resources/dbSamples/doc1.json b/resources/dbSamples/doc1.json new file mode 100644 index 00000000..3f5d1f93 --- /dev/null +++ b/resources/dbSamples/doc1.json @@ -0,0 +1,710 @@ +{ + "_id" : ObjectId("5df3e63d4c0402698d7837f3"), + "title" : "Sportowa niedziela", + "mp4" : [ + "sites/default/files/video/PL06736e39-a6bb-4357-bf77-4e423fe60756/Kronika_PKF_53-42-5.mp4" + ], + "url" : "http://repozytorium.fn.org.pl/?q=pl/node/7754", + "description" : { + "fullTitle" : "Sportowa niedziela. Zawody lekkoatletyczne Finlandia - Polska. Mecz piłkarski Tirana - Warszawa", + "desc" : "Prezentacje polskich i fińskich lekkoatletów. Sportowcy podczas biegu. Skok o tyczce. Piłkarze przygotowujący się do meczu. Fragmenty meczu. Wiśniewski zdobywa bramkę dla Polski.", + "date" : "1953-09-30", + "sequence" : { + "00:00:01:15" : "Napis: \"Sportowa niedziela\". W tle kibice zgromadzeni na stadionie.", + "00:00:06:02" : "Prezentacja drużyn lekkoatletycznych.", + "00:00:19:06" : "Bieg na 1500 metrów.", + "00:00:35:02" : "Żbikowski dobiega do mety.", + "00:00:44:01" : "Adamczyk skacze o tyczce.", + "00:01:04:06" : "Piłkarze przygotowują się do meczu.", + "00:01:19:23" : "Fragmenty meczu Tirana - Warszawa.", + "00:02:09:20" : "Wiśniewski zdobywa bramkę dla Warszawy.", + "00:02:18:02" : "Plansza końcowa: \"Produkcja: WYTWÓRNIA FILMÓW DOKUMENTALNYCH\"." 
+ }, + "details" : { + "Redakcja" : "Helena Lemańska", + "Komentarz" : "Jerzy Szelubski", + "Lektor" : "Andrzej Łapicki", + "Zdjęcia" : "Karol Szczeciński", + "Muzyka" : "Stefan Zawarski", + "Montaż" : "Wacław Kaźmierczak", + "Numer tematu" : "8-9", + "Osoba" : "Stefan Lewandowski (polski lekkoatleta, średniodystansowiec), Jan Wiktor Wiśniewski (polski piłkarz), Hogli (albański bramkarz), Edward Adamczyk (polski lekkoatleta, wieloboista), Kazimierz Żbikowski (polski lekkoatleta, średnio- i długodystansowiec), Valto Olenius (fiński lekkoatleta, tyczkarz), Edmund Potrzebowski (polski lekkoatleta, średniodystansowiec), Haikola (fiński biegacz)", + "Obiekt" : "Stadion Wojska Polskiego w Warszawie", + "Zdarzenie" : "zawody sportowe na Stadionie Wojska Polskiego w Warszawie", + "Czas akcji" : "1953", + "Miejsce akcji" : "Warszawa", + "Produkcja" : "Wytwórnia Filmów Dokumentalnych (Warszawa)", + "Prawa" : "WFDiF", + "Format dźwięku" : "mono", + "Format klatki" : "4:3" + } + }, + "gcsMp4" : { + "location" : "mp4/5df3e63d4c0402698d7837f3.mp4", + "uploadDate" : "2019-12-15 18:41:54" + }, + "gcsWav" : { + "location" : "wave/5df3e63d4c0402698d7837f3.wav", + "uploadDate" : "2020-01-05 23:10:27" + }, + "gcTextReco" : { + "transcript" : "35000 widzów zgromadzonych na stadionie Wojska Polskiego gotowało serdeczne przyjęcie sportowcom którzy przybyli do Warszawy by zmierzyć się z czołówką naszej lekkoatletyka do biegu na 1500m Lewandowski potrzebowski tłusta niemieckim chaikola piąty jest Żbikowski szpikowski W dobrym czasie 3 minuty 50 i 6 sekund skoku o tyczce taboretem publiczności czekolada liczy 4 m film o leniu z ma lepszy rozbieg i góruje nad Polakiem pod względem technicznym 4 mzasięg kulminacyjny punkt sportowy niedzieli zawody piłkarskie ci rana Warszawa Oto fragmenty spotkania goście przewyższają zespół polski techniczny i faktycznie ale nie umieją wykorzystać swojej przewagi Oblicz wpisuje się doskonale waga Wiśniewski zdobywa bramkę ostateczny rezultat na 1 dla 
Warszawy", + "transcripted" : "2020-01-07 21:44:18", + "words" : [ + { + "startTime" : "0.500s", + "endTime" : "7.600s", + "word" : "35000", + "speakerTag" : 1 + }, + { + "startTime" : "7.600s", + "endTime" : "7.900s", + "word" : "widzów", + "speakerTag" : 1 + }, + { + "startTime" : "7.900s", + "endTime" : "8.200s", + "word" : "zgromadzonych", + "speakerTag" : 1 + }, + { + "startTime" : "8.200s", + "endTime" : "8.600s", + "word" : "na", + "speakerTag" : 1 + }, + { + "startTime" : "8.600s", + "endTime" : "9.100s", + "word" : "stadionie", + "speakerTag" : 1 + }, + { + "startTime" : "9.100s", + "endTime" : "9.400s", + "word" : "Wojska", + "speakerTag" : 1 + }, + { + "startTime" : "9.400s", + "endTime" : "9.900s", + "word" : "Polskiego", + "speakerTag" : 1 + }, + { + "startTime" : "9.900s", + "endTime" : "10.700s", + "word" : "gotowało", + "speakerTag" : 1 + }, + { + "startTime" : "10.700s", + "endTime" : "11.300s", + "word" : "serdeczne", + "speakerTag" : 1 + }, + { + "startTime" : "11.300s", + "endTime" : "11.400s", + "word" : "przyjęcie", + "speakerTag" : 1 + }, + { + "startTime" : "11.400s", + "endTime" : "12.300s", + "word" : "sportowcom", + "speakerTag" : 1 + }, + { + "startTime" : "12.300s", + "endTime" : "13.200s", + "word" : "którzy", + "speakerTag" : 1 + }, + { + "startTime" : "13.200s", + "endTime" : "13.700s", + "word" : "przybyli", + "speakerTag" : 1 + }, + { + "startTime" : "13.700s", + "endTime" : "13.800s", + "word" : "do", + "speakerTag" : 1 + }, + { + "startTime" : "13.800s", + "endTime" : "14.300s", + "word" : "Warszawy", + "speakerTag" : 1 + }, + { + "startTime" : "14.300s", + "endTime" : "14.500s", + "word" : "by", + "speakerTag" : 1 + }, + { + "startTime" : "14.500s", + "endTime" : "14.800s", + "word" : "zmierzyć", + "speakerTag" : 1 + }, + { + "startTime" : "14.800s", + "endTime" : "15s", + "word" : "się", + "speakerTag" : 1 + }, + { + "startTime" : "15s", + "endTime" : "15.200s", + "word" : "z", + "speakerTag" : 1 + }, + { + "startTime" : 
"15.200s", + "endTime" : "15.700s", + "word" : "czołówką", + "speakerTag" : 1 + }, + { + "startTime" : "15.700s", + "endTime" : "16.100s", + "word" : "naszej", + "speakerTag" : 1 + }, + { + "startTime" : "16.100s", + "endTime" : "16.800s", + "word" : "lekkoatletyka", + "speakerTag" : 1 + }, + { + "startTime" : "16.800s", + "endTime" : "21s", + "word" : "do", + "speakerTag" : 1 + }, + { + "startTime" : "21s", + "endTime" : "21.300s", + "word" : "biegu", + "speakerTag" : 1 + }, + { + "startTime" : "21.300s", + "endTime" : "21.400s", + "word" : "na", + "speakerTag" : 1 + }, + { + "startTime" : "21.400s", + "endTime" : "22.100s", + "word" : "1500m", + "speakerTag" : 1 + }, + { + "startTime" : "22.100s", + "endTime" : "25.800s", + "word" : "Lewandowski", + "speakerTag" : 1 + }, + { + "startTime" : "25.800s", + "endTime" : "26.400s", + "word" : "potrzebowski", + "speakerTag" : 1 + }, + { + "startTime" : "26.400s", + "endTime" : "26.700s", + "word" : "tłusta", + "speakerTag" : 1 + }, + { + "startTime" : "26.700s", + "endTime" : "27.100s", + "word" : "niemieckim", + "speakerTag" : 1 + }, + { + "startTime" : "27.100s", + "endTime" : "27.500s", + "word" : "chaikola", + "speakerTag" : 1 + }, + { + "startTime" : "27.500s", + "endTime" : "28s", + "word" : "piąty", + "speakerTag" : 1 + }, + { + "startTime" : "28s", + "endTime" : "28.200s", + "word" : "jest", + "speakerTag" : 1 + }, + { + "startTime" : "28.200s", + "endTime" : "28.900s", + "word" : "Żbikowski", + "speakerTag" : 1 + }, + { + "startTime" : "28.900s", + "endTime" : "37.900s", + "word" : "szpikowski", + "speakerTag" : 1 + }, + { + "startTime" : "37.900s", + "endTime" : "38s", + "word" : "W", + "speakerTag" : 1 + }, + { + "startTime" : "38s", + "endTime" : "38.300s", + "word" : "dobrym", + "speakerTag" : 1 + }, + { + "startTime" : "38.300s", + "endTime" : "38.600s", + "word" : "czasie", + "speakerTag" : 1 + }, + { + "startTime" : "38.600s", + "endTime" : "38.900s", + "word" : "3", + "speakerTag" : 1 + }, + { + 
"startTime" : "38.900s", + "endTime" : "39.300s", + "word" : "minuty", + "speakerTag" : 1 + }, + { + "startTime" : "39.300s", + "endTime" : "40.100s", + "word" : "50", + "speakerTag" : 1 + }, + { + "startTime" : "40.100s", + "endTime" : "40.200s", + "word" : "i", + "speakerTag" : 1 + }, + { + "startTime" : "40.200s", + "endTime" : "40.600s", + "word" : "6", + "speakerTag" : 1 + }, + { + "startTime" : "40.600s", + "endTime" : "41.200s", + "word" : "sekund", + "speakerTag" : 1 + }, + { + "startTime" : "41.200s", + "endTime" : "43.100s", + "word" : "skoku", + "speakerTag" : 1 + }, + { + "startTime" : "43.100s", + "endTime" : "43.200s", + "word" : "o", + "speakerTag" : 1 + }, + { + "startTime" : "43.200s", + "endTime" : "43.300s", + "word" : "tyczce", + "speakerTag" : 1 + }, + { + "startTime" : "43.300s", + "endTime" : "44s", + "word" : "taboretem", + "speakerTag" : 1 + }, + { + "startTime" : "44s", + "endTime" : "44.600s", + "word" : "publiczności", + "speakerTag" : 1 + }, + { + "startTime" : "44.600s", + "endTime" : "44.900s", + "word" : "czekolada", + "speakerTag" : 1 + }, + { + "startTime" : "44.900s", + "endTime" : "45.200s", + "word" : "liczy", + "speakerTag" : 1 + }, + { + "startTime" : "45.200s", + "endTime" : "47.900s", + "word" : "4", + "speakerTag" : 1 + }, + { + "startTime" : "47.900s", + "endTime" : "48s", + "word" : "m", + "speakerTag" : 1 + }, + { + "startTime" : "48s", + "endTime" : "52.500s", + "word" : "film", + "speakerTag" : 1 + }, + { + "startTime" : "52.500s", + "endTime" : "52.600s", + "word" : "o", + "speakerTag" : 1 + }, + { + "startTime" : "52.600s", + "endTime" : "53s", + "word" : "leniu", + "speakerTag" : 1 + }, + { + "startTime" : "53s", + "endTime" : "53.100s", + "word" : "z", + "speakerTag" : 1 + }, + { + "startTime" : "53.100s", + "endTime" : "53.300s", + "word" : "ma", + "speakerTag" : 1 + }, + { + "startTime" : "53.300s", + "endTime" : "53.800s", + "word" : "lepszy", + "speakerTag" : 1 + }, + { + "startTime" : "53.800s", + "endTime" : 
"54.200s", + "word" : "rozbieg", + "speakerTag" : 1 + }, + { + "startTime" : "54.200s", + "endTime" : "54.300s", + "word" : "i", + "speakerTag" : 1 + }, + { + "startTime" : "54.300s", + "endTime" : "54.700s", + "word" : "góruje", + "speakerTag" : 1 + }, + { + "startTime" : "54.700s", + "endTime" : "54.900s", + "word" : "nad", + "speakerTag" : 1 + }, + { + "startTime" : "54.900s", + "endTime" : "55.300s", + "word" : "Polakiem", + "speakerTag" : 1 + }, + { + "startTime" : "55.300s", + "endTime" : "55.500s", + "word" : "pod", + "speakerTag" : 1 + }, + { + "startTime" : "55.500s", + "endTime" : "55.800s", + "word" : "względem", + "speakerTag" : 1 + }, + { + "startTime" : "55.800s", + "endTime" : "56s", + "word" : "technicznym", + "speakerTag" : 1 + }, + { + "startTime" : "56s", + "endTime" : "59.900s", + "word" : "4", + "speakerTag" : 1 + }, + { + "startTime" : "59.900s", + "endTime" : "60.100s", + "word" : "m", + "speakerTag" : 1 + }, + { + "startTime" : "60.500s", + "endTime" : "64.300s", + "word" : "zasięg", + "speakerTag" : 1 + }, + { + "startTime" : "64.300s", + "endTime" : "65s", + "word" : "kulminacyjny", + "speakerTag" : 1 + }, + { + "startTime" : "65s", + "endTime" : "65.100s", + "word" : "punkt", + "speakerTag" : 1 + }, + { + "startTime" : "65.100s", + "endTime" : "65.600s", + "word" : "sportowy", + "speakerTag" : 1 + }, + { + "startTime" : "65.600s", + "endTime" : "66s", + "word" : "niedzieli", + "speakerTag" : 1 + }, + { + "startTime" : "66s", + "endTime" : "66.300s", + "word" : "zawody", + "speakerTag" : 1 + }, + { + "startTime" : "66.300s", + "endTime" : "66.700s", + "word" : "piłkarskie", + "speakerTag" : 1 + }, + { + "startTime" : "66.700s", + "endTime" : "67.200s", + "word" : "ci", + "speakerTag" : 1 + }, + { + "startTime" : "67.200s", + "endTime" : "67.600s", + "word" : "rana", + "speakerTag" : 1 + }, + { + "startTime" : "67.600s", + "endTime" : "68.200s", + "word" : "Warszawa", + "speakerTag" : 1 + }, + { + "startTime" : "68.200s", + "endTime" : 
"81s", + "word" : "Oto", + "speakerTag" : 1 + }, + { + "startTime" : "81s", + "endTime" : "81.600s", + "word" : "fragmenty", + "speakerTag" : 1 + }, + { + "startTime" : "81.600s", + "endTime" : "82s", + "word" : "spotkania", + "speakerTag" : 1 + }, + { + "startTime" : "85.800s", + "endTime" : "86.900s", + "word" : "goście", + "speakerTag" : 1 + }, + { + "startTime" : "86.900s", + "endTime" : "87.400s", + "word" : "przewyższają", + "speakerTag" : 1 + }, + { + "startTime" : "87.400s", + "endTime" : "87.700s", + "word" : "zespół", + "speakerTag" : 1 + }, + { + "startTime" : "87.700s", + "endTime" : "87.900s", + "word" : "polski", + "speakerTag" : 1 + }, + { + "startTime" : "87.900s", + "endTime" : "88.600s", + "word" : "techniczny", + "speakerTag" : 1 + }, + { + "startTime" : "88.600s", + "endTime" : "88.700s", + "word" : "i", + "speakerTag" : 1 + }, + { + "startTime" : "88.700s", + "endTime" : "89.200s", + "word" : "faktycznie", + "speakerTag" : 1 + }, + { + "startTime" : "89.200s", + "endTime" : "89.600s", + "word" : "ale", + "speakerTag" : 1 + }, + { + "startTime" : "89.600s", + "endTime" : "89.900s", + "word" : "nie", + "speakerTag" : 1 + }, + { + "startTime" : "89.900s", + "endTime" : "90.200s", + "word" : "umieją", + "speakerTag" : 1 + }, + { + "startTime" : "90.200s", + "endTime" : "90.800s", + "word" : "wykorzystać", + "speakerTag" : 1 + }, + { + "startTime" : "90.800s", + "endTime" : "91s", + "word" : "swojej", + "speakerTag" : 1 + }, + { + "startTime" : "91s", + "endTime" : "91.100s", + "word" : "przewagi", + "speakerTag" : 1 + }, + { + "startTime" : "91.900s", + "endTime" : "117.300s", + "word" : "Oblicz", + "speakerTag" : 1 + }, + { + "startTime" : "117.300s", + "endTime" : "117.700s", + "word" : "wpisuje", + "speakerTag" : 1 + }, + { + "startTime" : "117.700s", + "endTime" : "117.900s", + "word" : "się", + "speakerTag" : 1 + }, + { + "startTime" : "117.900s", + "endTime" : "118.400s", + "word" : "doskonale", + "speakerTag" : 1 + }, + { + "startTime" : 
"118.400s", + "endTime" : "129.600s", + "word" : "waga", + "speakerTag" : 1 + }, + { + "startTime" : "129.600s", + "endTime" : "130.500s", + "word" : "Wiśniewski", + "speakerTag" : 1 + }, + { + "startTime" : "130.500s", + "endTime" : "130.900s", + "word" : "zdobywa", + "speakerTag" : 1 + }, + { + "startTime" : "130.900s", + "endTime" : "131s", + "word" : "bramkę", + "speakerTag" : 1 + }, + { + "startTime" : "131s", + "endTime" : "132.500s", + "word" : "ostateczny", + "speakerTag" : 1 + }, + { + "startTime" : "132.500s", + "endTime" : "132.700s", + "word" : "rezultat", + "speakerTag" : 1 + }, + { + "startTime" : "132.700s", + "endTime" : "133.600s", + "word" : "na", + "speakerTag" : 1 + }, + { + "startTime" : "133.600s", + "endTime" : "134s", + "word" : "1", + "speakerTag" : 1 + }, + { + "startTime" : "134s", + "endTime" : "134.100s", + "word" : "dla", + "speakerTag" : 1 + }, + { + "startTime" : "134.100s", + "endTime" : "134.600s", + "word" : "Warszawy", + "speakerTag" : 1 + } + ] + } +} diff --git a/resources/dbSamples/doc2.json b/resources/dbSamples/doc2.json new file mode 100644 index 00000000..c6932b42 --- /dev/null +++ b/resources/dbSamples/doc2.json @@ -0,0 +1,340 @@ +{ + "_id" : ObjectId("5df3e63c4c0402698d782e78"), + "title" : "Azory. Nowa wyspa wulkaniczna", + "mp4" : [ + "sites/default/files/video/PLab8cc349-67c6-42a7-b31e-6bdc4d6c5695/6.mp4" + ], + "url" : "http://repozytorium.fn.org.pl/?q=pl/node/10329", + "description" : { + "fullTitle" : "Azory. Nowa wyspa wulkaniczna", + "desc" : "Wybuch wulkanu.", + "date" : "1957-10-18", + "sequence" : { + "00:00:00:16" : "Napis: „Azory”. W tle dymy unoszące się nad wodą.", + "00:00:03:04" : "Ujęcie z samolotu na morze. Unosi się dym wulkaniczny i tworzy się nowa wyspa wulkaniczna." 
+ }, + "details" : { + "Komentarz" : "Jerzy Kasprzycki", + "Lektor" : "Barbara Matkowska, Włodzimierz Kmicik", + "Opracowanie dźwiękowe" : "Stefan Zawarski", + "Montaż" : "Krystyna Rutkowska, Marian Duszyński", + "Numer tematu" : "6", + "Czas akcji" : "1957", + "Miejsce akcji" : "Azory", + "Produkcja" : "Wytwórnia Filmów Dokumentalnych (Warszawa)", + "Wymiana zagraniczna" : "Wytwórnia Filmowa, Portugalia", + "Prawa" : "WFDiF", + "Format dźwięku" : "mono", + "Format klatki" : "4:3", + "System koloru" : "czarno-biały" + } + }, + "gcsMp4" : { + "location" : "mp4/5df3e63c4c0402698d782e78.mp4", + "uploadDate" : "2019-12-15 17:38:04" + }, + "gcsWav" : { + "location" : "wave/5df3e63c4c0402698d782e78.wav", + "uploadDate" : "2020-01-05 21:14:59" + }, + "gcTextReco" : { + "transcript" : "w pobliżu Azorów może gotuje się jak zupa w garnku na niebie żółto-czerwony dym pod wodą płynna lawa grzyb wybuchu jak po bombie atomowej Jesteśmy Świadkami narodzin nowej wyspy wulkanicznej władze portugalskie czekają i szampanem ale co do wulkanów to nigdy nic nie wiadomo na mapie radzimy trochę poczekać", + "words" : [ + { + "startTime" : "0s", + "endTime" : "4.500s", + "word" : "w", + "speakerTag" : 1 + }, + { + "startTime" : "4.500s", + "endTime" : "5s", + "word" : "pobliżu", + "speakerTag" : 1 + }, + { + "startTime" : "5s", + "endTime" : "5.500s", + "word" : "Azorów", + "speakerTag" : 1 + }, + { + "startTime" : "5.500s", + "endTime" : "5.900s", + "word" : "może", + "speakerTag" : 1 + }, + { + "startTime" : "5.900s", + "endTime" : "6.200s", + "word" : "gotuje", + "speakerTag" : 1 + }, + { + "startTime" : "6.200s", + "endTime" : "6.400s", + "word" : "się", + "speakerTag" : 1 + }, + { + "startTime" : "6.400s", + "endTime" : "6.600s", + "word" : "jak", + "speakerTag" : 1 + }, + { + "startTime" : "6.600s", + "endTime" : "6.900s", + "word" : "zupa", + "speakerTag" : 1 + }, + { + "startTime" : "6.900s", + "endTime" : "7s", + "word" : "w", + "speakerTag" : 1 + }, + { + "startTime" : "7s", + 
"endTime" : "7.400s", + "word" : "garnku", + "speakerTag" : 1 + }, + { + "startTime" : "7.400s", + "endTime" : "8.300s", + "word" : "na", + "speakerTag" : 1 + }, + { + "startTime" : "8.300s", + "endTime" : "8.700s", + "word" : "niebie", + "speakerTag" : 1 + }, + { + "startTime" : "8.700s", + "endTime" : "9.400s", + "word" : "żółto-czerwony", + "speakerTag" : 1 + }, + { + "startTime" : "9.400s", + "endTime" : "9.600s", + "word" : "dym", + "speakerTag" : 1 + }, + { + "startTime" : "9.600s", + "endTime" : "10.400s", + "word" : "pod", + "speakerTag" : 1 + }, + { + "startTime" : "10.400s", + "endTime" : "10.800s", + "word" : "wodą", + "speakerTag" : 1 + }, + { + "startTime" : "10.800s", + "endTime" : "11.300s", + "word" : "płynna", + "speakerTag" : 1 + }, + { + "startTime" : "11.300s", + "endTime" : "11.600s", + "word" : "lawa", + "speakerTag" : 1 + }, + { + "startTime" : "11.600s", + "endTime" : "12.900s", + "word" : "grzyb", + "speakerTag" : 1 + }, + { + "startTime" : "12.900s", + "endTime" : "13.400s", + "word" : "wybuchu", + "speakerTag" : 1 + }, + { + "startTime" : "13.400s", + "endTime" : "13.500s", + "word" : "jak", + "speakerTag" : 1 + }, + { + "startTime" : "13.500s", + "endTime" : "13.700s", + "word" : "po", + "speakerTag" : 1 + }, + { + "startTime" : "13.700s", + "endTime" : "14.100s", + "word" : "bombie", + "speakerTag" : 1 + }, + { + "startTime" : "14.100s", + "endTime" : "14.300s", + "word" : "atomowej", + "speakerTag" : 1 + }, + { + "startTime" : "15.300s", + "endTime" : "16.500s", + "word" : "Jesteśmy", + "speakerTag" : 1 + }, + { + "startTime" : "16.500s", + "endTime" : "16.800s", + "word" : "Świadkami", + "speakerTag" : 1 + }, + { + "startTime" : "16.800s", + "endTime" : "17.100s", + "word" : "narodzin", + "speakerTag" : 1 + }, + { + "startTime" : "17.100s", + "endTime" : "17.700s", + "word" : "nowej", + "speakerTag" : 1 + }, + { + "startTime" : "17.700s", + "endTime" : "18s", + "word" : "wyspy", + "speakerTag" : 1 + }, + { + "startTime" : "18s", + 
"endTime" : "18.700s", + "word" : "wulkanicznej", + "speakerTag" : 1 + }, + { + "startTime" : "18.700s", + "endTime" : "19.400s", + "word" : "władze", + "speakerTag" : 1 + }, + { + "startTime" : "19.400s", + "endTime" : "20.100s", + "word" : "portugalskie", + "speakerTag" : 1 + }, + { + "startTime" : "20.100s", + "endTime" : "20.500s", + "word" : "czekają", + "speakerTag" : 1 + }, + { + "startTime" : "20.500s", + "endTime" : "20.600s", + "word" : "i", + "speakerTag" : 1 + }, + { + "startTime" : "20.600s", + "endTime" : "21.600s", + "word" : "szampanem", + "speakerTag" : 1 + }, + { + "startTime" : "21.600s", + "endTime" : "25s", + "word" : "ale", + "speakerTag" : 1 + }, + { + "startTime" : "25s", + "endTime" : "25.100s", + "word" : "co", + "speakerTag" : 1 + }, + { + "startTime" : "25.100s", + "endTime" : "25.300s", + "word" : "do", + "speakerTag" : 1 + }, + { + "startTime" : "25.300s", + "endTime" : "25.800s", + "word" : "wulkanów", + "speakerTag" : 1 + }, + { + "startTime" : "25.800s", + "endTime" : "26.100s", + "word" : "to", + "speakerTag" : 1 + }, + { + "startTime" : "26.100s", + "endTime" : "26.300s", + "word" : "nigdy", + "speakerTag" : 1 + }, + { + "startTime" : "26.300s", + "endTime" : "26.500s", + "word" : "nic", + "speakerTag" : 1 + }, + { + "startTime" : "26.500s", + "endTime" : "26.600s", + "word" : "nie", + "speakerTag" : 1 + }, + { + "startTime" : "26.600s", + "endTime" : "26.700s", + "word" : "wiadomo", + "speakerTag" : 1 + }, + { + "startTime" : "26.700s", + "endTime" : "28.200s", + "word" : "na", + "speakerTag" : 1 + }, + { + "startTime" : "28.200s", + "endTime" : "28.400s", + "word" : "mapie", + "speakerTag" : 1 + }, + { + "startTime" : "28.400s", + "endTime" : "28.800s", + "word" : "radzimy", + "speakerTag" : 1 + }, + { + "startTime" : "28.800s", + "endTime" : "29.200s", + "word" : "trochę", + "speakerTag" : 1 + }, + { + "startTime" : "29.200s", + "endTime" : "29.800s", + "word" : "poczekać", + "speakerTag" : 1 + } + ], + "transcripted" : 
"2020-01-07 21:40:32" + } +} diff --git a/src/reco.py b/src/reco.py index 2050ea8a..6b6d6734 100644 --- a/src/reco.py +++ b/src/reco.py @@ -1,39 +1,140 @@ -#!/usr/bin/python +#from google.cloud import speech_v1 +from google.cloud import speech_v1p1beta1 +from google.cloud.speech_v1p1beta1 import enums +from google.cloud.speech_v1p1beta1 import types +from pymongo import MongoClient +import json +import argparse +from google.protobuf.json_format import MessageToJson,MessageToDict +from storageUpload import getMongoCollection +from bson.objectid import ObjectId +import datetime +import time +import concurrent.futures +import re -import sys -import base64 -import googleapiclient.discovery -import os -from natsort import natsorted -for dirname, dirnames, filenames in os.walk(sys.argv[1]): - # print path to all filenames. - for filename in natsorted(filenames): - speech_file = os.path.join(dirname, filename) - with open(speech_file, 'rb') as speech: - # Base64 encode the binary audio file for inclusion in the JSON - # request. 
def main(args):
    """Transcribe one batch of not-yet-recognized WAV files.

    Connects to MongoDB, fetches up to ``args.batch_size`` Cloud Storage URIs
    with ``getWavList``, runs ``run_reco`` for each URI on a thread pool and
    prints the elapsed wall-clock time.

    Args:
        args: parsed command-line namespace. ``batch_size`` is required;
            ``mongo_uri`` is optional and, when set, replaces the built-in
            connection string.
    """
    # NOTE(review): the default URI embeds credentials in the source; prefer
    # passing --mongo_uri and treat this password as compromised.
    mongoUri = (getattr(args, "mongo_uri", None)
                or "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco")
    dbName = "archSpeechReco"
    colName = "moviesMeta"
    global col  # read by run_reco, which runs on the worker threads
    col = getMongoCollection(colName, dbName, mongoUri)
    batch_size = int(args.batch_size)
    waves = getWavList(col, batch_size)
    # Skip documents for which the projected URI is missing or null.
    uris = [w['gcsWawLocation'] for w in waves if w.get('gcsWawLocation')]

    start = time.perf_counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=64) as executor:
        # submit + as_completed instead of a discarded executor.map():
        # map() only re-raises a worker's exception when its result is read,
        # so failures used to disappear without a trace.
        futures = {executor.submit(run_reco, uri): uri for uri in uris}
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                print(f"recognition failed for {futures[future]}: {e}")
    stop = time.perf_counter()

    print(f'Finished in {round(stop-start, 2)} seconds')


def object_id_from_uri(uri):
    """Return the file name of ``uri`` without its extension.

    For ``gs://archspeechreco/wave/<id>.wav`` this is ``<id>``, which
    ``run_reco`` uses as the MongoDB document id.
    """
    return uri.rsplit('/', 1)[-1].split('.')[0]


def parse_reco_result(recoDict):
    """Extract ``(transcript, words)`` from a recognition response dict.

    Args:
        recoDict: the response converted with ``MessageToDict``.

    Returns:
        A tuple ``(transcript, words)``. When the response has no results the
        transcript is the marker ``"film niemy"`` and ``words`` is ``{}``.
    """
    results = recoDict.get("results") or []
    if not results:
        # NOTE(review): words is a list for recognized audio but {} here;
        # kept unchanged so newly written documents match existing ones.
        return "film niemy", {}

    # Words come from the last result and the transcript from all the others.
    # Presumably with speaker diarization the API appends a final result that
    # aggregates every word with its speaker tag - confirm against the
    # Speech-to-Text documentation.
    last_alternative = (results[-1].get("alternatives") or [{}])[0]
    words = last_alternative.get("words", [])

    # Strip each segment and join with one space: plain concatenation glued
    # words together whenever a segment did not start with a space.
    pieces = (
        (result.get("alternatives") or [{}])[0].get("transcript", "").strip()
        for result in results[:-1]
    )
    transcript = " ".join(piece for piece in pieces if piece)
    return transcript, words


def run_reco(uri):
    """Recognize speech in one WAV file and store the result in MongoDB.

    Sets ``gcTextReco.transcript``, ``gcTextReco.words`` and
    ``gcTextReco.transcripted`` (local time, ``%Y-%m-%d %H:%M:%S``) on the
    document whose ``_id`` is encoded in the file name of ``uri``. Uses the
    module-level ``col`` set by ``main``.

    Args:
        uri: Cloud Storage URI of the WAV file, e.g. gs://[BUCKET]/wave/[ID].wav

    A failed database update is reported on stdout and not re-raised;
    errors from ``recognize`` propagate to the caller.
    """
    recoDict = MessageToDict(recognize(uri))
    transcript, words = parse_reco_result(recoDict)
    docId = object_id_from_uri(uri)

    now = datetime.datetime.now()
    try:
        col.update_one(
            {"_id": ObjectId(docId)},
            {"$set": {"gcTextReco.transcript": transcript,
                      "gcTextReco.words": words,
                      "gcTextReco.transcripted": now.strftime("%Y-%m-%d %H:%M:%S")}}
        )
    except Exception as e:
        # Best effort: one failed update must not abort the whole batch.
        print(f"mongo update FAILED {docId}: {e}")
    else:
        print(f"mongo update OK {docId}")


def recognize(storage_uri, sample_rate_hertz=44100, language_code="pl-PL"):
    """Transcribe a long audio file from Cloud Storage.

    Starts an asynchronous (long-running) recognition with speaker
    diarization enabled and blocks until the operation finishes.

    Args:
        storage_uri: URI of the audio file in Cloud Storage,
            e.g. gs://[BUCKET]/[FILE]
        sample_rate_hertz: sample rate in Hertz of the audio data.
        language_code: language of the supplied audio.

    Returns:
        The response returned by the finished operation.
    """
    client = speech_v1p1beta1.SpeechClient()

    d_config = types.SpeakerDiarizationConfig(
        enable_speaker_diarization=True
    )
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code,
        diarization_config=d_config,
    )

    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(config, audio)
    print(f'{storage_uri} has been sent to reco')
    print(u"Waiting for operation to complete...")
    return operation.result()


def getMongoCollection(colName, dbName, uri):
    """Return the collection ``colName`` of database ``dbName`` at ``uri``.

    The client is created with ``maxPoolSize=512``. This definition replaces
    the ``getMongoCollection`` imported from ``storageUpload`` at the top of
    the file.
    """
    client = MongoClient(uri, maxPoolSize=512)
    return client[dbName][colName]


def getWavList(col, limit=32):
    """Return at most ``limit`` documents that still need speech recognition.

    Args:
        col: MongoDB collection to query.
        limit: maximum number of documents to return.

    Returns:
        The result of ``col.aggregate``; every document carries the full
        ``gs://`` URI of its WAV file under the key ``gcsWawLocation``.
    """
    pipeline = [
        # match phase: keep documents without a gcTextReco field (voice not
        # recognized yet) that have an uploaded WAV and whose sound format
        # is not "brak"
        {"$match": {"$and": [
            {"gcTextReco": {"$exists": False}},
            {"gcsWav": {"$exists": True}},
            {"description.details.Format dźwięku": {"$ne": "brak"}},
        ]}},
        # project phase: expose only the bucket URI built from gcsWav.location
        {"$project": {
            "gcsWawLocation": {"$concat": ["gs://archspeechreco/", "$gcsWav.location"]},
        }},
        # fetch only N documents
        {"$limit": limit},
    ]
    return col.aggregate(pipeline)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Google Cloud speech2text API client')
    parser.add_argument("--batch_size", type=int, default=512,
                        help="how many waves in the batch")
    parser.add_argument("--mongo_uri", default=None,
                        help="MongoDB connection URI (overrides the built-in default)")
    args = parser.parse_args()
    main(args)