From 2eb383e7ba33d170765aac356fabe83197c730f2 Mon Sep 17 00:00:00 2001 From: ssut Date: Sat, 6 Jun 2015 14:44:15 +0900 Subject: [PATCH] add language detection, and use JSON to clean up the code for easier comprehension --- googletrans/response.py | 42 +++++++++++++++ googletrans/translator.py | 111 ++++++++++++++++++++++++++++---------- googletrans/urls.py | 1 + tests.py | 14 ++++- translate | 11 ++++ 5 files changed, 150 insertions(+), 29 deletions(-) create mode 100644 googletrans/response.py diff --git a/googletrans/response.py b/googletrans/response.py new file mode 100644 index 0000000..69e5f50 --- /dev/null +++ b/googletrans/response.py @@ -0,0 +1,42 @@ +class Translated: + """ + The Translated object, which contains Google Translator's result. + + :param src: source language (default: auto) + :param dest: destination language (default: en) + :param origin: original text + :param text: translated text + :param pronunciation: the pronunciation provided by Google Translator + """ + def __init__(self, src, dest, origin, text, pronunciation): + self.src = src + self.dest = dest + self.origin = origin + self.text = text + self.pronunciation = pronunciation + + def __str__(self): + return self.__unicode__() + + def __unicode__(self): + return u''.format( + src=self.src, dest=self.dest, text=self.text, pronunciation=self.pronunciation) + +class Detected: + """ + The detected object, which contains Google Translator's language detection result. 
+ + :param lang: detected language + :param confidence: the confidence of detection (0.00 to 1.00) + """ + def __init__(self, lang, confidence): + self.lang = lang + self.confidence = confidence + + def __str__(self): + return self.__unicode__() + + def __unicode__(self): + return u''.format( + lang=self.lang, confidence=self.confidence) + diff --git a/googletrans/translator.py b/googletrans/translator.py index 503e792..e5e9b21 100644 --- a/googletrans/translator.py +++ b/googletrans/translator.py @@ -3,23 +3,6 @@ A Translation module. You can translate text using this module. - -Basic usage: - >>> from googletrans import translator - >>> translator.translate('안녕하세요.') - - >>> translator.translate('안녕하세요.', dest='ja') - - >>> translator.translate('veritas lux mea', src='la') - - -Advanced usage: - >>> translations = translator.translate(['The quick brown fox', 'jumps over', 'the lazy dog'], dest='ko') - >>> for translation in translations: - ... print(translation.origin, ' -> ', translation.text) - The quick brown fox -> 빠른 갈색 여우 - jumps over -> 이상 점프 - the lazy dog -> 게으른 개 """ import re import requests @@ -28,7 +11,8 @@ from future.moves.urllib.parse import quote from . import __version__ from googletrans import urls -from googletrans.translated import Translated +from googletrans.conversion import format_json +from googletrans.response import Translated, Detected user_agent = 'PyGt/{0}'.format(__version__) @@ -53,11 +37,27 @@ def agent(): return __agent -# translator.translate(text, to='') def translate(text, dest='en', src='auto'): """ Translate the passed text into destination language. +Basic usage: + >>> from googletrans import translator + >>> translator.translate('안녕하세요.') + + >>> translator.translate('안녕하세요.', dest='ja') + + >>> translator.translate('veritas lux mea', src='la') + + +Advanced usage: + >>> translations = translator.translate(['The quick brown fox', 'jumps over', 'the lazy dog'], dest='ko') + >>> for translation in translations: + ... 
print(translation.origin, ' -> ', translation.text) + The quick brown fox -> 빠른 갈색 여우 + jumps over -> 이상 점프 + the lazy dog -> 게으른 개 + :param text: the text you want to translate. you can pass this parameter as a list object, as shown in the advanced usage above. :param dest: the destination language you want to translate. (default: en) @@ -86,23 +86,22 @@ def translate(text, dest='en', src='auto'): [[["공화국","republique"],[,,"gonghwagug"]],,"fr",,,[["republique",1,[["공화국",1000,true,false],["공화국의",0,true,false],["공화국에",0,true,false],["공화국에서",0,true,false]],[[0,10]],"republique",0,1]],0.94949496,,[["fr"],,[0.94949496]],,,[["명사",[[["communauté","démocratie"],""]],"république"]]] """ + data = format_json(r.text) # this code will be updated when the format is changed. - # (I know this code is not really efficient and so sketchy.) - translated = r.text.split('[[[')[1][1:].split('"')[0] + translated = data[0][0][0] + # actual source language that will be recognized by Google Translator when the # src passed is equal to auto. try: - src = RE_SRC.findall(r.text)[0] + src = data[-1][0][0] except: pass pron = origin - if src not in EXCLUDES: - try: - pron_table = r.text.split('[[[')[1].split('[')[1] - pron = pron_table.split('"')[1].split('"')[0] - except: pass - if dest in EXCLUDES: + try: + pron = data[0][1][-1] + except: pass + if dest in EXCLUDES and pron == origin: pron = translated # put final values into new Translated object @@ -110,3 +109,59 @@ def translate(text, dest='en', src='auto'): text=translated, pronunciation=pron) return result + +def detect(text): + """ + Detect the language of a text. 
+ +Basic usage: + >>> from googletrans import translator + >>> translator.detect('이 문장은 한글로 쓰여졌습니다.') + + >>> translator.detect('この文章は日本語で書かれました。') + + >>> translator.detect('This sentence is written in English.') + + >>> translator.detect('Tiu frazo estas skribita en Esperanto.') + + +Advanced usage: + >>> langs = translator.detect(['한국어', '日本語', 'English', 'le français']) + >>> for lang in langs: + ... print(lang.lang, lang.confidence) + ko 1 + ja 0.92929292 + en 0.96954316 + fr 0.043500196 + + :param text: the text you want to detect. + + :rtype: Detected + :rtype: list (when list is passed) + """ + if isinstance(text, list): + result = [] + for item in text: + lang = detect(item) + result.append(lang) + return result + + result = '' + sess = agent() # acquire requests session + origin = text + text = quote(text) + url = urls.DETECT.format(query=text) + r = sess.get(url, headers=__headers) + data = format_json(r.text) + + # actual source language that will be recognized by Google Translator when the + # src passed is equal to auto. + src = '' + confidence = 0.0 + try: + src = ''.join(data[-1][0]) + confidence = data[-1][-1][0] + except: pass + result = Detected(lang=src, confidence=confidence) + + return result diff --git a/googletrans/urls.py b/googletrans/urls.py index 3ac1106..7a340f0 100644 --- a/googletrans/urls.py +++ b/googletrans/urls.py @@ -4,3 +4,4 @@ Predefined URLs used to make google translate requests. 
""" TRANSLATOR = 'https://translate.google.com/' TRANSLATE = 'https://translate.google.com/translate_a/single?client=t&sl={src}&tl={dest}&hl={dest}&dt=bd&dt=rm&dt=ss&dt=t&dt=at&ie=UTF-8&oe=UTF-8&q={query}' +DETECT = 'https://translate.google.com/translate_a/single?client=t&sl=auto&tl=en&hl=en&dt=bd&ie=UTF-8&oe=UTF-8&q={query}' \ No newline at end of file diff --git a/tests.py b/tests.py index 7886d2b..0db481b 100644 --- a/tests.py +++ b/tests.py @@ -1,11 +1,16 @@ # -*- coding: utf-8 -*- import unittest -from googletrans import translator +from googletrans import translator, conversion class TranslateTests(unittest.TestCase): def setUp(self): pass + def test_to_json(self): + text = '[,,"en",,,,0.96954316,,[["en"],,[0.96954316]]]' + approx = [None, None, 'en', None, None, None, 0.96954316, None, [['en'], None, [0.96954316]]] + assert conversion.format_json(text) == approx + def test_latin_to_english(self): result = translator.translate('veritas lux mea', src='la', dest='en') assert result.text == 'The truth is my light' @@ -22,5 +27,12 @@ class TranslateTests(unittest.TestCase): assert translations[1].text == u'이상 점프' assert translations[2].text == u'게으른 개' + def test_language_detection(self): + ko = translator.detect('한국어') + assert ko.lang == 'ko' + + en = translator.detect('English') + assert en.lang == 'en' + if __name__ == '__main__': unittest.main() diff --git a/translate b/translate index 0c1efcf..60844d4 100755 --- a/translate +++ b/translate @@ -11,8 +11,19 @@ def main(): help='The destination language you want to translate. (Default: en)') parser.add_argument('-s', '--src', default='auto', help='The source language you want to translate. 
(Default: auto)') + parser.add_argument('-c', '--detect', action='store_true', default=False, + help='') args = parser.parse_args() + if args.detect: + result = translator.detect(args.text) + result = """ +[{lang}, {confidence}] {text} + """.strip().format(text=args.text, + lang=result.lang, confidence=result.confidence) + print(result) + return + result = translator.translate(args.text, dest=args.dest, src=args.src) result = """ [{src}] {original}