diff --git a/tools/corpus_analyzer.py b/tools/corpus_analyzer.py
new file mode 100755
index 0000000..27453c1
--- /dev/null
+++ b/tools/corpus_analyzer.py
@@ -0,0 +1,81 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import sys, re, codecs, heapq
+
+n_largest_param = 10
+
+# Patterns for stripping HTML tags and for normalizing dates, e-mail
+# addresses and numbers to placeholder tokens.
+html_pattern = r'<\/?(a|abbr|acronym|address|applet|area|b|base|basefont|bdo|big|blockquote|body|br|button|caption|center|cite|code|col|colgroup|dd|del|dir|div|dfn|dl|dt|em|fieldset|font|form|frame|frameset|h1|h2|h3|h4|h5|h6|head|hr|html|i|iframe|img|input|ins|isindex|kbd|label|legend|li|link|map|menu|meta|noframes|noscript|object|ol|optgroup|option|p|param|pre|q|s|samp|script|select|small|span|strike|strong|style|sub|sup|table|tbody|td|textarea|tfoot|th|thead|title|tr|tt|u|ul|var|xmp).*?>'
+date_pattern = r'[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4}'
+email_pattern = r'[\w\._\d]+@\w+(\.\w+)*'
+number_pattern = r'[0-9]+([\.\,][0-9]+)?'
+
+
+corpus_file = codecs.open(sys.argv[1], encoding='utf-8')
+
+sentences = []
+total_sentences = 0
+empty_sentences = 0
+
+print "Reading corpus file..."
+for line in corpus_file:
+    total_sentences += 1
+    # The fourth positional argument of re.sub is count, not flags,
+    # so re.UNICODE has to be passed by keyword.
+    line = re.sub(html_pattern, '', line, flags=re.UNICODE)
+
+    line = re.sub(date_pattern, '[DATE]', line, flags=re.UNICODE)
+    line = re.sub(email_pattern, '[EMAIL]', line, flags=re.UNICODE)
+    line = re.sub(number_pattern, '[NUMBER]', line, flags=re.UNICODE)
+
+    # Tokenize into placeholder tags and words; keep the sentence only
+    # if it contains at least one real word rather than placeholders alone.
+    words = re.findall(r'\[[A-Z]+\]|\w+', line, re.UNICODE)
+    if any(re.match(r'\w+', word, re.UNICODE) for word in words):
+        sentences.append(" ".join([word.lower() for word in words]))
+    else:
+        empty_sentences += 1
+
+    if total_sentences % 10000 == 0:
+        print " processed %d sentences" % total_sentences
+
+
+corpus_file.close()
+print "Corpus file read."
+
+
+print "Sorting the corpus..."
+sentences.sort()
+
+print "Computing unique values..."
+
+# After sorting, identical sentences are adjacent, so a single pass that
+# counts runs yields (sentence, count) pairs, much like uniq -c.
+prev_sentence = None
+curr_count = 1
+
+ul = []
+
+for sentence in sentences:
+    if sentence == prev_sentence:
+        curr_count += 1
+    else:
+        if prev_sentence is not None:
+            ul.append((prev_sentence, curr_count))
+        curr_count = 1
+        prev_sentence = sentence
+
+# Append the final run; the guard covers an empty corpus.
+if prev_sentence is not None:
+    ul.append((prev_sentence, curr_count))
+
+print "Unique values computed."
+
+print "Total sentences %d, empty: %d" % (total_sentences, empty_sentences)
+print "Among %d non-empty sentences there are %d unique sentences" % (total_sentences - empty_sentences, len(ul))
+print "%d most common sentences:" % n_largest_param
+
+for sentence, count in heapq.nlargest(n_largest_param, ul, key=lambda x: x[1]):
+    print "%d occurrences of sentence: %s" % (count, sentence)
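
The script takes the corpus path as its only argument and expects a UTF-8 file with one sentence per line. A minimal usage sketch, where corpus.txt is a hypothetical file name:

    python tools/corpus_analyzer.py corpus.txt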