working corpus analyzer

2015-10-03 16:18:49 +02:00 · 2015-10-03 16:18:49 +02:00 · 4e17e28f7f
commit 4e17e28f7f
parent fa3138df29
1 changed files with 77 additions and 0 deletions
--- a/tools/corpus_analyzer.pl
+++ b/tools/corpus_analyzer.pl
@ -0,0 +1,77 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import sys, re, codecs, heapq
+import numpy as np
+
+# Number of most frequent sentences listed in the final report.
+n_largest_param = 10
+
+# Opening or closing tag whose name starts with one of the listed HTML element
+# names; the non-greedy `.*?>` then consumes everything up to the first `>`.
+# NOTE(review): there is no word boundary after the tag name, so e.g.
+# `<article>` also matches through the `a` alternative - confirm intended.
+html_pattern = r'<\/?(a|abbr|acronym|address|applet|area|b|base|basefont|bdo|big|blockquote|body|br|button|caption|center|cite|code|col|colgroup|dd|del|dir|div|dfn|dl|dt|em|fieldset|font|form|frame|frameset|h1|h2|h3|h4|h5|h6|head|hr|html|i|iframe|img|input|ins|isindex|kbd|label|legend|li|link|map|menu|meta|noframes|noscript|object|ol|optgroup|option|p|param|pre|q|s|samp|script|select|small|span|strike|strong|style|sub|sup|table|tbody|td|textarea|tfoot|th|thead|title|tr|tt|u|ul|var|xmp).*?>'
+# Two groups of 1-2 digits followed by a 4-digit group, separated by `.`, `-`
+# or `/` (the two separators are not required to be the same character).
+date_pattern = r'[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4}'
+# E-mail-like token: word characters and dots, `@`, then dot-separated words.
+email_pattern = r'[\w\._\d]+@\w+(\.\w+)*'
+# Run of digits, optionally followed by `.` or `,` and another run of digits.
+number_pattern = r'[0-9]+([\.\,][0-9]+)?'
+
+
+corpus_file = codecs.open(sys.argv[1], encoding='utf-8')
+
+sentences = []
+total_sentences = 0
+empty_sentences = 0
+
+print "Reading and hashing corpus file..."
+for line in corpus_file:
+    total_sentences += 1
+    line = re.sub(html_pattern, '', line, re.UNICODE)
+
+    line = re.sub(date_pattern, '[DATE]', line, re.UNICODE)
+    line = re.sub(email_pattern, '[EMAIL]', line, re.UNICODE)
+    line = re.sub(number_pattern, '[NUMBER]', line, re.UNICODE)
+
+    words = re.findall(r'\[[A-Z]+\]|\w+', line, re.UNICODE)
+    if any(re.match(r'\w+', word, re.UNICODE) for word in words):
+        sentences.append(" ".join([word.lower() for word in words]))
+    else:
+        empty_sentences += 1
+        
+    if total_sentences % 10000 == 0:
+        print "    processed %d sentences" % total_sentences
+        
+
+corpus_file.close()
+print "Corpus file read."
+
+
+print "Sorting the corpus"
+sentences.sort()
+
+print "Computing unique values..."
+
+prev_sentence = None
+curr_count = 1
+
+ul = []
+
+for i,sentence in enumerate(sentences):
+    if sentence == prev_sentence:
+        curr_count += 1
+    else:
+        if prev_sentence is not None:
+            ul.append((prev_sentence, curr_count))
+            curr_count = 1
+    prev_sentence = sentence            
+
+#append the last sentence
+ul.append((prev_sentence, curr_count))
+
+#print ul
+print "Unique values computed."
+
+print "Total sentences %d, empty: %d" % (total_sentences, empty_sentences)
+print "Among %d non-empty sentences there are %d unique sentences" % (total_sentences - empty_sentences, len(ul))
+print "%d most common sentences:" % n_largest_param
+
+for sentence, count in heapq.nlargest(n_largest_param, ul, key = lambda x:x[1]):
+    print "%d occurences of sentence: %s" % (count, sentence)
+
+
+