concordia-library/tools/corpus_analyzer.pl

#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys, re, codecs, heapq, os
import numpy as np

from optparse import OptionParser

# Command-line interface: the corpus file to analyze, an optional display
# name, how many of the most frequent sentences to report, and the folder
# that receives the generated *_stats.txt file.
parser = OptionParser()
parser.add_option("-f", "--file", dest="corpus_file_path",
                  help="corpus file, one sentence per line", metavar="FILE")
parser.add_option("-n", "--name", dest="corpus_name",
                  help="name of the corpus")
parser.add_option("-l", "--n-most-frequent", type="int", dest="n_most_frequent", default=10, metavar="N",
                  help="output N most frequent sentences")
parser.add_option("-o", "--output-folder", dest="output_folder", default=".", metavar="PATH",
                  help="folder for storing _stats.txt file")

(options, args) = parser.parse_args()

# The corpus path has no default. Without this check the script would only
# fail further down, when it tries to open None as a file; report the
# problem as a usage error instead.
if options.corpus_file_path is None:
    parser.error("corpus file is required (use -f FILE)")

# Opening or closing HTML tag from a fixed list of element names. The
# lookahead requires the tag name to be followed by whitespace, '/' or '>',
# so that text which merely starts with a tag letter (for example an e-mail
# address in angle brackets, "<bob@example.com>") is not removed as markup.
html_pattern = r'<\/?(a|abbr|acronym|address|applet|area|b|base|basefont|bdo|big|blockquote|body|br|button|caption|center|cite|code|col|colgroup|dd|del|dir|div|dfn|dl|dt|em|fieldset|font|form|frame|frameset|h1|h2|h3|h4|h5|h6|head|hr|html|i|iframe|img|input|ins|isindex|kbd|label|legend|li|link|map|menu|meta|noframes|noscript|object|ol|optgroup|option|p|param|pre|q|s|samp|script|select|small|span|strike|strong|style|sub|sup|table|tbody|td|textarea|tfoot|th|thead|title|tr|tt|u|ul|var|xmp)(?=[\s/>]).*?>'
# Numeric date: one or two digits, one or two digits, four digits, separated
# by '.', '-' or '/' (e.g. 3.10.2015).
date_pattern = r'[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4}'
# E-mail address: word characters and dots, '@', then dot-separated labels.
email_pattern = r'[\w\._\d]+@\w+(\.\w+)*'
# Integer, optionally followed by a '.' or ',' and a fractional part.
number_pattern = r'[0-9]+([\.\,][0-9]+)?'


# Compile every pattern once, outside the loop, with the Unicode flag set
# here. re.sub()'s fourth positional argument is `count`, not `flags`, so
# passing re.UNICODE there capped the substitutions at 32 per line and never
# enabled Unicode matching.
html_re = re.compile(html_pattern, re.UNICODE)
date_re = re.compile(date_pattern, re.UNICODE)
email_re = re.compile(email_pattern, re.UNICODE)
number_re = re.compile(number_pattern, re.UNICODE)
# A token is either an upper-case placeholder such as [DATE] or a plain word.
token_re = re.compile(r'\[[A-Z]+\]|\w+', re.UNICODE)
word_re = re.compile(r'\w+', re.UNICODE)

sentences = []        # normalized, lower-cased, space-joined sentences
total_sentences = 0   # every line read from the corpus
empty_sentences = 0   # lines that contain no real word after normalization

print("Reading and hashing corpus file...")
# `with` guarantees the file is closed even if a line fails to decode.
with codecs.open(options.corpus_file_path, encoding='utf-8') as corpus_file:
    for line in corpus_file:
        total_sentences += 1
        line = html_re.sub('', line)

        # Dates and e-mails are replaced before numbers so that their digits
        # are not turned into [NUMBER] placeholders first.
        line = date_re.sub('[DATE]', line)
        line = email_re.sub('[EMAIL]', line)
        line = number_re.sub('[NUMBER]', line)

        words = token_re.findall(line)
        # Placeholders start with '[' and therefore do not match word_re; a
        # line made only of placeholders (or of nothing) counts as empty.
        if any(word_re.match(word) for word in words):
            sentences.append(" ".join([word.lower() for word in words]))
        else:
            empty_sentences += 1

        if total_sentences % 10000 == 0:
            print("    processed %d sentences" % total_sentences)

print("Corpus file read.")


print("Sorting the corpus")
# Sorting puts identical sentences next to each other, so the unique
# sentences and their counts can be collected in a single pass.
sentences.sort()

print("Computing unique values...")

prev_sentence = None
curr_count = 0

# List of (sentence, number of occurrences) pairs, one per unique sentence.
ul = []

for sentence in sentences:
    if sentence == prev_sentence:
        curr_count += 1
        continue
    # A new run starts: store the finished one (if any) and reset the count.
    if prev_sentence is not None:
        ul.append((prev_sentence, curr_count))
    prev_sentence = sentence
    curr_count = 1

# Store the last run. The guard keeps an empty corpus from producing a bogus
# (None, 1) entry, which would be reported as a unique sentence.
if prev_sentence is not None:
    ul.append((prev_sentence, curr_count))

print("Unique values computed.")


print("Total sentences %d, empty: %d" % (total_sentences, empty_sentences))
print("Among %d non-empty sentences there are %d unique sentences" % (total_sentences - empty_sentences, len(ul)))
print("%d most common sentences:" % options.n_most_frequent)


# Field separator used on every line of the stats file.
sep = '@#@'

# Strip the actual extension, whatever its length; slicing off a fixed four
# characters truncated names whose extension is not exactly three letters.
corpus_path_stem = os.path.splitext(options.corpus_file_path)[0]
stats_file_name = os.path.splitext(os.path.basename(options.corpus_file_path))[0] + '_stats.txt'
stats_file_path = os.path.join(options.output_folder, stats_file_name)

# Without an explicit --name, the corpus path minus its extension is used.
corpus_name = options.corpus_name if options.corpus_name is not None else corpus_path_stem

# NOTE: the file is opened in binary mode and the lines are assembled as byte
# strings; sentences are unicode and are encoded to UTF-8 explicitly. This
# concatenation relies on Python 2 string semantics.
with open(stats_file_path, 'wb') as output_file:
    # Header line: name, total line count, empty line count, unique count.
    output_file.write(corpus_name+sep+str(total_sentences)+sep+str(empty_sentences)+sep+str(len(ul))+'\n')

    # One line per reported sentence, most frequent first: count, sentence.
    for sentence, count in heapq.nlargest(options.n_most_frequent, ul, key = lambda x:x[1]):
        print("%d occurences of sentence: %s" % (count, sentence))
        output_file.write(str(count)+sep+sentence.encode('UTF-8')+'\n')
working corpus analyzer 2015-10-03 16:18:49 +02:00			`#!/usr/bin/python`
			`# -- coding: utf-8 --`

corpus analyzer 2015-10-04 16:24:58 +02:00			`import sys, re, codecs, heapq, os`
working corpus analyzer 2015-10-03 16:18:49 +02:00			`import numpy as np`

corpus analyzer 2015-10-04 16:24:58 +02:00			`from optparse import OptionParser`

			`parser = OptionParser()`
			`parser.add_option("-f", "--file", dest="corpus_file_path",`
			`help="corpus file, one sentence per line", metavar="FILE")`
			`parser.add_option("-n", "--name", dest="corpus_name",`
			`help="name of the corpus")`
			`parser.add_option("-l", "--n-most-frequent", type="int", dest="n_most_frequent", default=10, metavar="N",`
			`help="output N most frequent sentences")`
			`parser.add_option("-o", "--output-folder", dest="output_folder", default=".", metavar="PATH",`
			`help="folder for storing _stats.txt file")`

			`(options, args) = parser.parse_args()`
working corpus analyzer 2015-10-03 16:18:49 +02:00
			`html_pattern = r'<\/?(a\|abbr\|acronym\|address\|applet\|area\|b\|base\|basefont\|bdo\|big\|blockquote\|body\|br\|button\|caption\|center\|cite\|code\|col\|colgroup\|dd\|del\|dir\|div\|dfn\|dl\|dt\|em\|fieldset\|font\|form\|frame\|frameset\|h1\|h2\|h3\|h4\|h5\|h6\|head\|hr\|html\|i\|iframe\|img\|input\|ins\|isindex\|kbd\|label\|legend\|li\|link\|map\|menu\|meta\|noframes\|noscript\|object\|ol\|optgroup\|option\|p\|param\|pre\|q\|s\|samp\|script\|select\|small\|span\|strike\|strong\|style\|sub\|sup\|table\|tbody\|td\|textarea\|tfoot\|th\|thead\|title\|tr\|tt\|u\|ul\|var\|xmp).*?>'`
			`date_pattern = r'[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4}'`
			`email_pattern = r'[\w\._\d]+@\w+(\.\w+)*'`
			`number_pattern = r'[0-9]+([\.\,][0-9]+)?'`


corpus analyzer 2015-10-04 16:24:58 +02:00			`corpus_file = codecs.open(options.corpus_file_path, encoding='utf-8')`
working corpus analyzer 2015-10-03 16:18:49 +02:00
			`sentences = []`
			`total_sentences = 0`
			`empty_sentences = 0`

			`print "Reading and hashing corpus file..."`
			`for line in corpus_file:`
			`total_sentences += 1`
			`line = re.sub(html_pattern, '', line, re.UNICODE)`

			`line = re.sub(date_pattern, '[DATE]', line, re.UNICODE)`
			`line = re.sub(email_pattern, '[EMAIL]', line, re.UNICODE)`
			`line = re.sub(number_pattern, '[NUMBER]', line, re.UNICODE)`

			`words = re.findall(r'\[[A-Z]+\]\|\w+', line, re.UNICODE)`
			`if any(re.match(r'\w+', word, re.UNICODE) for word in words):`
			`sentences.append(" ".join([word.lower() for word in words]))`
			`else:`
			`empty_sentences += 1`

			`if total_sentences % 10000 == 0:`
			`print " processed %d sentences" % total_sentences`


			`corpus_file.close()`
			`print "Corpus file read."`


			`print "Sorting the corpus"`
			`sentences.sort()`

			`print "Computing unique values..."`

			`prev_sentence = None`
			`curr_count = 1`

			`ul = []`

			`for i,sentence in enumerate(sentences):`
			`if sentence == prev_sentence:`
			`curr_count += 1`
			`else:`
			`if prev_sentence is not None:`
			`ul.append((prev_sentence, curr_count))`
			`curr_count = 1`
			`prev_sentence = sentence`

			`#append the last sentence`
			`ul.append((prev_sentence, curr_count))`

			`#print ul`
			`print "Unique values computed."`

corpus analyzer 2015-10-04 16:24:58 +02:00
working corpus analyzer 2015-10-03 16:18:49 +02:00			`print "Total sentences %d, empty: %d" % (total_sentences, empty_sentences)`
			`print "Among %d non-empty sentences there are %d unique sentences" % (total_sentences - empty_sentences, len(ul))`
corpus analyzer 2015-10-04 16:24:58 +02:00			`print "%d most common sentences:" % options.n_most_frequent`

working corpus analyzer 2015-10-03 16:18:49 +02:00
corpus analyzer 2015-10-04 16:24:58 +02:00			`sep = '@#@'`
working corpus analyzer 2015-10-03 16:18:49 +02:00
corpus analyzer 2015-10-04 16:24:58 +02:00			`with open(options.output_folder+'/'+os.path.basename(options.corpus_file_path)[:-4]+'_stats.txt', 'wb') as output_file:`


			`corpus_name = options.corpus_name if options.corpus_name is not None else options.corpus_file_path[:-4]`
			`output_file.write(corpus_name+sep+str(total_sentences)+sep+str(empty_sentences)+sep+str(len(ul))+'\n')`
working corpus analyzer 2015-10-03 16:18:49 +02:00
corpus analyzer 2015-10-04 16:24:58 +02:00			`for sentence, count in heapq.nlargest(options.n_most_frequent, ul, key = lambda x:x[1]):`
			`print "%d occurences of sentence: %s" % (count, sentence)`
			`output_file.write(str(count)+sep+sentence.encode('UTF-8')+'\n')`
working corpus analyzer 2015-10-03 16:18:49 +02:00