99 lines
3.6 KiB
Python
Executable File
99 lines
3.6 KiB
Python
Executable File
#!/usr/bin/python
|
|
# -*- coding: utf-8 -*-
|
|
|
|
import sys, re, codecs, heapq, os
|
|
import numpy as np
|
|
|
|
from optparse import OptionParser
|
|
|
|
parser = OptionParser()
|
|
parser.add_option("-f", "--file", dest="corpus_file_path",
|
|
help="corpus file, one sentence per line", metavar="FILE")
|
|
parser.add_option("-n", "--name", dest="corpus_name",
|
|
help="name of the corpus")
|
|
parser.add_option("-l", "--n-most-frequent", type="int", dest="n_most_frequent", default=10, metavar="N",
|
|
help="output N most frequent sentences")
|
|
parser.add_option("-o", "--output-folder", dest="output_folder", default=".", metavar="PATH",
|
|
help="folder for storing _stats.txt file")
|
|
|
|
(options, args) = parser.parse_args()
|
|
|
|
# Tag names recognised as HTML markup. The order is significant: it is the
# order of the alternatives in the generated regular expression.
_HTML_TAG_NAMES = """
    a abbr acronym address applet area b base basefont bdo big blockquote
    body br button caption center cite code col colgroup dd del dir div dfn
    dl dt em fieldset font form frame frameset h1 h2 h3 h4 h5 h6 head hr
    html i iframe img input ins isindex kbd label legend li link map menu
    meta noframes noscript object ol optgroup option p param pre q s samp
    script select small span strike strong style sub sup table tbody td
    textarea tfoot th thead title tr tt u ul var xmp
""".split()

# An opening or closing tag: '<', optional '/', one of the names above, then
# the shortest run of characters up to the next '>'.
# NOTE(review): there is no word boundary after the tag name, so text such
# as "<banana>" also matches (via the "b" alternative) -- confirm intended.
html_pattern = r'<\/?(' + '|'.join(_HTML_TAG_NAMES) + r').*?>'

# Day/month/four-digit-year style dates separated by '.', '-' or '/'.
date_pattern = r'[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4}'

# Local part, '@', then a domain made of dot-separated word chunks.
email_pattern = r'[\w\._\d]+@\w+(\.\w+)*'

# Integer, optionally followed by '.' or ',' and more digits.
number_pattern = r'[0-9]+([\.\,][0-9]+)?'
|
|
|
|
|
|
# Read the corpus (one sentence per line, UTF-8), replace markup, dates,
# e-mail addresses and numbers with placeholders, and collect every line that
# still contains at least one word as a lower-cased, space-joined token string.
#
# Results left at module level:
#   sentences        normalised non-empty sentences, in file order
#   total_sentences  number of lines read
#   empty_sentences  number of lines with no word token after normalisation

# Fail with a usage message rather than an obscure error from codecs.open()
# when -f/--file was not supplied.
if not options.corpus_file_path:
    parser.error("no corpus file given (use -f FILE)")

# Compile once, outside the loop, with re.UNICODE as a real flag. The fourth
# positional parameter of re.sub() is `count`, not `flags`, so passing
# re.UNICODE there silently capped each line at 32 replacements and never
# enabled Unicode matching.
_substitutions = (
    (re.compile(html_pattern, re.UNICODE), ''),
    (re.compile(date_pattern, re.UNICODE), '[DATE]'),
    (re.compile(email_pattern, re.UNICODE), '[EMAIL]'),
    (re.compile(number_pattern, re.UNICODE), '[NUMBER]'),
)
# A token is either a bracketed placeholder such as [DATE] or a run of
# word characters.
_token_re = re.compile(r'\[[A-Z]+\]|\w+', re.UNICODE)
_word_re = re.compile(r'\w+', re.UNICODE)

sentences = []
total_sentences = 0
empty_sentences = 0

print("Reading and hashing corpus file...")
# The with-statement closes the file even if decoding or matching raises.
with codecs.open(options.corpus_file_path, encoding='utf-8') as corpus_file:
    for line in corpus_file:
        total_sentences += 1

        # Order matters: dates must be replaced before bare numbers.
        for _pattern, _replacement in _substitutions:
            line = _pattern.sub(_replacement, line)

        words = _token_re.findall(line)
        # Placeholders start with '[' and therefore never match _word_re, so
        # a line consisting only of placeholders counts as empty.
        if any(_word_re.match(word) for word in words):
            sentences.append(" ".join(word.lower() for word in words))
        else:
            empty_sentences += 1

        if total_sentences % 10000 == 0:
            print(" processed %d sentences" % total_sentences)

print("Corpus file read.")
|
|
|
|
|
|
# Sort the normalised sentences so that duplicates become adjacent, then
# collapse each run of equal sentences into one (sentence, count) pair.
# The pairs are collected in `ul`, in sorted sentence order.
print("Sorting the corpus")
sentences.sort()

print("Computing unique values...")

ul = []
prev_sentence = None
curr_count = 0

for sentence in sentences:
    if sentence == prev_sentence:
        curr_count += 1
        continue
    # A new run starts: flush the finished one (there is none before the
    # very first sentence).
    if prev_sentence is not None:
        ul.append((prev_sentence, curr_count))
    prev_sentence = sentence
    curr_count = 1

# Flush the last run. The guard keeps `ul` empty for an empty corpus instead
# of recording a bogus (None, 1) entry.
if prev_sentence is not None:
    ul.append((prev_sentence, curr_count))

print("Unique values computed.")
|
|
|
|
|
|
# Print the summary and write <output_folder>/<corpus basename>_stats.txt.
# File layout ('@#@' separates fields):
#   line 1:     corpus name, total sentences, empty sentences, unique sentences
#   following:  count and sentence for each of the N most frequent sentences
print("Total sentences %d, empty: %d" % (total_sentences, empty_sentences))
print("Among %d non-empty sentences there are %d unique sentences"
      % (total_sentences - empty_sentences, len(ul)))
print("%d most common sentences:" % options.n_most_frequent)

sep = '@#@'

# splitext() removes whatever extension the corpus file has (or none at all);
# slicing off the last four characters was only right for three-letter
# extensions such as ".txt".
_corpus_stem = os.path.splitext(options.corpus_file_path)[0]
_stats_path = os.path.join(options.output_folder,
                           os.path.basename(_corpus_stem) + '_stats.txt')

# Binary mode: sentences are explicitly encoded as UTF-8 before writing.
with open(_stats_path, 'wb') as output_file:
    # Without an explicit --name, the corpus path minus its extension is used.
    if options.corpus_name is not None:
        corpus_name = options.corpus_name
    else:
        corpus_name = _corpus_stem

    output_file.write(corpus_name + sep + str(total_sentences) + sep
                      + str(empty_sentences) + sep + str(len(ul)) + '\n')

    most_frequent = heapq.nlargest(options.n_most_frequent, ul,
                                   key=lambda pair: pair[1])
    for sentence, count in most_frequent:
        print("%d occurences of sentence: %s" % (count, sentence))
        output_file.write(str(count) + sep + sentence.encode('UTF-8') + '\n')
|
|
|