corpus analyzer
This commit is contained in:
parent
2601dc83bf
commit
96c74c47ac
@ -1,10 +1,22 @@
|
||||
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Corpus analyzer: reads a one-sentence-per-line corpus, counts duplicate
# sentences, and writes a '@#@'-separated *_stats.txt summary file.
# NOTE(review): reconstructed from a diff view; the pre-change line
# (`import sys, re, codecs, heapq` without os) was dropped in this commit.

import sys, re, codecs, heapq, os
import numpy as np

# NOTE(review): apparently superseded by options.n_most_frequent below —
# confirm whether any remaining code still reads this.
n_largest_param = 10

from optparse import OptionParser

parser = OptionParser()
parser.add_option("-f", "--file", dest="corpus_file_path",
                  help="corpus file, one sentence per line", metavar="FILE")
parser.add_option("-n", "--name", dest="corpus_name",
                  help="name of the corpus")
parser.add_option("-l", "--n-most-frequent", type="int", dest="n_most_frequent", default=10, metavar="N",
                  help="output N most frequent sentences")
parser.add_option("-o", "--output-folder", dest="output_folder", default=".", metavar="PATH",
                  help="folder for storing _stats.txt file")

(options, args) = parser.parse_args()

# Matches any opening or closing tag from the (HTML 4) tag list below.
html_pattern = r'<\/?(a|abbr|acronym|address|applet|area|b|base|basefont|bdo|big|blockquote|body|br|button|caption|center|cite|code|col|colgroup|dd|del|dir|div|dfn|dl|dt|em|fieldset|font|form|frame|frameset|h1|h2|h3|h4|h5|h6|head|hr|html|i|iframe|img|input|ins|isindex|kbd|label|legend|li|link|map|menu|meta|noframes|noscript|object|ol|optgroup|option|p|param|pre|q|s|samp|script|select|small|span|strike|strong|style|sub|sup|table|tbody|td|textarea|tfoot|th|thead|title|tr|tt|u|ul|var|xmp).*?>'
# Numeric dates like 31.12.2014, 31-12-2014 or 31/12/2014.
date_pattern = r'[0-9]{1,2}[\.\-/][0-9]{1,2}[\.\-/][0-9]{4}'
|
||||
@ -12,7 +24,7 @@ email_pattern = r'[\w\._\d]+@\w+(\.\w+)*'
|
||||
# Integers, and decimals using either '.' or ',' as the decimal mark.
number_pattern = r'[0-9]+([\.\,][0-9]+)?'


# Open the corpus as UTF-8 text; codecs.open yields unicode lines (Python 2).
# NOTE(review): reconstructed from a diff view; the pre-change line that read
# the path from sys.argv[1] was replaced by the OptionParser value here.
corpus_file = codecs.open(options.corpus_file_path, encoding='utf-8')

sentences = []
total_sentences = 0
|
||||
@ -66,12 +78,21 @@ ul.append((prev_sentence, curr_count))
|
||||
#print ul
print "Unique values computed."


print "Total sentences %d, empty: %d" % (total_sentences, empty_sentences)
print "Among %d non-empty sentences there are %d unique sentences" % (total_sentences - empty_sentences, len(ul))
print "%d most common sentences:" % options.n_most_frequent


# Field separator used by the *_stats.txt summary format (and by the
# companion prepare_corpus_figures.py reader).
sep = '@#@'

# NOTE(review): [:-4] assumes a 3-character file extension ('.txt' etc.);
# os.path.splitext would be safer — confirm the corpus file naming scheme.
with open(options.output_folder+'/'+os.path.basename(options.corpus_file_path)[:-4]+'_stats.txt', 'wb') as output_file:


    # Fall back to the (extension-stripped) corpus path when no -n name given.
    corpus_name = options.corpus_name if options.corpus_name is not None else options.corpus_file_path[:-4]
    # Header line: name@#@total@#@empty@#@unique
    output_file.write(corpus_name+sep+str(total_sentences)+sep+str(empty_sentences)+sep+str(len(ul))+'\n')

    # ul holds (sentence, count) pairs built earlier; emit the N most frequent.
    for sentence, count in heapq.nlargest(options.n_most_frequent, ul, key = lambda x:x[1]):
        print "%d occurences of sentence: %s" % (count, sentence)
        output_file.write(str(count)+sep+sentence.encode('UTF-8')+'\n')
|
91
tools/prepare_corpus_figures.py
Executable file
91
tools/prepare_corpus_figures.py
Executable file
@ -0,0 +1,91 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from os import listdir
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
# Directory containing the *_stats.txt files produced by the corpus analyzer.
input_dir = 'stats'
# Directory where the generated LaTeX tables are written; assumed to exist.
output_dir = 'figures'
||||
|
||||
def get_corpus_data(corpus_stats_file):
    """Parse one *_stats.txt file produced by the corpus analyzer.

    The format is '@#@'-separated: the first line holds
    name@#@total@#@empty@#@unique; every following line holds
    count@#@sentence.

    Returns a dict with keys "name" (str), "total"/"empty"/"unique" (int)
    and "most_frequent", a list of (count, sentence) tuples in file order.
    """
    with open(corpus_stats_file) as csf:
        file_lines = csf.readlines()

    # rstrip('\n') instead of [:-1]: slicing off the last character silently
    # chops a data character when the final line has no trailing newline.
    name, total, empty, unique = file_lines[0].rstrip('\n').split('@#@')

    most_frequent = []
    for line in file_lines[1:]:
        # maxsplit=1 keeps a sentence intact even if it happens to contain
        # the '@#@' separator itself.
        count, sentence = line.rstrip('\n').split('@#@', 1)
        most_frequent.append((int(count), sentence))

    return {"name": name, "total": int(total), "empty": int(empty),
            "unique": int(unique), "most_frequent": most_frequent}
|
||||
|
||||
|
||||
# Load every per-corpus stats file found in input_dir.
corpora = [get_corpus_data(input_dir + '/' + fname) for fname in listdir(input_dir)]
|
||||
|
||||
|
||||
# stats table: one LaTeX tabular row per corpus with absolute counts and
# percentages of non-empty and unique sentences.
with open(output_dir+'/stats_table.tex', 'w') as stats_table:
    stats_table.write(r'\begin{table}'+'\n')
    stats_table.write(r'\begin{center}'+'\n')
    stats_table.write(r'\begin{tabular}{|l|l|l|l|}'+'\n')
    stats_table.write(r'\hline'+'\n')
    stats_table.write(r'Corpus name & Total sentences & Non-empty & Unique\\'+'\n')
    stats_table.write(r'\hline\hline'+'\n')
    for corpus in corpora:
        non_empty = corpus["total"] - corpus["empty"]
        # BUG FIX: the original computed float(100*total - empty)/total,
        # i.e. roughly 100 - empty/total, because the parenthesis was
        # misplaced; the intended value is 100*(total - empty)/total.
        # NOTE(review): still divides by zero for an empty corpus — confirm
        # whether total == 0 can occur in practice.
        non_empty_percentage = 100.0 * non_empty / corpus["total"]
        unique_percentage = 100.0 * corpus["unique"] / corpus["total"]
        stats_table.write("%s & %d & %d (%.2f%%) & %d (%.2f%%) \\\\\n" % (corpus["name"], corpus["total"], non_empty, non_empty_percentage, corpus["unique"], unique_percentage))

    stats_table.write(r'\hline'+'\n')
    stats_table.write(r'\end{tabular}'+'\n')
    stats_table.write(r'\caption{Corpora repetition statistics}'+'\n')
    stats_table.write(r'\label{tab:repetition_statistics}'+'\n')
    stats_table.write(r'\end{center}'+'\n')
    stats_table.write(r'\end{table}'+'\n')
|
||||
|
||||
# most frequent sentences table: one .tex file per corpus listing its most
# frequent sentences with their occurrence counts.
for corpus in corpora:
    with open(output_dir+'/'+corpus["name"]+'_freq.tex', 'w') as freq_table:
        freq_table.write(r'\begin{table}'+'\n')
        freq_table.write(r'\begin{center}'+'\n')
        freq_table.write(r'\begin{tabular}{|l|l|}'+'\n')
        freq_table.write(r'\hline'+'\n')
        # Typo fix in the generated header: "Occurences" -> "Occurrences".
        freq_table.write(r'Occurrences & Sentence\\'+'\n')
        freq_table.write(r'\hline\hline'+'\n')
        for data in corpus["most_frequent"]:
            # BUG FIX: data rows lacked the LaTeX row terminator '\\',
            # so the generated tabular would not compile into rows.
            # NOTE(review): sentences are written verbatim — LaTeX special
            # characters (&, %, _, ...) are not escaped; confirm the corpora
            # cannot contain them or add escaping.
            freq_table.write("%d & %s\\\\\n" % data)
        freq_table.write(r'\hline'+'\n')
        freq_table.write(r'\end{tabular}'+'\n')
        freq_table.write(r'\caption{Most frequent sentences in the corpus '+corpus["name"]+'}\n')
        freq_table.write(r'\label{tab:freq_'+corpus["name"]+'}\n')
        freq_table.write(r'\end{center}'+'\n')
        freq_table.write(r'\end{table}'+'\n')
|
||||
|
||||
# plot
# NOTE(review): everything below is the stock matplotlib stacked-bar demo
# ("scores by group and gender") with hard-coded data — it is not derived
# from the corpora loaded above. Presumably a placeholder to be replaced
# with real corpus statistics before generating figures.

N = 5
menMeans = (20, 35, 30, 35, 27)
womenMeans = (25, 32, 34, 20, 25)
menStd = (2, 3, 4, 1, 2)
womenStd = (3, 5, 2, 3, 3)
ind = np.arange(N)    # the x locations for the groups
width = 0.35       # the width of the bars: can also be len(x) sequence

# NOTE(review): the yerr values look swapped (p1/men uses womenStd and
# vice versa) relative to the usual example — harmless for placeholder
# data, but confirm before reusing with real values.
p1 = plt.bar(ind, menMeans, width, color='r', yerr=womenStd)
p2 = plt.bar(ind, womenMeans, width, color='y',
             bottom=menMeans, yerr=menStd)

plt.ylabel('Scores')
plt.title('Scores by group and gender')
plt.xticks(ind+width/2., ('G1', 'G2', 'G3', 'G4', 'G5') )
plt.yticks(np.arange(0,81,10))
plt.legend( (p1[0], p2[0]), ('Men', 'Women') )


# NOTE(review): saved to the current directory, not to output_dir like the
# .tex files above — confirm whether 'figures/' was intended.
plt.savefig('bar_graph.eps', format='eps')
|
Loading…
Reference in New Issue
Block a user