2015-10-04 16:24:58 +02:00
|
|
|
#!/usr/bin/python
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
from os import listdir
|
|
|
|
import numpy as np
|
|
|
|
import matplotlib.pyplot as plt
|
2015-10-06 13:34:03 +02:00
|
|
|
import sys
|
2015-10-04 16:24:58 +02:00
|
|
|
|
|
|
|
|
|
|
|
# Directory that is scanned for per-corpus statistics files.
input_dir = "stats"
# Directory that receives the generated LaTeX tables and the bar graph.
output_dir = "figures"
|
|
|
|
|
|
|
|
def get_corpus_data(corpus_stats_file):
    """Parse one corpus statistics file into a dictionary.

    The first line of the file holds four fields separated by '@#@': the
    corpus name, the total sentence count, the empty sentence count and the
    unique sentence count.  Every following non-blank line holds an
    occurrence count and a sentence separated by the same marker.

    Args:
        corpus_stats_file: path of the statistics file to read.

    Returns:
        A dict with the keys "name" (str), "total", "empty" and "unique"
        (int) and "most_frequent" (list of (count, sentence) tuples, in the
        order they appear in the file).

    Raises:
        IndexError: if the file contains no lines at all.
        ValueError: if a line does not have the expected number of fields
            or a count field is not an integer.
    """
    separator = '@#@'

    with open(corpus_stats_file) as csf:
        # Strip only the newline: slicing off the last character would eat a
        # real character when the final line has no trailing newline.
        file_lines = [raw_line.rstrip('\n') for raw_line in csf]

    name, total, empty, unique = file_lines[0].split(separator)

    most_frequent = []
    for line in file_lines[1:]:
        if not line:
            # A blank (typically trailing) line carries no record.
            continue
        # Split on the first separator only, so a sentence that itself
        # contains '@#@' is kept whole.
        count, sentence = line.split(separator, 1)
        most_frequent.append((int(count), sentence))

    return {
        "name": name,
        "total": int(total),
        "empty": int(empty),
        "unique": int(unique),
        "most_frequent": most_frequent,
    }
|
|
|
|
|
|
|
|
|
|
|
|
# Parse every statistics file found in the input directory, in the order
# listdir() reports them.
corpora = [
    get_corpus_data(input_dir + '/' + stats_name)
    for stats_name in listdir(input_dir)
]
|
|
|
|
|
|
|
|
|
|
|
|
# stats table
# Writes <output_dir>/stats_table.tex: a LaTeX table with one row per corpus
# giving the total sentence count and the non-empty and unique counts, each
# followed by its percentage of the total.
with open(output_dir+'/stats_table.tex', 'w') as stats_table:
    stats_table.write(r'\begin{table}'+'\n')
    stats_table.write(r'\begin{center}'+'\n')
    stats_table.write(r'\begin{tabular}{|l|l|l|l|}'+'\n')
    stats_table.write(r'\hline'+'\n')
    stats_table.write(r'Corpus name & Total sentences & Non-empty & Unique\\'+'\n')
    stats_table.write(r'\hline\hline'+'\n')

    for corpus in corpora:
        # float() keeps the division fractional under Python 2 as well.
        non_empty_percentage = float(100*(corpus["total"] - corpus["empty"]))/corpus["total"]
        unique_percentage = float(100*corpus["unique"])/corpus["total"]
        # '\\%%' emits a literal '\%' (an escaped percent sign for LaTeX).
        # The backslash is doubled because '\%' is not a valid Python escape
        # sequence and triggers a warning on recent interpreters.
        stats_table.write("%s & %d & %d (%.2f\\%%) & %d (%.2f\\%%) \\\\\n" % (
            corpus["name"],
            corpus["total"],
            corpus["total"] - corpus["empty"],
            non_empty_percentage,
            corpus["unique"],
            unique_percentage,
        ))

    stats_table.write(r'\hline'+'\n')
    stats_table.write(r'\end{tabular}'+'\n')
    stats_table.write(r'\caption{Corpora repetition statistics}'+'\n')
    stats_table.write(r'\label{tab:repetition_statistics}'+'\n')
    stats_table.write(r'\end{center}'+'\n')
    stats_table.write(r'\end{table}'+'\n')
|
|
|
|
|
|
|
|
# most frequent sentences table
# Writes one file per corpus, <output_dir>/<corpus name>_freq.tex, holding a
# LaTeX table of the (occurrence count, sentence) pairs read for that corpus.
# NOTE(review): corpus names and sentences are written verbatim; if they can
# contain LaTeX special characters (&, %, _, #, ...) they need escaping,
# unless the statistics files are already escaped upstream -- confirm.
for corpus in corpora:
    with open(output_dir+'/'+corpus["name"]+'_freq.tex', 'w') as freq_table:
        freq_table.write(r'\begin{table}'+'\n')
        freq_table.write(r'\begin{center}'+'\n')
        freq_table.write(r'\begin{tabular}{|l|l|}'+'\n')
        freq_table.write(r'\hline'+'\n')
        # Header spelling corrected (was "Occurences").
        freq_table.write(r'Occurrences & Sentence\\'+'\n')
        freq_table.write(r'\hline\hline'+'\n')

        for data in corpus["most_frequent"]:
            # data is a (count, sentence) tuple, consumed directly by '%'.
            freq_table.write("%d & %s\\\\\n" % data)

        freq_table.write(r'\hline'+'\n')
        freq_table.write(r'\end{tabular}'+'\n')
        freq_table.write(r'\caption{Most frequent sentences in the corpus '+corpus["name"]+'}\n')
        freq_table.write(r'\label{tab:freq_'+corpus["name"]+'}\n')
        freq_table.write(r'\end{center}'+'\n')
        freq_table.write(r'\end{table}'+'\n')
|
|
|
|
|
|
|
|
# plot
# Draws one stacked bar per corpus showing the percentage of unique,
# repeated and empty sentences, and saves it as <output_dir>/bar_graph.eps.
N = len(corpora)

# Percentages of each corpus' total; float() keeps the division fractional
# under Python 2 as well.
uniques = [float(100*corpus["unique"]) / corpus["total"] for corpus in corpora]
repeated = [float(100*(corpus["total"] - corpus["unique"] - corpus["empty"])) / corpus["total"] for corpus in corpora]
empty = [float(100*corpus["empty"]) / corpus["total"] for corpus in corpora]

ind = np.arange(N)    # the x locations for the groups
width = 0.35          # the width of the bars: can also be len(x) sequence

# align='edge' is passed explicitly: the tick positions below (ind + width/2)
# assume each bar starts at its x location, which stopped being matplotlib's
# default alignment in version 2.0.
p1 = plt.bar(ind, uniques, width, color='#009900', align='edge')
p2 = plt.bar(ind, repeated, width, color='#99FF66', bottom=uniques, align='edge')
# The third segment sits on top of the first two, so its base is their sum.
p3 = plt.bar(ind, empty, width, color='#999966',
             bottom=[sum(x) for x in zip(repeated, uniques)], align='edge')

# Put each corpus label under the middle of its bar.
plt.xticks(ind+width/2., [corpus["name"] for corpus in corpora])
plt.yticks(np.arange(0, 101, 10))
plt.legend((p1[0], p2[0], p3[0]), ('unique', 'repeated', 'empty'))

plt.savefig(output_dir+'/bar_graph.eps', format='eps')
|