#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Generate LaTeX tables (and a sample bar chart) from corpus statistics.

Every file in ``input_dir`` is parsed with :func:`get_corpus_data`.  The
script then writes to ``output_dir``:

* ``stats_table.tex`` -- one row per corpus with total / non-empty / unique
  sentence counts and percentages;
* ``<corpus name>_freq.tex`` -- one table per corpus listing its most
  frequent sentences.

Finally a stacked bar chart is saved as ``bar_graph.eps`` in the current
working directory.
"""

from os import listdir
from os.path import join

import numpy as np
import matplotlib.pyplot as plt

input_dir = 'stats'
output_dir = 'figures'

# Separator between the fields of every line of a statistics file.
FIELD_SEPARATOR = '@#@'


def get_corpus_data(corpus_stats_file):
    """Parse one corpus statistics file.

    The first line must read ``name@#@total@#@empty@#@unique``; every
    following non-empty line must read ``count@#@sentence``.

    Args:
        corpus_stats_file: path of the statistics file to read.

    Returns:
        A dict with the keys ``"name"`` (str), ``"total"``, ``"empty"``,
        ``"unique"`` (ints) and ``"most_frequent"`` (a list of
        ``(count, sentence)`` tuples in file order).

    Raises:
        IndexError: if the file contains no lines at all.
        ValueError: if the header does not have exactly four fields, a data
            line has no separator, or a numeric field is not an integer.
    """
    with open(corpus_stats_file) as csf:
        # Strip only the line terminator: slicing off the last character
        # would eat real data when the final line has no trailing newline.
        file_lines = [line.rstrip('\n') for line in csf]

    name, total, empty, unique = file_lines[0].split(FIELD_SEPARATOR)

    most_frequent = []
    for line in file_lines[1:]:
        if not line:
            # Tolerate blank lines (e.g. a trailing empty line).
            continue
        # Split once so a sentence that itself contains the separator
        # is kept intact instead of breaking the unpacking.
        count, sentence = line.split(FIELD_SEPARATOR, 1)
        most_frequent.append((int(count), sentence))

    return {
        "name": name,
        "total": int(total),
        "empty": int(empty),
        "unique": int(unique),
        "most_frequent": most_frequent,
    }


def _percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0."""
    if not whole:
        return 0.0
    return 100.0 * part / whole


def _write_lines(handle, lines):
    """Write every string of ``lines`` to ``handle``, each followed by a newline."""
    handle.writelines(line + '\n' for line in lines)


def _write_stats_table(corpora, path):
    """Write the LaTeX table summarising all ``corpora`` to ``path``."""
    with open(path, 'w') as stats_table:
        _write_lines(stats_table, [
            r'\begin{table}',
            r'\begin{center}',
            r'\begin{tabular}{|l|l|l|l|}',
            r'\hline',
            r'Corpus name & Total sentences & Non-empty & Unique\\',
            r'\hline\hline',
        ])
        for corpus in corpora:
            total = corpus["total"]
            non_empty = total - corpus["empty"]
            # The percentage is taken of the non-empty *count*; the factor
            # 100 must apply to the difference, not to the total alone.
            stats_table.write(
                "%s & %d & %d (%.2f%%) & %d (%.2f%%) \\\\\n" % (
                    corpus["name"],
                    total,
                    non_empty,
                    _percentage(non_empty, total),
                    corpus["unique"],
                    _percentage(corpus["unique"], total),
                )
            )
        _write_lines(stats_table, [
            r'\hline',
            r'\end{tabular}',
            r'\caption{Corpora repetition statistics}',
            r'\label{tab:repetition_statistics}',
            r'\end{center}',
            r'\end{table}',
        ])


def _write_frequency_table(corpus, path):
    """Write the LaTeX table of the most frequent sentences of ``corpus`` to ``path``."""
    with open(path, 'w') as freq_table:
        _write_lines(freq_table, [
            r'\begin{table}',
            r'\begin{center}',
            r'\begin{tabular}{|l|l|}',
            r'\hline',
            r'Occurences & Sentence\\',
            r'\hline\hline',
        ])
        for data in corpus["most_frequent"]:
            # NOTE(review): rows are emitted without a LaTeX row terminator
            # (``\\``) and without escaping -- presumably the statistics
            # files already provide LaTeX-ready sentences; confirm.
            freq_table.write("%d & %s\n" % data)
        _write_lines(freq_table, [
            r'\hline',
            r'\end{tabular}',
            r'\caption{Most frequent sentences in the corpus ' + corpus["name"] + '}',
            r'\label{tab:freq_' + corpus["name"] + '}',
            r'\end{center}',
            r'\end{table}',
        ])


# listdir() returns entries in arbitrary order; sort them so that the
# generated tables are identical from one run to the next.
corpora = [get_corpus_data(join(input_dir, f)) for f in sorted(listdir(input_dir))]

# stats table
_write_stats_table(corpora, join(output_dir, 'stats_table.tex'))

# most frequent sentences table
for corpus in corpora:
    _write_frequency_table(corpus, join(output_dir, corpus["name"] + '_freq.tex'))

# plot
# NOTE(review): the values below are hard-coded and not derived from
# ``corpora``; the figure is saved to the current working directory rather
# than ``output_dir`` -- confirm whether this is still placeholder code.
N = 5
menMeans = (20, 35, 30, 35, 27)
womenMeans = (25, 32, 34, 20, 25)
menStd = (2, 3, 4, 1, 2)
womenStd = (3, 5, 2, 3, 3)
ind = np.arange(N)    # the x locations for the groups
width = 0.35          # the width of the bars: can also be len(x) sequence

# Each series is drawn with its own standard deviation as error bars.
p1 = plt.bar(ind, menMeans, width, color='r', yerr=menStd)
p2 = plt.bar(ind, womenMeans, width, color='y',
             bottom=menMeans, yerr=womenStd)

plt.ylabel('Scores')
plt.title('Scores by group and gender')
plt.xticks(ind + width / 2., ('G1', 'G2', 'G3', 'G4', 'G5'))
plt.yticks(np.arange(0, 81, 10))
plt.legend((p1[0], p2[0]), ('Men', 'Women'))

plt.savefig('bar_graph.eps', format='eps')