concordia-library/tools/prepare_corpus_figures.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

from os import listdir
import numpy as np
import matplotlib.pyplot as plt


input_dir = 'stats'
output_dir = 'figures'

def get_corpus_data(corpus_stats_file):
    """Parse one corpus statistics file into a dictionary.

    The first line of the file holds four '@#@'-separated fields: corpus
    name, total sentence count, empty sentence count and unique sentence
    count.  Each following line holds an occurrence count and a sentence,
    separated by '@#@'.

    Args:
        corpus_stats_file: path of the statistics file to read.

    Returns:
        A dict with the keys "name" (str), "total", "empty", "unique"
        (ints) and "most_frequent" (list of (count, sentence) tuples in
        file order).

    Raises:
        IndexError: if the file contains no lines at all.
        ValueError: if a line does not have the expected fields or a
            count is not an integer.
    """
    with open(corpus_stats_file) as csf:
        # Strip only the line terminator: unconditionally dropping the last
        # character corrupts a final line that is not newline-terminated.
        file_lines = [line.rstrip('\r\n') for line in csf]

    name, total, empty, unique = file_lines[0].split('@#@')
    most_frequent = []
    for line in file_lines[1:]:
        if not line:
            # Tolerate blank lines (e.g. an empty line at the end of file).
            continue
        # Split on the first separator only, so a sentence that itself
        # contains '@#@' is kept intact.
        count, sentence = line.split('@#@', 1)
        most_frequent.append((int(count), sentence))

    return {"name":name, "total":int(total), "empty":int(empty), "unique":int(unique), "most_frequent":most_frequent}


# Parse every statistics file found in the input directory, in the order
# in which listdir() returns them.
corpora = [
    get_corpus_data(input_dir+'/'+stats_file)
    for stats_file in listdir(input_dir)
]
    

# stats table
# Writes a LaTeX table with one row per corpus: name, total sentence count,
# non-empty sentence count (with percentage of total) and unique sentence
# count (with percentage of total).
with open(output_dir+'/stats_table.tex', 'w') as stats_table:
    stats_table.write('\n'.join((
        r'\begin{table}',
        r'\begin{center}',
        r'\begin{tabular}{|l|l|l|l|}',
        r'\hline',
        r'Corpus name & Total sentences & Non-empty & Unique\\',
        r'\hline\hline',
    ))+'\n')
    for corpus in corpora:
        total = corpus["total"]
        non_empty = total - corpus["empty"]
        if total:
            # The percentage is taken of the non-empty count; multiplying
            # only `total` by 100 before subtracting gives a wrong value.
            non_empty_percentage = 100.0*non_empty/total
            unique_percentage = 100.0*corpus["unique"]/total
        else:
            # A corpus without sentences would otherwise divide by zero.
            non_empty_percentage = 0.0
            unique_percentage = 0.0
        stats_table.write("%s & %d & %d (%.2f%%) & %d (%.2f%%) \\\\\n" % (corpus["name"], total, non_empty, non_empty_percentage, corpus["unique"], unique_percentage))

    stats_table.write('\n'.join((
        r'\hline',
        r'\end{tabular}',
        r'\caption{Corpora repetition statistics}',
        r'\label{tab:repetition_statistics}',
        r'\end{center}',
        r'\end{table}',
    ))+'\n')

# most frequent sentences table
# Writes one LaTeX file per corpus, named <corpus name>_freq.tex, listing
# the (count, sentence) pairs stored under "most_frequent".
# NOTE(review): sentences and corpus names are written verbatim; text
# containing LaTeX special characters (&, %, _, ...) is presumably escaped
# upstream -- confirm.

for corpus in corpora:
    with open(output_dir+'/'+corpus["name"]+'_freq.tex', 'w') as freq_table:
        freq_table.write(r'\begin{table}'+'\n')
        freq_table.write(r'\begin{center}'+'\n')
        freq_table.write(r'\begin{tabular}{|l|l|}'+'\n')
        freq_table.write(r'\hline'+'\n')
        # NOTE(review): 'Occurences' is misspelled in the emitted header;
        # left unchanged here to keep the generated text stable.
        freq_table.write(r'Occurences & Sentence\\'+'\n')
        freq_table.write(r'\hline\hline'+'\n')
        for data in corpus["most_frequent"]:
            # Every tabular row must be terminated with '\\', exactly as
            # the rows of the stats table are; without it the rows run
            # together and the table does not compile.
            freq_table.write("%d & %s \\\\\n" % data)
        freq_table.write(r'\hline'+'\n')
        freq_table.write(r'\end{tabular}'+'\n')
        freq_table.write(r'\caption{Most frequent sentences in the corpus '+corpus["name"]+'}\n')
        freq_table.write(r'\label{tab:freq_'+corpus["name"]+'}\n')
        freq_table.write(r'\end{center}'+'\n')
        freq_table.write(r'\end{table}'+'\n')

# plot
# Draws a stacked bar chart of two hard-coded series and saves it as EPS.
# NOTE(review): the data below is literal and does not use `corpora`;
# presumably a placeholder until real corpus figures are plotted -- confirm.

N = 5
menMeans   = (20, 35, 30, 35, 27)
womenMeans = (25, 32, 34, 20, 25)
menStd     = (2, 3, 4, 1, 2)
womenStd   = (3, 5, 2, 3, 3)
ind = np.arange(N)    # the x locations for the groups
width = 0.35       # the width of the bars: can also be len(x) sequence

# Each series is drawn with its own standard deviations as error bars
# (they were previously swapped between the two series).
# align='edge' is passed explicitly: the tick positions below (ind+width/2)
# are the bar centres only when the bars start at `ind`, and the default
# alignment differs between matplotlib versions.
p1 = plt.bar(ind, menMeans,   width, color='r', yerr=menStd, align='edge')
p2 = plt.bar(ind, womenMeans, width, color='y',
             bottom=menMeans, yerr=womenStd, align='edge')

plt.ylabel('Scores')
plt.title('Scores by group and gender')
plt.xticks(ind+width/2., ('G1', 'G2', 'G3', 'G4', 'G5') )
plt.yticks(np.arange(0,81,10))
plt.legend( (p1[0], p2[0]), ('Men', 'Women') )


# NOTE(review): written to the current working directory, unlike the .tex
# files which go to `output_dir` -- confirm whether that is intended.
plt.savefig('bar_graph.eps', format='eps')
corpus analyzer 2015-10-04 16:24:58 +02:00			`#!/usr/bin/python`
			`# -- coding: utf-8 --`

			`from os import listdir`
			`import numpy as np`
			`import matplotlib.pyplot as plt`


			`input_dir = 'stats'`
			`output_dir = 'figures'`

			`def get_corpus_data(corpus_stats_file):`
			`with open(corpus_stats_file) as csf:`
			`file_lines = csf.readlines()`

			`name, total, empty, unique = file_lines[0][:-1].split('@#@')`
			`most_frequent = []`
			`for line in file_lines[1:]:`
			`count, sentence = line[:-1].split('@#@')`
			`most_frequent.append((int(count), sentence))`

			`return {"name":name, "total":int(total), "empty":int(empty), "unique":int(unique), "most_frequent":most_frequent}`


			`corpora = []`

			`for f in listdir(input_dir):`
			`corpora.append(get_corpus_data(input_dir+'/'+f))`


			`# stats table`
			`with open(output_dir+'/stats_table.tex', 'w') as stats_table:`
			`stats_table.write(r'\begin{table}'+'\n')`
			`stats_table.write(r'\begin{center}'+'\n')`
			`stats_table.write(r'\begin{tabular}{\|l\|l\|l\|l\|}'+'\n')`
			`stats_table.write(r'\hline'+'\n')`
			`stats_table.write(r'Corpus name & Total sentences & Non-empty & Unique\\'+'\n')`
			`stats_table.write(r'\hline\hline'+'\n')`
			`for corpus in corpora:`
			`non_empty_percentage = float(100*corpus["total"] - corpus["empty"])/corpus["total"]`
			`unique_percentage = float(100*corpus["unique"])/corpus["total"]`
			`stats_table.write("%s & %d & %d (%.2f%%) & %d (%.2f%%) \\\\\n" % (corpus["name"], corpus["total"], corpus["total"] - corpus["empty"], non_empty_percentage, corpus["unique"], unique_percentage))`

			`stats_table.write(r'\hline'+'\n')`
			`stats_table.write(r'\end{tabular}'+'\n')`
			`stats_table.write(r'\caption{Corpora repetition statistics}'+'\n')`
			`stats_table.write(r'\label{tab:repetition_statistics}'+'\n')`
			`stats_table.write(r'\end{center}'+'\n')`
			`stats_table.write(r'\end{table}'+'\n')`

			`# most frequent sentences table`

			`for corpus in corpora:`
			`with open(output_dir+'/'+corpus["name"]+'_freq.tex', 'w') as freq_table:`
			`freq_table.write(r'\begin{table}'+'\n')`
			`freq_table.write(r'\begin{center}'+'\n')`
			`freq_table.write(r'\begin{tabular}{\|l\|l\|}'+'\n')`
			`freq_table.write(r'\hline'+'\n')`
			`freq_table.write(r'Occurences & Sentence\\'+'\n')`
			`freq_table.write(r'\hline\hline'+'\n')`
			`for data in corpus["most_frequent"]:`
			`freq_table.write("%d & %s\n" % data)`
			`freq_table.write(r'\hline'+'\n')`
			`freq_table.write(r'\end{tabular}'+'\n')`
			`freq_table.write(r'\caption{Most frequent sentences in the corpus '+corpus["name"]+'}\n')`
			`freq_table.write(r'\label{tab:freq_'+corpus["name"]+'}\n')`
			`freq_table.write(r'\end{center}'+'\n')`
			`freq_table.write(r'\end{table}'+'\n')`

			`# plot`

			`N = 5`
			`menMeans = (20, 35, 30, 35, 27)`
			`womenMeans = (25, 32, 34, 20, 25)`
			`menStd = (2, 3, 4, 1, 2)`
			`womenStd = (3, 5, 2, 3, 3)`
			`ind = np.arange(N) # the x locations for the groups`
			`width = 0.35 # the width of the bars: can also be len(x) sequence`

			`p1 = plt.bar(ind, menMeans, width, color='r', yerr=womenStd)`
			`p2 = plt.bar(ind, womenMeans, width, color='y',`
			`bottom=menMeans, yerr=menStd)`

			`plt.ylabel('Scores')`
			`plt.title('Scores by group and gender')`
			`plt.xticks(ind+width/2., ('G1', 'G2', 'G3', 'G4', 'G5') )`
			`plt.yticks(np.arange(0,81,10))`
			`plt.legend( (p1[0], p2[0]), ('Men', 'Women') )`


			`plt.savefig('bar_graph.eps', format='eps')`