data processing, analysis and visualization
This commit is contained in:
parent
99b6e1f59a
commit
3e0ee10fd9
75
analysis/advanced.py
Normal file
75
analysis/advanced.py
Normal file
@ -0,0 +1,75 @@
|
||||
from matplotlib import pyplot as plt
|
||||
from math import log
|
||||
def log_rang_log_freq(vals: list[int], fname: str = "fig.png"):
    """Draw a log-log plot of *vals* against their 1-based positions.

    The x axis is ``log(position)`` for positions ``1..len(vals)`` and the
    y axis is ``log(value)`` for each entry of *vals*, in order.

    Args:
        vals: Values to plot; each one is passed to ``math.log``.
        fname: File name appended to ``'../figures/'`` for the saved image.

    Returns:
        The ``pyplot`` module object used for plotting.
    """
    # Open a new figure and clear it before drawing.
    plt.figure().clear()

    log_positions = list(map(log, range(1, len(vals) + 1)))
    log_values = list(map(log, vals))
    plt.plot(log_positions, log_values)

    # The output path is relative to the current working directory.
    target = '../figures/' + fname
    plt.savefig(target)
    plt.show()
    return plt
|
||||
|
||||
def words_freq_dict(filename: str = "word_freq.txt") -> dict:
    """Find the least and most frequent word for every word length.

    Each usable line of *filename* has the form ``"<count> <word>"`` (one
    single space between the two fields, surrounding whitespace ignored).
    Lines that do not match that shape, or whose count is not an integer,
    are skipped.

    Args:
        filename: Path of the word-frequency file to read.

    Returns:
        A dict keyed by word length. Each value has the shape
        ``{'min': {'word': str, 'count': int},
        'max': {'word': str, 'count': int}}``.
        On ties the first word encountered is kept.
    """
    words = {}
    with open(filename) as f:
        for line in f:
            try:
                occ, word = line.strip().split(' ')
                occ = int(occ)
            except ValueError:
                # Malformed line: skip it instead of falling through with
                # undefined or stale ``occ``/``word`` values.
                continue

            entry = words.get(len(word))
            if entry is None:
                # First word of this length is both the minimum and maximum.
                words[len(word)] = {
                    'min': {'word': word, 'count': occ},
                    'max': {'word': word, 'count': occ},
                }
                continue

            if entry['min']['count'] > occ:
                entry['min']['count'] = occ
                entry['min']['word'] = word

            if entry['max']['count'] < occ:
                entry['max']['count'] = occ
                entry['max']['word'] = word

    return words
|
||||
|
||||
|
||||
def word_len_occ(filename: str = "len_freq.txt") -> tuple[list[int], list[int]]:
    """Read ``"<count> <length>"`` lines into two parallel lists.

    Each non-blank line of *filename* must hold two integers separated by a
    single space: the occurrence count first, then the length. Blank lines
    are ignored.

    Args:
        filename: Path of the length-frequency file to read.

    Returns:
        ``(lengths, counts)`` with the first parsed entry removed from both
        lists.

    Raises:
        ValueError: If a non-blank line does not contain exactly two
            integer fields.
    """
    word_len, word_occ = [], []
    with open(filename) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line carries no data.
                continue
            occ, length = stripped.split(' ')
            word_len.append(int(length))
            word_occ.append(int(occ))
    # NOTE(review): the first entry is deliberately dropped, as in the
    # original code; the reason is not visible in this file - confirm.
    return word_len[1:], word_occ[1:]
|
||||
|
||||
|
||||
def bigram_len_occ(filename: str = "bigram_freq.txt") -> tuple[list[int], list[int]]:
    """Read ``"<count> <word1> <word2>"`` lines into two parallel lists.

    Each non-blank line of *filename* must hold three fields separated by
    single spaces: an integer occurrence count followed by the two words of
    the bigram. Blank lines are ignored.

    Args:
        filename: Path of the bigram-frequency file to read.

    Returns:
        ``(lengths, counts)`` where each length is
        ``len(word1) + len(word2)``, with the first parsed entry removed
        from both lists.

    Raises:
        ValueError: If a non-blank line does not contain exactly three
            fields or its count is not an integer.
    """
    bigram_len, bigram_occ = [], []
    with open(filename) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line carries no data.
                continue
            occ, first, second = stripped.split(' ')
            bigram_len.append(len(first) + len(second))
            bigram_occ.append(int(occ))
    # NOTE(review): the first entry is deliberately dropped, as in the
    # original code; the reason is not visible in this file - confirm.
    return bigram_len[1:], bigram_occ[1:]
|
||||
|
||||
|
||||
|
||||
# Zipf law figure: log-log plot of the counts returned by word_len_occ()
log_rang_log_freq(vals=word_len_occ()[1], fname='zipf-words.png')

# Zipf law figure for bigrams: same plot for the counts from bigram_len_occ()
log_rang_log_freq(vals=bigram_len_occ()[1], fname='zipf-bigrams.png')
|
||||
|
||||
#Most & least frequent words from text
|
||||
def disturbing_words():
    """Print the min/max record of each word length whose two words differ.

    Loads the per-length table from ``words_freq_dict()`` and walks it in
    ascending order of word length. Lengths whose least and most frequent
    word are the same word are not printed.
    """
    by_length = words_freq_dict()
    for length in sorted(by_length):
        extremes = by_length[length]
        if extremes['min']['word'] == extremes['max']['word']:
            continue
        print(f'{length} - {extremes}')
|
Loading…
Reference in New Issue
Block a user