all done zad 2

Mikołaj Pokrywka 2023-03-21 23:51:37 +01:00
parent 3dba635416
commit 9804f8b3dc
16 changed files with 296 additions and 64 deletions

10_long_frq_words Normal file

@@ -0,0 +1,5 @@
Israeli-Palestinian
disproportionately
inflation-adjusted
industrialization
Secretary-General

10_long_words.py Normal file

@@ -0,0 +1,29 @@
import regex as re
from collections import Counter, OrderedDict
from itertools import islice


def get_freq_list_sorted_by_len(c):
    # Sort the frequency dict by word length, longest first.
    return OrderedDict(sorted(c.items(), key=lambda t: len(t[0]), reverse=True))


def get_words(t):
    # A token is a run of Unicode letters, digits, hyphens, and asterisks; the
    # hyphen keeps compounds like those in top_10_words as single tokens.
    for m in re.finditer(r'[\p{L}0-9*-]+', t):
        yield m.group(0)


with open('News-Commentary-v16', 'r') as file:
    content = file.read()

word_counts = Counter(get_words(content))
sorted_by_len = get_freq_list_sorted_by_len(word_counts)

# Print the 10 longest distinct words.
for word in islice(sorted_by_len, 10):
    print(word)

@@ -1,73 +1,44 @@
 # Corpus
 News-Commentary v16, filtered corpus -- News-Commentary-v16.xz
-# Statistics
-## After filtering:
-### Number of lines
-`wc -l`
-632985
-### Size
-83.0 MiB
-## Filtering
-# The opusfilter library was used
-`opusfilter filter_config.yaml`
-1. Duplicate removal (2.40% duplicate lines)
-2. The following filters were used:
-```
-filters:
-  - LengthFilter:
-      unit: word
-      min_length: 1
-      max_length: 300
-  - LengthRatioFilter:
-      unit: word
-      threshold: 3
-  - LongWordFilter:
-      threshold: 40
-```
-# The filter.py script was used, which:
-1. Removes lines that contain no Unicode letters at all
-2. Removes lines consisting of a single word that is a link or alphanumeric
-## Before filtering
-### Number of lines
-`wc -l`
-648886
-### Size
-84.8 MiB
-### 10 random sample sentences from the corpus:
-`shuf corpora.eng | head`
-```
-1 The crash is followed by a flight to safety, which is followed by a steep fall in the velocity of money as investors hoard cash.
-2 In this sense, the pandemic represents a unique opportunity to advance European integration like never before.
-3 As depositors flee from a weak bank, they can destroy the banks liquidity.
-4 But progress is nonetheless being made.
-5 Critics of the growth model argue that it is imperative to redistribute income and wealth as soon as possible.
-6 All told, countries that have pursued greater economic openness have enjoyed improved nutritional, health, and educational outcomes, as well as higher productivity and incomes.
-7 The periods around World War I and World War II are routinely overlooked in discussions that focus on deregulation of capital markets since the 1980s.
-8 The Greek people deserve some real choices in the near future.
-9 LONDON The outbreak of the Zika virus, like Ebola before it, has highlighted the risk that infectious diseases can pose to the health of entire countries and the importance of vaccines to the fight against fast-moving epidemics.
-10 Controls may even require curbing individual freedoms, like accessing hospitals or getting on airplanes.
-```
+# Task 2
+## Examine Zipf's law for units other than words (n-grams, stems, lemmas, etc.)
+Examined for trigrams over words and over characters; the results are in the files:
+`pt-3_n-gram_words-log-log.png`
+`pt-3_n-gram_chars-log-log.png`
+path to the script: `zipf.py`
+## Give the words that most strongly break the law linking length to frequency
+```
+Israeli-Palestinian
+disproportionately
+inflation-adjusted
+industrialization
+Secretary-General
+```
+path to the script: `long_freq_words.py`
+## Devise and examine 2 relationships concerning words or other units in the text.
+### First devised relationship
+Hypothesis -- the average word length will be greater in longer sentences.
+Conclusions:
+1. The average word length is **not greater** in longer sentences; this follows from the plot `long_word_in_long_sent.png` (note: the colors are picked naively at random)
+2. Short sentences have a greater average word length
+Path to the script: `long_word_in_long_sent.py`
+### Second devised relationship
+Hypothesis -- the number of words written entirely in uppercase will be greater in short sentences
+Conclusions:
+1. The number of words written entirely in uppercase is not very noticeably greater in short sentences; this follows from the plot `uppercases_are_in_short_sents.png` (note: the colors are picked naively at random)
+Path to the script: `uppercases_are_in_short_sents.py`
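For reference, a standard statement of Zipf's law (an editorial addition, not part of the original README): the frequency of the unit with rank r follows a power law, so the rank-frequency curve is approximately a straight line in log-log coordinates, which is what the `pt-3_n-gram_words-log-log.png` and `pt-3_n-gram_chars-log-log.png` plots test for trigrams:

```
f(r) = \frac{C}{r^{s}}, \qquad \log f(r) = \log C - s \log r \quad (s \approx 1 \text{ for word unigrams})
```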

long_freq_words.py Normal file

@@ -0,0 +1,42 @@
import regex as re
from collections import Counter, OrderedDict
from itertools import islice


def get_freq_list_sorted_by_len(c):
    # Sort the frequency dict by word length, longest first.
    return OrderedDict(sorted(c.items(), key=lambda t: len(t[0]), reverse=True))


def get_freq_list(c):
    # Sort the frequency dict by frequency, most frequent first.
    return OrderedDict(sorted(c.items(), key=lambda t: t[1], reverse=True))


def get_words(t):
    # A token is a run of Unicode letters, digits, hyphens, and asterisks.
    for m in re.finditer(r'[\p{L}0-9*-]+', t):
        yield m.group(0)


with open('News-Commentary-v16', 'r') as file:
    content = file.read()

word_counts = Counter(get_words(content))
sorted_by_len = get_freq_list_sorted_by_len(word_counts)
sorted_by_freq = get_freq_list(word_counts)

# Words that are among both the 5000 longest and the 5000 most frequent
# break the law linking word length to frequency (long words should be rare).
top_frq = set(islice(sorted_by_freq, 5000))
for long_word in islice(sorted_by_len, 5000):
    if long_word in top_frq:
        print(long_word)
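A toy sanity check of the intersection idea, assuming the functions above are in scope; the words and counts are made up for illustration:

```
from collections import Counter

toy = Counter({'the': 100, 'of': 80, 'Secretary-General': 40,
               'antidisestablishmentarianism': 1})
by_len = list(get_freq_list_sorted_by_len(toy))  # longest first
by_frq = list(get_freq_list(toy))                # most frequent first
# 'Secretary-General' is both long and frequent, so it breaks the law:
print([w for w in by_len[:2] if w in by_frq[:3]])  # ['Secretary-General']
```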

BIN long_word_in_long_sent.png Normal file (binary image, 79 KiB)

long_word_in_long_sent.py Normal file

@@ -0,0 +1,55 @@
import matplotlib.pyplot as plt


def get_avg_words_len_in_sent(sent):
    # Mean word length, in characters, over the whitespace-split sentence.
    words = sent.split()
    return sum(len(word) for word in words) / len(words)


x = []  # sentence length in words
y = []  # average word length in that sentence

with open('News-Commentary-v16', 'r') as file:
    for line in file:
        if not line.split():
            continue  # skip empty lines to avoid division by zero
        x.append(len(line.split()))
        y.append(get_avg_words_len_in_sent(line))

# Colors cycle through a fixed palette, purely to separate the points visually.
palette = ['red', 'green', 'blue', 'yellow', 'grey', 'brown', 'pink']
colors = [palette[i % len(palette)] for i in range(len(x))]

fig, ax = plt.subplots()
ax.scatter(x, y, c=colors)
plt.xlabel('Sentence length in words', fontsize=18)
plt.ylabel('Average word length', fontsize=16)
plt.savefig('long_word_in_long_sent')
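A hedged follow-up, not part of the original script: the README's conclusions are read off the scatter plot, so one way to quantify the relationship would be a Pearson correlation over the same `x` and `y` lists:

```
import numpy as np

# A near-zero or negative coefficient would support the conclusion that
# longer sentences do not have longer words on average.
r = np.corrcoef(x, y)[0, 1]
print(f'Pearson r = {r:.3f}')
```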

BIN pt-chars-log-log.png Normal file (binary image, 16 KiB)

BIN pt-words-log-log.png Normal file (binary image, 16 KiB)

top_10_words Normal file

@@ -0,0 +1,10 @@
diphtheria-tetanus-pertussis-containing
target-inflation-and-float-the-currency
cut-emissions-now-before-it-is-too-late
lift-yourself-up-by-your-own-bootstraps
five-percent-growth-of-kilowatt-hours
supply-shock-cum-derived-demand-shock
Harkat-ul-Jihad-Al-Islami-Bangladesh
campaign-contributions-for-subsidies
mortgage-securitization-derivatives
globalization--migration--increases

uppercases_are_in_short_sents.py Normal file

@@ -0,0 +1,56 @@
import matplotlib.pyplot as plt


x = []  # sentence length in words
y = []  # number of all-uppercase words in that sentence

with open('News-Commentary-v16', 'r') as file:
    for line in file:
        words = line.rstrip().split()
        x.append(len(words))
        y.append(sum(1 for word in words if word.isupper()))

# Colors cycle through a fixed palette, purely to separate the points visually.
palette = ['red', 'green', 'blue', 'yellow', 'grey', 'brown', 'pink']
colors = [palette[i % len(palette)] for i in range(len(x))]

fig, ax = plt.subplots()
ax.scatter(x, y, c=colors)
plt.xlabel('Sentence length in words', fontsize=16)
plt.ylabel('Number of all-uppercase words', fontsize=16)
plt.savefig('uppercases_are_in_short_sents')
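A hedged note, not in the original: the raw count of all-uppercase words can only grow with sentence length, which biases the comparison against the hypothesis; a fairer variant might plot the uppercase rate per word instead, e.g.:

```
# Hypothetical variant: all-uppercase words per word of the sentence, so that
# long sentences are not favored simply by having more words.
rates = [y_i / x_i for x_i, y_i in zip(x, y) if x_i > 0]
```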

zipf.py Normal file

@@ -0,0 +1,64 @@
import matplotlib.pyplot as plt
from math import log
from collections import Counter, OrderedDict
import regex as re


def freq_list(g, top=None):
    # Frequency list of the items yielded by g, most frequent first.
    c = Counter(g)
    items = c.items() if top is None else c.most_common(top)
    return OrderedDict(sorted(items, key=lambda t: -t[1]))


def log_rang_log_freq(name, g):
    # Zipf plot: log rank on the x axis, log frequency on the y axis.
    freq = freq_list(g)
    plt.figure().clear()
    plt.plot([log(x) for x in range(1, len(freq) + 1)],
             [log(y) for y in freq.values()])
    fname = f'{name}.png'
    plt.savefig(fname)
    return fname


def get_words(t):
    # A token is a run of Unicode letters, digits, hyphens, and asterisks.
    for m in re.finditer(r'[\p{L}0-9*-]+', t):
        yield m.group(0)


def get_characters(t):
    yield from t


def ngrams(items, size):
    # Sliding window of `size` consecutive items from the iterable.
    ngram = []
    for item in items:
        ngram.append(item)
        if len(ngram) == size:
            yield tuple(ngram)
            ngram = ngram[1:]


with open('News-Commentary-v16', 'r') as file:
    content = file.read()

# Unigram plots kept from an earlier exploration:
# log_rang_log_freq('pt-words-log-log', get_words(content))
# log_rang_log_freq('pt-chars-log-log', get_characters(content))

# Zipf plots for character and word trigrams; the output names match the
# plots referenced in the README.
log_rang_log_freq('pt-3_n-gram_chars-log-log', ngrams(get_characters(content), 3))
log_rang_log_freq('pt-3_n-gram_words-log-log', ngrams(get_words(content), 3))
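A quick sanity check of the sliding window, assuming the definitions above; the sample string is made up:

```
sample = 'the quick brown fox jumps'
print(list(ngrams(get_words(sample), 3)))
# [('the', 'quick', 'brown'), ('quick', 'brown', 'fox'), ('brown', 'fox', 'jumps')]
```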