all done task 2
5
10_long_frq_words
Normal file
@@ -0,0 +1,5 @@
Israeli-Palestinian
disproportionately
inflation-adjusted
industrialization
Secretary-General
29
10_long_words.py
Normal file
@@ -0,0 +1,29 @@
import regex as re
from collections import Counter, OrderedDict
from itertools import islice


def get_freq_list_sorted_by_len(c):
    # Sort the counted words by length, longest first.
    return OrderedDict(sorted(c.items(), key=lambda t: len(t[0]), reverse=True))


def get_words(t):
    # Yield tokens made of Unicode letters, digits, and asterisks.
    for m in re.finditer(r'[\p{L}0-9\*]+', t):
        yield m.group(0)


with open('News-Commentary-v16', 'r') as file:
    content = file.read()

word_counts = Counter(get_words(content))
sorted_by_len = get_freq_list_sorted_by_len(word_counts)

# Print the 10 longest words in the corpus.
for word in islice(sorted_by_len, 10):
    print(word)
99
README.md
@@ -1,73 +1,44 @@

# Corpus

News-Commentary v16 filtered corpus -- News-Commentary-v16.xz

# Task 2

# Statistics

## Examine Zipf's law for units other than words (n-grams, stems, lemmas, etc.)

Examined for trigrams over words and over characters; the results are in the files:

`pt-3_n-gram_words-log-log.png`

`pt-3_n-gram_chars-log-log.png`

Script path: `zipf.py`

## After filtering

### Line count

`wc -l`

632985
### Size

83.0 MiB

## Filtering

The `opusfilter` library was used:

`opusfilter filter_config.yaml`

1. Duplicate removal (2.40% duplicate lines)
2. The following filters were applied:
```yaml
filters:
  - LengthFilter:
      unit: word
      min_length: 1
      max_length: 300

  - LengthRatioFilter:
      unit: word
      threshold: 3

  - LongWordFilter:
      threshold: 40
```
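For reference, a full `filter_config.yaml` wrapping these filters could look roughly like the sketch below. The step layout follows opusfilter's `steps`/`parameters` convention, but every file name here is a placeholder, not the one actually used:

```yaml
# Hypothetical filter_config.yaml sketch -- only the filter list above
# is taken from this repository; all file names are placeholders.
steps:
  - type: remove_duplicates
    parameters:
      inputs: [corpus.src, corpus.trg]
      outputs: [dedup.src, dedup.trg]

  - type: filter
    parameters:
      inputs: [dedup.src, dedup.trg]
      outputs: [filtered.src, filtered.trg]
      filters:
        - LengthFilter:
            unit: word
            min_length: 1
            max_length: 300
        - LengthRatioFilter:
            unit: word
            threshold: 3
        - LongWordFilter:
            threshold: 40
```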
A `filter.py` script was also used, which:

1. Removes lines that do not contain a single Unicode letter
2. Removes lines consisting of a single word that is a link or is alphanumeric

(A rough sketch of this logic is shown below.)
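`filter.py` itself is not included in this diff; a minimal sketch of the described behavior, assuming line-by-line filtering on stdin, could look like this (hypothetical, not the actual script):

```python
# Hypothetical sketch of filter.py -- not the actual script from the repo.
import sys
import regex as re

for line in sys.stdin:
    line = line.rstrip('\n')
    # 1. Drop lines without a single Unicode letter.
    if not re.search(r'\p{L}', line):
        continue
    words = line.split()
    # 2. Drop one-word lines that are a link or purely alphanumeric.
    if len(words) == 1:
        w = words[0]
        if w.startswith(('http://', 'https://', 'www.')) or w.isalnum():
            continue
    print(line)
```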
## Before filtering

### Line count

`wc -l`

648886

### Size

84.8 MiB

### 10 random sample sentences from the corpus:

`shuf corpora.eng | head`
```
1 The crash is followed by a flight to safety, which is followed by a steep fall in the velocity of money as investors hoard cash.
2 In this sense, the pandemic represents a unique opportunity to advance European integration like never before.
3 As depositors flee from a weak bank, they can destroy the bank’s liquidity.
4 But progress is nonetheless being made.
5 Critics of the growth model argue that it is imperative to redistribute income and wealth as soon as possible.
6 All told, countries that have pursued greater economic openness have enjoyed improved nutritional, health, and educational outcomes, as well as higher productivity and incomes.
7 The periods around World War I and World War II are routinely overlooked in discussions that focus on deregulation of capital markets since the 1980s.
8 The Greek people deserve some real choices in the near future.
9 LONDON – The outbreak of the Zika virus, like Ebola before it, has highlighted the risk that infectious diseases can pose to the health of entire countries – and the importance of vaccines to the fight against fast-moving epidemics.
10 Controls may even require curbing individual freedoms, like accessing hospitals or getting on airplanes.
```
## Find the words that most strongly violate the law linking length to frequency

```
Israeli-Palestinian
disproportionately
inflation-adjusted
industrialization
Secretary-General
```

Script path: `long_freq_words.py`
## Devise and examine two relationships concerning words or other units in the text

### First proposed relationship

Hypothesis -- the average word length will be greater in longer sentences.

Conclusions:

1. The average word length is **not greater** in longer sentences, as the plot `long_word_in_long_sent.png` shows (note: the colors are picked naively at random)
2. Short sentences have a greater average word length

Script path: `long_word_in_long_sent.py` (a quick way to quantify this is sketched below)
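A hypothetical way to back the plot with a single number, not part of the repo, is to compute the Pearson correlation between sentence length and average word length:

```python
# Hypothetical check -- correlate sentence length with average word length.
import numpy as np

x, y = [], []
with open('News-Commentary-v16', 'r') as file:
    for line in file:
        words = line.split()
        if not words:
            continue  # skip blank lines to avoid division by zero
        x.append(len(words))
        y.append(sum(len(w) for w in words) / len(words))

# A coefficient near zero or below would support conclusions 1 and 2.
print(np.corrcoef(x, y)[0, 1])
```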
### Second proposed relationship

Hypothesis -- the number of words written entirely in upper case will be greater in short sentences.

Conclusions:

1. The number of all-uppercase words is not very noticeably greater in short sentences, as the plot `uppercases_are_in_short_sents.png` shows (note: the colors are picked naively at random)

Script path: `uppercases_are_in_short_sents.py`
42
long_freq_words.py
Normal file
@@ -0,0 +1,42 @@
import regex as re
from collections import Counter, OrderedDict
from itertools import islice


def get_freq_list_sorted_by_len(c):
    # Sort the counted words by length, longest first.
    return OrderedDict(sorted(c.items(), key=lambda t: len(t[0]), reverse=True))


def get_freq_list(c):
    # Sort the counted words by frequency, most frequent first.
    return OrderedDict(sorted(c.items(), key=lambda t: t[1], reverse=True))


def get_words(t):
    # Yield tokens of Unicode letters, digits, hyphens, and asterisks.
    for m in re.finditer(r'[\p{L}0-9-\*]+', t):
        yield m.group(0)


with open('News-Commentary-v16', 'r') as file:
    content = file.read()

word_counts = Counter(get_words(content))
sorted_by_len = get_freq_list_sorted_by_len(word_counts)
sorted_by_freq = get_freq_list(word_counts)

# Words that are both among the 5000 longest and the 5000 most frequent
# violate the length-frequency law the most; a set makes the lookup O(1).
top_frq = set(islice(sorted_by_freq, 5000))

for long_word in islice(sorted_by_len, 5000):
    if long_word in top_frq:
        print(long_word)
BIN
long_word_in_long_sent.png
Normal file
After: 79 KiB
55
long_word_in_long_sent.py
Normal file
@@ -0,0 +1,55 @@
import matplotlib.pyplot as plt


def get_avg_words_len_in_sent(sent):
    # Average word length in a whitespace-tokenized sentence.
    words = sent.split()
    return sum(len(word) for word in words) / len(words)


x = []  # sentence length in words
y = []  # average word length in the sentence

with open('News-Commentary-v16', 'r') as file:
    for line in file:
        if not line.split():
            continue  # skip blank lines to avoid division by zero
        x.append(len(line.split()))
        y.append(get_avg_words_len_in_sent(line))

fig, ax = plt.subplots()

# Colors are picked naively by cycling through a fixed palette.
color = ['red', 'green', 'blue', 'yellow', 'grey', 'brown', 'pink']
colors = [color[i % len(color)] for i in range(len(x))]
ax.scatter(x, y, c=colors)

plt.xlabel('Sentence length in words', fontsize=18)
plt.ylabel('Average word length', fontsize=16)

# plt.show()
plt.savefig('long_word_in_long_sent')
BIN
pt-3_n-gram_chars-log-log.png
Normal file
After: 16 KiB
BIN
pt-3_n-gram_words-log-log.png
Normal file
After: 15 KiB
BIN
pt-3_unigram_chars-log-log.png
Normal file
After: 16 KiB
BIN
pt-3_unigram_words-log-log.png
Normal file
After: 15 KiB
BIN
pt-chars-log-log.png
Normal file
After: 16 KiB
BIN
pt-words-log-log.png
Normal file
After: 16 KiB
10
top_10_words
Normal file
@@ -0,0 +1,10 @@
diphtheria-tetanus-pertussis-containing
target-inflation-and-float-the-currency
cut-emissions-now-before-it-is-too-late
lift-yourself-up-by-your-own-bootstraps
five-percent-growth-of-kilowatt-hours
supply-shock-cum-derived-demand-shock
Harkat-ul-Jihad-Al-Islami-Bangladesh
campaign-contributions-for-subsidies
mortgage-securitization-derivatives
globalization--migration--increases
BIN
uppercases_are_in_short_sents.png
Normal file
After: 65 KiB
56
uppercases_are_in_short_sents.py
Normal file
@@ -0,0 +1,56 @@
import matplotlib.pyplot as plt


x = []  # sentence length in words
y = []  # number of all-uppercase words in the sentence

with open('News-Commentary-v16', 'r') as file:
    for line in file:
        line = line.rstrip()
        upper_counter = 0
        for word in line.split():
            if word.isupper():
                upper_counter += 1
        x.append(len(line.split()))
        y.append(upper_counter)

# Colors are picked naively by cycling through a fixed palette.
color = ['red', 'green', 'blue', 'yellow', 'grey', 'brown', 'pink']
colors = [color[i % len(color)] for i in range(len(x))]

fig, ax = plt.subplots()
ax.scatter(x, y, c=colors)

plt.xlabel('Sentence length in words', fontsize=16)
plt.ylabel('Number of words written\nentirely in upper case', fontsize=16)

# plt.show()
plt.savefig('uppercases_are_in_short_sents')
64
zipf.py
Normal file
@@ -0,0 +1,64 @@
import matplotlib.pyplot as plt
from math import log
from collections import Counter, OrderedDict
import regex as re


def freq_list(g, top=None):
    # Frequency list of the items in an iterable, most frequent first.
    c = Counter(g)
    if top is None:
        items = c.items()
    else:
        items = c.most_common(top)
    return OrderedDict(sorted(items, key=lambda t: -t[1]))


def log_rang_log_freq(name, g):
    # Plot log(rank) against log(frequency) and save it as <name>.png.
    freq = freq_list(g)
    plt.figure().clear()
    plt.plot([log(x) for x in range(1, len(freq.values()) + 1)],
             [log(y) for y in freq.values()])
    fname = f'{name}.png'
    plt.savefig(fname)
    return fname


def get_words(t):
    # Yield tokens of Unicode letters, digits, hyphens, and asterisks.
    for m in re.finditer(r'[\p{L}0-9-\*]+', t):
        yield m.group(0)


def get_characters(t):
    yield from t


def ngrams(seq, size):
    # Slide a window of the given size over an iterable, yielding tuples.
    ngram = []
    for item in seq:
        ngram.append(item)
        if len(ngram) == size:
            yield tuple(ngram)
            ngram = ngram[1:]


with open('News-Commentary-v16', 'r') as file:
    content = file.read()

# Unigram plots:
# log_rang_log_freq('pt-words-log-log', get_words(content))
# log_rang_log_freq('pt-chars-log-log', get_characters(content))

# Zipf's law for character and word trigrams
# (the output file names say "unigram" but the units are trigrams):
log_rang_log_freq('pt-3_unigram_chars-log-log', ngrams(get_characters(content), 3))
log_rang_log_freq('pt-3_unigram_words-log-log', ngrams(get_words(content), 3))