Compare commits

...

1 Commit

Author: Mikołaj Pokrywka
SHA1: 9804f8b3dc
Message: all done zad 2
Date: 2023-03-21 23:51:37 +01:00
16 changed files with 296 additions and 64 deletions

5
10_long_frq_words Normal file

@ -0,0 +1,5 @@
Israeli-Palestinian
disproportionately
inflation-adjusted
industrialization
Secretary-General

29
10_long_words.py Normal file

@ -0,0 +1,29 @@
import regex as re
from collections import Counter, OrderedDict
from itertools import islice

def get_freq_list_sorted_by_len(c):
    # Sort the counter's tokens by length, longest first.
    return OrderedDict(sorted(c.items(), key=lambda t: len(t[0]), reverse=True))

def get_words(t):
    # A token is a run of Unicode letters, digits, hyphens, and asterisks
    # (hyphen added to match the tokenizer used in the other scripts, which
    # the hyphenated words in the committed output lists imply).
    for m in re.finditer(r'[\p{L}0-9-\*]+', t):
        yield m.group(0)

with open('News-Commentary-v16', 'r') as file:
    content = file.read()

myDict = Counter(get_words(content))
sorted_by_len = get_freq_list_sorted_by_len(myDict)

# Print the 10 longest tokens in the corpus.
for word in islice(sorted_by_len, 10):
    print(word)

README.md

@ -1,73 +1,44 @@
 # Corpus
 News-Commentary v16, the filtered corpus -- News-Commentary-v16.xz
-# Statistics
-## After filtering:
-### Number of lines
-`wc -l`
-632985
-### Size
-83.0 MiB
-## Filtering
-# The opusfilter library was used
-`opusfilter filter_config.yaml`
-1. Duplicate removal (2.40% duplicate lines)
-2. The following filters were applied:
-```
-filters:
-  - LengthFilter:
-      unit: word
-      min_length: 1
-      max_length: 300
-  - LengthRatioFilter:
-      unit: word
-      threshold: 3
-  - LongWordFilter:
-      threshold: 40
-```
-# The script filter.py was used, which:
-1. Removes lines that contain no Unicode letter
-2. Removes lines consisting of a single word that is a link or is alphanumeric
-## Before filtering
-### Number of lines
-`wc -l`
-648886
-### Size
-84.8 MiB
-### 10 random sample sentences from the corpus:
-`lm shuf corpora.eng | head`
-```
-1 The crash is followed by a flight to safety, which is followed by a steep fall in the velocity of money as investors hoard cash.
-2 In this sense, the pandemic represents a unique opportunity to advance European integration like never before.
-3 As depositors flee from a weak bank, they can destroy the bank's liquidity.
-4 But progress is nonetheless being made.
-5 Critics of the growth model argue that it is imperative to redistribute income and wealth as soon as possible.
-6 All told, countries that have pursued greater economic openness have enjoyed improved nutritional, health, and educational outcomes, as well as higher productivity and incomes.
-7 The periods around World War I and World War II are routinely overlooked in discussions that focus on deregulation of capital markets since the 1980s.
-8 The Greek people deserve some real choices in the near future.
-9 LONDON The outbreak of the Zika virus, like Ebola before it, has highlighted the risk that infectious diseases can pose to the health of entire countries and the importance of vaccines to the fight against fast-moving epidemics.
-10 Controls may even require curbing individual freedoms, like accessing hospitals or getting on airplanes.
-```
+# Task 2
+## Examine Zipf's law for units other than words (n-grams, stems, lemmas, etc.)
+Examined for trigrams over words and over characters; the results are in the files:
+`pt-3_n-gram_words-log-log.png`
+`pt-3_n-gram_chars-log-log.png`
+path to the script: `zipf.py`
+## List the words that most strongly violate the law relating word length to frequency
+```
+Israeli-Palestinian
+disproportionately
+inflation-adjusted
+industrialization
+Secretary-General
+```
+path to the script: `long_freq_words.py`
+## Devise and examine 2 relationships concerning words or other units in the text.
+### First proposed relationship
+Hypothesis -- the average word length is greater in longer sentences.
+Conclusions:
+1. The average word length is **not greater** in longer sentences, as the plot `long_word_in_long_sent.png` shows (note: the point colors are picked naively at random)
+2. Short sentences have a higher average word length
+Path to the script: `long_word_in_long_sent.py`
+### Second proposed relationship
+Hypothesis -- the number of words written entirely in upper case is greater in short sentences
+Conclusions:
+1. The number of all-uppercase words is not very noticeably greater in short sentences, as the plot `uppercases_are_in_short_sents.png` shows (note: the point colors are picked naively at random)
+Path to the script: `uppercases_are_in_short_sents.py`
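The removed README section above describes `filter.py` without including it in this diff. Below is a minimal, hypothetical sketch of the two rules it names, assuming one sentence per line; the input and output file names are placeholders, not taken from the commit:

```python
# Hypothetical reconstruction of the filtering rules the README describes;
# filter.py itself is not part of this diff.
import regex as re

def keep(line):
    words = line.split()
    # Rule 1: drop lines without a single Unicode letter.
    if not re.search(r'\p{L}', line):
        return False
    # Rule 2: drop one-word lines that are links or purely alphanumeric.
    if len(words) == 1 and (words[0].startswith('http') or words[0].isalnum()):
        return False
    return True

# File names are placeholders, not taken from the commit.
with open('News-Commentary-v16.raw', 'r') as src, \
        open('News-Commentary-v16.filtered', 'w') as dst:
    for line in src:
        if keep(line.rstrip('\n')):
            dst.write(line)
```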

42
long_freq_words.py Normal file

@ -0,0 +1,42 @@
import regex as re
from collections import Counter, OrderedDict
from itertools import islice

def get_freq_list_sorted_by_len(c):
    # Sort the counter's tokens by length, longest first.
    return OrderedDict(sorted(c.items(), key=lambda t: len(t[0]), reverse=True))

def get_freq_list(c):
    # Sort the counter's tokens by frequency, most frequent first.
    return OrderedDict(sorted(c.items(), key=lambda t: t[1], reverse=True))

def get_words(t):
    # A token is a run of Unicode letters, digits, hyphens, and asterisks.
    for m in re.finditer(r'[\p{L}0-9-\*]+', t):
        yield m.group(0)

with open('News-Commentary-v16', 'r') as file:
    content = file.read()

myDict = Counter(get_words(content))
sorted_by_len = get_freq_list_sorted_by_len(myDict)
sorted_by_freq = get_freq_list(myDict)

# Words that are both among the 5000 longest and the 5000 most frequent
# are the ones that most strongly break the length-frequency law.
top_frq = set(islice(sorted_by_freq, 5000))
for long_word in islice(sorted_by_len, 5000):
    if long_word in top_frq:
        print(long_word)

BIN
long_word_in_long_sent.png Normal file

Binary file not shown. (new image, 79 KiB)

55
long_word_in_long_sent.py Normal file

@ -0,0 +1,55 @@
import matplotlib.pyplot as plt

def get_avg_words_len_in_sent(sent):
    # Average length of the whitespace-separated words in a sentence.
    words = sent.split()
    if not words:  # guard against blank lines
        return 0
    return sum(len(word) for word in words) / len(words)

x = []  # sentence length in words
y = []  # average word length in the sentence
with open('News-Commentary-v16', 'r') as file:
    for line in file:
        if not line.split():
            continue
        x.append(len(line.split()))
        y.append(get_avg_words_len_in_sent(line))

fig, ax = plt.subplots()
# Cycle through a fixed palette; the colors carry no meaning.
color = ['red', 'green', 'blue', 'yellow', 'grey', 'brown', 'pink']
colors = [color[i % len(color)] for i in range(len(x))]
ax.scatter(x, y, c=colors)
plt.xlabel('Sentence length in words', fontsize=18)
plt.ylabel('Average word length', fontsize=16)
# plt.show()
plt.savefig('long_word_in_long_sent')
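The two hypothesis scripts draw their conclusions by eyeballing scatter plots. As a quick numeric cross-check, not part of the commit, one could compute a Pearson correlation over the same `x` and `y` lists built above; the identical check applies to `uppercases_are_in_short_sents.py`:

```python
import numpy as np

# Pearson correlation between sentence length and average word length;
# a value near zero or below supports the README's conclusion.
r = np.corrcoef(x, y)[0, 1]
print(f'correlation: {r:.3f}')
```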

BIN
pt-3_n-gram_chars-log-log.png Normal file

Binary file not shown. (new image, 16 KiB)

BIN
pt-3_n-gram_words-log-log.png Normal file

Binary file not shown. (new image, 15 KiB)

BIN
pt-3_unigram_chars-log-log.png Normal file

Binary file not shown. (new image, 16 KiB)

BIN
pt-3_unigram_words-log-log.png Normal file

Binary file not shown. (new image, 15 KiB)

BIN
pt-chars-log-log.png Normal file

Binary file not shown. (new image, 16 KiB)

BIN
pt-words-log-log.png Normal file

Binary file not shown. (new image, 16 KiB)

10
top_10_words Normal file

@ -0,0 +1,10 @@
diphtheria-tetanus-pertussis-containing
target-inflation-and-float-the-currency
cut-emissions-now-before-it-is-too-late
lift-yourself-up-by-your-own-bootstraps
five-percent-growth-of-kilowatt-hours
supply-shock-cum-derived-demand-shock
Harkat-ul-Jihad-Al-Islami-Bangladesh
campaign-contributions-for-subsidies
mortgage-securitization-derivatives
globalization--migration--increases

BIN
uppercases_are_in_short_sents.png Normal file

Binary file not shown. (new image, 65 KiB)

56
uppercases_are_in_short_sents.py Normal file

@ -0,0 +1,56 @@
import matplotlib.pyplot as plt

x = []  # sentence length in words
y = []  # number of all-uppercase words in the sentence
with open('News-Commentary-v16', 'r') as file:
    for line in file:
        words = line.rstrip().split()
        upper_counter = sum(1 for word in words if word.isupper())
        x.append(len(words))
        y.append(upper_counter)

fig, ax = plt.subplots()
# Cycle through a fixed palette; the colors carry no meaning.
color = ['red', 'green', 'blue', 'yellow', 'grey', 'brown', 'pink']
colors = [color[i % len(color)] for i in range(len(x))]
ax.scatter(x, y, c=colors)
plt.xlabel('Sentence length in words', fontsize=16)
plt.ylabel('Number of words written\nentirely in upper case', fontsize=16)
# plt.show()
plt.savefig('uppercases_are_in_short_sents')

64
zipf.py Normal file

@ -0,0 +1,64 @@
import matplotlib.pyplot as plt
from math import log
from collections import Counter, OrderedDict
import regex as re

def freq_list(g, top=None):
    # Frequency list of the items produced by generator g, most frequent first.
    c = Counter(g)
    if top is None:
        items = c.items()
    else:
        items = c.most_common(top)
    return OrderedDict(sorted(items, key=lambda t: -t[1]))

def log_rang_log_freq(name, g):
    # Zipf plot: log(rank) on the x axis, log(frequency) on the y axis.
    freq = freq_list(g)
    plt.figure().clear()
    plt.plot([log(x) for x in range(1, len(freq.values()) + 1)],
             [log(y) for y in freq.values()])
    fname = f'{name}.png'
    plt.savefig(fname)
    return fname

def get_words(t):
    # A token is a run of Unicode letters, digits, hyphens, and asterisks.
    for m in re.finditer(r'[\p{L}0-9-\*]+', t):
        yield m.group(0)

def get_characters(t):
    yield from t

def ngrams(items, size):
    # Sliding window of n-grams over any iterable.
    ngram = []
    for item in items:
        ngram.append(item)
        if len(ngram) == size:
            yield tuple(ngram)
            ngram = ngram[1:]

with open('News-Commentary-v16', 'r') as file:
    content = file.read()

# Single-unit plots from an earlier run:
# log_rang_log_freq('pt-words-log-log', get_words(content))
# log_rang_log_freq('pt-chars-log-log', get_characters(content))

# Trigram plots (the 'unigram' in these file names is a leftover; they are 3-grams).
log_rang_log_freq('pt-3_unigram_chars-log-log', ngrams(get_characters(content), 3))
log_rang_log_freq('pt-3_unigram_words-log-log', ngrams(get_words(content), 3))
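A possible follow-up, not in the commit: estimate the Zipf exponent by fitting a line to the same log-log data that `log_rang_log_freq` plots, reusing `freq_list`, `ngrams`, `get_words`, and `content` from `zipf.py`. A slope close to -1 indicates classic Zipf behaviour:

```python
import numpy as np

def zipf_slope(g):
    # Fit log(frequency) ~ slope * log(rank) + intercept by least squares.
    freq = freq_list(g)
    ys = np.log(np.fromiter(freq.values(), dtype=float))
    xs = np.log(np.arange(1, len(ys) + 1))
    slope, _intercept = np.polyfit(xs, ys, 1)
    return slope

# Example: print(zipf_slope(ngrams(get_words(content), 3)))
```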