commit 0c249dab9a7475b5c1a9148e4bddf3b0f3ed0ec8 Author: Mikołaj Pokrywka Date: Tue Mar 14 18:27:43 2023 +0100 all done diff --git a/News-Commentary-v16.xz b/News-Commentary-v16.xz new file mode 100644 index 0000000..f5ed2a8 Binary files /dev/null and b/News-Commentary-v16.xz differ diff --git a/README.md b/README.md new file mode 100644 index 0000000..b1c88f9 --- /dev/null +++ b/README.md @@ -0,0 +1,73 @@ +# Corpus +News-Commentary v16 przefiltrowany korpus -- News-Commentary-v16. + +# Statystyki + +## Po filtrowaniu: +### Ilość linijek + +`wc -l` + +632985 + +### Rozmiar + +83.0 MiB + +## Filtrowanie + +# Użyto biblioteki opusfilter + +`opusfilter filter_config.yaml` + +1. Usuwanie duplikatów (2.40% duplicate lines) +2. Użyto następujących filtrów: +``` + filters: + - LengthFilter: + unit: word + min_length: 1 + max_length: 300 + + - LengthRatioFilter: + unit: word + threshold: 3 + + - LongWordFilter: + threshold: 40 + +``` + +# Użyto skryptu filter.py, który: +1. Usuwa linijki, w których nie ma ani jednej litery Unicode +2. Usuwa linijki składające się z jednego słowa, które jest linkiem lub jest alfanumeryczne + + +## Przed przefiltrowaniem + +### Ilość linijek + +`wc -l` + +648886 + +### Rozmiar + +84.8 MiB + +### 10 przykładowych losowych zdań z korpusu: + +`lm shuf corpora.eng | head` + +``` + 1 The crash is followed by a flight to safety, which is followed by a steep fall in the velocity of money as investors hoard cash. + 2 In this sense, the pandemic represents a unique opportunity to advance European integration like never before. + 3 As depositors flee from a weak bank, they can destroy the bank’s liquidity. + 4 But progress is nonetheless being made. + 5 Critics of the growth model argue that it is imperative to redistribute income and wealth as soon as possible. + 6 All told, countries that have pursued greater economic openness have enjoyed improved nutritional, health, and educational outcomes, as well as higher productivity and incomes. 
"""Filter a monolingual corpus read line by line from standard input.

Each input line is stripped of surrounding whitespace and written to
standard output unless one of the following holds:

* the line contains no Unicode letter at all;
* the line is a single whitespace-delimited word that either contains
  the substring ``http`` or is purely alphanumeric.

Usage::

    python filter.py < corpus.txt > filtered.txt
"""
import sys
from typing import Iterable, Iterator


def has_letter(text: str) -> bool:
    """Return True if *text* contains at least one Unicode letter."""
    # str.isalpha() is true exactly for code points whose general category
    # is Lm, Lt, Lu, Ll or Lo, i.e. the same set the pattern \p{L} matches,
    # so no third-party regex engine (and no escape sequence) is needed.
    return any(ch.isalpha() for ch in text)


def keep_line(line: str) -> bool:
    """Return True if *line* should be kept in the filtered corpus.

    Surrounding whitespace is ignored when making the decision.
    """
    line = line.strip()
    if not has_letter(line):
        return False

    # A line with a letter is non-empty, so it has at least one word.
    if len(line.split()) == 1:
        # Single-token lines: drop bare links and bare alphanumeric tokens.
        if 'http' in line or line.isalnum():
            return False

    return True


def filter_lines(lines: Iterable[str]) -> Iterator[str]:
    """Yield the stripped form of every line in *lines* that passes the filter."""
    for raw in lines:
        stripped = raw.strip()
        if keep_line(stripped):
            yield stripped


def main(stdin=None, stdout=None) -> None:
    """Copy the accepted lines from *stdin* to *stdout*.

    Both streams default to the process's standard streams, so running the
    file as a script reads ``sys.stdin`` and prints to ``sys.stdout``.
    """
    source = sys.stdin if stdin is None else stdin
    sink = sys.stdout if stdout is None else stdout
    for text in filter_lines(source):
        print(text, file=sink)


if __name__ == "__main__":
    main()