Add run.py and remove train directory
This commit is contained in:
parent
9da4f11da1
commit
f4dc29070d
@ -1,4 +0,0 @@
|
||||
<a name="2.0.0"></a>
|
||||
## 2.0.0 (2020-05-22)
|
||||
|
||||
* Switch to probabilities as the main metric
|
38
README.md
38
README.md
@ -1,38 +0,0 @@
|
||||
|
||||
"He Said She Said" classification challenge (2nd edition)
|
||||
=========================================================
|
||||
|
||||
Give the probability that a text in Polish was written by a man.
|
||||
|
||||
This challenge is based on the "He Said She Said" corpus for Polish.
|
||||
The corpus was created by grepping gender-specific first person
|
||||
expressions (e.g. "zrobiłem/zrobiłam", "jestem zadowolony/zadowolona",
|
||||
"będę robił/robiła") in the Common Crawl corpus. Such expressions were
|
||||
normalised here into masculine forms.
|
||||
|
||||
Classes
|
||||
-------
|
||||
|
||||
* `0` — text written by a woman
|
||||
* `1` — text written by a man
|
||||
|
||||
Directory structure
|
||||
-------------------
|
||||
|
||||
* `README.md` — this file
|
||||
* `config.txt` — configuration file
|
||||
* `train/` — directory with training data
|
||||
* `train/train.tsv.gz` — train set (gzipped), the class is given in the first column,
|
||||
a text fragment in the second one
|
||||
* `train/meta.tsv.gz` — metadata (do not use during training)
|
||||
* `dev-0/` — directory with dev (test) data
|
||||
* `dev-0/in.tsv` — input data for the dev set (text fragments)
|
||||
* `dev-0/expected.tsv` — expected (reference) data for the dev set
|
||||
* `dev-0/meta.tsv` — metadata (not used during testing)
|
||||
* `dev-1/` — directory with extra dev (test) data
|
||||
* `dev-1/in.tsv` — input data for the extra dev set (text fragments)
|
||||
* `dev-1/expected.tsv` — expected (reference) data for the extra dev set
|
||||
* `dev-1/meta.tsv` — metadata (not used during testing)
|
||||
* `test-A/` — directory with test data
|
||||
* `test-A/in.tsv` — input data for the test set (text fragments)
|
||||
* `test-A/expected.tsv` — expected (reference) data for the test set (hidden)
|
@ -1 +0,0 @@
|
||||
--metric Likelihood --metric Accuracy --metric {Likelihood:N<Likelihood>,Accuracy:N<Accuracy>}P<2>{f<in[2]:for-humans>N<+H>,f<in[3]:contaminated>N<+C>,f<in[3]:not-contaminated>N<-C>} --precision 5
|
37
run.py
Normal file
37
run.py
Normal file
@ -0,0 +1,37 @@
|
||||
import random
|
||||
|
||||
|
||||
# Substrings (mostly word stems) treated as hints that a text was written by a woman.
women_word_list = [
    "mąż", "fryzjer", "kosmety", "biżuter", "sukienk", "polk", "kolczy", "rodzin", "obcas",
    "bransolet", "spink", "torebk", "szmink", "kobiet", "koleżan", "kuchni", "gotowa", "przepis",
    "ciast", "ciąż", "miesiączk",
]
# Substrings (mostly word stems) treated as hints that a text was written by a man.
men_word_list = [
    "samoch", "kompute", "pc", "sport", "km", "windows", "paliw", "kierownic", "silnik", "opon", "piw",
    "koleg", "śrub", "mecz", "system", "serwer",
]

# Default file locations. Raw strings keep every backslash literal, so the
# Windows paths no longer rely on unrecognized escape sequences such as "\D" or "\p".
DEFAULT_INPUT_PATH = r"j:\Desktop\ekstrakcjacw5\petite-difference-challenge2\test-A\in.tsv"
DEFAULT_OUTPUT_PATH = r"j:\Desktop\ekstrakcjacw5\petite-difference-challenge2\test-A\out.tsv"


def predict_label(line):
    """Return the predicted class for one line of text.

    Matching is a case-sensitive substring test. The masculine keywords are
    checked first, so a line containing keywords from both lists is labelled 1.

    Args:
        line: Text fragment to classify (a trailing newline is harmless).

    Returns:
        1 if any entry of ``men_word_list`` occurs in ``line``, otherwise 0 if
        any entry of ``women_word_list`` occurs in it, otherwise a random
        choice between 0 and 1.
    """
    if any(word in line for word in men_word_list):
        return 1
    if any(word in line for word in women_word_list):
        return 0
    # No keyword matched: fall back to a coin flip.
    return random.randint(0, 1)


def main(input_path=DEFAULT_INPUT_PATH, output_path=DEFAULT_OUTPUT_PATH):
    """Classify every line of ``input_path`` and write one label per line to ``output_path``.

    Args:
        input_path: UTF-8 text file with one text fragment per line.
        output_path: Destination file; it is overwritten with one ``0``/``1``
            label per input line, each followed by a newline.

    Raises:
        OSError: If either file cannot be opened.
    """
    with open(input_path, "r", encoding="UTF-8") as read_file:
        # Iterate the file directly instead of loading it with readlines().
        data = [f"{predict_label(line)}\n" for line in read_file]

    with open(output_path, "w", encoding="UTF-8") as output_file:
        output_file.writelines(data)


if __name__ == "__main__":
    import sys

    # Optional overrides: run.py [input_path [output_path]]
    main(*sys.argv[1:3])
|
3601424
train/expected.tsv
3601424
train/expected.tsv
File diff suppressed because it is too large
Load Diff
BIN
train/in.tsv.xz
BIN
train/in.tsv.xz
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user