478873
This commit is contained in:
parent
d717dec7f5
commit
8c0542a4b4
.gitignoreCHANGELOG.mdREADME.mdconfig.txt
dev-0
dev-1
run.pytest-A
train
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
*~
|
4
CHANGELOG.md
Normal file
4
CHANGELOG.md
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
<a name="2.0.0"></a>
|
||||||
|
## 2.0.0 (2020-05-22)
|
||||||
|
|
||||||
|
* Switch to probabilities as the main metric
|
38
README.md
Normal file
38
README.md
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
|
||||||
|
"He Said She Said" classification challenge (2nd edition)
|
||||||
|
=========================================================
|
||||||
|
|
||||||
|
Give the probability that a text in Polish was written by a man.
|
||||||
|
|
||||||
|
This challenge is based on the "He Said She Said" corpus for Polish.
|
||||||
|
The corpus was created by grepping gender-specific first person
|
||||||
|
expressions (e.g. "zrobiłem/zrobiłam", "jestem zadowolony/zadowolona",
|
||||||
|
"będę robił/robiła") in the Common Crawl corpus. Such expressions were
|
||||||
|
normalised here into masculine forms.
|
||||||
|
|
||||||
|
Classes
|
||||||
|
-------
|
||||||
|
|
||||||
|
* `0` — text written by a woman
|
||||||
|
* `1` — text written by a man
|
||||||
|
|
||||||
|
Directory structure
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
* `README.md` — this file
|
||||||
|
* `config.txt` — configuration file
|
||||||
|
* `train/` — directory with training data
|
||||||
|
* `train/train.tsv.gz` — train set (gzipped), the class is given in the first column,
|
||||||
|
a text fragment in the second one
|
||||||
|
* `train/meta.tsv.gz` — metadata (do not use during training)
|
||||||
|
* `dev-0/` — directory with dev (test) data
|
||||||
|
* `dev-0/in.tsv` — input data for the dev set (text fragments)
|
||||||
|
* `dev-0/expected.tsv` — expected (reference) data for the dev set
|
||||||
|
* `dev-0/meta.tsv` — metadata (not used during testing)
|
||||||
|
* `dev-1/` — directory with extra dev (test) data
|
||||||
|
* `dev-1/in.tsv` — input data for the extra dev set (text fragments)
|
||||||
|
* `dev-1/expected.tsv` — expected (reference) data for the extra dev set
|
||||||
|
* `dev-1/meta.tsv` — metadata (not used during testing)
|
||||||
|
* `test-A/` — directory with test data
|
||||||
|
* `test-A/in.tsv` — input data for the test set (text fragments)
|
||||||
|
* `test-A/expected.tsv` — expected (reference) data for the test set (hidden)
|
1
config.txt
Normal file
1
config.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
--metric Likelihood --metric Accuracy --metric {Likelihood:N<Likelihood>,Accuracy:N<Accuracy>}P<2>{f<in[2]:for-humans>N<+H>,f<in[3]:contaminated>N<+C>,f<in[3]:not-contaminated>N<-C>} --precision 5
|
137314
dev-0/.ipynb_checkpoints/expected-checkpoint.tsv
Normal file
137314
dev-0/.ipynb_checkpoints/expected-checkpoint.tsv
Normal file
File diff suppressed because it is too large
Load Diff
137314
dev-0/.ipynb_checkpoints/in-checkpoint.tsv
Normal file
137314
dev-0/.ipynb_checkpoints/in-checkpoint.tsv
Normal file
File diff suppressed because it is too large
Load Diff
137314
dev-0/.ipynb_checkpoints/out-checkpoint.tsv
Normal file
137314
dev-0/.ipynb_checkpoints/out-checkpoint.tsv
Normal file
File diff suppressed because it is too large
Load Diff
137314
dev-0/expected.tsv
Normal file
137314
dev-0/expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
137314
dev-0/in.tsv
Normal file
137314
dev-0/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
137314
dev-0/meta.tsv
Normal file
137314
dev-0/meta.tsv
Normal file
File diff suppressed because it is too large
Load Diff
137314
dev-0/out.tsv
Normal file
137314
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
156606
dev-1/.ipynb_checkpoints/expected-checkpoint.tsv
Normal file
156606
dev-1/.ipynb_checkpoints/expected-checkpoint.tsv
Normal file
File diff suppressed because it is too large
Load Diff
156606
dev-1/.ipynb_checkpoints/in-checkpoint.tsv
Normal file
156606
dev-1/.ipynb_checkpoints/in-checkpoint.tsv
Normal file
File diff suppressed because it is too large
Load Diff
156606
dev-1/.ipynb_checkpoints/meta-checkpoint.tsv
Normal file
156606
dev-1/.ipynb_checkpoints/meta-checkpoint.tsv
Normal file
File diff suppressed because it is too large
Load Diff
156606
dev-1/.ipynb_checkpoints/out-checkpoint.tsv
Normal file
156606
dev-1/.ipynb_checkpoints/out-checkpoint.tsv
Normal file
File diff suppressed because it is too large
Load Diff
156606
dev-1/expected.tsv
Normal file
156606
dev-1/expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
156606
dev-1/in.tsv
Normal file
156606
dev-1/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
156606
dev-1/meta.tsv
Normal file
156606
dev-1/meta.tsv
Normal file
File diff suppressed because it is too large
Load Diff
156606
dev-1/out.tsv
Normal file
156606
dev-1/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
52
run.py
Normal file
52
run.py
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
from sklearn.metrics import recall_score
|
||||||
|
from sklearn.metrics import precision_score
|
||||||
|
from sklearn.metrics import accuracy_score
|
||||||
|
from sklearn.metrics import f1_score
|
||||||
|
|
||||||
|
# Keyword stems treated as evidence that a text was written by a man.
# `prediction` compares these against the first six characters of each token,
# so every entry is at most six characters long (e.g. 'window' for "windows").
# Each entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python and the merged keyword never matches anything.
male = [
    'window', 'gb', 'mb', 'meczu', 'pc', 'opony', 'apple', 'iphone', 'zwiast', 'hd', 'ubunt',
    'system', 'serwer', 'youtub', 'sfd', 'kfd', 'elektr', 'autoce', 'dobrep', 'merced', 'bmw',
    'audi', 'porsch', 'gry', 'gra', 'gram', 'cs', 'counte', 'piłka', 'mecz', 'gol', 'bramka', 'linux',
    'robota', 'felga', 'lagi', 'żona', 'żony', 'żonie', 'żoną', 'zona', 'zony', 'zonie', 'komput', 'inform',
    'ziom', 'ziomkó', 'ziomko', 'kumpel', 'kolega', 'kolegą', 'kolegi', 'pad',
]
|
||||||
|
|
||||||
|
|
||||||
|
# Keyword stems treated as evidence that a text was written by a woman.
# `prediction` compares these against the first six characters of each token,
# so every entry is at most six characters long (e.g. 'chłopa' for "chłopak").
female = [
    'ciąży', 'miesią', 'ciasto', 'ciążę', 'zadowo', 'ciąża', 'ciazy', 'antyko', 'gineko',
    'tablet', 'porodz', 'mąż', 'krwawi', 'ciasta', 'sukien', 'podpas', 'szmink',
    'maz', 'męża', 'męza', 'chłopa', 'szpilk',
]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def prediction(male,female, in_file):
|
||||||
|
results = []
|
||||||
|
with open(in_file, encoding='utf-8',) as file:
|
||||||
|
for line in file.readlines():
|
||||||
|
text = line.split("\t")[0].strip()
|
||||||
|
text = text.replace(",","").replace(".","").replace("/","").replace("–","").replace(":","").lower()
|
||||||
|
stem_words = [word[:6] for word in text.split()]
|
||||||
|
|
||||||
|
man_score = len([w for w in stem_words if w in male])
|
||||||
|
girl_score = len([w for w in stem_words if w in female])
|
||||||
|
if man_score > girl_score:
|
||||||
|
results.append('1')
|
||||||
|
else:
|
||||||
|
results.append('0')
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def out_file(result, out_file):
    """Write every item of ``result`` to ``out_file``, one per line.

    Args:
        result: iterable of strings (the predicted labels).
        out_file: path of the file to create or overwrite.  The parameter
            shadows the function name inside this body; it is kept for
            backward compatibility with existing callers.
    """
    # Explicit UTF-8 keeps the output independent of the platform's default
    # encoding and consistent with how the input files are read.
    with open(out_file, 'w', encoding='utf-8') as file:
        file.writelines(r + "\n" for r in result)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Produce an out.tsv next to the in.tsv of every evaluation split, in the
# order dev-0, dev-1, test-A.
for in_path, out_path in (
    ('dev-0/in.tsv', 'dev-0/out.tsv'),
    ('dev-1/in.tsv', 'dev-1/out.tsv'),
    ('test-A/in.tsv', 'test-A/out.tsv'),
):
    result = prediction(male, female, in_path)
    out_file(result, out_path)
|
134618
test-A/.ipynb_checkpoints/out-checkpoint.tsv
Normal file
134618
test-A/.ipynb_checkpoints/out-checkpoint.tsv
Normal file
File diff suppressed because it is too large
Load Diff
134618
test-A/in.tsv
Normal file
134618
test-A/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
134618
test-A/out.tsv
Normal file
134618
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
3601424
train/.ipynb_checkpoints/expected-checkpoint.tsv
Normal file
3601424
train/.ipynb_checkpoints/expected-checkpoint.tsv
Normal file
File diff suppressed because it is too large
Load Diff
3601424
train/expected.tsv
Normal file
3601424
train/expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
train/in.tsv.xz
Normal file
BIN
train/in.tsv.xz
Normal file
Binary file not shown.
BIN
train/meta.tsv.gz
Normal file
BIN
train/meta.tsv.gz
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user