init
This commit is contained in:
commit
6ee287c2bf
38
README.md
Normal file
38
README.md
Normal file
@ -0,0 +1,38 @@
|
||||
|
||||
"He Said She Said" classification challenge (2nd edition)
|
||||
=========================================================
|
||||
|
||||
Give the probability that a text in Polish was written by a man.
|
||||
|
||||
This challenge is based on the "He Said She Said" corpus for Polish.
|
||||
The corpus was created by grepping gender-specific first person
|
||||
expressions (e.g. "zrobiłem/zrobiłam", "jestem zadowolony/zadowolona",
|
||||
"będę robił/robiła") in the Common Crawl corpus. Such expressions were
|
||||
normalised here into masculine forms.
|
||||
|
||||
Classes
|
||||
-------
|
||||
|
||||
* `0` — text written by a woman
|
||||
* `1` — text written by a man
|
||||
|
||||
Directory structure
|
||||
-------------------
|
||||
|
||||
* `README.md` — this file
|
||||
* `config.txt` — configuration file
|
||||
* `train/` — directory with training data
|
||||
* `train/train.tsv.gz` — train set (gzipped), the class is given in the first column,
|
||||
a text fragment in the second one
|
||||
* `train/meta.tsv.gz` — metadata (do not use during training)
|
||||
* `dev-0/` — directory with dev (test) data
|
||||
* `dev-0/in.tsv` — input data for the dev set (text fragments)
|
||||
* `dev-0/expected.tsv` — expected (reference) data for the dev set
|
||||
* `dev-0/meta.tsv` — metadata (not used during testing)
|
||||
* `dev-1/` — directory with extra dev (test) data
|
||||
* `dev-1/in.tsv` — input data for the extra dev set (text fragments)
|
||||
* `dev-1/expected.tsv` — expected (reference) data for the extra dev set
|
||||
* `dev-1/meta.tsv` — metadata (not used during testing)
|
||||
* `test-A/` — directory with test data
|
||||
* `test-A/in.tsv` — input data for the test set (text fragments)
|
||||
* `test-A/expected.tsv` — expected (reference) data for the test set (hidden)
|
137314
dev-0/expected.tsv
Normal file
137314
dev-0/expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
137314
dev-0/in.tsv
Normal file
137314
dev-0/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
137314
dev-0/meta.tsv
Normal file
137314
dev-0/meta.tsv
Normal file
File diff suppressed because it is too large
Load Diff
2000
dev-0/out.tsv
Normal file
2000
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
156606
dev-1/expected.tsv
Normal file
156606
dev-1/expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
156606
dev-1/in.tsv
Normal file
156606
dev-1/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
156606
dev-1/meta.tsv
Normal file
156606
dev-1/meta.tsv
Normal file
File diff suppressed because it is too large
Load Diff
2000
dev-1/out.tsv
Normal file
2000
dev-1/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
76
logistic_regression.py
Normal file
76
logistic_regression.py
Normal file
@ -0,0 +1,76 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
import itertools as IT
|
||||
import numpy as np
|
||||
import csv
|
||||
|
||||
class LogisticRegression(torch.nn.Module):
    """Two-class linear classifier over a bag-of-words vector.

    The model is a single linear layer with two outputs followed by a
    log-softmax, so ``forward`` yields log-probabilities -- the input
    format that ``nn.NLLLoss`` expects.

    Args:
        input_size: Number of input features (vocabulary size).  When
            omitted, the module-level name ``WORDS_IN_DICTIONARY`` is used,
            which keeps the original no-argument constructor working.
    """

    def __init__(self, input_size=None):
        super(LogisticRegression, self).__init__()
        if input_size is None:
            # Backward-compatible path: size the layer from the global that
            # the calling script is expected to define before construction.
            input_size = WORDS_IN_DICTIONARY
        self.linear = torch.nn.Linear(input_size, 2)

    def forward(self, x):
        """Return per-class log-probabilities for ``x``.

        The last dimension of ``x`` must match the layer's input size; the
        result has the same leading dimensions and a last dimension of 2.
        """
        # NLLLoss needs log-probabilities; a plain sigmoid would feed it
        # unnormalised values in (0, 1) and produce a meaningless loss.
        return torch.log_softmax(self.linear(x), dim=-1)
|
||||
def make_vector(sentence, dictionary):
    """Encode a tokenised sentence as a bag-of-words count vector.

    Args:
        sentence: Iterable of tokens.
        dictionary: Mapping from token to its column index; the vector has
            ``len(dictionary)`` columns.

    Returns:
        A float tensor of shape ``(1, len(dictionary))`` in which each
        column holds the number of occurrences of the corresponding token.
        Tokens that are not in ``dictionary`` are ignored instead of
        raising ``KeyError``, so text with unseen words can still be
        encoded.
    """
    vector = torch.zeros(len(dictionary))
    for word in sentence:
        index = dictionary.get(word)
        if index is not None:
            vector[index] += 1

    # Add a leading batch dimension of size 1.
    return vector.view(1, -1)
|
||||
|
||||
def read_data(path, limit=2000):
    """Read a text file and split each line into whitespace-separated tokens.

    Args:
        path: Path of the file to read (decoded as UTF-8).
        limit: Maximum number of leading lines to read; ``None`` reads the
            whole file.  Defaults to 2000, the cap the original code
            hard-coded.

    Returns:
        A list with one entry per line read, each entry being the list of
        tokens produced by ``str.split()`` (an empty list for a blank line).
    """
    # islice stops after `limit` lines, so a large file is never loaded in
    # full, and the context manager guarantees the handle is closed.
    with open(path, 'r', encoding='utf-8') as source:
        return [line.split() for line in IT.islice(source, limit)]
|
||||
|
||||
def main():
    """Train the bag-of-words classifier and write test-set predictions.

    Reads tokenised training texts from ``train/in.tsv`` and their integer
    labels from ``train/expected.tsv``, trains ``LogisticRegression`` with
    SGD for 100 epochs, then writes one predicted class index per line to
    ``test-A/out.tsv`` for the texts in ``test-A/in.tsv``.

    Raises:
        ValueError: If the number of training texts and labels differ.
    """
    # LogisticRegression() sizes its input layer from the module-level name
    # WORDS_IN_DICTIONARY, so it must be assigned as a global here; as a
    # plain local it would be invisible to the class and raise NameError.
    global WORDS_IN_DICTIONARY

    train_data = read_data("train/in.tsv")
    # read_data keeps at most the first 2000 lines, so read the same number
    # of labels to keep texts and labels aligned.
    with open('train/expected.tsv', 'r') as labels_file:
        train_data_output = [int(line) for line in IT.islice(labels_file, 2000)]

    if len(train_data) != len(train_data_output):
        # Fail early with a clear message instead of letting a padded None
        # crash deep inside the training loop.
        raise ValueError(
            "train/in.tsv and train/expected.tsv differ in length: "
            + str(len(train_data)) + " vs " + str(len(train_data_output))
        )

    test_data = read_data('test-A/in.tsv')

    # Assign every distinct token (train and test) a consecutive index.
    dictionary = {}
    for sent in train_data + test_data:
        for word in sent:
            dictionary.setdefault(word, len(dictionary))

    WORDS_IN_DICTIONARY = len(dictionary)

    model = LogisticRegression()

    criterion = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    epochs = 100
    for epoch in range(epochs):
        if epoch % 10 == 0:
            print(str(epoch / epochs * 100) + "%")
        for instance, label in zip(train_data, train_data_output):
            vector = make_vector(instance, dictionary)
            target = torch.LongTensor([label])
            model.zero_grad()
            log_probs = model(vector)
            loss = criterion(log_probs, target)
            loss.backward()
            optimizer.step()

    # Open the output only after training succeeded, so a failed run does
    # not truncate an existing predictions file; `with` closes it reliably.
    with open('test-A/out.tsv', 'w') as output:
        for instance in test_data:
            vec = make_vector(instance, dictionary)
            log_probs = model(vec)
            y_pred = np.argmax(log_probs[0].detach().numpy())
            output.write(str(int(y_pred)) + '\n')


if __name__ == '__main__':
    main()
|
134618
test-A/in.tsv
Normal file
134618
test-A/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
2000
test-A/out.tsv
Normal file
2000
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user