init
This commit is contained in:
commit
cf3569c260
|
@ -0,0 +1,38 @@
|
|||
|
||||
"He Said She Said" classification challenge (2nd edition)
|
||||
=========================================================
|
||||
|
||||
Give the probability that a text in Polish was written by a man.
|
||||
|
||||
This challenge is based on the "He Said She Said" corpus for Polish.
|
||||
The corpus was created by grepping gender-specific first person
|
||||
expressions (e.g. "zrobiłem/zrobiłam", "jestem zadowolony/zadowolona",
|
||||
"będę robił/robiła") in the Common Crawl corpus. Such expressions were
|
||||
normalised here into masculine forms.
|
||||
|
||||
Classes
|
||||
-------
|
||||
|
||||
* `0` — text written by a woman
|
||||
* `1` — text written by a man
|
||||
|
||||
Directory structure
|
||||
-------------------
|
||||
|
||||
* `README.md` — this file
|
||||
* `config.txt` — configuration file
|
||||
* `train/` — directory with training data
|
||||
* `train/train.tsv.gz` — train set (gzipped), the class is given in the first column,
|
||||
a text fragment in the second one
|
||||
* `train/meta.tsv.gz` — metadata (do not use during training)
|
||||
* `dev-0/` — directory with dev (test) data
|
||||
* `dev-0/in.tsv` — input data for the dev set (text fragments)
|
||||
* `dev-0/expected.tsv` — expected (reference) data for the dev set
|
||||
* `dev-0/meta.tsv` — metadata (not used during testing)
|
||||
* `dev-1/` — directory with extra dev (test) data
|
||||
* `dev-1/in.tsv` — input data for the extra dev set (text fragments)
|
||||
* `dev-1/expected.tsv` — expected (reference) data for the extra dev set
|
||||
* `dev-1/meta.tsv` — metadata (not used during testing)
|
||||
* `test-A/` — directory with test data
|
||||
* `test-A/in.tsv` — input data for the test set (text fragments)
|
||||
* `test-A/expected.tsv` — expected (reference) data for the test set (hidden)
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,76 @@
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
import itertools as IT
|
||||
import numpy as np
|
||||
import csv
|
||||
|
||||
class LogisticRegression(torch.nn.Module):
    """Two-class linear classifier that outputs log-probabilities.

    A single linear layer maps the input features to two scores, which are
    normalised with log-softmax. Log-probabilities are the input format that
    ``nn.NLLLoss`` expects.
    """

    def __init__(self, input_size=None):
        """Create the linear layer.

        Args:
            input_size: Number of input features. When omitted, the
                module-level ``WORDS_IN_DICTIONARY`` is used, so the
                no-argument call keeps working.

        Raises:
            NameError: If ``input_size`` is omitted and the module-level
                ``WORDS_IN_DICTIONARY`` has not been defined.
        """
        super(LogisticRegression, self).__init__()
        if input_size is None:
            # Backward-compatible fallback: the caller must have published the
            # vocabulary size as a module-level global before constructing.
            input_size = WORDS_IN_DICTIONARY
        self.linear = torch.nn.Linear(input_size, 2)

    def forward(self, x):
        """Return log-probabilities over the two classes.

        Args:
            x: Tensor whose last dimension holds the input features.

        Returns:
            Tensor with a last dimension of size 2 holding log-probabilities.
        """
        # NLLLoss expects log-probabilities; sigmoid outputs are not
        # normalised across classes and are not in log space.
        return torch.log_softmax(self.linear(x), dim=-1)
|
||||
def make_vector(sentence, dictionary):
    """Build a bag-of-words count vector for one tokenised sentence.

    Args:
        sentence: Iterable of tokens.
        dictionary: Mapping from token to its column index; the vector has
            ``len(dictionary)`` columns.

    Returns:
        A tensor of shape ``(1, len(dictionary))`` holding the number of
        occurrences of each known token. Tokens absent from ``dictionary``
        are ignored instead of raising ``KeyError``.
    """
    # Count in a plain list first: incrementing tensor elements one at a
    # time is far slower than a single tensor construction at the end.
    counts = [0.0] * len(dictionary)
    for word in sentence:
        index = dictionary.get(word)
        if index is not None:
            counts[index] += 1.0

    return torch.tensor(counts).view(1, -1)
|
||||
|
||||
def read_data(path, limit=2000):
    """Read a text file and split each line into whitespace-separated tokens.

    Args:
        path: Path of the file to read.
        limit: Maximum number of leading lines to read; ``None`` reads the
            whole file. Defaults to 2000, the cap that used to be hard-coded.

    Returns:
        A list with one list of tokens per line read (an empty list for a
        blank line).
    """
    # The context manager closes the file even on error. islice stops after
    # `limit` lines instead of loading the whole file and slicing it.
    with open(path, 'r', encoding='utf-8') as source:
        return [line.split() for line in IT.islice(source, limit)]
|
||||
|
||||
def main():
    """Train the bag-of-words classifier and write predictions for test-A.

    Reads tokenised texts from ``train/in.tsv`` and integer labels from the
    first 2000 lines of ``train/expected.tsv``, builds a vocabulary over the
    train and test texts, trains for 100 epochs with SGD (one example per
    step), and writes one predicted class per line to ``test-A/out.tsv``.

    Raises:
        ValueError: If the number of training texts and labels differ, or a
            label line is not an integer.
    """
    # LogisticRegression reads the vocabulary size from this module-level
    # name, so it has to be published as a global rather than a local.
    global WORDS_IN_DICTIONARY

    train_data = read_data("train/in.tsv")
    with open('train/expected.tsv', 'r') as labels_file:
        train_data_output = [int(line) for line in IT.islice(labels_file, 2000)]

    # Fail early with a clear message instead of letting a padded None from
    # zip_longest blow up deep inside the training loop.
    if len(train_data) != len(train_data_output):
        raise ValueError(
            "train/in.tsv and train/expected.tsv differ in length: "
            + str(len(train_data)) + " vs " + str(len(train_data_output))
        )

    # NOTE(review): read_data caps its input by default, so only the leading
    # rows of the test set receive predictions — confirm this is intended.
    test_data = read_data('test-A/in.tsv')

    # The vocabulary covers train and test tokens so every token has a column.
    dictionary = {}
    for sent in train_data + test_data:
        for word in sent:
            if word not in dictionary:
                dictionary[word] = len(dictionary)

    WORDS_IN_DICTIONARY = len(dictionary)

    model = LogisticRegression()

    criterion = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    epochs = 100
    for epoch in range(epochs):
        if epoch % 10 == 0:
            print(str(epoch/epochs * 100) + "%")
        for instance, label in zip(train_data, train_data_output):
            vector = make_vector(instance, dictionary)
            target = torch.LongTensor([label])
            model.zero_grad()
            log_probs = model(vector)
            loss = criterion(log_probs, target)
            loss.backward()
            optimizer.step()

    # Open the output only after training succeeds, so a failed run does not
    # leave a truncated file behind; no gradients are needed for inference.
    with torch.no_grad(), open('test-A/out.tsv', 'w') as output:
        for instance in test_data:
            vec = make_vector(instance, dictionary)
            log_probs = model(vec)
            y_pred = np.argmax(log_probs[0].detach().numpy())
            output.write(str(int(y_pred)) + '\n')
|
||||
|
||||
|
||||
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue