init

2021-02-03 16:32:16 +01:00 · 2021-02-03 16:32:16 +01:00 · cf3569c260
commit cf3569c260
12 changed files with 1022492 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,38 @@
+
+"He Said She Said" classification challenge (2nd edition)
+=========================================================
+
+Give the probability that a text in Polish was written by a man.
+
+This challenge is based on the "He Said She Said" corpus for Polish.
+The corpus was created by grepping gender-specific first person
+expressions (e.g. "zrobiłem/zrobiłam", "jestem zadowolony/zadowolona",
+"będę robił/robiła") in the Common Crawl corpus. Such expressions were
+normalised here into masculine forms.
+
+Classes
+-------
+
+* `0` — text written by a woman
+* `1` — text written by a man
+
+Directory structure
+-------------------
+
+* `README.md` — this file
+* `config.txt` — configuration file
+* `train/` — directory with training data
+* `train/train.tsv.gz` — train set (gzipped), the class is given in the first column,
+  a text fragment in the second one
+* `train/meta.tsv.gz` — metadata (do not use during training)
+* `dev-0/` — directory with dev (test) data
+* `dev-0/in.tsv` — input data for the dev set (text fragments)
+* `dev-0/expected.tsv` — expected (reference) data for the dev set
+* `dev-0/meta.tsv` — metadata (not used during testing)
+* `dev-1/` — directory with extra dev (test) data
+* `dev-1/in.tsv` — input data for the extra dev set (text fragments)
+* `dev-1/expected.tsv` — expected (reference) data for the extra dev set
+* `dev-1/meta.tsv` — metadata (not used during testing)
+* `test-A` — directory with test data
+* `test-A/in.tsv` — input data for the test set (text fragments)
+* `test-A/expected.tsv` — expected (reference) data for the test set (hidden)
--- a/dev-0/expected.tsv
+++ b/dev-0/expected.tsv
--- a/dev-0/in.tsv
+++ b/dev-0/in.tsv
--- a/dev-0/meta.tsv
+++ b/dev-0/meta.tsv
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/dev-1/expected.tsv
+++ b/dev-1/expected.tsv
--- a/dev-1/in.tsv
+++ b/dev-1/in.tsv
--- a/dev-1/meta.tsv
+++ b/dev-1/meta.tsv
--- a/dev-1/out.tsv
+++ b/dev-1/out.tsv
--- a/logistic_regression.py
+++ b/logistic_regression.py
@ -0,0 +1,76 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import itertools as IT
+import numpy as np
+import csv  
+
+class LogisticRegression(torch.nn.Module):
+     def __init__(self):
+        super(LogisticRegression, self).__init__()
+        self.linear = torch.nn.Linear(WORDS_IN_DICTIONARY, 2)
+     def forward(self, x):
+        y_pred = torch.sigmoid(self.linear(x))
+        return y_pred
+def make_vector(sentence, dictionary):
+    vector = torch.zeros(len(dictionary))
+    for word in sentence:
+        vector[dictionary[word]] += 1
+
+    return vector.view(1, -1)
+
+def read_data(path):
+  line = open(path, 'r').readlines()[0:2000]
+  data = []
+  for word in line:
+    data.append(word.split())
+  return data
+
+def main():
+  train_data = read_data("train/in.tsv")
+  temp = open('train/expected.tsv', 'r').readlines()[0:2000]
+  train_data_output = []
+  for sent in temp:
+    train_data_output.append(int(sent))
+
+  test_data = read_data('test-A/in.tsv')
+  output = open('test-A/out.tsv', 'w')
+
+
+  dictionary = {}
+  for sent in train_data + test_data:
+    for word in sent:
+      if word not in dictionary:
+              dictionary[word] = len(dictionary)
+
+  WORDS_IN_DICTIONARY = len(dictionary)
+
+  model = LogisticRegression()
+
+  criterion = nn.NLLLoss()
+  optimizer = optim.SGD(model.parameters(), lr=0.1)
+
+  epochs = 100
+  for epoch in range(epochs):
+    if epoch % 10 == 0:
+      print(str(epoch/epochs * 100) + "%")
+    for instance, label in IT.zip_longest(train_data, train_data_output):
+      vector = make_vector(instance, dictionary)
+      target = torch.LongTensor([label])
+      model.zero_grad()
+      log_probs = model(vector)
+      loss = criterion(log_probs, target)
+      loss.backward()
+      optimizer.step() 
+
+  for instance in test_data:
+          vec = make_vector(instance, dictionary)
+          log_probs = model(vec)
+          y_pred = np.argmax(log_probs[0].detach().numpy())
+          output.write(str(int(y_pred)) + '\n')
+
+  output.close()
+
+
+if __name__ == '__main__':
+    main()
--- a/test-A/in.tsv
+++ b/test-A/in.tsv
--- a/test-A/out.tsv
+++ b/test-A/out.tsv