first commit

Yevheniia Tsapkova 2020-05-26 11:50:16 +02:00
commit ba0ba3b068
19 changed files with 5046523 additions and 0 deletions

1
.gitignore vendored Executable file

@@ -0,0 +1 @@
*~

4
CHANGELOG.md Executable file

@@ -0,0 +1,4 @@
<a name="2.0.0"></a>
## 2.0.0 (2020-05-22)
* Switch to probabilities as the main metric

38
README.md Executable file

@@ -0,0 +1,38 @@
"He Said She Said" classification challenge (2nd edition)
=========================================================
Give the probability that a text in Polish was written by a man.

This challenge is based on the "He Said She Said" corpus for Polish.
The corpus was created by grepping gender-specific first-person
expressions (e.g. "zrobiłem/zrobiłam", "jestem zadowolony/zadowolona",
"będę robił/robiła") in the Common Crawl corpus. Such expressions were
normalised here into masculine forms.

Classes
-------

* `0` — text written by a woman
* `1` — text written by a man

Directory structure
-------------------

* `README.md` — this file
* `config.txt` — configuration file
* `train/` — directory with training data
* `train/train.tsv.gz` — train set (gzipped); the class is given in the first column, a text fragment in the second one
* `train/meta.tsv.gz` — metadata (do not use during training)
* `dev-0/` — directory with dev (test) data
* `dev-0/in.tsv` — input data for the dev set (text fragments)
* `dev-0/expected.tsv` — expected (reference) data for the dev set
* `dev-0/meta.tsv` — metadata (not used during testing)
* `dev-1/` — directory with extra dev (test) data
* `dev-1/in.tsv` — input data for the extra dev set (text fragments)
* `dev-1/expected.tsv` — expected (reference) data for the extra dev set
* `dev-1/meta.tsv` — metadata (not used during testing)
* `test-A/` — directory with test data
* `test-A/in.tsv` — input data for the test set (text fragments)
* `test-A/expected.tsv` — expected (reference) data for the test set (hidden)
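
Judging from `config.txt` and `train.py` below, a submission is one probability of class `1` per line of `out.tsv`, aligned line-by-line with the corresponding `in.tsv`. As a minimal illustration of that format (not part of the challenge materials), a uniform baseline could be written as:

# Hypothetical uniform baseline: assigns p = 0.5 to every input line.
for name in ['dev-0', 'dev-1', 'test-A']:
    with open(name + '/in.tsv', encoding='utf-8') as src, \
         open(name + '/out.tsv', 'w', encoding='utf-8') as out:
        for _ in src:
            out.write('0.5\n')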

1
config.txt Executable file

@@ -0,0 +1 @@
--metric Likelihood --metric Accuracy --precision 5
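
GEval is asked for two metrics here; `Likelihood` is, to my understanding, the geometric mean of the probability assigned to the true class, with `--precision 5` controlling the number of reported digits. A hedged sketch of that computation (not GEval's reference implementation), assuming `out.tsv` holds one probability of class `1` per line:

# Assumed definition: Likelihood = geometric mean of P(true class).
import math

def likelihood(expected_path, out_path):
    log_sum, n = 0.0, 0
    with open(expected_path) as exp_f, open(out_path) as out_f:
        for gold, pred in zip(exp_f, out_f):
            p = float(pred)                         # predicted P(class = 1)
            p_true = p if gold.strip() == '1' else 1.0 - p
            log_sum += math.log(p_true)
            n += 1
    return math.exp(log_sum / n)

# e.g. round(likelihood('dev-0/expected.tsv', 'dev-0/out.tsv'), 5)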

137314
dev-0/expected.tsv Executable file

File diff suppressed because it is too large

137314
dev-0/in.tsv Executable file

File diff suppressed because it is too large

137314
dev-0/meta.tsv Executable file

File diff suppressed because it is too large

137314
dev-0/out.tsv Executable file

File diff suppressed because it is too large

156606
dev-1/expected.tsv Executable file

File diff suppressed because it is too large

156606
dev-1/in.tsv Executable file

File diff suppressed because it is too large

156606
dev-1/meta.tsv Executable file

File diff suppressed because it is too large

156606
dev-1/out.tsv Executable file

File diff suppressed because it is too large

72
solution.py Executable file

@@ -0,0 +1,72 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# TASK REMARK: ALL FILES WERE STRIPPED OF <CR> CHARS
# DUE TO PROBLEMS WITH READING INPUT FILES
import re
import datetime

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

documents = []
labels = []
line_id = 0
m = 0
f = 0

print(datetime.datetime.now(), "starting")
with open('train/data.tsv', 'rt') as ins:
    for line in ins:
        # keep only word characters, spaces, Polish diacritics and the tab separator
        sub = re.sub(r"[^\w ęóśłżźćń\t]+", "", line.lower(), flags=re.UNICODE).rstrip()
        try:
            label, text = sub.split('\t', 1)
            documents.append(re.sub(r"\s+", " ", text))
            labels.append(label.upper())
            line_id += 1
            if label == 'm':
                m += 1
            if label == 'f':
                f += 1
        except ValueError:
            print('error on line {:d}'.format(line_id))
            continue

print(datetime.datetime.now(), "file read")
print('m:', m, 'f:', f)
print('m/f:', m / f)
print('read', len(documents), 'lines')

print(datetime.datetime.now(), "creating vectorizer and fitting documents")
vectorizer = TfidfVectorizer()
vectorizer.fit(documents)
print(datetime.datetime.now(), "transforming documents")
X1 = vectorizer.transform(documents)

print(datetime.datetime.now(), "creating LinearSVC")
clf = LinearSVC()
print(datetime.datetime.now(), "training model")
clf = clf.fit(X1, labels)
print("fitting completed: (documents, labels)")
print(len(documents), ",", len(labels))

# classify each split and write one label per line
for name in ['dev-0', 'dev-1', 'test-A']:
    print(datetime.datetime.now(), 'now serving:', name)
    documents = []
    with open(name + '/data.tsv', 'rt') as source:
        for line in source:
            documents.append(re.sub(r"[^\w ęóśłżźćń]+", "", line.lower(), flags=re.UNICODE).rstrip())
    print(datetime.datetime.now(), 'read', len(documents), 'from', name)
    print(datetime.datetime.now(), 'transforming and classifying')
    x = vectorizer.transform(documents)
    output = clf.predict(x)
    print(datetime.datetime.now(), 'saving output to ' + name + '/out.tsv')
    with open(name + '/out.tsv', 'w') as wr:
        for ans in output:
            wr.write(ans + '\n')
print('finished')
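
`solution.py` writes hard `M`/`F` labels, while the current edition is scored on probabilities (see `CHANGELOG.md` and `config.txt`). One possible adaptation, sketched here rather than taken from the repository, is to calibrate the SVM with Platt scaling and emit the probability of the masculine class; `X1`, `labels` and the per-split matrix `x` are the variables built in the script above:

# Hypothetical probabilistic variant of solution.py (not the author's code).
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC

calibrated = CalibratedClassifierCV(LinearSVC(), cv=3)   # Platt-style calibration via CV
calibrated.fit(X1, labels)                               # reuse the TF-IDF matrix and labels
male_idx = list(calibrated.classes_).index('M')

# inside the per-split loop, instead of clf.predict(x):
probs = calibrated.predict_proba(x)[:, male_idx]
# then write one "%f"-formatted probability per line to out.tsv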

134618
test-A/in.tsv Executable file

File diff suppressed because it is too large

134618
test-A/out.tsv Executable file

File diff suppressed because it is too large

67
train.py Executable file

@@ -0,0 +1,67 @@
# Notebook-style script: lines starting with "!" are IPython shell commands.
!xzcat train/in.tsv.xz | wc -l

import csv
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

def load_set(path, isTest):
    dataset = pd.read_csv(path + "/in.tsv.xz", delimiter="\t", header=None,
                          names=["text", "date"], quoting=csv.QUOTE_NONE)
    if not isTest:
        expected = pd.read_csv(path + "/expected.tsv", header=None,
                               names=["class"], dtype="category")
        return dataset, expected
    return dataset

train_set, expected_train = load_set("train", False)
dev_set, expected_dev = load_set("dev-0", False)
dev_set_1, expected_dev_1 = load_set("dev-1", False)
test_set = load_set("test-A", True)

# bag of 1- to 3-grams; strip_accents='ascii' also removes Polish diacritics
vectorize = CountVectorizer(stop_words='english', ngram_range=(1, 3), strip_accents='ascii')
vectorized = vectorize.fit_transform(train_set["text"])
X = vectorized
y = expected_train["class"]

bayes = LogisticRegression(max_iter=1000)
bayes.fit(X, y)

def predict_data(data):
    vectorized = vectorize.transform(data["text"])
    predicted = bayes.predict_proba(vectorized)[:, 1]
    # clip extreme probabilities so single confident mistakes are not
    # penalised too harshly by the Likelihood metric
    predicted[predicted < 0.05] = 0.05
    predicted[predicted > 0.95] = 0.95
    return predicted

dev_predicted = predict_data(dev_set)
dev_predicted_1 = predict_data(dev_set_1)
test_predicted = predict_data(test_set)

np.savetxt('test-A/out.tsv', test_predicted, '%f')
np.savetxt('dev-0/out.tsv', dev_predicted, '%f')
np.savetxt('dev-1/out.tsv', dev_predicted_1, '%f')

!wget https://gonito.net/get/bin/geval
!chmod u+x geval
!./geval -t "dev-0"
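
The clipping to [0.05, 0.95] in `predict_data` is the part worth noting: if `Likelihood` behaves like the geometric mean assumed in the sketch after `config.txt`, a single probability of exactly 0 for the true class zeroes out the whole score, while clipping bounds the damage any one example can do. Toy numbers (not from the repository):

# One catastrophic prediction among 100, unclipped vs clipped to 0.05.
import numpy as np

good = np.full(99, 0.8)                  # probability given to the true class
unclipped = np.append(good, 0.0)         # one prediction of exactly 0
clipped = np.append(good, 0.05)          # the same prediction after clipping

def geo_mean(p):
    return float(np.exp(np.mean(np.log(p)))) if p.min() > 0 else 0.0

print(geo_mean(unclipped), geo_mean(clipped))   # 0.0 vs roughly 0.78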

3601424
train/expected.tsv Executable file

File diff suppressed because it is too large

BIN
train/in.tsv.xz Executable file

Binary file not shown.

BIN
train/meta.tsv.gz Executable file

Binary file not shown.