first commit
This commit is contained in:
commit
ba0ba3b068
1
.gitignore
vendored
Executable file
1
.gitignore
vendored
Executable file
@ -0,0 +1 @@
|
||||
*~
|
4
CHANGELOG.md
Executable file
4
CHANGELOG.md
Executable file
@ -0,0 +1,4 @@
|
||||
<a name="2.0.0"></a>
|
||||
## 2.0.0 (2020-05-22)
|
||||
|
||||
* Switch to probabilities as the main metric
|
38
README.md
Executable file
38
README.md
Executable file
@ -0,0 +1,38 @@
|
||||
|
||||
"He Said She Said" classification challenge (2nd edition)
|
||||
=========================================================
|
||||
|
||||
Give the probability that a text in Polish was written by a man.
|
||||
|
||||
This challenge is based on the "He Said She Said" corpus for Polish.
|
||||
The corpus was created by grepping gender-specific first person
|
||||
expressions (e.g. "zrobiłem/zrobiłam", "jestem zadowolony/zadowolona",
|
||||
"będę robił/robiła") in the Common Crawl corpus. Such expressions were
|
||||
normalised here into masculine forms.
|
||||
|
||||
Classes
|
||||
-------
|
||||
|
||||
* `0` — text written by a woman
|
||||
* `1` — text written by a man
|
||||
|
||||
Directory structure
|
||||
-------------------
|
||||
|
||||
* `README.md` — this file
|
||||
* `config.txt` — configuration file
|
||||
* `train/` — directory with training data
|
||||
* `train/train.tsv.gz` — train set (gzipped), the class is given in the first column,
|
||||
a text fragment in the second one
|
||||
* `train/meta.tsv.gz` — metadata (do not use during training)
|
||||
* `dev-0/` — directory with dev (test) data
|
||||
* `dev-0/in.tsv` — input data for the dev set (text fragments)
|
||||
* `dev-0/expected.tsv` — expected (reference) data for the dev set
|
||||
* `dev-0/meta.tsv` — metadata (not used during testing)
|
||||
* `dev-1/` — directory with extra dev (test) data
|
||||
* `dev-1/in.tsv` — input data for the extra dev set (text fragments)
|
||||
* `dev-1/expected.tsv` — expected (reference) data for the extra dev set
|
||||
* `dev-1/meta.tsv` — metadata (not used during testing)
|
||||
* `test-A/` — directory with test data
|
||||
* `test-A/in.tsv` — input data for the test set (text fragments)
|
||||
* `test-A/expected.tsv` — expected (reference) data for the test set (hidden)
|
1
config.txt
Executable file
1
config.txt
Executable file
@ -0,0 +1 @@
|
||||
--metric Likelihood --metric Accuracy --precision 5
|
137314
dev-0/expected.tsv
Executable file
137314
dev-0/expected.tsv
Executable file
File diff suppressed because it is too large
Load Diff
137314
dev-0/in.tsv
Executable file
137314
dev-0/in.tsv
Executable file
File diff suppressed because it is too large
Load Diff
137314
dev-0/meta.tsv
Executable file
137314
dev-0/meta.tsv
Executable file
File diff suppressed because it is too large
Load Diff
137314
dev-0/out.tsv
Executable file
137314
dev-0/out.tsv
Executable file
File diff suppressed because it is too large
Load Diff
156606
dev-1/expected.tsv
Executable file
156606
dev-1/expected.tsv
Executable file
File diff suppressed because it is too large
Load Diff
156606
dev-1/in.tsv
Executable file
156606
dev-1/in.tsv
Executable file
File diff suppressed because it is too large
Load Diff
156606
dev-1/meta.tsv
Executable file
156606
dev-1/meta.tsv
Executable file
File diff suppressed because it is too large
Load Diff
156606
dev-1/out.tsv
Executable file
156606
dev-1/out.tsv
Executable file
File diff suppressed because it is too large
Load Diff
72
solution.py
Executable file
72
solution.py
Executable file
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
#TASK REMARK: <CR> CHARS WERE STRIPPED FROM ALL FILES
|
||||
#DUE TO PROBLEMS WITH READING INPUT FILES
|
||||
|
||||
import gzip
|
||||
import re
|
||||
import string
|
||||
import ftfy
|
||||
import datetime
|
||||
from sklearn.svm import LinearSVC
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
|
||||
# Characters that survive cleaning: word characters, spaces and Polish letters.
# Training lines additionally keep TAB, which separates the label from the text.
TRAIN_JUNK_RE = re.compile(r"[^\w ęóśłżźćń\t]+", flags=re.UNICODE)
TEXT_JUNK_RE = re.compile(r"[^\w ęóśłżźćń]+", flags=re.UNICODE)
WHITESPACE_RE = re.compile(r"\s+")


def clean_text(line):
    """Return *line* lower-cased, with unwanted characters and trailing whitespace removed."""
    return TEXT_JUNK_RE.sub("", line.lower()).rstrip()


def parse_training_line(line):
    """Split one raw training line into ``(label, text)``.

    The line is lower-cased and cleaned first; the label is whatever precedes
    the first TAB and is returned upper-cased, the text has every whitespace
    run collapsed to a single space.

    Returns ``None`` when the cleaned line contains no TAB (no label/text
    separator), so the caller can report and skip it.
    """
    cleaned = TRAIN_JUNK_RE.sub("", line.lower()).rstrip()
    label, separator, text = cleaned.partition("\t")
    if not separator:
        return None
    return label.upper(), WHITESPACE_RE.sub(" ", text)


def class_ratio(m_count, f_count):
    """Return ``m_count / f_count``, or ``None`` when there are no ``f`` samples."""
    if not f_count:
        return None
    return m_count / f_count


def read_training_data(path):
    """Read the labelled training file at *path*.

    Returns a ``(documents, labels)`` pair of equally long lists.  Malformed
    lines are reported with their real 1-based line number and skipped.
    """
    documents = []
    labels = []
    # Explicit UTF-8: the data contains Polish letters and must not depend on
    # the locale's default encoding.
    with open(path, 'rt', encoding='utf-8') as ins:
        for line_no, line in enumerate(ins, start=1):
            parsed = parse_training_line(line)
            if parsed is None:
                print('error on line {:d}'.format(line_no))
                continue
            label, text = parsed
            labels.append(label)
            documents.append(text)
    return documents, labels


def main():
    """Train a TF-IDF + LinearSVC model and write predictions for every split."""
    print(datetime.datetime.now(), "starting")
    # NOTE(review): the script reads '<dir>/data.tsv', while the repository
    # listing shows '<dir>/in.tsv' files — confirm the expected file names.
    documents, labels = read_training_data('train/data.tsv')

    print(datetime.datetime.now(), "file read ")
    m = labels.count('M')
    f = labels.count('F')
    print('m:', m, 'f:', f)
    ratio = class_ratio(m, f)
    # Guarded: a training file without any 'f' label used to crash here.
    print('m/f:', ratio if ratio is not None else 'undefined (no f samples)')
    print('read ', len(documents), ' lines')

    print(datetime.datetime.now(), "creating vectorizer and fitting documents ")
    vectorizer = TfidfVectorizer()
    vectorizer.fit(documents)

    print(datetime.datetime.now(), "transforming documents")
    X1 = vectorizer.transform(documents)

    print(datetime.datetime.now(), "creating LinearSVC")
    clf = LinearSVC()
    print(datetime.datetime.now(), "training model")
    clf = clf.fit(X1, labels)
    print("fitting completed: (documents, labels)")
    print(len(documents), ",", len(labels))

    for name in ['dev-0', 'dev-1', 'test-A']:
        print(datetime.datetime.now(), 'now serving:', name)
        with open(name + '/data.tsv', 'rt', encoding='utf-8') as source:
            documents = [clean_text(line) for line in source]
        print(datetime.datetime.now(), 'read ', len(documents), ' from', name)
        print(datetime.datetime.now(), 'transforming and classifying')
        x = vectorizer.transform(documents)
        output = clf.predict(x)

        print(datetime.datetime.now(), 'saving output to ' + name + '/out.tsv')
        with open(name + '/out.tsv', 'w', encoding='utf-8') as wr:
            wr.writelines(ans + '\n' for ans in output)
    print('finished')


if __name__ == "__main__":
    main()
|
134618
test-A/in.tsv
Executable file
134618
test-A/in.tsv
Executable file
File diff suppressed because it is too large
Load Diff
134618
test-A/out.tsv
Executable file
134618
test-A/out.tsv
Executable file
File diff suppressed because it is too large
Load Diff
67
train.py
Executable file
67
train.py
Executable file
@ -0,0 +1,67 @@
|
||||
# Notebook shell magic, kept as a comment because `!cmd` is not valid Python
# in a plain .py file. Run it from a shell to count the training lines:
#   xzcat train/in.tsv.xz | wc -l
|
||||
|
||||
import gzip
|
||||
import re
|
||||
import string
|
||||
import ftfy
|
||||
import datetime
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from scipy.sparse import hstack
|
||||
import csv
|
||||
import datetime
|
||||
|
||||
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.linear_model import SGDClassifier, LogisticRegression
|
||||
from sklearn.svm import LinearSVC
|
||||
from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB,GaussianNB
|
||||
from sklearn.neural_network import MLPClassifier
|
||||
|
||||
def load_set(path, isTest):
    """Read one data split stored under the directory *path*.

    The inputs come from ``<path>/in.tsv.xz`` (tab-separated, no header row,
    quote characters treated literally) and are returned as a frame with the
    columns ``text`` and ``date``.  Unless *isTest* is truthy, the labels from
    ``<path>/expected.tsv`` are also read, as a one-column categorical frame
    named ``class``.

    Returns the input frame alone when *isTest* is truthy, otherwise the pair
    ``(inputs, expected)``.
    """
    inputs_file = path + "/in.tsv.xz"
    features = pd.read_csv(
        inputs_file,
        delimiter="\t",
        header=None,
        names=["text", "date"],
        quoting=csv.QUOTE_NONE,
    )
    # Test splits carry no reference labels, so there is nothing more to load.
    if isTest:
        return features

    labels_file = path + "/expected.tsv"
    targets = pd.read_csv(
        labels_file,
        header=None,
        names=["class"],
        dtype="category",
    )
    return features, targets
|
||||
|
||||
# Load every split; the labelled splits also return their expected classes,
# the test split ("test-A") returns the inputs only.
train_set, expected_train = load_set("train", False)
dev_set, expected_dev = load_set("dev-0", False)
dev_set_1, expected_dev_1 = load_set("dev-1", False)
test_set = load_set("test-A", True)

# Count features over 1- to 3-grams, fitted on the training texts.
# NOTE(review): stop_words='english' and strip_accents='ascii' are applied to
# what the README describes as Polish text — confirm this is intended.
vectorize = CountVectorizer(stop_words='english',ngram_range=(1,3),strip_accents='ascii')
vectorized = vectorize.fit_transform(train_set["text"])

# Feature matrix and target column used for fitting.
X = vectorized
y = expected_train["class"]

# Despite its name, `bayes` holds a logistic-regression classifier.
bayes = LogisticRegression(max_iter=1000)
bayes.fit(X,y)
|
||||
|
||||
def predict_data(data):
    """Return clipped class probabilities for the texts in *data*.

    *data* must provide a ``"text"`` column; it is vectorised with the
    module-level ``vectorize`` and scored with the module-level ``bayes``
    model.  The second column of ``predict_proba`` is returned
    (presumably the probability of class ``1`` — verify against
    ``bayes.classes_``), limited to the range ``[0.05, 0.95]``.
    """
    # The original called an undefined `prepare_data` helper and never used
    # its result; that call raised NameError and has been dropped.
    features = vectorize.transform(data["text"])
    probabilities = bayes.predict_proba(features)[:, 1]
    # Keep predictions away from 0 and 1 (presumably to bound the penalty of
    # confident mistakes under the Likelihood metric — confirm).
    return np.clip(probabilities, 0.05, 0.95)
|
||||
|
||||
# Score every split with the fitted model.
dev_predicted = predict_data(dev_set)
# Fixed: the extra dev split is bound to `dev_set_1` (was the undefined `dev_set1`).
dev_predicted_1 = predict_data(dev_set_1)
test_predicted = predict_data(test_set)

# The predictions are already numeric arrays; the former `item.strip()` pass
# treated them as strings and raised AttributeError, so it has been removed.
np.savetxt('test-A/out.tsv', test_predicted, '%f')
np.savetxt('dev-0/out.tsv', dev_predicted, '%f')
np.savetxt('dev-1/out.tsv', dev_predicted_1, '%f')

# Evaluation steps, kept as comments because notebook `!cmd` magics are not
# valid Python in a plain .py file. Run them from a shell:
#   wget https://gonito.net/get/bin/geval
#   chmod u+x geval
#   ./geval -t "dev-0"
|
3601424
train/expected.tsv
Executable file
3601424
train/expected.tsv
Executable file
File diff suppressed because it is too large
Load Diff
BIN
train/in.tsv.xz
Executable file
BIN
train/in.tsv.xz
Executable file
Binary file not shown.
BIN
train/meta.tsv.gz
Executable file
BIN
train/meta.tsv.gz
Executable file
Binary file not shown.
Loading…
Reference in New Issue
Block a user