Fix preprocessing

This commit is contained in:
kuba 2021-04-28 20:21:12 +02:00
parent 587bc42691
commit 680de8dc60
4 changed files with 20 additions and 35 deletions

View File

@ -1,22 +1,10 @@
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB from sklearn.naive_bayes import MultinomialNB
stopwords = []
# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
with open('stopwords') as f:
stopwords = [line.rstrip() for line in f]
classifier = MultinomialNB() classifier = MultinomialNB()
vectorizer = TfidfVectorizer() vectorizer = TfidfVectorizer()
def preprocess(doc):
doc = doc.lower().split(' ')
doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))
doc = ' '.join(doc)
return doc
def train(): def train():
with open('train/train.tsv') as f: with open('train/train.tsv') as f:
docs = [line.rstrip() for line in f] docs = [line.rstrip() for line in f]
@ -26,7 +14,7 @@ def train():
y_with_doc = doc.split('\t') y_with_doc = doc.split('\t')
y.append(y_with_doc[0]) y.append(y_with_doc[0])
doc = y_with_doc[1] doc = y_with_doc[1]
docs_preprocessed.append(preprocess(doc)) docs_preprocessed.append(doc)
y = [int(value) for value in y] y = [int(value) for value in y]
x = vectorizer.fit_transform(docs_preprocessed) x = vectorizer.fit_transform(docs_preprocessed)
classifier.fit(x, y) classifier.fit(x, y)
@ -35,9 +23,6 @@ def train():
def classify(path): def classify(path):
with open(path + 'in.tsv') as f: with open(path + 'in.tsv') as f:
docs = [line.rstrip() for line in f] docs = [line.rstrip() for line in f]
docs_preprocessed = []
for doc in docs:
docs_preprocessed.append(preprocess(doc))
test_x = vectorizer.transform(docs) test_x = vectorizer.transform(docs)
predictions = classifier.predict(test_x) predictions = classifier.predict(test_x)
with open(path + 'out.tsv', 'w') as file: with open(path + 'out.tsv', 'w') as file:
@ -47,4 +32,4 @@ def classify(path):
train() train()
classify('dev-0/') classify('dev-0/')
classify('test-A/') classify('test-A/')

View File

@ -418,7 +418,7 @@
0 0
1 1
0 0
0 1
0 0
1 1
1 1
@ -1318,7 +1318,7 @@
1 1
1 1
1 1
0 1
0 0
0 0
0 0
@ -1371,7 +1371,7 @@
1 1
1 1
1 1
1 0
1 1
1 1
1 1
@ -3200,7 +3200,7 @@
1 1
1 1
1 1
0 1
1 1
1 1
0 0
@ -3614,7 +3614,7 @@
1 1
1 1
1 1
0 1
0 0
0 0
1 1
@ -4389,7 +4389,7 @@
1 1
1 1
0 0
0 1
1 1
0 0
1 1
@ -4987,7 +4987,7 @@
1 1
1 1
0 0
0 1
1 1
1 1
1 1
@ -5444,7 +5444,7 @@
1 1
1 1
0 0
0 1
1 1
1 1
0 0

1 1
418 0
419 1
420 0
421 0 1
422 0
423 1
424 1
1318 1
1319 1
1320 1
1321 0 1
1322 0
1323 0
1324 0
1371 1
1372 1
1373 1
1374 1 0
1375 1
1376 1
1377 1
3200 1
3201 1
3202 1
3203 0 1
3204 1
3205 1
3206 0
3614 1
3615 1
3616 1
3617 0 1
3618 0
3619 0
3620 1
4389 1
4390 1
4391 0
4392 0 1
4393 1
4394 0
4395 1
4987 1
4988 1
4989 0
4990 0 1
4991 1
4992 1
4993 1
5444 1
5445 1
5446 0
5447 0 1
5448 1
5449 1
5450 0

BIN
geval Executable file

Binary file not shown.

View File

@ -204,7 +204,7 @@
1 1
0 0
0 0
0 1
0 0
1 1
0 0
@ -1719,7 +1719,7 @@
0 0
1 1
1 1
0 1
1 1
0 0
1 1
@ -1807,7 +1807,7 @@
1 1
0 0
1 1
0 1
1 1
1 1
1 1
@ -1946,7 +1946,7 @@
1 1
0 0
0 0
0 1
1 1
1 1
0 0
@ -2080,7 +2080,7 @@
0 0
1 1
1 1
0 1
0 0
0 0
1 1
@ -2210,7 +2210,7 @@
0 0
1 1
1 1
1 0
1 1
1 1
1 1
@ -2737,7 +2737,7 @@
1 1
0 0
0 0
0 1
0 0
1 1
1 1
@ -3109,7 +3109,7 @@
1 1
1 1
1 1
0 1
1 1
1 1
0 0
@ -3365,7 +3365,7 @@
1 1
0 0
1 1
0 1
1 1
1 1
1 1
@ -4065,7 +4065,7 @@
1 1
1 1
1 1
1 0
1 1
0 0
1 1

1 1
204 1
205 0
206 0
207 0 1
208 0
209 1
210 0
1719 0
1720 1
1721 1
1722 0 1
1723 1
1724 0
1725 1
1807 1
1808 0
1809 1
1810 0 1
1811 1
1812 1
1813 1
1946 1
1947 0
1948 0
1949 0 1
1950 1
1951 1
1952 0
2080 0
2081 1
2082 1
2083 0 1
2084 0
2085 0
2086 1
2210 0
2211 1
2212 1
2213 1 0
2214 1
2215 1
2216 1
2737 1
2738 0
2739 0
2740 0 1
2741 0
2742 1
2743 1
3109 1
3110 1
3111 1
3112 0 1
3113 1
3114 1
3115 0
3365 1
3366 0
3367 1
3368 0 1
3369 1
3370 1
3371 1
4065 1
4066 1
4067 1
4068 1 0
4069 1
4070 0
4071 1