73 lines
2.3 KiB
Python
73 lines
2.3 KiB
Python
|
#!/usr/bin/env python
|
||
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
# TASK REMARK: all input files were stripped of <CR> characters
# due to problems with reading the input files.
|
||
|
|
||
|
import gzip
|
||
|
import re
|
||
|
import string
|
||
|
import ftfy
|
||
|
import datetime
|
||
|
from sklearn.svm import LinearSVC
|
||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||
|
|
||
|
# --- Read and normalize the training set -----------------------------------
# Each line of train/data.tsv is expected to be "<label>\t<text>".
documents = []  # normalized training texts, one per successfully parsed line
labels = []     # upper-cased label for the entry at the same index in `documents`
line_id = 0     # number of training lines parsed successfully
m = 0           # number of parsed lines whose label is 'm'
f = 0           # number of parsed lines whose label is 'f'

# Compiled once instead of on every line. Raw strings keep `\w`, `\s` and `\t`
# as regex escapes rather than (invalid) Python string escapes.
# Drops everything except word characters, spaces and tabs; tabs must survive
# this step because the first tab separates the label from the text.
_train_drop_chars = re.compile(r"[^\w ęóśłżźćń\t]+", flags=re.UNICODE)
_train_whitespace = re.compile(r"\s+")

print(datetime.datetime.now(), "starting")
# Explicit UTF-8 so that reading does not depend on the machine's locale.
with open('train/data.tsv', 'rt', encoding='utf-8') as ins:
    for line_no, line in enumerate(ins, start=1):
        sub = _train_drop_chars.sub("", line.lower()).rstrip()
        try:
            # Only the unpacking can fail (line without a tab / empty text).
            label, text = sub.split('\t', 1)
        except ValueError:
            # Report the real 1-based position of the malformed line.
            print('error on line {:d}'.format(line_no))
            continue

        documents.append(_train_whitespace.sub(" ", text))
        labels.append(label.upper())
        line_id += 1
        if label == 'm':
            m += 1
        elif label == 'f':
            f += 1

print(datetime.datetime.now(), "file read ")
print('m:', m, 'f:', f)
if f:
    print('m/f:', m / f)
else:
    # Avoid ZeroDivisionError when the training set has no 'f' samples.
    print('m/f:', 'undefined (no f samples)')
print('read ', len(documents), ' lines')
|
||
|
|
||
|
# --- Vectorize the training texts and train the classifier -----------------
# `documents` and `labels` are the parallel lists filled while reading the
# training file; progress is logged with a timestamp before every stage.
_now = datetime.datetime.now

print(_now(), "creating vectorizer and fitting documents ")
# fit() returns the estimator itself, so construction and fitting can chain.
vectorizer = TfidfVectorizer().fit(documents)

print(_now(), "transforming documents")
X1 = vectorizer.transform(documents)

print(_now(), "creating LinearSVC")
clf = LinearSVC()

print(_now(), "training model")
# Fitting happens in place; `clf` keeps referring to the trained model.
clf.fit(X1, labels)

print("fitting completed: (documents, labels)")
print(len(documents), ",", len(labels))
|
||
|
|
||
|
# --- Classify every evaluation split and write the predictions -------------
# The normalization here mirrors what is applied to the training texts:
# tabs are kept by the first pattern and then collapsed, together with any
# other whitespace run, into a single space. Deleting tabs outright would
# glue tab-separated words into one token the vectorizer never saw in training.
_predict_drop_chars = re.compile(r"[^\w ęóśłżźćń\t]+", flags=re.UNICODE)
_predict_whitespace = re.compile(r"\s+")

for name in ['dev-0', 'dev-1', 'test-A']:
    print(datetime.datetime.now(), 'now serving:', name)

    # Explicit UTF-8 so that reading does not depend on the machine's locale.
    with open(name + '/data.tsv', 'rt', encoding='utf-8') as source:
        documents = [
            _predict_whitespace.sub(
                " ", _predict_drop_chars.sub("", line.lower()).rstrip()
            )
            for line in source
        ]
    print(datetime.datetime.now(), 'read ', len(documents), ' from', name)

    print(datetime.datetime.now(), 'transforming and classifying')
    # `vectorizer` and `clf` are the fitted objects created earlier in the script.
    x = vectorizer.transform(documents)
    output = clf.predict(x)

    print(datetime.datetime.now(), 'saving output to ' + name + '/out.tsv')
    # One predicted label per line, in the same order as the input lines.
    with open(name + '/out.tsv', 'w', encoding='utf-8') as wr:
        wr.writelines(ans + '\n' for ans in output)

print('finished')
|