petite-difference-challenge2/solution.py

73 lines
2.3 KiB
Python
Raw Normal View History

2020-05-26 11:50:16 +02:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# TASK REMARK: all input files were stripped of <CR> characters
# because of problems with reading the input files.
import gzip
import re
import string
import ftfy
import datetime
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
# --- Load and clean the training set ---------------------------------------
# Every line of train/data.tsv is split on its first tab into a label and a
# text. `documents` collects the cleaned texts and `labels` the upper-cased
# labels (index-aligned); `m` and `f` count the accepted lines whose
# lower-cased label is exactly 'm' or 'f'.
documents = []
labels = []
line_id = 0  # 1-based number of the input line currently being processed
m = 0
f = 0

# Loop-invariant patterns, compiled once. Raw strings keep the patterns
# identical while avoiding the invalid-escape warning that "\w" / "\s"
# trigger inside a plain string literal.
train_junk_re = re.compile(r"[^\w ęóśłżźćń\t]+", flags=re.UNICODE)
whitespace_re = re.compile(r"\s+")

print(datetime.datetime.now(), "starting")
# Explicit UTF-8: the cleaning pattern targets Polish letters, so decoding
# must not depend on the platform's locale encoding.
with open('train/data.tsv', 'rt', encoding='utf-8') as ins:
    for line_id, line in enumerate(ins, start=1):
        # Lower-case, drop everything except word characters, spaces and
        # tabs (this also removes the newline), then trim trailing blanks.
        sub = train_junk_re.sub("", line.lower()).rstrip()
        try:
            label, text = sub.split('\t', 1)
        except ValueError:
            # No tab left after cleaning (missing label or empty text):
            # report the real input line number and skip the line.
            print('error on line {:d}'.format(line_id))
            continue
        documents.append(whitespace_re.sub(" ", text))
        labels.append(label.upper())
        if label == 'm':
            m += 1
        elif label == 'f':
            f += 1

print(datetime.datetime.now(), "file read ")
print('m:', m, 'f:', f)
# Guard the ratio so a training file without 'f' samples does not abort the
# whole run with ZeroDivisionError.
print('m/f:', m / f if f else 'undefined (f == 0)')
print('read ', len(documents), ' lines')
# --- Build TF-IDF features and train the linear SVM -------------------------
def log_step(message):
    """Print *message* prefixed with the current local timestamp."""
    print(datetime.datetime.now(), message)


# Learn the vocabulary / IDF weights from the training texts.
log_step("creating vectorizer and fitting documents ")
vectorizer = TfidfVectorizer()
vectorizer.fit(documents)

# Turn the same texts into the TF-IDF matrix used for training.
log_step("transforming documents")
X1 = vectorizer.transform(documents)

log_step("creating LinearSVC")
clf = LinearSVC()

# fit() trains in place and returns the estimator itself, so `clf` keeps
# referring to the trained model.
log_step("training model")
clf.fit(X1, labels)

print("fitting completed: (documents, labels)")
print(len(documents), len(labels), sep=" , ")
# --- Classify every evaluation set and write the predictions ----------------
# Loop-invariant cleaning pattern, compiled once; the raw string keeps the
# pattern identical while avoiding the invalid "\w" escape of a plain literal.
# NOTE(review): unlike the training pattern, this one does not keep tabs, so
# words separated only by a tab are glued together here - confirm that the
# evaluation files contain no tab-separated fields.
predict_junk_re = re.compile(r"[^\w ęóśłżźćń]+", flags=re.UNICODE)

for name in ['dev-0', 'dev-1', 'test-A']:
    print(datetime.datetime.now(), 'now serving:', name)

    # Explicit UTF-8 so decoding matches the Polish-letter pattern regardless
    # of the platform's locale encoding.
    with open(name + '/data.tsv', 'rt', encoding='utf-8') as source:
        # One cleaned, lower-cased text per input line, in file order.
        documents = [
            predict_junk_re.sub("", line.lower()).rstrip()
            for line in source
        ]
    print(datetime.datetime.now(), 'read ', len(documents), ' from', name)

    print(datetime.datetime.now(), 'transforming and classifying')
    # Reuse the vectorizer fitted on the training data; only transform here.
    x = vectorizer.transform(documents)
    output = clf.predict(x)

    print(datetime.datetime.now(), 'saving output to ' + name + '/out.tsv')
    # One predicted label per line, in the same order as the input lines.
    with open(name + '/out.tsv', 'w', encoding='utf-8') as wr:
        wr.writelines(ans + '\n' for ans in output)

print('finished')