#!/usr/bin/env python
# -*- coding: utf-8 -*-
# TASK REMARK: ALL FILES WERE STRIPPED FROM CHAR
# DUE TO PROBLEMS WITH READING INPUT FILES
"""Train a TF-IDF + LinearSVC classifier and label the evaluation sets.

The script reads ``train/data.tsv`` (one ``label<TAB>text`` record per line),
fits a ``TfidfVectorizer`` and a ``LinearSVC`` on the cleaned text, and then,
for each of ``dev-0``, ``dev-1`` and ``test-A``, reads ``<name>/data.tsv`` and
writes one predicted label per line to ``<name>/out.tsv``.
"""
import datetime
import gzip
import re
import string

import ftfy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Patterns are compiled once instead of on every input line.  Raw strings
# avoid the "invalid escape sequence" warning for ``\w`` and ``\s``.
# The training pattern keeps TAB because it separates the label from the text.
_TRAIN_NOISE = re.compile(r"[^\w ęóśłżźćń\t]+", flags=re.UNICODE)
# NOTE(review): the evaluation pattern does not keep TAB, so tabs inside an
# evaluation line are deleted (joining the neighbouring words), whereas tabs
# inside training text become a single space.  Kept as in the original --
# confirm whether the evaluation files can contain tabs.
_EVAL_NOISE = re.compile(r"[^\w ęóśłżźćń]+", flags=re.UNICODE)
_WHITESPACE = re.compile(r"\s+")


def _log(*parts):
    """Print *parts* prefixed with the current timestamp."""
    print(datetime.datetime.now(), *parts)


documents = []
labels = []
m = 0
f = 0

_log("starting")
# The encoding is given explicitly so that decoding the Polish text does not
# depend on the locale of the machine running the script.
with open('train/data.tsv', 'rt', encoding='utf-8') as ins:
    for line_no, line in enumerate(ins, start=1):
        sub = _TRAIN_NOISE.sub("", line.lower()).rstrip()
        label, tab, text = sub.partition('\t')
        if not tab:
            # No label/text separator survived cleaning: report the real
            # 1-based line number of the input file and skip the record.
            print('error on line {:d}'.format(line_no))
            continue
        documents.append(_WHITESPACE.sub(" ", text))
        labels.append(label.upper())
        # ``label`` is lower-case here because the whole line was lowered.
        if label == 'm':
            m += 1
        elif label == 'f':
            f += 1

_log("file read ")
print('m:', m, 'f:', f)
# Guard the ratio: training data without any 'f' label must not crash the run.
print('m/f:', m / f if f else 'undefined (no f labels)')
print('read ', len(documents), ' lines')

_log("creating vectorizer and fitting documents ")
vectorizer = TfidfVectorizer()
vectorizer.fit(documents)

_log("transforming documents")
X1 = vectorizer.transform(documents)

_log("creating LinearSVC")
clf = LinearSVC()

_log("training model")
clf = clf.fit(X1, labels)
print("fitting completed: (documents, labels)")
print(len(documents), ",", len(labels))

for name in ['dev-0', 'dev-1', 'test-A']:
    _log('now serving:', name)
    with open(name + '/data.tsv', 'rt', encoding='utf-8') as source:
        documents = [
            _EVAL_NOISE.sub("", line.lower()).rstrip() for line in source
        ]
    _log('read ', len(documents), ' from', name)

    _log('transforming and classifying')
    x = vectorizer.transform(documents)
    output = clf.predict(x)

    _log('saving output to ' + name + '/out.tsv')
    with open(name + '/out.tsv', 'w', encoding='utf-8') as wr:
        # One predicted label per line, in input order.
        wr.writelines(ans + '\n' for ans in output)

print('finished')