"""Fit a linear regression on TF-IDF features and write date predictions.

The model is trained on ``train/train.tsv.xz`` (columns ``date_from``,
``date_to``, ``title``, ``source``, ``text``) and then applied to
``dev-0/in.tsv`` and ``test-A/in.tsv``; one prediction per line is written
to the matching ``out.tsv`` file.
"""
import csv

import numpy as np
import pandas as pd
from scipy.sparse import vstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from stop_words import get_stop_words

# Number of leading characters kept from every word by ``stem``.
STEM_LENGTH = 7

# Characters deleted by ``remove_specials``: punctuation plus ALL ten digits.
# (The original list skipped '8', so that digit alone survived cleaning.)
_SPECIAL_CHARS = '.,<>)(*&^%$#@~;:!?-_=+/\\\'\"|{}[]0123456789'
_STRIP_TABLE = str.maketrans('', '', _SPECIAL_CHARS)


def to_n(word, n):
    """Return the first ``n`` characters of ``word``.

    A word shorter than ``n`` is returned unchanged, which is exactly what
    slicing does, so no explicit length check is needed.
    """
    return word[:n]


def stem(sentence):
    """Truncate every whitespace-separated word of ``sentence``.

    Each word is cut to ``STEM_LENGTH`` characters and the words are joined
    back with single spaces (runs of whitespace therefore collapse).
    """
    return ' '.join(to_n(word, STEM_LENGTH) for word in sentence.split())


def remove_specials(text):
    """Return ``text`` with punctuation characters and digits deleted."""
    # A single translate pass replaces the chain of per-character replace().
    return text.translate(_STRIP_TABLE)


def preprocess(texts):
    """Lower-case, strip specials from, and stem every string in ``texts``.

    Returns a list with one cleaned string per input element.
    """
    return [stem(remove_specials(text.lower())) for text in texts]


def write_predictions(path, values):
    """Write each element of ``values`` on its own LF-terminated line.

    Plain line writes are used instead of ``csv.writer(delimiter='\\n')``:
    that trick ended the file with the csv module's default ``'\\r\\n'`` row
    terminator, leaving a stray carriage return on the last prediction.
    """
    with open(path, 'w', newline='\n') as out:
        out.writelines(f'{str(value)}\n' for value in values)


def predict_to_file(in_path, out_path, fitted_vectorizer, model):
    """Read texts from ``in_path``, predict, and save to ``out_path``.

    ``in_path`` is read as a single-column, tab-separated file.
    ``quoting=csv.QUOTE_NONE`` keeps quote characters in the raw text from
    merging several input lines into one field, which would change the
    number of predictions written.
    """
    frame = pd.read_csv(in_path, sep='\t', names=['text'],
                        quoting=csv.QUOTE_NONE)
    features = fitted_vectorizer.transform(preprocess(frame['text']))
    write_predictions(out_path, model.predict(features))


# ---------------------------------------------------------------- training
# NOTE(review): unlike the in.tsv reads, this read uses pandas' default
# quoting -- confirm that is right for the training file.
df = pd.read_csv('train/train.tsv.xz', sep='\t', compression='xz',
                 names=['date_from', 'date_to', 'title', 'source', 'text'])
df['text'] = preprocess(df['text'])

# NOTE(review): the stop words are passed as-is, while the documents are
# truncated by ``stem``; longer stop words may therefore never match.
vectorizer = TfidfVectorizer(stop_words=get_stop_words('polish'))
x = vectorizer.fit_transform(df['text'])

# Every document is used twice: once labelled with date_from and once with
# date_to, so the regression is fitted against both ends of the interval.
x = vstack([x, x])
labels1 = df.pop('date_from')
labels2 = df.pop('date_to')
labels = np.concatenate((labels1, labels2), axis=0)  # todo

lin_reg = LinearRegression()
lin_reg.fit(x, labels)

# -------------------------------------------------------------- prediction
predict_to_file('dev-0/in.tsv', 'dev-0/out.tsv', vectorizer, lin_reg)
predict_to_file('test-A/in.tsv', 'test-A/out.tsv', vectorizer, lin_reg)