# retroc2/spp.py
#
# 70 lines
# 2.1 KiB
# Python

import csv
import numpy as np
import pandas as pd
from scipy.sparse import vstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from stop_words import get_stop_words
def to_n(word, n):
    """Return *word* cut down to its first *n* characters.

    Words that already have at most *n* characters are returned unchanged.
    """
    exceeds_limit = len(word) >= n + 1
    return word[:n] if exceeds_limit else word
def stem(sentence):
    """Crudely stem *sentence* by truncating each word to 7 characters.

    The sentence is split on whitespace, every token is shortened with
    ``to_n`` and the results are joined back with single spaces.
    """
    shortened = (to_n(token, 7) for token in sentence.split())
    return ' '.join(shortened)
# Punctuation and digits dropped from the text before vectorization.
# The digit run previously read '012345679', which left every '8' in place.
_SPECIALS_TABLE = str.maketrans(
    '', '', '.,<>)(*&^%$#@~;:!?-_=+/\\\'\"|{}[]0123456789')


def remove_specials(text):
    """Return *text* with punctuation characters and all digits removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* without any of the characters listed in the
        translation table above; all other characters (including
        whitespace and non-ASCII letters) are kept as they are.
    """
    # One C-level pass instead of one ``str.replace`` call per character.
    return text.translate(_SPECIALS_TABLE)
def _normalize(text):
    """Lower-case *text*, strip special characters and truncate its words."""
    return stem(remove_specials(text.lower()))


def _read_inputs(path):
    """Load a one-column TSV of documents and normalize the text.

    Quoting is disabled so that a stray double quote inside a document
    cannot make the parser merge several lines into one row, which would
    leave the predictions misaligned with the input lines.
    """
    frame = pd.read_csv(path, sep='\t', names=['text'], quoting=csv.QUOTE_NONE)
    frame['text'] = [_normalize(doc) for doc in frame['text']]
    return frame


def _write_predictions(path, values):
    """Write *values* to *path*, one value per line, each ending in a newline."""
    # newline='\n' keeps the line endings identical on every platform.
    with open(path, 'w', newline='\n') as out:
        out.writelines(str(value) + '\n' for value in values)


# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
# NOTE(review): unlike the in.tsv files, the training file is read with
# pandas' default quoting; confirm its text holds no unbalanced double quotes.
df = pd.read_csv('train/train.tsv.xz',
                 sep='\t',
                 compression='xz',
                 names=['date_from', 'date_to', 'title', 'source', 'text'])
df['text'] = [_normalize(doc) for doc in df['text']]

# NOTE(review): the stop words are passed as-is, while the documents have
# their words truncated to 7 characters, so longer stop words presumably
# never match - confirm whether that is intended.
vectorizer = TfidfVectorizer(stop_words=get_stop_words('polish'))
x = vectorizer.fit_transform(df['text'])

# Every document is used twice: once with date_from and once with date_to
# as the regression target.
x = vstack([x, x])
labels1 = df.pop('date_from')
labels2 = df.pop('date_to')
labels = np.concatenate((labels1, labels2), axis=0)  # TODO (left by the original author, unspecified)
lin_reg = LinearRegression()
lin_reg.fit(x, labels)

# ---------------------------------------------------------------------------
# Predictions for dev-0
# ---------------------------------------------------------------------------
t_df = _read_inputs('dev-0/in.tsv')
# Expected dates for dev-0; loaded but not used in the code visible here.
tlabs = pd.read_csv('dev-0/expected.tsv', sep='\t', names=['date'])
vecs = vectorizer.transform(t_df['text'])
predict = lin_reg.predict(vecs)
_write_predictions('dev-0/out.tsv', predict)

# ---------------------------------------------------------------------------
# Predictions for test-A
# ---------------------------------------------------------------------------
t_df = _read_inputs('test-A/in.tsv')
vecs = vectorizer.transform(t_df['text'])
predict = lin_reg.predict(vecs)
_write_predictions('test-A/out.tsv', predict)