import regex as re
from sklearn.feature_extraction.text import HashingVectorizer
import torch

# truncate each token to its first 7 characters (a crude prefix-based stemming)
token_root_len = 7

class Analyzer(object):

    def __init__(self):
        # a token is any maximal run of Unicode letters or digits
        self.token_pat = re.compile(r'(?:\p{L}|\d)+')

    def __call__(self, doc):
        # keep only the first token_root_len characters of each token
        return [tok[0:token_root_len] for tok in self.token_pat.findall(doc)]
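
# Quick illustrative check (an assumption, not part of the original file):
#   Analyzer()('Warszawa, rok 1918')  ->  ['Warszaw', 'rok', '1918']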

# hyperparameter: the number of hash bits (2**18 features, i.e. an 18-bit hash)
vector_length = 2**18

vectorizer = HashingVectorizer(n_features=vector_length, analyzer=Analyzer())
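
# Sketch of what the vectorizer yields (illustrative): one sparse row per
# document, each of width vector_length, e.g.
#   vectorizer.transform(['rok 1918']).shape  ->  (1, 2**18)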

# reference year: the normalized target is centred on 1913
midpoint = 1913.0

def vectorize_text(content):
    # convert: sklearn sparse matrix => numpy array => pytorch tensor;
    # HashingVectorizer is stateless, so transform() needs no prior fit
    return torch.from_numpy(vectorizer.transform([content]).toarray())[0]

def vectorize_batch(contents):
    # same conversion as above, for a whole batch of documents at once
    return torch.from_numpy(vectorizer.transform(contents).toarray())
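
# Illustrative usage (an assumption, not part of the original file):
#   X = vectorize_batch(['rok 1918', 'wiek XIX'])  # dense tensor, shape (2, 2**18)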

def process_line(line):
    fields = line.strip('\n').split('\t')
    year_from, year_to, _, _, content = fields
    # normalize the years to (-1, 1): centre the mid-year on midpoint, scale by a century
    year_normalized = ((float(year_from) + float(year_to)) / 2 - midpoint) / 100.0
    return (content, torch.tensor(year_normalized))
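
# Worked example (illustrative): year_from=1850, year_to=1860 gives the mid-year
# 1855, and (1855 - 1913) / 100 = -0.58.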