import regex as re
import torch
from sklearn.feature_extraction.text import HashingVectorizer

# tokens are truncated to their first 7 characters (a crude form of stemming)
token_root_len = 7


class Analyzer:
    def __init__(self):
        # a token is any run of letters (in any script) or digits
        self.token_pat = re.compile(r'(?:\p{L}|\d)+')

    def __call__(self, doc):
        return [tok[:token_root_len] for tok in self.token_pat.findall(doc)]


# hyperparameter: hash size (2**18 buckets, i.e. an 18-bit hash)
vector_length = 2**18
vectorizer = HashingVectorizer(n_features=vector_length, analyzer=Analyzer())

midpoint = 1913.0


def vectorize_text(content):
    # HashingVectorizer is stateless, so transform() suffices (no fitting needed);
    # convert: sklearn sparse matrix => numpy array => PyTorch tensor
    return torch.from_numpy(vectorizer.transform([content]).toarray())[0]


def vectorize_batch(contents):
    # convert: sklearn sparse matrix => numpy array => PyTorch tensor
    return torch.from_numpy(vectorizer.transform(contents).toarray())


def process_line(line):
    fields = line.strip('\n').split('\t')
    year_from, year_to, _, _, content = fields
    # normalize years to roughly the (-1, 1) range
    year_normalized = ((float(year_from) + float(year_to)) / 2 - midpoint) / 100.0
    return (content, torch.tensor(year_normalized))
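
# --- Usage sketch (not part of the original listing) ---
# A minimal illustration of how the pieces above fit together, assuming the
# tab-separated input format that process_line expects (year_from, year_to,
# two ignored fields, then the document text). The sample values below are
# hypothetical.
sample_line = "1905\t1915\tid-1\ttitle\tsome historical document text\n"

content, year = process_line(sample_line)
print(year)  # ((1905 + 1915) / 2 - 1913) / 100 = -0.03

features = vectorize_text(content)
print(features.shape)  # torch.Size([262144]), i.e. (vector_length,)

batch = vectorize_batch([content, content])
print(batch.shape)  # torch.Size([2, 262144]), i.e. (batch_size, vector_length)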