"""Print the cosine similarity between two DistilBERT sentence embeddings.

The script loads the ``Geotrend/distilbert-base-pl-cased`` tokenizer and
model, encodes a sample dish record (``text``) and a short query
(``prompt``), takes the last-layer hidden state at sequence position 0 of
each as its embedding, and prints the cosine similarity of the two vectors.
"""
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

# Single source of truth for the checkpoint so tokenizer and model cannot
# drift apart.
MODEL_NAME = "Geotrend/distilbert-base-pl-cased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Sample record to embed. Only the name line is active; the rest of the
# record (ingredient and allergen lists) is kept below, commented out, so it
# can be restored by moving the closing triple quote back to the end.
text = """
"nazwa": "Tatar wołowy","""
#   "skladniki": [
#     "wołowina",
#     "cebula",
#     "ogórki kiszone",
#     "musztarda",
#     "jajko",
#     "pieprz",
#     "sól"
#   ],
#   "alergeny": [
#     "jajko",
#     "gorczyca"
#   ]
# """
prompt = "tatar"

# Each call encodes a single string, so the batch dimension is 1 and
# ``padding=True`` has nothing to pad; it is kept for parity with batched use.
encoded_input = tokenizer(text, return_tensors='pt', padding=True)
encoded_prompt = tokenizer(prompt, return_tensors='pt', padding=True)

# Inference only: disable autograd so no computation graph is recorded and
# no intermediate activations are retained. The numeric result is unchanged.
with torch.no_grad():
    output = model(**encoded_input)
    output_prompt = model(**encoded_prompt)

# Use the hidden state at sequence position 0 as the embedding of the whole
# input (presumably the [CLS] token for this tokenizer -- TODO confirm).
text_embedding = output.last_hidden_state[:, 0, :]
prompt_embedding = output_prompt.last_hidden_state[:, 0, :]

# dim=1 compares along the hidden dimension, yielding one value per batch
# row; with a batch of one, ``.item()`` extracts that single scalar.
cosine = torch.nn.functional.cosine_similarity(
    text_embedding, prompt_embedding, dim=1)

print(cosine.item())