56 lines
2.6 KiB
Python
56 lines
2.6 KiB
Python
# from transformers import *
|
|
# import torch
|
|
#
|
|
# # Let's see how to increase the vocabulary of the BERT model and tokenizer
|
|
# tokenizer = BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased")
|
|
# model = BertModel.from_pretrained("google-bert/bert-base-uncased")
|
|
#
|
|
# num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
|
|
# print("We have added", num_added_toks, "tokens")
|
|
# # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
|
|
# model.resize_token_embeddings(len(tokenizer))
|
|
|
|
# from datasets import Dataset
|
|
# import pandas as pd
|
|
#
|
|
# raw_data = pd.read_csv('train/train.tsv', sep='\t', names=['y', 'x'], header=None)
|
|
#
|
|
# raw_data = Dataset.from_pandas(raw_data)
|
|
#
|
|
# print(raw_data)
|
|
#
|
|
#
|
|
# label_list = ['B-ORG', 'O', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']
|
|
# id2label = {i: label for i, label in enumerate(label_list)}
|
|
# label2id = {v: k for k, v in id2label.items()}
|
|
|
|
|
|
# print("aaaa Aaaa AAAA aAAA".lower())
|
|
#
|
|
#
|
|
#
|
|
# from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
|
|
#
|
|
# recognizer = pipeline("ner")
|
|
# a = recognizer(["</S> Peter Blackburn </S> BRUSSELS 1996-08-22 </S> The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep ."])
|
|
# print(a)
|
|
#
|
|
# model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
|
|
# tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
|
|
# recognizer = pipeline("ner", model=model, tokenizer=tokenizer)
|
|
# a = recognizer(["</S> Peter Blackburn </S> BRUSSELS 1996-08-22 </S> The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep ."])
|
|
# print(a)
|
|
|
|
## https://medium.com/@anyuanay/working-with-hugging-face-lesson-2-1-71c6e4662479
|
|
|
|
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Checkpoint identifier passed to both from_pretrained calls below, kept in one
# place so the model and its tokenizer are always loaded from the same source.
MODEL_NAME = "dslim/bert-base-NER"

model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Build the "ner" pipeline from the explicitly loaded model and tokenizer
# rather than relying on the pipeline's default checkpoint.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

# Sample input; the "</S>" markers are part of the text and are passed to the
# pipeline unchanged.
text = "</S> Peter Blackburn </S> BRUSSELS 1996-08-22 </S> The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep ."

# Run the pipeline on the sample text and print whatever it returns.
ner_results = nlp(text)
print(ner_results)