# from transformers import *
# import torch
#
#
# # Let's see how to increase the vocabulary of a BERT model and tokenizer
# tokenizer = BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased")
# model = BertModel.from_pretrained("google-bert/bert-base-uncased")
#
# num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
# print("We have added", num_added_toks, "tokens")
# # Notice: resize_token_embeddings expects the full size of the new vocabulary, i.e. the length of the tokenizer.
# model.resize_token_embeddings(len(tokenizer))

# from datasets import Dataset
# import pandas as pd
#
# # Load the tab-separated training data (label column, text column) into a Hugging Face Dataset.
# raw_data = pd.read_csv('train/train.tsv', sep='\t', names=['y', 'x'], header=None)
# raw_data = Dataset.from_pandas(raw_data)
# print(raw_data)
#
# # Label mappings for CoNLL-style token classification.
# label_list = ['B-ORG', 'O', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']
# id2label = {i: label for i, label in enumerate(label_list)}
# label2id = {v: k for k, v in id2label.items()}

# print("aaaa Aaaa AAAA aAAA".lower())

# from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
#
# # Default NER pipeline.
# recognizer = pipeline("ner")
# a = recognizer([" Peter Blackburn BRUSSELS 1996-08-22 The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep ."])
# print(a)
#
# # NER pipeline with an explicitly chosen model and tokenizer.
# model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
# tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
# recognizer = pipeline("ner", model=model, tokenizer=tokenizer)
# a = recognizer([" Peter Blackburn BRUSSELS 1996-08-22 The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep ."])
# print(a)

## https://medium.com/@anyuanay/working-with-hugging-face-lesson-2-1-71c6e4662479
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Token-level NER with a BERT model fine-tuned on CoNLL-2003.
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

text = " Peter Blackburn BRUSSELS 1996-08-22 The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep ."

ner_results = nlp(text)
print(ner_results)
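
# The raw output above is per word-piece (e.g. "Black" and "##burn" come back as separate
# entries). A minimal sketch of merging pieces into whole entities, assuming a transformers
# version recent enough to support the `aggregation_strategy` argument of the NER pipeline:
nlp_grouped = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
grouped_results = nlp_grouped(text)
# Each entry now covers a full span, e.g. {'entity_group': 'PER', 'word': 'Peter Blackburn', ...}
print(grouped_results)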
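
# The commented-out label_list / id2label / label2id above look like preparation for
# fine-tuning on CoNLL-style tags. A hypothetical sketch (not from the original script) of
# wiring those mappings into a fresh token-classification head on a base BERT checkpoint:
label_list = ['B-ORG', 'O', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {v: k for k, v in id2label.items()}
ner_model = AutoModelForTokenClassification.from_pretrained(
    "google-bert/bert-base-cased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)
# ner_model now has a randomly initialised classification layer sized for these 9 tags,
# ready to be fine-tuned on the TSV data loaded above (e.g. with the Trainer API).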