import re import json from os import walk import spacy from tqdm import tqdm import csv path_to_file_input = "./recipe" recipe_files = [] for (dirpath, dirnames, filenames) in walk(path_to_file_input): recipe_files.extend(filenames) break recipes_names=[] recipes_ingredients=dict() for recipe in recipe_files: with open(path_to_file_input+"/"+recipe) as f: data=json.load(f) recipe_ingredients=[] for row in data["content"]: if row['type']=='ingredient': recipe_ingredients.append(" ".join(re.findall("[a-zA-z]+",row['text']))) recipes_ingredients[int(recipe[:3])]=[re.sub('json|(I+)',' ', ' '.join(re.findall("[a-zA-Z]+",recipe)))]+recipe_ingredients text_list = [] for k,v in recipes_ingredients.items(): text_list.append(str(k)+' '+' '.join(v)) tok_text=[] nlp=spacy.load("en_core_web_sm") stopwords=['can','cans','inch','cup','cups','and','or','what','teaspoon','teaspoons','chopped','cut'] for doc in tqdm(nlp.pipe(text_list, disable=['tagger', 'parser', 'ner'])): tok=[t.text for t in doc if not t.text in stopwords] tok_text.append(tok) print(tok_text) with open('ingredients_recipes_merged.csv', 'w') as f: wr=csv.writer(f) wr.writerows(tok_text)