AMUseBot/ai_talks/AMUseBotBackend/utils/tools/generate_merged_ingredients.py
2023-06-05 21:23:33 +02:00

47 lines
1.2 KiB
Python

import re
import json
from os import walk
import spacy
from tqdm import tqdm
import csv
# Build one token row per recipe ("<id> <name words> <ingredient words>") from
# the JSON recipe files found in ``path_to_file_input`` and write all rows to
# ``ingredients_recipes_merged.csv``.
path_to_file_input = "./recipe"

# Collect only the files that sit directly in the input directory: the loop is
# left after the first ``walk`` result, so sub-directories are never visited.
recipe_files = []
for _dirpath, _dirnames, filenames in walk(path_to_file_input):
    recipe_files.extend(filenames)
    break

# Not filled or read anywhere in this script; kept so the module-level name
# continues to exist.
recipes_names = []

# Maps the integer recipe id to ``[recipe name, ingredient 1, ingredient 2, ...]``.
recipes_ingredients = dict()
for recipe in recipe_files:
    # JSON is read as UTF-8 explicitly instead of relying on the locale default.
    with open(path_to_file_input + "/" + recipe, encoding="utf-8") as f:
        data = json.load(f)

    # Keep only the alphabetic words of every entry typed as an ingredient.
    # The character class is ``[a-zA-Z]``: the former ``A-z`` range also
    # matched ``[ \ ] ^ _`` and the backtick, and disagreed with the pattern
    # used for the file name below.
    recipe_ingredients = [
        " ".join(re.findall("[a-zA-Z]+", row['text']))
        for row in data["content"]
        if row['type'] == 'ingredient'
    ]

    # Recipe name = alphabetic words of the file name, with the text "json"
    # and any run of capital "I" blanked out (presumably the extension and
    # Roman-numeral suffixes -- confirm against the actual file names).
    recipe_name = re.sub(
        'json|(I+)', ' ', ' '.join(re.findall("[a-zA-Z]+", recipe))
    )

    # The first three characters of the file name are parsed as the recipe id;
    # a file name that does not start with three digits raises ValueError.
    recipes_ingredients[int(recipe[:3])] = [recipe_name] + recipe_ingredients

# One plain-text line per recipe: the id followed by the name and ingredients.
text_list = [
    str(k) + ' ' + ' '.join(v) for k, v in recipes_ingredients.items()
]

tok_text = []
nlp = spacy.load("en_core_web_sm")
stopwords = ['can', 'cans', 'inch', 'cup', 'cups', 'and', 'or', 'what',
             'teaspoon', 'teaspoons', 'chopped', 'cut']
# Set copy of the stop words so each token is checked with a hash lookup.
_stopword_lookup = frozenset(stopwords)

# Tokenise every recipe line with the tagger, parser and NER components
# disabled, and drop tokens whose exact text is a stop word (the comparison
# is case-sensitive).
for doc in tqdm(nlp.pipe(text_list, disable=['tagger', 'parser', 'ner'])):
    tok = [t.text for t in doc if t.text not in _stopword_lookup]
    tok_text.append(tok)
print(tok_text)

# ``newline=''`` is what the csv module requires of files handed to
# ``csv.writer``; without it, platforms that translate "\n" get an extra blank
# line after every row.
with open('ingredients_recipes_merged.csv', 'w', newline='', encoding='utf-8') as f:
    wr = csv.writer(f)
    wr.writerows(tok_text)