import csv
import json
import re
from os import walk

import spacy
from tqdm import tqdm

path_to_file_input = "./recipe"

# Collect the recipe file names from the top level of the input directory.
recipe_files = []
for (dirpath, dirnames, filenames) in walk(path_to_file_input):
    recipe_files.extend(filenames)
    break

recipes_names = []  # kept from the original script; not used below

# Map each recipe id to its cleaned-up name plus its ingredient lines.
recipes_ingredients = dict()
for recipe in recipe_files:
    with open(path_to_file_input + "/" + recipe) as f:
        data = json.load(f)
    recipe_ingredients = []
    for row in data["content"]:
        if row["type"] == "ingredient":
            # Keep only alphabetic tokens from the ingredient text.
            recipe_ingredients.append(" ".join(re.findall("[a-zA-Z]+", row["text"])))
    # The file name is assumed to start with a three-digit recipe id;
    # strip the "json" extension and runs of "I" from the name itself.
    recipes_ingredients[int(recipe[:3])] = [
        re.sub("json|(I+)", " ", " ".join(re.findall("[a-zA-Z]+", recipe)))
    ] + recipe_ingredients

# Flatten each recipe into a single "id name ingredients" string.
text_list = []
for k, v in recipes_ingredients.items():
    text_list.append(str(k) + " " + " ".join(v))

# Tokenise with spaCy, dropping common unit/measure words.
nlp = spacy.load("en_core_web_sm")
stopwords = ["can", "cans", "inch", "cup", "cups", "and", "or", "what",
             "teaspoon", "teaspoons", "chopped", "cut"]

tok_text = []
for doc in tqdm(nlp.pipe(text_list, disable=["tagger", "parser", "ner"])):
    tok = [t.text for t in doc if t.text not in stopwords]
    tok_text.append(tok)

print(tok_text)

# Write one tokenised recipe per CSV row.
with open("ingredients_recipes_merged.csv", "w", newline="") as f:
    wr = csv.writer(f)
    wr.writerows(tok_text)