import os
import string
import urllib.request

import wget
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize  # requires nltk's punkt data: nltk.download('punkt')
from tika import parser


def download_pdf(url):
    '''Download a PDF and extract its text with Tika.'''
    # The original flagged this as "not working properly"; one likely cause is
    # that wget cannot create the target directory, so ensure it exists first.
    os.makedirs("temp", exist_ok=True)
    wget.download(url, "temp/temp.pdf", None)
    parsed = parser.from_file("temp/temp.pdf")['content']
    os.remove("temp/temp.pdf")
    if parsed is None:  # Tika returns None when no text could be extracted
        return ""
    return (parsed.replace("\n\n", "\n")
                  .replace("\n", " ")
                  .replace("\r", "")
                  .replace("\t", " ")
                  .replace("  ", " "))  # collapse double spaces


def text(url, timeout=30, stika=True):
    '''Download the text content of a URL; optionally parse PDFs with Tika.'''
    try:
        if ".pdf" in url:
            return download_pdf(url) if stika else ""
        html = urllib.request.urlopen(url, timeout=timeout).read().decode()
        if html[:5] == "%PDF-":  # PDF served without a .pdf extension
            return download_pdf(url) if stika else ""
        soup = BeautifulSoup(html, "lxml")
        # Drop non-content tags before extracting the visible text.
        for s in soup(['style', 'script', '[document]', 'head', 'title']):
            s.extract()
        return (soup.getText().replace("\n\n", "\n")
                              .replace("\n", " ")
                              .replace("\r", "")
                              .replace("\t", " ")
                              .replace("  ", " "))  # collapse double spaces
    except Exception:
        return ""


def normalize(x):
    '''Lowercase, strip punctuation, and tokenize a list of words.'''
    output = " ".join(x).lower()
    output = output.translate(str.maketrans("", "", string.punctuation))
    return word_tokenize(output.strip())


def doall(url, timeout, pdfsupport):
    '''Download and normalize in one call (easier to multithread).'''
    raw = text(url, timeout, pdfsupport)
    return normalize(raw.split())
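
# Minimal usage sketch: `doall` bundles download and normalization so it can be
# mapped over many URLs from a thread pool, which is what its docstring hints
# at. The URLs below are placeholders (assumptions), not part of this module.
if __name__ == "__main__":
    from concurrent.futures import ThreadPoolExecutor

    urls = [
        "https://example.com/page.html",
        "https://example.com/paper.pdf",
    ]
    with ThreadPoolExecutor(max_workers=4) as pool:
        # One token list per URL; failed downloads come back as empty lists.
        results = list(pool.map(lambda u: doall(u, 30, True), urls))
    for url, tokens in zip(urls, results):
        print(url, tokens[:10])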