PCQRSCANER/venv/Lib/site-packages/Hooke/extract.py

import urllib.request
from bs4 import BeautifulSoup
from tika import parser
import string
from nltk.tokenize import word_tokenize
import wget
import os

def download_pdf(url):
    '''Download a PDF and extract its text via Tika (Not working properly).'''
    # Ensure the temp directory exists before wget writes into it.
    os.makedirs("temp", exist_ok=True)
    wget.download(url, "temp/temp.pdf", None)
    parsed = parser.from_file("temp/temp.pdf")['content']
    os.remove("temp/temp.pdf")
    # Tika returns None for files it cannot parse.
    if parsed is None:
        return ""
    return parsed.replace("\n\n", "\n").replace("\n", " ").replace("\r", "").replace("\t", " ").replace("  ", " ")

def text(url, timeout=30, stika=True):
    '''Downloads the text content of a URL; returns "" on any failure.'''
    try:
        if ".pdf" in url and stika:
            return download_pdf(url)
        elif ".pdf" in url and not stika:
            return ""
        else:
            html = urllib.request.urlopen(url, timeout=timeout).read().decode()
            # Some PDF links lack a .pdf extension; detect them by the magic bytes.
            if html[:5] == "%PDF-":
                if stika:
                    return download_pdf(url)
                return ""
            soup = BeautifulSoup(html, "lxml")
            # Strip non-content elements before extracting the visible text.
            for s in soup(['style', 'script', '[document]', 'head', 'title']):
                s.extract()
            return soup.get_text().replace("\n\n", "\n").replace("\n", " ").replace("\r", "").replace("\t", " ").replace("  ", " ")
    except Exception:
        return ""

def normalize(x):
    '''Lowercases, removes punctuation, strips, and tokenizes a list of words.'''
    y = " ".join(x)
    output = y.lower()
    output = output.translate(str.maketrans("", "", string.punctuation))
    output = output.strip()
    return word_tokenize(output)

def doall(url, timeout, pdfsupport):
    '''Runs the whole extract-and-normalize pipeline (easier to multithread).'''
    raw = text(url, timeout, pdfsupport)
    nor = normalize(raw.split())
    return nor
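
# A minimal usage sketch; the URL below is a hypothetical placeholder, and
# word_tokenize assumes NLTK's 'punkt' data is installed (nltk.download('punkt')).
if __name__ == "__main__":
    tokens = doall("https://example.com/article.html", timeout=30, pdfsupport=True)
    print(tokens[:20])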