added proxy downloader

parent 35d3b11ec6
commit c205e1b627
38 #duck-duck-go.py# Normal file
@@ -0,0 +1,38 @@
import requests
from string import Template
from random import choice


class DuckDuckGo(object):
    """Documentation for DuckDuckGo

    """

    def __init__(self, proxies=None, language=''):
        self.proxies = [] if proxies is None else proxies
        self.language = language
        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=$lang')

    def _get(self, query, language):
        if self.proxies:
            proxy = choice(self.proxies)
            ip_port = proxy[0]
            protocol = proxy[1]
            link = self.query.substitute(query=query, lang=language)
            proxies = {protocol: ip_port}
            requests.get(link, proxies=proxies)
        else:
            pass  # no proxies configured; no request is made

    def body(self, query, language):
        pass

    def links(self, query, language):
        pass


def main():
    pass


if __name__ == '__main__':
    main()
38 duckduckgo.py Normal file
@@ -0,0 +1,38 @@
import requests
from string import Template
from random import choice


class DuckDuckGo(object):
    """Documentation for DuckDuckGo

    """

    def __init__(self, proxies=None, language=''):
        self.proxies = [] if proxies is None else proxies
        self.language = language
        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=$lang')

    def _get(self, query, language):
        link = self.query.substitute(query=query, lang=language)
        if self.proxies:
            proxy = choice(self.proxies)
            ip_and_port = proxy[0]
            protocol = proxy[1]
            proxies = {protocol: ip_and_port}
            # use the randomly chosen proxy for this request
            return requests.get(link, proxies=proxies)
        return requests.get(link)

    def body(self, query, language):
        pass

    def links(self, query, language):
        pass


def main():
    pass


if __name__ == '__main__':
    main()
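For context, a minimal usage sketch of the DuckDuckGo class added above (not part of the commit). It assumes proxies are passed as ('ip:port', 'protocol') tuples, which is the shape _get unpacks; the proxy address and the 'pl-pl' region code are made-up example values:

from duckduckgo import DuckDuckGo

# hypothetical proxy entry and region code, only to illustrate the call shape
ddg = DuckDuckGo(proxies=[('203.0.113.10:8080', 'http')], language='pl-pl')
response = ddg._get('parafia kontakt', 'pl-pl')  # returns a requests.Response
print(response.status_code)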
33 full_scrapper.py Normal file
@@ -0,0 +1,33 @@
import dill
from google import search


def check(parish):
    # NOTE: `query` is not defined anywhere in this module yet
    if parish.url in search(query, lang='pl', stop=10, pause=3.0):
        return True


def find_url(parish):
    pass


def stem_url(url):
    pass


def main():
    parishes = []
    with open('./parishes.dill', 'rb') as f:
        parishes = dill.load(f)

    for parish in parishes:
        if parish.url:
            check(parish)
        else:
            find_url(parish)

    import ipdb
    ipdb.set_trace()


if __name__ == "__main__":
    main()
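check() leans on the `google` package's search() generator, which yields result URLs. A standalone sketch of that call; the parish name used as the query is a made-up example, since the committed check() still references an undefined `query`:

from google import search

# print the first 10 Google result URLs for a hypothetical parish name
for url in search('Parafia sw. Anny Krakow', lang='pl', stop=10, pause=3.0):
    print(url)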
30 proxy.py Normal file
@@ -0,0 +1,30 @@
from selenium import webdriver
import re


class Proxy():
    def __init__(self, proxies=None):
        "docstring"
        self.proxies = [] if proxies is None else proxies

    def download(self):
        driver = webdriver.Chrome()
        driver.maximize_window()
        driver.get('http://www.gatherproxy.com/proxylist/anonymity/?t=elite')
        full_list_button = driver.find_element_by_xpath('//input[@type="submit" and @value="Show Full List"]')
        full_list_button.click()
        print(driver.page_source)
        # walk the pagination links; after the loop `match` holds the last one,
        # i.e. the highest page number
        for match in re.finditer('<a href="#(.*?)" class="inactive" onclick="gp.pageClick', driver.page_source):
            pass
        pages_nr = int(match.group(1))
        for i in range(2, pages_nr + 1):
            driver.execute_script('gp.pageClick(' + str(i) + ')')
            print(i)

    def _get_proxies(self, html):
        # each match is (ip, hex-encoded port); the port is decoded from hex
        for match in re.findall(r"<td><script>document.write\('(.*?)'[\w\W]*?<td><script>document.write\(gp.dep\('(.*?)'", html):
            proxy = (match[0], str(int(match[1], 16)))
            self.proxies.append(proxy)


if __name__ == '__main__':
    p = Proxy()
    p.download()
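A sketch of how the proxy downloader and the DuckDuckGo scraper could be wired together (not part of the commit). It assumes download() eventually feeds each page's source to _get_proxies(), and it reshapes the (ip, port) tuples that _get_proxies() stores into the ('ip:port', 'protocol') tuples that DuckDuckGo expects, with 'http' as an assumed protocol:

from proxy import Proxy
from duckduckgo import DuckDuckGo

p = Proxy()
p.download()  # assumed to populate p.proxies via _get_proxies()

# _get_proxies() stores (ip, port); DuckDuckGo unpacks ('ip:port', protocol)
ddg_proxies = [('{}:{}'.format(ip, port), 'http') for ip, port in p.proxies]

ddg = DuckDuckGo(proxies=ddg_proxies, language='pl-pl')
response = ddg._get('test query', 'pl-pl')
print(response.status_code)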
requirements.in
@@ -1,2 +1,3 @@
requests
dill
dryscrape
requirements.txt
@@ -5,4 +5,8 @@
# pip-compile --output-file requirements.txt requirements.in
#
dill==0.2.6
dryscrape==1.0
lxml==3.8.0               # via dryscrape
requests==2.13.0
webkit-server==1.0        # via dryscrape
xvfbwrapper==0.2.9        # via dryscrape