added proxy downloader

This commit is contained in:
siulkilulki 2017-06-10 02:09:22 +02:00
parent 35d3b11ec6
commit c205e1b627
7 changed files with 147 additions and 0 deletions

38
#duck-duck-go.py# Normal file
View File

@ -0,0 +1,38 @@
import requests
from string import Template
from random import choice
class DuckDuckGo(object):
    """Minimal DuckDuckGo HTML-search client with optional proxy rotation.

    Proxies are (ip_port, protocol) tuples; one is picked at random per
    request.
    """

    def __init__(self, proxies=None, language=''):
        # Avoid a mutable default argument: build a fresh list per instance.
        self.proxies = [] if proxies is None else proxies
        self.language = language
        # URL template; $query is the search term, $kl/$lang the region code.
        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=$lang')

    def _get(self, query, language):
        """Fetch the result page for *query*, via a random proxy if any.

        Returns the ``requests.Response``.  Fixes the original draft, which
        had an empty ``else:`` branch (a syntax error), only built ``link``
        inside the proxy branch, and discarded the proxied response.
        """
        link = self.query.substitute(query=query, lang=language)
        if self.proxies:
            ip_port, protocol = choice(self.proxies)
            return requests.get(link, proxies={protocol: ip_port})
        return requests.get(link)

    def body(self, query, language):
        """Extract the result-page body (not yet implemented)."""
        pass

    def links(self, query, language):
        """Extract result links (not yet implemented)."""
        pass
def main():
    """Command-line entry point (not yet implemented)."""


if __name__ == '__main__':
    main()

38
duckduckgo.py Normal file
View File

@ -0,0 +1,38 @@
import requests
from string import Template
from random import choice
class DuckDuckGo(object):
    """Minimal DuckDuckGo HTML-search client with optional proxy rotation.

    Proxies are (ip_and_port, protocol) tuples; one is picked at random
    per request.
    """

    def __init__(self, proxies=None, language=''):
        # Avoid a mutable default argument: build a fresh list per instance.
        self.proxies = [] if proxies is None else proxies
        self.language = language
        # URL template; $query is the search term, $lang the region code.
        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=$lang')

    def _get(self, query, language):
        """Fetch the result page for *query* and return the Response.

        Bug fix: the original issued the proxied request, threw the
        response away, and then always fell through to a second,
        unproxied ``requests.get`` — so proxies were never actually used.
        """
        link = self.query.substitute(query=query, lang=language)
        if self.proxies:
            ip_and_port, protocol = choice(self.proxies)
            return requests.get(link, proxies={protocol: ip_and_port})
        return requests.get(link)

    def body(self, query, language):
        """Extract the result-page body (not yet implemented)."""
        pass

    def links(self, query, language):
        """Extract result links (not yet implemented)."""
        pass
def main():
    """Command-line entry point (not yet implemented)."""


if __name__ == '__main__':
    main()

33
full_scrapper.py Normal file
View File

@ -0,0 +1,33 @@
import dill
from google import search
def check(parish, query=None):
    """Return True if the parish's recorded URL appears in search results.

    NOTE(review): the original referenced an undefined global ``query``
    and returned the undefined name ``true`` — both NameErrors at runtime.
    The search query is presumably meant to be derived from the parish;
    until that is confirmed, it defaults to the parish URL itself (TODO
    confirm against the caller's intent).  Also made the negative path an
    explicit ``False`` instead of an implicit ``None``.
    """
    if query is None:
        query = parish.url
    if parish.url in search(query, lang='pl', stop=10, pause=3.0):
        return True
    return False
def find_url(parish):
    """Discover a URL for *parish* (not yet implemented)."""
def stem_url(url):
    """Normalize *url* to its stem (not yet implemented).

    Fix: the original ``def`` had no body at all — a SyntaxError that
    made the whole module unimportable.
    """
    pass
def main():
    """Load pickled parishes and verify or discover each parish's URL.

    Fixes: removed the dead ``parishes = []`` assignment (immediately
    overwritten by ``dill.load``) and the leftover ``ipdb.set_trace()``
    debug breakpoint, which halted every non-interactive run.
    """
    with open('./parishes.dill', 'rb') as f:
        parishes = dill.load(f)
    for parish in parishes:
        if parish.url:
            check(parish)
        else:
            find_url(parish)


if __name__ == "__main__":
    main()

30
proxy.py Normal file
View File

@ -0,0 +1,30 @@
from selenium import webdriver
import re
class Proxy():
    """Scrapes an elite-proxy list from gatherproxy.com with Chrome webdriver.

    Collected proxies are (ip_and_port, port_string) tuples appended to
    ``self.proxies``.
    """

    def __init__(self, proxies=None):
        # Avoid a mutable default argument: build a fresh list per instance.
        self.proxies = [] if proxies is None else proxies

    def download(self):
        """Open the listing page and click through every pager page.

        The pager links are rendered as ``gp.pageClick(n)``; the last one
        seen gives the total page count.  Fix: the original used the loop
        variable after a ``pass``-body loop, so it crashed with NameError
        whenever no pager link matched — now ``pages_nr`` defaults to 0
        and the paging loop simply does nothing in that case.
        """
        driver = webdriver.Chrome()
        driver.maximize_window()
        driver.get('http://www.gatherproxy.com/proxylist/anonymity/?t=elite')
        full_list_button = driver.find_element_by_xpath(
            '//input[@type="submit" and @value="Show Full List"]')
        full_list_button.click()
        print(driver.page_source)
        pages_nr = 0
        for match in re.finditer(
                '<a href="#(.*?)" class="inactive" onclick="gp.pageClick',
                driver.page_source):
            pages_nr = int(match.group(1))
        for i in range(2, pages_nr + 1):
            driver.execute_script('gp.pageClick(' + str(i) + ')')
            print(i)

    def _get_proxies(self, html):
        """Parse *html* and append (ip_and_port, port) tuples to self.proxies.

        The site obfuscates the port as a hex string passed to gp.dep();
        it is decoded back to a decimal string here.
        """
        for match in re.findall("<td><script>document.write\('(.*?)'[\w\W]*?<td><script>document.write\(gp.dep\('(.*?)'", html):
            proxy = (match[0], str(int(match[1], 16)))
            self.proxies.append(proxy)
if __name__ == '__main__':
    # Download a fresh proxy list when executed as a script.
    downloader = Proxy()
    downloader.download()

3
q Normal file
View File

@ -0,0 +1,3 @@
requests
dill
dryscrape

View File

@ -1,2 +1,3 @@
requests requests
dill dill
dryscrape

View File

@ -5,4 +5,8 @@
# pip-compile --output-file requirements.txt requirements.in # pip-compile --output-file requirements.txt requirements.in
# #
dill==0.2.6 dill==0.2.6
dryscrape==1.0
lxml==3.8.0 # via dryscrape
requests==2.13.0 requests==2.13.0
webkit-server==1.0 # via dryscrape
xvfbwrapper==0.2.9 # via dryscrape