added proxy downloader
This commit is contained in:
parent
35d3b11ec6
commit
c205e1b627
38
#duck-duck-go.py#
Normal file
38
#duck-duck-go.py#
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
import requests
|
||||||
|
from string import Template
|
||||||
|
from random import choice
|
||||||
|
|
||||||
|
|
||||||
|
class DuckDuckGo(object):
    """DuckDuckGo HTML search client with optional proxy rotation.

    Proxies are (ip_port, protocol) tuples, e.g. ('1.2.3.4:8080', 'http').
    """

    def __init__(self, proxies=None, language=''):
        # Avoid a mutable default argument; copy-by-reference of the caller's
        # list is intentional (shared pool).
        self.proxies = [] if proxies is None else proxies
        self.language = language
        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=$lang')

    def _get(self, query, language):
        """Fetch the results page for *query*, via a random proxy if any.

        Fixes the original version, which had an empty ``else:`` branch
        (SyntaxError) and never returned the proxied response.
        """
        link = self.query.substitute(query=query, lang=language)
        if self.proxies:
            ip_port, protocol = choice(self.proxies)
            return requests.get(link, proxies={protocol: ip_port})
        return requests.get(link)

    def body(self, query, language):
        # TODO: extract the result-page body; not implemented yet.
        pass

    def links(self, query, language):
        # TODO: extract result links; not implemented yet.
        pass
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point; intentionally does nothing yet."""
    pass
|
||||||
|
|
||||||
|
|
||||||
|
# Run the (currently empty) entry point when executed as a script.
if __name__ == '__main__':
    main()
|
38
duckduckgo.py
Normal file
38
duckduckgo.py
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
import requests
|
||||||
|
from string import Template
|
||||||
|
from random import choice
|
||||||
|
|
||||||
|
|
||||||
|
class DuckDuckGo(object):
    """DuckDuckGo HTML search client with optional proxy rotation.

    Proxies are (ip_and_port, protocol) tuples, e.g. ('1.2.3.4:8080', 'http').
    """

    def __init__(self, proxies=None, language=''):
        # Avoid a mutable default argument for the proxy pool.
        self.proxies = [] if proxies is None else proxies
        self.language = language
        self.query = Template('https://duckduckgo.com/html/?q=$query&kl=$lang')

    def _get(self, query, language):
        """Return the HTTP response for *query*, via a random proxy if any.

        Bug fix: the original issued the proxied request, discarded its
        response, and then fell through to a second, un-proxied request.
        Now the proxied response is returned directly.
        """
        link = self.query.substitute(query=query, lang=language)
        if self.proxies:
            proxy = choice(self.proxies)
            ip_and_port = proxy[0]
            protocol = proxy[1]
            return requests.get(link, proxies={protocol: ip_and_port})
        return requests.get(link)

    def body(self, query, language):
        # TODO: extract the result-page body; not implemented yet.
        pass

    def links(self, query, language):
        # TODO: extract result links; not implemented yet.
        pass
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point; intentionally does nothing yet."""
    pass
|
||||||
|
|
||||||
|
|
||||||
|
# Run the (currently empty) entry point when executed as a script.
if __name__ == '__main__':
    main()
|
33
full_scrapper.py
Normal file
33
full_scrapper.py
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
import dill
|
||||||
|
from google import search
|
||||||
|
|
||||||
|
|
||||||
|
def check(parish, query=None):
    """Return True if ``parish.url`` appears in Google results for *query*.

    Bug fixes: the original referenced an undefined global ``query``
    (NameError at call time) — it is now an explicit, backward-compatible
    parameter; lowercase ``true`` (NameError) is now ``True``; and the
    no-match path now returns False explicitly instead of None.

    TODO(review): decide how *query* should be derived from *parish*
    (e.g. its name) — callers currently pass only the parish.
    """
    if parish.url in search(query, lang='pl', stop=10, pause=3.0):
        return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def find_url(parish):
    """Locate a URL for *parish*; not implemented yet."""
    pass
|
||||||
|
|
||||||
|
|
||||||
|
def stem_url(url):
    """Reduce *url* to its stem.

    Bug fix: the original definition had no body at all, which is a
    SyntaxError in Python. TODO: implement; currently returns None.
    """
    pass
||||||
|
|
||||||
|
def main():
    """Load pickled parishes and verify or discover their URLs.

    Fixes: removed the leftover ``import ipdb; ipdb.set_trace()`` debugging
    breakpoint (third-party debugger, blocks execution), and the redundant
    ``parishes = []`` pre-assignment immediately overwritten by the load.
    """
    # NOTE(review): dill.load executes arbitrary code from the file —
    # only load parishes.dill from a trusted source.
    with open('./parishes.dill', 'rb') as f:
        parishes = dill.load(f)

    for parish in parishes:
        if parish.url:
            check(parish)
        else:
            find_url(parish)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
30
proxy.py
Normal file
30
proxy.py
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
from selenium import webdriver
|
||||||
|
import re
|
||||||
|
|
||||||
|
class Proxy():
    """Scrape elite proxies from gatherproxy.com with a Selenium Chrome driver.

    Collected proxies are (ip, port) string tuples appended to self.proxies.
    """

    def __init__(self, proxies=None):
        """Start with an optional pre-existing proxy list (no mutable default)."""
        self.proxies = [] if proxies is None else proxies

    def download(self):
        """Open the proxy listing, reveal the full list, and walk its pages.

        Fixes: the original iterated ``re.finditer`` with ``pass`` and then
        used ``match`` after the loop — a NameError whenever no pager link
        matched; the Chrome driver was also never quit (process leak).
        """
        driver = webdriver.Chrome()
        try:
            driver.maximize_window()
            driver.get('http://www.gatherproxy.com/proxylist/anonymity/?t=elite')
            full_list_button = driver.find_element_by_xpath(
                '//input[@type="submit" and @value="Show Full List"]')
            full_list_button.click()
            print(driver.page_source)
            # The pager links are numbered; the last one is the page count.
            page_numbers = re.findall(
                r'<a href="#(.*?)" class="inactive" onclick="gp.pageClick',
                driver.page_source)
            if not page_numbers:
                return  # no pager found; nothing more to walk
            pages_nr = int(page_numbers[-1])
            for i in range(2, pages_nr + 1):
                driver.execute_script('gp.pageClick(' + str(i) + ')')
                print(i)
            # TODO(review): pages are visited but _get_proxies is never
            # called, so nothing is collected yet — presumably each page's
            # source should be fed to self._get_proxies; confirm intent.
        finally:
            driver.quit()  # always release the browser process

    def _get_proxies(self, html):
        """Parse *html* and append (ip, port) tuples to self.proxies.

        The site obfuscates the port as a hex literal inside gp.dep(...);
        it is decoded to a decimal string here.
        """
        pattern = (r"<td><script>document.write\('(.*?)'"
                   r"[\w\W]*?<td><script>document.write\(gp.dep\('(.*?)'")
        for ip, hex_port in re.findall(pattern, html):
            self.proxies.append((ip, str(int(hex_port, 16))))
|
||||||
|
|
||||||
|
# Ad-hoc smoke test: scrape the full proxy list when run directly.
if __name__ == '__main__':
    p = Proxy()
    p.download()
|
@ -1,2 +1,3 @@
|
|||||||
requests
|
requests
|
||||||
dill
|
dill
|
||||||
|
dryscrape
|
||||||
|
@ -5,4 +5,8 @@
|
|||||||
# pip-compile --output-file requirements.txt requirements.in
|
# pip-compile --output-file requirements.txt requirements.in
|
||||||
#
|
#
|
||||||
dill==0.2.6
|
dill==0.2.6
|
||||||
|
dryscrape==1.0
|
||||||
|
lxml==3.8.0 # via dryscrape
|
||||||
requests==2.13.0
|
requests==2.13.0
|
||||||
|
webkit-server==1.0 # via dryscrape
|
||||||
|
xvfbwrapper==0.2.9 # via dryscrape
|
||||||
|
Loading…
Reference in New Issue
Block a user