2017-06-10 02:09:22 +02:00
|
|
|
from selenium import webdriver
|
2017-06-18 21:33:44 +02:00
|
|
|
from selenium import common
|
2017-06-10 02:09:22 +02:00
|
|
|
import re
|
2017-06-11 00:00:22 +02:00
|
|
|
import random
|
2017-06-10 02:09:22 +02:00
|
|
|
|
2017-06-11 00:00:22 +02:00
|
|
|
|
|
|
|
# TODO: automatically add the geckodriver/chromedriver location to PATH and ship the driver with the project files
|
|
|
|
# TODO: automatically download geckodriver or chromedriver
|
2017-06-10 02:09:22 +02:00
|
|
|
class Proxy():
    """Collect proxies scraped from the gatherproxy.com proxy lists.

    Scraped entries are accumulated in ``self.proxies`` as ``(address, port)``
    tuples of strings, where the port has been converted from the hexadecimal
    value found in the page markup to a decimal string.
    """

    # Pagination links; the captured group is the page number in the href.
    _PAGE_LINK_RE = re.compile(
        '<a href="#(.*?)" class="inactive" onclick="gp.pageClick')

    # One table row: first group is the text written by the first
    # document.write(...) (presumably the IP address), second group is the
    # hexadecimal port passed to gp.dep(...).
    _PROXY_ROW_RE = re.compile(
        r"<td><script>document.write\('(.*?)'[\w\W]*?"
        r"<td><script>document.write\(gp.dep\('(.*?)'")

    def __init__(self, proxies=None):
        """Create a collection, optionally seeded with *proxies*.

        A falsy *proxies* (``None`` or an empty list) results in a fresh
        empty list; a non-empty list is stored as-is (not copied).
        """
        self.proxies = proxies or []

    def download(self, limit=0):
        """Scrape the transparent and then the elite proxy lists.

        *limit* is the number of result pages to scrape per list; ``0``
        means every page the site advertises.
        """
        print('Transparent proxies')
        self._download('Transparent', limit)
        print('Elite proxies')
        self._download('elite', limit)

    def _download(self, type, limit=0):
        """Scrape up to *limit* pages of the list for anonymity *type*.

        The parameter name ``type`` shadows the builtin; it is kept so that
        existing callers keep working. With ``limit == 0`` the page count is
        read from the pagination links of the loaded page.
        """
        driver = webdriver.PhantomJS('./phantomjs')
        try:
            driver.maximize_window()
            driver.get(
                'http://www.gatherproxy.com/proxylist/anonymity/?t=' + type)
            full_list_button = driver.find_element_by_xpath(
                '//input[@type="submit" and @value="Show Full List"]')
            full_list_button.click()

            if limit == 0:
                pages_nr = self._count_pages(driver.page_source)
            else:
                pages_nr = limit

            for i in range(1, pages_nr + 1):
                # Navigate first, then scrape, so that every page from 1 to
                # pages_nr is read exactly once.
                self._click_page(driver, i)
                self._get_proxies(driver.page_source)
                print(i)
        finally:
            # Always shut the browser process down, even on errors.
            driver.quit()

    @classmethod
    def _count_pages(cls, html):
        """Return the page number of the last pagination link in *html*.

        When *html* contains no pagination link a single page is assumed.
        """
        page_ids = cls._PAGE_LINK_RE.findall(html)
        if not page_ids:
            return 1
        return int(page_ids[-1])

    def _click_page(self, driver, page):
        """Switch *driver* to result page *page*, retrying once on failure."""
        script = 'gp.pageClick(' + str(page) + ')'
        try:
            driver.execute_script(script)
        except common.exceptions.WebDriverException:
            # One retry; a second failure propagates to the caller.
            driver.execute_script(script)

    def random(self):
        """Return one randomly chosen entry of ``self.proxies``.

        Raises IndexError (from ``random.choice``) when no proxies are stored.
        """
        return random.choice(self.proxies)

    def _get_proxies(self, html):
        """Append every proxy found in *html* to ``self.proxies``."""
        for address, hex_port in self._PROXY_ROW_RE.findall(html):
            # The page stores the port as a hexadecimal string.
            self.proxies.append((address, str(int(hex_port, 16))))
|
|
|
|
|
2017-06-11 00:00:22 +02:00
|
|
|
|
2017-06-10 02:09:22 +02:00
|
|
|
if __name__ == '__main__':
    # Script entry point: scrape every available page of both proxy lists,
    # then pick one of the collected proxies at random.
    scraper = Proxy()
    scraper.download()
    proxy = scraper.random()
|