mass-scraper/scraper/proxy.py

from selenium import webdriver
from selenium import common
import re
import random


# TODO: export path with geckodriver or chromedriver automatically and put driver in project files
# TODO: automatically download geckodriver or chromedriver
class Proxy():
    """Scrape proxy addresses from gatherproxy.com and hand them out at random.

    Scraped entries accumulate in ``self.proxies`` as ``(host, port)`` tuples
    of strings; the port is stored in decimal.
    """

    # Pagination links of the pages that are NOT currently displayed; the
    # captured group is the page number taken from the link's href.
    _PAGE_LINK_RE = re.compile(
        r'<a href="#(.*?)" class="inactive" onclick="gp.pageClick')

    # One table row: the first group is the text written into the address
    # cell, the second is the hexadecimal port string passed to gp.dep().
    _PROXY_ROW_RE = re.compile(
        r"<td><script>document.write\('(.*?)'[\w\W]*?"
        r"<td><script>document.write\(gp.dep\('(.*?)'")

    def __init__(self, proxies=None):
        """Create a collector, optionally seeded with known proxies.

        Args:
            proxies: initial list of ``(host, port)`` tuples. ``None`` or an
                empty list starts from a fresh empty list.
        """
        self.proxies = proxies or []

    def download(self, limit=0):
        """Scrape the 'Transparent' and 'elite' proxy lists.

        Args:
            limit: number of result pages to scrape per list. ``0`` means
                every page announced by the site's pagination links.
        """
        print('Transparent proxies')
        self._download('Transparent', limit)
        print('Elite       proxies')
        self._download('elite', limit)

    def _download(self, type, limit=0):
        """Scrape one anonymity category into ``self.proxies``.

        Args:
            type: value for the site's ``t`` query parameter. The name
                shadows the builtin but is kept for keyword-call
                compatibility.
            limit: number of pages to scrape; ``0`` means all pages found.

        Raises:
            selenium.common.exceptions.WebDriverException: if the browser
                fails and a single retry of the page switch fails as well.
        """
        driver = webdriver.PhantomJS('./phantomjs')
        try:
            driver.maximize_window()
            driver.get(
                'http://www.gatherproxy.com/proxylist/anonymity/?t=' + type)
            full_list_button = driver.find_element_by_xpath(
                '//input[@type="submit" and @value="Show Full List"]')
            full_list_button.click()

            if limit == 0:
                # The last pagination link carries the highest page number.
                # With no links at all there is only the page already shown.
                page_numbers = self._PAGE_LINK_RE.findall(driver.page_source)
                pages_nr = int(page_numbers[-1]) if page_numbers else 1
            else:
                pages_nr = limit

            for i in range(1, pages_nr + 1):
                # Page 1 is already displayed after "Show Full List", so
                # parse first and only then move on to the NEXT page. This
                # avoids scraping page 1 twice and skipping the last page.
                self._get_proxies(driver.page_source)
                if i < pages_nr:
                    self._go_to_page(driver, i + 1)
                print(i)
        finally:
            # Always shut the browser down so no PhantomJS process leaks.
            driver.quit()

    @staticmethod
    def _go_to_page(driver, page):
        """Switch the result table to ``page``, retrying once on failure."""
        script = 'gp.pageClick(' + str(page) + ')'
        try:
            driver.execute_script(script)
        except common.exceptions.WebDriverException:
            # One plain retry; a second failure propagates to the caller.
            driver.execute_script(script)

    def random(self):
        """Return one randomly chosen ``(host, port)`` tuple.

        Raises:
            IndexError: if no proxies have been collected yet.
        """
        return random.choice(self.proxies)

    def _get_proxies(self, html):
        """Extract every proxy row from ``html`` and append it to the list.

        Rows whose port is not a valid hexadecimal number are skipped.

        Args:
            html: page source of a proxy-list page.
        """
        for host, port_hex in self._PROXY_ROW_RE.findall(html):
            try:
                port = int(port_hex, 16)
            except ValueError:
                # Malformed row; ignore it rather than abort the scrape.
                continue
            self.proxies.append((host, str(port)))


if __name__ == '__main__':
    # Manual run: scrape both proxy lists (download() is called with its
    # default limit of 0), then draw one collected entry at random.
    p = Proxy()
    p.download()
    # random() delegates to random.choice, which raises IndexError when
    # nothing was collected.
    proxy = p.random()
added proxy downloader 2017-06-10 02:09:22 +02:00			`from selenium import webdriver`
code refactorings and improvements 2017-06-18 21:33:44 +02:00			`from selenium import common`
added proxy downloader 2017-06-10 02:09:22 +02:00			`import re`
done proxy.py 2017-06-11 00:00:22 +02:00			`import random`
added proxy downloader 2017-06-10 02:09:22 +02:00
done proxy.py 2017-06-11 00:00:22 +02:00
			`# TODO: export path with geckodriver or chromedriver automatically and put driver in project files`
			`# TODO: automatically download geckodriver or chromedriver`
added proxy downloader 2017-06-10 02:09:22 +02:00			`class Proxy():`
done proxy.py 2017-06-11 00:00:22 +02:00			`def __init__(self, proxies=None):`
added proxy downloader 2017-06-10 02:09:22 +02:00			`"docstring"`
code refactorings and improvements 2017-06-18 21:33:44 +02:00			`#self.proxies = [] if proxies is None else proxies`
			`self.proxies = proxies or []`
added proxy downloader 2017-06-10 02:09:22 +02:00
code refactorings and improvements 2017-06-18 21:33:44 +02:00			`def download(self, limit=0):`
			`print('Transparent proxies')`
			`self._download('Transparent', limit)`
			`print('Elite proxies')`
			`self._download('elite', limit)`

			`def _download(self, type, limit=0):`
			`driver = webdriver.PhantomJS('./phantomjs')`
added proxy downloader 2017-06-10 02:09:22 +02:00			`driver.maximize_window()`
code refactorings and improvements 2017-06-18 21:33:44 +02:00			`driver.get('http://www.gatherproxy.com/proxylist/anonymity/?t=' + type)`
done proxy.py 2017-06-11 00:00:22 +02:00			`full_list_button = driver.find_element_by_xpath(`
			`'//input[@type="submit" and @value="Show Full List"]')`
added proxy downloader 2017-06-10 02:09:22 +02:00			`full_list_button.click()`
proof of concept alpha 2017-06-12 22:08:29 +02:00			`#print(driver.page_source)`
done proxy.py 2017-06-11 00:00:22 +02:00			`for match in re.finditer(`
			`'<a href="#(.*?)" class="inactive" onclick="gp.pageClick',`
			`driver.page_source):`
added proxy downloader 2017-06-10 02:09:22 +02:00			`pass`
code refactorings and improvements 2017-06-18 21:33:44 +02:00			`if limit == 0:`
			`pages_nr = int(match.group(1))`
			`else:`
			`pages_nr = limit`
done proxy.py 2017-06-11 00:00:22 +02:00			`for i in range(1, pages_nr + 1):`
			`self._get_proxies(driver.page_source)`
code refactorings and improvements 2017-06-18 21:33:44 +02:00			`try:`
			`driver.execute_script('gp.pageClick(' + str(i) + ')')`
			`except common.exceptions.WebDriverException:`
			`import ipdb`
			`ipdb.set_trace()`
			`driver.execute_script('gp.pageClick(' + str(i) + ')')`
added proxy downloader 2017-06-10 02:09:22 +02:00			`print(i)`
done proxy.py 2017-06-11 00:00:22 +02:00
			`def random(self):`
			`return random.choice(self.proxies)`

added proxy downloader 2017-06-10 02:09:22 +02:00			`def _get_proxies(self, html):`
done proxy.py 2017-06-11 00:00:22 +02:00			`for match in re.findall(`
			`"<td><script>document.write\('(.*?)'[\w\W]*?<td><script>document.write\(gp.dep\('(.*?)'",`
			`html):`
			`proxy = (match[0], str(int(match[1], 16)))`
added proxy downloader 2017-06-10 02:09:22 +02:00			`self.proxies.append(proxy)`

done proxy.py 2017-06-11 00:00:22 +02:00
added proxy downloader 2017-06-10 02:09:22 +02:00			`if __name__ == '__main__':`
			`p = Proxy()`
			`p.download()`
code refactorings and improvements 2017-06-18 21:33:44 +02:00			`proxy = p.random()`