Public Proxy Grabber

flambda
User
Posts: 1
Registered: Tuesday 4 February 2014, 18:45

Hi,

since I keep needing proxies inside my Python programs, I finally sat down and wrote a grabber framework.
I would like to publish it on PyPI as a package later on, but first I wanted to hear your opinions here. A minimal usage sketch follows the list of supported sites below.

Currently supported sites:
HideMyAss.com
Samair.ru
NNtime.com
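
Minimal usage sketch (the package name "proxygrabber" is only a placeholder here, nothing is fixed yet):

Code: Select all

# minimal usage sketch; the package name "proxygrabber" is only an assumption
from proxygrabber.hidemyass import HideMyAssGrabber

grabber = HideMyAssGrabber()
for proxy in grabber:       # the grabber is iterable and fetches result pages lazily
	print(proxy.reference)  # e.g. "http://1.2.3.4:8080"
	break                   # stop after the first proxy for this demo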

ToDo:
*Docstrings
*CookieCon as a standalone package
*Unify the setup of the CookieCon connection

Possible problems:
jsbeautifier still has a bug that has already been fixed in the master branch. No idea whether a plain pip install already gives you that fix or not.
If not, the commit was here:
https://github.com/einars/js-beautify/commit/12d67f

I also used another script of mine here (CookieCon), which I have bundled in for now since it is not yet a standalone package on PyPI.

__init__.py:

Code: Select all

__author__ = "Frederik Lauber"
__copyright__ = "Copyright 2014"
__license__ = "GPL3"
__version__ = "0.1"
__maintainer__ = "Frederik Lauber"
__status__ = "Development"
__contact__ = "https://flambda.de/impressum.html"
comon.py:

Code: Select all

"""Some shared functions between all modules"""
__author__ = "Frederik Lauber"
__copyright__ = "Copyright 2014"
__license__ = "GPL3"
__version__ = ""
__maintainer__ = "Frederik Lauber"
__status__ = "Development"
__contact__ = "https://flambda.de/impressum.html"
from . import proxy3


class NoProxiesOnPage(Exception):
	pass


class ProxyGrabber(object):
	row_names = []

	def __init__(self):
		self.cached_proxies = set()
		self.already_returned_proxies = set()
		self.page = 1

		def do_nothing(*args, **kwargs):
			pass
		row_functions = dict()
		for (num, row_name) in enumerate(self.row_names):
			tmp_name = "_parse_" + row_name
			if tmp_name in dir(self):
				row_functions[num] = getattr(self, tmp_name)
			else:
				row_functions[num] = do_nothing
		self.row_functions = row_functions

	def _tr_parser(self, tr):
		info_dict = dict()
		for row, td in enumerate(tr.find_all('td')):
			self.row_functions[row](td, info_dict)
		return info_dict

	def _get_set_from_page(self, page_number):
		proxy_set = set()
		page = self.con.request(self.PROXY_PAGE_TEMPLATE.substitute(page=self._prepare_page_number(page_number)))
		self._per_page_parsing(page)
		for tr in self._extract_trs(page):
			try:
				info_dict = self._tr_parser(tr)
				proxy_set.add(proxy3.Proxy.info_dict(info_dict))
			except IndexError:
				#this happens in samair grabber due to adds in the tds
				continue
			#except Exception:
				#not sure which other errors might happen
			#	continue
		if proxy_set:
			return proxy_set
		else:
			raise NoProxiesOnPage

	@staticmethod
	def _prepare_page_number(page_number):
		return page_number

	def _per_page_parsing(self, page):
		pass

	def _extract_trs(self, page):
		return []

	def __iter__(self):
		while True:
			while not self.cached_proxies:
				try:
					self.cached_proxies.update(self._get_set_from_page(self.page) - self.already_returned_proxies)
				except NoProxiesOnPage:
					# end the generator; raising StopIteration inside a
					# generator is an error since PEP 479
					return
				self.page += 1
			tmp = self.cached_proxies.pop()
			self.already_returned_proxies.add(tmp)
			yield tmp

	def reset_page(self):
		self.page = 1

def generate_decode_dict(decoding_string):
	decode_dict = dict()
	for tag in decoding_string.replace("\n", "").replace(" ", "").strip("';").split(";"):
		l = tag.split("=")
		key = l[0]
		value = l[1]
		decode_dict[key] = value
	return decode_dict


def decode_port(obfuscated_port, decode_dict):
	return "".join([decode_dict[key] for key in obfuscated_port.split("+")])
CookieCon.py:

Code: Select all

"""Python3 urllib wrapper with Cookie support
Supports request and urlretrive on
Connection which need Cookies
for auth or other"""
__author__ = "Frederik Lauber"
__copyright__ = "Copyright 2014"
__license__ = "GPL3"
__version__ = "0.5"
__maintainer__ = "Frederik Lauber"
__status__ = "Development"
__contact__ = "https://flambda.de/impressum.html"
import os
import re
from shutil import copyfileobj
from urllib.parse import urlencode
from urllib.request import build_opener, HTTPCookieProcessor, Request
from http.cookiejar import CookieJar


class CookieCon(object):
	"""Object which holds all cookies etc."""
	def __init__(self, encoding='utf-8', userAgent=None):
		self._encoding = encoding
		self._cookiejar = CookieJar()
		self._opener = build_opener(HTTPCookieProcessor(self._cookiejar))

		if userAgent is not None:
			self._opener.addheaders = [('User-agent', userAgent)]

	def _encode_dict(self, header_dict):
		"""Function used to encode dicts based
		on the given encoding"""
		encoded_headers = dict()
		for header, value in header_dict.items():
			encoded_header = header.encode(self._encoding)
			encoded_value = value.encode(self._encoding)
			encoded_headers[encoded_header] = encoded_value
		return encoded_headers

	def _encode_url(self, params=None):
		"""Function used to encode urls based
		on the given encoding
		If params is None or has length 0,
		None will be returned.
		This was implemented as self._opener
		also takes None as an argument if 
		no params are needed
		"""
		if params is None or not len(params):
			return None
		else:
			return urlencode(self._encode_dict(params)).encode(self._encoding)

	def request(self, url, params=None):
		"""Usage:
		from CookieCon import CookieCon
		con = CookieCon()
		con.request("http://google.de")
			get-request, returns response
		con.request("http://google.de", {'foo': 'bar'})
			post-request, returns response
		"""
		with self._opener.open(url, self._encode_url(params)) as sock:
			return sock.read().decode(self._encoding)

	def urlgetfileinfo(self, url):
		"""Returns a tuple with the filesize and filename
		as defined in the Content-Length and Content-Disposition headers.
		If the header does not exist, None will be returned"""
		with self._opener.open(Request(url, method="HEAD")) as sock:
			try:
				header = sock.info()['Content-Disposition']
				filename = re.search('(?<=").*(?=")', header).group(0)
			except Exception:
				filename = None
			try:
				filesize = int(sock.headers['Content-Length'])
			except KeyError:
				filesize = None
		return (filename, filesize)

	class NoFileName(Exception):
		pass

	def urlretrieve(self, url, folder, optname=None):
		"""Usage:
		from CookieCon import CookieCon
		con = CookieCon()
		con.urlretrive("http://foo.bar/1.zip", "~/home/")
			Downloads 1.zip to ~/home/1.zip, resumes if file already exists.
			The filename is discovered by the Content-Disposition header
		con.urlretrive("http://foo.bar/1.zip", "~/home/", "2.zip")
			Downloads 1.zip to ~/home/2.zip, resumes if file already exists.
			The given filename is used.
		If no filename is given nor discovered, a NoFileName exception is raised.
		This will not check that the given foldername and filename are valid but 
		raise an exception.
		"""
		(urlname, urlfilesize) = self.urlgetfileinfo(url)
		filename = optname if optname is not None else urlname
		if filename is None:
			raise self.NoFileName
		filesize = float("inf") if urlfilesize is None else urlfilesize
		filepath = os.path.join(folder, filename)
		try:
			currentsize = os.path.getsize(filepath)
		except os.error:
			currentsize = 0
		if currentsize < filesize:
			headers = {"Range": "bytes=%s-" % currentsize}
			encoded_headers = self._encode_dict(headers)
			with self._opener.open(Request(url, headers=encoded_headers)) as sock:
				with open(filepath, "ab") as file:
					copyfileobj(sock, file)
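
Putting the docstrings together, this is roughly how I use CookieCon (the URLs and form fields are just placeholders):

Code: Select all

# example usage; example.org and the form fields are placeholders
from CookieCon import CookieCon

con = CookieCon(userAgent="Mozilla/5.0")
page = con.request("http://example.org/")                # GET, cookies persist across calls
page = con.request("http://example.org/login",
                   {"user": "foo", "password": "bar"})   # POST with form data
con.urlretrieve("http://example.org/file.zip", "/tmp", "file.zip")  # resumes a partial download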
hidemyass.py:

Code: Select all

__author__ = "Frederik Lauber"
__copyright__ = "Copyright 2014"
__license__ = "GPL3"
__version__ = "0.1"
__maintainer__ = "Frederik Lauber"
__status__ = "Development"
__contact__ = "https://flambda.de/impressum.html"
__side__ = "HideMyAss"
__type__ = "proxy_grabber"

from string import Template
from html.parser import HTMLParser
from bs4 import BeautifulSoup
from . import CookieCon
from . import comon


#ugly but works for now
class _IPParser(HTMLParser):
	"""IP parser to get the IP from the HTML obfuscation done by the site"""
	def __init__(self, style_dict):
		HTMLParser.__init__(self)
		self.style = False
		self.style_dict = style_dict
		self.visible = True
		self.ip = []

	def handle_starttag(self, tag, attributes):
		if tag == "style":
			self.style = True
		else:
			for name, value in attributes:
				if "class" == name:
					try:
						self.visible = self.style_dict[value]
					except KeyError:
						self.visible = True
				elif "style" == name:
					self.visible = True if ("inline" in value) else False

	def handle_endtag(self, tag):
		if tag == "style":
			self.style = False
		self.visible = True

	def handle_data(self, data):
		if self.visible and not self.style:
			self.ip.append(data.rstrip("\n"))


class HideMyAssGrabber(comon.ProxyGrabber):
	row_names = ["last_update", "ip", "port", "country", "speed", "connection_time", "protocol", "anonymity"]
	PROXY_PAGE_TEMPLATE = Template('http://hidemyass.com/proxy-list/${page}')

	def __init__(self):
		comon.ProxyGrabber.__init__(self)
		self.con = CookieCon.CookieCon(userAgent="Mozilla/6.0 (X11; Linux i686; rv:27.0) Gecko/20103401 Firefox/26.0")

	def _parse_ip(self, td, info_dict):
		styles = td.find_all("style")[0].get_text()
		styles = styles.split(".")
		style_dict = dict()
		for style in styles:
			(class_name, separator, value) = style.partition("{display:")
			if "{display:" == separator:
				style_dict[class_name] = True if value == "inline}\n" else False
		# build the parser only once the whole style dict is known
		spans = td.find_all("span")
		parser = _IPParser(style_dict)
		parser.feed(str(spans[0]))
		info_dict["ip"] = "".join(parser.ip)

	def _parse_port(self, td, info_dict):
		info_dict["port"] = td.get_text().strip("\n")

	def _parse_protocol(self, td, info_dict):
		info_dict["protocol"] = td.get_text().rstrip("\n")

	def _extract_trs(self, page):
		#ugly but for now works
		shit, sep1, rest_of_page_as_string = page.partition('<table id="listtable" cellpadding="0" cellspacing="0" rel="50">')
		data_with_header, sep2, shit = rest_of_page_as_string.partition('</table>')
		header, sep3, data = data_with_header.partition("</thead>")
		return BeautifulSoup(data).find_all('tr')
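
To make the _IPParser logic a bit clearer, here is an invented snippet of the kind of obfuscated markup it deals with (class names and IP are made up):

Code: Select all

# invented example: spans are hidden either by a CSS class or by an inline style
style_dict = {"abc": False, "xyz": True}  # "abc" means display:none, "xyz" means display:inline
parser = _IPParser(style_dict)
parser.feed('<span class="abc">99</span><span class="xyz">127</span>'
            '<span style="display:inline">.0</span><span style="display:none">.1</span>.0.1')
print("".join(parser.ip))  # -> "127.0.0.1"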
nntime.py:

Code: Select all

__author__ = "Frederik Lauber"
__copyright__ = "Copyright 2014"
__license__ = "GPL3"
__version__ = "0.1"
__maintainer__ = "Frederik Lauber"
__status__ = "Development"
__contact__ = "https://flambda.de/impressum.html"
__side__ = "Nntime"
__type__ = "proxy_grabber"

from string import Template
from bs4 import BeautifulSoup, SoupStrainer
from . import CookieCon
from . import comon
from . import samair
import re


class NntimeGrabber(samair.SamairGrabber):
	row_names = ["checkbox", "ip_obfuscated_port", "anonymity", "update_time", "country", "organisation"]
	PROXY_PAGE_TEMPLATE = Template('http://nntime.com/proxy-list-${page}.htm')

	def __init__(self):
		samair.SamairGrabber.__init__(self)
		self.con = CookieCon.CookieCon(encoding="iso-8859-1", userAgent="Mozilla/5.0 (X11; Linux i686; rv:26.0) Gecko/20100101 Firefox/26.0")

	def _extract_trs(self, page):
		return BeautifulSoup(page, parse_only=SoupStrainer('tr', {"class": re.compile("odd|even")}))

	def _per_page_parsing(self, page):
		tmp = re.search(r"([a-z]=\w;)+", page).group(0)
		self.decode_dict = comon.generate_decode_dict(tmp)
proxy3.py:

Code: Select all

"""Module containing the main proxy class used as a common base for the proxy grabbers"""
__author__ = "Frederik Lauber"
__copyright__ = "Copyright 2014"
__license__ = "GPL3"
__version__ = "0.1"
__maintainer__ = "Frederik Lauber"
__status__ = "Development"
__contact__ = "https://flambda.de/impressum.html"
import collections
import datetime
import ipaddress


class Port(int):
	"""Subclass of int to distinguish between an int and a port number"""
	def __new__(cls, *args, **kwargs):
		return super(Port, cls).__new__(cls, *args, **kwargs)


class ProxyProtocol(str):
	"""Class holding the protocol of the proxy; right now only unknown, http, https and socks4/5"""
	supported = ["unknown", "http", "https", "socks4/5"]

	def __new__(cls, proxy_type):
		lowered = proxy_type.lower()
		if lowered in cls.supported:
			return super(ProxyProtocol, cls).__new__(cls, lowered)
		else:
			raise ValueError("Unsupported Protocol: ", proxy_type)

	#no idea how to deal with this yet....
	def __eq__(self, other):
		return True if str(self) == "unknown" else str(self) == str(other)

	def __ne__(self, other):
		return False if str(self) == "unknown" else str(self) != str(other)

	def __hash__(self):
		# constant hash: "unknown" compares equal to every protocol,
		# so all protocols must hash the same
		return 0

class ProxyState(object):
	"""Class representing the mutable state of a proxy"""
	def __init__(self, *args, **kargs):
		self._lastly_available = datetime.datetime.now()

	def __hash__(self):
		return hash(self._lastly_available)

	def __eq__(self, other):
		return self._lastly_available == other.lastly_available

	def __ne__(self, other):
		return self._lastly_available != other.lastly_available

	def __lt__(self, other):
		if type(other) is ProxyState:
			return self._lastly_available > other.lastly_available
		else:
			return NotImplemented

	def __str__(self):
		return "Available at " + str(self._lastly_available)

	@property
	def lastly_available(self):
		return self._lastly_available


class Proxy(collections.namedtuple("Proxy", ["ip", "port", "protocol", "state"])):
	def __new__(cls, protocol, ip, port, *args, **kargs):
		return super(Proxy, cls).__new__(cls, ipaddress.ip_address(ip), Port(port), ProxyProtocol(protocol), ProxyState(*args, **kargs))

	def __hash__(self):
		return hash(self[:-2])

	def __eq__(self, other):
		return self.ip == other.ip and self.port == other.port and self.protocol == other.protocol

	def __ne__(self, other):
		return self.ip != other.ip or self.port != other.port or self.protocol != other.protocol

	@property
	def ip_string(self):
		return str(self.ip)

	@property
	def reference(self):
		return str(self.protocol) + "://" + str(self.ip) + ":" + str(self.port)

	def __str__(self):
		return self.reference + "\t\t" + str(self.state)


	@classmethod
	def ip_port(cls, protocol, ip_port,  *args, **kargs):
		"""ip_port = ip:port"""
		splitted_list= ip_port.split(":")
		port = splitted_list[-1]
		ip = ":".join(splitted_list[:-1])
		return cls(protocol, ip, port,  *args, **kargs)

	@classmethod
	def reference_string(cls, protocol_ip_port,  *args, **kargs):
		"""protocol_ip_port = protocol://ip:port"""
		(protocol, ip_port) = protocol_ip_port.split("://")
		splitted_list= ip_port.split(":")
		port = splitted_list[-1]
		ip = ":".join(splitted_list[:-1])
		return cls(protocol, ip, port,  *args, **kargs)

	@classmethod
	def info_dict(cls, info_dict):
		return cls(**info_dict)
proxy3_unittest.py:

Code: Select all

__author__ = "Frederik Lauber"
__copyright__ = "Copyright 2014"
__license__ = "GPL3"
__version__ = "0.1"
__maintainer__ = "Frederik Lauber"
__status__ = "Development"
__contact__ = "https://flambda.de/impressum.html"
import unittest
import ipaddress
from .proxy3 import Proxy, Port, ProxyProtocol, ProxyState

#           #
# UnitTests #
#           #
class ProxyMethods(unittest.TestCase):
	def setUp(self):
		self.protocol = "http"
		self.ip_string = "1.1.1.1"
		self.port = "2"
		self.proxy = Proxy(self.protocol, self.ip_string, self.port)

	def test_ip_string(self):
		self.assertEqual(self.proxy.ip_string, self.ip_string)

	def test_ip(self):
		self.assertEqual(self.proxy.ip, ipaddress.ip_address(self.ip_string))

	def test_port(self):
		self.assertEqual(self.proxy.port, Port(self.port))


class ComparingProxyStates(unittest.TestCase):
	def setUp(self):
		self.first = ProxyState()
		self.last = ProxyState()

	def test_by_creation_time_unequal(self):
		self.assertNotEqual(self.first, self.last)

	def test_by_creation_time_eq(self):
		self.assertEqual(self.first, self.first)


class SortingProxyStates(unittest.TestCase):
	def setUp(self):
		self.first_to_last = [ProxyState() for i in range(1000)]

	def test_sorting(self):
		last_to_first = sorted(self.first_to_last)
		self.assertListEqual(list(reversed(self.first_to_last)), last_to_first)

class ComparingProxies(unittest.TestCase):
	def setUp(self):
		self.a = Proxy("http", "1.1.1.1", "1")
		self.b = Proxy("http", "2.2.2.2", "1")
		self.c = Proxy("http", "1.1.1.1", "2")

	def test_by_ip(self):
		self.assertNotEqual(self.a, self.b)

	def test_by_port(self):
		self.assertNotEqual(self.a, self.c)


class SortProxies(unittest.TestCase):
	def setUp(self):
		self.a = Proxy("http", "1.1.1.1", "1")
		self.b = Proxy("http", "1.1.1.1", "1")
		self.c = Proxy("http", "1.1.1.1", "2")
		self.d = Proxy("http", "2.2.2.2", "1")
		self.proxy_list = [self.d, self.c, self.b, self.a]

	def test_sorting(self):
		n = [self.b, self.a, self.c, self.d]
		#b before a as b is younger
		k = sorted(self.proxy_list)
		self.assertListEqual(n, k)


class ProxiesCheckClassMethods(unittest.TestCase):
	def test_ip_port_ipv4(self):
		protocol = "http"
		ip = "001.100.10.01"
		port = 1
		ip_port = str(ip) + ":" + str(port)
		self.assertEqual(Proxy(protocol, ip, port), Proxy.ip_port(protocol, ip_port))

	def test_ip_port_ipv6(self):
		protocol = "http"
		ip = "fe80::45:1"
		port = 1
		ip_port = str(ip) + ":" + str(port)
		self.assertEqual(Proxy(protocol, ip, port), Proxy.ip_port(protocol, ip_port))

	def test_reference_string_ipv4(self):
		protocol = "http"
		ip = "001.100.10.01"
		port = 1
		protocol_ip_port = str(protocol) + "://" + str(ip) + ":" + str(port)
		self.assertEqual(Proxy(protocol, ip, port), Proxy.reference_string(protocol_ip_port))

	def test_reference_string_ipv6(self):
		protocol = "http"
		ip = "fe80::45:1"
		port = 1
		protocol_ip_port = str(protocol) + "://" + str(ip) + ":" + str(port)
		self.assertEqual(Proxy(protocol, ip, port), Proxy.reference_string(protocol_ip_port))


if __name__ == '__main__':
	unittest.main()
samair.py:

Code: Select all

__author__ = "Frederik Lauber"
__copyright__ = "Copyright 2014"
__license__ = "GPL3"
__version__ = "0.1"
__maintainer__ = "Frederik Lauber"
__status__ = "Development"
__contact__ = "https://flambda.de/impressum.html"
__side__ = "Samair"
__type__ = "proxy_grabber"


from string import Template
from bs4 import BeautifulSoup, SoupStrainer
from . import CookieCon
from . import comon
import jsbeautifier
import re

class SamairGrabber(comon.ProxyGrabber):
	row_names = ["ip_obfuscated_port", "anonymity", "update_time", "country"]
	PROXY_PAGE_TEMPLATE = Template("http://www.samair.ru/proxy/proxy-${page}.htm")

	def __init__(self):
		comon.ProxyGrabber.__init__(self)
		self.con = CookieCon.CookieCon(userAgent="Mozilla/6.0 (X11; Linux i686; rv:26.0) Gecko/201032101 Firefox/27.0")
		self.decode_dict = dict()
		self.default_protocol = "http"

	def _parse_ip_obfuscated_port(self, td, info_dict):
		info_dict["ip"] = td.contents[0]
		trash1, trash2, obfuscated_port = td.find_all("script")[0].string[:-1].partition('":"+')
		info_dict["port"] = comon.decode_port(obfuscated_port, self.decode_dict)
		info_dict["protocol"] = self.default_protocol

	def _extract_trs(self, page):
		proxylist_table = BeautifulSoup(page, parse_only=SoupStrainer('table', id="proxylist"))
		return proxylist_table.find_all("tr")[1:]

	@staticmethod
	def _prepare_page_number(page_number):
		return str(page_number).zfill(2)

	def _per_page_parsing(self, page):
		tmp = re.search("""/js/(\d+).js""", page).group(0)
		script_side = self.con.request("http://www.samair.ru" + tmp)
		res = jsbeautifier.beautify(script_side)
		self.decode_dict = comon.generate_decode_dict(res)
Thanks in advance for any remarks and help!

flambda