gooload version 10e-6 - a download program
Posted: Saturday, 1 March 2008, 01:10
It's been quite a while since I wrote this.
The idea itself has potential; it's just that an implementation relying 100% on threads wasn't a good one...
I'll try again with Twisted.
And a warning:
this program sends automated search queries to Google.
Google has stated somewhere that you're not supposed to do that.
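Since the all-threads design is the part I want to replace, here is a very rough, untested sketch of how the parallel fetching could look with Twisted instead of a thread pool. twisted.web.client.getPage and DeferredSemaphore are real Twisted APIs of that era; the names fetch_all/got_page/failed/done and the concurrency limit of 40 are just placeholders of mine, not part of gooload.

Code: Select all

# Untested sketch: fetch many URLs in parallel with Twisted instead of threads.
# fetch_all/got_page/failed/done are my own placeholder names, not gooload code.
from twisted.internet import reactor, defer
from twisted.web.client import getPage

def fetch_all(urls, concurrency=40):
    'returns a Deferred that fires with a list of (url, content) pairs'
    results = []
    sem = defer.DeferredSemaphore(concurrency)  # at most `concurrency` requests in flight

    def got_page(content, url):
        results.append((url, content))

    def failed(err, url):
        return None  # dead host -> just skip it, like HTTP_get() returning None

    deferreds = []
    for url in urls:
        d = sem.run(getPage, url, timeout=5)
        d.addCallbacks(got_page, failed, callbackArgs=(url,), errbackArgs=(url,))
        deferreds.append(d)

    # fires once every request has either succeeded or been skipped
    return defer.gatherResults(deferreds).addCallback(lambda _: results)

def done(results):
    for url, content in results:
        print url, len(content)
    reactor.stop()

if __name__ == '__main__':
    fetch_all(['http://www.python.org/', 'http://www.google.com/']).addCallback(done)
    reactor.run()

The DeferredSemaphore would play the role of the 40 worker threads: at most 40 requests are in flight at once, but everything runs in a single thread inside the reactor.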
Code: Select all
$ cowsay `uname -o`
 ___________
< GNU/Linux >
 -----------
        \   ^__^
         \  (oo)\_______
            (__)\       )\/\
                ||----w |
                ||     ||
Code: Select all
import urllib2
import threading
import re
import cherrypy as ch
import webbrowser
from time import sleep
from urllib import urlencode, unquote_plus, quote
from Tkinter import *
from sys import argv

if len(argv) > 1:
    PORT = int(argv[1])
else:
    PORT = 3000

import socket
socket.setdefaulttimeout(5)
def HTTP_get(url):
    'fetch a URL and return the response body, or None if anything goes wrong'
    #print "\trequesting:", url
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.001 (windows; U; NT4.0; en-us) Gecko/25250101')
    try:
        conn = urllib2.urlopen(req)
    except: # urllib2.HTTPError, urllib2.URLError:
        return None
    try:
        resp = conn.read()
    except:
        return None
    return resp
# BLABLA is raised below but never defined anywhere in the listing;
# define it so the error paths actually work.
class BLABLA(Exception):
    pass

class Google(object):
    'iterates over the result URLs of a Google web search'
    def __init__(self, keyword):
        self.kw = keyword
        self.urls = []
        self.regex = re.compile('h2 class=r><a href="(\w*://[^"]*)" class=l>')
        self.last_urls = [None]
        self.page = 0

    def __iter__(self):
        return self

    def next(self):
        if not self.urls:
            self.get_more_urls()
        return self.urls.pop()

    def get_more_urls(self):
        resp = HTTP_get("http://www.google.com/search?" + urlencode(dict(start=self.page*100, num=100, q=self.kw)))
        if resp is None:
            raise BLABLA("Google returned no response")
        self.page += 1
        urls = list(set(self.parse_urls(resp)))  # remove duplicate hosts
        if not urls or urls[0] == self.last_urls[0]:
            raise StopIteration
        self.urls.extend(urls)
        self.last_urls = urls

    def parse_urls(self, resp):
        return self.regex.findall(resp)
class UrlIter(object):
    'fetches the given URLs with a pool of worker threads and iterates over the results'
    def __init__(self, urls, threads2create=40, wait_time=0.5):
        self.wait_time = wait_time
        self.threads2create = threads2create
        self.urls = urls
        self.results = []
        self.running = 0
        [threading.Thread(target=self._threaded).start() for i in xrange(self.threads2create)]

    def _threaded(self):
        self.running += 1
        while self.urls:
            url = self.urls.pop()
            content = HTTP_get(url)
            if content:
                self.results.append((url, content))
        self.running -= 1

    def __iter__(self):
        return self

    def next(self):
        print "current num of threads running:", self.running
        print 'urls to fetch', len(self.urls)
        if not self.results:
            if not self.urls and self.running <= self.threads2create/5:  # waiting for the last 20 percent is not worth it!
                raise StopIteration
            else:
                print "sleeping"
                sleep(self.wait_time)
                return self.next()
        else:
            return self.results.pop()

    def __del__(self):
        self.urls = []
    stop = __del__
class FileIter(object):
    'uses a Google "index of" query to find open directory listings and iterates over the matching files'
    def __init__(self, **criteria):
        dc = dict(filetypes_accept=('mp3', 'ogg', 'aac', 'avi', 'wmv'), filetype_search='mp3')
        dc.update(criteria)
        criteria = dc
        self.criteria = criteria
        self.regex = re.compile('<a href="([^"]+\.(' + '|'.join(criteria['filetypes_accept']) + '))"')
        print '<a href="([^"]+\.(' + '|'.join(criteria['filetypes_accept']) + '))"'
        # classic "index of" query: directory listings that mention the keyword and the wanted filetype
        google_query = '"' + criteria['keyword'] + '" intitle:index.of intext:server.at intext:' + criteria['filetype_search']
        print google_query
        urls = list(Google(google_query))
        print urls
        if not urls:
            raise BLABLA('Google found no results for that')
        self.contents = UrlIter(urls).__iter__()  # well, UrlIter.__iter__() just returns self...

    def __iter__(self):
        return self

    def next(self):
        'returns a url and all files in this url which match criteria'
        url, content = self.contents.next()  # raises StopIteration
        #return url.split('?')[0], [unquote_plus(file[0]) for file in self.regex.findall(content) if criteria['keyword'] in unquote_plus(file[0])]
        files = []
        for file in self.regex.findall(content):
            file = file[0]
            if self.criteria['keyword'] in file:
                files.append(unquote_plus(file))
        if not files:
            return self.next()
        else:
            if '?' in url:
                url = url.split('?', 1)[0]
            return url, files

    def stop(self):
        self.contents.stop()
#=========================

class WebApp(object):
    def __init__(self):
        self.files = ''

    def append(self, url, files):
        print url, files
        self.files += '<br />results for: <a href="%s">%s</a> <br /> \n' % (url, unquote_plus(url))
        for file in files:
            self.files += '<nobr> <a href="%s">%s</a> <br /> \n</nobr>' % (url+file, file)

    @ch.expose
    def index(self, stop=False):
        if stop:
            self.files_iter.stop()
            self.files = ''
        return '''
            <html>
            <head>
                <title>bla</title>
            </head>
            <body>
                <form method="GET" action="/search">
                    search for: <input name="keyword"> <br /> \n
                    <input type="submit" value="go!">
                </form>
            </body>
            </html>
        '''

    @ch.expose
    def search(self, keyword):
        self.files_iter = FileIter(keyword=keyword)
        threading.Thread(target=lambda: [self.append(url, files) for url, files in self.files_iter]).start()
        return self.show()

    @ch.expose
    def show(self):
        return '''
            <html>
            <head>
                <title>bla</title>
                <META HTTP-EQUIV="refresh" CONTENT="2; URL=/show">
            </head>
            <body>
                <center><a href="/?stop=yes">new search</a></center>
                %s
            </body>
            </html>
        ''' % self.files

threading.Thread(target=lambda: sleep(1.5) or webbrowser.open_new("http://localhost:" + str(PORT) + "/")).start()  # open the browser AFTER the CherryPy start
ch.config.update({'server.socket_port': PORT})
ch.quickstart(root=WebApp())