die Idee an sich hat Potenzial, nur dass eine Implementierung mit 100% Threads keine gute Idee war...
ich versuchs nochmal mit twisted
und achtung:
dieses Programm sendet automatische Suchanfragen an Google.
Irgendwo hat Google geschrieben, dass man das nicht darf.
Code: Alles auswählen
$ cowsay `uname -o`
___________
< GNU/Linux >
-----------
\ ^__^
\ (oo)\_______
(__)\ )\/\
||----w |
|| ||
Code: Alles auswählen
import urllib2
import threading
import re
import cherrypy as ch
import webbrowser
from time import sleep
from urllib import urlencode, unquote_plus, quote
from Tkinter import *
from sys import argv
# Port for the local web UI: first CLI argument, falling back to 3000.
PORT = int(argv[1]) if len(argv) > 1 else 3000

import socket
# Never let any HTTP request hang for more than five seconds.
socket.setdefaulttimeout(5)
def HTTP_get(url):
    """Fetch *url* and return the response body as a string.

    Returns None on any network failure (connection error, HTTP error,
    timeout) instead of raising, so callers can treat failures uniformly.
    """
    req = urllib2.Request(url)
    # Fake a browser user agent so Google does not reject the request.
    req.add_header('User-Agent', 'Mozilla/5.001 (windows; U; NT4.0; en-us) Gecko/25250101')
    try:
        # was a bare `except:` -- that also swallowed KeyboardInterrupt/SystemExit;
        # URLError covers HTTPError as well, socket.error covers timeouts
        conn = urllib2.urlopen(req)
    except (urllib2.URLError, socket.error):
        return None
    try:
        return conn.read()
    except (socket.error, socket.timeout):
        return None
    finally:
        conn.close()  # original leaked the connection object
class Google(object):
    """Iterator over the result URLs of a Google web search for *keyword*.

    Fetches result pages 100 hits at a time via HTTP_get and yields the
    raw result URLs.  Iteration stops when Google returns no new results.
    """

    def __init__(self, keyword):
        self.kw = keyword
        self.urls = []           # buffered result URLs not yet handed out
        # matches the href of one organic result entry in Google's HTML
        self.regex = re.compile(r'h2 class=r><a href="(\w*://[^"]*)" class=l>')
        self.last_urls = [None]  # URL batch of the previously fetched page
        self.page = 0            # next result page to request

    def __iter__(self):
        return self

    def next(self):
        """Return the next result URL, fetching a new page when needed."""
        # refill lazily; get_more_urls raises StopIteration at the end
        if not self.urls:
            self.get_more_urls()
        return self.urls.pop()

    def get_more_urls(self):
        """Fetch the next result page and buffer its URLs.

        Raises RuntimeError when Google does not answer and StopIteration
        when no (new) results are left.
        """
        resp = HTTP_get("http://www.google.com/search?"
                        + urlencode(dict(start=self.page * 100, num=100, q=self.kw)))
        if resp is None:
            # was `raise BLABLA(...)` -- BLABLA is defined nowhere, so the
            # original would have died with a NameError instead
            raise RuntimeError("google gab keine antwort zurueck")
        self.page += 1
        urls = list(set(self.parse_urls(resp)))  # drop duplicate hosts
        # empty page, or the same first hit as last time -> past the end
        if not urls or urls[0] == self.last_urls[0]:
            raise StopIteration
        self.urls.extend(urls)
        self.last_urls = urls

    def parse_urls(self, resp):
        """Return all result URLs found in the HTML page *resp*."""
        return self.regex.findall(resp)
class UrlIter(object):
def __init__(self, urls, threads2create=40, wait_time=0.5):
self.wait_time = wait_time
self.threads2create = threads2create
self.urls = urls
self.results = []
self.running = 0
[threading.Thread(target=self._threaded).start() for i in xrange(self.threads2create)]
def _threaded(self):
self.running += 1
while self.urls:
url = self.urls.pop()
content = HTTP_get(url)
if content:
self.results.append((url, content))
self.running -= 1
def __iter__(self):
return self
def next(self):
print "current num of threads running:", self.running
print 'urls to fetch', len(self.urls)
if not self.results:
if not self.urls and self.running <= self.threads2create/5: # auf die restlichen 20 prozent zu warten lohnt sich nicht!
raise StopIteration
else:
print "sleeping"
sleep(self.wait_time)
return self.next()
else:
return self.results.pop()
def __del__(self):
self.urls = []
stop = __del__
class FileIter(object):
def __init__(self, **criteria):
dc = dict(filetypes_accept=('mp3', 'ogg', 'aac', 'avi', 'wmv'), filetype_search='mp3')
dc.update(criteria)
criteria = dc
self.criteria = criteria
self.regex = re.compile('<a href="([^"]+\.(' + '|'.join(criteria['filetypes_accept']) +'))"')
print '<a href="([^"]+\.(' + '|'.join(criteria['filetypes_accept']) +'))"'
google_query = '"' + criteria['keyword'] + '" intitle:index.of intext:server.at intext:' + criteria['filetype_search']
print google_query
urls = list(Google(google_query))
print urls
if not urls:
raise BLABLA('google hat dafuer keine ergebnisse gefunden')
self.contents = UrlIter(urls).__iter__() # naja, UrlIter.__iter__() gibt self zurueck...
def __iter__(self):
return self
def next(self):
'returns a url and all files in this url which match criteria'
url, content = self.contents.next() # raises StopIteration
#return url.split('?')[0], [unquote_plus(file[0]) for file in self.regex.findall(content) if criteria['keyword'] in unquote_plus(file[0])]
files = []
for file in self.regex.findall(content):
file = file[0]
if self.criteria['keyword'] in file:
files.append(unquote_plus(file))
if not files:
return self.next()
else:
if '?' in url:
url = url.split('?', 1)[0]
return url, files
def stop(self):
self.contents.stop()
#=========================
class WebApp(object):
def __init__(self):
self.files = ''
def append(self, url, files):
print url, files
self.files += '<br />results for: <a href="%s">%s</a> <br /> \n' % (url, unquote_plus(url))
for file in files:
self.files += '<nobr> <a href="%s">%s</a> <br /> \n</nobr>' % (url+file, file)
@ch.expose
def index(self, stop=False):
if stop:
self.files_iter.stop()
self.files = ''
return '''
<html>
<head>
<title>bla</title>
</head>
<body>
<form method="GET" action="/search">
suchen nach: <input name="keyword"> <br /> \n
<input type="submit" value="go!">
</form>
</body>
</html>
'''
@ch.expose
def search(self, keyword):
self.files_iter = FileIter(keyword=keyword)
threading.Thread(target=lambda: [self.append(file, url) for file, url in self.files_iter]).start()
return self.show()
@ch.expose
def show(self):
return '''
<html>
<head>
<title>bla</title>
<META HTTP-EQUIV="refresh" CONTENT="2; URL=/show">
</head>
<body>
<center><a href="/?stop=yes">neue suche</a></center>
%s
</body>
</html>
''' % self.files
# Open the browser from a helper thread after a short delay so it fires
# only once cherrypy (started below, blocking) is already serving.
threading.Thread(target=lambda: sleep(1.5) or webbrowser.open_new("http://localhost:" + str(PORT) + "/")).start()  # call webbrowser AFTER the cherrypy start
ch.config.update({'server.socket_port': PORT})
# quickstart blocks and serves until the process is interrupted.
ch.quickstart(root=WebApp())