gooload version 10e-6 - a download program
Posted: Saturday, 1 March 2008, 01:10
It's been quite a while since I wrote this.
The idea itself has potential; it's just that an implementation relying 100% on threads wasn't a good one...
I'll try again with Twisted.
And a warning:
this program sends automated search queries to Google.
Google has stated somewhere that you're not supposed to do that.
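Since the all-threads design is the part I want to replace, here is a very rough, untested sketch of how the parallel fetching could look with Twisted instead of a thread pool. twisted.web.client.getPage and DeferredSemaphore are real Twisted APIs of that era; the names fetch_all/got_page/failed/done and the concurrency limit of 40 are just placeholders of mine, not part of gooload.

Code: Select all

# Untested sketch: fetch many URLs in parallel with Twisted instead of threads.
# fetch_all/got_page/failed/done are my own placeholder names, not gooload code.
from twisted.internet import reactor, defer
from twisted.web.client import getPage

def fetch_all(urls, concurrency=40):
    'returns a Deferred that fires with a list of (url, content) pairs'
    results = []
    sem = defer.DeferredSemaphore(concurrency)  # at most `concurrency` requests in flight

    def got_page(content, url):
        results.append((url, content))

    def failed(err, url):
        return None  # dead host -> just skip it, like HTTP_get() returning None

    deferreds = []
    for url in urls:
        d = sem.run(getPage, url, timeout=5)
        d.addCallbacks(got_page, failed, callbackArgs=(url,), errbackArgs=(url,))
        deferreds.append(d)

    # fires once every request has either succeeded or been skipped
    return defer.gatherResults(deferreds).addCallback(lambda _: results)

def done(results):
    for url, content in results:
        print url, len(content)
    reactor.stop()

if __name__ == '__main__':
    fetch_all(['http://www.python.org/', 'http://www.google.com/']).addCallback(done)
    reactor.run()

The DeferredSemaphore would play the role of the 40 worker threads: at most 40 requests are in flight at once, but everything runs in a single thread inside the reactor.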
Code: Select all
$ cowsay `uname -o`
 ___________
< GNU/Linux >
 -----------
        \   ^__^
         \  (oo)\_______
            (__)\       )\/\
                ||----w |
                ||     ||
Code: Select all
import urllib2
import threading
import re
import cherrypy as ch
import webbrowser
from time import sleep
from urllib import urlencode, unquote_plus, quote
from Tkinter import *
from sys import argv

if len(argv) > 1:
    PORT = int(argv[1])
else:
    PORT = 3000

import socket
socket.setdefaulttimeout(5)
def HTTP_get(url):
    'fetch a URL and return the response body, or None if anything goes wrong'
    #print "\trequesting:", url
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.001 (windows; U; NT4.0; en-us) Gecko/25250101')
    try:
        conn = urllib2.urlopen(req)
    except: # urllib2.HTTPError, urllib2.URLError:
        return None
    try:
        resp = conn.read()
    except:
        return None
    return resp
# BLABLA is raised below but never defined anywhere in the listing;
# define it so the error paths actually work.
class BLABLA(Exception):
    pass

class Google(object):
    'iterates over the result URLs of a Google web search'
    def __init__(self, keyword):
        self.kw = keyword
        self.urls = []
        self.regex = re.compile('h2 class=r><a href="(\w*://[^"]*)" class=l>')
        self.last_urls = [None]
        self.page = 0

    def __iter__(self):
        return self

    def next(self):
        if not self.urls:
            self.get_more_urls()
        return self.urls.pop()

    def get_more_urls(self):
        resp = HTTP_get("http://www.google.com/search?" + urlencode(dict(start=self.page*100, num=100, q=self.kw)))
        if resp is None:
            raise BLABLA("Google returned no response")
        self.page += 1
        urls = list(set(self.parse_urls(resp)))  # remove duplicate hosts
        if not urls or urls[0] == self.last_urls[0]:
            raise StopIteration
        self.urls.extend(urls)
        self.last_urls = urls

    def parse_urls(self, resp):
        return self.regex.findall(resp)
class UrlIter(object):
    'fetches the given URLs with a pool of worker threads and iterates over the results'
    def __init__(self, urls, threads2create=40, wait_time=0.5):
        self.wait_time = wait_time
        self.threads2create = threads2create
        self.urls = urls
        self.results = []
        self.running = 0
        [threading.Thread(target=self._threaded).start() for i in xrange(self.threads2create)]

    def _threaded(self):
        self.running += 1
        while self.urls:
            url = self.urls.pop()
            content = HTTP_get(url)
            if content:
                self.results.append((url, content))
        self.running -= 1

    def __iter__(self):
        return self

    def next(self):
        print "current num of threads running:", self.running
        print 'urls to fetch', len(self.urls)
        if not self.results:
            if not self.urls and self.running <= self.threads2create/5:  # waiting for the last 20 percent is not worth it!
                raise StopIteration
            else:
                print "sleeping"
                sleep(self.wait_time)
                return self.next()
        else:
            return self.results.pop()

    def __del__(self):
        self.urls = []
    stop = __del__
class FileIter(object):
    'uses a Google "index of" query to find open directory listings and iterates over the matching files'
    def __init__(self, **criteria):
        dc = dict(filetypes_accept=('mp3', 'ogg', 'aac', 'avi', 'wmv'), filetype_search='mp3')
        dc.update(criteria)
        criteria = dc
        self.criteria = criteria
        self.regex = re.compile('<a href="([^"]+\.(' + '|'.join(criteria['filetypes_accept']) + '))"')
        print '<a href="([^"]+\.(' + '|'.join(criteria['filetypes_accept']) + '))"'
        # classic "index of" query: directory listings that mention the keyword and the wanted filetype
        google_query = '"' + criteria['keyword'] + '" intitle:index.of intext:server.at intext:' + criteria['filetype_search']
        print google_query
        urls = list(Google(google_query))
        print urls
        if not urls:
            raise BLABLA('Google found no results for that')
        self.contents = UrlIter(urls).__iter__()  # well, UrlIter.__iter__() just returns self...

    def __iter__(self):
        return self

    def next(self):
        'returns a url and all files in this url which match criteria'
        url, content = self.contents.next()  # raises StopIteration
        #return url.split('?')[0], [unquote_plus(file[0]) for file in self.regex.findall(content) if criteria['keyword'] in unquote_plus(file[0])]
        files = []
        for file in self.regex.findall(content):
            file = file[0]
            if self.criteria['keyword'] in file:
                files.append(unquote_plus(file))
        if not files:
            return self.next()
        else:
            if '?' in url:
                url = url.split('?', 1)[0]
            return url, files

    def stop(self):
        self.contents.stop()
#=========================

class WebApp(object):
    def __init__(self):
        self.files = ''

    def append(self, url, files):
        print url, files
        self.files += '<br />results for: <a href="%s">%s</a> <br /> \n' % (url, unquote_plus(url))
        for file in files:
            self.files += '<nobr> <a href="%s">%s</a> <br /> \n</nobr>' % (url+file, file)

    @ch.expose
    def index(self, stop=False):
        if stop:
            self.files_iter.stop()
            self.files = ''
        return '''
            <html>
            <head>
                <title>bla</title>
            </head>
            <body>
                <form method="GET" action="/search">
                    search for: <input name="keyword"> <br /> \n
                    <input type="submit" value="go!">
                </form>
            </body>
            </html>
        '''

    @ch.expose
    def search(self, keyword):
        self.files_iter = FileIter(keyword=keyword)
        threading.Thread(target=lambda: [self.append(url, files) for url, files in self.files_iter]).start()
        return self.show()

    @ch.expose
    def show(self):
        return '''
            <html>
            <head>
                <title>bla</title>
                <META HTTP-EQUIV="refresh" CONTENT="2; URL=/show">
            </head>
            <body>
                <center><a href="/?stop=yes">new search</a></center>
                %s
            </body>
            </html>
        ''' % self.files

threading.Thread(target=lambda: sleep(1.5) or webbrowser.open_new("http://localhost:" + str(PORT) + "/")).start()  # open the browser AFTER the CherryPy start
ch.config.update({'server.socket_port': PORT})
ch.quickstart(root=WebApp())