# Forum paste artifact ("Code: Alles auswählen" is the board's "select all" button label) — commented out so the file parses.
# -*- coding: cp1252 -*-
import urllib, urllib2
import cookielib
import sys, urllib2, re, sets, random, httplib, time, socket
from BeautifulSoup import BeautifulSoup
def google(query):
    """Scrape host names from a Google web-search result page.

    Fetches the result page for `query` (starting at result offset 10),
    strips the HTML markup and collects every host-like token found.

    Returns:
        list of unique host strings, excluding anything matching "google".
    """
    counter = 10  # result offset of the page to fetch (page 2)
    urls = []
    cj = cookielib.CookieJar()
    # NOTE: with counter starting at 10 this loop runs exactly once
    # (counter becomes 20); raise the bound to fetch more pages.
    while counter < 11:
        # The Google search URL for this result offset.
        url = ('http://www.google.com/search?hl=en&q='
               + query + '&hl=en&lr=&start=' + str(counter) + '&sa=N')
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        # Impersonate a regular browser.  Deliberately do NOT send
        # "Accept-Encoding: gzip,deflate": the response is read raw below,
        # so a gzip-compressed body would be undecodable binary garbage.
        opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)'),
                             ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
                             ("Accept-Language", "de-de,de;q=0.8,en-us;q=0.5,en;q=0.3"),
                             ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7"),
                             ("Keep-Alive", "300"),
                             ("Connection", "keep-alive")]
        data = opener.open(url).read()
        # Strip all tags before scanning for host-like tokens.
        # (The original called an undefined StripTags() helper here.)
        text = re.sub(r'<[^>]+>', ' ', data)
        hosts = re.findall(r'\w+\.[\w\.\-/]*\.\w+', text)
        # Keep each host once; drop Google's own domains so we do not
        # hammer the site that produced the list for us.
        for host in hosts:
            if host.find('www') != -1:
                # Cut off any leading junk before the "www" part.
                host = host[host.find('www'):]
            if host not in urls and re.search("google", host) is None:
                urls.append(host)
        counter += 10
    return urls
def bing(query):
    """Scrape result links from Bing for `query`.

    Pages through the results 10 at a time and stops as soon as a page
    yields exactly the same links as the previous one (i.e. Bing has run
    out of new results).

    Returns:
        list of unique href strings, excluding Bing/Microsoft-internal links.
    """
    cj = cookielib.CookieJar()
    pos = 1        # 1-based index of the first result on the page
    urls = []
    prev_links = []  # links of the previously fetched page (loop sentinel)
    while 1:
        url = "http://www.bing.com/search?q=" + query + "&first=" + str(pos)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
        # Impersonate a regular browser.  Deliberately do NOT send
        # "Accept-Encoding: gzip,deflate": the response is read raw below,
        # so a gzip body would be the binary garbage seen in bing.html.
        opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)'),
                             ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
                             ("Accept-Language", "de-de,de;q=0.8,en-us;q=0.5,en;q=0.3"),
                             ("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7"),
                             ("Keep-Alive", "300"),
                             ("Connection", "keep-alive")]
        html = opener.open(url).read()
        # Debug dump of the raw page for inspection.
        a = open("bing.html", "w")
        a.write(html)
        a.close()
        soup = BeautifulSoup(html)
        page_links = []
        # Collect the actual href targets.  (The original appended the
        # whole <a> Tag objects, which made the substring filters and the
        # page-comparison below operate on tags instead of link strings.)
        for tag in soup.findAll("a", href=True):
            href = tag["href"]
            # Skip Bing/Microsoft-internal links.  NOTE: the "ms"
            # substring test is broad and also drops e.g. "forms.example".
            if "bing" not in href and "ms" not in href and "microsoft" not in href:
                page_links.append(href)
        for link in page_links:
            if link not in urls:
                urls.append(link)
        # Stop when this page repeats the previous one — Bing serves the
        # last page again once the result offset runs past the end.
        if page_links == prev_links != []:
            break
        prev_links = page_links
        # Advance to the next page of 10 results.
        pos += 10
    return urls
# Guard the demo calls so importing this module does not immediately
# fire live HTTP requests at Bing and Google.
if __name__ == "__main__":
    bing("site:python.org")
    google("site:python.org")
# Bitte klärt mich auf!
# Für Vorschläge bin ich offen!