Kleines Beispielprogramm für die angegebene Seite:
Code: Alles auswählen
from __future__ import division, with_statement
from functools import partial
from xml.etree import ElementTree as etree
from BeautifulSoup import BeautifulSoup
def process_day_cell(cell, year):
    """Parse the match-day cell of a game row.

    ``cell`` is called as ``cell(text=True)`` and must yield exactly three
    text fragments, in this order:

    1. the match-day number -- everything before the first ``.`` is
       converted with ``int``;
    2. the date -- the second whitespace-separated token is split on ``.``
       and read as day, then month;
    3. the kick-off time -- the first whitespace-separated token is split
       on ``.`` and read as hour and (optional) minute.

    Returns a tuple ``(day_nr, timestamp)`` where ``timestamp`` has the
    form ``YYYY-MM-DDTHH:MM`` with ``year`` supplied by the caller.

    Raises ``ValueError`` if ``cell`` does not yield exactly three
    fragments or the match-day number is not an integer, and ``IndexError``
    if the date fragment lacks a second token or a month part.
    """
    day_nr_text, date_text, time_text = cell(text=True)

    # Drop the leading token (presumably a weekday abbreviation) and keep
    # the "day.month." part.
    date_parts = date_text.split()[1].split('.')

    # A time without a minute part gets '00' as its minute.
    time_parts = (time_text.split()[0].split('.') + ['00'])[:2]

    day_nr = int(day_nr_text.split('.')[0])

    # Zero-pad every component so that inputs such as "5.9." or "9.30"
    # still produce a well-formed timestamp (2008-09-05T09:30 instead of
    # 2008-9-5T9:30). Already padded values are left unchanged.
    timestamp = '%d-%s-%sT%s:%s' % (
        year,
        date_parts[1].zfill(2),
        date_parts[0].zfill(2),
        time_parts[0].zfill(2),
        time_parts[1].zfill(2),
    )
    return (day_nr, timestamp)
def process_row(year, row):
    """Turn one table row into a flat game record.

    ``row`` is called as ``row('td', recursive=False)``; the first three
    results are taken as the match-day cell, the pairing cell and the
    result cell.  The pairing text is split on ``-`` into two team names
    (surrounding whitespace stripped) and the result text is split on
    ``:`` into two score strings.

    Returns ``(day_nr, date, team_1, team_2, result_1, result_2)`` where
    ``day_nr`` and ``date`` come from ``process_day_cell``.
    """
    cells = row('td', recursive=False)
    day_cell, opponents_cell, result_cell = cells[:3]

    day_nr, date = process_day_cell(day_cell, year)

    # Only the first text fragment of each cell is evaluated.
    pairing_text = opponents_cell(text=True)[0]
    team_1, team_2 = [name.strip() for name in pairing_text.split('-')]

    score_text = result_cell(text=True)[0]
    result_1, result_2 = score_text.split(':')

    return (day_nr, date, team_1, team_2, result_1, result_2)
def process_source(source, year):
    """Parse the HTML in ``source`` and return one record per game row.

    The games table is the first ``table`` inside the element returned by
    ``soup.find('div', 'content_text')``.  Each direct ``tr`` child of that
    table is passed to ``process_row`` together with ``year``.

    Returns the result of ``map`` over those rows.
    """
    document = BeautifulSoup(source)
    content_div = document.find('div', 'content_text')
    games_table = content_div.table

    parse_row = partial(process_row, year)
    # recursive=False: only rows that are direct children of the table.
    table_rows = games_table('tr', recursive=False)
    return map(parse_row, table_rows)
def build_xml(rows, year):
    """Serialise game records as a ``<season>`` XML document.

    ``rows`` is an iterable of 6-tuples
    ``(day_nr, date, team_1, team_2, result_1, result_2)``.  Each tuple
    becomes a ``<game>`` element carrying ``day_nr``, ``date``,
    ``result_1`` and ``result_2`` attributes and containing two ``<team>``
    children whose text is the team name.  The root ``<season>`` element
    carries ``year`` as an attribute.

    Returns whatever ``etree.tostring`` produces for the finished tree.
    """
    tree = etree.TreeBuilder()
    tree.start('season', {'year': str(year)})

    for row in rows:
        day_nr, date, team_1, team_2, result_1, result_2 = row
        game_attributes = {
            'day_nr': str(day_nr),
            'date': date,
            'result_1': result_1,
            'result_2': result_2,
        }
        tree.start('game', game_attributes)
        for team_name in (team_1, team_2):
            tree.start('team', {})
            tree.data(team_name)
            tree.end('team')
        tree.end('game')

    tree.end('season')
    root = tree.close()
    return etree.tostring(root)
def main():
    """Read ``test.html`` and print its games for 2008 as XML."""
    year = 2008
    with open('test.html') as html_file:
        source = html_file.read()
    games = process_source(source, year)
    # Parentheses around a single expression do not change what the
    # Python 2 print statement writes.
    print(build_xml(games, year))


# Run the conversion only when executed as a script, not on import.
if __name__ == '__main__':
    main()