Daten (im Sinne von Datum) aus Webseiten extrahieren
Verfasst: Samstag 22. März 2014, 01:51
Hallo zusammen,
kann mir jemand bei folgendem Code behilflich sein? Ein bisschen Kommentierung würde sehr zum Verständnis beitragen, ich bin nicht fit in Python.
Es geht darum, dass Daten von Webseiten extrahiert werden sollen.
Vielen Dank
Kevin
kann mir jemand bei folgendem Code behilflich sein? Ein bisschen Kommentierung würde sehr zum Verständnis beitragen, ich bin nicht fit in Python.
Es geht darum, dass Daten von Webseiten extrahiert werden sollen.
Vielen Dank
Kevin
Code: Alles auswählen
import bs4
import csv
from datetime import datetime
import requests
import re
import sys
# Output format for extracted dates (ISO 8601 calendar date).
DATE_OUTPUT_FORMAT = '%Y-%m-%d'
# English month names plus their three-letter abbreviations; used both to
# build the month regex alternatives and to map a name back to its number.
MONTHS = 'january,february,march,april,may,june,july,august,september,october,november,december'.split(',')
MONTHS_SHORT = [m[:3] for m in MONTHS]
# Template for a named regex group matching a run of digits of given width.
NUM_RE = r'(?P<%s>\d{%s})'
YEAR_RE = NUM_RE % ('year', 4)
or_re = lambda x: '|'.join(x)
# A month token is 1-2 digits, a full month name, or a 3-letter abbreviation.
MONTH_RE = r'(?P<month>\d{1,2}|%s|%s)' % (or_re(MONTHS), or_re(MONTHS_SHORT))
DAY_RE = NUM_RE % ('day', '1,2')
# From here on the regular expressions for the supported date layouts are
# assembled.  (BUG FIX: the original comment started with '//', which is a
# SyntaxError in Python; Python comments start with '#'.)
ISO_RE = re.compile(r'%s-%s-%s' % (YEAR_RE, MONTH_RE, DAY_RE))
SPOKEN_RE = re.compile(r'%s %s(?:,|, | | of )%s' % (MONTH_RE, DAY_RE, YEAR_RE))
DATES_RES = ISO_RE, SPOKEN_RE
# Import-time sanity checks for the two patterns.
assert ISO_RE.match('2000-01-01')
assert ISO_RE.match('2000-1-1')
assert not ISO_RE.match('000-01-01.2000-01-0')
assert ISO_RE.match('2000-feb-01')
assert SPOKEN_RE.match('february 1, 2000')
assert SPOKEN_RE.match('february 1,2000')
assert SPOKEN_RE.match('february 1 2000')
assert SPOKEN_RE.match('february 1 of 2000')
def sentences(content):
    """Yield individual "sentences" from *content*.

    The text is split on newlines, then each line on '. ' (period plus
    space).  Every piece is stripped of surrounding whitespace and one
    trailing period is removed.  Falsy input (None, '') yields nothing.
    """
    if not content:
        return
    for line in content.split('\n'):
        for piece in line.split('. '):
            cleaned = piece.strip()
            yield cleaned[:-1] if cleaned.endswith('.') else cleaned
# Import-time self-tests for sentences(), table-driven.
_s = lambda x: list(sentences(x))
for _text, _expected in [
    ('', []),
    ('a', ['a']),
    (' a ', ['a']),
    ('a.', ['a']),
    ('a a', ['a a']),
    ('a.a', ['a.a']),
    ('a. a', ['a', 'a']),
    ('a\na.', ['a', 'a']),
]:
    assert _s(_text) == _expected, (_text, _expected)
def parse_month(s):
    """Convert a month token to its 1-based month number.

    *s* may be a digit string (converted with int(), NOT range-checked),
    a full lowercase English month name, or its 3-letter abbreviation.
    Anything else falls through and returns None.
    """
    if s.isdigit():
        return int(s)
    for names in (MONTHS, MONTHS_SHORT):
        if s in names:
            return names.index(s) + 1
# Import-time sanity checks for parse_month().
for _token, _number in [
    ('1', 1),
    ('12', 12),
    ('13', 13),  # no range filtering yet: out-of-range numbers pass through
    ('jan', 1),
    ('dec', 12),
    ('january', 1),
    ('december', 12),
]:
    assert parse_month(_token) == _number, _token
def looks_like_date(year, month, day):
    """Return True when the components form a plausible calendar date.

    Only independent range checks are performed (year 1000-3000, month
    1-12, day 1-31); impossible combinations such as February 31 still
    pass and are rejected later by the datetime constructor.
    """
    if not 1000 <= year <= 3000:
        return False
    if not 1 <= month <= 12:
        return False
    return 1 <= day <= 31
def extract_dates(content):
    """Yield (datetime, sentence) pairs for every date found in *content*.

    Each sentence (see sentences()) is lower-cased once and scanned with
    every pattern in DATES_RES; matches whose components pass
    looks_like_date() are yielded together with the ORIGINAL (non-lowered)
    sentence.  A sentence containing several dates is yielded once per date.
    """
    for sentence in sentences(content):
        # Hoisted out of the pattern loop: the original lowered the
        # sentence once per regex.
        lowered = sentence.lower()
        for date_re in DATES_RES:
            for match in date_re.finditer(lowered):
                # The finditer() match object already carries the named
                # groups; the original redundantly re-matched
                # match.group(0) against the same pattern.
                groups = match.groupdict()
                year = int(groups['year'])
                month = parse_month(groups['month'])
                day = int(groups['day'])
                if looks_like_date(year, month, day):
                    yield datetime(year, month, day), sentence
# Import-time self-tests for extract_dates().
_extract = lambda x: list(extract_dates(x))
d = datetime(2000, 1, 1)
d2 = datetime(2000, 1, 2)
assert _extract("") == []
# Inputs whose single extracted date is d and whose reported sentence is
# the stripped input text.
for _text in ("2000-01-01", "2000-1-1", " 2000-01-01 ", "\t2000-01-01\t",
              "\n2000-01-01\n", "\r\n2000-01-01\r\n",
              "In the year 2000-01-01", "2000-jan-01", "2000-Jan-01",
              "January 1, 2000"):
    assert _extract(_text) == [(d, _text.strip())], _text
# Multiple dates per input.
assert _extract("2000-01-01.2000-01-01") == [(d, "2000-01-01.2000-01-01")] * 2
assert _extract("2000-01-01. 2000-01-01") == [(d, "2000-01-01")] * 2
assert _extract("2000-01-01.2000-01-02") == [(d, "2000-01-01.2000-01-02"), (d2, "2000-01-01.2000-01-02")]
assert _extract("2000-01-01\n2000-01-01") == [(d, "2000-01-01")] * 2
def body_text(content):
    """Extract the visible text from an HTML document.

    Tries the parsers from most to least capable and returns the text of
    the first one that succeeds.  BUG FIX: the original loop never used
    the ``parser`` variable, so every attempt ran bs4's default parser;
    the parser name is now actually passed to BeautifulSoup.  The bare
    ``except:`` (which also swallowed KeyboardInterrupt/SystemExit) is
    narrowed to ``except Exception``.  Returns None when every parser
    fails, matching the original fall-through behavior.
    """
    for parser in ["html5lib", "lxml", "html.parser"]:
        try:
            return bs4.BeautifulSoup(content, parser).text
        except Exception:
            # Parser not installed or content not parseable; try the next.
            pass
    return None
def main():
    """Fetch the URL given on the command line and emit extracted dates.

    Writes one CSV row (ISO date, sentence containing it) per found date
    to stdout.  BUG FIX: the original printed the usage message when the
    argument count was wrong but did not stop, then crashed with an
    IndexError on ``sys.argv[1]``; we now exit with status 1.  The print
    uses parentheses, which is valid in both Python 2 and Python 3.
    """
    if len(sys.argv) != 2:
        print("usage: python dates.py http://example.com/page/etc")
        sys.exit(1)
    url = sys.argv[1]
    response = requests.get(url)
    body = body_text(response.content)
    writer = csv.writer(sys.stdout)
    for date, sentence in extract_dates(body):
        writer.writerow((date.strftime(DATE_OUTPUT_FORMAT), sentence))

if __name__ == '__main__':
    main()