Web Scraper

LeoRom
User
Posts: 7
Joined: Thursday 24 June 2021, 15:25

Hello everyone,

I'm new to this forum and first wanted to say "hello" to everybody. :)

Since I've been interested in sports results for years, I used to copy data manually every day and paste it into an Excel file. Recently, however, I saw that this can also be done automatically with Python. So I went looking and found the following on GitHub:

Code: Select all

import argparse
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from db_connection import DatabaseConnection
from model import Match, TableEntry, Season, TeamStats, League

main_url = 'https://www.flashscore.com'
db = DatabaseConnection('test.db')

countries_leagues = {'England': 'Premier League', 'Spain': 'LaLiga', 'Italy': 'Serie A', 'Germany': 'Bundesliga',
                     'France': 'Ligue 1', 'Portugal': 'Primeira Liga', 'Russia': 'Premier League',
                     'Netherlands': 'Eredivisie', 'Turkey': 'Super Lig'}


def execute_script_click(button):
    browser.execute_script('arguments[0].click();', button)


def click_league(country, league_name):
    left_panel = browser.find_element_by_id('lc')
    countries_menus = left_panel.find_elements_by_class_name('mbox0px')
    countries_lists = [menu.find_element_by_class_name('menu') for menu in countries_menus]
    countries_lists = [countries_list for countries_list in countries_lists]
    found_countries = [element.find_elements_by_link_text(country) for element in countries_lists]
    found_countries = [found_country for found_country in found_countries if len(found_country) > 0]
    found_country = found_countries[0][0] if found_countries and found_countries[0] else None
    execute_script_click(found_country)

    time.sleep(2)
    found_league = left_panel.find_element_by_link_text(country).find_element_by_xpath('..').find_element_by_class_name(
        'submenu').find_element_by_link_text(league_name)
    found_league_link = found_league.get_attribute('href')
    browser.get(found_league_link)


def get_table_entries_from_table_div(table_rows_soup, league, season):
    teams_with_places = {}
    for row in table_rows_soup:
        place = row.find('div', class_='table__cell--rank').text.strip()
        team_name = row.find('span', class_='team_name_span').a.text
        teams_with_places[team_name] = place
    teams = db.get_teams_by_league_and_names(league, list(teams_with_places))
    return [TableEntry(season=season, team=team, place=teams_with_places[team.name]) for team in teams]


def find_team_by_name(teams, name):
    for team in teams:
        if team.name == name:
            return team
    return None


def get_team_stats_from_table_div(table_rows_soup, season, teams):
    teams_stats = []
    for row in table_rows_soup:
        team_name = row.find('span', class_='team_name_span')
        team_name = team_name.a.text
        team = find_team_by_name(teams, team_name)
        matches_played = row.find('div', class_='table__cell--matches_played').text
        wins = row.find('div', class_='table__cell--wins_regular').text
        draws = row.find('div', class_='table__cell--draws').text
        losses = row.find('div', class_='table__cell--losses_regular').text
        goals = row.find('div', class_='table__cell--goals').text.split(':')
        goals_scored = goals[0]
        goals_conceded = goals[1]
        points = row.find('div', class_='table__cell--points').text
        stats = TeamStats(team=team, season=season, matches_played=matches_played, wins=wins, draws=draws,
                          losses=losses, goals_scored=goals_scored, goals_conceded=goals_conceded, points=points)
        teams_stats.append(stats)
    return teams_stats


def scrape_table(league_link, league, season):
    browser.get(league_link)
    standings_tab = browser.find_element_by_link_text('Standings')
    standings_tab.click()
    WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.ID, 'tabitem-table')))
    inner_standings = browser.find_element_by_id('tabitem-table')
    execute_script_click(inner_standings)
    WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'table__body')))
    source = browser.find_element_by_class_name('table__body').get_attribute('innerHTML')
    soup = BeautifulSoup(source, 'lxml')
    table_rows = soup.find_all('div', class_='table__row')
    table_entries = get_table_entries_from_table_div(table_rows, league, season)
    db.save_table_entries(table_entries)
    teams = [entry.team for entry in table_entries]
    teams_stats = get_team_stats_from_table_div(table_rows, season, teams)
    db.save_team_stats(teams_stats)


def calculate_dropped_elements(headers_and_match_divs, league_name):
    dropped_elements = []
    drop = True
    for ind, element in enumerate(headers_and_match_divs):
        if drop:
            dropped_elements.append(ind)
        if element['class'][0] == 'event__header':
            header_name = element.find('span', class_='event__title--name').text
            previous_drop = drop
            drop = header_name != league_name
            if previous_drop != drop and not previous_drop:
                dropped_elements.append(ind)
    return dropped_elements


def get_season_matches_as_html(league_link, league_name):
    browser.get(league_link)
    results_tab = browser.find_element_by_link_text('Results')
    results_tab.click()
    WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'event__more')))
    more_button = browser.find_element_by_class_name('event__more')
    more_matches = more_button is not None
    while more_matches:
        try:
            browser.execute_script('arguments[0].click();', more_button)
            time.sleep(3)
        except StaleElementReferenceException as exc:
            more_matches = False
    source = browser.find_element_by_class_name('event--results').get_attribute('innerHTML')
    soup = BeautifulSoup(source, 'lxml')
    headers_and_match_divs = soup.find_all('div', class_=['event__match', 'event__header'])

    # drop matches from results e.g. if there were additional relegation matches or play-offs
    dropped_elements_indexes = calculate_dropped_elements(headers_and_match_divs, league_name)
    match_divs = [element for ind, element in enumerate(headers_and_match_divs) if ind not in dropped_elements_indexes]
    return match_divs


def get_match_year(season, date):
    season_years = season.name.split('/')
    if len(season_years) == 1:
        return date + season_years[0]
    else:
        date_month = int(date.split('.')[1])
        return date + season_years[0] if date_month < 7 else date + season_years[1]


# has to be done after scrape_table where teams are loaded to database
def scrape_results(league_link, league, season):
    teams = db.get_teams_by_season(season)
    matches_soup = get_season_matches_as_html(league_link, league.name)
    matches = []
    for match_div in matches_soup:
        date_time = match_div.find('div', class_='event__time').text
        date = get_match_year(season, date_time.split(' ')[0])
        home_team_name = match_div.find('div', class_='event__participant--home').text
        away_team_name = match_div.find('div', class_='event__participant--away').text
        home_team = find_team_by_name(teams, home_team_name)
        away_team = find_team_by_name(teams, away_team_name)
        score = match_div.find('div', class_='event__scores').text.replace(' ', '').split('-')
        home_team_score = score[0]
        away_team_score = score[1]
        match = Match(season=season, date=date, home_team=home_team, away_team=away_team,
                      home_team_score=home_team_score, away_team_score=away_team_score)
        matches.append(match)
    db.save_matches(matches)


def get_years_from_season_name(season_name):
    two_years_season_name = re.search('[0-9][0-9][0-9][0-9]/[0-9][0-9][0-9][0-9]', season_name)
    if two_years_season_name is not None:
        return two_years_season_name.group()
    else:
        return re.search('[0-9][0-9][0-9][0-9]', season_name).group()


def scrape_league_history(country):
    league_name = countries_leagues[country]
    db.delete_league_by_name(league_name)

    browser.get(main_url)
    more_countries_element = browser.find_element_by_class_name('show-more')
    more_countries_button = more_countries_element.find_element_by_link_text('More')
    execute_script_click(more_countries_button)

    click_league(country, league_name)

    archive_button = browser.find_element_by_link_text('Archive')
    archive_button.click()

    season_names = browser.find_elements_by_class_name('leagueTable__season')[2:]
    season_names = [season.find_element_by_tag_name('a') for season in season_names][::-1]

    league = League(name=league_name, country=country)
    db.save_league(league)

    seasons = [Season(name=get_years_from_season_name(season_name.text), league=league) for season_name in season_names]
    links = [season.get_attribute('href') for season in season_names]

    for season, link in zip(seasons, links):
        scrape_table(link, league, season)
        scrape_results(link, league, season)


if __name__ == '__main__':
    args_parser = argparse.ArgumentParser()
    args_parser.add_argument('-s', '--os', choices=['linux', 'windows'], default='windows',
                             help='operating system to run the script on')
    args_parser.add_argument('-b', '--browser', choices=['firefox', 'chrome'], default='chrome',
                             help='browser to perform scraping with')
    args_parser.add_argument('-l', '--leagues', nargs='+', choices=list(countries_leagues.keys()), required=True,
                             help='names of countries from which data from the highest leagues are to be scraped')
    args = args_parser.parse_args()
    webdriver_name = 'chromedriver' if args.browser == 'chrome' else 'geckodriver'
    webdriver_extension = '.exe' if args.os == 'windows' else ''
    webdriver_path = 'webdrivers/{}/{}/{}{}'.format(args.browser, args.os, webdriver_name, webdriver_extension)
    browser = webdriver.Chrome(executable_path=webdriver_path)
    available_countries = list(countries_leagues)
    countries_to_scrape = args.leagues
    for country_name in countries_to_scrape:
        if country_name in available_countries:
            scrape_league_history(country_name)
    browser.quit()
The libraries required for it:

beautifulsoup4==4.8.2
lxml==4.6.3
selenium==3.141.0
soupsieve==1.9.5
SQLAlchemy==1.3.13
urllib3==1.25.8

I was already able to install all of them via pip. I was also able to open the included DB with DB Browser for SQLite. However, when running the code I get the following error message:

usage: scrape.py [-h] [-s {linux,windows}] [-b {firefox,chrome}] -l
                 {England,Spain,Italy,Germany,France,Portugal,Russia,Netherlands,Turkey}
                 [{England,Spain,Italy,Germany,France,Portugal,Russia,Netherlands,Turkey} ...]
scrape.py: error: the following arguments are required: -l/--leagues

A description was also included:

Short  Long       Required  Value
-s     --os       yes       operating system: linux or windows
-b     --browser  yes       browser used for scraping data: chrome or firefox
-l     --leagues  yes       country names; currently supported countries are listed in help
-h     --help     no        shows parameter descriptions and a list of their available values

Usage example

python scrape.py -s windows -b chrome -l England Italy Netherlands

I suspect that I need to change something in the last section of the code:

Code: Select all

if __name__ == '__main__':
    args_parser = argparse.ArgumentParser()
    args_parser.add_argument('-s', '--os', choices=['linux', 'windows'], default='windows',
                             help='operating system to run the script on')
    args_parser.add_argument('-b', '--browser', choices=['firefox', 'chrome'], default='chrome',
                             help='browser to perform scraping with')
    args_parser.add_argument('-l', '--leagues', nargs='+', choices=list(countries_leagues.keys()), required=True,
                             help='names of countries from which data from the highest leagues are to be scraped')
    args = args_parser.parse_args()
    webdriver_name = 'chromedriver' if args.browser == 'chrome' else 'geckodriver'
    webdriver_extension = '.exe' if args.os == 'windows' else ''
    webdriver_path = 'webdrivers/{}/{}/{}{}'.format(args.browser, args.os, webdriver_name, webdriver_extension)
    browser = webdriver.Chrome(executable_path=webdriver_path)
    available_countries = list(countries_leagues)
    countries_to_scrape = args.leagues
    for country_name in countries_to_scrape:
        if country_name in available_countries:
            scrape_league_history(country_name)
    browser.quit()
but unfortunately I don't know at which point. I've tried it in several places (Windows, Chrome, and the leagues as in the example), but sadly without success :roll:

As you can surely tell, I'm a complete beginner when it comes to Python, and I hope you can help me with my problem.

I also hope it was OK to post the code from GitHub here. If not, please let me know, and I will remove it again immediately.

Many thanks in advance.

Best regards
Leorom
Sirius3
User
Posts: 17741
Joined: Sunday 21 October 2012, 17:20

That's not an error message but the help text showing how to use the program. It's a command-line program, and those usually need parameters.

Here that means the league is mandatory (-s and -b have defaults in the code: windows and chrome). So the call could look like this:

Code: Select all

python scrape.py -l Germany
LeoRom
User
Posts: 7
Joined: Thursday 24 June 2021, 15:25

Hello Sirius3,

thank you very much for your quick reply.

I have another question (please don't laugh :roll: , but this is completely new territory for me):

Do I have to insert your code at some line in the code above, or do I have to save the file under that name? :roll:
I actually renamed scrape.py (the code shown above) to what you wrote, but nothing happened there either.
Can you please tell me where to insert your code, or rather what to do with it?

Thank you in advance for your efforts.

Best regards
Leorom
__deets__
User
Posts: 14529
Joined: Wednesday 14 October 2015, 14:29

Those are arguments. To the program. You pass them along on the command line when calling it. Nothing needs to be saved differently or renamed.
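
For illustration, a minimal sketch of how argparse turns such a call into values inside the script (the file name args_demo.py is just a made-up example):

Code: Select all

import argparse

# same style of argument definition as in scrape.py
parser = argparse.ArgumentParser()
parser.add_argument('-l', '--leagues', nargs='+', required=True)
args = parser.parse_args()
print(args.leagues)

Called as python args_demo.py -l Germany England, this prints ['Germany', 'England'] -- the values land in args.leagues without the script itself being changed.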
LeoRom
User
Posts: 7
Joined: Thursday 24 June 2021, 15:25

Hello __deets__,

thank you very much for your quick reply.

I ran it via the command line (I used Windows PowerShell for that, following a YouTube video). It did open, but now I get the following message:

PS C:\Users\xxxxxx> python scrape.py -l Germany
Traceback (most recent call last):
  File "C:\Users\xxxxxx\scrape.py", line 215, in <module>
    browser = webdriver.Chrome(executable_path=webdriver_path)
  File "C:\Users\xxxxxx\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 76, in __init__
    RemoteWebDriver.__init__(
  File "C:\Users\xxxxxx\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 157, in __init__
    self.start_session(capabilities, browser_profile)
  File "C:\Users\xxxxxx\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 252, in start_session
    response = self.execute(Command.NEW_SESSION, parameters)
  File "C:\Users\xxxxxx\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "C:\Users\xxxxxx\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 242, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.WebDriverException: Message: unknown error: cannot find Chrome binary

Could it be that the path to the chromedriver isn't set correctly? Do I have to put the chromedriver in the same folder as the project?

I entered the path to the chromedriver:

Code: Select all

browser = webdriver.Chrome(executable_path=webdriver_path)
Unfortunately, I now get the following error:

PS C:\Users\xxxxxx> python scrape.py -l Germany
SyntaxError: Non-UTF-8 code starting with '\x90' in file C:\Users\xxxxxx\AppData\Local\Programs\Python\Python39\python.exe on line 1, but no encoding declared; see http://python.org/dev/peps/pep-0263/ for details :roll:

Thank you in advance for your efforts.

Best regards
Leorom
__deets__
User
Posts: 14529
Joined: Wednesday 14 October 2015, 14:29

You have to escape the backslashes, either by putting an r in front of the string, i.e.

Code: Select all

driver = r"C:\User\..."
Or by turning every \ into a \\:

Code: Select all

driver = "C:\\User\\..."
LeoRom
User
Posts: 7
Joined: Thursday 24 June 2021, 15:25

I already did that, but then I get this message:

PS C:\Users\xxxxxx> python scrape.py -l Germany
SyntaxError: Non-UTF-8 code starting with '\x90' in file C:\Users\xxxxxx\AppData\Local\Programs\Python\Python39\python.exe on line 1, but no encoding declared; see http://python.org/dev/peps/pep-0263/ for details

When I open the file, I see the following:

Python 3.9.5 (tags/v3.9.5:0a7dcbd, May 3 2021, 17:27:52) [MSC v.1928 64 bit (AMD64)] on win32
Type "help", "copyright", "credits" or "license" for more information.
>>>

I'm really sorry, but I don't think I quite understand it :cry:
__deets__
User
Beiträge: 14529
Registriert: Mittwoch 14. Oktober 2015, 14:29

That is the path to the Python interpreter. I don't know what you entered where, but it looks wrong and odd. You should not put any of your own files *anywhere* inside the Python installation. That is forbidden territory for you. If you've been fiddling around in there, throw the Python installation away and install it again. And then please show your modified script with the webdriver_path, exactly as it appears in there.
LeoRom
User
Posts: 7
Joined: Thursday 24 June 2021, 15:25

So, Python uninstalled and reinstalled, version 3.9.5.

However, when running the script I now again get the first long error message that I posted earlier :? :roll:

This is how I entered it:

Code: Select all

if __name__ == '__main__':
    args_parser = argparse.ArgumentParser()
    args_parser.add_argument('-s', '--os', choices=['linux', 'windows'], default='windows',
                             help='operating system to run the script on')
    args_parser.add_argument('-b', '--browser', choices=['firefox', 'chrome'], default='chrome',
                             help='browser to perform scraping with')
    args_parser.add_argument('-l', '--leagues', nargs='+', choices=list(countries_leagues.keys()), required=True,
                             help='names of countries from which data from the highest leagues are to be scraped')
    args = args_parser.parse_args()
    webdriver_name = 'chromedriver' if args.browser == 'chrome' else 'geckodriver'
    webdriver_extension = '.exe' if args.os == 'windows' else ''
    webdriver_path = 'C:/Users/xxxxxx/chromedriver.exe'.format(args.browser, args.os, webdriver_name, webdriver_extension)
    browser = webdriver.Chrome(executable_path=webdriver_path)
    available_countries = list(countries_leagues)
    countries_to_scrape = args.leagues
    for country_name in countries_to_scrape:
        if country_name in available_countries:
            scrape_league_history(country_name)
    browser.quit()
The mistake is clearly on my side, because I don't (yet) know how to set all of this up correctly :roll:

Then one more additional question, please:
Do I also have to change this path

Code: Select all

db = DatabaseConnection('test.db')
to point to where the DB is located?

Thank you so much for your effort and patience with me :(
__deets__
User
Posts: 14529
Joined: Wednesday 14 October 2015, 14:29

Please post the actual error. Not a vague paraphrase, which doesn't even match the code shown here. And in full length.
LeoRom
User
Posts: 7
Joined: Thursday 24 June 2021, 15:25

What do you mean, it doesn't match the code shown here?

So, I'm using the following code:

Code: Select all

# [... unchanged from the code in the first post; only the __main__ block below differs ...]

if __name__ == '__main__':
    args_parser = argparse.ArgumentParser()
    args_parser.add_argument('-s', '--os', choices=['linux', 'windows'], default='windows',
                             help='operating system to run the script on')
    args_parser.add_argument('-b', '--browser', choices=['firefox', 'chrome'], default='chrome',
                             help='browser to perform scraping with')
    args_parser.add_argument('-l', '--leagues', nargs='+', choices=list(countries_leagues.keys()), required=True,
                             help='names of countries from which data from the highest leagues are to be scraped')
    args = args_parser.parse_args()
    webdriver_name = 'chromedriver' if args.browser == 'chrome' else 'geckodriver'
    webdriver_extension = '.exe' if args.os == 'windows' else ''
    webdriver_path = 'C:/Users/xxxxxx/chromedriver.exe'.format(args.browser, args.os, webdriver_name, webdriver_extension)
    browser = webdriver.Chrome(executable_path=webdriver_path)
    available_countries = list(countries_leagues)
    countries_to_scrape = args.leagues
    for country_name in countries_to_scrape:
        if country_name in available_countries:
            scrape_league_history(country_name)
    browser.quit()
In the lower part I entered the following for webdriver_path (this is also the only change from the original code, see the first one at the very top):

Code: Select all

webdriver_path = 'C:/Users/xxxxxx/chromedriver.exe'.format(args.browser, args.os, webdriver_name, webdriver_extension)
When I invoke this code from the command line with the following command:

Code: Select all

python scrape.py -l Germany
I again get the following error message:

PS C:\Users\xxxxxx> python scrape.py -l Germany
Traceback (most recent call last):
  File "C:\Users\xxxxxx\scrape.py", line 215, in <module>
    browser = webdriver.Chrome(executable_path=webdriver_path)
  File "C:\Users\xxxxxx\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 76, in __init__
    RemoteWebDriver.__init__(
  File "C:\Users\xxxxxx\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 157, in __init__
    self.start_session(capabilities, browser_profile)
  File "C:\Users\xxxxxx\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 252, in start_session
    response = self.execute(Command.NEW_SESSION, parameters)
  File "C:\Users\xxxxxx\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "C:\Users\xxxxxx\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 242, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.WebDriverException: Message: unknown error: cannot find Chrome binary

Thank you for your efforts.
__deets__
User
Posts: 14529
Joined: Wednesday 14 October 2015, 14:29

But that is a different error message. It was “SyntaxError: Non-UTF-8 code starting with '\x90' in file”. And now it's “cannot find Chrome binary”. That is a substantial difference. But you claimed it was the same as before.

It now finds the driver. But not Chrome itself. Have you installed that?
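
If Chrome is installed but Selenium still cannot locate it, the browser binary can also be pointed to explicitly. A minimal sketch, assuming a typical installation path (adjust it to wherever chrome.exe actually lives):

Code: Select all

from selenium import webdriver

options = webdriver.ChromeOptions()
# assumed default install location of chrome.exe -- adjust if necessary
options.binary_location = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
# webdriver_path is the chromedriver path already used in the script
browser = webdriver.Chrome(executable_path=webdriver_path, options=options)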
LeoRom
User
Posts: 7
Joined: Thursday 24 June 2021, 15:25

I had mentioned that the long error message I posted above appears again.
I guess I phrased that badly.
I have reinstalled Chrome and the chromedriver, and now it at least opens the browser.

However, I now get the following message:

Code: Select all

DevTools listening on ws://xxx.x.x.x:xxxx/devtools/browser/852922af-9610-4b81-88d1-13b2fe8aab13
[4136:10136:0626/000549.308:ERROR:device_event_log_impl.cc(214)] [00:05:49.308] Bluetooth: bluetooth_adapter_winrt.cc:1072 Getting Default Adapter failed.
Traceback (most recent call last):
  File "C:\Users\xxxxxx\scrape.py", line 220, in <module>
    scrape_league_history(country_name)
  File "C:\Users\xxxxxx\scrape.py", line 180, in scrape_league_history
    more_countries_element = browser.find_element_by_class_name('show-more')
  File "C:\Users\xxxxxx\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 564, in find_element_by_class_name
    return self.find_element(by=By.CLASS_NAME, value=name)
  File "C:\Users\xxxxxx\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 976, in find_element
    return self.execute(Command.FIND_ELEMENT, {
  File "C:\Users\xxxxxx\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "C:\Users\xxxxxx\AppData\Local\Programs\Python\Python39\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 242, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":".show-more"}
  (Session info: chrome=xx.x.xxxx.xxx)
Do I also have to change the path for the database?
The code contains the following:

Code: Select all

db = DatabaseConnection('test.db')
Many thanks
__blackjack__
User
Posts: 13080
Joined: Saturday 2 June 2018, 10:21
Location: 127.0.0.1

@LeoRom: Apparently there is no element with the CSS class "show-more" on the website. Perhaps not anymore? Websites do get changed from time to time.
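
One way to check for this without the script crashing would be the plural find_elements_* variant, which returns an empty list instead of raising an exception. A minimal sketch against the script above:

Code: Select all

# find_elements_* returns [] instead of raising NoSuchElementException
elements = browser.find_elements_by_class_name('show-more')
if elements:
    more_countries_button = elements[0].find_element_by_link_text('More')
    execute_script_click(more_countries_button)
else:
    print('no element with class "show-more" found - the page layout may have changed')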
„All religions are the same: religion is basically guilt, with different holidays.” — Cathy Ladman