



Code: Alles auswählen
#!/usr/bin/env python
# coding: utf-8 -*-
from __future__ import print_function, division
#-------------------------------------------------------------------------------
# Name: scrapeHTMLTables_diffSites.py
# Purpose: scrape html-table and show results
#
# Author: Xxxxxx Xxxxxx
#
# Created: 07/29/2019
# Licence: n/a
#-------------------------------------------------------------------------------
"""
scrape html-table and show results by:
simple print out, Excel-file, bar-chart and image
contains functions:
parse_html_table - parses HTML-table using the URL, the selector and the position of the table in the HTML-document
parse_html_tables_different_sites - parses HTML-table spread on different pages
show_results - shows the results by simple print, converting to Microsoft Excel, creating a bar-chart
and save chart as png-image
rectify_data - rectifies the data: putting the comma at the correct position by dividing by 100
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
def parse_html_table(url, selector, content, table_position):
    """
    Read a single HTML table from a page into a dataframe.

    In:
        url: location of the page to be scraped
        selector: name of the HTML attribute used to pick tables (e.g. "class")
        content: value that attribute must have
        table_position: index of the wanted table among the matching tables
    Out:
        dataframe holding the selected table
    """
    # read_html returns one dataframe per table matching the attribute filter.
    matching_tables = pd.read_html(url, attrs={selector: content})
    return matching_tables[table_position]
def parse_html_tables_different_sites(url, selector, content, table_position, pages=10):
    """
    Read the same table from several numbered pages and stack the results.

    Each page is requested as "<url>?p=<page>" for page = 1 .. pages.

    In:
        url: base location of the pages to be scraped
        selector: name of the HTML attribute used to pick tables (e.g. "class")
        content: value that attribute must have
        table_position: index of the wanted table among the matching tables
        pages: number of pages to read
    Out:
        one dataframe with the rows of all pages, re-indexed from 0
    """
    page_frames = []
    for page in range(1, pages + 1):
        page_url = f"{url}?p={page}"
        tables_on_page = pd.read_html(page_url, attrs={selector: content})
        page_frames.append(tables_on_page[table_position])
    return pd.concat(page_frames, ignore_index=True)
def show_results(data_frame_one_page,
                 data_frame_diff_pages,
                 output_path,
                 output_name_xlsx_one_page,
                 output_name_xlsx_diff_pages,
                 output_name_img_one_page,
                 output_name_img_diff_pages):
    """
    Show both dataframes: print them, export them to Microsoft Excel and
    save a bar-chart of each as an image.

    In:
        data_frame_one_page: dataframe containing the single-page table
        data_frame_diff_pages: dataframe containing the tables of several pages
        output_path: prefix for all output files; it is prepended verbatim to
            the file names, so a directory must end with a path separator
        output_name_xlsx_one_page: Excel file name for the single-page table
        output_name_xlsx_diff_pages: Excel file name for the multi-page table
        output_name_img_one_page: image file name for the single-page chart
        output_name_img_diff_pages: image file name for the multi-page chart
    Out:
        no return value
    """
    # Local import: the module's import block does not provide os.
    import os

    print(data_frame_one_page)
    print(data_frame_diff_pages)

    xlsx_one_page = output_path + output_name_xlsx_one_page
    xlsx_diff_pages = output_path + output_name_xlsx_diff_pages
    img_one_page = output_path + output_name_img_one_page
    img_diff_pages = output_path + output_name_img_diff_pages

    # Create missing target directories up front so the exports below do not
    # fail just because e.g. "output/" has not been created yet.
    for target in (xlsx_one_page, xlsx_diff_pages, img_one_page, img_diff_pages):
        directory = os.path.dirname(target)
        if directory:
            os.makedirs(directory, exist_ok=True)

    data_frame_one_page.to_excel(xlsx_one_page)
    data_frame_diff_pages.to_excel(xlsx_diff_pages)

    # Save each chart through its own figure instead of relying on pyplot's
    # implicit "current figure".
    axes_one_page = data_frame_one_page.plot.bar(width=1.5)
    axes_one_page.figure.savefig(img_one_page)
    axes_diff_pages = data_frame_diff_pages.plot.bar(width=1.5)
    axes_diff_pages.figure.savefig(img_diff_pages)
def rectify_data(data_frame):
    """
    Rectify the data: put the decimal comma at the correct position by
    dividing every int64 column by 100.

    All columns are inspected, however many the frame has. Columns of any
    other dtype are left untouched. The frame is modified in place and also
    returned.

    In:
        data_frame: dataframe containing table or tables
    Out:
        the same dataframe with the int64 columns divided by 100
    """
    for position in range(data_frame.shape[1]):
        column = data_frame.iloc[:, position]
        if column.dtype == np.int64:
            # isetitem replaces the whole column (int64 -> float64) instead of
            # writing floats into the existing integer column, and works by
            # position so duplicate column labels are not a problem.
            data_frame.isetitem(position, column / 100)
    return data_frame
def main():
    """
    Scrape the DAX table (one page) and the S&P 500 tables (several pages),
    pass both through rectify_data and hand them to show_results.
    """
    # Both scrapes select the first table carrying class="table".
    table_filter = dict(selector="class", content="table", table_position=0)

    dax_frame = parse_html_table(
        url="https://www.finanzen.net/index/dax/marktkapitalisierung",
        **table_filter
    )
    sp500_frame = parse_html_tables_different_sites(
        url="https://www.finanzen.net/index/s&p_500/marktkapitalisierung",
        **table_filter
    )

    dax_frame = rectify_data(dax_frame)
    sp500_frame = rectify_data(sp500_frame)

    show_results(
        dax_frame,
        sp500_frame,
        output_path="output/",
        output_name_xlsx_one_page="scrapedTable.xlsx",
        output_name_xlsx_diff_pages="scrapedTableDiffPages.xlsx",
        output_name_img_one_page="scrapedTable.png",
        output_name_img_diff_pages="scrapedTableDiffPages.png",
    )


if __name__ == "__main__":
    main()
Code: Alles auswählen
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# main class
class scrapeHTMLTablesFromWwwFinanzenNet():
    """
    Scrape the same HTML table from several numbered pages and show the result.

    The tables are read when the object is created; each page is requested
    as "<url>?p=<page>" for page = 1 .. pages.
    """

    def __init__(self, url, selector, content, table_position, pages, output_path, file_name_excel, file_name_img):
        """
        In:
            url: base location of the pages to be scraped
            selector: name of the HTML attribute used to pick tables (e.g. "class")
            content: value that attribute must have
            table_position: index of the wanted table among the matching tables
            pages: number of pages to read
            output_path: prefix prepended verbatim to the output file names
            file_name_excel: name of the Microsoft Excel output file
            file_name_img: name of the image output file
        """
        # Attribute name (sic) kept unchanged so existing users keep working.
        self.datafame = pd.concat([
            pd.read_html(f"{url}?p={page}", attrs={selector: content})[table_position]
            for page in range(1, pages + 1)
        ], ignore_index=True)
        # Keep the output settings; the export method below needs them.
        self.output_path = output_path
        self.file_name_excel = file_name_excel
        self.file_name_img = file_name_img

    def show_result_print_out(self):
        """Print the scraped dataframe."""
        print(self.datafame)

    def show_result_excel_file(self):
        """Write the scraped dataframe to output_path + file_name_excel."""
        self.datafame.to_excel(self.output_path + self.file_name_excel)
Code: Alles auswählen
#!/usr/bin/env python
# coding: utf-8 -*-
from __future__ import print_function, division
#-------------------------------------------------------------------------------
# Name: scrapeHTMLTables_diffSites.py
# Purpose: scrape html-table and show results
#
# Author: Xxxxxx Xxxxxx
#
# Created: 07/29/2019
# Licence: n/a
#-------------------------------------------------------------------------------
"""
scrape html-tables spread over several pages and show results by:
simple print out, Excel-file, bar-chart and image
contains class:
scrapeHTMLTablesFromWwwFinanzenNet - reads the table from every page and combines them into one dataframe
with methods:
show_result_print_out - prints the dataframe
show_result_excel_file - writes the dataframe to a Microsoft Excel file
show_result_plot_barchart - creates a bar-chart of the dataframe
show_result_img_file - saves the current chart as image-file
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# main class
class scrapeHTMLTablesFromWwwFinanzenNet():
    """
    Scrape the same HTML table from several numbered pages and show the
    result as print out, Microsoft Excel file, bar-chart and image file.

    The tables are read when the object is created; each page is requested
    as "<url>?p=<page>" for page = 1 .. pages.
    """

    def __init__(self, url, selector, content, table_position, pages, output_path, file_name_excel, file_name_img):
        """
        In:
            url: base location of the pages to be scraped
            selector: name of the HTML attribute used to pick tables (e.g. "class")
            content: value that attribute must have
            table_position: index of the wanted table among the matching tables
            pages: number of pages to read
            output_path: prefix prepended verbatim to the output file names,
                so a directory must end with a path separator
            file_name_excel: name of the Microsoft Excel output file
            file_name_img: name of the image output file
        """
        # Attribute name (sic) kept unchanged so existing users keep working.
        self.datafame = pd.concat([
            pd.read_html(f"{url}?p={page}", attrs={selector: content})[table_position]
            for page in range(1, pages + 1)
        ], ignore_index=True, sort=False)
        self.url = url
        self.selector = selector
        self.content = content
        self.table_position = table_position
        self.pages = pages
        self.output_path = output_path
        self.file_name_excel = file_name_excel
        self.file_name_img = file_name_img
        # Axes of the most recently drawn bar-chart; None until one is drawn.
        self.table_barchart = None

    def _output_target(self, file_name):
        """Return output_path + file_name, creating its directory if missing."""
        # Local import: the module's import block does not provide os.
        import os

        target = self.output_path + file_name
        directory = os.path.dirname(target)
        if directory:
            os.makedirs(directory, exist_ok=True)
        return target

    def show_result_print_out(self):
        """Print the scraped dataframe."""
        print(self.datafame)

    def show_result_excel_file(self):
        """Write the scraped dataframe to output_path + file_name_excel."""
        self.datafame.to_excel(self._output_target(self.file_name_excel))

    def show_result_plot_barchart(self):
        """Draw a bar-chart of the dataframe and remember its axes."""
        self.table_barchart = self.datafame.plot.bar(width=1.5)

    def show_result_img_file(self):
        """
        Save the bar-chart to output_path + file_name_img.

        The chart is drawn first if that has not happened yet, so the image
        never depends on whatever figure pyplot currently considers active.
        """
        if getattr(self, "table_barchart", None) is None:
            self.show_result_plot_barchart()
        self.table_barchart.figure.savefig(self._output_target(self.file_name_img))
def main():
    """
    Scrape the S&P 500 market-capitalisation tables (10 pages) and show them
    as print out, Excel file, bar-chart and image file.
    """
    scraper_settings = {
        "url": "https://www.finanzen.net/index/s&p_500/marktkapitalisierung",
        "selector": "class",
        "content": "table",
        "table_position": 0,
        "pages": 10,
        "output_path": "output/",
        "file_name_excel": "scrapedTable.xlsx",
        "file_name_img": "scrapedTable.png",
    }
    table = scrapeHTMLTablesFromWwwFinanzenNet(**scraper_settings)

    # Order matters: the chart has to be drawn before it is saved as image.
    output_steps = (
        table.show_result_print_out,
        table.show_result_excel_file,
        table.show_result_plot_barchart,
        table.show_result_img_file,
    )
    for run_step in output_steps:
        run_step()


if __name__ == "__main__":
    main()
Die Frage kann ich nicht beantworten. Ich kann nur aufgrund ihrer Formulierung vermuten, dass meine Vorgehensweise fragwürdig ist. Wohin sollten die Argumente denn sonst?

Zitat – __blackjack__ hat geschrieben (Dienstag 30. Juli 2019, 17:10): Macht das Sinn, die ganzen Argumente an das Objekt zu binden?
Code: Alles auswählen
# plt.savefig(os.path.join(self.output_path, self.file_name_img))
# self.table_barchart.savefig(fname=os.path.join(self.output_path, self.file_name_img))
# fig_path = os.path.join(self.output_path, self.file_name_img)
# plt.savefig(self.table_barchart, fig_path)
# plt.savefig(os.path.join(self.output_path, self.file_name_img))