Da ich ein blutiger Anfänger bin, würde ich mich über eure Hilfe sehr freuen.
Ich versuche, die Online-Apotheke mit Hilfe von Python zu crawlen. Leider sind die Daten sehr verkapselt und ich bekomme sie nicht ausgedruckt.
Ich crawle mit Hilfe von Tampermonkey die URLs der einzelnen Produkte.
Diese füge ich in eine csv Datei ein.
Code von Tampermonkey:
Code: Alles auswählen
// ==UserScript==
// @name aponeo
// @namespace http://tampermonkey.net/
// @version 0.1
// @description aponeo
// @author Nexuz89
// @match https://www.aponeo.de/*
// @grant none
// ==/UserScript==
(function () {
    'use strict';

    // Give the page a second to render before injecting the export button.
    window.setTimeout(installButton, 1000);

    // Adds a fixed "Copy" button in the top-left corner that exports the
    // visible product list to the clipboard as CSV.
    function installButton() {
        const btn = document.createElement('button');
        btn.innerText = 'Copy';
        btn.style = 'z-index: 100000; position:fixed; top:0; left:0';
        document.body.appendChild(btn);
        btn.addEventListener('click', () => exportCsv());
    }

    // Collects all product items, converts each to {title, price, url} and
    // copies the result to the clipboard as one CSV line per product.
    function exportCsv() {
        // BUG FIX: "div['apn-product-list-item'" is not valid CSS (unbalanced
        // bracket, quoted attribute name) and made querySelectorAll throw.
        // Select by class name instead.
        // TODO(review): confirm the items carry this as a class, not an attribute.
        const products = Array.from(document.querySelectorAll('div.apn-product-list-item'));
        const csv = products
            .map(productObject)
            .map(p => '"' + p.title + '",' + p.price + ',"' + p.url + '"')
            .join('\n');
        console.log(csv);
        navigator.clipboard.writeText(csv)
            .then(_ => window.alert('In Zwischenablage kopiert'),
                  _ => window.alert('Fehler beim Kopieren in Zwischenablage'));
    }

    // Maps one product list item element to the record written into the CSV.
    function productObject(e) {
        const result = {};
        result.url = e.querySelector('div[data-url]').dataset.url;
        // BUG FIX: title and price were never assigned, so every CSV row
        // contained the string "undefined". Use harmless placeholders until
        // the real selectors are known.
        // TODO(review): fill in the actual title/price selectors for aponeo.de.
        result.title = (e.textContent || '').trim();
        result.price = '';
        return result;
    }
})();
Python CODE:
Code: Alles auswählen
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
from time import sleep
def connect():
    """Return True if an outbound HTTP request succeeds (i.e. we appear online)."""
    try:
        # Any stable host works as a connectivity probe; a timeout keeps the
        # probe from hanging forever when the network is half-up.
        requests.get('http://google.com', timeout=5)
        return True
    except requests.exceptions.RequestException:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt and made the script impossible to Ctrl-C
        # while offline.
        return False
def gernate_file():
    """Create Results.csv containing only the header row (overwrites any previous run)."""
    # NOTE(review): 'Article Number' appears twice on purpose — the first
    # column holds the running index written by the main loop, the last one
    # the article number scraped from the product page.
    headers_text = [
        'Article Number',
        'Title',
        'Amount',
        'Weight',
        'Price',
        'Categorie',
        'Link',
        'Image Url',
        'Article Number',
    ]
    # utf-8-sig adds a BOM so Excel opens the file with correct umlauts.
    df = pd.DataFrame([], columns=headers_text)
    df.to_csv('Results.csv', index=False, encoding='utf-8-sig')
def hasNumbers(inputString):
    """Return True if *inputString* contains at least one decimal digit."""
    for ch in inputString:
        if ch.isdigit():
            return True
    return False
# url='https://www.docmorris.de/orthomol-mental/05382070'
# url='https://www.docmorris.de/antiveno-heumann-venentabletten/11050136'
def profileScraper(url, index):
    """Scrape one product page and return a row for Results.csv.

    Parameters:
        url:   product page URL to fetch
        index: running number written into the first column
    Returns:
        A 9-element list matching the Results.csv header, or 0 when the
        product title cannot be found (original contract, kept so callers
        can skip the row).
    """
    # Block until we have connectivity; requests.get would otherwise raise.
    while not connect():
        print('no internet')
        sleep(5)

    res = requests.get(url)
    soup = BeautifulSoup(res.content, features='html.parser')

    # Defaults so the final return never hits an unbound name. The original
    # crashed with NameError (it assigned `Title` but used `title`) and left
    # `img_url_text`/`article_no`/`Categories` unbound whenever a selector
    # found nothing.
    title = None
    weight = 'N/A'
    price = 'N/A'
    categories = 'N/A'
    img_url_text = 'N/A'
    article_no = 'N/A'

    # --- Title -------------------------------------------------------------
    title_container = soup.find('div', class_='col-md-8')
    if title_container is not None:
        description = title_container.find('div', class_='product-description')
        if description is not None:
            # TODO(review): find('', class_='') is a placeholder selector —
            # the intended tag/class still needs to be filled in.
            art = description.find('', class_='')
            if art is not None:
                title = art.text.strip()
    if title is None:
        # Same contract as the original early `return 0`.
        return 0
    print(title)

    # --- Weight ------------------------------------------------------------
    weight_tag = soup.find('', class_='')  # TODO(review): placeholder selector
    if weight_tag is not None:
        weight = weight_tag.text.strip()

    # --- Price -------------------------------------------------------------
    price_box = soup.find('table', class_='price-box')
    if price_box is not None:
        price_tag = price_box.find('span', class_='price')
        if price_tag is not None:
            price = price_tag.text.strip()

    # --- Categories (breadcrumb) -------------------------------------------
    breadcrumbs = soup.find_all('ul', class_='breadcrumb-list')
    if len(breadcrumbs) > 3:
        categories = breadcrumbs[3].text.strip()
    elif len(breadcrumbs) >= 2:
        # Guarded: the original indexed [-2] even when the list was empty.
        categories = breadcrumbs[-2].text.strip()

    # --- Amount: last numeric token in the title before 'Stück' ------------
    amount = title.split('Stück')[0]
    for token in amount.split(' '):
        if hasNumbers(token):
            amount = token
    if not hasNumbers(amount):
        summary = soup.find('span', class_='productConfiguration__summary')
        if summary is not None:
            amount = summary.text.split(':')[1].split('x')[0].strip()
        else:
            amount = '1 Pack'
    print(amount)

    # --- Image URL ----------------------------------------------------------
    img_container = soup.find('div', class_='col-md-4')
    if img_container is not None:
        # NOTE(review): the original read ['src'] off the <div>, which does
        # not carry a src attribute (KeyError). Read the first <img> inside
        # it instead — confirm against the live page.
        img = img_container.find('img')
        if img is not None and img.get('src'):
            img_url_text = img['src'].replace('//', '')
            print(img_url_text)

    # --- Article number ------------------------------------------------------
    info_panel = soup.find('strong', class_='additional-information-panel-body')
    if info_panel is not None:
        table = info_panel.find('table', class_='table-dotted')
        if table is not None:
            cell = table.find('td', class_='')
            if cell is not None:
                article_no = cell.text.strip()
                print(article_no)

    # Leading backtick keeps Excel from re-formatting the amount (original behaviour).
    return [index, title, '`' + amount, weight, price, categories, url, img_url_text, article_no]
# Read the product URLs collected by the Tampermonkey script; the link sits
# in the 7th column (index 6) of lidl.csv.
with open('lidl.csv', 'r', encoding='utf-8') as readFile:
    reader = csv.reader(readFile)
    file_lines = list(reader)

# Recreate Results.csv with only the header, then append one row per product.
gernate_file()
for index, row in enumerate(file_lines[1:]):  # skip the header row
    print(index)
    record = profileScraper(row[6], index + 1)
    # BUG FIX: profileScraper returns 0 when the page could not be parsed;
    # the original still appended DataFrame([0]) — a bogus single-cell row.
    if record:
        df = pd.DataFrame([record])
        df.to_csv('Results.csv', index=False, mode='a', encoding='utf-8-sig', header=False)
    print()
    print()
Evtl. könnte mir jemand einen Tipp geben, das wäre sehr hilfreich für mich.
CSV Datei:
Article Number,Title,Amount,Weight,Price,Categorie,Link,Image Url,Article Number
,,,,,,https://www.docmorris.de/cordyceps-kaps ... t/09640592,,
,,,,,,https://www.docmorris.de/heparin-ratiop ... 0/03892335,,
,,,,,,https://www.docmorris.de/heparin-60000- ... e/07466948,,
,,,,,,https://www.docmorris.de/venostasin-gel/04766785,,
,,,,,,https://www.docmorris.de/vitamin-b12-lu ... t/11161255,,
,,,,,,https://www.docmorris.de/antistax-frisch-gel/08913131,,
Vielen dank