String-Operation

__blackjack__ · Mittwoch 28. August 2019, 10:47

@Strawk: Dann musst Du das wohl im Skript deutlich machen. Der Hilfetext zu ``-b``/``--builtin`` von ``kernprof`` listet die verschiedenen Möglichkeiten ja auf. Neben dem Dekorator könntest Du zum Beispiel am Anfang auch ein ``profile.enable()`` einbauen.

Edit: Vielleicht sogar per ``-s``/``--setup`` ausserhalb des Quelltextes.

Strawk · Mittwoch 28. August 2019, 13:29

Hallo!
Einer der Flaschenhälse scheint dieser Befehl zu sein:

Code: Alles auswählen

# Build the table from ``res`` with explicit column labels.
dataframe = pd.DataFrame(
    res,
    columns=[
        "Router Name",
        "Bandwidth (KB/s)",
        "Uptime (days)",
        "Longitude",
        "Latitude",
        "Hostname",
        "IP",
        "Exit Server",
        "Directory Server",
        "Fast Server",
        "Guard Server",
        "Stable Server",
        "Tor Version",
        "Tor Operating System",
        "ORPort",
        "DirPort",
        "First Seen",
        "ASName",
    ],
)

Er verbrät 35,5 % der Zeit. Ich frage mich, wieso das so ist und wie ich das ändern kann.

Konnte die Zeit halbieren: Der Befehl war in der Schleife; da gehört er nicht hin!

Sirius3 · Mittwoch 28. August 2019, 14:52

@Strawk: da niemand außer Dir Deinen Code kennt, ist es schwierig, zu sagen, warum das die meiste Zeit verbrät.

Strawk · Samstag 31. August 2019, 09:47

Hallo!
Ich möchte aus einem DataFrame eine Liste erzeugen, die die zweite und die dritte Spalte des DataFrames enthält; Zeilen ohne Werte (None, NaN) sollen dabei verworfen werden.
Grüße M.
Hier meine bisherigen Versuche (1,5 h):

Code: Alles auswählen

# tor_df_lon_lat = tor_df.dropna(subset=['mlon'], ['mlat'])
        # tor_df_lon_lat = tor_df["mlon", "mlat"].notnull()
        # tor_df_lon_lat = tor_df.notnull(["mlon", "mlat"])
        # tor_df_lon_lat = tor_df["mlon", "mlat"].notnull()
        # tor_df_lon_lat = tor_df.dropna(axis=0, subset=("mlon", "mlat"))
        # tor_df_lon_lat = tor_df.dropna(axis=0, how='any', subset=('mlon', ))
        # tor_df_lon_lat = tor_df.dropna(axis=0, how='any', subset=['mlon', 'mlat'])
        # lon_lat_list = tor_df_lon_lat["mlon", "mlat"].tolist()
        # print(tor_df_lon_lat)
        # print(type(tor_df_lon_lat))
        # print(lon_lat_list)
        # print(len(lon_lat_list))

Sirius3 · Samstag 31. August 2019, 10:04

Auch hier helfen wieder Beispiel-Inputdaten und gewünschtes Ergebnis.
Was z.B. funktioniert an ›dropna‹ nicht (wenn man davon absieht, dass man erstmal gültiges Python schreiben sollte)?

ThomasL · Samstag 31. August 2019, 10:12

Ich würde es so machen:

Code: Alles auswählen

# Keep only the two coordinate columns, drop every row that has a missing
# value in either of them, and take the underlying array.
# NOTE(review): ``.values`` yields an array, not a ``list`` -- confirm that
# this is what the caller needs.
lon_lat_list = tor_df[["mlon", "mlat"]].dropna().values

__blackjack__ · Montag 14. Oktober 2019, 13:56

Hier mal so ziemlich alles herausgezogen, was dort in der Tabelle steht:

Code: Alles auswählen

#!/usr/bin/env python3
from datetime import datetime as DateTime
from functools import partial
from pathlib import Path
from urllib.parse import parse_qsl, urlparse

import bs4
import pandas
import requests

# Factor by which an uptime value with the given unit suffix is multiplied to
# express it in days ("d" and "h" are presumably days and hours).
DURATION_UNIT_TO_DAY_FACTOR = {"d": 1, "h": 1 / 24}


def parse_string(name, node):
    """Return ``{name: text}`` for the whitespace-stripped text of *node*.

    If nothing is left after stripping, the value is ``None`` instead of an
    empty string.
    """
    text = node.text.strip()
    if not text:
        text = None
    return {name: text}


def parse_int(name, node, na_value=""):
    """Return ``{name: number}`` parsed from the stripped text of *node*.

    When the stripped text equals *na_value* the value is ``None``.  Any
    other text is passed to ``int()``, which raises ``ValueError`` if it is
    not a valid integer.
    """
    stripped = node.text.strip()
    if stripped == na_value:
        return {name: None}
    return {name: int(stripped)}


def parse_date(name, node):
    """Return ``{name: date}`` parsed from the stripped text of *node*.

    The text has to match the format ``%Y-%m-%d``; otherwise ``strptime``
    raises ``ValueError``.
    """
    text = node.text.strip()
    timestamp = DateTime.strptime(text, "%Y-%m-%d")
    return {name: timestamp.date()}


def parse_router_name_cell(node):
    """Extract country, coordinates and router name from a table cell.

    *node* must contain exactly two direct ``a`` children.  The first one
    supplies the country (``alt`` text of its image) and, through the
    ``mlon``/``mlat`` query parameters of its ``href``, the coordinates.
    The second one supplies the router name as its text.

    Both coordinates are ``None`` if either query parameter is missing.
    Empty country or router name strings are reported as ``None``.
    """
    country_link, router_link = node("a", recursive=False)
    query = dict(parse_qsl(urlparse(country_link["href"]).query))
    try:
        # Longitude is converted first, then latitude.
        coordinates = [float(query[key]) for key in ("mlon", "mlat")]
    except KeyError:
        coordinates = [None, None]
    longitude, latitude = coordinates

    country = country_link.img["alt"] or None
    router_name = router_link.text.strip() or None
    return {
        "country": country,
        "longitude": longitude,
        "latitude": latitude,
        "router_name": router_name,
    }


def parse_uptime_cell(node):
    """Return ``{"uptime": days}`` for a cell of the form ``"<value> <unit>"``.

    The integer value is scaled by the factor that
    ``DURATION_UNIT_TO_DAY_FACTOR`` holds for the unit.

    Raises ``ValueError`` if the stripped text contains no space (or the
    value is not an integer) and ``KeyError`` for a unit without a factor.
    """
    amount, separator, unit = node.text.strip().partition(" ")
    if separator != " ":
        raise ValueError(f"expected space between value and unit in {node}")
    # Convert the number before looking up the unit so that a bad value is
    # reported ahead of a bad unit.
    count = int(amount)
    return {"uptime": count * DURATION_UNIT_TO_DAY_FACTOR[unit]}


def parse_flags_and_operating_system(node):
    """Derive server flags and software information from image descriptions.

    The ``alt`` text of every ``img`` found via ``node("img")`` is inspected:

    * A text ending in ``" Server"`` sets the flag ``is_<first word>`` (lower
      case) to ``True``.  Known flags are authority, directory, exit, fast,
      guard and stable; they all start out as ``False``.
    * At most one other text is allowed.  It is split at the last ``" on "``
      into ``tor_software`` and ``operating_system``.  Both are ``None`` if
      there is no such text.

    Raises ``ValueError`` for an unknown flag, for more than one non-flag
    description, or if the remaining description contains no ``" on "``.
    """
    known_flags = ("authority", "directory", "exit", "fast", "guard", "stable")
    flags = {f"is_{flag}": False for flag in known_flags}
    other_descriptions = []
    for image in node("img"):
        description = image["alt"]
        if not description.endswith(" Server"):
            other_descriptions.append(description)
            continue

        first_word, separator, _ = description.partition(" ")
        if separator != " ":
            raise ValueError(f"expected space in {description}")
        key = f"is_{first_word.lower()}"
        if key not in flags:
            raise ValueError(f"unknown flag {key}")
        flags[key] = True

    if len(other_descriptions) > 1:
        raise ValueError(
            f"expected just the tor software and operating system name in"
            f" {other_descriptions}"
        )

    tor_software = operating_system = None
    if other_descriptions:
        (description,) = other_descriptions
        tor_software, delimiter, operating_system = description.rpartition(
            " on "
        )
        if not delimiter:
            raise ValueError(f"missing 'on' in {description}")

    flags["tor_software"] = tor_software
    flags["operating_system"] = operating_system
    return flags


def parse_host_name_cell(node):
    """Extract host name, IP, flags and software info from a table cell.

    The first two stripped strings of the nested ``td`` found with
    ``node.find("td", "iT")`` supply the values: the first
    whitespace-separated word of the first string is the host name, the
    second string is the IP.  The flag and software entries come from
    ``parse_flags_and_operating_system`` applied to the whole *node*.
    """
    text_parts = node.find("td", "iT").stripped_strings
    # The strings are consumed in document order: host name, then IP.
    host_name = next(text_parts).split(None, 1)[0]
    ip_address = next(text_parts)
    return {
        "host_name": host_name,
        "ip": ip_address,
        **parse_flags_and_operating_system(node),
    }


def parse_bad_exit_cell(node):
    """Return ``{"is_bad_exit": bool}`` based on the CSS classes of *node*.

    The class ``F1`` means ``True`` and the class ``F0`` means ``False``.

    Raises ``ValueError`` if both classes or neither of them are present.
    """
    css_classes = node["class"]
    has_f0 = "F0" in css_classes
    has_f1 = "F1" in css_classes
    if has_f0 and has_f1:
        raise ValueError(
            f"ambigious css classes in bad exit cell: {css_classes}"
        )
    if not (has_f0 or has_f1):
        raise ValueError(f"unknown css classes for bad exit: {css_classes}")

    # Exactly one of the two classes is present at this point.
    return {"is_bad_exit": has_f1}


def parse_row(node):
    """Parse one table row into a flat dictionary.

    The direct ``td`` children of *node* are paired positionally with one
    parser each.  Every parser returns a dictionary and all of them are
    merged into the result.  ``zip`` stops at the shorter sequence, so
    surplus cells or surplus parsers are ignored.
    """
    # One parser per column, in column order.
    cell_parsers = (
        parse_router_name_cell,
        partial(parse_int, "bandwith"),
        parse_uptime_cell,
        parse_host_name_cell,
        partial(parse_int, "or_port"),
        partial(parse_int, "dir_port", na_value="None"),
        parse_bad_exit_cell,
        partial(parse_date, "first_seen_on"),
        partial(parse_string, "as_name"),
        partial(parse_int, "as_number"),
        partial(parse_int, "consensus_bandwidth"),
        partial(parse_string, "or_address"),
    )
    cells = node("td", recursive=False)

    row = {}
    for parse_cell, cell in zip(cell_parsers, cells):
        row.update(parse_cell(cell))
    return row


def parse_page(soup):
    """Return a list with one parsed dictionary per data row of the page.

    The rows are the ``tr`` elements matched with ``"r"`` inside the first
    ``table`` matched with ``"displayTable"``; each one is handed to
    ``parse_row``.
    """
    table = soup.find("table", "displayTable")
    return [parse_row(row) for row in table("tr", "r")]


def main():
    """Download the Tor status page, parse its table and print a summary.

    The page is fetched over HTTP, parsed into a ``pandas.DataFrame`` with
    one row per table row, and the frame, its ``info()`` report and its
    first rows are printed.

    Raises ``requests.RequestException`` (including ``requests.HTTPError``
    for an error status and ``requests.Timeout`` if the server does not
    answer in time).
    """
    # For offline work the saved page can be read instead:
    # html_source = Path("Tor.htm").read_bytes()

    # Without a timeout ``requests`` may wait forever on an unresponsive
    # server.
    response = requests.get("https://torstatus.blutmagie.de/", timeout=30)
    response.raise_for_status()
    html_source = response.text

    soup = bs4.BeautifulSoup(html_source, "lxml")
    exit_nodes = pandas.DataFrame(parse_page(soup))
    print(exit_nodes)
    # ``DataFrame.info()`` writes its report to stdout itself and returns
    # ``None``; wrapping it in ``print()`` would add a stray "None" line.
    exit_nodes.info()
    print(exit_nodes.head())


# Run the scraper only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()