Sentimentanalyse mit SentiWS
Verfasst: Montag 6. April 2026, 16:42
Hallo
ich möchte Reden mit einer Sentimentanalyse auswerten (SentiWS - https://www.kaggle.com/datasets/sibeliu ... ositiv.csv). Ich habe schon Code und möchte gerne wissen, ob und wie ich den reduzieren kann, da ich das Gefühl habe, dass der Code viel zu umständlich ist. Ich möchte als Ergebnis haben, ob das Sentiment je Rede eines Sprechers negativ (negative Zahl), positiv (positive Zahl) oder neutral (0) ist (Werte werden in Zahlen angegeben). Das habe ich bereits:
import pandas as pd  # moved up: pd was used here before the import further below

# Read each speech file into a record of year, speaker and full text.
# Filenames must follow the pattern "<Jahr>_<Sprecher>.txt"
# (e.g. "2024_Steinmeier.txt"), because both fields are parsed from the name.
files = ["2024_Steinmeier.txt"]  # fixed: original had a bare name ([ Rede.txt]) -> NameError

data = []
for file in files:
    with open(file, encoding="utf-8") as f:
        text = f.read()
    # "<year>_<speaker>.txt" -> (year, speaker); split only on the first "_"
    # so speaker names containing underscores stay intact.
    year, speaker = file.replace(".txt", "").split("_", 1)
    data.append({
        "Jahr": int(year),
        "Sprecher": speaker,
        "Text": text,
    })

Neujahrsansprachen = pd.DataFrame(data)
Neujahrsansprachen.head()
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
# Load German stop words via NLTK (downloads the "stopwords" corpus on first run).
# NOTE(review): these imports run after pandas has already been used above —
# they should be moved to the very top of the script.
nltk.download("stopwords")
german_stop_words = set(stopwords.words("german"))
# Load the SentiWS lexicon
def load_sentiws_simple(path):
    """Load a SentiWS CSV and return a dict mapping lowercased word -> score.

    The file is expected to have at least three columns, with the word in the
    second column and the numeric sentiment score in the third; column names
    are ignored (delimiter is auto-detected).
    """
    df = pd.read_csv(path, sep=None, engine="python", header=0)
    # Positional .iloc access instead of row[1]/row[2]: integer keys on a
    # label-indexed Series rely on a deprecated positional fallback in pandas.
    # Vectorized build also avoids the slow per-row iterrows() loop.
    words = df.iloc[:, 1].astype(str).str.lower()
    scores = df.iloc[:, 2].astype(float)
    return dict(zip(words, scores))
# Merge the negative and positive SentiWS lexicons into one lookup table
# (on a duplicate word, the positive entry wins — same as {**neg, **pos}).
senti_neg = load_sentiws_simple("SentiWS_ML_negativ.csv")
senti_pos = load_sentiws_simple("SentiWS_ML_positiv.csv")
senti_dict = dict(senti_neg)
senti_dict.update(senti_pos)
print("Wörter im Lexikon:", len(senti_dict))
# ---------------------------------------------------------
# Tokenisation
# ---------------------------------------------------------
def tokenize(text):
    """Lowercase *text*, extract German word tokens, and drop stop words."""
    tokens = re.findall(r"[a-zA-ZäöüÄÖÜß]+", text.lower())
    kept = []
    for token in tokens:
        if token not in german_stop_words:
            kept.append(token)
    return kept
# Tokenize every speech once and keep the token lists in their own column.
Neujahrsansprachen["Tokens"] = Neujahrsansprachen["Text"].map(tokenize)
# Sentiment per speech
def sentiws_score(tokens):
    """Sum the SentiWS scores of *tokens*; words not in the lexicon count as 0."""
    total = 0
    for word in tokens:
        total += senti_dict.get(word, 0)
    return total
# Score each speech, then average the scores per speaker (highest first).
Neujahrsansprachen["SentiWS_Score"] = Neujahrsansprachen["Tokens"].map(sentiws_score)

# Sentiment per speaker
score_groups = Neujahrsansprachen.groupby("Sprecher")["SentiWS_Score"]
sentiment_by_speaker = score_groups.mean().sort_values(ascending=False)
sentiment_by_speaker
import pandas as pd  # moved up: pd was used here before the import further below

# Read each speech file into a record of year, speaker and full text.
# Filenames must follow the pattern "<Jahr>_<Sprecher>.txt"
# (e.g. "2024_Steinmeier.txt"), because both fields are parsed from the name.
files = ["2024_Steinmeier.txt"]  # fixed: original had a bare name ([ Rede.txt]) -> NameError

data = []
for file in files:
    with open(file, encoding="utf-8") as f:
        text = f.read()
    # "<year>_<speaker>.txt" -> (year, speaker); split only on the first "_"
    # so speaker names containing underscores stay intact.
    year, speaker = file.replace(".txt", "").split("_", 1)
    data.append({
        "Jahr": int(year),
        "Sprecher": speaker,
        "Text": text,
    })

Neujahrsansprachen = pd.DataFrame(data)
Neujahrsansprachen.head()
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
# Load German stop words via NLTK (downloads the "stopwords" corpus on first run).
# NOTE(review): these imports run after pandas has already been used above —
# they should be moved to the very top of the script.
nltk.download("stopwords")
german_stop_words = set(stopwords.words("german"))
# Load the SentiWS lexicon
def load_sentiws_simple(path):
    """Load a SentiWS CSV and return a dict mapping lowercased word -> score.

    The file is expected to have at least three columns, with the word in the
    second column and the numeric sentiment score in the third; column names
    are ignored (delimiter is auto-detected).
    """
    df = pd.read_csv(path, sep=None, engine="python", header=0)
    # Positional .iloc access instead of row[1]/row[2]: integer keys on a
    # label-indexed Series rely on a deprecated positional fallback in pandas.
    # Vectorized build also avoids the slow per-row iterrows() loop.
    words = df.iloc[:, 1].astype(str).str.lower()
    scores = df.iloc[:, 2].astype(float)
    return dict(zip(words, scores))
# Merge the negative and positive SentiWS lexicons into one lookup table
# (on a duplicate word, the positive entry wins — same as {**neg, **pos}).
senti_neg = load_sentiws_simple("SentiWS_ML_negativ.csv")
senti_pos = load_sentiws_simple("SentiWS_ML_positiv.csv")
senti_dict = dict(senti_neg)
senti_dict.update(senti_pos)
print("Wörter im Lexikon:", len(senti_dict))
# ---------------------------------------------------------
# Tokenisation
# ---------------------------------------------------------
def tokenize(text):
    """Lowercase *text*, extract German word tokens, and drop stop words."""
    tokens = re.findall(r"[a-zA-ZäöüÄÖÜß]+", text.lower())
    kept = []
    for token in tokens:
        if token not in german_stop_words:
            kept.append(token)
    return kept
# Tokenize every speech once and keep the token lists in their own column.
Neujahrsansprachen["Tokens"] = Neujahrsansprachen["Text"].map(tokenize)
# Sentiment per speech
def sentiws_score(tokens):
    """Sum the SentiWS scores of *tokens*; words not in the lexicon count as 0."""
    total = 0
    for word in tokens:
        total += senti_dict.get(word, 0)
    return total
# Score each speech, then average the scores per speaker (highest first).
Neujahrsansprachen["SentiWS_Score"] = Neujahrsansprachen["Tokens"].map(sentiws_score)

# Sentiment per speaker
score_groups = Neujahrsansprachen.groupby("Sprecher")["SentiWS_Score"]
sentiment_by_speaker = score_groups.mean().sort_values(ascending=False)
sentiment_by_speaker