import pandas as pd
from pathlib import Path

# Collect the speech files. Each file is expected to be named
# "<year>_<speaker>.txt" (e.g. "2021_Steinmeier.txt").
# BUG FIX: the original line was `files = [ Rede.txt]` — an unquoted,
# undefined name that raises NameError; discover the files instead.
files = sorted(str(p) for p in Path(".").glob("*_*.txt"))

data = []
for file in files:
    with open(file, encoding="utf-8") as f:
        text = f.read()
    # "2021_Steinmeier.txt" -> year "2021", speaker "Steinmeier"
    # (split on the FIRST underscore only, so speaker names may contain "_")
    year, speaker = Path(file).name.replace(".txt", "").split("_", 1)
    data.append({
        "Jahr": int(year),
        "Sprecher": speaker,
        "Text": text,
    })

# One row per speech: year, speaker, full text.
Neujahrsansprachen = pd.DataFrame(data)
Neujahrsansprachen.head()
# NOTE(review): these imports appear AFTER pandas is already used above —
# this only works in a notebook where cells were executed out of order;
# as a plain script the imports must move to the top of the file.
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
# Load German stop words (downloads the NLTK corpus on first run)
nltk.download("stopwords")
german_stop_words = set(stopwords.words("german"))
# Load the SentiWS sentiment lexicon
def load_sentiws_simple(path):
    """Load a simplified SentiWS CSV into a ``{word: score}`` dict.

    The file's first line is treated as a header; the word is read from the
    second column and its polarity score from the third. Words are
    lower-cased; a word appearing twice keeps its last score.

    FIX: the original accessed ``row[1]`` / ``row[2]`` on ``iterrows()``
    Series objects — positional integer indexing on a Series is deprecated
    and removed in pandas 3.x; use explicit positional ``iloc`` instead.
    """
    # sep=None + the python engine lets pandas sniff the delimiter.
    df = pd.read_csv(path, sep=None, engine="python", header=0)
    senti = {}
    for word, score in zip(df.iloc[:, 1], df.iloc[:, 2]):
        senti[str(word).lower()] = float(score)
    return senti
# Build the combined lexicon. Positive entries are merged in last, so a
# word listed in both files keeps its positive score.
senti_neg = load_sentiws_simple("SentiWS_ML_negativ.csv")
senti_pos = load_sentiws_simple("SentiWS_ML_positiv.csv")
senti_dict = dict(senti_neg)
senti_dict.update(senti_pos)
print("Wörter im Lexikon:", len(senti_dict))
# ---------------------------------------------------------
# Tokenisation
# ---------------------------------------------------------
def tokenize(text):
    """Lower-case *text*, extract alphabetic runs (incl. German umlauts
    and ß), and drop German stop words."""
    candidates = re.findall(r"[a-zA-ZäöüÄÖÜß]+", text.lower())
    return [word for word in candidates if word not in german_stop_words]
# Tokenise every speech once; "Tokens" holds a list of stop-word-free words.
Neujahrsansprachen["Tokens"] = Neujahrsansprachen["Text"].apply(tokenize)
# Sentiment per speech
def sentiws_score(tokens):
    """Return the summed SentiWS polarity of *tokens*.

    Words missing from the lexicon contribute 0 to the total.
    """
    scores = (senti_dict.get(token, 0) for token in tokens)
    return sum(scores)
# Score every speech; an empty token list yields 0 (empty sum).
Neujahrsansprachen["SentiWS_Score"] = Neujahrsansprachen["Tokens"].apply(sentiws_score)
# Average sentiment per speaker, most positive first
grouped_scores = Neujahrsansprachen.groupby("Sprecher")["SentiWS_Score"]
sentiment_by_speaker = grouped_scores.mean().sort_values(ascending=False)
sentiment_by_speaker
