Implementierung des EM-Algoirthmus

Wenn du dir nicht sicher bist, in welchem der anderen Foren du die Frage stellen sollst, dann bist du hier im Forum für allgemeine Fragen sicher richtig.
Antworten
MathGenie123
User
Beiträge: 43
Registriert: Montag 18. April 2022, 13:13

Ich soll den EM-Algorithmus implementieren und erhalte dauernd als Fehler:
ValueError: array must not contain infs or NaNs

Den Datensatz findet ihr hier: https://www.kaggle.com/datasets/varpit9 ... ings-stats


Code: Alles auswählen

from scipy import stats
import numpy as np
import pandas as pd
from pingouin import multivariate_normality
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal

rng = np.random.default_rng()

pts = 1000

a = rng.normal(0, 1, size=pts)

b = rng.normal(2, 1, size=pts)

df = pd.read_csv("Football teams.csv")


data = df.drop(['Team','Tournament'], axis=1) 


multivariate_normality(data, alpha=.5)

df.head()



# In[25]:


sns.pairplot(data, hue="Rating")


# In[ ]:





# In[87]:


data['Rating'] = np.where(df["Rating"] <= 6.6, 0, 1)
sum(data['Rating'] == 0)


# In[88]:


sns.pairplot(data, hue="Rating")


# In[108]:


K = 2 # number of clusters

means = np.zeros((K,8))
covs = np.zeros((K,8,8))
for k in range(K):
    means[k] = np.random.normal(size=(8,))
    covs[k] = np.eye(8)

weights = np.ones((K,1))/K
print("Initial mean vectors (one per row):\n" + str(means))
print(data.shape)


# In[107]:


r = np.zeros((K,98)) # will store the responsibilities
X, Y = np.meshgrid(np.linspace(-10,10,100), np.linspace(-10,10,100))
N=98
for em_iter in range(100):    
    means_old = means.copy()
    
    # E-step: update responsibilities
    for k in range(K):
        r[k] = weights[k]*multivariate_normal.pdf(mean=means[k,:], cov=covs[k,:,:], x=data)  
       
        
    r = r/np.sum(r, axis=0) 
        
    # M-step
    N_k = np.sum(r, axis=1)

    for k in range(K): 
        # update means
        means[k] = np.sum(r[k]*data.T, axis=1)/N_k[k]
        
        # update covariances
        diff = data.T - means[k:k+1].T
        _tmp = np.sqrt(r[k:k+1])*diff
        covs[k] = np.inner(_tmp, _tmp)/N_k[k]
        
    # weights
    weights = N_k/N 
    
    # log-likelihood
    gmm_nll = 0
    for k in range(K):
        gmm_nll += weights[k]*multivariate_normal.pdf(mean=means[k,:].ravel(), cov=covs[k,:,:], x=data)
    NLL += [-np.sum(np.log(gmm_nll))]
    
    plt.figure() 
    plt.plot(x, y, 'ko', alpha=0.3)
    plt.plot(means[:,0], means[:,1], 'oy', markersize=25)
    for k in range(K):
        rv = multivariate_normal(means[k,:], covs[k])
        plt.contour(X, Y, rv.pdf(pos), alpha = 1.0, zorder=10)
        
    plt.xlabel("$x_1$")
    plt.ylabel("$x_2$")
    plt.text(x=3.5, y=8, s="EM iteration "+str(em_iter+1))
    
    if la.norm(NLL[em_iter+1]-NLL[em_iter]) < 1e-6:
        print("Converged after iteration ", em_iter+1)
        break
   
# plot final the mixture model
plt.figure() 
gmm = 0
for k in range(3):
    mix_comp = multivariate_normal(means[k,:].ravel(), covs[k,:,:])
    gmm += weights[k]*mix_comp.pdf(pos)

plt.plot(data['Goals'], data['yellow_cards'], 'ko', alpha=0.3)
plt.contour(X, Y, gmm,  alpha = 1.0, zorder=10)    
plt.xlim([-8,8]);
plt.ylim([-6,6]);


# In[ ]:



Bitte dringend um Hilfe
Benutzeravatar
__blackjack__
User
Beiträge: 14067
Registriert: Samstag 2. Juni 2018, 10:21
Wohnort: 127.0.0.1
Kontaktdaten:

@MathGenie123: Das ist keine ausreichende Fehlerbeschreibung. Und das kann auch niemand nachvollziehen weil der Code undefinierte Namen enthält. Ich denke auch nicht, dass der ganze Code tatsächlich notwendig ist um das Problem zu beschreiben.
“Vir, intelligence has nothing to do with politics!” — Londo Mollari
Antworten