Aktuell habe ich folgenden Code am laufen, der funktioniert und plausible Ergebnisse liefert:
Code: Alles auswählen
from __future__ import absolute_import, division, print_function
import pandas as pd
import numpy as np
from matplotlib import pyplot as plp
from sklearn import preprocessing
from sklearn.cluster import KMeans
import sys
def extract_articles(data, article_numbers):
return pd.DataFrame(
[
data[data['ARTICLENO'] == article_no]['QUANTITY'].values
for article_no in article_numbers
]
).fillna(0)
def read_csv_file(file_name, number_of_lines):
return pd.read_csv(file_name, parse_dates=['DATE'], nrows=number_of_lines)
def get_unique_article_numbers(data):
return data['ARTICLENO'].unique()
def main():
data = read_csv_file('statistics.csv', 20000)
data['DATE'] = data['DATE'].astype(int)
modeling_article_numbers = get_unique_article_numbers(data)
modeling_data = extract_articles(data, modeling_article_numbers)
modeling_data = modeling_data.iloc[:, :50]
normalized_modeling_data = preprocessing.normalize(modeling_data, norm='l2')
predicting_article_numbers = [430079229, 430079854, 430086845]
predicting_article_data = extract_articles(data, predicting_article_numbers)
normalized_predicting_article_data = preprocessing.normalize(
predicting_article_data, norm='l1'
)
# print("Predicting data:", normalized_predicting_article_data)
print("Size of whole data dataframe:", sys.getsizeof(data), "Bytes")
# print(normalized_predicting_article_data)
kmeans = KMeans(n_clusters=4, random_state=0).fit(normalized_modeling_data)
print(kmeans.labels_)
# for data, article_no in [
# (normalized_predicting_article_data, 430079229),
# (normalized_predicting_article_data, 430079854)
# ]:
# print('Predicting article {0}'.format(article_no))
# print(kmeans.predict(data[0]))
for i, cluster_center in enumerate(kmeans.cluster_centers_):
plp.plot(cluster_center, label='Center {0}'.format(i))
plp.legend(loc='best')
plp.show()
main()
Ich muss noch die Liste der Vorhersagedaten and die Modelldaten anpassen.
Es muss gelten:
foreach model in normalized_modeling_data:
foreach predicting in normalized_predicting_data:
len(model) == len(predicting)
Verstehst du was ich meine? Alle Unterlisten müssen global die gleiche Länge haben.