Pandas - ValueError: Length of values does not match length of index
Verfasst: Sonntag 26. August 2018, 13:47
Liebe Mitglieder!
Ich programmiere aktuell ein Script, womit ich mehrere .csv Dateien auslese und Links aus einem Text extrahiere. Die extrahierten Links sollen dann in einer neuen Spalte namens 'full_URL_list' bzw. df['full_URL_list']
entstehen.
Leider erhalte ich jedoch immer folgenden Fehler:
Mein Code lautet wie folgt (Fehler verursachende Zeile wurde mit "#Hier soll laut Anaconda der Fehler liegen." markiert):
Ich suche im Internet, aber es sind immer individuelle Fälle und nicht übertragbar. Ich stehe aktuell auf dem Schlauch
Ich danke euch für die Hilfe im Voraus!
LG
pytony
Ich programmiere aktuell ein Script, womit ich mehrere .csv Dateien auslese und Links aus einem Text extrahiere. Die extrahierten Links sollen dann in einer neuen Spalte namens 'full_URL_list' bzw. df['full_URL_list']
entstehen.
Leider erhalte ich jedoch immer folgenden Fehler:
Code: Alles auswählen
ValueError Traceback (most recent call last)
<ipython-input-1-a1e40b4e67cf> in <module>()
103 full_URL_list = list(itertools.chain.from_iterable(full_URL_list))
104 print(full_URL_list)
--> 105 df['full_URL_list'] = full_URL_list
106
107 mask = df.full_URL_list.str.len() != 0
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
3114 else:
3115 # set column
-> 3116 self._set_item(key, value)
3117
3118 def _setitem_slice(self, key, value):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value)
3189
3190 self._ensure_valid_index(value)
-> 3191 value = self._sanitize_column(key, value)
3192 NDFrame._set_item(self, key, value)
3193
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _sanitize_column(self, key, value, broadcast)
3386
3387 # turn me into an ndarray
-> 3388 value = _sanitize_index(value, self.index, copy=False)
3389 if not isinstance(value, (np.ndarray, Index)):
3390 if isinstance(value, list) and len(value) > 0:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in _sanitize_index(data, index, copy)
3996
3997 if len(data) != len(index):
-> 3998 raise ValueError('Length of values does not match length of ' 'index')
3999
4000 if isinstance(data, ABCIndexClass) and not copy:
ValueError: Length of values does not match length of index
Mein Code lautet wie folgt (Fehler verursachende Zeile wurde mit "#Hier soll laut Anaconda der Fehler liegen." markiert):
Code: Alles auswählen
import pandas as pd
import re
import glob
import os
import shutil
import time
import itertools
start_time = time.time()
print("Start time is: " + str(start_time))
# Building URL Validation (Django way)
find_urls_1 = re.compile(
r'^(?:http|ftp)s?://' # Find URLS with beginnings http:// or https://...
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # and domains...
r'localhost|' # and localhost...
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|' # look for ipv4 addresses...
r'\[?[A-F0-9]*:[A-F0-9:]+\]?)' # look for ipv6 addresses...
r'(?::\d+)?' # or ip addresses with ports
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
url_function_1 = lambda x: re.findall(find_urls_1, str(x))
"""
url matching regex
http://daringfireball.net/2010/07/improved_regex_for_matching_urls
"""
"""
The regex patterns in this gist are intended to match any URLs,
including "mailto:foo@example.com", "x-whatever://foo", etc. For a
pattern that attempts only to match web URLs (http, https), see:
https://gist.github.com/gruber/8891611
"""
ANY_URL_REGEX = re.compile(r"""(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[
^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""",
re.IGNORECASE)
"""
The regex patterns in this gist are intended only to match web URLs -- http,
https, and naked domains like "example.com". For a pattern that attempts to
match all URLs, regardless of protocol, see: https://gist.github.com/gruber/249502
"""
WEB_URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
"""
This regex pattern matches IP:PORT
"""
IP_REGEX = r"""[0-9]+(?:\.[0-9]+){3}:[0-9]+"""
url_function_2 = lambda x: re.findall(ANY_URL_REGEX, str(x))
def CreateNewFolder(directory):
try:
if not os.path.exists(directory):
os.makedirs(directory)
except OSError:
print('Error when creating directory: ' + directory)
pass
Destination = r'D:/submissions_URL/'
CreateNewFolder(Destination)
# for csvfile in glob.iglob(r'/2018_01/*.csv'):
for csvfile in glob.glob(r'D:/submissions/**/*.csv'):
df = pd.read_csv(csvfile, sep=',', header=0, low_memory=False, usecols=['id',
'created_utc',
'subreddit_id',
'selftext',
'author',
'url',
'title'])
df['urllist_self_text_1'] = df.selftext.apply(url_function_1)
df['urllist_self_text_2'] = df.selftext.apply(url_function_2)
df['urllist_title_1'] = df.title.apply(url_function_1)
df['urllist_title_2'] = df.title.apply(url_function_2)
df['url_function_1'] = df['urllist_self_text_1'] + df['urllist_title_1']
df['url_function_2'] = df['urllist_self_text_2'] + df['urllist_title_2']
#PRE_full_URL_list = df['url_function_1'] + df['url_function_2']
PRE_URL_list = []
#[full_URL_list.append(url) for url in PRE_full_URL_list if not url in full_URL_list]
#print(full_URL_list)
[PRE_URL_list.append(url) for url in df['url_function_1'] if url]
[PRE_URL_list.append(url) for url in df['url_function_2'] if not url in PRE_URL_list and url]
full_URL_list = [url for sublist in PRE_URL_list for url in PRE_URL_list]
full_URL_list = list(itertools.chain.from_iterable(full_URL_list))
print(full_URL_list)
df['full_URL_list'] = full_URL_list #Hier soll laut Anaconda der Fehler liegen.
mask = df.full_URL_list.str.len() != 0
indices = df.full_URL_list[mask].index
URLs_reddit = df.loc[indices, :]
directory_name = os.path.dirname(csvfile) + '_URL'
print('Directory: ' + directory_name)
CreateNewFolder(directory_name)
Filename = os.path.basename(csvfile)
print('Filename: ' + Filename)
# new_filename = Filename + '_URLs' + '.csv'
new_filename = Filename + '_URLs' + '.csv'
URLs_reddit.to_csv(os.path.join(directory_name, new_filename))
print('New Path: ' + directory_name)
print('New File: ' + new_filename)
c = int(round(time.time() - start_time))
print("--- " + str(c) +" seconds for New File: " + new_filename)
df.drop(labels='full_URL_list')
for directory in glob.iglob(r'D:/submissions/*_URL/'):
try:
directory_url = os.path.dirname(directory)
shutil.move(directory_url, Destination)
print('Moved ' + directory_url + ' to ' + Destination)
except OSError:
print('Error when moving directory: ' + directory_url)
pass
print("Performance: %s days for the whole loop and file transfer" % int(round(((time.time() - start_time))/86400)))
Ich danke euch für die Hilfe im Voraus!
LG
pytony