Ich programmiere aktuell ein Script, mit dem ich mehrere .csv-Dateien auslese und Links aus einem Text extrahiere. Die extrahierten Links sollen anschließend in einer neuen Spalte namens 'full_URL_list' (also df['full_URL_list']) gespeichert werden.
Leider erhalte ich jedoch immer folgenden Fehler:
Code: Alles auswählen
ValueError Traceback (most recent call last)
<ipython-input-1-a1e40b4e67cf> in <module>()
103 full_URL_list = list(itertools.chain.from_iterable(full_URL_list))
104 print(full_URL_list)
--> 105 df['full_URL_list'] = full_URL_list
106
107 mask = df.full_URL_list.str.len() != 0
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
3114 else:
3115 # set column
-> 3116 self._set_item(key, value)
3117
3118 def _setitem_slice(self, key, value):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value)
3189
3190 self._ensure_valid_index(value)
-> 3191 value = self._sanitize_column(key, value)
3192 NDFrame._set_item(self, key, value)
3193
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _sanitize_column(self, key, value, broadcast)
3386
3387 # turn me into an ndarray
-> 3388 value = _sanitize_index(value, self.index, copy=False)
3389 if not isinstance(value, (np.ndarray, Index)):
3390 if isinstance(value, list) and len(value) > 0:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in _sanitize_index(data, index, copy)
3996
3997 if len(data) != len(index):
-> 3998 raise ValueError('Length of values does not match length of ' 'index')
3999
4000 if isinstance(data, ABCIndexClass) and not copy:
ValueError: Length of values does not match length of index
Mein Code lautet wie folgt (Fehler verursachende Zeile wurde mit "#Hier soll laut Anaconda der Fehler liegen." markiert):
Code: Alles auswählen
import pandas as pd
import re
import glob
import os
import shutil
import time
import itertools
start_time = time.time()
print("Start time is: " + str(start_time))

# URL validation pattern adapted from Django's URLValidator.  It is anchored
# with ^...$ (and no re.MULTILINE), so it only matches when the WHOLE value
# is a single URL -- it will not find URLs embedded in free text.
find_urls_1 = re.compile(
    r'^(?:http|ftp)s?://'  # http://, https://, ftp:// or ftps://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # ...or localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or an IPv4 address...
    r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or a (loosely matched) IPv6 address
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)


def url_function_1(x):
    """Return all find_urls_1 matches in str(x) (list of strings).

    PEP 8: a named function instead of ``name = lambda ...``.  Because the
    pattern is anchored, the result is either [] or a single-element list
    containing the whole input.
    """
    return re.findall(find_urls_1, str(x))
"""
url matching regex
http://daringfireball.net/2010/07/improved_regex_for_matching_urls
"""
"""
The regex patterns in this gist are intended to match any URLs,
including "mailto:foo@example.com", "x-whatever://foo", etc. For a
pattern that attempts only to match web URLs (http, https), see:
https://gist.github.com/gruber/8891611
"""
ANY_URL_REGEX = re.compile(r"""(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[
^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""",
re.IGNORECASE)
"""
The regex patterns in this gist are intended only to match web URLs -- http,
https, and naked domains like "example.com". For a pattern that attempts to
match all URLs, regardless of protocol, see: https://gist.github.com/gruber/249502
"""
WEB_URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\
b/?(?!@)))"""
"""
This regex pattern matches IP:PORT
"""
IP_REGEX = r"""[0-9]+(?:\.[0-9]+){3}:[0-9]+"""
url_function_2 = lambda x: re.findall(ANY_URL_REGEX, str(x))
def CreateNewFolder(directory):
    """Create *directory* (including missing parents) if it does not exist.

    Uses ``os.makedirs(..., exist_ok=True)`` instead of the race-prone
    exists-then-create check.  Errors are reported on stdout rather than
    raised so one bad path cannot abort the whole batch run.
    """
    try:
        os.makedirs(directory, exist_ok=True)
    except OSError:
        print('Error when creating directory: ' + directory)
# Root folder that collects the generated *_URL directories; created up front
# so the final move loop has a valid target.
Destination = r'D:/submissions_URL/'
CreateNewFolder(Destination)
# for csvfile in glob.iglob(r'/2018_01/*.csv'):
for csvfile in glob.glob(r'D:/submissions/**/*.csv'):
df = pd.read_csv(csvfile, sep=',', header=0, low_memory=False, usecols=['id',
'created_utc',
'subreddit_id',
'selftext',
'author',
'url',
'title'])
df['urllist_self_text_1'] = df.selftext.apply(url_function_1)
df['urllist_self_text_2'] = df.selftext.apply(url_function_2)
df['urllist_title_1'] = df.title.apply(url_function_1)
df['urllist_title_2'] = df.title.apply(url_function_2)
df['url_function_1'] = df['urllist_self_text_1'] + df['urllist_title_1']
df['url_function_2'] = df['urllist_self_text_2'] + df['urllist_title_2']
#PRE_full_URL_list = df['url_function_1'] + df['url_function_2']
PRE_URL_list = []
#[full_URL_list.append(url) for url in PRE_full_URL_list if not url in full_URL_list]
#print(full_URL_list)
[PRE_URL_list.append(url) for url in df['url_function_1'] if url]
[PRE_URL_list.append(url) for url in df['url_function_2'] if not url in PRE_URL_list and url]
full_URL_list = [url for sublist in PRE_URL_list for url in PRE_URL_list]
full_URL_list = list(itertools.chain.from_iterable(full_URL_list))
print(full_URL_list)
df['full_URL_list'] = full_URL_list #Hier soll laut Anaconda der Fehler liegen.
mask = df.full_URL_list.str.len() != 0
indices = df.full_URL_list[mask].index
URLs_reddit = df.loc[indices, :]
directory_name = os.path.dirname(csvfile) + '_URL'
print('Directory: ' + directory_name)
CreateNewFolder(directory_name)
Filename = os.path.basename(csvfile)
print('Filename: ' + Filename)
# new_filename = Filename + '_URLs' + '.csv'
new_filename = Filename + '_URLs' + '.csv'
URLs_reddit.to_csv(os.path.join(directory_name, new_filename))
print('New Path: ' + directory_name)
print('New File: ' + new_filename)
c = int(round(time.time() - start_time))
print("--- " + str(c) +" seconds for New File: " + new_filename)
df.drop(labels='full_URL_list')
# Collect every generated *_URL directory and relocate it into Destination.
for directory in glob.iglob(r'D:/submissions/*_URL/'):
    try:
        # dirname() strips the trailing slash so shutil.move receives the
        # directory itself rather than a path ending in '/'.
        directory_url = os.path.dirname(directory)
        shutil.move(directory_url, Destination)
        print('Moved ' + directory_url + ' to ' + Destination)
    except OSError:
        # Best-effort: report the failure and continue with the next folder.
        print('Error when moving directory: ' + directory_url)

# Total wall-clock time for the whole run, reported in (rounded) days.
print("Performance: %s days for the whole loop and file transfer" % int(round(((time.time() - start_time))/86400)))

Ich danke euch für die Hilfe im Voraus!
LG
pytony