wie ihr in einem anderen Thread lesen könnt, habe ich Twitter Daten gesammelt und diese als JSON gespeichert.
Der Inhalt der JSON Datei sieht dann wie folgt aus:
Code: Alles auswählen
{"contributors": null, "coordinates": null, "created_at": "Sat Jul 01 20:40:22 +0000 2017", "entities": {"hashtags": [{"indices": [80, 93], "text": "gardadautore"}, {"indices": [94, 103], "text": "gangbank"}], "symbols": [], "urls": [], "user_mentions": [{"id": 384927198, "id_str": "384927198", "indices": [1, 11], "name": "Gianluigi Paragone", "screen_name": "gparagone"}]}, "favorite_count": 0, "favorited": false, "geo": null, "id": 881251152926576646, "id_str": "881251152926576646", "in_reply_to_screen_name": null, "in_reply_to_status_id": null, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "in_reply_to_user_id_str": null, "is_quote_status": false, "lang": "it", "metadata": {"iso_language_code": "it", "result_type": "recent"}, "place": {"attributes": {}, "bounding_box": {"coordinates": [[[10.6298172, 45.5576723], [10.7395842, 45.5576723], [10.7395842, 45.5905671], [10.6298172, 45.5905671]]], "type": "Polygon"}, "contained_within": [], "country": "Italy", "country_code": "IT", "full_name": "Garda, Veneto", "id": "aa9ef434c6691046", "name": "Garda", "place_type": "city", "url": "https://api.twitter.com/1.1/geo/id/aa9ef434c6691046.json"}, "retweet_count": 0, "retweeted": false, "source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>", "text": ".@gparagone \"Draghi ci dice di non poter nemmeno discutere di uscire dall'euro\" #gardadautore #gangbank", "truncated": false, "user": {"contributors_enabled": false, "created_at": "Fri Jun 14 08:27:22 +0000 2013", "default_profile": false, "default_profile_image": false, "description": "#Garda e #Ponza D'Autore. Rassegna per chi ama i libri, la cultura e i luoghi. Powered by @vis_verbi", "entities": {"description": {"urls": []}, "url": {"urls": [{"display_url": "dautore.org", "expanded_url": "http://www.dautore.org", "indices": [0, 23], "url": "https://t.co/YYDI2NrChn"}]}}, "favourites_count": 725, "follow_request_sent": null, "followers_count": 732, "following": null, "friends_count": 256, "geo_enabled": true, "has_extended_profile": false, "id": 1515769148, "id_str": "1515769148", "is_translation_enabled": false, "is_translator": false, "lang": "it", "listed_count": 6, "location": "", "name": "D'Autore", "notifications": null, "profile_background_color": "000000", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "profile_background_tile": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/1515769148/1496821885", "profile_image_url": "http://pbs.twimg.com/profile_images/872070235058208769/X2aJnlw2_normal.jpg", "profile_image_url_https": "https://pbs.twimg.com/profile_images/872070235058208769/X2aJnlw2_normal.jpg", "profile_link_color": "FAB81E", "profile_sidebar_border_color": "000000", "profile_sidebar_fill_color": "000000", "profile_text_color": "000000", "profile_use_background_image": false, "protected": false, "screen_name": "D_Autore", "statuses_count": 1940, "time_zone": "Pacific Time (US & Canada)", "translator_type": "none", "url": "https://t.co/YYDI2NrChn", "utc_offset": -25200, "verified": false}}
{"contributors": null, "coordinates": null, "created_at": "Sat Jul 01 20:39:55 +0000 2017", "entities": {"hashtags": [{"indices": [121, 134], "text": "gardadautore"}], "symbols": [], "urls": [], "user_mentions": [{"id": 384927198, "id_str": "384927198", "indices": [1, 11], "name": "Gianluigi Paragone", "screen_name": "gparagone"}]}, "favorite_count": 0, "favorited": false, "geo": null, "id": 881251040162611201, "id_str": "881251040162611201", "in_reply_to_screen_name": null, "in_reply_to_status_id": null, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "in_reply_to_user_id_str": null, "is_quote_status": false, "lang": "it", "metadata": {"iso_language_code": "it", "result_type": "recent"}, "place": {"attributes": {}, "bounding_box": {"coordinates": [[[10.6298172, 45.5576723], [10.7395842, 45.5576723], [10.7395842, 45.5905671], [10.6298172, 45.5905671]]], "type": "Polygon"}, "contained_within": [], "country": "Italy", "country_code": "IT", "full_name": "Garda, Veneto", "id": "aa9ef434c6691046", "name": "Garda", "place_type": "city", "url": "https://api.twitter.com/1.1/geo/id/aa9ef434c6691046.json"}, "retweet_count": 0, "retweeted": false, "source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>", "text": ".@gparagone \"Draghi dice che la gran parte delle misure di riforme finanziarie andranno avanti con il pilota automatico\" #gardadautore", "truncated": false, "user": {"contributors_enabled": false, "created_at": "Fri Jun 14 08:27:22 +0000 2013", "default_profile": false, "default_profile_image": false, "description": "#Garda e #Ponza D'Autore. Rassegna per chi ama i libri, la cultura e i luoghi. Powered by @vis_verbi", "entities": {"description": {"urls": []}, "url": {"urls": [{"display_url": "dautore.org", "expanded_url": "http://www.dautore.org", "indices": [0, 23], "url": "https://t.co/YYDI2NrChn"}]}}, "favourites_count": 725, "follow_request_sent": null, "followers_count": 732, "following": null, "friends_count": 256, "geo_enabled": true, "has_extended_profile": false, "id": 1515769148, "id_str": "1515769148", "is_translation_enabled": false, "is_translator": false, "lang": "it", "listed_count": 6, "location": "", "name": "D'Autore", "notifications": null, "profile_background_color": "000000", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "profile_background_tile": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/1515769148/1496821885", "profile_image_url": "http://pbs.twimg.com/profile_images/872070235058208769/X2aJnlw2_normal.jpg", "profile_image_url_https": "https://pbs.twimg.com/profile_images/872070235058208769/X2aJnlw2_normal.jpg", "profile_link_color": "FAB81E", "profile_sidebar_border_color": "000000", "profile_sidebar_fill_color": "000000", "profile_text_color": "000000", "profile_use_background_image": false, "protected": false, "screen_name": "D_Autore", "statuses_count": 1940, "time_zone": "Pacific Time (US & Canada)", "translator_type": "none", "url": "https://t.co/YYDI2NrChn", "utc_offset": -25200, "verified": false}}
{"contributors": null, "coordinates": null, "created_at": "Sat Jul 01 20:30:59 +0000 2017", "entities": {"hashtags": [{"indices": [127, 140], "text": "gardadautore"}], "symbols": [], "urls": [], "user_mentions": [{"id": 384927198, "id_str": "384927198", "indices": [1, 11], "name": "Gianluigi Paragone", "screen_name": "gparagone"}]}, "favorite_count": 0, "favorited": false, "geo": null, "id": 881248793525723136, "id_str": "881248793525723136", "in_reply_to_screen_name": null, "in_reply_to_status_id": null, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "in_reply_to_user_id_str": null, "is_quote_status": false, "lang": "it", "metadata": {"iso_language_code": "it", "result_type": "recent"}, "place": {"attributes": {}, "bounding_box": {"coordinates": [[[10.6298172, 45.5576723], [10.7395842, 45.5576723], [10.7395842, 45.5905671], [10.6298172, 45.5905671]]], "type": "Polygon"}, "contained_within": [], "country": "Italy", "country_code": "IT", "full_name": "Garda, Veneto", "id": "aa9ef434c6691046", "name": "Garda", "place_type": "city", "url": "https://api.twitter.com/1.1/geo/id/aa9ef434c6691046.json"}, "retweet_count": 0, "retweeted": false, "source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>", "text": ".@gparagone \"Mario Draghi \u00e8 stato formato in Goldman Sachs, infatti la BCE non d\u00e0 soldi all'economia reale, solo alla finanza\" #gardadautore", "truncated": false, "user": {"contributors_enabled": false, "created_at": "Fri Jun 14 08:27:22 +0000 2013", "default_profile": false, "default_profile_image": false, "description": "#Garda e #Ponza D'Autore. Rassegna per chi ama i libri, la cultura e i luoghi. Powered by @vis_verbi", "entities": {"description": {"urls": []}, "url": {"urls": [{"display_url": "dautore.org", "expanded_url": "http://www.dautore.org", "indices": [0, 23], "url": "https://t.co/YYDI2NrChn"}]}}, "favourites_count": 725, "follow_request_sent": null, "followers_count": 732, "following": null, "friends_count": 256, "geo_enabled": true, "has_extended_profile": false, "id": 1515769148, "id_str": "1515769148", "is_translation_enabled": false, "is_translator": false, "lang": "it", "listed_count": 6, "location": "", "name": "D'Autore", "notifications": null, "profile_background_color": "000000", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png", "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "profile_background_tile": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/1515769148/1496821885", "profile_image_url": "http://pbs.twimg.com/profile_images/872070235058208769/X2aJnlw2_normal.jpg", "profile_image_url_https": "https://pbs.twimg.com/profile_images/872070235058208769/X2aJnlw2_normal.jpg", "profile_link_color": "FAB81E", "profile_sidebar_border_color": "000000", "profile_sidebar_fill_color": "000000", "profile_text_color": "000000", "profile_use_background_image": false, "protected": false, "screen_name": "D_Autore", "statuses_count": 1940, "time_zone": "Pacific Time (US & Canada)", "translator_type": "none", "url": "https://t.co/YYDI2NrChn", "utc_offset": -25200, "verified": false}}
Natürlich ist es so, dass ich es natürlich nicht riskieren will Tweets zu verpassen, habe ich nicht exakt alle 10 Tage später Tweets heruntergeladen. Sondern alle 8-9 Tage. Manchmal sogar 3 Tage später. Das heißt:
1. Es gibt mehrere JSON Dateien (Jedes Mal, wenn ich Tweets anfordere entsteht eine neue JSON Datei. Das liegt daran, dass Tweepy nicht erkennt, wo es das letzte Mal gestoppt hat und führt dann dazu, dass die alte Datei nochmal überschrieben wird und man verliert Informationen dadurch).
2. Es gibt Duplikate in den JSON Dateien.
Nun will ich alle JSON Dateien in einer JSON Datei zusammenbringen. Dabei soll die allererste JSON Datei, die erstellt worden ist, immer wieder aktualisiert werden bzw. nur NEUE Informationen sollen hinzugefügt werden. Duplikate sollen erst gar nicht aufgenommen werden.
Dafür habe ich zwei Ansätze versucht, die gescheitert sind:
1. Ansatz:
Zum Verständnis:"16.06.2017 ECB Draghi.json" = Der Titel bedeutet: Die JSON Datei, die am 16.06.2017 Tweets mit den Suchbegriffen "ECB" und "Draghi" der letzten 10 Tagen beinhaltet.
Code: Alles auswählen
import json
with open("16.06.2017 ECB Draghi.json") as td:
data1 = json.load(td)
with open("17.06.2017 ECB Draghi.json") as td:
data2 = json.load(td)
with open("18.06.2017 ECB Draghi.json") as td:
data3 = json.load(td)
with open("19.06.2017 ECB Draghi.json") as td:
data4 = json.load(td)
with open("20.06.2017 ECB Draghi.json") as td:
data5 = json.load(td)
with open("22.06.2017 ECB Draghi.json") as td:
data6 = json.load(td)
with open("23.06.2017 ECB Draghi.json") as td:
data7 = json.load(td)
with open("25.06.2017 ECB Draghi.json") as td:
data8 = json.load(td)
with open("30.06.2017 ECB Draghi.json") as td:
data9 = json.load(td)
with open("01.07.2017 ECB Draghi.json") as td:
data10 = json.load(td)
with open("02.07.2017 ECB Draghi.json") as td:
data11 = json.load(td)
with open("04.07.2017 ECB Draghi.json") as td:
data12 = json.load(td)
with open("09.07.2017 ECB Draghi.json") as td:
data13 = json.load(td)
with open("10.07.2017 ECB Draghi.json") as td:
data14 = json.load(td)
with open("17.07.2017 ECB Draghi.json") as td:
data15 = json.load(td)
with open("23.07.2017 ECB Draghi.json") as td:
data16 = json.load(td)
with open("28.07.2017 ECB Draghi.json") as td:
data17 = json.load(td)
with open("29.07.2017 ECB Draghi.json") as td:
data18 = json.load(td)
with open("01.08.2017 ECB Draghi.json") as td:
data19 = json.load(td)
with open("04.08.2017 ECB Draghi.json") as td:
data20 = json.load(td)
with open("08.08.2017 ECB Draghi.json") as td:
data21 = json.load(td)
with open("10.08.2017 ECB Draghi.json") as td:
data22 = json.load(td)
with open("11.08.2017 ECB Draghi.json") as td:
data23 = json.load(td)
data1.update(data2)
data1.update(data3)
data1.update(data4)
data1.update(data5)
data1.update(data6)
data1.update(data7)
data1.update(data8)
data1.update(data9)
data1.update(data10)
data1.update(data11)
data1.update(data12)
data1.update(data13)
data1.update(data14)
data1.update(data15)
data1.update(data16)
data1.update(data17)
data1.update(data18)
data1.update(data19)
data1.update(data20)
data1.update(data21)
data1.update(data22)
data1.update(data23)
data1.update(data24)
data1.update(data25)
data1.update(data26)
data1.update(data27)
with open("MERGE_ECB_DRAGHI", "w") as td:
json.dump(data1, td)
2. Ansatz:
Code: Alles auswählen
import json
with open("16.06.2017 ECB Draghi.json","r") as td, \
open("17.06.2017 ECB Draghi.json","r") as td2, \
open("18.06.2017 ECB Draghi.json","r") as td3, \
open("19.06.2017 ECB Draghi.json","r") as td4, \
open("20.06.2017 ECB Draghi.json","r") as td5, \
open("22.06.2017 ECB Draghi.json","r") as td6, \
open("23.06.2017 ECB Draghi.json","r") as td7, \
open("25.06.2017 ECB Draghi.json","r") as td8, \
open("30.06.2017 ECB Draghi.json","r") as td9, \
open("01.07.2017 ECB Draghi.json","r") as td10, \
open("02.07.2017 ECB Draghi.json","r") as td11, \
open("04.07.2017 ECB Draghi.json","r") as td12, \
open("09.07.2017 ECB Draghi.json","r") as td13, \
open("10.07.2017 ECB Draghi.json","r") as td14, \
open("17.07.2017 ECB Draghi.json","r") as td15, \
open("23.07.2017 ECB Draghi.json","r") as td16, \
open("23.07.2017 ECB Draghi.json","r") as td17, \
open("28.07.2017 ECB Draghi.json","r") as td18, \
open("29.07.2017 ECB Draghi.json","r") as td19, \
open("01.08.2017 ECB Draghi.json","r") as td20, \
open("04.08.2017 ECB Draghi.json","r") as td21, \
open("08.08.2017 ECB Draghi.json","r") as td22, \
open("10.08.2017 ECB Draghi.json","r") as td23, \
open("11.08.2017 ECB Draghi.json","r") as td24:
data1 = json.load(td1)
data2 = json.loads(td2, td3, td4, td5, td6, td7, td8, td9, td10, td11, td12, td13, td14, td15, td16, td17, \
td18, td19, td20, td21, td22, td23, td24)
data1.update(data2)
with open("MERGE_ECB_DRAGHI", "w") as td:
json.dump(data1, td)
Als Link könnt ihr ein WinRAR Archiv mit 3 JSON Dateien als Inhalt herunterladen, um euch die Dateien falls nötig näher anzuschauen und vielleicht selbst probieren könnt: https://www.dropbox.com/s/4yhsywxbhqmtb ... r.rar?dl=0
Leider kann man hier keinen Anhang hinzufügen :-/
Vielen lieben Dank im Voraus!
BG
pytony