Jede Timestamp, Id und jeder jeder Inhalt kann von Zeile zu Zeile und von Excel-Datei zu Excel-Datei variieren.
Ich bleibe mit einem Errorcode hängen, den ich bekomme, ich hoffe, Sie könnt mir hier weiterhelfen.
Dies ist mein Code:
Code: Alles auswählen
import os
import re
import pymysql
pymysql.install_as_MySQLdb()
import pandas as pd
import sqlalchemy
def insert_or_update(engine, pd_table, table_name):
inserts = 0
updates = 0
for i in range(len(pd_table)):
vals_with_quotes = ["'" + str(x) + "'" for x in pd_table.loc[i, :].values]
# print(vals_with_quotes)
update_pairs = [str(c) + " = '" + str(v) + "'" for c, v in zip(pd_table.columns, pd_table.loc[i, :])]
query = f"INSERT INTO {table_name} ({', '.join(list(pd_table.columns.values))}) " \
f"VALUES ({', '.join(vals_with_quotes)}) " \
f"ON DUPLICATE KEY UPDATE {', '.join(update_pairs)}"
print(query)
result = engine.execute(query)
if result.lastrowid == 0:
updates += 1
else:
inserts += 1
print(f"Inserted {inserts} rows and updated {updates} rows.")
schema = '---'
alchemy_connect = "---"
engine = sqlalchemy.create_engine(alchemy_connect) # connect to server
engine.execute(f"USE {schema}") # select new db
# engine.execute("SET NAMES UTF8MB4;")
query = "SELECT * FROM --- where study=---"
pm = pd.read_sql(query, engine)
rootpath = "path/"
for root, dirs, files in os.walk(rootpath):
for file in files:
print(root, dirs, files, file)
d = pd.read_excel(root + file, header=None)
d.drop(columns=[0], inplace=True)
d.rename(columns={1: "content"}, inplace=True)
participants = []
for ix, row in d.iterrows():
c = row["content"]
match = re.search(r"^\[(\d{1,2}:\d{1,2}:\d{1,2})]\s+(\w+):\s*(.*)", c)
if match:
timestamp = match.group(1)
id_code = match.group(2)
only_content = match.group(3)
if re.search(r"(?:Versuch(?:sleit(?:er)?)?|ersuchsleiter)", id_code):
id_code = "Versuchsleiter"
# get pid
participant = pm.loc[pm["id_code"] == id_code, "pid"]
try:
pid = participant.values[0]
except:
pid = "Versuchsleiter"
# print(ix, pid, id_code, only_content, timestamp)
if pid and pid not in participants and pid != "Versuchsleiter":
participants.append(pid)
d.loc[ix, "pid"] = pid
d.loc[ix, "timestamp"] = timestamp
d.loc[ix, "content"] = only_content.strip()
d.loc[ix, "is_participant"] = 0 if pid == "Versuchsleiter" else 1
d = d[["pid", "is_participant", "content", "timestamp"]]
d.loc[(d['pid'] == "Versuchsleiter"), "pid"] = participants[0]
d.loc[(d['pid'] == None), "pid"] = participants[0]
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
# print(d)
insert_or_update(engine, d, "YM_Transcripts")
und das ist mein Fehlercode den ich erhalte
Code: Alles auswählen
File "C:/Users/.../PycharmProjects/.../.../---.py", line 20, in insert_or_update
result = engine.execute(query)
File "C:\Users\...\PycharmProjects\...\...\lib\site-packages\sqlalchemy\engine\base.py", line 2191, in execute
return connection.execute(statement, *multiparams, **params)
TypeError: not enough arguments for format string