ich muss alle Daten aus einem Excel workbook in eine PostgreSQL - Datenbank importieren.
Ich exportiere dazu in Excel die spreadsheets als .csv - Dateien.
Mein momentanes Python Skript importiert dann eine einzelne spreadsheet-csv-Datei, und generiert PostgreSQL Dateien.
Die .psql Dateien beinhalten dann entsprechende SQL Anweisungen, mit denen die Daten aus dem spreadsheet in die Datenbank importiert werden.
Mein Problem momentan:
Das Excel-Workbook beinhaltet mehrere spreadsheets. Diese sind untereinander durch Hyperlinks verbunden, und enthalten manchmal Kommentare in den Zellen.
Wie kann ich mit Python Hyperlinks, und Kommentare in Zellen aus einem spreadsheet auslesen, und in einer Tabelle (Ziel: PostgreSQL-Datenbank) speichern?
Ich kann gerne das Excel workbook bzw. eine Beispiel- .csv-Datei eines spreadsheets per email schicken, falls meine Fragestellung unklar ist!
Hier mein momentanes Python Script:
Code: Alles auswählen
#
# This script prepares a number of psql files to bootstrap data into the
# samson database , from the samples spreadsheet
#
#
# Gene Full Name Abbreviations and Alternative Names Order Cellular localisation Functional Classification Magnitude of Change Cycle Stage Follicular Localisation Human Chromosome Weighted Score Count
#
#
# 142 Unknown - 207 Unknown Unknown - - - - 30 2
# 143 Unknown - 207 Unknown Unknown - - - - 30 2
# A2B1 Heterogeneous nuclear ribonucleoproteins A2/B1 hnRNP A2 / hnRNP B1, HNRPA2B1 128 Nuclear Structural - - - 07p15 110 2
# ABC7 ATP-binding cassette, sub-family B (MDR/TAP), member 7 ABC7, ABCB7, ASAT, Atm1p 108 Cytoplasmic Transporter - - - Xq12-q13 125 2
# ACLY ATP citrate lyase ACLY 162 Cytoplasmic Lipid Metabolism - - - 17q12-q21 75 2
#
#
#Gene ---> geneticOb.geneticObName , geneticOb.obkeywords
#Full Name --> geneticOb.geneticObDescription
#Abbreviations.. --> geneticOb.geneticObSymbols, geneticOb.obkeywords
#order ---> geneticFact.AttributeName/Value
#Cellular localisation ----> geneticExpressionFact.cellularLocalisation
#Functional Classification ----> geneticFunctionFact.functionComment
#Magnitude of Change ---> geneticFact.AttributeName/Value
#FollicularLocalisation ----> geneticFact.Attribute/Value
#Chromosome -----> GeneticLocationFact.species ,.chromosome, cytopos
#WeightedScore -----> GeneticFact.attribute/Value
#Count ----> geneticFact.count
import csv
from sets import Set
infilename = "../data/SamsonGenes.csv";
def main():
genGeneBootstrap()
def writeHeader(outfilename):
outfile = open(outfilename,"w")
outfile.write("""
/*
* script generated by gengeneticlocusbootstrap.py
*/
""")
return outfile
def writeFooter(outfile):
outfile.write("""
/*
* **** done ****
*/
""")
outfile.close()
def genGeneBootstrap():
geneSQLDict={}
samsonSQLDict={}
expressionSQLDict={}
functionSQLDict={}
locationSQLDict={}
geneinfoSQLDict={}
reader = csv.reader(open(infilename, "rb"))
cols = ("Gene","Full Name","Abbreviations and Alternative Names",
"Order","Cellular localisation","Functional Classification",
"Magnitude of Change","Cycle Stage",
"Follicular Localisation","Human Chromosome",
"Weighted Score","Count")
coldict={}
ucols = [item.upper() for item in cols]
for row in reader:
#print row
#get column positions if necessary
if len(coldict) == 0 :
ucrow = [item.upper() for item in row]
#print ucrow
try :
for ucitem in ucols :
coldict[ucitem]=ucrow.index(ucitem)
except ValueError:
None
#construct SQL if we have the gene column
#GENE
#FULL NAME
#ABBREVIATIONS
#ORDER
#CELLULAR LOCALISATION
#FUNCTIONAL CLASSIFICATION
#MAGNITUDE OF CHANGE
#FOLLICULARLOCALISATION
#CHROMOSOME
#WEIGHTEDSCORE
#COUNT
if coldict.has_key('GENE'):
if (not geneSQLDict.has_key(row[coldict['GENE']])) and \
row[coldict['GENE']] != 'Gene' and \
len(row[coldict['GENE']].split()) > 0:
# geneticOb insert
print "adding ",row[coldict['GENE']]
geneSQLDict[row[coldict['GENE']]] = "insert into geneticOb(geneticobname,xreflsid,geneticobtype,geneticobdescription,geneticobsymbols,obkeywords) "
geneSQLDict[row[coldict['GENE']]] = geneSQLDict[row[coldict['GENE']]] + \
"values('" + row[coldict['GENE']] + "','http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gene&cmd=search&term=" + \
row[coldict['GENE']] + "','Gene','" + row[coldict['FULL NAME']] + " (from SAMSON database)','" + row[coldict['GENE']] + " " + \
row[coldict['ABBREVIATIONS AND ALTERNATIVE NAMES']] + "','" + row[coldict['GENE']] + " " + row[coldict['ABBREVIATIONS AND ALTERNATIVE NAMES']] + " SAMSON');"
if len(row[coldict["COUNT"]].strip()) == 0:
row[coldict["COUNT"]] = "null"
#samsonGeneticFact insert
samsonSQLDict[row[coldict['GENE']]] = "insert into samsonGeneticFact(geneticob,genesummaryorder,"
samsonSQLDict[row[coldict['GENE']]] = samsonSQLDict[row[coldict['GENE']]] + \
"changemagnitude,follicularlocalisation,genesummaryweightedscore,genesummarycount) select obid," +\
row[coldict['ORDER']] + ",'" + row[coldict['MAGNITUDE OF CHANGE']] + "','" + \
row[coldict["FOLLICULAR LOCALISATION"]] + "'," + row[coldict["WEIGHTED SCORE"]] + "," + \
row[coldict["COUNT"]] + " from geneticob where geneticObName = '" + row[coldict['GENE']] +"';"
samsonSQLDict[row[coldict['GENE']]] = samsonSQLDict[row[coldict['GENE']]].replace(",,",",null,")
samsonSQLDict[row[coldict['GENE']]] = samsonSQLDict[row[coldict['GENE']]].replace(",#N/A,",",null,")
#GeneticExpressionFact insert
expressionSQLDict[row[coldict['GENE']]] = "insert into geneticExpressionFact(geneticob,cellularlocalisation,lifecyclestage) " + \
"select obid,'" + row[coldict['CELLULAR LOCALISATION']] + "','" + row[coldict['CYCLE STAGE']] + \
"' from geneticob where geneticObName = '" + row[coldict['GENE']] +"';"
#GeneticFunctionFact insert
if row[coldict['FUNCTIONAL CLASSIFICATION']].strip().upper() != "UNKNOWN":
functionSQLDict[row[coldict['GENE']]] = "insert into geneticFunctionFact(geneticob,functionComment) " + \
"select obid,'" + row[coldict['FUNCTIONAL CLASSIFICATION']] + \
"' from geneticob where geneticObName = '" + row[coldict['GENE']] +"';"
#GeneticLocationFact insert
cytopos=row[coldict['HUMAN CHROMOSOME']].strip()
if cytopos != "-" and len(cytopos) > 0 :
xreflsid= "'http://genome.ucsc.edu/cgi-bin/hgTracks?org=Human&db=hg17&position=" + cytopos + "'"
cytopos="'" + cytopos + "'"
species="'Homo Sapiens'"
locationSQLDict[row[coldict['GENE']]] = "insert into geneticLocationFact(geneticob,xreflsid,cytopos,speciesname) " + \
"select obid," + xreflsid + "," +cytopos + "," + species + \
" from geneticob where geneticObName = '" + row[coldict['GENE']] +"';"
#generate SQL that when executed on the Oracle database, will generate
#Postgres updates of the GeneticFunction and Location tables. Example Oracle SQL :
"""
select
/*+ RULE */
'insert into geneticfact(ob,factnamespace,attributename,attributevalue) ' ||
'select obid, ''EntrezGene'',''Summary'',''<dd>Geneid=' ||
to_char(geneid) || ' Symbol='||
symbol || ' Synonyms=' ||
synonyms || '<dd>Description='||
description || ' <dd>GOdescription=' ||
pubplsqlutils.getGoString(geneid) || ' <dd>Map=' ||
g.chromosome||':'||g.map_location || ''' from geneticob where obname = ''CRP1'''
from
pubstore.geneinfo g
where
(symbol='CRP1' or
synonyms like 'CRP1|' or
synonyms like '%|CRP1|%' or
synonyms like '%|CRP1') and taxid = 9606
#example pgsql :
insert into geneticfact(ob,factnamespace,attributename,attributevalue) select obid, 'EntrezGene','Summary','<dd>Geneid=1053 Symbol=CEBPE Synonyms=C/EBP-epsilon|CRP1<dd>Description=CCAAT/enhancer binding protein (C/EBP), epsilon <dd>GOdescription=DNA binding ; defense response ; nucleus ; regulation of transcription, DNA-dependent ; transcription <dd>Map=14:14q11.2' from geneticob where obname = 'CRP1'
insert into geneticfact(ob,factnamespace,attributename,attributevalue) select obid, 'EntrezGene','Summary','<dd>Geneid=1396 Symbol=CRIP1 Synonyms=CRHP|CRIP|CRP1<dd>Description=cysteine-rich protein 1 (intestinal) <dd>GOdescription=antimicrobial humoral response (sensu Vertebrata) ; cell proliferation ; cytoplasm ; metal ion binding ; zinc ion binding <dd>Map=14:14q32.33' from geneticob where obname = 'CRP1'
insert into geneticfact(ob,factnamespace,attributename,attributevalue) select obid, 'EntrezGene','Summary','<dd>Geneid=1465 Symbol=CSRP1 Synonyms=CRP|CRP1|CSRP|CYRP|D1S181E|DKFZp686M148<dd>Description=cysteine and glycine-rich protein 1 <dd>GOdescription=metal ion binding ; nucleus ; zinc ion binding <dd>Map=1:1q32' from geneticob where obname = 'CRP1'
if cytopos != "-" and len(cytopos) > 0 : """
outfile=writeHeader("../bin/bootstrapgenes.psql")
# SQL to set up SAMSON project list of genes
outfile.write("insert into GeneticObList(listName,maxmembership,listComment) values('SAMSON Database of Hair Follicle Genes',25000,'SAMSON Gene Project List. This List contains GeneticOb objects in the SAMSON database project');\n");
# SQL to insert genes
for value in geneSQLDict.values():
outfile.write(value)
outfile.write('\n')
# SQL to populate projects list
outfile.write("insert into geneticOblistMembershipLink(geneticObList,geneticOb) \
select l.obid,g.obid from \
geneticOb g, geneticObList l \
where l.listName = 'SAMSON Database of Hair Follicle Genes' and \
g.geneticObDescription like '%from SAMSON database%'; \n")
writeFooter(outfile)
outfile=writeHeader("../bin/bootstrapgenesamson.psql")
for value in samsonSQLDict.values():
outfile.write(value)
outfile.write('\n')
writeFooter(outfile)
outfile=writeHeader("../bin/bootstrapgeneexpression.psql")
for value in expressionSQLDict.values():
outfile.write(value)
outfile.write('\n')
writeFooter(outfile)
outfile=writeHeader("../bin/bootstrapgenefunction.psql")
for value in functionSQLDict.values():
outfile.write(value)
outfile.write('\n')
writeFooter(outfile)
outfile=writeHeader("../bin/bootstrapgenelocation.psql")
for value in locationSQLDict.values():
outfile.write(value)
outfile.write('\n')
writeFooter(outfile)
print coldict
if __name__ == "__main__":
main()
Juergen