@@ -225,7 +225,6 @@ def map_copy_number(copy_number_data, improve_id_data, entrez_data):
225225
226226 return (improve_mapped_cn_df )
227227
228-
229228def map_transcriptomics (transciptomics_data , improve_id_data , entrez_data ):
230229
231230 # read in data
@@ -234,7 +233,7 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
234233
235234 if isinstance (improve_id_data , pd .DataFrame ) == False :
236235 improve_id_data = pd .read_csv (improve_id_data )
237-
236+
238237 if isinstance (entrez_data , pd .DataFrame ) == False :
239238 entrez_data = pd .read_csv (entrez_data )
240239
@@ -253,20 +252,20 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
253252
254253 # run tpmFromCounts.py to convert counts to tpm
255254 os .system ("python3 tpmFromCounts.py --counts /tmp/counts_for_tpm_conversion.tsv --genome_build https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.13_GRCh37/GCF_000001405.13_GRCh37_genomic.gtf.gz --gene_col stable_id --exclude_col stable_id --out_file /tmp/transcriptomics_tpm.tsv" )
256-
255+
257256 # melt the df so there is one sample and gene per row
258257 long_transcriptomics_df = pd .read_csv ("/tmp/transcriptomics_tpm.tsv" ,sep = '\t ' )
259258 long_transcriptomics_df = pd .melt (long_transcriptomics_df , id_vars = ['stable_id' ], value_vars = long_transcriptomics_df .columns [long_transcriptomics_df .columns != 'stable_id' ])
260259 long_transcriptomics_df = long_transcriptomics_df .rename (columns = {'value' :'transcriptomics' , 0 :'sample_name' })
261-
262260
263- # map gene names to entrez id's
261+
262+ # map gene names to entrez id's
264263 mapped_transcriptomics_df = pd .merge (long_transcriptomics_df , entrez_data [['other_id' ,'entrez_id' ]].drop_duplicates (), how = 'inner' , left_on = "stable_id" , right_on = "other_id" )
265264 mapped_transcriptomics_df = mapped_transcriptomics_df .dropna (subset = ['entrez_id' ])
266265
267266 # mapping improve sample id'samples_df
268267 mapped_transcriptomics_df = pd .merge (mapped_transcriptomics_df , improve_id_data [['other_id' ,'improve_sample_id' ]].drop_duplicates (), how = 'inner' , left_on = "variable" , right_on = "other_id" )
269-
268+
270269 # clean up column names and data types
271270 mapped_transcriptomics_df = mapped_transcriptomics_df .drop (columns = ['stable_id' ,'variable' ,'other_id_x' ,'other_id_y' ])
272271 mapped_transcriptomics_df ['source' ] = "Synapse"
@@ -278,6 +277,46 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
278277 return (mapped_transcriptomics_df )
279278
280279
def map_proteomics(proteomics_data, improve_id_data, entrez_data):
    """Reshape a wide proteomics table to long format and attach ids.

    The first row of ``proteomics_data`` is used as the column header and
    its first column is treated as the gene symbol. Every remaining column
    is treated as one sample. Rows are kept only when the gene symbol
    matches ``entrez_data['other_id']`` and the sample name matches
    ``improve_id_data['other_id']`` (inner joins); rows with any missing
    value are dropped.

    Parameters
    ----------
    proteomics_data : pandas.DataFrame or path accepted by ``pd.read_csv``
        Wide table whose first row holds the column names.
    improve_id_data : pandas.DataFrame or path accepted by ``pd.read_csv``
        Must provide the columns ``other_id`` and ``improve_sample_id``.
    entrez_data : pandas.DataFrame or path accepted by ``pd.read_csv``
        Must provide the columns ``other_id`` and ``entrez_id``.

    Returns
    -------
    pandas.DataFrame
        Columns ``entrez_id``, ``proteomics``, ``improve_sample_id``,
        ``source`` (always "Synapse") and ``study`` (always "liver").
        A DataFrame passed as ``proteomics_data`` is not modified.
    """
    # read in data
    if not isinstance(proteomics_data, pd.DataFrame):
        proteomics_data = pd.read_csv(proteomics_data)

    if not isinstance(improve_id_data, pd.DataFrame):
        improve_id_data = pd.read_csv(improve_id_data)

    if not isinstance(entrez_data, pd.DataFrame):
        entrez_data = pd.read_csv(entrez_data)

    # first, replace colnames with first row and delete first row.
    # Work on a copy so a caller-supplied DataFrame is left untouched, and
    # name the first column by position so a blank or duplicated top-left
    # header cell cannot break the rename.
    header = list(proteomics_data.iloc[0, :])
    header[0] = 'gene_symbol'
    proteomics_data = proteomics_data.iloc[1:].copy()
    proteomics_data.columns = header

    # melt the df so there is one sample and prot per row.
    # var_name is given explicitly: relying on the default would tie the
    # name of the sample column to the index label of the header row.
    sample_columns = proteomics_data.columns[proteomics_data.columns != 'gene_symbol']
    long_prot_df = pd.melt(
        proteomics_data,
        id_vars=['gene_symbol'],
        value_vars=sample_columns,
        var_name='sample_name',
    )
    long_prot_df = long_prot_df.rename(columns={'value': 'proteomics'})

    # map gene names to entrez id's
    mapped_proteomics_df = pd.merge(
        long_prot_df,
        entrez_data[['other_id', 'entrez_id']].drop_duplicates(),
        how='inner',
        left_on="gene_symbol",
        right_on="other_id",
    )
    mapped_proteomics_df = mapped_proteomics_df.dropna(subset=['entrez_id'])

    # map sample names to improve sample id's; both sides now carry an
    # 'other_id' column, so pandas suffixes them with _x / _y
    mapped_proteomics_df = pd.merge(
        mapped_proteomics_df,
        improve_id_data[['other_id', 'improve_sample_id']].drop_duplicates(),
        how='inner',
        left_on="sample_name",
        right_on="other_id",
    )

    # clean up column names and data types
    mapped_proteomics_df = mapped_proteomics_df.drop(
        columns=['gene_symbol', 'sample_name', 'other_id_x', 'other_id_y']
    )
    mapped_proteomics_df['source'] = "Synapse"
    mapped_proteomics_df['study'] = "liver"
    mapped_proteomics_df = mapped_proteomics_df.dropna()
    # NOTE(review): only the id columns are cast; the 'proteomics' values
    # keep whatever dtype the input had - confirm whether a numeric cast
    # is wanted downstream.
    mapped_proteomics_df = mapped_proteomics_df.astype(
        {'entrez_id': 'int', 'improve_sample_id': 'int'}
    )
    mapped_proteomics_df = mapped_proteomics_df[
        ['entrez_id', 'proteomics', 'improve_sample_id', 'source', 'study']
    ]

    return mapped_proteomics_df
318+
319+
281320if __name__ == "__main__" :
282321 parser = argparse .ArgumentParser (description = '###' )
283322
@@ -292,6 +331,7 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
292331 parser .add_argument ('-T' , '--transcriptomics' , action = 'store_true' , default = False , help = 'Generate transcriptomics data' )
293332 parser .add_argument ('-M' , '--mutations' , action = 'store_true' , default = False , help = 'Generate mutations data' )
294333 parser .add_argument ('-C' , '--copy_number' , action = 'store_true' , default = False , help = 'Generate copy number data' )
334+ parser .add_argument ('-R' , '--proteomics' , action = 'store_true' , default = False , help = 'Generate proteomics data' )
295335
296336 args = parser .parse_args ()
297337
@@ -347,4 +387,16 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
347387 print ("Starting copy number data." )
348388 mutation_df = map_copy_number (copy_number_data = "/tmp/raw_copynum_data.csv" , improve_id_data = "/tmp/liver_samples.csv" , entrez_data = "/tmp/genes.csv" )
349389 mutation_df .to_csv ("/tmp/liver_copy_number.csv" , index = False )
390+
391+ if args .proteomics :
392+ if args .genes is None or args .genes == '' :
393+ print ("No genes data provided. Exiting script." )
394+ exit ()
395+ if args .ids is None or args .ids == '' :
396+ print ("No samples data provided. Exiting script." )
397+ exit ()
398+ else :
399+ print ("Starting proteomics data." )
400+ proteomics_df = map_proteomics (proteomics_data = "/tmp/raw_proteomics_data.csv" , improve_id_data = "/tmp/liver_samples.csv" , entrez_data = "/tmp/genes.csv" )
401+ proteomics_df .to_csv ("/tmp/liver_proteomics.csv" , index = False )
350402
0 commit comments