Skip to content

Commit 0093960

Browse files
authored
Merge pull request #458 from PNNL-CompBio/liverpdo_proteomics
Liverpdo proteomics
2 parents 6131ed4 + f776fff commit 0093960

File tree

4 files changed

+62
-8
lines changed

4 files changed

+62
-8
lines changed

coderbuild/build_dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def process_omics(executor, dataset, should_continue):
133133
'bladder': ['copy_number', 'mutations', 'transcriptomics'],
134134
'colorectal':['copy_number', 'mutations', 'transcriptomics'],
135135
'novartis':['copy_number', 'mutations', 'transcriptomics'],
136-
'liver':['copy_number', 'mutations', 'transcriptomics']
136+
'liver':['copy_number', 'mutations', 'transcriptomics','proteomics']
137137
}
138138

139139
expected_omics = dataset_omics_files.get(dataset, [])

coderbuild/liver/02-omics-liver.py

Lines changed: 58 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,6 @@ def map_copy_number(copy_number_data, improve_id_data, entrez_data):
225225

226226
return(improve_mapped_cn_df)
227227

228-
229228
def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
230229

231230
# read in data
@@ -234,7 +233,7 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
234233

235234
if isinstance(improve_id_data, pd.DataFrame) == False:
236235
improve_id_data = pd.read_csv(improve_id_data)
237-
236+
238237
if isinstance(entrez_data, pd.DataFrame) == False:
239238
entrez_data = pd.read_csv(entrez_data)
240239

@@ -253,20 +252,20 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
253252

254253
# run tpmFromCounts.py to convert counts to tpm
255254
os.system("python3 tpmFromCounts.py --counts /tmp/counts_for_tpm_conversion.tsv --genome_build https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.13_GRCh37/GCF_000001405.13_GRCh37_genomic.gtf.gz --gene_col stable_id --exclude_col stable_id --out_file /tmp/transcriptomics_tpm.tsv")
256-
255+
257256
# melt the df so there is one sample and gene per row
258257
long_transcriptomics_df = pd.read_csv("/tmp/transcriptomics_tpm.tsv",sep='\t')
259258
long_transcriptomics_df = pd.melt(long_transcriptomics_df, id_vars=['stable_id'], value_vars=long_transcriptomics_df.columns[long_transcriptomics_df.columns != 'stable_id'])
260259
long_transcriptomics_df = long_transcriptomics_df.rename(columns = {'value':'transcriptomics', 0:'sample_name'})
261-
262260

263-
# map gene names to entrez id's
261+
262+
# map gene names to entrez id's
264263
mapped_transcriptomics_df = pd.merge(long_transcriptomics_df, entrez_data[['other_id','entrez_id']].drop_duplicates(), how = 'inner', left_on= "stable_id", right_on= "other_id")
265264
mapped_transcriptomics_df = mapped_transcriptomics_df.dropna(subset=['entrez_id'])
266265

267266
# mapping improve sample id'samples_df
268267
mapped_transcriptomics_df = pd.merge(mapped_transcriptomics_df, improve_id_data[['other_id','improve_sample_id']].drop_duplicates(), how = 'inner', left_on= "variable", right_on= "other_id")
269-
268+
270269
# clean up column names and data types
271270
mapped_transcriptomics_df = mapped_transcriptomics_df.drop(columns=['stable_id','variable','other_id_x','other_id_y'])
272271
mapped_transcriptomics_df['source'] = "Synapse"
@@ -278,6 +277,46 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
278277
return(mapped_transcriptomics_df)
279278

280279

280+
def map_proteomics(proteomics_data, improve_id_data, entrez_data):
    """Reshape the wide liver proteomics table into long, ID-mapped records.

    Each argument may be either a path readable by ``pd.read_csv`` or an
    already-loaded ``pd.DataFrame``.

    Parameters
    ----------
    proteomics_data : str or pd.DataFrame
        Wide table whose first data row holds the real column names
        (first cell labels the gene column, remaining cells are sample
        names) and whose following rows hold one gene per row.
    improve_id_data : str or pd.DataFrame
        Sample table; must provide ``other_id`` and ``improve_sample_id``.
    entrez_data : str or pd.DataFrame
        Gene table; must provide ``other_id`` and ``entrez_id``.

    Returns
    -------
    pd.DataFrame
        Columns ``entrez_id``, ``proteomics``, ``improve_sample_id``,
        ``source`` and ``study``; one row per (gene, sample) pair that
        could be mapped to both ID tables and has a numeric value.
    """
    # read in data (paths are loaded, DataFrames are used as given)
    if not isinstance(proteomics_data, pd.DataFrame):
        proteomics_data = pd.read_csv(proteomics_data)

    if not isinstance(improve_id_data, pd.DataFrame):
        improve_id_data = pd.read_csv(improve_id_data)

    if not isinstance(entrez_data, pd.DataFrame):
        entrez_data = pd.read_csv(entrez_data)

    # The first data row carries the real header. Build the new labels as a
    # plain list and attach them to a copy, so a DataFrame handed in by the
    # caller is left untouched. The first column is named by position, which
    # also works when its header cell is empty or duplicated.
    header = proteomics_data.iloc[0, :].tolist()
    header[0] = 'gene_symbol'
    wide_prot_df = proteomics_data.iloc[1:].copy()
    wide_prot_df.columns = header

    # melt the df so there is one sample and protein per row; name the
    # output columns explicitly rather than relying on the columns' index name
    long_prot_df = pd.melt(
        wide_prot_df,
        id_vars=['gene_symbol'],
        var_name='sample_name',
        value_name='proteomics',
    )

    # Values were parsed as text because the header shared their columns;
    # convert to numbers. Non-numeric entries become NaN and are dropped below.
    long_prot_df['proteomics'] = pd.to_numeric(long_prot_df['proteomics'], errors='coerce')

    # map gene names to entrez ids
    mapped_proteomics_df = pd.merge(
        long_prot_df,
        entrez_data[['other_id', 'entrez_id']].drop_duplicates(),
        how='inner',
        left_on="gene_symbol",
        right_on="other_id",
    )
    mapped_proteomics_df = mapped_proteomics_df.dropna(subset=['entrez_id'])

    # map sample names to improve sample ids
    mapped_proteomics_df = pd.merge(
        mapped_proteomics_df,
        improve_id_data[['other_id', 'improve_sample_id']].drop_duplicates(),
        how='inner',
        left_on="sample_name",
        right_on="other_id",
    )

    # clean up column names and data types
    mapped_proteomics_df = mapped_proteomics_df.drop(
        columns=['gene_symbol', 'sample_name', 'other_id_x', 'other_id_y']
    )
    mapped_proteomics_df['source'] = "Synapse"
    mapped_proteomics_df['study'] = "liver"
    mapped_proteomics_df = mapped_proteomics_df.dropna()
    mapped_proteomics_df = mapped_proteomics_df.astype({'entrez_id': 'int', 'improve_sample_id': 'int'})
    mapped_proteomics_df = mapped_proteomics_df[['entrez_id', 'proteomics', 'improve_sample_id', 'source', 'study']]

    return mapped_proteomics_df
318+
319+
281320
if __name__ == "__main__":
282321
parser = argparse.ArgumentParser(description='###')
283322

@@ -292,6 +331,7 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
292331
parser.add_argument('-T', '--transcriptomics', action = 'store_true', default=False, help='Generate transcriptomics data')
293332
parser.add_argument('-M', '--mutations', action = 'store_true', default=False, help='Generate mutations data')
294333
parser.add_argument('-C', '--copy_number', action = 'store_true', default=False, help='Generate copy number data')
334+
parser.add_argument('-R', '--proteomics', action = 'store_true', default=False, help='Generate proteomics data')
295335

296336
args = parser.parse_args()
297337

@@ -347,4 +387,16 @@ def map_transcriptomics(transciptomics_data, improve_id_data, entrez_data):
347387
print("Starting copy number data.")
348388
mutation_df = map_copy_number(copy_number_data = "/tmp/raw_copynum_data.csv", improve_id_data = "/tmp/liver_samples.csv", entrez_data = "/tmp/genes.csv")
349389
mutation_df.to_csv("/tmp/liver_copy_number.csv", index=False)
390+
391+
if args.proteomics:
392+
if args.genes is None or args.genes=='':
393+
print("No genes data provided. Exiting script.")
394+
exit()
395+
if args.ids is None or args.ids=='':
396+
print("No samples data provided. Exiting script.")
397+
exit()
398+
else:
399+
print("Starting proteomics data.")
400+
proteomics_df = map_proteomics(proteomics_data = "/tmp/raw_proteomics_data.csv", improve_id_data = "/tmp/liver_samples.csv", entrez_data = "/tmp/genes.csv")
401+
proteomics_df.to_csv("/tmp/liver_proteomics.csv", index=False)
350402

coderbuild/liver/build_omics.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@ set -euo pipefail
44
trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit 1' ERR
55

66
echo "Running 02-omics-liver.py with token, improve_sample_id $2, and genes $1."
7-
python3 02-omics-liver.py --parse --transcriptomics --mutations --copy_number --ids $2 --genes $1 --token $SYNAPSE_AUTH_TOKEN
7+
python3 02-omics-liver.py --parse --transcriptomics --mutations --copy_number --proteomics --ids $2 --genes $1 --token $SYNAPSE_AUTH_TOKEN

schema/expected_files.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,8 @@ datasets:
270270
file: /tmp/liver_copy_number.csv
271271
- target_class: Mutations
272272
file: /tmp/liver_mutations.csv
273+
- target_class: Proteomics
274+
file: /tmp/liver_proteomics.csv
273275
- target_class: Experiments
274276
file: /tmp/liver_experiments.tsv
275277
- target_class: Drug

0 commit comments

Comments
 (0)