@@ -39,41 +39,22 @@ class Split:
3939
4040class Dataset :
4141
42- data_format_params = {
43- "samples" : (
44- "improve_sample_id" , "cancer_type" , "model_type" , "common_name" ,
45- "other_id" , "other_names" , "id_source" , "species"
46- ),
47- "transcriptomics" : (
48- "improve_sample_id" , "entrez_id" , "transcriptomics"
49- ),
50- "proteomics" : ("improve_sample_id" , "entrez_id" , "proteomics" ),
51- "mutations" : ("improve_sample_id" , "entrez_id" , "mutation" ),
52- "copy_number" : ("improve_sample_id" , "entrez_id" , "copy_number" ),
53- "methylation" : ("improve_sample_id" , "entrez_id" , "methylation" ),
54- "experiments" : (
55- "improve_sample_id" , "improve_drug_id" , "dose_response_value"
56- ),
57- "drugs" : ("improve_drug_id" , "chem_name" , "isoSMILES" ),
58- "genes" : ("entrez_id" , "gene_symbol" , "other_id" )
59- }
60-
6142 def __init__ (
6243 self ,
63- name : str = None ,
64- transcriptomics : pd .DataFrame = None ,
65- proteomics : pd .DataFrame = None ,
66- mutations : pd .DataFrame = None ,
67- copy_number : pd .DataFrame = None ,
68- samples : pd .DataFrame = None ,
69- drugs : pd .DataFrame = None ,
70- drug_descriptors : pd .DataFrame = None ,
71- mirna : pd .DataFrame = None ,
72- experiments : pd .DataFrame = None ,
73- methylation : pd .DataFrame = None ,
74- metabolomics : pd .DataFrame = None ,
75- genes : pd .DataFrame = None ,
76- combinations : pd .DataFrame = None ,
44+ name : Optional [ str ] = None ,
45+ transcriptomics : Optional [ pd .DataFrame ] = None ,
46+ proteomics : Optional [ pd .DataFrame ] = None ,
47+ mutations : Optional [ pd .DataFrame ] = None ,
48+ copy_number : Optional [ pd .DataFrame ] = None ,
49+ samples : Optional [ pd .DataFrame ] = None ,
50+ drugs : Optional [ pd .DataFrame ] = None ,
51+ drug_descriptors : Optional [ pd .DataFrame ] = None ,
52+ mirna : Optional [ pd .DataFrame ] = None ,
53+ experiments : Optional [ pd .DataFrame ] = None ,
54+ methylation : Optional [ pd .DataFrame ] = None ,
55+ metabolomics : Optional [ pd .DataFrame ] = None ,
56+ genes : Optional [ pd .DataFrame ] = None ,
57+ combinations : Optional [ pd .DataFrame ] = None ,
7758 ):
7859 """
7960 Load datasets of a specific type into predefined attributes of this class instance.
@@ -131,12 +112,6 @@ def __init__(
131112 # getters / setters & deleters
132113 # ----------------------------
133114
134-
135- @property
136- def data_format_params (self ):
137- return self ._data_format_params
138-
139-
140115 @property
141116 def name (self ):
142117 return self ._name
@@ -330,10 +305,10 @@ def format(
330305 'experiments' , 'combinations' , 'drug_descriptor' , 'drugs' ,
331306 'genes' , 'samples' ,
332307 ],
333- use_polars : bool = False ,
308+ remove_na : bool = False ,
334309 ** kwargs : dict ,
335310 ):
336- return format (self , data_type = data_type , use_polars = use_polars , ** kwargs )
311+ return format (self , data_type = data_type , remove_na = remove_na , ** kwargs )
337312
338313
339314 def split_train_other (
@@ -470,6 +445,21 @@ def load(
470445 _description_
471446 """
472447
448+ data_types_to_load = (
449+ 'transcriptomics' ,
450+ 'proteomics' ,
451+ 'mutations' ,
452+ 'copy_number' ,
453+ 'samples' ,
454+ 'drugs' ,
455+ 'drug_descriptors' ,
456+ 'mirna' ,
457+ 'experiments' ,
458+ 'methylation' ,
459+ 'metabolomics' ,
460+ 'genes' ,
461+ )
462+
473463 if type (local_path ) is not Path :
474464 try :
475465 local_path = Path (local_path )
@@ -487,30 +477,63 @@ def load(
487477 dataset = Dataset (name )
488478 accepted_file_endings = ('.csv' , '.tsv' , '.csv.gz' , '.tsv.gz' )
489479 print (f"Importing raw data ..." , file = sys .stderr )
490- for child in local_path .iterdir ():
491- if child .name in ["genes.csv" , "genes.csv.gz" ]:
480+
481+ # generating the file list that contains all files that need to
482+ # be imported based on the Dataset name
483+ files = {}
484+ for p in local_path .glob (f'{ name } _*' ):
485+ if p .name .endswith (accepted_file_endings ) and p .is_file ():
486+ dataset_type = p .name [len (name )+ 1 :].split ('.' )[0 ]
487+ files [dataset_type ] = p
488+ for p in local_path .glob (f'genes*' ):
489+ if p .name .endswith (accepted_file_endings ) and p .is_file ():
490+ files ['genes' ] = p
491+
492+ for dataset_type in data_types_to_load :
493+ if dataset_type not in files :
492494 print (
493- f"Importing 'genes' from { child } ... " ,
494- end = ' ' ,
495+ f"' { dataset_type } ' not available for { name } " ,
496+ end = '\n ' ,
495497 file = sys .stderr
496498 )
497- dataset .genes = _load_file (child )
498- print ("DONE" , file = sys .stderr )
499-
500- if (
501- child .name .startswith (name )
502- and child .name .endswith (accepted_file_endings )
503- ):
504-
505- dataset_type = child .name [len (name )+ 1 :].split ('.' )[0 ]
499+ continue
500+ file = files [dataset_type ]
501+ if dataset_type != 'genes' :
506502 print (
507- f"Importing '{ dataset_type } ' from { child } ..." ,
503+ f"Importing '{ dataset_type } ' from { file } ..." ,
508504 end = ' ' ,
509505 file = sys .stderr
510506 )
511507 if hasattr (dataset , dataset_type ):
512- setattr (dataset , dataset_type , _load_file (child ))
508+ setattr (dataset , dataset_type , _load_file (file ))
513509 print ("DONE" , file = sys .stderr )
510+ else :
511+ '''
512+ The genes dataset available in the online repository is
513+ universal and contains information on genes of all
514+ datasets. To that end it needs to be subsetted to only
515+ those genes that are associate with a specific cancer
516+ dataset.
517+ '''
518+ print (
519+ f"Importing 'genes' from { file } ..." ,
520+ end = ' ' ,
521+ file = sys .stderr
522+ )
523+ dataset .genes = _load_file (file )
524+
525+ entrez_ids = set ()
526+ for dataset_type in ('transcriptomics' , 'proteomics' ,
527+ 'mutations' , 'copy_number' ):
528+ if getattr (dataset , dataset_type ) is not None :
529+ entrez_ids .update (list (
530+ getattr (dataset , dataset_type )['entrez_id' ].unique ()
531+ ))
532+ dataset .genes = dataset .genes [
533+ dataset .genes ['entrez_id' ].isin (entrez_ids )
534+ ]
535+ print ("DONE" , file = sys .stderr )
536+
514537 print (f"Importing raw data ... DONE" , file = sys .stderr )
515538 return dataset
516539
@@ -526,6 +549,7 @@ def load(
526549 dataset = pickle .load (file = file )
527550 print ("DONE" , file = sys .stderr )
528551 return dataset
552+ raise FileNotFoundError ("No suitable pickle file found." )
529553
530554
531555
@@ -536,7 +560,7 @@ def format(
536560 'experiments' , 'combinations' , 'drug_descriptor' , 'drugs' ,
537561 'genes' , 'samples' ,
538562 ],
539- use_polars : bool = False ,
563+ remove_na : bool = False ,
540564 ** kwargs : dict ,
541565 ):
542566
@@ -642,6 +666,8 @@ def format(
642666 columns = 'dose_response_metric' ,
643667 values = 'dose_response_value'
644668 ).reset_index ().rename_axis (None , axis = 1 )
669+ if remove_na :
670+ ret .dropna (axis = 'index' , inplace = True )
645671 elif shape == 'matrix' :
646672 if len (metrics ) > 1 :
647673 raise ValueError (
@@ -654,7 +680,6 @@ def format(
654680 index = 'improve_drug_id' ,
655681 columns = 'improve_sample_id'
656682 )
657- return ret
658683
659684 elif data_type == "combinations" :
660685 raise NotImplementedError (
@@ -771,7 +796,7 @@ def split_train_test_validate(
771796 train , other = _split_two_way (
772797 data = data ,
773798 split_type = split_type ,
774- ratio = [ ratio [0 ], ratio [1 ] + ratio [2 ]] ,
799+ ratio = ( ratio [0 ], ratio [1 ] + ratio [2 ]) ,
775800 stratify_by = stratify_by ,
776801 balance = balance ,
777802 random_state = random_state ,
@@ -781,7 +806,7 @@ def split_train_test_validate(
781806 test , val = _split_two_way (
782807 data = other ,
783808 split_type = split_type ,
784- ratio = [ ratio [1 ], ratio [2 ]] ,
809+ ratio = ( ratio [1 ], ratio [2 ]) ,
785810 stratify_by = stratify_by ,
786811 balance = balance ,
787812 random_state = random_state ,
@@ -993,10 +1018,10 @@ def _filter(data: Dataset, split: pd.DataFrame) -> Dataset:
9931018 return data_ret
9941019
9951020def _balance_data (
996- data : pd .Dataframe ,
1021+ data : pd .DataFrame ,
9971022 random_state : Optional [Union [int ,RandomState ]]= None ,
9981023 # oversample: bool=False,
999- ) -> pd .Dataframe :
1024+ ) -> pd .DataFrame :
10001025 tmp = deepcopy (data )
10011026 counts = tmp .value_counts ('split_class' )
10021027 ret_df = (
@@ -1012,7 +1037,7 @@ def _create_classes(
10121037 metric : str ,
10131038 num_classes : int = 2 ,
10141039 quantiles : bool = True ,
1015- thresh : float = None ,
1040+ thresh : Optional [ float ] = None ,
10161041 ) -> pd .DataFrame :
10171042 """
10181043 Helper function that bins experiment data into a number of defined
@@ -1101,7 +1126,7 @@ def _split_two_way(
11011126 split_type : Literal [
11021127 'mixed-set' , 'drug-blind' , 'cancer-blind'
11031128 ]= 'mixed-set' ,
1104- ratio : tuple [int , int , int ]= (8 ,2 ),
1129+ ratio : tuple [int , int ]= (8 ,2 ),
11051130 balance : bool = False ,
11061131 stratify_by : Optional [str ]= None ,
11071132 random_state : Optional [Union [int ,RandomState ]]= None ,
@@ -1207,7 +1232,8 @@ def _split_two_way(
12071232 columns = 'dose_response_metric' ,
12081233 values = 'dose_response_value'
12091234 ).reset_index ()
1210-
1235+ if stratify_by is not None :
1236+ df_full .dropna (axis = 'index' , subset = [stratify_by ], inplace = True )
12111237 # Defining the split sizes.
12121238 train_size = float (ratio [0 ]) / sum (ratio )
12131239 test_val_size = float (ratio [1 ]) / sum (ratio )
0 commit comments