PNNL-CompBio
diff --git a/coderdata/__init__.py b/coderdata/__init__.py
Lines changed: 11 additions & 1 deletion
diff --git a/coderdata/dataset/dataset.py b/coderdata/dataset/dataset.py
Lines changed: 91 additions & 65 deletions
diff --git a/coderdata/download/downloader.py b/coderdata/download/downloader.py
Lines changed: 33 additions & 14 deletions
diff --git a/coderdata/utils/__init__.py b/coderdata/utils/__init__.py
Lines changed: 16 additions & 1 deletion
@@ -13,4 +13,14 @@
 
 
 from .utils.utils import version
-from .utils.utils import list_datasets
+from .utils.utils import list_datasets
+
+try:
+    import matplotlib
+    import seaborn as sns
+except ModuleNotFoundError:
+    pass
+else:
+    from .utils.stats import summarize_response_metric
+    from .utils.stats import plot_response_metric
+    from .utils.stats import plot_2d_respones_metric
@@ -39,41 +39,22 @@ class Split:
 
 class Dataset:
 
-    data_format_params = {
-        "samples": (
-            "improve_sample_id", "cancer_type", "model_type", "common_name",
-            "other_id", "other_names", "id_source", "species"
-            ),
-        "transcriptomics": (
-            "improve_sample_id", "entrez_id", "transcriptomics"
-            ),
-        "proteomics": ("improve_sample_id", "entrez_id", "proteomics"),
-        "mutations": ("improve_sample_id", "entrez_id", "mutation"),
-        "copy_number": ("improve_sample_id", "entrez_id", "copy_number"),
-        "methylation": ("improve_sample_id", "entrez_id", "methylation"),
-        "experiments": (
-            "improve_sample_id", "improve_drug_id", "dose_response_value"
-            ),
-        "drugs": ("improve_drug_id", "chem_name", "isoSMILES"),
-        "genes": ("entrez_id", "gene_symbol", "other_id")
-        }
-
     def __init__(
             self,
-            name: str=None,
-            transcriptomics: pd.DataFrame=None,
-            proteomics: pd.DataFrame=None,
-            mutations: pd.DataFrame=None,
-            copy_number: pd.DataFrame=None,
-            samples: pd.DataFrame=None,
-            drugs: pd.DataFrame=None,
-            drug_descriptors: pd.DataFrame=None,
-            mirna: pd.DataFrame=None,
-            experiments: pd.DataFrame=None,
-            methylation: pd.DataFrame=None,
-            metabolomics: pd.DataFrame=None,
-            genes: pd.DataFrame=None,
-            combinations: pd.DataFrame=None,
+            name: Optional[str]=None,
+            transcriptomics: Optional[pd.DataFrame]=None,
+            proteomics: Optional[pd.DataFrame]=None,
+            mutations: Optional[pd.DataFrame]=None,
+            copy_number: Optional[pd.DataFrame]=None,
+            samples: Optional[pd.DataFrame]=None,
+            drugs: Optional[pd.DataFrame]=None,
+            drug_descriptors: Optional[pd.DataFrame]=None,
+            mirna: Optional[pd.DataFrame]=None,
+            experiments: Optional[pd.DataFrame]=None,
+            methylation: Optional[pd.DataFrame]=None,
+            metabolomics: Optional[pd.DataFrame]=None,
+            genes: Optional[pd.DataFrame]=None,
+            combinations: Optional[pd.DataFrame]=None,
             ):
         """
         Load datasets of a specific type into predefined attributes of this class instance.
@@ -131,12 +112,6 @@ def __init__(
     # getters / setters & deleters
     # ----------------------------
 
-
-    @property
-    def data_format_params(self):
-        return self._data_format_params
-
-
     @property
     def name(self):
         return self._name
@@ -330,10 +305,10 @@ def format(
                 'experiments', 'combinations', 'drug_descriptor', 'drugs',
                 'genes', 'samples',
                 ],
-            use_polars: bool=False,
+            remove_na: bool=False,
             **kwargs: dict,
             ):
-        return format(self, data_type=data_type, use_polars=use_polars, **kwargs)
+        return format(self, data_type=data_type, remove_na=remove_na, **kwargs)
 
 
     def split_train_other(
@@ -470,6 +445,21 @@ def load(
         _description_
     """
 
+    data_types_to_load = (
+        'transcriptomics',
+        'proteomics',
+        'mutations',
+        'copy_number',
+        'samples',
+        'drugs',
+        'drug_descriptors',
+        'mirna',
+        'experiments',
+        'methylation',
+        'metabolomics',
+        'genes',
+    )
+
     if type(local_path) is not Path:
         try:
             local_path = Path(local_path)
@@ -487,30 +477,63 @@ def load(
         dataset = Dataset(name)
         accepted_file_endings = ('.csv', '.tsv', '.csv.gz', '.tsv.gz')
         print(f"Importing raw data ...", file=sys.stderr)
-        for child in local_path.iterdir():
-            if child.name in ["genes.csv", "genes.csv.gz"]:
+        
+        # generating the file list that contains all files that need to 
+        # be imported based on the Dataset name
+        files = {}
+        for p in local_path.glob(f'{name}_*'):
+            if p.name.endswith(accepted_file_endings) and p.is_file():
+                dataset_type = p.name[len(name)+1:].split('.')[0]
+                files[dataset_type] = p
+        for p in local_path.glob(f'genes*'):
+            if p.name.endswith(accepted_file_endings) and p.is_file():
+                files['genes'] = p
+
+        for dataset_type in data_types_to_load:
+            if dataset_type not in files:
                 print(
-                    f"Importing 'genes' from {child} ...",
-                    end=' ',
+                    f"'{dataset_type}' not available for {name}",
+                    end='\n',
                     file=sys.stderr
                     )
-                dataset.genes = _load_file(child)
-                print("DONE", file=sys.stderr)
-
-            if (
-                child.name.startswith(name)
-                and child.name.endswith(accepted_file_endings)
-                ):
-
-                dataset_type = child.name[len(name)+1:].split('.')[0]
+                continue
+            file = files[dataset_type]
+            if dataset_type != 'genes':
                 print(
-                    f"Importing '{dataset_type}' from {child} ...",
+                    f"Importing '{dataset_type}' from {file} ...",
                     end=' ',
                     file=sys.stderr
                     )
                 if hasattr(dataset, dataset_type):
-                    setattr(dataset, dataset_type, _load_file(child))
+                    setattr(dataset, dataset_type, _load_file(file))
                     print("DONE", file=sys.stderr)
+            else:
+                '''
+                The genes dataset available in the online repository is
+                universal and contains information on genes of all 
+                datasets. To that end it needs to be subsetted to only
+                those genes that are associated with a specific cancer
+                dataset.
+                '''
+                print(
+                    f"Importing 'genes' from {file} ...",
+                    end=' ',
+                    file=sys.stderr
+                    )
+                dataset.genes = _load_file(file)
+
+                entrez_ids = set()
+                for dataset_type in ('transcriptomics', 'proteomics',
+                                     'mutations', 'copy_number'):
+                    if getattr(dataset, dataset_type) is not None:
+                        entrez_ids.update(list(
+                           getattr(dataset, dataset_type)['entrez_id'].unique()
+                        ))                
+                dataset.genes = dataset.genes[
+                    dataset.genes['entrez_id'].isin(entrez_ids)
+                    ]
+                print("DONE", file=sys.stderr)
+
         print(f"Importing raw data ... DONE", file=sys.stderr)
         return dataset
 
@@ -526,6 +549,7 @@ def load(
                     dataset = pickle.load(file=file)
                 print("DONE", file=sys.stderr)
                 return dataset
+        raise FileNotFoundError("No suitable pickle file found.")
 
 
 
@@ -536,7 +560,7 @@ def format(
             'experiments', 'combinations', 'drug_descriptor', 'drugs',
             'genes', 'samples',
             ],
-        use_polars: bool=False,
+        remove_na: bool=False,
         **kwargs: dict,
         ):
 
@@ -642,6 +666,8 @@ def format(
                 columns = 'dose_response_metric',
                 values = 'dose_response_value'
             ).reset_index().rename_axis(None, axis=1)
+            if remove_na:
+                ret.dropna(axis='index', inplace=True)
         elif shape == 'matrix':
             if len(metrics) > 1:
                 raise ValueError(
@@ -654,7 +680,6 @@ def format(
                 index='improve_drug_id',
                 columns='improve_sample_id'
             )
-        return ret
 
     elif data_type == "combinations":
         raise NotImplementedError(
@@ -771,7 +796,7 @@ def split_train_test_validate(
     train, other = _split_two_way(
         data=data,
         split_type=split_type,
-        ratio=[ratio[0], ratio[1] + ratio[2]],
+        ratio=(ratio[0], ratio[1] + ratio[2]),
         stratify_by=stratify_by,
         balance=balance,
         random_state=random_state,
@@ -781,7 +806,7 @@ def split_train_test_validate(
     test, val = _split_two_way(
         data=other,
         split_type=split_type,
-        ratio=[ratio[1], ratio[2]],
+        ratio=(ratio[1], ratio[2]),
         stratify_by=stratify_by,
         balance=balance,
         random_state=random_state,
@@ -993,10 +1018,10 @@ def _filter(data: Dataset, split: pd.DataFrame) -> Dataset:
     return data_ret
 
 def _balance_data(
-        data: pd.Dataframe,
+        data: pd.DataFrame,
         random_state: Optional[Union[int,RandomState]]=None,
         # oversample: bool=False,
-        ) -> pd.Dataframe:
+        ) -> pd.DataFrame:
     tmp = deepcopy(data)
     counts = tmp.value_counts('split_class')
     ret_df = (
@@ -1012,7 +1037,7 @@ def _create_classes(
         metric: str,
         num_classes: int=2,
         quantiles: bool=True,
-        thresh: float=None,
+        thresh: Optional[float]=None,
         ) -> pd.DataFrame:
     """
     Helper function that bins experiment data into a number of defined 
@@ -1101,7 +1126,7 @@ def _split_two_way(
         split_type: Literal[
             'mixed-set', 'drug-blind', 'cancer-blind'
             ]='mixed-set',
-        ratio: tuple[int, int, int]=(8,2),
+        ratio: tuple[int, int]=(8,2),
         balance: bool=False,
         stratify_by: Optional[str]=None,
         random_state: Optional[Union[int,RandomState]]=None,
@@ -1207,7 +1232,8 @@ def _split_two_way(
         columns = 'dose_response_metric',
         values = 'dose_response_value'
     ).reset_index()
-
+    if stratify_by is not None:
+        df_full.dropna(axis='index', subset=[stratify_by], inplace=True)
     # Defining the split sizes. 
     train_size = float(ratio[0]) / sum(ratio)
     test_val_size = float(ratio[1]) / sum(ratio)
 
@@ -1,6 +1,7 @@
 # coderdata/download/downloader.py
 
 from importlib import resources
+from hashlib import md5
 from pathlib import Path
 from os import PathLike
 import os
@@ -86,22 +87,40 @@ def download(
 
     for file_name, file_data in unique_files.items():
         file_info = file_data['file_info']
-        file_url = file_info['download_url']
-
+        file_id = str(file_info['id'])
+        file_url = "https://api.figshare.com/v2/file/download/" + file_id
+        file_md5sum = file_info['supplied_md5']
+        retry_count = 10
         # Download the file
-        with requests.get(file_url, stream=True) as r:
-            r.raise_for_status()
-            if file_name.exists() and not exist_ok:
+        while retry_count > 0:
+            with requests.get(file_url, stream=True) as r:
+                r.raise_for_status()
+                if file_name.exists() and not exist_ok:
+                    warnings.warn(
+                        f"{file_name} already exists. Use argument 'exist_ok=True' "
+                        "to overwrite existing file."
+                        )
+                else:
+                    with open(file_name, 'wb') as f:
+                        for chunk in r.iter_content(chunk_size=8192): 
+                            f.write(chunk)
+            with open(file_name, 'rb') as f:
+                check_md5sum = md5(f.read()).hexdigest()
+            if file_md5sum == check_md5sum:
+                break
+            elif retry_count > 0:
                 warnings.warn(
-                    f"{file_name} already exists. Use argument 'exist_ok=True'"
-                    "to overwrite existing file."
-                    )
-            else:
-                with open(file_name, 'wb') as f:
-                    for chunk in r.iter_content(chunk_size=8192): 
-                        f.write(chunk)
-
-        print(f"Downloaded '{file_url}' to '{file_name}'")
+                    f"{file_name} could not be downloaded successfully. "
+                    f"(expected md5sum: {file_md5sum} - "
+                    f"calculated md5sum: {check_md5sum})... retrying..."
+                )
+                retry_count = retry_count - 1
+        if retry_count == 0:
+            warnings.warn(
+                f"{file_name} could not be downloaded. Try again."
+                )
+        else:
+            print(f"Downloaded '{file_url}' to '{file_name}'")
 
     return
 
@@ -1,2 +1,17 @@
 from .utils import version
-from .utils import list_datasets
+from .utils import list_datasets
+
+try:
+    import matplotlib
+    import seaborn as sns
+except ModuleNotFoundError:
+    import warnings
+    warnings.warn(
+        "package was not available. To use coderdata.utils.stats functions "
+        "please make sure 'matplotlib' & 'seaborn' are available in the "
+        "environment."
+        )
+else:
+    from .stats import summarize_response_metric
+    from .stats import plot_response_metric
+    from .stats import plot_2d_respones_metric