doubt.datasets.fish_bioconcentration
Fish bioconcentration data set.
This data set is from the UCI data set archive, and the description below is the original description reproduced verbatim. Some feature names may have been altered, based on that description.
"""Fish bioconcentration data set.

This data set is from the UCI data set archive, with the description being the original
description verbatim. Some feature names may have been altered, based on the
description.
"""

import io
import zipfile

import pandas as pd

from .dataset import BASE_DATASET_DESCRIPTION, BaseDataset


class FishBioconcentration(BaseDataset):
    __doc__ = f"""
    This dataset contains manually-curated experimental bioconcentration factor (BCF)
    for 1058 molecules (continuous values). Each row contains a molecule, identified by
    a CAS number, a name (if available), and a SMILES string. Additionally, the KOW
    (experimental or predicted) is reported. In this database, you will also find
    Extended Connectivity Fingerprints (binary vectors of 1024 bits), to be used as
    independent variables to predict the BCF.

    {BASE_DATASET_DESCRIPTION}

    Features:
        logkow (float):
            Octanol water partitioning coefficient (experimental or predicted, as
            indicated by ``kow_exp``)
        kow_exp (int):
            Indicates whether ``logkow`` is experimental or predicted, with 1 denoting
            experimental and 0 denoting predicted
        smiles_[idx] for idx = 0..125 (int):
            Encoding of SMILES string to identify the 2D molecular structure. The
            encoding is as follows, where 'x' is a padding string to ensure that all
            the SMILES strings are of the same length:

            - 0 = 'x'
            - 1 = '#'
            - 2 = '('
            - 3 = ')'
            - 4 = '+'
            - 5 = '-'
            - 6 = '/'
            - 7 = '1'
            - 8 = '2'
            - 9 = '3'
            - 10 = '4'
            - 11 = '5'
            - 12 = '6'
            - 13 = '7'
            - 14 = '8'
            - 15 = '='
            - 16 = '@'
            - 17 = 'B'
            - 18 = 'C'
            - 19 = 'F'
            - 20 = 'H'
            - 21 = 'I'
            - 22 = 'N'
            - 23 = 'O'
            - 24 = 'P'
            - 25 = 'S'
            - 26 = '['
            - 27 = '\\'
            - 28 = ']'
            - 29 = 'c'
            - 30 = 'i'
            - 31 = 'l'
            - 32 = 'n'
            - 33 = 'o'
            - 34 = 'r'
            - 35 = 's'

    Targets:
        logbcf (float):
            Experimental fish bioconcentration factor (logarithm form)

    Source:
        https://archive.ics.uci.edu/ml/datasets/QSAR+fish+bioconcentration+factor+%28BCF%29

    Examples:
        Load in the data set::

            >>> dataset = FishBioconcentration()
            >>> dataset.shape
            (1054, 129)

        Split the data set into features and targets, as NumPy arrays::

            >>> X, y = dataset.split()
            >>> X.shape, y.shape
            ((1054, 128), (1054,))

        Perform a train/test split, also outputting NumPy arrays::

            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
            >>> X_train, X_test, y_train, y_test = train_test_split
            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
            ((825, 128), (825,), (229, 128), (229,))

        Output the underlying Pandas DataFrame::

            >>> df = dataset.to_pandas()
            >>> type(df)
            <class 'pandas.core.frame.DataFrame'>
    """

    # Zipped csv file hosted at the UCI Machine Learning Repository
    _url = (
        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
        "00511/QSAR_fish_BCF.zip"
    )

    # Columns 0-127 are features (logkow, kow_exp and the 126 SMILES character
    # encodings); column 128 is the logbcf target
    _features = range(128)
    _targets = [128]

    def _prep_data(self, data: bytes) -> pd.DataFrame:
        """Prepare the data set.

        Args:
            data (bytes): The raw data

        Returns:
            Pandas dataframe: The prepared data
        """
        # Convert the bytes into a file-like object
        buffer = io.BytesIO(data)

        # Unzip the file and pull out the csv file
        with zipfile.ZipFile(buffer, "r") as zip_file:
            csv = zip_file.read("QSAR_BCF_Kow.csv")

        # Convert the string into a file-like object
        csv_file = io.BytesIO(csv)

        # Read the file-like object into a dataframe, skipping the `cas` and
        # `name` identifier columns, which are not useful as model inputs
        cols = ["cas", "name", "smiles", "logkow", "kow_exp", "logbcf"]
        df = pd.read_csv(
            csv_file,
            names=cols,
            header=0,
            usecols=[col for col in cols if col not in ["cas", "name"]],
        )

        # Drop NaNs
        df = df.dropna().reset_index(drop=True)

        # Encode KOW types: 'pred' -> 0 and 'exp' -> 1
        kow_types = ["pred", "exp"]
        df["kow_exp"] = df.kow_exp.map(lambda txt: kow_types.index(txt))

        # Get maximum SMILES string length
        max_smile = max(len(smile_string) for smile_string in df.smiles)

        # Right-pad all SMILES strings with 'x' so they share the same length
        df["smiles"] = [
            smile_string + "x" * (max_smile - len(smile_string))
            for smile_string in df.smiles
        ]

        # Split up the SMILES strings into a matrix with one character per column
        smile_df = pd.DataFrame(df.smiles.map(list).values.tolist())

        # Set the column values of the SMILES dataframe
        smile_df.columns = pd.Index(
            [f"smiles_{idx}" for idx in range(smile_df.shape[1])]
        )

        # Add the SMILES dataframe to the original dataframe
        df = pd.concat([df, smile_df], axis=1)

        # Drop original SMILES feature
        df = df.drop(columns="smiles")

        # Put the target variable at the end
        cols = ["logkow", "kow_exp"]
        cols += [f"smiles_{idx}" for idx in range(max_smile)]
        cols += ["logbcf"]
        df = df[cols]

        # Ensure that the `logkow` column is numeric
        df["logkow"] = pd.to_numeric(df.logkow)

        return df
class FishBioconcentration(BaseDataset):
    __doc__ = f"""
    This dataset contains manually-curated experimental bioconcentration factor (BCF)
    for 1058 molecules (continuous values). Each row contains a molecule, identified by
    a CAS number, a name (if available), and a SMILES string. Additionally, the KOW
    (experimental or predicted) is reported. In this database, you will also find
    Extended Connectivity Fingerprints (binary vectors of 1024 bits), to be used as
    independent variables to predict the BCF.

    {BASE_DATASET_DESCRIPTION}

    Features:
        logkow (float):
            Octanol water partitioning coefficient (experimental or predicted, as
            indicated by ``kow_exp``)
        kow_exp (int):
            Indicates whether ``logkow`` is experimental or predicted, with 1 denoting
            experimental and 0 denoting predicted
        smiles_[idx] for idx = 0..125 (int):
            Encoding of SMILES string to identify the 2D molecular structure. The
            encoding is as follows, where 'x' is a padding string to ensure that all
            the SMILES strings are of the same length:

            - 0 = 'x'
            - 1 = '#'
            - 2 = '('
            - 3 = ')'
            - 4 = '+'
            - 5 = '-'
            - 6 = '/'
            - 7 = '1'
            - 8 = '2'
            - 9 = '3'
            - 10 = '4'
            - 11 = '5'
            - 12 = '6'
            - 13 = '7'
            - 14 = '8'
            - 15 = '='
            - 16 = '@'
            - 17 = 'B'
            - 18 = 'C'
            - 19 = 'F'
            - 20 = 'H'
            - 21 = 'I'
            - 22 = 'N'
            - 23 = 'O'
            - 24 = 'P'
            - 25 = 'S'
            - 26 = '['
            - 27 = '\\'
            - 28 = ']'
            - 29 = 'c'
            - 30 = 'i'
            - 31 = 'l'
            - 32 = 'n'
            - 33 = 'o'
            - 34 = 'r'
            - 35 = 's'

    Targets:
        logbcf (float):
            Experimental fish bioconcentration factor (logarithm form)

    Source:
        https://archive.ics.uci.edu/ml/datasets/QSAR+fish+bioconcentration+factor+%28BCF%29

    Examples:
        Load in the data set::

            >>> dataset = FishBioconcentration()
            >>> dataset.shape
            (1054, 129)

        Split the data set into features and targets, as NumPy arrays::

            >>> X, y = dataset.split()
            >>> X.shape, y.shape
            ((1054, 128), (1054,))

        Perform a train/test split, also outputting NumPy arrays::

            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
            >>> X_train, X_test, y_train, y_test = train_test_split
            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
            ((825, 128), (825,), (229, 128), (229,))

        Output the underlying Pandas DataFrame::

            >>> df = dataset.to_pandas()
            >>> type(df)
            <class 'pandas.core.frame.DataFrame'>
    """

    # Location of the zipped csv file in the UCI archive
    _url = (
        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
        "00511/QSAR_fish_BCF.zip"
    )

    # Feature columns 0-127; the final column (128) holds the logbcf target
    _features = range(128)
    _targets = [128]

    def _prep_data(self, data: bytes) -> pd.DataFrame:
        """Prepare the data set.

        Args:
            data (bytes): The raw data

        Returns:
            Pandas dataframe: The prepared data
        """
        # Pull the csv payload out of the zipped download
        with zipfile.ZipFile(io.BytesIO(data), "r") as zip_file:
            raw_csv = zip_file.read("QSAR_BCF_Kow.csv")

        # Parse the csv, excluding the `cas` and `name` identifier columns
        all_cols = ["cas", "name", "smiles", "logkow", "kow_exp", "logbcf"]
        df = pd.read_csv(
            io.BytesIO(raw_csv),
            names=all_cols,
            header=0,
            usecols=[col for col in all_cols if col not in ["cas", "name"]],
        )

        # Discard incomplete rows and reindex from zero
        df = df.dropna().reset_index(drop=True)

        # Binary-encode the KOW type: 'pred' becomes 0 and 'exp' becomes 1
        df["kow_exp"] = df.kow_exp.map(["pred", "exp"].index)

        # Right-pad every SMILES string with 'x' to the length of the longest one
        pad_width = max(len(smiles) for smiles in df.smiles)
        df["smiles"] = df.smiles.map(lambda smiles: smiles.ljust(pad_width, "x"))

        # Expand each padded SMILES string into one column per character
        char_matrix = pd.DataFrame([list(smiles) for smiles in df.smiles])
        char_matrix.columns = pd.Index(
            [f"smiles_{idx}" for idx in range(char_matrix.shape[1])]
        )

        # Attach the character columns and discard the raw SMILES strings
        df = pd.concat([df, char_matrix], axis=1).drop(columns="smiles")

        # Reorder the columns so that the target comes last
        ordering = (
            ["logkow", "kow_exp"]
            + [f"smiles_{idx}" for idx in range(pad_width)]
            + ["logbcf"]
        )
        df = df[ordering]

        # Coerce `logkow` to a numeric dtype
        df["logkow"] = pd.to_numeric(df.logkow)

        return df
This dataset contains manually-curated experimental bioconcentration factor (BCF) for 1058 molecules (continuous values). Each row contains a molecule, identified by a CAS number, a name (if available), and a SMILES string. Additionally, the KOW (experimental or predicted) is reported. In this database, you will also find Extended Connectivity Fingerprints (binary vectors of 1024 bits), to be used as independent variables to predict the BCF.
Arguments:
- cache (str or None, optional): The name of the cache. It will be saved to
cache
in the current working directory. If None then no cache will be saved. Defaults to '.dataset_cache'.
Attributes:
- cache (str or None): The name of the cache.
- shape (tuple of integers): Dimensions of the data set
- columns (list of strings): List of column names in the data set
Features:
logkow (float): Octanol water partitioning coefficient (experimental or predicted, as indicated by the KOW type feature, ``kow_exp``)
kow_exp (int): Indicates whether ``logkow`` is experimental or predicted, with 1 denoting experimental and 0 denoting predicted.
smiles_[idx] for idx = 0..125 (int): Encoding of the SMILES string identifying the 2D molecular structure. The encoding is as follows, where 'x' is a padding character ensuring that all the SMILES strings are of the same length:

- 0 = 'x'
- 1 = '#'
- 2 = '('
- 3 = ')'
- 4 = '+'
- 5 = '-'
- 6 = '/'
- 7 = '1'
- 8 = '2'
- 9 = '3'
- 10 = '4'
- 11 = '5'
- 12 = '6'
- 13 = '7'
- 14 = '8'
- 15 = '='
- 16 = '@'
- 17 = 'B'
- 18 = 'C'
- 19 = 'F'
- 20 = 'H'
- 21 = 'I'
- 22 = 'N'
- 23 = 'O'
- 24 = 'P'
- 25 = 'S'
- 26 = '['
- 27 = '\'
- 28 = ']'
- 29 = 'c'
- 30 = 'i'
- 31 = 'l'
- 32 = 'n'
- 33 = 'o'
- 34 = 'r'
- 35 = 's'
Targets:
logbcf (float): Experimental fish bioconcentration factor (logarithm form)
Source:
https://archive.ics.uci.edu/ml/datasets/QSAR+fish+bioconcentration+factor+%28BCF%29
Examples:
Load in the data set::
>>> dataset = FishBioconcentration()
>>> dataset.shape
(1054, 129)
Split the data set into features and targets, as NumPy arrays::
>>> X, y = dataset.split()
>>> X.shape, y.shape
((1054, 128), (1054,))
Perform a train/test split, also outputting NumPy arrays::
>>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
>>> X_train, X_test, y_train, y_test = train_test_split
>>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
((825, 128), (825,), (229, 128), (229,))
Output the underlying Pandas DataFrame::
>>> df = dataset.to_pandas()
>>> type(df)
<class 'pandas.core.frame.DataFrame'>