doubt.datasets.fish_bioconcentration

Fish bioconcentration data set.

This data set is from the UCI data set archive, with the description being the original description verbatim. Some feature names may have been altered, based on the description.

  1"""Fish bioconcentration data set.
  2
  3This data set is from the UCI data set archive, with the description being the original
  4description verbatim. Some feature names may have been altered, based on the
  5description.
  6"""
  7
  8import io
  9import zipfile
 10
 11import pandas as pd
 12
 13from .dataset import BASE_DATASET_DESCRIPTION, BaseDataset
 14
 15
 16class FishBioconcentration(BaseDataset):
 17    __doc__ = f"""
 18    This dataset contains manually-curated experimental bioconcentration factor (BCF)
 19    for 1058 molecules (continuous values). Each row contains a molecule, identified by
 20    a CAS number, a name (if available), and a SMILES string. Additionally, the KOW
 21    (experimental or predicted) is reported. In this database, you will also find
 22    Extended Connectivity Fingerprints (binary vectors of 1024 bits), to be used as
 23    independent variables to predict the BCF.
 24
 25    {BASE_DATASET_DESCRIPTION}
 26
 27    Features:
 28        logkow (float):
 29            Octanol water paritioning coefficient (experimental or predicted, as
 30            indicated by ``KOW type``
 31        kow_exp (int):
 32            Indicates whether ``logKOW`` is experimental or predicted, with 1 denoting
 33            experimental and 0 denoting predicted
 34        smiles_[idx] for idx = 0..125 (int):
 35            Encoding of SMILES string to identify the 2D molecular structure. The
 36            encoding is as follows, where 'x' is a padding string to ensure that all
 37            the SMILES strings are of the same length:
 38
 39                - 0  = 'x'
 40                - 1  = '#'
 41                - 2  = '('
 42                - 3  = ')'
 43                - 4  = '+'
 44                - 5  = '-'
 45                - 6  = '/'
 46                - 7  = '1'
 47                - 8  = '2'
 48                - 9  = '3'
 49                - 10 = '4'
 50                - 11 = '5'
 51                - 12 = '6'
 52                - 13 = '7'
 53                - 14 = '8'
 54                - 15 = '='
 55                - 16 = '@'
 56                - 17 = 'B'
 57                - 18 = 'C'
 58                - 19 = 'F'
 59                - 20 = 'H'
 60                - 21 = 'I'
 61                - 22 = 'N'
 62                - 23 = 'O'
 63                - 24 = 'P'
 64                - 25 = 'S'
 65                - 26 = '['
 66                - 27 = '\\'
 67                - 28 = ']'
 68                - 29 = 'c'
 69                - 30 = 'i'
 70                - 31 = 'l'
 71                - 32 = 'n'
 72                - 33 = 'o'
 73                - 34 = 'r'
 74                - 35 = 's'
 75
 76    Targets:
 77        logbcf (float):
 78            Experimental fish bioconcentration factor (logarithm form)
 79
 80    Source:
 81        https://archive.ics.uci.edu/ml/datasets/QSAR+fish+bioconcentration+factor+%28BCF%29
 82
 83    Examples:
 84        Load in the data set::
 85
 86            >>> dataset = FishBioconcentration()
 87            >>> dataset.shape
 88            (1054, 129)
 89
 90        Split the data set into features and targets, as NumPy arrays::
 91
 92            >>> X, y = dataset.split()
 93            >>> X.shape, y.shape
 94            ((1054, 128), (1054,))
 95
 96        Perform a train/test split, also outputting NumPy arrays::
 97
 98            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
 99            >>> X_train, X_test, y_train, y_test = train_test_split
100            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
101            ((825, 128), (825,), (229, 128), (229,))
102
103        Output the underlying Pandas DataFrame::
104
105            >>> df = dataset.to_pandas()
106            >>> type(df)
107            <class 'pandas.core.frame.DataFrame'>
108    """
109
110    _url = (
111        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
112        "00511/QSAR_fish_BCF.zip"
113    )
114
115    _features = range(128)
116    _targets = [128]
117
118    def _prep_data(self, data: bytes) -> pd.DataFrame:
119        """Prepare the data set.
120
121        Args:
122            data (bytes): The raw data
123
124        Returns:
125            Pandas dataframe: The prepared data
126        """
127        # Convert the bytes into a file-like object
128        buffer = io.BytesIO(data)
129
130        # Unzip the file and pull out the csv file
131        with zipfile.ZipFile(buffer, "r") as zip_file:
132            csv = zip_file.read("QSAR_BCF_Kow.csv")
133
134        # Convert the string into a file-like object
135        csv_file = io.BytesIO(csv)
136
137        # Read the file-like object into a dataframe
138        cols = ["cas", "name", "smiles", "logkow", "kow_exp", "logbcf"]
139        df = pd.read_csv(
140            csv_file,
141            names=cols,
142            header=0,
143            usecols=[col for col in cols if col not in ["cas", "name"]],
144        )
145
146        # Drop NaNs
147        df = df.dropna().reset_index(drop=True)
148
149        # Encode KOW types
150        kow_types = ["pred", "exp"]
151        df["kow_exp"] = df.kow_exp.map(lambda txt: kow_types.index(txt))
152
153        # Get maximum SMILE string length
154        max_smile = max(len(smile_string) for smile_string in df.smiles)
155
156        # Pad SMILE strings
157        df["smiles"] = [
158            smile_string + "x" * (max_smile - len(smile_string))
159            for smile_string in df.smiles
160        ]
161
162        # Split up the SMILE strings into a matrix
163        smile_df = pd.DataFrame(df.smiles.map(list).values.tolist())
164
165        # Set the column values of the SMILE dataframe
166        smile_df.columns = pd.Index(
167            [f"smiles_{idx}" for idx in range(smile_df.shape[1])]
168        )
169
170        # Add the smile dataframe to the original dataframe
171        df = pd.concat([df, smile_df], axis=1)
172
173        # Drop original SMILE feature
174        df = df.drop(columns="smiles")
175
176        # Put the target variable at the end
177        cols = ["logkow", "kow_exp"]
178        cols += [f"smiles_{idx}" for idx in range(max_smile)]
179        cols += ["logbcf"]
180        df = df[cols]
181
182        # Ensure that the `logkow` column is numeric
183        df["logkow"] = pd.to_numeric(df.logkow)
184
185        return df
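The core of the preparation above is padding every SMILES string with 'x' to a common length and then splitting it into one column per character. Below is a minimal, standalone sketch of just those two steps; the toy SMILES strings are invented purely for illustration and are not taken from the data set::

    import pandas as pd

    # Toy SMILES strings, invented purely for illustration
    df = pd.DataFrame({"smiles": ["CCO", "C=O", "C1CCCCC1"]})

    # Pad every string with 'x' up to the maximum length, mirroring _prep_data
    max_smile = max(len(s) for s in df.smiles)
    df["smiles"] = [s + "x" * (max_smile - len(s)) for s in df.smiles]

    # Split each padded string into one column per character
    smile_df = pd.DataFrame(df.smiles.map(list).values.tolist())
    smile_df.columns = pd.Index(
        [f"smiles_{idx}" for idx in range(smile_df.shape[1])]
    )

    # smile_df now has columns smiles_0 .. smiles_7, one character per cell,
    # with 'x' padding on the right of the shorter strings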
class FishBioconcentration(doubt.datasets.dataset.BaseDataset):

This dataset contains manually-curated experimental bioconcentration factor (BCF) for 1058 molecules (continuous values). Each row contains a molecule, identified by a CAS number, a name (if available), and a SMILES string. Additionally, the KOW (experimental or predicted) is reported. In this database, you will also find Extended Connectivity Fingerprints (binary vectors of 1024 bits), to be used as independent variables to predict the BCF.

Arguments:
  • cache (str or None, optional): The name of the cache. It will be saved to cache in the current working directory. If None then no cache will be saved. Defaults to '.dataset_cache'. A usage sketch follows the attribute list below.
Attributes:
  • cache (str or None): The name of the cache.
  • shape (tuple of integers): Dimensions of the data set
  • columns (list of strings): List of column names in the data set
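
For instance, caching can be disabled entirely by passing None; a minimal sketch based on the cache argument documented above::

    >>> dataset = FishBioconcentration(cache=None)  # no cache file is written
    >>> dataset.shape
    (1054, 129)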
Features:

  • logkow (float): Octanol-water partitioning coefficient (experimental or predicted, as indicated by kow_exp)
  • kow_exp (int): Indicates whether logkow is experimental or predicted, with 1 denoting experimental and 0 denoting predicted
  • smiles_[idx] for idx = 0..125 (int): Encoding of the SMILES string to identify the 2D molecular structure. The encoding is as follows (a decoding sketch is given after this list), where 'x' is a padding character to ensure that all the SMILES strings are of the same length:

    - 0  = 'x'
    - 1  = '#'
    - 2  = '('
    - 3  = ')'
    - 4  = '+'
    - 5  = '-'
    - 6  = '/'
    - 7  = '1'
    - 8  = '2'
    - 9  = '3'
    - 10 = '4'
    - 11 = '5'
    - 12 = '6'
    - 13 = '7'
    - 14 = '8'
    - 15 = '='
    - 16 = '@'
    - 17 = 'B'
    - 18 = 'C'
    - 19 = 'F'
    - 20 = 'H'
    - 21 = 'I'
    - 22 = 'N'
    - 23 = 'O'
    - 24 = 'P'
    - 25 = 'S'
    - 26 = '['
    - 27 = '\'
    - 28 = ']'
    - 29 = 'c'
    - 30 = 'i'
    - 31 = 'l'
    - 32 = 'n'
    - 33 = 'o'
    - 34 = 'r'
    - 35 = 's'
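
Since the smiles_[idx] features hold these integer codes, the table above can be turned directly into a lookup list for moving between codes and characters. A minimal sketch; the decode_smiles helper is illustrative and not part of the package::

    # Code-to-character mapping, copied from the table above
    SMILES_VOCAB = [
        "x", "#", "(", ")", "+", "-", "/", "1", "2", "3", "4", "5", "6",
        "7", "8", "=", "@", "B", "C", "F", "H", "I", "N", "O", "P", "S",
        "[", "\\", "]", "c", "i", "l", "n", "o", "r", "s",
    ]

    def decode_smiles(codes):
        """Map integer codes back to a SMILES string, dropping the 'x' padding."""
        # Illustrative helper, not part of the doubt package
        return "".join(SMILES_VOCAB[int(code)] for code in codes).rstrip("x")

    decode_smiles([18, 18, 23, 0, 0])  # -> 'CCO'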
Targets:

  • logbcf (float): Experimental fish bioconcentration factor (logarithm form)

Source:

https://archive.ics.uci.edu/ml/datasets/QSAR+fish+bioconcentration+factor+%28BCF%29

Examples:

Load in the data set::

>>> dataset = FishBioconcentration()
>>> dataset.shape
(1054, 129)

Split the data set into features and targets, as NumPy arrays::

>>> X, y = dataset.split()
>>> X.shape, y.shape
((1054, 128), (1054,))

Perform a train/test split, also outputting NumPy arrays::

>>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
>>> X_train, X_test, y_train, y_test = train_test_split
>>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
((825, 128), (825,), (229, 128), (229,))

Output the underlying Pandas DataFrame::

>>> df = dataset.to_pandas()
>>> type(df)
<class 'pandas.core.frame.DataFrame'>