doubt.datasets.protein

Protein data set.

This data set is from the UCI data set archive, with the description being the original description verbatim. Some feature names may have been altered, based on the description.

  1"""Protein data set.
  2
  3This data set is from the UCI data set archive, with the description being the original
  4description verbatim. Some feature names may have been altered, based on the
  5description.
  6"""
  7
  8import io
  9
 10import pandas as pd
 11
 12from .dataset import BASE_DATASET_DESCRIPTION, BaseDataset
 13
 14
 15class Protein(BaseDataset):
 16    __doc__ = f"""
 17    This is a data set of Physicochemical Properties of Protein Tertiary Structure. The
 18    data set is taken from CASP 5-9. There are 45730 decoys and size varying from 0 to
 19    21 armstrong.
 20
 21    {BASE_DATASET_DESCRIPTION}
 22
 23    Features:
 24        F1 (float):
 25            Total surface area
 26        F2 (float):
 27            Non polar exposed area
 28        F3 (float):
 29            Fractional area of exposed non polar residue
 30        F4 (float):
 31            Fractional area of exposed non polar part of residue
 32        F5 (float):
 33            Molecular mass weighted exposed area
 34        F6 (float):
 35            Average deviation from standard exposed area of residue
 36        F7 (float):
 37            Euclidean distance
 38        F8 (float):
 39            Secondary structure penalty
 40        F9 (float):
 41            Spacial Distribution constraints (N,K Value)
 42
 43    Targets:
 44        RMSD (float):
 45            Size of the residue
 46
 47    Source:
 48        https://archive.ics.uci.edu/ml/datasets/Physicochemical+Properties+of+Protein+Tertiary+Structure
 49
 50    Examples:
 51        Load in the data set::
 52
 53            >>> dataset = Protein()
 54            >>> dataset.shape
 55            (45730, 10)
 56
 57        Split the data set into features and targets, as NumPy arrays::
 58
 59            >>> X, y = dataset.split()
 60            >>> X.shape, y.shape
 61            ((45730, 9), (45730,))
 62
 63        Perform a train/test split, also outputting NumPy arrays::
 64
 65            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
 66            >>> X_train, X_test, y_train, y_test = train_test_split
 67            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
 68            ((36580, 9), (36580,), (9150, 9), (9150,))
 69
 70        Output the underlying Pandas DataFrame::
 71
 72            >>> df = dataset.to_pandas()
 73            >>> type(df)
 74            <class 'pandas.core.frame.DataFrame'>
 75    """
 76
 77    _url = "https://archive.ics.uci.edu/ml/machine-learning-databases/" "00265/CASP.csv"
 78
 79    _features = range(9)
 80    _targets = [9]
 81
 82    def _prep_data(self, data: bytes) -> pd.DataFrame:
 83        """Prepare the data set.
 84
 85        Args:
 86            data (bytes): The raw data
 87
 88        Returns:
 89            Pandas dataframe: The prepared data
 90        """
 91        # Convert the bytes into a file-like object
 92        csv_file = io.BytesIO(data)
 93
 94        # Load in the dataframe
 95        df = pd.read_csv(csv_file)
 96
 97        # Put the target column at the end
 98        df = df[[f"F{i}" for i in range(1, 10)] + ["RMSD"]]
 99
100        return df
class Protein(doubt.datasets.dataset.BaseDataset):
 16class Protein(BaseDataset):
 17    __doc__ = f"""
 18    This is a data set of Physicochemical Properties of Protein Tertiary Structure. The
 19    data set is taken from CASP 5-9. There are 45730 decoys and size varying from 0 to
 20    21 armstrong.
 21
 22    {BASE_DATASET_DESCRIPTION}
 23
 24    Features:
 25        F1 (float):
 26            Total surface area
 27        F2 (float):
 28            Non polar exposed area
 29        F3 (float):
 30            Fractional area of exposed non polar residue
 31        F4 (float):
 32            Fractional area of exposed non polar part of residue
 33        F5 (float):
 34            Molecular mass weighted exposed area
 35        F6 (float):
 36            Average deviation from standard exposed area of residue
 37        F7 (float):
 38            Euclidean distance
 39        F8 (float):
 40            Secondary structure penalty
 41        F9 (float):
 42            Spacial Distribution constraints (N,K Value)
 43
 44    Targets:
 45        RMSD (float):
 46            Size of the residue
 47
 48    Source:
 49        https://archive.ics.uci.edu/ml/datasets/Physicochemical+Properties+of+Protein+Tertiary+Structure
 50
 51    Examples:
 52        Load in the data set::
 53
 54            >>> dataset = Protein()
 55            >>> dataset.shape
 56            (45730, 10)
 57
 58        Split the data set into features and targets, as NumPy arrays::
 59
 60            >>> X, y = dataset.split()
 61            >>> X.shape, y.shape
 62            ((45730, 9), (45730,))
 63
 64        Perform a train/test split, also outputting NumPy arrays::
 65
 66            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
 67            >>> X_train, X_test, y_train, y_test = train_test_split
 68            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
 69            ((36580, 9), (36580,), (9150, 9), (9150,))
 70
 71        Output the underlying Pandas DataFrame::
 72
 73            >>> df = dataset.to_pandas()
 74            >>> type(df)
 75            <class 'pandas.core.frame.DataFrame'>
 76    """
 77
 78    _url = "https://archive.ics.uci.edu/ml/machine-learning-databases/" "00265/CASP.csv"
 79
 80    _features = range(9)
 81    _targets = [9]
 82
 83    def _prep_data(self, data: bytes) -> pd.DataFrame:
 84        """Prepare the data set.
 85
 86        Args:
 87            data (bytes): The raw data
 88
 89        Returns:
 90            Pandas dataframe: The prepared data
 91        """
 92        # Convert the bytes into a file-like object
 93        csv_file = io.BytesIO(data)
 94
 95        # Load in the dataframe
 96        df = pd.read_csv(csv_file)
 97
 98        # Put the target column at the end
 99        df = df[[f"F{i}" for i in range(1, 10)] + ["RMSD"]]
100
101        return df

This is a data set of Physicochemical Properties of Protein Tertiary Structure. The data set is taken from CASP 5-9. There are 45730 decoys and size varying from 0 to 21 armstrong.

Arguments:
  • cache (str or None, optional): The name of the cache. It will be saved to cache in the current working directory. If None then no cache will be saved. Defaults to '.dataset_cache'.
Attributes:
  • cache (str or None): The name of the cache.
  • shape (tuple of integers): Dimensions of the data set
  • columns (list of strings): List of column names in the data set
Features:

  • F1 (float): Total surface area
  • F2 (float): Non polar exposed area
  • F3 (float): Fractional area of exposed non polar residue
  • F4 (float): Fractional area of exposed non polar part of residue
  • F5 (float): Molecular mass weighted exposed area
  • F6 (float): Average deviation from standard exposed area of residue
  • F7 (float): Euclidean distance
  • F8 (float): Secondary structure penalty
  • F9 (float): Spacial Distribution constraints (N,K Value)

Targets:

RMSD (float): Size of the residue

Source:

https://archive.ics.uci.edu/ml/datasets/Physicochemical+Properties+of+Protein+Tertiary+Structure

Examples:

Load in the data set::

>>> dataset = Protein()
>>> dataset.shape
(45730, 10)

Split the data set into features and targets, as NumPy arrays::

>>> X, y = dataset.split()
>>> X.shape, y.shape
((45730, 9), (45730,))

Perform a train/test split, also outputting NumPy arrays::

>>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
>>> X_train, X_test, y_train, y_test = train_test_split
>>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
((36580, 9), (36580,), (9150, 9), (9150,))

Output the underlying Pandas DataFrame::

>>> df = dataset.to_pandas()
>>> type(df)
<class 'pandas.core.frame.DataFrame'>