doubt.datasets.cpu

CPU data set.

This data set is from the UCI data set archive, with the description being the original description verbatim. Some feature names may have been altered, based on the description.

  1"""CPU data set.
  2
  3This data set is from the UCI data set archive, with the description being the original
  4description verbatim. Some feature names may have been altered, based on the
  5description.
  6"""
  7
  8import io
  9
 10import pandas as pd
 11
 12from .dataset import BASE_DATASET_DESCRIPTION, BaseDataset
 13
 14
 15class CPU(BaseDataset):
 16    __doc__ = f"""
 17    Relative CPU Performance Data, described in terms of its cycle time, memory size,
 18    etc.
 19
 20    {BASE_DATASET_DESCRIPTION}
 21
 22    Features:
 23        vendor_name (string):
 24            Name of the vendor, 30 unique values
 25        model_name (string):
 26            Name of the model
 27        myct (int):
 28            Machine cycle time in nanoseconds
 29        mmin (int):
 30            Minimum main memory in kilobytes
 31        mmax (int):
 32            Maximum main memory in kilobytes
 33        cach (int):
 34            Cache memory in kilobytes
 35        chmin (int):
 36            Minimum channels in units
 37        chmax (int):
 38            Maximum channels in units
 39
 40    Targets:
 41        prp (int):
 42            Published relative performance
 43
 44    Source:
 45        https://archive.ics.uci.edu/ml/datasets/Computer+Hardware
 46
 47    Examples:
 48        Load in the data set::
 49
 50            >>> dataset = CPU()
 51            >>> dataset.shape
 52            (209, 9)
 53
 54        Split the data set into features and targets, as NumPy arrays::
 55
 56            >>> X, y = dataset.split()
 57            >>> X.shape, y.shape
 58            ((209, 8), (209,))
 59
 60        Perform a train/test split, also outputting NumPy arrays::
 61
 62            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
 63            >>> X_train, X_test, y_train, y_test = train_test_split
 64            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
 65            ((162, 8), (162,), (47, 8), (47,))
 66
 67        Output the underlying Pandas DataFrame::
 68
 69            >>> df = dataset.to_pandas()
 70            >>> type(df)
 71            <class 'pandas.core.frame.DataFrame'>
 72    """
 73
 74    _url = (
 75        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
 76        "cpu-performance/machine.data"
 77    )
 78
 79    _features = range(8)
 80    _targets = [8]
 81
 82    def _prep_data(self, data: bytes) -> pd.DataFrame:
 83        """Prepare the data set.
 84
 85        Args:
 86            data (bytes): The raw data
 87
 88        Returns:
 89            Pandas dataframe: The prepared data
 90        """
 91
 92        # Convert the bytes into a file-like object
 93        csv_file = io.BytesIO(data)
 94
 95        # Name the columns
 96        cols = [
 97            "vendor_name",
 98            "model_name",
 99            "myct",
100            "mmin",
101            "mmax",
102            "cach",
103            "chmin",
104            "chmax",
105            "prp",
106        ]
107
108        # Load the file-like object into a data frame
109        df = pd.read_csv(csv_file, header=None, usecols=range(9), names=cols)
110        return df
class CPU(doubt.datasets.dataset.BaseDataset):
 16class CPU(BaseDataset):
 17    __doc__ = f"""
 18    Relative CPU Performance Data, described in terms of its cycle time, memory size,
 19    etc.
 20
 21    {BASE_DATASET_DESCRIPTION}
 22
 23    Features:
 24        vendor_name (string):
 25            Name of the vendor, 30 unique values
 26        model_name (string):
 27            Name of the model
 28        myct (int):
 29            Machine cycle time in nanoseconds
 30        mmin (int):
 31            Minimum main memory in kilobytes
 32        mmax (int):
 33            Maximum main memory in kilobytes
 34        cach (int):
 35            Cache memory in kilobytes
 36        chmin (int):
 37            Minimum channels in units
 38        chmax (int):
 39            Maximum channels in units
 40
 41    Targets:
 42        prp (int):
 43            Published relative performance
 44
 45    Source:
 46        https://archive.ics.uci.edu/ml/datasets/Computer+Hardware
 47
 48    Examples:
 49        Load in the data set::
 50
 51            >>> dataset = CPU()
 52            >>> dataset.shape
 53            (209, 9)
 54
 55        Split the data set into features and targets, as NumPy arrays::
 56
 57            >>> X, y = dataset.split()
 58            >>> X.shape, y.shape
 59            ((209, 8), (209,))
 60
 61        Perform a train/test split, also outputting NumPy arrays::
 62
 63            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
 64            >>> X_train, X_test, y_train, y_test = train_test_split
 65            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
 66            ((162, 8), (162,), (47, 8), (47,))
 67
 68        Output the underlying Pandas DataFrame::
 69
 70            >>> df = dataset.to_pandas()
 71            >>> type(df)
 72            <class 'pandas.core.frame.DataFrame'>
 73    """
 74
 75    _url = (
 76        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
 77        "cpu-performance/machine.data"
 78    )
 79
 80    _features = range(8)
 81    _targets = [8]
 82
 83    def _prep_data(self, data: bytes) -> pd.DataFrame:
 84        """Prepare the data set.
 85
 86        Args:
 87            data (bytes): The raw data
 88
 89        Returns:
 90            Pandas dataframe: The prepared data
 91        """
 92
 93        # Convert the bytes into a file-like object
 94        csv_file = io.BytesIO(data)
 95
 96        # Name the columns
 97        cols = [
 98            "vendor_name",
 99            "model_name",
100            "myct",
101            "mmin",
102            "mmax",
103            "cach",
104            "chmin",
105            "chmax",
106            "prp",
107        ]
108
109        # Load the file-like object into a data frame
110        df = pd.read_csv(csv_file, header=None, usecols=range(9), names=cols)
111        return df

Relative CPU Performance Data, described in terms of its cycle time, memory size, etc.

Arguments:
  • cache (str or None, optional): The name of the cache. The cache will be saved under this name in the current working directory. If None, no cache will be saved. Defaults to '.dataset_cache'.
Attributes:
  • cache (str or None): The name of the cache.
  • shape (tuple of integers): Dimensions of the data set
  • columns (list of strings): List of column names in the data set
Features:

  • vendor_name (string): Name of the vendor, 30 unique values
  • model_name (string): Name of the model
  • myct (int): Machine cycle time in nanoseconds
  • mmin (int): Minimum main memory in kilobytes
  • mmax (int): Maximum main memory in kilobytes
  • cach (int): Cache memory in kilobytes
  • chmin (int): Minimum channels in units
  • chmax (int): Maximum channels in units

Targets:

prp (int): Published relative performance

Source:

https://archive.ics.uci.edu/ml/datasets/Computer+Hardware

Examples:

Load in the data set::

>>> dataset = CPU()
>>> dataset.shape
(209, 9)

Split the data set into features and targets, as NumPy arrays::

>>> X, y = dataset.split()
>>> X.shape, y.shape
((209, 8), (209,))

Perform a train/test split, also outputting NumPy arrays::

>>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
>>> X_train, X_test, y_train, y_test = train_test_split
>>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
((162, 8), (162,), (47, 8), (47,))

Output the underlying Pandas DataFrame::

>>> df = dataset.to_pandas()
>>> type(df)
<class 'pandas.core.frame.DataFrame'>