doubt.datasets.superconductivity

Superconductivity data set.

This data set comes from the UCI data set archive, and the description below is the original description, reproduced verbatim. Some feature names may have been altered, based on that description.

  1"""Superconductivity data set.
  2
  3This data set is from the UCI data set archive, with the description being the original
  4description verbatim. Some feature names may have been altered, based on the
  5description.
  6"""
  7
  8import io
  9import zipfile
 10
 11import pandas as pd
 12
 13from .dataset import BASE_DATASET_DESCRIPTION, BaseDataset
 14
 15
 16class Superconductivity(BaseDataset):
 17    __doc__ = f"""
 18    This dataset contains data on 21,263 superconductors and their relevant features.
 19    The goal here is to predict the critical temperature based on the features
 20    extracted.
 21
 22    {BASE_DATASET_DESCRIPTION}
 23
 24    Features:
 25        - number_of_elements (int)
 26        - mean_atomic_mass (float)
 27        - wtd_mean_atomic_mass (float)
 28        - gmean_atomic_mass (float)
 29        - wtd_gmean_atomic_mass (float)
 30        - entropy_atomic_mass (float)
 31        - wtd_entropy_atomic_mass (float)
 32        - range_atomic_mass (float)
 33        - wtd_range_atomic_mass (float)
 34        - std_atomic_mass (float)
 35        - wtd_std_atomic_mass (float)
 36        - mean_fie (float)
 37        - wtd_mean_fie (float)
 38        - gmean_fie (float)
 39        - wtd_gmean_fie (float)
 40        - entropy_fie (float)
 41        - wtd_entropy_fie (float)
 42        - range_fie (float)
 43        - wtd_range_fie (float)
 44        - std_fie (float)
 45        - wtd_std_fie (float)
 46        - mean_atomic_radius (float)
 47        - wtd_mean_atomic_radius (float)
 48        - gmean_atomic_radius (float)
 49        - wtd_gmean_atomic_radius (float)
 50        - entropy_atomic_radius (float)
 51        - wtd_entropy_atomic_radius (float)
 52        - range_atomic_radius (float)
 53        - wtd_range_atomic_radius (float)
 54        - std_atomic_radius (float)
 55        - wtd_std_atomic_radius (float)
 56        - mean_Density (float)
 57        - wtd_mean_Density (float)
 58        - gmean_Density (float)
 59        - wtd_gmean_Density (float)
 60        - entropy_Density (float)
 61        - wtd_entropy_Density (float)
 62        - range_Density (float)
 63        - wtd_range_Density (float)
 64        - std_Density (float)
 65        - wtd_std_Density (float)
 66        - mean_ElectronAffinity (float)
 67        - wtd_mean_ElectronAffinity (float)
 68        - gmean_ElectronAffinity (float)
 69        - wtd_gmean_ElectronAffinity (float)
 70        - entropy_ElectronAffinity (float)
 71        - wtd_entropy_ElectronAffinity (float)
 72        - range_ElectronAffinity (float)
 73        - wtd_range_ElectronAffinity (float)
 74        - std_ElectronAffinity (float)
 75        - wtd_std_ElectronAffinity (float)
 76        - mean_FusionHeat (float)
 77        - wtd_mean_FusionHeat (float)
 78        - gmean_FusionHeat (float)
 79        - wtd_gmean_FusionHeat (float)
 80        - entropy_FusionHeat (float)
 81        - wtd_entropy_FusionHeat (float)
 82        - range_FusionHeat (float)
 83        - wtd_range_FusionHeat (float)
 84        - std_FusionHeat (float)
 85        - wtd_std_FusionHeat (float)
 86        - mean_ThermalConductivity (float)
 87        - wtd_mean_ThermalConductivity (float)
 88        - gmean_ThermalConductivity (float)
 89        - wtd_gmean_ThermalConductivity (float)
 90        - entropy_ThermalConductivity (float)
 91        - wtd_entropy_ThermalConductivity (float)
 92        - range_ThermalConductivity (float)
 93        - wtd_range_ThermalConductivity (float)
 94        - std_ThermalConductivity (float)
 95        - wtd_std_ThermalConductivity (float)
 96        - mean_Valence (float)
 97        - wtd_mean_Valence (float)
 98        - gmean_Valence (float)
 99        - wtd_gmean_Valence (float)
100        - entropy_Valence (float)
101        - wtd_entropy_Valence (float)
102        - range_Valence (float)
103        - wtd_range_Valence (float)
104        - std_Valence (float)
105        - wtd_std_Valence (float)
106
107    Targets:
108        - critical_temp (float)
109
110    Source:
111        https://archive.ics.uci.edu/ml/datasets/Superconductivty+Data
112
113    Examples:
114        Load in the data set::
115
116            >>> dataset = Superconductivity()
117            >>> dataset.shape
118            (21263, 82)
119
120        Split the data set into features and targets, as NumPy arrays::
121
122            >>> X, y = dataset.split()
123            >>> X.shape, y.shape
124            ((21263, 81), (21263,))
125
126        Perform a train/test split, also outputting NumPy arrays::
127
128            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
129            >>> X_train, X_test, y_train, y_test = train_test_split
130            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
131            ((17004, 81), (17004,), (4259, 81), (4259,))
132
133        Output the underlying Pandas DataFrame::
134
135            >>> df = dataset.to_pandas()
136            >>> type(df)
137            <class 'pandas.core.frame.DataFrame'>
138    """
139
140    _url = (
141        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
142        "00464/superconduct.zip"
143    )
144
145    _features = range(81)
146    _targets = [81]
147
148    def _prep_data(self, data: bytes) -> pd.DataFrame:
149        """Prepare the data set.
150
151        Args:
152            data (bytes): The raw data
153
154        Returns:
155            Pandas dataframe: The prepared data
156        """
157        # Convert the bytes into a file-like object
158        buffer = io.BytesIO(data)
159
160        # Unzip the file and pull out the text
161        with zipfile.ZipFile(buffer, "r") as zip_file:
162            txt = zip_file.read("train.csv")
163
164        # Convert text to csv file
165        csv_file = io.BytesIO(txt)
166
167        # Load the csv file into a dataframe
168        df = pd.read_csv(csv_file)
169
170        return df
class Superconductivity(doubt.datasets.dataset.BaseDataset):
 17class Superconductivity(BaseDataset):
 18    __doc__ = f"""
 19    This dataset contains data on 21,263 superconductors and their relevant features.
 20    The goal here is to predict the critical temperature based on the features
 21    extracted.
 22
 23    {BASE_DATASET_DESCRIPTION}
 24
 25    Features:
 26        - number_of_elements (int)
 27        - mean_atomic_mass (float)
 28        - wtd_mean_atomic_mass (float)
 29        - gmean_atomic_mass (float)
 30        - wtd_gmean_atomic_mass (float)
 31        - entropy_atomic_mass (float)
 32        - wtd_entropy_atomic_mass (float)
 33        - range_atomic_mass (float)
 34        - wtd_range_atomic_mass (float)
 35        - std_atomic_mass (float)
 36        - wtd_std_atomic_mass (float)
 37        - mean_fie (float)
 38        - wtd_mean_fie (float)
 39        - gmean_fie (float)
 40        - wtd_gmean_fie (float)
 41        - entropy_fie (float)
 42        - wtd_entropy_fie (float)
 43        - range_fie (float)
 44        - wtd_range_fie (float)
 45        - std_fie (float)
 46        - wtd_std_fie (float)
 47        - mean_atomic_radius (float)
 48        - wtd_mean_atomic_radius (float)
 49        - gmean_atomic_radius (float)
 50        - wtd_gmean_atomic_radius (float)
 51        - entropy_atomic_radius (float)
 52        - wtd_entropy_atomic_radius (float)
 53        - range_atomic_radius (float)
 54        - wtd_range_atomic_radius (float)
 55        - std_atomic_radius (float)
 56        - wtd_std_atomic_radius (float)
 57        - mean_Density (float)
 58        - wtd_mean_Density (float)
 59        - gmean_Density (float)
 60        - wtd_gmean_Density (float)
 61        - entropy_Density (float)
 62        - wtd_entropy_Density (float)
 63        - range_Density (float)
 64        - wtd_range_Density (float)
 65        - std_Density (float)
 66        - wtd_std_Density (float)
 67        - mean_ElectronAffinity (float)
 68        - wtd_mean_ElectronAffinity (float)
 69        - gmean_ElectronAffinity (float)
 70        - wtd_gmean_ElectronAffinity (float)
 71        - entropy_ElectronAffinity (float)
 72        - wtd_entropy_ElectronAffinity (float)
 73        - range_ElectronAffinity (float)
 74        - wtd_range_ElectronAffinity (float)
 75        - std_ElectronAffinity (float)
 76        - wtd_std_ElectronAffinity (float)
 77        - mean_FusionHeat (float)
 78        - wtd_mean_FusionHeat (float)
 79        - gmean_FusionHeat (float)
 80        - wtd_gmean_FusionHeat (float)
 81        - entropy_FusionHeat (float)
 82        - wtd_entropy_FusionHeat (float)
 83        - range_FusionHeat (float)
 84        - wtd_range_FusionHeat (float)
 85        - std_FusionHeat (float)
 86        - wtd_std_FusionHeat (float)
 87        - mean_ThermalConductivity (float)
 88        - wtd_mean_ThermalConductivity (float)
 89        - gmean_ThermalConductivity (float)
 90        - wtd_gmean_ThermalConductivity (float)
 91        - entropy_ThermalConductivity (float)
 92        - wtd_entropy_ThermalConductivity (float)
 93        - range_ThermalConductivity (float)
 94        - wtd_range_ThermalConductivity (float)
 95        - std_ThermalConductivity (float)
 96        - wtd_std_ThermalConductivity (float)
 97        - mean_Valence (float)
 98        - wtd_mean_Valence (float)
 99        - gmean_Valence (float)
100        - wtd_gmean_Valence (float)
101        - entropy_Valence (float)
102        - wtd_entropy_Valence (float)
103        - range_Valence (float)
104        - wtd_range_Valence (float)
105        - std_Valence (float)
106        - wtd_std_Valence (float)
107
108    Targets:
109        - critical_temp (float)
110
111    Source:
112        https://archive.ics.uci.edu/ml/datasets/Superconductivty+Data
113
114    Examples:
115        Load in the data set::
116
117            >>> dataset = Superconductivity()
118            >>> dataset.shape
119            (21263, 82)
120
121        Split the data set into features and targets, as NumPy arrays::
122
123            >>> X, y = dataset.split()
124            >>> X.shape, y.shape
125            ((21263, 81), (21263,))
126
127        Perform a train/test split, also outputting NumPy arrays::
128
129            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
130            >>> X_train, X_test, y_train, y_test = train_test_split
131            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
132            ((17004, 81), (17004,), (4259, 81), (4259,))
133
134        Output the underlying Pandas DataFrame::
135
136            >>> df = dataset.to_pandas()
137            >>> type(df)
138            <class 'pandas.core.frame.DataFrame'>
139    """
140
141    _url = (
142        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
143        "00464/superconduct.zip"
144    )
145
146    _features = range(81)
147    _targets = [81]
148
149    def _prep_data(self, data: bytes) -> pd.DataFrame:
150        """Prepare the data set.
151
152        Args:
153            data (bytes): The raw data
154
155        Returns:
156            Pandas dataframe: The prepared data
157        """
158        # Convert the bytes into a file-like object
159        buffer = io.BytesIO(data)
160
161        # Unzip the file and pull out the text
162        with zipfile.ZipFile(buffer, "r") as zip_file:
163            txt = zip_file.read("train.csv")
164
165        # Convert text to csv file
166        csv_file = io.BytesIO(txt)
167
168        # Load the csv file into a dataframe
169        df = pd.read_csv(csv_file)
170
171        return df

This dataset contains data on 21,263 superconductors and their relevant features. The goal here is to predict the critical temperature based on the features extracted.

Arguments:
  • cache (str or None, optional): The name of the cache. The cache is saved under this name in the current working directory. If None, no cache is saved. Defaults to '.dataset_cache'.
Attributes:
  • cache (str or None): The name of the cache.
  • shape (tuple of integers): Dimensions of the data set
  • columns (list of strings): List of column names in the data set
Features:
  • number_of_elements (int)
  • mean_atomic_mass (float)
  • wtd_mean_atomic_mass (float)
  • gmean_atomic_mass (float)
  • wtd_gmean_atomic_mass (float)
  • entropy_atomic_mass (float)
  • wtd_entropy_atomic_mass (float)
  • range_atomic_mass (float)
  • wtd_range_atomic_mass (float)
  • std_atomic_mass (float)
  • wtd_std_atomic_mass (float)
  • mean_fie (float)
  • wtd_mean_fie (float)
  • gmean_fie (float)
  • wtd_gmean_fie (float)
  • entropy_fie (float)
  • wtd_entropy_fie (float)
  • range_fie (float)
  • wtd_range_fie (float)
  • std_fie (float)
  • wtd_std_fie (float)
  • mean_atomic_radius (float)
  • wtd_mean_atomic_radius (float)
  • gmean_atomic_radius (float)
  • wtd_gmean_atomic_radius (float)
  • entropy_atomic_radius (float)
  • wtd_entropy_atomic_radius (float)
  • range_atomic_radius (float)
  • wtd_range_atomic_radius (float)
  • std_atomic_radius (float)
  • wtd_std_atomic_radius (float)
  • mean_Density (float)
  • wtd_mean_Density (float)
  • gmean_Density (float)
  • wtd_gmean_Density (float)
  • entropy_Density (float)
  • wtd_entropy_Density (float)
  • range_Density (float)
  • wtd_range_Density (float)
  • std_Density (float)
  • wtd_std_Density (float)
  • mean_ElectronAffinity (float)
  • wtd_mean_ElectronAffinity (float)
  • gmean_ElectronAffinity (float)
  • wtd_gmean_ElectronAffinity (float)
  • entropy_ElectronAffinity (float)
  • wtd_entropy_ElectronAffinity (float)
  • range_ElectronAffinity (float)
  • wtd_range_ElectronAffinity (float)
  • std_ElectronAffinity (float)
  • wtd_std_ElectronAffinity (float)
  • mean_FusionHeat (float)
  • wtd_mean_FusionHeat (float)
  • gmean_FusionHeat (float)
  • wtd_gmean_FusionHeat (float)
  • entropy_FusionHeat (float)
  • wtd_entropy_FusionHeat (float)
  • range_FusionHeat (float)
  • wtd_range_FusionHeat (float)
  • std_FusionHeat (float)
  • wtd_std_FusionHeat (float)
  • mean_ThermalConductivity (float)
  • wtd_mean_ThermalConductivity (float)
  • gmean_ThermalConductivity (float)
  • wtd_gmean_ThermalConductivity (float)
  • entropy_ThermalConductivity (float)
  • wtd_entropy_ThermalConductivity (float)
  • range_ThermalConductivity (float)
  • wtd_range_ThermalConductivity (float)
  • std_ThermalConductivity (float)
  • wtd_std_ThermalConductivity (float)
  • mean_Valence (float)
  • wtd_mean_Valence (float)
  • gmean_Valence (float)
  • wtd_gmean_Valence (float)
  • entropy_Valence (float)
  • wtd_entropy_Valence (float)
  • range_Valence (float)
  • wtd_range_Valence (float)
  • std_Valence (float)
  • wtd_std_Valence (float)
Targets:
  • critical_temp (float)
Source:

https://archive.ics.uci.edu/ml/datasets/Superconductivty+Data

Examples:

Load in the data set:

>>> dataset = Superconductivity()
>>> dataset.shape
(21263, 82)

Split the data set into features and targets, as NumPy arrays:

>>> X, y = dataset.split()
>>> X.shape, y.shape
((21263, 81), (21263,))

Perform a train/test split, also outputting NumPy arrays:

>>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
>>> X_train, X_test, y_train, y_test = train_test_split
>>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
((17004, 81), (17004,), (4259, 81), (4259,))

Output the underlying Pandas DataFrame:

>>> df = dataset.to_pandas()
>>> type(df)
<class 'pandas.core.frame.DataFrame'>