doubt.datasets.superconductivity
Superconductivity data set.
This data set is from the UCI data set archive, with the description being the original description verbatim. Some feature names may have been altered, based on the description.
"""Superconductivity data set.

This data set is from the UCI data set archive, with the description being the original
description verbatim. Some feature names may have been altered, based on the
description.
"""

import io
import zipfile

import pandas as pd

from .dataset import BASE_DATASET_DESCRIPTION, BaseDataset


class Superconductivity(BaseDataset):
    __doc__ = f"""
    This dataset contains data on 21,263 superconductors and their relevant features.
    The goal here is to predict the critical temperature based on the features
    extracted.

    {BASE_DATASET_DESCRIPTION}

    Features:
        - number_of_elements (int)
        - mean_atomic_mass (float)
        - wtd_mean_atomic_mass (float)
        - gmean_atomic_mass (float)
        - wtd_gmean_atomic_mass (float)
        - entropy_atomic_mass (float)
        - wtd_entropy_atomic_mass (float)
        - range_atomic_mass (float)
        - wtd_range_atomic_mass (float)
        - std_atomic_mass (float)
        - wtd_std_atomic_mass (float)
        - mean_fie (float)
        - wtd_mean_fie (float)
        - gmean_fie (float)
        - wtd_gmean_fie (float)
        - entropy_fie (float)
        - wtd_entropy_fie (float)
        - range_fie (float)
        - wtd_range_fie (float)
        - std_fie (float)
        - wtd_std_fie (float)
        - mean_atomic_radius (float)
        - wtd_mean_atomic_radius (float)
        - gmean_atomic_radius (float)
        - wtd_gmean_atomic_radius (float)
        - entropy_atomic_radius (float)
        - wtd_entropy_atomic_radius (float)
        - range_atomic_radius (float)
        - wtd_range_atomic_radius (float)
        - std_atomic_radius (float)
        - wtd_std_atomic_radius (float)
        - mean_Density (float)
        - wtd_mean_Density (float)
        - gmean_Density (float)
        - wtd_gmean_Density (float)
        - entropy_Density (float)
        - wtd_entropy_Density (float)
        - range_Density (float)
        - wtd_range_Density (float)
        - std_Density (float)
        - wtd_std_Density (float)
        - mean_ElectronAffinity (float)
        - wtd_mean_ElectronAffinity (float)
        - gmean_ElectronAffinity (float)
        - wtd_gmean_ElectronAffinity (float)
        - entropy_ElectronAffinity (float)
        - wtd_entropy_ElectronAffinity (float)
        - range_ElectronAffinity (float)
        - wtd_range_ElectronAffinity (float)
        - std_ElectronAffinity (float)
        - wtd_std_ElectronAffinity (float)
        - mean_FusionHeat (float)
        - wtd_mean_FusionHeat (float)
        - gmean_FusionHeat (float)
        - wtd_gmean_FusionHeat (float)
        - entropy_FusionHeat (float)
        - wtd_entropy_FusionHeat (float)
        - range_FusionHeat (float)
        - wtd_range_FusionHeat (float)
        - std_FusionHeat (float)
        - wtd_std_FusionHeat (float)
        - mean_ThermalConductivity (float)
        - wtd_mean_ThermalConductivity (float)
        - gmean_ThermalConductivity (float)
        - wtd_gmean_ThermalConductivity (float)
        - entropy_ThermalConductivity (float)
        - wtd_entropy_ThermalConductivity (float)
        - range_ThermalConductivity (float)
        - wtd_range_ThermalConductivity (float)
        - std_ThermalConductivity (float)
        - wtd_std_ThermalConductivity (float)
        - mean_Valence (float)
        - wtd_mean_Valence (float)
        - gmean_Valence (float)
        - wtd_gmean_Valence (float)
        - entropy_Valence (float)
        - wtd_entropy_Valence (float)
        - range_Valence (float)
        - wtd_range_Valence (float)
        - std_Valence (float)
        - wtd_std_Valence (float)

    Targets:
        - critical_temp (float)

    Source:
        https://archive.ics.uci.edu/ml/datasets/Superconductivty+Data

    Examples:
        Load in the data set::

            >>> dataset = Superconductivity()
            >>> dataset.shape
            (21263, 82)

        Split the data set into features and targets, as NumPy arrays::

            >>> X, y = dataset.split()
            >>> X.shape, y.shape
            ((21263, 81), (21263,))

        Perform a train/test split, also outputting NumPy arrays::

            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
            >>> X_train, X_test, y_train, y_test = train_test_split
            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
            ((17004, 81), (17004,), (4259, 81), (4259,))

        Output the underlying Pandas DataFrame::

            >>> df = dataset.to_pandas()
            >>> type(df)
            <class 'pandas.core.frame.DataFrame'>
    """

    # Location of the zipped data set that is handed to `_prep_data` as bytes
    # (the download itself presumably happens in BaseDataset; confirm there)
    _url = (
        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
        "00464/superconduct.zip"
    )

    # Presumably positional column indices consumed by BaseDataset: 81 feature
    # columns followed by a single target column, matching the shapes quoted
    # in the examples above. TODO confirm against BaseDataset.
    _features = range(81)
    _targets = [81]

    def _prep_data(self, data: bytes) -> pd.DataFrame:
        """Turn the raw zip archive into a dataframe.

        Args:
            data (bytes): The raw data

        Returns:
            Pandas dataframe: The prepared data
        """
        # zipfile needs a seekable file-like object, so wrap the raw bytes
        with zipfile.ZipFile(io.BytesIO(data), "r") as archive:
            # `read` returns the member's content as bytes, not text
            csv_bytes = archive.read("train.csv")

        # Parse the CSV content through an in-memory binary buffer
        return pd.read_csv(io.BytesIO(csv_bytes))
class Superconductivity(BaseDataset):
    __doc__ = f"""
    This dataset contains data on 21,263 superconductors and their relevant features.
    The goal here is to predict the critical temperature based on the features
    extracted.

    {BASE_DATASET_DESCRIPTION}

    Features:
        - number_of_elements (int)
        - mean_atomic_mass (float)
        - wtd_mean_atomic_mass (float)
        - gmean_atomic_mass (float)
        - wtd_gmean_atomic_mass (float)
        - entropy_atomic_mass (float)
        - wtd_entropy_atomic_mass (float)
        - range_atomic_mass (float)
        - wtd_range_atomic_mass (float)
        - std_atomic_mass (float)
        - wtd_std_atomic_mass (float)
        - mean_fie (float)
        - wtd_mean_fie (float)
        - gmean_fie (float)
        - wtd_gmean_fie (float)
        - entropy_fie (float)
        - wtd_entropy_fie (float)
        - range_fie (float)
        - wtd_range_fie (float)
        - std_fie (float)
        - wtd_std_fie (float)
        - mean_atomic_radius (float)
        - wtd_mean_atomic_radius (float)
        - gmean_atomic_radius (float)
        - wtd_gmean_atomic_radius (float)
        - entropy_atomic_radius (float)
        - wtd_entropy_atomic_radius (float)
        - range_atomic_radius (float)
        - wtd_range_atomic_radius (float)
        - std_atomic_radius (float)
        - wtd_std_atomic_radius (float)
        - mean_Density (float)
        - wtd_mean_Density (float)
        - gmean_Density (float)
        - wtd_gmean_Density (float)
        - entropy_Density (float)
        - wtd_entropy_Density (float)
        - range_Density (float)
        - wtd_range_Density (float)
        - std_Density (float)
        - wtd_std_Density (float)
        - mean_ElectronAffinity (float)
        - wtd_mean_ElectronAffinity (float)
        - gmean_ElectronAffinity (float)
        - wtd_gmean_ElectronAffinity (float)
        - entropy_ElectronAffinity (float)
        - wtd_entropy_ElectronAffinity (float)
        - range_ElectronAffinity (float)
        - wtd_range_ElectronAffinity (float)
        - std_ElectronAffinity (float)
        - wtd_std_ElectronAffinity (float)
        - mean_FusionHeat (float)
        - wtd_mean_FusionHeat (float)
        - gmean_FusionHeat (float)
        - wtd_gmean_FusionHeat (float)
        - entropy_FusionHeat (float)
        - wtd_entropy_FusionHeat (float)
        - range_FusionHeat (float)
        - wtd_range_FusionHeat (float)
        - std_FusionHeat (float)
        - wtd_std_FusionHeat (float)
        - mean_ThermalConductivity (float)
        - wtd_mean_ThermalConductivity (float)
        - gmean_ThermalConductivity (float)
        - wtd_gmean_ThermalConductivity (float)
        - entropy_ThermalConductivity (float)
        - wtd_entropy_ThermalConductivity (float)
        - range_ThermalConductivity (float)
        - wtd_range_ThermalConductivity (float)
        - std_ThermalConductivity (float)
        - wtd_std_ThermalConductivity (float)
        - mean_Valence (float)
        - wtd_mean_Valence (float)
        - gmean_Valence (float)
        - wtd_gmean_Valence (float)
        - entropy_Valence (float)
        - wtd_entropy_Valence (float)
        - range_Valence (float)
        - wtd_range_Valence (float)
        - std_Valence (float)
        - wtd_std_Valence (float)

    Targets:
        - critical_temp (float)

    Source:
        https://archive.ics.uci.edu/ml/datasets/Superconductivty+Data

    Examples:
        Load in the data set::

            >>> dataset = Superconductivity()
            >>> dataset.shape
            (21263, 82)

        Split the data set into features and targets, as NumPy arrays::

            >>> X, y = dataset.split()
            >>> X.shape, y.shape
            ((21263, 81), (21263,))

        Perform a train/test split, also outputting NumPy arrays::

            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
            >>> X_train, X_test, y_train, y_test = train_test_split
            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
            ((17004, 81), (17004,), (4259, 81), (4259,))

        Output the underlying Pandas DataFrame::

            >>> df = dataset.to_pandas()
            >>> type(df)
            <class 'pandas.core.frame.DataFrame'>
    """

    # Location of the zipped data set that is handed to `_prep_data` as bytes
    # (the download itself presumably happens in BaseDataset; confirm there)
    _url = (
        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
        "00464/superconduct.zip"
    )

    # Presumably positional column indices consumed by BaseDataset: 81 feature
    # columns followed by a single target column, matching the shapes quoted
    # in the examples above. TODO confirm against BaseDataset.
    _features = range(81)
    _targets = [81]

    def _prep_data(self, data: bytes) -> pd.DataFrame:
        """Turn the raw zip archive into a dataframe.

        Args:
            data (bytes): The raw data

        Returns:
            Pandas dataframe: The prepared data
        """
        # zipfile needs a seekable file-like object, so wrap the raw bytes
        with zipfile.ZipFile(io.BytesIO(data), "r") as archive:
            # `read` returns the member's content as bytes, not text
            csv_bytes = archive.read("train.csv")

        # Parse the CSV content through an in-memory binary buffer
        return pd.read_csv(io.BytesIO(csv_bytes))
This dataset contains data on 21,263 superconductors and their relevant features. The goal here is to predict the critical temperature based on the features extracted.
Arguments:
- cache (str or None, optional): The name of the cache. It will be saved to `cache` in the current working directory. If None then no cache will be saved. Defaults to '.dataset_cache'.
Attributes:
- cache (str or None): The name of the cache.
- shape (tuple of integers): Dimensions of the data set
- columns (list of strings): List of column names in the data set
Features:
- number_of_elements (int)
- mean_atomic_mass (float)
- wtd_mean_atomic_mass (float)
- gmean_atomic_mass (float)
- wtd_gmean_atomic_mass (float)
- entropy_atomic_mass (float)
- wtd_entropy_atomic_mass (float)
- range_atomic_mass (float)
- wtd_range_atomic_mass (float)
- std_atomic_mass (float)
- wtd_std_atomic_mass (float)
- mean_fie (float)
- wtd_mean_fie (float)
- gmean_fie (float)
- wtd_gmean_fie (float)
- entropy_fie (float)
- wtd_entropy_fie (float)
- range_fie (float)
- wtd_range_fie (float)
- std_fie (float)
- wtd_std_fie (float)
- mean_atomic_radius (float)
- wtd_mean_atomic_radius (float)
- gmean_atomic_radius (float)
- wtd_gmean_atomic_radius (float)
- entropy_atomic_radius (float)
- wtd_entropy_atomic_radius (float)
- range_atomic_radius (float)
- wtd_range_atomic_radius (float)
- std_atomic_radius (float)
- wtd_std_atomic_radius (float)
- mean_Density (float)
- wtd_mean_Density (float)
- gmean_Density (float)
- wtd_gmean_Density (float)
- entropy_Density (float)
- wtd_entropy_Density (float)
- range_Density (float)
- wtd_range_Density (float)
- std_Density (float)
- wtd_std_Density (float)
- mean_ElectronAffinity (float)
- wtd_mean_ElectronAffinity (float)
- gmean_ElectronAffinity (float)
- wtd_gmean_ElectronAffinity (float)
- entropy_ElectronAffinity (float)
- wtd_entropy_ElectronAffinity (float)
- range_ElectronAffinity (float)
- wtd_range_ElectronAffinity (float)
- std_ElectronAffinity (float)
- wtd_std_ElectronAffinity (float)
- mean_FusionHeat (float)
- wtd_mean_FusionHeat (float)
- gmean_FusionHeat (float)
- wtd_gmean_FusionHeat (float)
- entropy_FusionHeat (float)
- wtd_entropy_FusionHeat (float)
- range_FusionHeat (float)
- wtd_range_FusionHeat (float)
- std_FusionHeat (float)
- wtd_std_FusionHeat (float)
- mean_ThermalConductivity (float)
- wtd_mean_ThermalConductivity (float)
- gmean_ThermalConductivity (float)
- wtd_gmean_ThermalConductivity (float)
- entropy_ThermalConductivity (float)
- wtd_entropy_ThermalConductivity (float)
- range_ThermalConductivity (float)
- wtd_range_ThermalConductivity (float)
- std_ThermalConductivity (float)
- wtd_std_ThermalConductivity (float)
- mean_Valence (float)
- wtd_mean_Valence (float)
- gmean_Valence (float)
- wtd_gmean_Valence (float)
- entropy_Valence (float)
- wtd_entropy_Valence (float)
- range_Valence (float)
- wtd_range_Valence (float)
- std_Valence (float)
- wtd_std_Valence (float)
Targets:
- critical_temp (float)
Source:
https://archive.ics.uci.edu/ml/datasets/Superconductivty+Data
Examples:
Load in the data set::
>>> dataset = Superconductivity()
>>> dataset.shape
(21263, 82)
Split the data set into features and targets, as NumPy arrays::
>>> X, y = dataset.split()
>>> X.shape, y.shape
((21263, 81), (21263,))
Perform a train/test split, also outputting NumPy arrays::
>>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
>>> X_train, X_test, y_train, y_test = train_test_split
>>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
((17004, 81), (17004,), (4259, 81), (4259,))
Output the underlying Pandas DataFrame::
>>> df = dataset.to_pandas()
>>> type(df)
<class 'pandas.core.frame.DataFrame'>