doubt.datasets.dataset
Base class for data sets
1"""Base class for data sets""" 2 3import re 4import warnings 5from abc import ABC, abstractmethod 6from typing import Iterable, Optional, Tuple, Union 7 8import numpy as np 9import pandas as pd 10import requests 11 12BASE_DATASET_DESCRIPTION = """ 13 Parameters: 14 cache (str or None, optional): 15 The name of the cache. It will be saved to `cache` in the current working 16 directory. If None then no cache will be saved. Defaults to 17 '.dataset_cache'. 18 19 Attributes: 20 cache (str or None): 21 The name of the cache. 22 shape (tuple of integers): 23 Dimensions of the data set 24 columns (list of strings): 25 List of column names in the data set 26""" 27 28 29class BaseDataset(ABC): 30 _url: str 31 _features: Iterable 32 _targets: Iterable 33 34 def __init__(self, cache: Optional[str] = ".dataset_cache"): 35 self.cache = cache 36 self._data = self.get_data() 37 self.shape = self._data.shape 38 self.columns = self._data.columns 39 40 @abstractmethod 41 def _prep_data(self, data: bytes) -> pd.DataFrame: 42 return 43 44 def get_data(self) -> pd.DataFrame: 45 """Download and prepare the dataset. 46 47 Returns: 48 Pandas DataFrame: The dataset. 
49 """ 50 51 # Get name of dataset, being the class name converted to snake case 52 name = re.sub(r"([A-Z])", r"_\1", type(self).__name__) 53 name = name.lower().strip("_") 54 55 try: 56 if self.cache is not None: 57 data = pd.read_hdf(self.cache, name) 58 except (FileNotFoundError, KeyError): 59 with warnings.catch_warnings(): 60 warnings.simplefilter("ignore") 61 response = requests.get(self._url, verify=False) 62 data = self._prep_data(response.content) 63 if self.cache is not None: 64 data.to_hdf(self.cache, name) 65 return data 66 67 def to_pandas(self) -> pd.DataFrame: 68 return self._data 69 70 def __len__(self) -> int: 71 return len(self._data) 72 73 def head(self, n: int = 5) -> pd.DataFrame: 74 return self._data.head(n) 75 76 def close(self): 77 del self._data 78 del self 79 80 def __exit__(self, exc_type: str, exc_value: str, exc_traceback: str): 81 self.close() 82 83 def __str__(self) -> str: 84 return str(self._data) 85 86 def __repr__(self) -> str: 87 return repr(self._data) 88 89 def _repr_html_(self): 90 return self._data._repr_html_() 91 92 def split( 93 self, test_size: Optional[float] = None, random_seed: Optional[float] = None 94 ) -> Union[ 95 Tuple[np.ndarray, np.ndarray], 96 Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray], 97 ]: 98 """Split dataset into features and targets and optionally train/test. 99 100 Args: 101 test_size (float or None): 102 The fraction of the dataset that will constitute the test set. If None 103 then no train/test split will happen. Defaults to None. 104 random_seed (float or None): 105 The random seed used for the train/test split. If None then a random 106 number will be chosen. Defaults to None. 107 108 Returns: 109 If `test_size` is not `None` then a tuple of numpy arrays 110 (X_train, y_train, X_test, y_test) is returned, and otherwise 111 the tuple (X, y) of numpy arrays is returned. 
112 """ 113 # Initialise random number generator 114 rng = np.random.default_rng(random_seed) 115 116 nrows = len(self._data) 117 features = self._features 118 targets = self._targets 119 120 if test_size is not None: 121 test_idxs = rng.random(size=(nrows,)) < test_size 122 train_idxs = ~test_idxs 123 124 X_train = self._data.iloc[train_idxs, features].values 125 y_train = self._data.iloc[train_idxs, targets].values.squeeze() 126 X_test = self._data.iloc[test_idxs, features].values 127 y_test = self._data.iloc[test_idxs, targets].values.squeeze() 128 129 return X_train, X_test, y_train, y_test 130 131 else: 132 X = self._data.iloc[:, features].values 133 y = self._data.iloc[:, targets].values.squeeze() 134 return X, y
BASE_DATASET_DESCRIPTION =
"\n Parameters:\n cache (str or None, optional):\n The name of the cache. It will be saved to `cache` in the current working\n directory. If None then no cache will be saved. Defaults to\n '.dataset_cache'.\n\n Attributes:\n cache (str or None):\n The name of the cache.\n shape (tuple of integers):\n Dimensions of the data set\n columns (list of strings):\n List of column names in the data set\n"
class
BaseDataset(abc.ABC):
class BaseDataset(ABC):
    """Abstract base class for downloadable, cacheable data sets.

    Subclasses must set `_url`, `_features` and `_targets` and implement
    `_prep_data`.
    """

    # URL the raw dataset is downloaded from
    _url: str
    # Positional (iloc) indices of the feature columns
    _features: Iterable
    # Positional (iloc) indices of the target columns
    _targets: Iterable

    def __init__(self, cache: Optional[str] = ".dataset_cache"):
        self.cache = cache
        self._data = self.get_data()
        self.shape = self._data.shape
        self.columns = self._data.columns

    @abstractmethod
    def _prep_data(self, data: bytes) -> pd.DataFrame:
        """Prepare the raw downloaded bytes as a dataframe."""

    def get_data(self) -> pd.DataFrame:
        """Download and prepare the dataset.

        The dataset is read from the cache when possible; on a cache miss it
        is downloaded from `_url`, prepared with `_prep_data` and written to
        the cache.

        Returns:
            Pandas DataFrame: The dataset.
        """
        # Name of the dataset, being the class name converted to snake case
        name = re.sub(r"([A-Z])", r"_\1", type(self).__name__)
        name = name.lower().strip("_")

        # Try the cache first and fall through to a download on a miss.
        # (Previously, when `cache` was None, `data` was never assigned and
        # `return data` raised a NameError instead of downloading.)
        if self.cache is not None:
            try:
                return pd.read_hdf(self.cache, name)
            except (FileNotFoundError, KeyError):
                pass

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # SECURITY NOTE(review): TLS certificate verification is disabled
            # (verify=False), exposing the download to man-in-the-middle
            # attacks; kept for compatibility with hosts serving broken
            # certificates.
            response = requests.get(self._url, verify=False)
        data = self._prep_data(response.content)
        if self.cache is not None:
            data.to_hdf(self.cache, name)
        return data

    def to_pandas(self) -> pd.DataFrame:
        """Return the underlying dataframe."""
        return self._data

    def __len__(self) -> int:
        return len(self._data)

    def head(self, n: int = 5) -> pd.DataFrame:
        """Return the first `n` rows of the dataset."""
        return self._data.head(n)

    def close(self):
        """Release the reference to the underlying dataframe."""
        del self._data

    def __enter__(self):
        # Enables `with SomeDataset(...) as dataset: ...`. Previously only
        # `__exit__` was defined, so the context-manager protocol was unusable.
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.close()

    def __str__(self) -> str:
        return str(self._data)

    def __repr__(self) -> str:
        return repr(self._data)

    def _repr_html_(self):
        return self._data._repr_html_()

    def split(
        self, test_size: Optional[float] = None, random_seed: Optional[float] = None
    ) -> Union[
        Tuple[np.ndarray, np.ndarray],
        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray],
    ]:
        """Split dataset into features and targets and optionally train/test.

        Args:
            test_size (float or None):
                The fraction of the dataset that will constitute the test set.
                If None then no train/test split will happen. Defaults to None.
            random_seed (float or None):
                The random seed used for the train/test split. If None then a
                random number will be chosen. Defaults to None.

        Returns:
            If `test_size` is not `None` then a tuple of numpy arrays
            (X_train, X_test, y_train, y_test) is returned, and otherwise
            the tuple (X, y) of numpy arrays is returned.
        """
        # Initialise random number generator
        rng = np.random.default_rng(random_seed)

        nrows = len(self._data)
        features = self._features
        targets = self._targets

        if test_size is not None:
            # Each row lands in the test set independently with probability
            # `test_size`, so the realised fraction is only approximate.
            test_idxs = rng.random(size=(nrows,)) < test_size
            train_idxs = ~test_idxs

            X_train = self._data.iloc[train_idxs, features].values
            y_train = self._data.iloc[train_idxs, targets].values.squeeze()
            X_test = self._data.iloc[test_idxs, features].values
            y_test = self._data.iloc[test_idxs, targets].values.squeeze()

            return X_train, X_test, y_train, y_test

        X = self._data.iloc[:, features].values
        y = self._data.iloc[:, targets].values.squeeze()
        return X, y
Abstract base class for data sets: handles downloading, caching, and splitting the data into features and targets.
def
get_data(self) -> pandas.core.frame.DataFrame:
45 def get_data(self) -> pd.DataFrame: 46 """Download and prepare the dataset. 47 48 Returns: 49 Pandas DataFrame: The dataset. 50 """ 51 52 # Get name of dataset, being the class name converted to snake case 53 name = re.sub(r"([A-Z])", r"_\1", type(self).__name__) 54 name = name.lower().strip("_") 55 56 try: 57 if self.cache is not None: 58 data = pd.read_hdf(self.cache, name) 59 except (FileNotFoundError, KeyError): 60 with warnings.catch_warnings(): 61 warnings.simplefilter("ignore") 62 response = requests.get(self._url, verify=False) 63 data = self._prep_data(response.content) 64 if self.cache is not None: 65 data.to_hdf(self.cache, name) 66 return data
Download and prepare the dataset.
Returns:
Pandas DataFrame: The dataset.
def
split( self, test_size: Optional[float] = None, random_seed: Optional[float] = None) -> Union[Tuple[numpy.ndarray, numpy.ndarray], Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]]:
93 def split( 94 self, test_size: Optional[float] = None, random_seed: Optional[float] = None 95 ) -> Union[ 96 Tuple[np.ndarray, np.ndarray], 97 Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray], 98 ]: 99 """Split dataset into features and targets and optionally train/test. 100 101 Args: 102 test_size (float or None): 103 The fraction of the dataset that will constitute the test set. If None 104 then no train/test split will happen. Defaults to None. 105 random_seed (float or None): 106 The random seed used for the train/test split. If None then a random 107 number will be chosen. Defaults to None. 108 109 Returns: 110 If `test_size` is not `None` then a tuple of numpy arrays 111 (X_train, y_train, X_test, y_test) is returned, and otherwise 112 the tuple (X, y) of numpy arrays is returned. 113 """ 114 # Initialise random number generator 115 rng = np.random.default_rng(random_seed) 116 117 nrows = len(self._data) 118 features = self._features 119 targets = self._targets 120 121 if test_size is not None: 122 test_idxs = rng.random(size=(nrows,)) < test_size 123 train_idxs = ~test_idxs 124 125 X_train = self._data.iloc[train_idxs, features].values 126 y_train = self._data.iloc[train_idxs, targets].values.squeeze() 127 X_test = self._data.iloc[test_idxs, features].values 128 y_test = self._data.iloc[test_idxs, targets].values.squeeze() 129 130 return X_train, X_test, y_train, y_test 131 132 else: 133 X = self._data.iloc[:, features].values 134 y = self._data.iloc[:, targets].values.squeeze() 135 return X, y
Split dataset into features and targets and optionally train/test.
Arguments:
- test_size (float or None): The fraction of the dataset that will constitute the test set. If None then no train/test split will happen. Defaults to None.
- random_seed (float or None): The random seed used for the train/test split. If None then a random number will be chosen. Defaults to None.
Returns:
If `test_size` is not `None` then a tuple of numpy arrays (X_train, X_test, y_train, y_test) is returned, and otherwise the tuple (X, y) of numpy arrays is returned.