doubt.datasets.dataset
Base class for data sets
1"""Base class for data sets""" 2 3import re 4import warnings 5from abc import ABC, abstractmethod 6from typing import Iterable, Optional, Tuple, Union 7 8import numpy as np 9import pandas as pd 10import requests 11 12BASE_DATASET_DESCRIPTION = """ 13 Parameters: 14 cache (str or None, optional): 15 The name of the cache. It will be saved to `cache` in the current working 16 directory. If None then no cache will be saved. Defaults to 17 '.dataset_cache'. 18 19 Attributes: 20 cache (str or None): 21 The name of the cache. 22 shape (tuple of integers): 23 Dimensions of the data set 24 columns (list of strings): 25 List of column names in the data set 26""" 27 28 29class BaseDataset(ABC): 30 _url: str 31 _features: Iterable 32 _targets: Iterable 33 34 def __init__(self, cache: Optional[str] = ".dataset_cache"): 35 self.cache = cache 36 self._data = self.get_data() 37 self.shape = self._data.shape 38 self.columns = self._data.columns 39 40 @abstractmethod 41 def _prep_data(self, data: bytes) -> pd.DataFrame: 42 return 43 44 def get_data(self) -> pd.DataFrame: 45 """Download and prepare the dataset. 46 47 Returns: 48 Pandas DataFrame: The dataset. 
49 """ 50 51 # Get name of dataset, being the class name converted to snake case 52 name = re.sub(r"([A-Z])", r"_\1", type(self).__name__) 53 name = name.lower().strip("_") 54 55 try: 56 if self.cache is not None: 57 data = pd.read_hdf(self.cache, name) 58 except (FileNotFoundError, KeyError): 59 with warnings.catch_warnings(): 60 warnings.simplefilter("ignore") 61 response = requests.get(self._url, verify=False) 62 data = self._prep_data(response.content) 63 if self.cache is not None: 64 data.to_hdf(self.cache, name) 65 return data 66 67 def to_pandas(self) -> pd.DataFrame: 68 return self._data 69 70 def __len__(self) -> int: 71 return len(self._data) 72 73 def head(self, n: int = 5) -> pd.DataFrame: 74 return self._data.head(n) 75 76 def close(self): 77 del self._data 78 del self 79 80 def __exit__(self, exc_type: str, exc_value: str, exc_traceback: str): 81 self.close() 82 83 def __str__(self) -> str: 84 return str(self._data) 85 86 def __repr__(self) -> str: 87 return repr(self._data) 88 89 def _repr_html_(self): 90 return self._data._repr_html_() 91 92 def split( 93 self, test_size: Optional[float] = None, random_seed: Optional[float] = None 94 ) -> Union[ 95 Tuple[np.ndarray, np.ndarray], 96 Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray], 97 ]: 98 """Split dataset into features and targets and optionally train/test. 99 100 Args: 101 test_size (float or None): 102 The fraction of the dataset that will constitute the test set. If None 103 then no train/test split will happen. Defaults to None. 104 random_seed (float or None): 105 The random seed used for the train/test split. If None then a random 106 number will be chosen. Defaults to None. 107 108 Returns: 109 If `test_size` is not `None` then a tuple of numpy arrays 110 (X_train, y_train, X_test, y_test) is returned, and otherwise 111 the tuple (X, y) of numpy arrays is returned. 
112 """ 113 # Initialise random number generator 114 rng = np.random.default_rng(random_seed) 115 116 nrows = len(self._data) 117 features = self._features 118 targets = self._targets 119 120 if test_size is not None: 121 test_idxs = rng.random(size=(nrows,)) < test_size 122 train_idxs = ~test_idxs 123 124 X_train = self._data.iloc[train_idxs, features].values 125 y_train = self._data.iloc[train_idxs, targets].values.squeeze() 126 X_test = self._data.iloc[test_idxs, features].values 127 y_test = self._data.iloc[test_idxs, targets].values.squeeze() 128 129 return X_train, X_test, y_train, y_test 130 131 else: 132 X = self._data.iloc[:, features].values 133 y = self._data.iloc[:, targets].values.squeeze() 134 return X, y
BASE_DATASET_DESCRIPTION =
"\n Parameters:\n cache (str or None, optional):\n The name of the cache. It will be saved to `cache` in the current working\n directory. If None then no cache will be saved. Defaults to\n '.dataset_cache'.\n\n Attributes:\n cache (str or None):\n The name of the cache.\n shape (tuple of integers):\n Dimensions of the data set\n columns (list of strings):\n List of column names in the data set\n"
class
BaseDataset(abc.ABC):
class BaseDataset(ABC):
    """Abstract base class for downloadable, cacheable data sets.

    Subclasses must set `_url`, `_features` and `_targets` and implement
    `_prep_data`.
    """

    # URL the raw dataset is downloaded from
    _url: str
    # Positional (iloc) indices of the feature columns
    _features: Iterable
    # Positional (iloc) indices of the target columns
    _targets: Iterable

    def __init__(self, cache: Optional[str] = ".dataset_cache"):
        self.cache = cache
        self._data = self.get_data()
        self.shape = self._data.shape
        self.columns = self._data.columns

    @abstractmethod
    def _prep_data(self, data: bytes) -> pd.DataFrame:
        """Prepare the raw downloaded bytes as a dataframe."""

    def get_data(self) -> pd.DataFrame:
        """Download and prepare the dataset.

        The dataset is read from the cache when possible; on a cache miss it
        is downloaded from `_url`, prepared with `_prep_data` and written to
        the cache.

        Returns:
            Pandas DataFrame: The dataset.
        """
        # Name of the dataset, being the class name converted to snake case
        name = re.sub(r"([A-Z])", r"_\1", type(self).__name__)
        name = name.lower().strip("_")

        # Try the cache first and fall through to a download on a miss.
        # (Previously, when `cache` was None, `data` was never assigned and
        # `return data` raised a NameError instead of downloading.)
        if self.cache is not None:
            try:
                return pd.read_hdf(self.cache, name)
            except (FileNotFoundError, KeyError):
                pass

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # SECURITY NOTE(review): TLS certificate verification is disabled
            # (verify=False), exposing the download to man-in-the-middle
            # attacks; kept for compatibility with hosts serving broken
            # certificates.
            response = requests.get(self._url, verify=False)
        data = self._prep_data(response.content)
        if self.cache is not None:
            data.to_hdf(self.cache, name)
        return data

    def to_pandas(self) -> pd.DataFrame:
        """Return the underlying dataframe."""
        return self._data

    def __len__(self) -> int:
        return len(self._data)

    def head(self, n: int = 5) -> pd.DataFrame:
        """Return the first `n` rows of the dataset."""
        return self._data.head(n)

    def close(self):
        """Release the reference to the underlying dataframe."""
        del self._data

    def __enter__(self):
        # Enables `with SomeDataset(...) as dataset: ...`. Previously only
        # `__exit__` was defined, so the context-manager protocol was unusable.
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.close()

    def __str__(self) -> str:
        return str(self._data)

    def __repr__(self) -> str:
        return repr(self._data)

    def _repr_html_(self):
        return self._data._repr_html_()

    def split(
        self, test_size: Optional[float] = None, random_seed: Optional[float] = None
    ) -> Union[
        Tuple[np.ndarray, np.ndarray],
        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray],
    ]:
        """Split dataset into features and targets and optionally train/test.

        Args:
            test_size (float or None):
                The fraction of the dataset that will constitute the test set.
                If None then no train/test split will happen. Defaults to None.
            random_seed (float or None):
                The random seed used for the train/test split. If None then a
                random number will be chosen. Defaults to None.

        Returns:
            If `test_size` is not `None` then a tuple of numpy arrays
            (X_train, X_test, y_train, y_test) is returned, and otherwise
            the tuple (X, y) of numpy arrays is returned.
        """
        # Initialise random number generator
        rng = np.random.default_rng(random_seed)

        nrows = len(self._data)
        features = self._features
        targets = self._targets

        if test_size is not None:
            # Each row lands in the test set independently with probability
            # `test_size`, so the realised fraction is only approximate.
            test_idxs = rng.random(size=(nrows,)) < test_size
            train_idxs = ~test_idxs

            X_train = self._data.iloc[train_idxs, features].values
            y_train = self._data.iloc[train_idxs, targets].values.squeeze()
            X_test = self._data.iloc[test_idxs, features].values
            y_test = self._data.iloc[test_idxs, targets].values.squeeze()

            return X_train, X_test, y_train, y_test

        X = self._data.iloc[:, features].values
        y = self._data.iloc[:, targets].values.squeeze()
        return X, y
Abstract base class for data sets: handles downloading, caching, and splitting the data into features and targets.
def
get_data(self) -> pandas.core.frame.DataFrame:
45 def get_data(self) -> pd.DataFrame: 46 """Download and prepare the dataset. 47 48 Returns: 49 Pandas DataFrame: The dataset. 50 """ 51 52 # Get name of dataset, being the class name converted to snake case 53 name = re.sub(r"([A-Z])", r"_\1", type(self).__name__) 54 name = name.lower().strip("_") 55 56 try: 57 if self.cache is not None: 58 data = pd.read_hdf(self.cache, name) 59 except (FileNotFoundError, KeyError): 60 with warnings.catch_warnings(): 61 warnings.simplefilter("ignore") 62 response = requests.get(self._url, verify=False) 63 data = self._prep_data(response.content) 64 if self.cache is not None: 65 data.to_hdf(self.cache, name) 66 return data
Download and prepare the dataset.
Returns:
Pandas DataFrame: The dataset.
def
split( self, test_size: Optional[float] = None, random_seed: Optional[float] = None) -> Union[Tuple[numpy.ndarray, numpy.ndarray], Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]]:
93 def split( 94 self, test_size: Optional[float] = None, random_seed: Optional[float] = None 95 ) -> Union[ 96 Tuple[np.ndarray, np.ndarray], 97 Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray], 98 ]: 99 """Split dataset into features and targets and optionally train/test. 100 101 Args: 102 test_size (float or None): 103 The fraction of the dataset that will constitute the test set. If None 104 then no train/test split will happen. Defaults to None. 105 random_seed (float or None): 106 The random seed used for the train/test split. If None then a random 107 number will be chosen. Defaults to None. 108 109 Returns: 110 If `test_size` is not `None` then a tuple of numpy arrays 111 (X_train, y_train, X_test, y_test) is returned, and otherwise 112 the tuple (X, y) of numpy arrays is returned. 113 """ 114 # Initialise random number generator 115 rng = np.random.default_rng(random_seed) 116 117 nrows = len(self._data) 118 features = self._features 119 targets = self._targets 120 121 if test_size is not None: 122 test_idxs = rng.random(size=(nrows,)) < test_size 123 train_idxs = ~test_idxs 124 125 X_train = self._data.iloc[train_idxs, features].values 126 y_train = self._data.iloc[train_idxs, targets].values.squeeze() 127 X_test = self._data.iloc[test_idxs, features].values 128 y_test = self._data.iloc[test_idxs, targets].values.squeeze() 129 130 return X_train, X_test, y_train, y_test 131 132 else: 133 X = self._data.iloc[:, features].values 134 y = self._data.iloc[:, targets].values.squeeze() 135 return X, y
Split dataset into features and targets and optionally train/test.
Arguments:
- test_size (float or None): The fraction of the dataset that will constitute the test set. If None then no train/test split will happen. Defaults to None.
- random_seed (float or None): The random seed used for the train/test split. If None then a random number will be chosen. Defaults to None.
Returns:
If `test_size` is not `None` then a tuple of numpy arrays (X_train, X_test, y_train, y_test) is returned, and otherwise the tuple (X, y) of numpy arrays is returned.