doubt.datasets.parkinsons

Parkinsons data set.

This data set is from the UCI data set archive, with the description being the original description verbatim. Some feature names may have been altered, based on the description.

  1"""Parkinsons data set.
  2
  3This data set is from the UCI data set archive, with the description being the original
  4description verbatim. Some feature names may have been altered, based on the
  5description.
  6"""
  7
  8import io
  9
 10import pandas as pd
 11
 12from .dataset import BASE_DATASET_DESCRIPTION, BaseDataset
 13
 14
 15class Parkinsons(BaseDataset):
 16    __doc__ = f"""
 17    This dataset is composed of a range of biomedical voice measurements from 42 people
 18    with early-stage Parkinson's disease recruited to a six-month trial of a
 19    telemonitoring device for remote symptom progression monitoring. The recordings
 20    were automatically captured in the patient's homes.
 21
 22    Columns in the table contain subject number, subject age, subject gender, time
 23    interval from baseline recruitment date, motor UPDRS, total UPDRS, and 16
 24    biomedical voice measures. Each row corresponds to one of 5,875 voice recording
 25    from these individuals. The main aim of the data is to predict the motor and total
 26    UPDRS scores ('motor_UPDRS' and 'total_UPDRS') from the 16 voice measures.
 27
 28    {BASE_DATASET_DESCRIPTION}
 29
 30    Features:
 31        subject# (int):
 32            Integer that uniquely identifies each subject
 33        age (int):
 34            Subject age
 35        sex (int):
 36            Binary feature. Subject sex, with 0 being male and 1 female
 37        test_time (float):
 38            Time since recruitment into the trial. The integer part is the
 39            number of days since recruitment
 40        Jitter(%) (float):
 41            Measure of variation in fundamental frequency
 42        Jitter(Abs) (float):
 43            Measure of variation in fundamental frequency
 44        Jitter:RAP (float):
 45            Measure of variation in fundamental frequency
 46        Jitter:PPQ5 (float):
 47            Measure of variation in fundamental frequency
 48        Jitter:DDP (float):
 49            Measure of variation in fundamental frequency
 50        Shimmer (float):
 51            Measure of variation in amplitude
 52        Shimmer(dB) (float):
 53            Measure of variation in amplitude
 54        Shimmer:APQ3 (float):
 55            Measure of variation in amplitude
 56        Shimmer:APQ5 (float):
 57            Measure of variation in amplitude
 58        Shimmer:APQ11 (float):
 59            Measure of variation in amplitude
 60        Shimmer:DDA (float):
 61            Measure of variation in amplitude
 62        NHR (float):
 63            Measure of ratio of noise to tonal components in the voice
 64        HNR (float):
 65            Measure of ratio of noise to tonal components in the voice
 66        RPDE (float):
 67            A nonlinear dynamical complexity measure
 68        DFA (float):
 69            Signal fractal scaling exponent
 70        PPE (float):
 71            A nonlinear measure of fundamental frequency variation
 72
 73    Targets:
 74        motor_UPDRS (float):
 75            Clinician's motor UPDRS score, linearly interpolated
 76        total_UPDRS (float):
 77            Clinician's total UPDRS score, linearly interpolated
 78
 79    Source:
 80        https://archive.ics.uci.edu/ml/datasets/Parkinsons+Telemonitoring
 81
 82    Examples:
 83        Load in the data set::
 84
 85            >>> dataset = Parkinsons()
 86            >>> dataset.shape
 87            (5875, 22)
 88
 89        Split the data set into features and targets, as NumPy arrays::
 90
 91            >>> X, y = dataset.split()
 92            >>> X.shape, y.shape
 93            ((5875, 20), (5875, 2))
 94
 95        Perform a train/test split, also outputting NumPy arrays::
 96
 97            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
 98            >>> X_train, X_test, y_train, y_test = train_test_split
 99            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
100            ((4659, 20), (4659, 2), (1216, 20), (1216, 2))
101
102        Output the underlying Pandas DataFrame::
103
104            >>> df = dataset.to_pandas()
105            >>> type(df)
106            <class 'pandas.core.frame.DataFrame'>
107    """
108
109    _url = (
110        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
111        "parkinsons/telemonitoring/parkinsons_updrs.data"
112    )
113
114    _features = range(20)
115    _targets = [20, 21]
116
117    def _prep_data(self, data: bytes) -> pd.DataFrame:
118        """Prepare the data set.
119
120        Args:
121            data (bytes): The raw data
122
123        Returns:
124            Pandas dataframe: The prepared data
125        """
126        # Convert the bytes into a file-like object
127        csv_file = io.BytesIO(data)
128
129        # Load in dataframe
130        df = pd.read_csv(csv_file, header=0)
131
132        # Put target columns at the end
133        cols = [col for col in df.columns if col[-5:] != "UPDRS"]
134        df = df[cols + ["motor_UPDRS", "total_UPDRS"]]
135
136        return df
class Parkinsons(doubt.datasets.dataset.BaseDataset):
 16class Parkinsons(BaseDataset):
 17    __doc__ = f"""
 18    This dataset is composed of a range of biomedical voice measurements from 42 people
 19    with early-stage Parkinson's disease recruited to a six-month trial of a
 20    telemonitoring device for remote symptom progression monitoring. The recordings
 21    were automatically captured in the patient's homes.
 22
 23    Columns in the table contain subject number, subject age, subject gender, time
 24    interval from baseline recruitment date, motor UPDRS, total UPDRS, and 16
 25    biomedical voice measures. Each row corresponds to one of 5,875 voice recording
 26    from these individuals. The main aim of the data is to predict the motor and total
 27    UPDRS scores ('motor_UPDRS' and 'total_UPDRS') from the 16 voice measures.
 28
 29    {BASE_DATASET_DESCRIPTION}
 30
 31    Features:
 32        subject# (int):
 33            Integer that uniquely identifies each subject
 34        age (int):
 35            Subject age
 36        sex (int):
 37            Binary feature. Subject sex, with 0 being male and 1 female
 38        test_time (float):
 39            Time since recruitment into the trial. The integer part is the
 40            number of days since recruitment
 41        Jitter(%) (float):
 42            Measure of variation in fundamental frequency
 43        Jitter(Abs) (float):
 44            Measure of variation in fundamental frequency
 45        Jitter:RAP (float):
 46            Measure of variation in fundamental frequency
 47        Jitter:PPQ5 (float):
 48            Measure of variation in fundamental frequency
 49        Jitter:DDP (float):
 50            Measure of variation in fundamental frequency
 51        Shimmer (float):
 52            Measure of variation in amplitude
 53        Shimmer(dB) (float):
 54            Measure of variation in amplitude
 55        Shimmer:APQ3 (float):
 56            Measure of variation in amplitude
 57        Shimmer:APQ5 (float):
 58            Measure of variation in amplitude
 59        Shimmer:APQ11 (float):
 60            Measure of variation in amplitude
 61        Shimmer:DDA (float):
 62            Measure of variation in amplitude
 63        NHR (float):
 64            Measure of ratio of noise to tonal components in the voice
 65        HNR (float):
 66            Measure of ratio of noise to tonal components in the voice
 67        RPDE (float):
 68            A nonlinear dynamical complexity measure
 69        DFA (float):
 70            Signal fractal scaling exponent
 71        PPE (float):
 72            A nonlinear measure of fundamental frequency variation
 73
 74    Targets:
 75        motor_UPDRS (float):
 76            Clinician's motor UPDRS score, linearly interpolated
 77        total_UPDRS (float):
 78            Clinician's total UPDRS score, linearly interpolated
 79
 80    Source:
 81        https://archive.ics.uci.edu/ml/datasets/Parkinsons+Telemonitoring
 82
 83    Examples:
 84        Load in the data set::
 85
 86            >>> dataset = Parkinsons()
 87            >>> dataset.shape
 88            (5875, 22)
 89
 90        Split the data set into features and targets, as NumPy arrays::
 91
 92            >>> X, y = dataset.split()
 93            >>> X.shape, y.shape
 94            ((5875, 20), (5875, 2))
 95
 96        Perform a train/test split, also outputting NumPy arrays::
 97
 98            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
 99            >>> X_train, X_test, y_train, y_test = train_test_split
100            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
101            ((4659, 20), (4659, 2), (1216, 20), (1216, 2))
102
103        Output the underlying Pandas DataFrame::
104
105            >>> df = dataset.to_pandas()
106            >>> type(df)
107            <class 'pandas.core.frame.DataFrame'>
108    """
109
110    _url = (
111        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
112        "parkinsons/telemonitoring/parkinsons_updrs.data"
113    )
114
115    _features = range(20)
116    _targets = [20, 21]
117
118    def _prep_data(self, data: bytes) -> pd.DataFrame:
119        """Prepare the data set.
120
121        Args:
122            data (bytes): The raw data
123
124        Returns:
125            Pandas dataframe: The prepared data
126        """
127        # Convert the bytes into a file-like object
128        csv_file = io.BytesIO(data)
129
130        # Load in dataframe
131        df = pd.read_csv(csv_file, header=0)
132
133        # Put target columns at the end
134        cols = [col for col in df.columns if col[-5:] != "UPDRS"]
135        df = df[cols + ["motor_UPDRS", "total_UPDRS"]]
136
137        return df

This dataset is composed of a range of biomedical voice measurements from 42 people with early-stage Parkinson's disease recruited to a six-month trial of a telemonitoring device for remote symptom progression monitoring. The recordings were automatically captured in the patient's homes.

Columns in the table contain subject number, subject age, subject gender, time interval from baseline recruitment date, motor UPDRS, total UPDRS, and 16 biomedical voice measures. Each row corresponds to one of 5,875 voice recording from these individuals. The main aim of the data is to predict the motor and total UPDRS scores ('motor_UPDRS' and 'total_UPDRS') from the 16 voice measures.

Arguments:
  • cache (str or None, optional): The name of the cache. It will be saved to cache in the current working directory. If None then no cache will be saved. Defaults to '.dataset_cache'.
Attributes:
  • cache (str or None): The name of the cache.
  • shape (tuple of integers): Dimensions of the data set
  • columns (list of strings): List of column names in the data set
Features:

  • subject# (int): Integer that uniquely identifies each subject
  • age (int): Subject age
  • sex (int): Binary feature. Subject sex, with 0 being male and 1 female
  • test_time (float): Time since recruitment into the trial. The integer part is the number of days since recruitment
  • Jitter(%) (float): Measure of variation in fundamental frequency
  • Jitter(Abs) (float): Measure of variation in fundamental frequency
  • Jitter:RAP (float): Measure of variation in fundamental frequency
  • Jitter:PPQ5 (float): Measure of variation in fundamental frequency
  • Jitter:DDP (float): Measure of variation in fundamental frequency
  • Shimmer (float): Measure of variation in amplitude
  • Shimmer(dB) (float): Measure of variation in amplitude
  • Shimmer:APQ3 (float): Measure of variation in amplitude
  • Shimmer:APQ5 (float): Measure of variation in amplitude
  • Shimmer:APQ11 (float): Measure of variation in amplitude
  • Shimmer:DDA (float): Measure of variation in amplitude
  • NHR (float): Measure of ratio of noise to tonal components in the voice
  • HNR (float): Measure of ratio of noise to tonal components in the voice
  • RPDE (float): A nonlinear dynamical complexity measure
  • DFA (float): Signal fractal scaling exponent
  • PPE (float): A nonlinear measure of fundamental frequency variation

Targets:

  • motor_UPDRS (float): Clinician's motor UPDRS score, linearly interpolated
  • total_UPDRS (float): Clinician's total UPDRS score, linearly interpolated

Source:

https://archive.ics.uci.edu/ml/datasets/Parkinsons+Telemonitoring

Examples:

Load in the data set::

>>> dataset = Parkinsons()
>>> dataset.shape
(5875, 22)

Split the data set into features and targets, as NumPy arrays::

>>> X, y = dataset.split()
>>> X.shape, y.shape
((5875, 20), (5875, 2))

Perform a train/test split, also outputting NumPy arrays::

>>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
>>> X_train, X_test, y_train, y_test = train_test_split
>>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
((4659, 20), (4659, 2), (1216, 20), (1216, 2))

Output the underlying Pandas DataFrame::

>>> df = dataset.to_pandas()
>>> type(df)
<class 'pandas.core.frame.DataFrame'>