doubt.datasets.new_taipei_housing

New Taipei Housing data set.

This data set is from the UCI data set archive, with the description being the original description verbatim. Some feature names may have been altered, based on the description.

  1"""New Taipei Housing data set.
  2
  3This data set is from the UCI data set archive, with the description being the original
  4description verbatim. Some feature names may have been altered, based on the
  5description.
  6"""
  7
  8import io
  9
 10import pandas as pd
 11
 12from .dataset import BASE_DATASET_DESCRIPTION, BaseDataset
 13
 14
 15class NewTaipeiHousing(BaseDataset):
 16    __doc__ = f"""
 17    The "real estate valuation" is a regression problem. The market historical data set
 18    of real estate valuation are collected from Sindian Dist., New Taipei City, Taiwan.
 19
 20    {BASE_DATASET_DESCRIPTION}
 21
 22    Features:
 23        transaction_date (float):
 24            The transaction date encoded as a floating point value. For instance,
 25            2013.250 is March 2013 and 2013.500 is June March
 26        house_age (float):
 27            The age of the house
 28        mrt_distance (float):
 29            Distance to the nearest MRT station
 30        n_stores (int):
 31            Number of convenience stores
 32        lat (float):
 33            Latitude
 34        lng (float):
 35            Longitude
 36
 37    Targets:
 38        house_price (float):
 39            House price of unit area
 40
 41    Source:
 42        https://archive.ics.uci.edu/ml/datasets/Real+estate+valuation+data+set
 43
 44    Examples:
 45        Load in the data set::
 46
 47            >>> dataset = NewTaipeiHousing()
 48            >>> dataset.shape
 49            (414, 7)
 50
 51        Split the data set into features and targets, as NumPy arrays::
 52
 53            >>> X, y = dataset.split()
 54            >>> X.shape, y.shape
 55            ((414, 6), (414,))
 56
 57        Perform a train/test split, also outputting NumPy arrays::
 58
 59            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
 60            >>> X_train, X_test, y_train, y_test = train_test_split
 61            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
 62            ((323, 6), (323,), (91, 6), (91,))
 63
 64        Output the underlying Pandas DataFrame::
 65
 66            >>> df = dataset.to_pandas()
 67            >>> type(df)
 68            <class 'pandas.core.frame.DataFrame'>
 69    """
 70
 71    _url = (
 72        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
 73        "00477/Real%20estate%20valuation%20data%20set.xlsx"
 74    )
 75
 76    _features = range(6)
 77    _targets = [6]
 78
 79    def _prep_data(self, data: bytes) -> pd.DataFrame:
 80        """Prepare the data set.
 81
 82        Args:
 83            data (bytes): The raw data
 84
 85        Returns:
 86            Pandas dataframe: The prepared data
 87        """
 88        # Convert the bytes into a file-like object
 89        xlsx_file = io.BytesIO(data)
 90
 91        # Load in the dataframe
 92        cols = [
 93            "idx",
 94            "transaction_date",
 95            "house_age",
 96            "mrt_distance",
 97            "n_stores",
 98            "lat",
 99            "lng",
100            "house_price",
101        ]
102        df = pd.read_excel(xlsx_file, header=0, names=cols)
103
104        # Remove the index
105        df = df.iloc[:, 1:]
106
107        return df
class NewTaipeiHousing(doubt.datasets.dataset.BaseDataset):
 16class NewTaipeiHousing(BaseDataset):
 17    __doc__ = f"""
 18    The "real estate valuation" is a regression problem. The market historical data set
 19    of real estate valuation are collected from Sindian Dist., New Taipei City, Taiwan.
 20
 21    {BASE_DATASET_DESCRIPTION}
 22
 23    Features:
 24        transaction_date (float):
 25            The transaction date encoded as a floating point value. For instance,
 26            2013.250 is March 2013 and 2013.500 is June March
 27        house_age (float):
 28            The age of the house
 29        mrt_distance (float):
 30            Distance to the nearest MRT station
 31        n_stores (int):
 32            Number of convenience stores
 33        lat (float):
 34            Latitude
 35        lng (float):
 36            Longitude
 37
 38    Targets:
 39        house_price (float):
 40            House price of unit area
 41
 42    Source:
 43        https://archive.ics.uci.edu/ml/datasets/Real+estate+valuation+data+set
 44
 45    Examples:
 46        Load in the data set::
 47
 48            >>> dataset = NewTaipeiHousing()
 49            >>> dataset.shape
 50            (414, 7)
 51
 52        Split the data set into features and targets, as NumPy arrays::
 53
 54            >>> X, y = dataset.split()
 55            >>> X.shape, y.shape
 56            ((414, 6), (414,))
 57
 58        Perform a train/test split, also outputting NumPy arrays::
 59
 60            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
 61            >>> X_train, X_test, y_train, y_test = train_test_split
 62            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
 63            ((323, 6), (323,), (91, 6), (91,))
 64
 65        Output the underlying Pandas DataFrame::
 66
 67            >>> df = dataset.to_pandas()
 68            >>> type(df)
 69            <class 'pandas.core.frame.DataFrame'>
 70    """
 71
 72    _url = (
 73        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
 74        "00477/Real%20estate%20valuation%20data%20set.xlsx"
 75    )
 76
 77    _features = range(6)
 78    _targets = [6]
 79
 80    def _prep_data(self, data: bytes) -> pd.DataFrame:
 81        """Prepare the data set.
 82
 83        Args:
 84            data (bytes): The raw data
 85
 86        Returns:
 87            Pandas dataframe: The prepared data
 88        """
 89        # Convert the bytes into a file-like object
 90        xlsx_file = io.BytesIO(data)
 91
 92        # Load in the dataframe
 93        cols = [
 94            "idx",
 95            "transaction_date",
 96            "house_age",
 97            "mrt_distance",
 98            "n_stores",
 99            "lat",
100            "lng",
101            "house_price",
102        ]
103        df = pd.read_excel(xlsx_file, header=0, names=cols)
104
105        # Remove the index
106        df = df.iloc[:, 1:]
107
108        return df

The "real estate valuation" is a regression problem. The market historical data set of real estate valuation are collected from Sindian Dist., New Taipei City, Taiwan.

Arguments:
  • cache (str or None, optional): The name of the cache. It will be saved to cache in the current working directory. If None then no cache will be saved. Defaults to '.dataset_cache'.
Attributes:
  • cache (str or None): The name of the cache.
  • shape (tuple of integers): Dimensions of the data set
  • columns (list of strings): List of column names in the data set
Features:

  • transaction_date (float): The transaction date encoded as a floating point value. For instance, 2013.250 is March 2013 and 2013.500 is June 2013
  • house_age (float): The age of the house
  • mrt_distance (float): Distance to the nearest MRT station
  • n_stores (int): Number of convenience stores
  • lat (float): Latitude
  • lng (float): Longitude

Targets:

house_price (float): House price of unit area

Source:

https://archive.ics.uci.edu/ml/datasets/Real+estate+valuation+data+set

Examples:

Load in the data set::

>>> dataset = NewTaipeiHousing()
>>> dataset.shape
(414, 7)

Split the data set into features and targets, as NumPy arrays::

>>> X, y = dataset.split()
>>> X.shape, y.shape
((414, 6), (414,))

Perform a train/test split, also outputting NumPy arrays::

>>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
>>> X_train, X_test, y_train, y_test = train_test_split
>>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
((323, 6), (323,), (91, 6), (91,))

Output the underlying Pandas DataFrame::

>>> df = dataset.to_pandas()
>>> type(df)
<class 'pandas.core.frame.DataFrame'>