doubt.datasets.tehran_housing

Tehran housing data set.

This data set is from the UCI data set archive, with the description being the original description verbatim. Some feature names may have been altered, based on the description.

  1"""Tehran housing data set.
  2
  3This data set is from the UCI data set archive, with the description being the original
  4description verbatim. Some feature names may have been altered, based on the
  5description.
  6"""
  7
  8import io
  9
 10import pandas as pd
 11
 12from .dataset import BASE_DATASET_DESCRIPTION, BaseDataset
 13
 14
 15class TehranHousing(BaseDataset):
 16    __doc__ = f"""
 17    Data set includes construction cost, sale prices, project variables, and economic
 18    variables corresponding to real estate single-family residential apartments in
 19    Tehran, Iran.
 20
 21    {BASE_DATASET_DESCRIPTION}
 22
 23    Features:
 24        start_year (int):
 25            Start year in the Persian calendar
 26        start_quarter (int)
 27            Start quarter in the Persian calendar
 28        completion_year (int)
 29            Completion year in the Persian calendar
 30        completion_quarter (int)
 31            Completion quarter in the Persian calendar
 32        V-1..V-8 (floats):
 33            Project physical and financial variables
 34        V-11-1..29-1 (floats):
 35            Economic variables and indices in time, lag 1
 36        V-11-2..29-2 (floats):
 37            Economic variables and indices in time, lag 2
 38        V-11-3..29-3 (floats):
 39            Economic variables and indices in time, lag 3
 40        V-11-4..29-4 (floats):
 41            Economic variables and indices in time, lag 4
 42        V-11-5..29-5 (floats):
 43            Economic variables and indices in time, lag 5
 44
 45    Targets:
 46        construction_cost (float)
 47        sale_price (float)
 48
 49    Source:
 50        https://archive.ics.uci.edu/ml/datasets/Residential+Building+Data+Set
 51
 52    Examples:
 53        Load in the data set::
 54
 55            >>> dataset = TehranHousing()
 56            >>> dataset.shape
 57            (371, 109)
 58
 59        Split the data set into features and targets, as NumPy arrays::
 60
 61            >>> X, y = dataset.split()
 62            >>> X.shape, y.shape
 63            ((371, 107), (371, 2))
 64
 65        Perform a train/test split, also outputting NumPy arrays::
 66
 67            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
 68            >>> X_train, X_test, y_train, y_test = train_test_split
 69            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
 70            ((288, 107), (288, 2), (83, 107), (83, 2))
 71
 72        Output the underlying Pandas DataFrame::
 73
 74            >>> df = dataset.to_pandas()
 75            >>> type(df)
 76            <class 'pandas.core.frame.DataFrame'>
 77    """
 78
 79    _url = (
 80        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
 81        "00437/Residential-Building-Data-Set.xlsx"
 82    )
 83
 84    _features = range(107)
 85    _targets = [107, 108]
 86
 87    def _prep_data(self, data: bytes) -> pd.DataFrame:
 88        """Prepare the data set.
 89
 90        Args:
 91            data (bytes): The raw data
 92
 93        Returns:
 94            Pandas dataframe: The prepared data
 95        """
 96        # Convert the bytes into a file-like object
 97        xlsx_file = io.BytesIO(data)
 98
 99        # Load it into dataframe
100        cols = (
101            ["start_year", "start_quarter", "completion_year", "completion_quarter"]
102            + [f"V-{i}" for i in range(1, 9)]
103            + [f"V-{i}-{j}" for j in range(1, 6) for i in range(11, 30)]
104            + ["construction_cost", "sale_price"]
105        )
106        df = pd.read_excel(xlsx_file, skiprows=[0, 1], names=cols)
107        return df
class TehranHousing(doubt.datasets.dataset.BaseDataset):
 16class TehranHousing(BaseDataset):
 17    __doc__ = f"""
 18    Data set includes construction cost, sale prices, project variables, and economic
 19    variables corresponding to real estate single-family residential apartments in
 20    Tehran, Iran.
 21
 22    {BASE_DATASET_DESCRIPTION}
 23
 24    Features:
 25        start_year (int):
 26            Start year in the Persian calendar
 27        start_quarter (int)
 28            Start quarter in the Persian calendar
 29        completion_year (int)
 30            Completion year in the Persian calendar
 31        completion_quarter (int)
 32            Completion quarter in the Persian calendar
 33        V-1..V-8 (floats):
 34            Project physical and financial variables
 35        V-11-1..29-1 (floats):
 36            Economic variables and indices in time, lag 1
 37        V-11-2..29-2 (floats):
 38            Economic variables and indices in time, lag 2
 39        V-11-3..29-3 (floats):
 40            Economic variables and indices in time, lag 3
 41        V-11-4..29-4 (floats):
 42            Economic variables and indices in time, lag 4
 43        V-11-5..29-5 (floats):
 44            Economic variables and indices in time, lag 5
 45
 46    Targets:
 47        construction_cost (float)
 48        sale_price (float)
 49
 50    Source:
 51        https://archive.ics.uci.edu/ml/datasets/Residential+Building+Data+Set
 52
 53    Examples:
 54        Load in the data set::
 55
 56            >>> dataset = TehranHousing()
 57            >>> dataset.shape
 58            (371, 109)
 59
 60        Split the data set into features and targets, as NumPy arrays::
 61
 62            >>> X, y = dataset.split()
 63            >>> X.shape, y.shape
 64            ((371, 107), (371, 2))
 65
 66        Perform a train/test split, also outputting NumPy arrays::
 67
 68            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
 69            >>> X_train, X_test, y_train, y_test = train_test_split
 70            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
 71            ((288, 107), (288, 2), (83, 107), (83, 2))
 72
 73        Output the underlying Pandas DataFrame::
 74
 75            >>> df = dataset.to_pandas()
 76            >>> type(df)
 77            <class 'pandas.core.frame.DataFrame'>
 78    """
 79
 80    _url = (
 81        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
 82        "00437/Residential-Building-Data-Set.xlsx"
 83    )
 84
 85    _features = range(107)
 86    _targets = [107, 108]
 87
 88    def _prep_data(self, data: bytes) -> pd.DataFrame:
 89        """Prepare the data set.
 90
 91        Args:
 92            data (bytes): The raw data
 93
 94        Returns:
 95            Pandas dataframe: The prepared data
 96        """
 97        # Convert the bytes into a file-like object
 98        xlsx_file = io.BytesIO(data)
 99
100        # Load it into dataframe
101        cols = (
102            ["start_year", "start_quarter", "completion_year", "completion_quarter"]
103            + [f"V-{i}" for i in range(1, 9)]
104            + [f"V-{i}-{j}" for j in range(1, 6) for i in range(11, 30)]
105            + ["construction_cost", "sale_price"]
106        )
107        df = pd.read_excel(xlsx_file, skiprows=[0, 1], names=cols)
108        return df

Data set includes construction cost, sale prices, project variables, and economic variables corresponding to real estate single-family residential apartments in Tehran, Iran.

Arguments:
  • cache (str or None, optional): The name of the cache. It will be saved to cache in the current working directory. If None then no cache will be saved. Defaults to '.dataset_cache'.
Attributes:
  • cache (str or None): The name of the cache.
  • shape (tuple of integers): Dimensions of the data set
  • columns (list of strings): List of column names in the data set
Features:

  • start_year (int): Start year in the Persian calendar
  • start_quarter (int): Start quarter in the Persian calendar
  • completion_year (int): Completion year in the Persian calendar
  • completion_quarter (int): Completion quarter in the Persian calendar
  • V-1..V-8 (floats): Project physical and financial variables
  • V-11-1..V-29-1 (floats): Economic variables and indices in time, lag 1
  • V-11-2..V-29-2 (floats): Economic variables and indices in time, lag 2
  • V-11-3..V-29-3 (floats): Economic variables and indices in time, lag 3
  • V-11-4..V-29-4 (floats): Economic variables and indices in time, lag 4
  • V-11-5..V-29-5 (floats): Economic variables and indices in time, lag 5

Targets:

  • construction_cost (float)
  • sale_price (float)

Source:

https://archive.ics.uci.edu/ml/datasets/Residential+Building+Data+Set

Examples:

Load in the data set::

>>> dataset = TehranHousing()
>>> dataset.shape
(371, 109)

Split the data set into features and targets, as NumPy arrays::

>>> X, y = dataset.split()
>>> X.shape, y.shape
((371, 107), (371, 2))

Perform a train/test split, also outputting NumPy arrays::

>>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
>>> X_train, X_test, y_train, y_test = train_test_split
>>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
((288, 107), (288, 2), (83, 107), (83, 2))

Output the underlying Pandas DataFrame::

>>> df = dataset.to_pandas()
>>> type(df)
<class 'pandas.core.frame.DataFrame'>