doubt.datasets.forest_fire

Forest fire data set.

This data set is from the UCI data set archive, with the description being the original description verbatim. Some feature names may have been altered, based on the description.

  1"""Forest fire data set.
  2
  3This data set is from the UCI data set archive, with the description being the original
  4description verbatim. Some feature names may have been altered, based on the
  5description.
  6"""
  7
  8import io
  9
 10import pandas as pd
 11
 12from .dataset import BASE_DATASET_DESCRIPTION, BaseDataset
 13
 14
 15class ForestFire(BaseDataset):
 16    __doc__ = f"""
 17    This is a difficult regression task, where the aim is to predict the burned area of
 18    forest fires, in the northeast region of Portugal, by using meteorological and
 19    other data.
 20
 21    {BASE_DATASET_DESCRIPTION}
 22
 23    Features:
 24        X (float):
 25            The x-axis spatial coordinate within the Montesinho park map. Ranges from 1
 26            to 9.
 27        Y (float):
 28            The y-axis spatial coordinate within the Montesinho park map Ranges from 2
 29            to 9.
 30        month (int):
 31            Month of the year. Ranges from 0 to 11
 32        day (int):
 33            Day of the week. Ranges from 0 to 6
 34        FFMC (float):
 35            FFMC index from the FWI system. Ranges from 18.7 to 96.20
 36        DMC (float):
 37            DMC index from the FWI system. Ranges from 1.1 to 291.3
 38        DC (float):
 39            DC index from the FWI system. Ranges from 7.9 to 860.6
 40        ISI (float):
 41            ISI index from the FWI system. Ranges from 0.0 to 56.1
 42        temp (float):
 43            Temperature in Celsius degrees. Ranges from 2.2 to 33.3
 44        RH (float):
 45            Relative humidity in %. Ranges from 15.0 to 100.0
 46        wind (float):
 47            Wind speed in km/h. Ranges from 0.4 to 9.4
 48        rain (float):
 49            Outside rain in mm/m2. Ranges from 0.0 to 6.4
 50
 51    Targets:
 52        area (float):
 53            The burned area of the forest (in ha). Ranges from 0.00 to 1090.84
 54
 55    Notes:
 56        The target variable is very skewed towards 0.0, thus it may make sense to model
 57        with the logarithm transform.
 58
 59    Source:
 60        https://archive.ics.uci.edu/ml/datasets/Forest+Fires
 61
 62    Examples:
 63        Load in the data set::
 64
 65            >>> dataset = ForestFire()
 66            >>> dataset.shape
 67            (517, 13)
 68
 69        Split the data set into features and targets, as NumPy arrays::
 70
 71            >>> X, y = dataset.split()
 72            >>> X.shape, y.shape
 73            ((517, 12), (517,))
 74
 75        Perform a train/test split, also outputting NumPy arrays::
 76
 77            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
 78            >>> X_train, X_test, y_train, y_test = train_test_split
 79            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
 80            ((401, 12), (401,), (116, 12), (116,))
 81
 82        Output the underlying Pandas DataFrame::
 83
 84            >>> df = dataset.to_pandas()
 85            >>> type(df)
 86            <class 'pandas.core.frame.DataFrame'>
 87    """
 88
 89    _url = (
 90        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
 91        "forest-fires/forestfires.csv"
 92    )
 93
 94    _features = range(12)
 95    _targets = [12]
 96
 97    def _prep_data(self, data: bytes) -> pd.DataFrame:
 98        """Prepare the data set.
 99
100        Args:
101            data (bytes): The raw data
102
103        Returns:
104            Pandas dataframe: The prepared data
105        """
106        # Convert the bytes into a file-like object
107        csv_file = io.BytesIO(data)
108
109        # Read the file-like object into a dataframe
110        df = pd.read_csv(csv_file)
111
112        # Encode month
113        months = [
114            "jan",
115            "feb",
116            "mar",
117            "apr",
118            "may",
119            "jun",
120            "jul",
121            "aug",
122            "sep",
123            "oct",
124            "nov",
125            "dec",
126        ]
127        df["month"] = df.month.map(lambda string: months.index(string))
128
129        # Encode day
130        weekdays = ["mon", "tue", "wed", "thu", "fri", "sat", "sun"]
131        df["day"] = df.day.map(lambda string: weekdays.index(string))
132
133        return df
class ForestFire(doubt.datasets.dataset.BaseDataset):
 16class ForestFire(BaseDataset):
 17    __doc__ = f"""
 18    This is a difficult regression task, where the aim is to predict the burned area of
 19    forest fires, in the northeast region of Portugal, by using meteorological and
 20    other data.
 21
 22    {BASE_DATASET_DESCRIPTION}
 23
 24    Features:
 25        X (float):
 26            The x-axis spatial coordinate within the Montesinho park map. Ranges from 1
 27            to 9.
 28        Y (float):
 29            The y-axis spatial coordinate within the Montesinho park map Ranges from 2
 30            to 9.
 31        month (int):
 32            Month of the year. Ranges from 0 to 11
 33        day (int):
 34            Day of the week. Ranges from 0 to 6
 35        FFMC (float):
 36            FFMC index from the FWI system. Ranges from 18.7 to 96.20
 37        DMC (float):
 38            DMC index from the FWI system. Ranges from 1.1 to 291.3
 39        DC (float):
 40            DC index from the FWI system. Ranges from 7.9 to 860.6
 41        ISI (float):
 42            ISI index from the FWI system. Ranges from 0.0 to 56.1
 43        temp (float):
 44            Temperature in Celsius degrees. Ranges from 2.2 to 33.3
 45        RH (float):
 46            Relative humidity in %. Ranges from 15.0 to 100.0
 47        wind (float):
 48            Wind speed in km/h. Ranges from 0.4 to 9.4
 49        rain (float):
 50            Outside rain in mm/m2. Ranges from 0.0 to 6.4
 51
 52    Targets:
 53        area (float):
 54            The burned area of the forest (in ha). Ranges from 0.00 to 1090.84
 55
 56    Notes:
 57        The target variable is very skewed towards 0.0, thus it may make sense to model
 58        with the logarithm transform.
 59
 60    Source:
 61        https://archive.ics.uci.edu/ml/datasets/Forest+Fires
 62
 63    Examples:
 64        Load in the data set::
 65
 66            >>> dataset = ForestFire()
 67            >>> dataset.shape
 68            (517, 13)
 69
 70        Split the data set into features and targets, as NumPy arrays::
 71
 72            >>> X, y = dataset.split()
 73            >>> X.shape, y.shape
 74            ((517, 12), (517,))
 75
 76        Perform a train/test split, also outputting NumPy arrays::
 77
 78            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
 79            >>> X_train, X_test, y_train, y_test = train_test_split
 80            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
 81            ((401, 12), (401,), (116, 12), (116,))
 82
 83        Output the underlying Pandas DataFrame::
 84
 85            >>> df = dataset.to_pandas()
 86            >>> type(df)
 87            <class 'pandas.core.frame.DataFrame'>
 88    """
 89
 90    _url = (
 91        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
 92        "forest-fires/forestfires.csv"
 93    )
 94
 95    _features = range(12)
 96    _targets = [12]
 97
 98    def _prep_data(self, data: bytes) -> pd.DataFrame:
 99        """Prepare the data set.
100
101        Args:
102            data (bytes): The raw data
103
104        Returns:
105            Pandas dataframe: The prepared data
106        """
107        # Convert the bytes into a file-like object
108        csv_file = io.BytesIO(data)
109
110        # Read the file-like object into a dataframe
111        df = pd.read_csv(csv_file)
112
113        # Encode month
114        months = [
115            "jan",
116            "feb",
117            "mar",
118            "apr",
119            "may",
120            "jun",
121            "jul",
122            "aug",
123            "sep",
124            "oct",
125            "nov",
126            "dec",
127        ]
128        df["month"] = df.month.map(lambda string: months.index(string))
129
130        # Encode day
131        weekdays = ["mon", "tue", "wed", "thu", "fri", "sat", "sun"]
132        df["day"] = df.day.map(lambda string: weekdays.index(string))
133
134        return df

This is a difficult regression task, where the aim is to predict the burned area of forest fires, in the northeast region of Portugal, by using meteorological and other data.

Arguments:
  • cache (str or None, optional): The name of the cache. It will be saved to cache in the current working directory. If None then no cache will be saved. Defaults to '.dataset_cache'.
Attributes:
  • cache (str or None): The name of the cache.
  • shape (tuple of integers): Dimensions of the data set
  • columns (list of strings): List of column names in the data set
Features:

  • X (float): The x-axis spatial coordinate within the Montesinho park map. Ranges from 1 to 9.
  • Y (float): The y-axis spatial coordinate within the Montesinho park map. Ranges from 2 to 9.
  • month (int): Month of the year. Ranges from 0 to 11.
  • day (int): Day of the week. Ranges from 0 to 6.
  • FFMC (float): FFMC index from the FWI system. Ranges from 18.7 to 96.20.
  • DMC (float): DMC index from the FWI system. Ranges from 1.1 to 291.3.
  • DC (float): DC index from the FWI system. Ranges from 7.9 to 860.6.
  • ISI (float): ISI index from the FWI system. Ranges from 0.0 to 56.1.
  • temp (float): Temperature in Celsius degrees. Ranges from 2.2 to 33.3.
  • RH (float): Relative humidity in %. Ranges from 15.0 to 100.0.
  • wind (float): Wind speed in km/h. Ranges from 0.4 to 9.4.
  • rain (float): Outside rain in mm/m2. Ranges from 0.0 to 6.4.

Targets:

area (float): The burned area of the forest (in ha). Ranges from 0.00 to 1090.84

Notes:

The target variable is very skewed towards 0.0, thus it may make sense to model with the logarithm transform.

Source:

https://archive.ics.uci.edu/ml/datasets/Forest+Fires

Examples:

Load in the data set::

>>> dataset = ForestFire()
>>> dataset.shape
(517, 13)

Split the data set into features and targets, as NumPy arrays::

>>> X, y = dataset.split()
>>> X.shape, y.shape
((517, 12), (517,))

Perform a train/test split, also outputting NumPy arrays::

>>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
>>> X_train, X_test, y_train, y_test = train_test_split
>>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
((401, 12), (401,), (116, 12), (116,))

Output the underlying Pandas DataFrame::

>>> df = dataset.to_pandas()
>>> type(df)
<class 'pandas.core.frame.DataFrame'>