doubt.datasets.facebook_metrics

Facebook metrics data set.

This data set is from the UCI data set archive, with the description being the original description verbatim. Some feature names may have been altered, based on the description.

  1"""Facebook metrics data set.
  2
  3This data set is from the UCI data set archive, with the description being the original
  4description verbatim. Some feature names may have been altered, based on the
  5description.
  6"""
  7
  8import io
  9import zipfile
 10
 11import pandas as pd
 12
 13from .dataset import BASE_DATASET_DESCRIPTION, BaseDataset
 14
 15
 16class FacebookMetrics(BaseDataset):
 17    __doc__ = f"""
 18    The data is related to posts' published during the year of 2014 on the Facebook's
 19    page of a renowned cosmetics brand.
 20
 21    {BASE_DATASET_DESCRIPTION}
 22
 23    Features:
 24        page_likes(int):
 25            The total number of likes of the Facebook page at the given time.
 26        post_type (int):
 27            The type of post. Here 0 means 'Photo', 1 means 'Status', 2 means 'Link'
 28            and 3 means 'Video'
 29        post_category (int):
 30            The category of the post.
 31        post_month (int):
 32            The month the post was posted, from 1 to 12 inclusive.
 33        post_weekday (int):
 34            The day of the week the post was posted, from 1 to 7 inclusive.
 35        post_hour (int):
 36            The hour the post was posted, from 0 to 23 inclusive
 37        paid (int):
 38            Binary feature, whether the post was paid for.
 39
 40    Targets:
 41        total_reach (int):
 42            The lifetime post total reach.
 43        total_impressions (int):
 44            The lifetime post total impressions.
 45        engaged_users (int):
 46            The lifetime engaged users.
 47        post_consumers (int):
 48            The lifetime post consumers.
 49        post_consumptions (int):
 50            The lifetime post consumptions.
 51        post_impressions (int):
 52            The lifetime post impressions by people who liked the page.
 53        post_reach (int):
 54            The lifetime post reach by people who liked the page.
 55        post_engagements (int):
 56            The lifetime people who have liked the page and engaged with
 57            the post.
 58        comments (int):
 59            The number of comments.
 60        shares (int):
 61            The number of shares.
 62        total_interactions (int):
 63            The total number of interactions
 64
 65    Source:
 66        https://archive.ics.uci.edu/ml/datasets/Facebook+metrics
 67
 68    Examples:
 69        Load in the data set::
 70
 71            >>> dataset = FacebookMetrics()
 72            >>> dataset.shape
 73            (500, 18)
 74
 75        Split the data set into features and targets, as NumPy arrays::
 76
 77            >>> X, y = dataset.split()
 78            >>> X.shape, y.shape
 79            ((500, 7), (500, 11))
 80
 81        Perform a train/test split, also outputting NumPy arrays::
 82
 83            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
 84            >>> X_train, X_test, y_train, y_test = train_test_split
 85            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
 86            ((388, 7), (388, 11), (112, 7), (112, 11))
 87
 88        Output the underlying Pandas DataFrame::
 89
 90            >>> df = dataset.to_pandas()
 91            >>> type(df)
 92            <class 'pandas.core.frame.DataFrame'>
 93    """
 94
 95    _url = (
 96        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
 97        "00368/Facebook_metrics.zip"
 98    )
 99
100    _features = range(7)
101    _targets = range(7, 18)
102
103    def _prep_data(self, data: bytes) -> pd.DataFrame:
104        """Prepare the data set.
105
106        Args:
107            data (bytes): The raw data
108
109        Returns:
110            Pandas dataframe: The prepared data
111        """
112        # Convert the bytes into a file-like object
113        buffer = io.BytesIO(data)
114
115        # Unzip the file and pull out the csv file
116        with zipfile.ZipFile(buffer, "r") as zip_file:
117            csv = zip_file.read("dataset_Facebook.csv")
118
119        # Convert the bytes into a file-like object
120        csv_file = io.BytesIO(csv)
121
122        # Read the file-like object into a dataframe
123        cols = [
124            "page_likes",
125            "post_type",
126            "post_category",
127            "post_month",
128            "post_weekday",
129            "post_hour",
130            "paid",
131            "total_reach",
132            "total_impressions",
133            "engaged_users",
134            "post_consumers",
135            "post_consumptions",
136            "post_impressions",
137            "post_reach",
138            "post_engagements",
139            "comments",
140            "shares",
141            "total_interactions",
142        ]
143        df = pd.read_csv(csv_file, sep=";", names=cols, header=0, index_col=0)
144
145        # Numericalise post type
146        post_types = list(df.post_type.unique())
147        df["post_type"] = df.post_type.map(lambda txt: post_types.index(txt))
148
149        return df
class FacebookMetrics(doubt.datasets.dataset.BaseDataset):
 17class FacebookMetrics(BaseDataset):
 18    __doc__ = f"""
 19    The data is related to posts' published during the year of 2014 on the Facebook's
 20    page of a renowned cosmetics brand.
 21
 22    {BASE_DATASET_DESCRIPTION}
 23
 24    Features:
 25        page_likes(int):
 26            The total number of likes of the Facebook page at the given time.
 27        post_type (int):
 28            The type of post. Here 0 means 'Photo', 1 means 'Status', 2 means 'Link'
 29            and 3 means 'Video'
 30        post_category (int):
 31            The category of the post.
 32        post_month (int):
 33            The month the post was posted, from 1 to 12 inclusive.
 34        post_weekday (int):
 35            The day of the week the post was posted, from 1 to 7 inclusive.
 36        post_hour (int):
 37            The hour the post was posted, from 0 to 23 inclusive
 38        paid (int):
 39            Binary feature, whether the post was paid for.
 40
 41    Targets:
 42        total_reach (int):
 43            The lifetime post total reach.
 44        total_impressions (int):
 45            The lifetime post total impressions.
 46        engaged_users (int):
 47            The lifetime engaged users.
 48        post_consumers (int):
 49            The lifetime post consumers.
 50        post_consumptions (int):
 51            The lifetime post consumptions.
 52        post_impressions (int):
 53            The lifetime post impressions by people who liked the page.
 54        post_reach (int):
 55            The lifetime post reach by people who liked the page.
 56        post_engagements (int):
 57            The lifetime people who have liked the page and engaged with
 58            the post.
 59        comments (int):
 60            The number of comments.
 61        shares (int):
 62            The number of shares.
 63        total_interactions (int):
 64            The total number of interactions
 65
 66    Source:
 67        https://archive.ics.uci.edu/ml/datasets/Facebook+metrics
 68
 69    Examples:
 70        Load in the data set::
 71
 72            >>> dataset = FacebookMetrics()
 73            >>> dataset.shape
 74            (500, 18)
 75
 76        Split the data set into features and targets, as NumPy arrays::
 77
 78            >>> X, y = dataset.split()
 79            >>> X.shape, y.shape
 80            ((500, 7), (500, 11))
 81
 82        Perform a train/test split, also outputting NumPy arrays::
 83
 84            >>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
 85            >>> X_train, X_test, y_train, y_test = train_test_split
 86            >>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
 87            ((388, 7), (388, 11), (112, 7), (112, 11))
 88
 89        Output the underlying Pandas DataFrame::
 90
 91            >>> df = dataset.to_pandas()
 92            >>> type(df)
 93            <class 'pandas.core.frame.DataFrame'>
 94    """
 95
 96    _url = (
 97        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
 98        "00368/Facebook_metrics.zip"
 99    )
100
101    _features = range(7)
102    _targets = range(7, 18)
103
104    def _prep_data(self, data: bytes) -> pd.DataFrame:
105        """Prepare the data set.
106
107        Args:
108            data (bytes): The raw data
109
110        Returns:
111            Pandas dataframe: The prepared data
112        """
113        # Convert the bytes into a file-like object
114        buffer = io.BytesIO(data)
115
116        # Unzip the file and pull out the csv file
117        with zipfile.ZipFile(buffer, "r") as zip_file:
118            csv = zip_file.read("dataset_Facebook.csv")
119
120        # Convert the bytes into a file-like object
121        csv_file = io.BytesIO(csv)
122
123        # Read the file-like object into a dataframe
124        cols = [
125            "page_likes",
126            "post_type",
127            "post_category",
128            "post_month",
129            "post_weekday",
130            "post_hour",
131            "paid",
132            "total_reach",
133            "total_impressions",
134            "engaged_users",
135            "post_consumers",
136            "post_consumptions",
137            "post_impressions",
138            "post_reach",
139            "post_engagements",
140            "comments",
141            "shares",
142            "total_interactions",
143        ]
144        df = pd.read_csv(csv_file, sep=";", names=cols, header=0, index_col=0)
145
146        # Numericalise post type
147        post_types = list(df.post_type.unique())
148        df["post_type"] = df.post_type.map(lambda txt: post_types.index(txt))
149
150        return df

The data is related to posts' published during the year of 2014 on the Facebook's page of a renowned cosmetics brand.

Arguments:
  • cache (str or None, optional): The name of the cache. It will be saved to cache in the current working directory. If None then no cache will be saved. Defaults to '.dataset_cache'.
Attributes:
  • cache (str or None): The name of the cache.
  • shape (tuple of integers): Dimensions of the data set
  • columns (list of strings): List of column names in the data set
Features:

  • page_likes (int): The total number of likes of the Facebook page at the given time.
  • post_type (int): The type of post. Here 0 means 'Photo', 1 means 'Status', 2 means 'Link' and 3 means 'Video'.
  • post_category (int): The category of the post.
  • post_month (int): The month the post was posted, from 1 to 12 inclusive.
  • post_weekday (int): The day of the week the post was posted, from 1 to 7 inclusive.
  • post_hour (int): The hour the post was posted, from 0 to 23 inclusive.
  • paid (int): Binary feature, whether the post was paid for.

Targets:

  • total_reach (int): The lifetime post total reach.
  • total_impressions (int): The lifetime post total impressions.
  • engaged_users (int): The lifetime engaged users.
  • post_consumers (int): The lifetime post consumers.
  • post_consumptions (int): The lifetime post consumptions.
  • post_impressions (int): The lifetime post impressions by people who liked the page.
  • post_reach (int): The lifetime post reach by people who liked the page.
  • post_engagements (int): The lifetime people who have liked the page and engaged with the post.
  • comments (int): The number of comments.
  • shares (int): The number of shares.
  • total_interactions (int): The total number of interactions.

Source:

https://archive.ics.uci.edu/ml/datasets/Facebook+metrics

Examples:

Load in the data set:

>>> dataset = FacebookMetrics()
>>> dataset.shape
(500, 18)

Split the data set into features and targets, as NumPy arrays:

>>> X, y = dataset.split()
>>> X.shape, y.shape
((500, 7), (500, 11))

Perform a train/test split, also outputting NumPy arrays:

>>> train_test_split = dataset.split(test_size=0.2, random_seed=42)
>>> X_train, X_test, y_train, y_test = train_test_split
>>> X_train.shape, y_train.shape, X_test.shape, y_test.shape
((388, 7), (388, 11), (112, 7), (112, 11))

Output the underlying Pandas DataFrame:

>>> df = dataset.to_pandas()
>>> type(df)
<class 'pandas.core.frame.DataFrame'>