AirbnbMulticityDataset

srai.datasets.AirbnbMulticityDataset

AirbnbMulticityDataset()

Bases: PointDataset

AirbnbMulticity dataset.

Airbnb listings data from multiple cities, with listing price as the prediction target. The benchmark versions cover six cities (Paris, Rome, London, Amsterdam, Melbourne, New York City); the raw data spans ~80 cities (see load below).

Source code in srai/datasets/airbnb_multicity.py
def __init__(self) -> None:
    """Create the dataset."""
    categorical_columns = ["name", "host_name", "neighborhood", "room_type", "city"]
    numerical_columns = [
        "number_of_reviews",
        "minimum_nights",
        "availability_365",
        "calculated_host_listings_count",
        "number_of_reviews_ltm",
    ]
    target = "price"
    type = "point"
    super().__init__(
        "kraina/airbnb_multicity",
        type=type,
        numerical_columns=numerical_columns,
        categorical_columns=categorical_columns,
        target=target,
    )
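
A minimal usage sketch (the import path follows the qualified name at the top of this page; the attribute names printed below are assumed to mirror the constructor arguments):

from srai.datasets import AirbnbMulticityDataset

# Instantiate the dataset wrapper; columns, target and type are preset as above.
dataset = AirbnbMulticityDataset()

# Preset target column for the prediction task (assumed attribute).
print(dataset.target)  # "price"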

get_h3_with_labels

get_h3_with_labels() -> (
    tuple[
        gpd.GeoDataFrame, Optional[gpd.GeoDataFrame], Optional[gpd.GeoDataFrame]
    ]
)

Returns h3 indexes with target labels from the dataset.

Points are aggregated to hexes and the target column values are averaged. If the target column is None, the number of points within each hex is calculated and scaled to [0, 1].

RETURNS DESCRIPTION
tuple[GeoDataFrame, Optional[GeoDataFrame], Optional[GeoDataFrame]]

tuple[gpd.GeoDataFrame, Optional[gpd.GeoDataFrame], Optional[gpd.GeoDataFrame]]: Train, Val, Test hexes with target labels in GeoDataFrames

Source code in srai/datasets/_base.py
def get_h3_with_labels(
    self,
) -> tuple[gpd.GeoDataFrame, Optional[gpd.GeoDataFrame], Optional[gpd.GeoDataFrame]]:
    """
    Returns h3 indexes with target labels from the dataset.

    Points are aggregated to hexes and target column values are averaged. If the target \
    column is None, the number of points within each hex is calculated and scaled to [0, 1].

    Returns:
        tuple[gpd.GeoDataFrame, Optional[gpd.GeoDataFrame], Optional[gpd.GeoDataFrame]]:\
            Train, Val, Test hexes with target labels in GeoDataFrames
    """
    assert self.train_gdf is not None
    # Raise if the dataset has no preset resolution
    if self.resolution is None:
        raise ValueError(
            "No preset resolution for the dataset in self.resolution. Please "
            "provide a resolution."
        )

    _train_gdf = self._aggregate_hexes(self.train_gdf, self.resolution, self.target)

    if self.test_gdf is not None:
        _test_gdf = self._aggregate_hexes(self.test_gdf, self.resolution, self.target)
    else:
        _test_gdf = None

    if self.val_gdf is not None:
        _val_gdf = self._aggregate_hexes(self.val_gdf, self.resolution, self.target)
    else:
        _val_gdf = None

    # Scale the "count" column to [0, 1] if it is the target column
    if self.target == "count":
        scaler = MinMaxScaler()
        # Fit the scaler on the train dataset and transform
        _train_gdf["count"] = scaler.fit_transform(_train_gdf[["count"]])
        if _test_gdf is not None:
            _test_gdf["count"] = scaler.transform(_test_gdf[["count"]])
            _test_gdf["count"] = np.clip(_test_gdf["count"], 0, 1)
        if _val_gdf is not None:
            _val_gdf["count"] = scaler.transform(_val_gdf[["count"]])
            _val_gdf["count"] = np.clip(_val_gdf["count"], 0, 1)

    return _train_gdf, _val_gdf, _test_gdf
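
A sketch of the typical flow around this method. It relies on self.train_gdf being populated (per the assertion above), so load() is called first:

dataset = AirbnbMulticityDataset()
dataset.load(version=8)  # populates dataset.train_gdf (and dataset.test_gdf if shipped)

# Aggregate points to h3 hexes at the dataset's preset resolution.
train_hexes, val_hexes, test_hexes = dataset.get_h3_with_labels()

# train_hexes holds the averaged "price" per hex; val_hexes is None unless a
# validation split was created beforehand with train_test_split(validation_split=True).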

load

load(
    version: Optional[Union[int, str]] = 8, hf_token: Optional[str] = None
) -> dict[str, gpd.GeoDataFrame]

Method to load dataset.

PARAMETER DESCRIPTION
hf_token

If needed, a User Access Token used to authenticate to the Hugging Face Hub. The HF_TOKEN environment variable can also be used. Defaults to None.

TYPE: str DEFAULT: None

version

Version of the dataset. Available: '8', '9', '10', where the number is the h3 resolution used in the train-test split. The benchmark versions comprise six cities: Paris, Rome, London, Amsterdam, Melbourne, New York City. Raw, full data from ~80 cities is available as 'all'.

TYPE: str or int DEFAULT: 8

RETURNS DESCRIPTION
dict[str, GeoDataFrame]

dict[str, gpd.GeoDataFrame]: Dictionary with all splits loaded from the dataset. Will contain keys "train" and "test" if available.

Source code in srai/datasets/airbnb_multicity.py
def load(
    self, version: Optional[Union[int, str]] = 8, hf_token: Optional[str] = None
) -> dict[str, gpd.GeoDataFrame]:
    """
    Method to load dataset.

    Args:
        hf_token (str, optional): If needed, a User Access Token used to authenticate to
            the Hugging Face Hub. The `HF_TOKEN` environment variable can also be used.
            Defaults to None.
        version (str or int, optional): Version of the dataset.
            Available: '8', '9', '10', where the number is the h3 resolution used in the \
            train-test split. The benchmark versions comprise six cities: Paris, Rome, \
            London, Amsterdam, Melbourne, New York City. Raw, full data from ~80 cities \
            is available as 'all'.

    Returns:
        dict[str, gpd.GeoDataFrame]: Dictionary with all splits loaded from the dataset. Will
            contain keys "train" and "test" if available.
    """
    return super().load(version=version, hf_token=hf_token)
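
For illustration, loading different versions (values taken from the docstring above; a Hugging Face token can also come from the HF_TOKEN environment variable):

dataset = AirbnbMulticityDataset()

# Benchmark split at h3 resolution 9.
splits = dataset.load(version=9)
train_gdf = splits["train"]
test_gdf = splits.get("test")  # present if this version ships a test split

# Raw, full data from ~80 cities.
raw = dataset.load(version="all")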

train_test_split

train_test_split(
    target_column: Optional[str] = None,
    resolution: Optional[int] = None,
    test_size: float = 0.2,
    n_bins: int = 7,
    random_state: Optional[int] = None,
    validation_split: bool = False,
    force_split: bool = False,
    task: Optional[str] = None,
) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]

Method to generate splits from GeoDataFrame, based on the target_column values.

PARAMETER DESCRIPTION
target_column

Target column name. If None, the split is generated based on the number of points within a hex of the given resolution. Defaults to the preset dataset target column.

TYPE: Optional[str] DEFAULT: None

resolution

h3 resolution used to regionalize data. Defaults to the dataset's preset value.

TYPE: int DEFAULT: None

test_size

Fraction of the data assigned to the test set. Defaults to 0.2.

TYPE: float DEFAULT: 0.2

n_bins

Number of buckets used to stratify target data. Defaults to 7.

TYPE: int DEFAULT: 7

random_state

Controls the shuffling applied to the data before the split. Pass an int for reproducible output across multiple function calls. Defaults to None.

TYPE: int DEFAULT: None

validation_split

If True, creates a validation split from existing train split and assigns it to self.val_gdf.

TYPE: bool DEFAULT: False

force_split

If True, forces a new split to be created, even if an existing train/test or validation split is already present.

- With validation_split=False, regenerates and overwrites the test split.
- With validation_split=True, regenerates and overwrites the validation split.

TYPE: bool DEFAULT: False

task

Currently not supported. Ignored in this subclass.

TYPE: Optional[str] DEFAULT: None

RETURNS DESCRIPTION
tuple

Train-test or train-val split made on the previous train subset.

TYPE: (GeoDataFrame, GeoDataFrame)

Source code in srai/datasets/_base.py
def train_test_split(
    self,
    target_column: Optional[str] = None,
    resolution: Optional[int] = None,
    test_size: float = 0.2,
    n_bins: int = 7,
    random_state: Optional[int] = None,
    validation_split: bool = False,
    force_split: bool = False,
    task: Optional[str] = None,
) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]:
    """
    Method to generate splits from GeoDataFrame, based on the target_column values.

    Args:
        target_column (Optional[str], optional): Target column name. If None, the split is\
            generated based on the number of points within a hex of the given resolution.\
            Defaults to the preset dataset target column.
        resolution (int, optional): h3 resolution used to regionalize data. Defaults to the\
            dataset's preset value.
        test_size (float, optional): Fraction of the data assigned to the test set.\
            Defaults to 0.2.
        n_bins (int, optional): Number of buckets used to stratify target data.\
            Defaults to 7.
        random_state (int, optional): Controls the shuffling applied to the data before\
            the split. Pass an int for reproducible output across multiple function calls.\
            Defaults to None.
        validation_split (bool): If True, creates a validation split from existing train split\
            and assigns it to self.val_gdf.
        force_split: If True, forces a new split to be created, even if an existing train/test\
            or validation split is already present.
            - With `validation_split=False`, regenerates and overwrites the test split.
            - With `validation_split=True`, regenerates and overwrites the validation split.
        task (Optional[str], optional): Currently not supported. Ignored in this subclass.

    Returns:
        tuple(gpd.GeoDataFrame, gpd.GeoDataFrame): Train-test or train-val split made on\
            the previous train subset.
    """
    assert self.train_gdf is not None

    if (self.val_gdf is not None and validation_split and not force_split) or (
        self.test_gdf is not None and not validation_split and not force_split
    ):
        raise ValueError(
            "A split already exists. Use `force_split=True` to overwrite the existing "
            f"{'validation' if validation_split else 'test'} split."
        )

    resolution = resolution or self.resolution

    if resolution is None:
        raise ValueError(
            "No preset resolution for the dataset in self.resolution. Please "
            "provide a resolution."
        )
    elif self.resolution is not None and resolution != self.resolution:
        raise ValueError(
            "Resolution provided is different from the preset resolution for the "
            "dataset. This may result in a data leak between splits."
        )

    if self.resolution is None:
        self.resolution = resolution
    target_column = target_column if target_column is not None else self.target
    if target_column is None:
        target_column = "count"

    gdf = self.train_gdf
    gdf_ = gdf.copy()

    train, test = train_test_spatial_split(
        gdf_,
        parent_h3_resolution=resolution,
        target_column=target_column,
        test_size=test_size,
        n_bins=n_bins,
        random_state=random_state,
    )

    self.train_gdf = train
    if not validation_split:
        self.test_gdf = test
        test_len = len(self.test_gdf) if self.test_gdf is not None else 0
        print(
            f"Created new train_gdf and test_gdf. Train len: {len(self.train_gdf)}, "
            f"test len: {test_len}"
        )
    else:
        self.val_gdf = test
        val_len = len(self.val_gdf) if self.val_gdf is not None else 0
        test_len = len(self.test_gdf) if self.test_gdf is not None else 0
        print(
            "Created new train_gdf and val_gdf. Test split remains unchanged. "
            f"Train len: {len(self.train_gdf)}, val len: {val_len}, "
            f"test len: {test_len}"
        )
    return train, test
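
A sketch of carving a validation split out of an existing train split (per the checks above, any resolution passed must match the preset one, and overwriting an existing split requires force_split=True):

dataset = AirbnbMulticityDataset()
dataset.load(version=8)

# Create train/val from the current train split; the test split is untouched.
train_gdf, val_gdf = dataset.train_test_split(
    test_size=0.2,
    random_state=42,
    validation_split=True,
)

# A second call with validation_split=True raises ValueError
# unless force_split=True is passed.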