PhiladelphiaCrimeDataset

srai.datasets.PhiladelphiaCrimeDataset

PhiladelphiaCrimeDataset()

Bases: PointDataset

Philadelphia Crime dataset.

Crime incidents from the Philadelphia Police Department. Part I crimes include violent offenses such as aggravated assault, rape, and arson. Part II crimes include non-violent offenses such as simple assault, prostitution, gambling, and fraud.

Source code in srai/datasets/philadelphia_crime.py
def __init__(self) -> None:
    """Create the dataset."""
    numerical_columns = None
    categorical_columns = [
        "hour",
        "dispatch_date",
        "dispatch_time",
        "dc_dist",
        "psa",
    ]
    type = "point"
    # target = "text_general_code"
    target = "count"

    super().__init__(
        "kraina/philadelphia_crime",
        type=type,
        numerical_columns=numerical_columns,
        categorical_columns=categorical_columns,
        target=target,
    )
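
Example usage (a minimal sketch; assumes srai is installed together with its datasets dependencies):

from srai.datasets import PhiladelphiaCrimeDataset

# The constructor wires up the Hugging Face dataset path
# ("kraina/philadelphia_crime"), the categorical columns, and the
# default "count" target shown in the source above.
dataset = PhiladelphiaCrimeDataset()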

download_data

download_data(version: Optional[Union[int, str]] = 2023) -> None

Download and cache the Philadelphia crime dataset for a given year.

  • If the Parquet cache already exists, the download step is skipped.
  • Otherwise, the CSV is streamed from the API, converted in-memory to Parquet, and cached for future use.
PARAMETER DESCRIPTION
version

Dataset year to download (e.g., 2013-2023). If given as a short H3 resolution code ('8', '9', '10'), defaults to benchmark splits of the year 2023.

TYPE: int or str DEFAULT: 2023

Source code in srai/datasets/philadelphia_crime.py
def download_data(self, version: Optional[Union[int, str]] = 2023) -> None:
    """
    Download and cache the Philadelphia crime dataset for a given year.

    - If the Parquet cache already exists, the download step is skipped.
    - Otherwise, the CSV is streamed from the API, converted in-memory to Parquet,
    and cached for future use.

    Args:
        version (int or str, optional): Dataset year to download (e.g., 2013-2023).
            If given as a short H3 resolution code ('8', '9', '10'),
            defaults to benchmark splits of the year 2023.
    """
    # Short values ('8', '9', '10') are H3 resolution codes rather than years;
    # fall back to the default year, 2023.
    if version is None or len(str(version)) <= 3:
        version = 2023

    cache_file = self._get_cache_file(str(version))
    cache_file.parent.mkdir(parents=True, exist_ok=True)

    if not cache_file.exists():
        url = self._make_url(int(version))

        print(f"Downloading crime data for {version}...")
        duckdb.read_csv(url).to_parquet(str(cache_file), compression="zstd")
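
Example usage (a sketch; the year 2022 is an arbitrary choice):

dataset = PhiladelphiaCrimeDataset()
# The first call streams the CSV and writes a zstd-compressed Parquet cache;
# repeated calls for the same year skip the download entirely.
dataset.download_data(version=2022)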

get_h3_with_labels

get_h3_with_labels() -> (
    tuple[
        gpd.GeoDataFrame, Optional[gpd.GeoDataFrame], Optional[gpd.GeoDataFrame]
    ]
)

Returns h3 indexes with target labels from the dataset.

Points are aggregated to hexes and target column values are averaged; if the target column is None, the number of points within each hex is calculated and scaled to [0, 1].

RETURNS DESCRIPTION
tuple[GeoDataFrame, Optional[GeoDataFrame], Optional[GeoDataFrame]]

tuple[gpd.GeoDataFrame, Optional[gpd.GeoDataFrame], Optional[gpd.GeoDataFrame]]: Train, Val, Test hexes with target labels in GeoDataFrames

Source code in srai/datasets/_base.py
def get_h3_with_labels(
    self,
    # resolution: Optional[int] = None,
    # target_column: Optional[str] = None,
) -> tuple[gpd.GeoDataFrame, Optional[gpd.GeoDataFrame], Optional[gpd.GeoDataFrame]]:
    """
    Returns h3 indexes with target labels from the dataset.

    Points are aggregated to hexes and target column values are averaged; if the target \
    column is None, the number of points within each hex is calculated and scaled to [0, 1].

    Returns:
        tuple[gpd.GeoDataFrame, Optional[gpd.GeoDataFrame], Optional[gpd.GeoDataFrame]]:\
            Train, Val, Test hexes with target labels in GeoDataFrames
    """
    # if target_column is None:
    #     target_column = "count"

    # resolution = resolution if resolution is not None else self.resolution

    assert self.train_gdf is not None
    # If resolution is still None, raise an error
    if self.resolution is None:
        raise ValueError(
            "No preset resolution for the dataset in self.resolution. Please"
            "provide a resolution."
        )
    # elif self.resolution is not None and resolution != self.resolution:
    #     raise ValueError(
    #         "Resolution provided is different from the preset resolution for the"
    #         "dataset. This may result in a data leak between splits."
    #     )

    _train_gdf = self._aggregate_hexes(self.train_gdf, self.resolution, self.target)

    if self.test_gdf is not None:
        _test_gdf = self._aggregate_hexes(self.test_gdf, self.resolution, self.target)
    else:
        _test_gdf = None

    if self.val_gdf is not None:
        _val_gdf = self._aggregate_hexes(self.val_gdf, self.resolution, self.target)
    else:
        _val_gdf = None

    # Scale the "count" column to [0, 1] if it is the target column
    if self.target == "count":
        scaler = MinMaxScaler()
        # Fit the scaler on the train dataset and transform
        _train_gdf["count"] = scaler.fit_transform(_train_gdf[["count"]])
        if _test_gdf is not None:
            _test_gdf["count"] = scaler.transform(_test_gdf[["count"]])
            _test_gdf["count"] = np.clip(_test_gdf["count"], 0, 1)
        if _val_gdf is not None:
            _val_gdf["count"] = scaler.transform(_val_gdf[["count"]])
            _val_gdf["count"] = np.clip(_val_gdf["count"], 0, 1)

    return _train_gdf, _val_gdf, _test_gdf
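
Example usage (a sketch; loading a benchmark split first sets self.resolution, which this method requires):

dataset = PhiladelphiaCrimeDataset()
dataset.load(version=8)  # benchmark split at H3 resolution 8
train_hexes, val_hexes, test_hexes = dataset.get_h3_with_labels()
# With the default "count" target, train values are min-max scaled to [0, 1];
# val/test values are transformed with the same scaler and clipped.
# val_hexes is None unless a validation split was created beforehand.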

load

load(
    version: Optional[Union[int, str]] = 8, hf_token: Optional[str] = None
) -> dict[str, gpd.GeoDataFrame]

Method to load dataset.

PARAMETER DESCRIPTION
hf_token

User Access Token used, if needed, to authenticate to the Hugging Face Hub. The environment variable HF_TOKEN can also be used. Defaults to None.

TYPE: str DEFAULT: None

version

Version of the dataset. Available: official spatial train-test split from the year 2023 at a chosen H3 resolution: '8', '9', '10'. Defaults to '8'. Raw data from other years is available as: '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023'.

TYPE: str or int DEFAULT: 8

RETURNS DESCRIPTION
dict[str, GeoDataFrame]

dict[str, gpd.GeoDataFrame]: Dictionary with all splits loaded from the dataset. Will contain keys "train" and "test" if available.

Source code in srai/datasets/philadelphia_crime.py
def load(
    self, version: Optional[Union[int, str]] = 8, hf_token: Optional[str] = None
) -> dict[str, gpd.GeoDataFrame]:
    """
    Method to load dataset.

    Args:
        hf_token (str, optional): User Access Token used, if needed, to authenticate to
            the Hugging Face Hub. The environment variable `HF_TOKEN` can also be used.
            Defaults to None.
        version (str or int, optional): Version of the dataset.
            Available: official spatial train-test split from the year 2023 at a chosen
            H3 resolution: '8', '9', '10'. Defaults to '8'. Raw data from other years is
            available as: '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020',
            '2021', '2022', '2023'.

    Returns:
        dict[str, gpd.GeoDataFrame]: Dictionary with all splits loaded from the dataset. Will
            contain keys "train" and "test" if available.
    """
    self.resolution = None
    self.download_data(version=version)

    from datasets import load_dataset

    result = {}

    self.train_gdf, self.val_gdf, self.test_gdf = None, None, None
    dataset_name = self.path
    self.version = str(version)

    if self.resolution is None and self.version in ("8", "9", "10"):
        with suppress(ValueError):
            # Try to parse version as int (e.g. "8" or "9")
            self.resolution = int(self.version)

    if len(str(version)) <= 3:
        # Benchmark versions ('8', '9', '10') are hosted on the Hugging Face Hub.
        data = load_dataset(dataset_name, str(version), token=hf_token, trust_remote_code=True)
    else:
        # Raw yearly versions have no Hub split; start from an empty placeholder dataset.
        empty_dataset = Dataset.from_pandas(pd.DataFrame())
        data = {"train": empty_dataset}
    train = data["train"].to_pandas()
    processed_train = self._preprocessing(train)
    self.train_gdf = processed_train
    result["train"] = processed_train
    if "test" in data:
        test = data["test"].to_pandas()
        processed_test = self._preprocessing(test)
        self.test_gdf = processed_test
        result["test"] = processed_test

    return result
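
Example usage (a sketch covering both version styles):

dataset = PhiladelphiaCrimeDataset()

# Official 2023 benchmark split at H3 resolution 9:
splits = dataset.load(version=9)
train_gdf, test_gdf = splits["train"], splits.get("test")

# Raw data for a single year; no official split, so only "train" is returned.
# A fresh instance keeps the benchmark splits above intact.
raw = PhiladelphiaCrimeDataset().load(version=2019)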

train_test_split

train_test_split(
    target_column: Optional[str] = None,
    resolution: Optional[int] = None,
    test_size: float = 0.2,
    n_bins: int = 7,
    random_state: Optional[int] = None,
    validation_split: bool = False,
    force_split: bool = False,
    task: Optional[str] = None,
) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]

Method to generate splits from GeoDataFrame, based on the target_column values.

PARAMETER DESCRIPTION
target_column

Target column name. If None, the split is generated based on the number of points within a hex at the given resolution. Defaults to the dataset's preset target column.

TYPE: Optional[str] DEFAULT: None

resolution

H3 resolution used to regionalize the data. Defaults to the dataset's preset value.

TYPE: int DEFAULT: None

test_size

Percentage of test set. Defaults to 0.2.

TYPE: float DEFAULT: 0.2

n_bins

Number of buckets used to stratify the target data. Defaults to 7.

TYPE: int DEFAULT: 7

random_state

Controls the shuffling applied to the data before applying the split. Pass an int for reproducible output across multiple function calls. Defaults to None.

TYPE: int DEFAULT: None

validation_split

If True, creates a validation split from the existing train split and assigns it to self.val_gdf.

TYPE: bool DEFAULT: False

force_split

If True, forces a new split to be created, even if an existing train/test or validation split is already present.

  • With validation_split=False, regenerates and overwrites the test split.
  • With validation_split=True, regenerates and overwrites the validation split.

TYPE: bool DEFAULT: False

task

Currently not supported. Ignored in this subclass.

TYPE: Optional[str] DEFAULT: None

RETURNS DESCRIPTION
tuple

Train-test or train-val split made on the previous train subset.

TYPE: (GeoDataFrame, GeoDataFrame)

Source code in srai/datasets/_base.py
def train_test_split(
    self,
    target_column: Optional[str] = None,
    resolution: Optional[int] = None,
    test_size: float = 0.2,
    n_bins: int = 7,
    random_state: Optional[int] = None,
    validation_split: bool = False,
    force_split: bool = False,
    task: Optional[str] = None,
) -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]:
    """
    Method to generate splits from GeoDataFrame, based on the target_column values.

    Args:
        target_column (Optional[str], optional): Target column name. If None, the split is\
            generated based on the number of points within a hex at the given resolution.\
            Defaults to the dataset's preset target column.
        resolution (int, optional): H3 resolution used to regionalize the data. Defaults to\
            the dataset's preset value.
        test_size (float, optional): Percentage of test set. Defaults to 0.2.
        n_bins (int, optional): Number of buckets used to stratify the target data.\
            Defaults to 7.
        random_state (int, optional): Controls the shuffling applied to the data before\
            applying the split. Pass an int for reproducible output across multiple\
            function calls. Defaults to None.
        validation_split (bool): If True, creates a validation split from existing train split\
            and assigns it to self.val_gdf.
        force_split: If True, forces a new split to be created, even if an existing train/test\
            or validation split is already present.
            - With `validation_split=False`, regenerates and overwrites the test split.
            - With `validation_split=True`, regenerates and overwrites the validation split.
        task (Optional[str], optional): Currently not supported. Ignored in this subclass.

    Returns:
        tuple(gpd.GeoDataFrame, gpd.GeoDataFrame): Train-test or train-val split made on\
            the previous train subset.
    """
    assert self.train_gdf is not None

    if (self.val_gdf is not None and validation_split and not force_split) or (
        self.test_gdf is not None and not validation_split and not force_split
    ):
        raise ValueError(
            "A split already exists. Use `force_split=True` to overwrite the existing "
            f"{'validation' if validation_split else 'test'} split."
        )

    resolution = resolution or self.resolution

    if resolution is None:
        raise ValueError(
            "No preset resolution for the dataset in self.resolution. Please "
            "provide a resolution."
        )
    elif self.resolution is not None and resolution != self.resolution:
        raise ValueError(
            "Resolution provided is different from the preset resolution for the "
            "dataset. This may result in a data leak between splits."
        )

    if self.resolution is None:
        self.resolution = resolution
    target_column = target_column if target_column is not None else self.target
    if target_column is None:
        target_column = "count"

    gdf = self.train_gdf
    gdf_ = gdf.copy()

    train, test = train_test_spatial_split(
        gdf_,
        parent_h3_resolution=resolution,
        target_column=target_column,
        test_size=test_size,
        n_bins=n_bins,
        random_state=random_state,
    )

    self.train_gdf = train
    if not validation_split:
        self.test_gdf = test
        test_len = len(self.test_gdf) if self.test_gdf is not None else 0
        print(
            f"Created new train_gdf and test_gdf. Train len: {len(self.train_gdf)}, "
            f"test len: {test_len}"
        )
    else:
        self.val_gdf = test
        val_len = len(self.val_gdf) if self.val_gdf is not None else 0
        test_len = len(self.test_gdf) if self.test_gdf is not None else 0
        print(
            f"Created new train_gdf and val_gdf. Test split remains unchanged. "
            f"Train len: {len(self.train_gdf)}, val len: {val_len}, "
            f"test len: {test_len}"
        )
    return train, test
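
Example usage (a sketch; carves a validation split out of the current train split):

dataset = PhiladelphiaCrimeDataset()
dataset.load(version=8)  # benchmark split; also presets the resolution
train, val = dataset.train_test_split(
    validation_split=True,
    test_size=0.2,
    random_state=42,
)
# The splits are also stored on the instance as dataset.train_gdf and
# dataset.val_gdf; the existing test split stays untouched.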