Geovex embedder

In [1]:

Copied!





from srai.embedders import GeoVexEmbedder, Hex2VecEmbedder
from srai.joiners import IntersectionJoiner
from srai.loaders import OSMPbfLoader
from srai.loaders.osm_loaders.filters import GEOFABRIK_LAYERS
from srai.neighbourhoods import H3Neighbourhood
from srai.regionalizers import H3Regionalizer, geocode_to_region_gdf
from srai.plotting import plot_regions, plot_numeric_data
from srai.h3 import ring_buffer_h3_regions_gdf

import warnings

from pytorch_lightning import seed_everything
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.cluster import KMeans
import torch
from srai.embedders import GeoVexEmbedder, Hex2VecEmbedder
from srai.joiners import IntersectionJoiner
from srai.loaders import OSMPbfLoader
from srai.loaders.osm_loaders.filters import GEOFABRIK_LAYERS
from srai.neighbourhoods import H3Neighbourhood
from srai.regionalizers import H3Regionalizer, geocode_to_region_gdf
from srai.plotting import plot_regions, plot_numeric_data
from srai.h3 import ring_buffer_h3_regions_gdf

import warnings

from pytorch_lightning import seed_everything
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.cluster import KMeans
import torch

In [2]:

Copied!

SEED = 71
seed_everything(SEED)
SEED = 71
seed_everything(SEED)

Seed set to 71

Out[2]:

Load data from OSM¶

First use geocoding to get the area

In [3]:

Copied!

area_gdf = geocode_to_region_gdf("Wrocław, Poland")
plot_regions(area_gdf, tiles_style="CartoDB positron")
area_gdf = geocode_to_region_gdf("Wrocław, Poland")
plot_regions(area_gdf, tiles_style="CartoDB positron")

Out[3]:

Make this Notebook Trusted to load map: File -> Trust Notebook

Buffer the Area¶

The GeoVex embedder requires a buffer around the area of interest, as the hexagon needs to have its radius k neighbors in the dataset as well. The buffer is defined in hexagon radius units, so a buffer of 1 means that the hexagon will have its 1-neighborhood in the dataset as well.

In [4]:

Copied!

area_gdf.head()
area_gdf.head()

Out[4]:

	geometry
region_id
Wrocław, Lower Silesian Voivodeship, Poland	POLYGON ((16.80734 51.13895, 16.80859 51.13887...

In [5]:

Copied!





resolution = 9
k_ring_buffer_radius = 4

regionalizer = H3Regionalizer(resolution=resolution)
base_h3_regions = regionalizer.transform(area_gdf)

buffered_h3_regions = ring_buffer_h3_regions_gdf(base_h3_regions, distance=k_ring_buffer_radius)
buffered_h3_geometry = buffered_h3_regions.unary_union

print("Base regions:", len(base_h3_regions))
print("Buffered regions:", len(buffered_h3_regions))
resolution = 9
k_ring_buffer_radius = 4

regionalizer = H3Regionalizer(resolution=resolution)
base_h3_regions = regionalizer.transform(area_gdf)

buffered_h3_regions = ring_buffer_h3_regions_gdf(base_h3_regions, distance=k_ring_buffer_radius)
buffered_h3_geometry = buffered_h3_regions.unary_union

print("Base regions:", len(base_h3_regions))
print("Buffered regions:", len(buffered_h3_regions))

Base regions: 3168
Buffered regions: 4319

Download the Data¶

Next, download the data for the selected region and the specified tags.

In [6]:

Copied!

tags = GEOFABRIK_LAYERS
loader = OSMPbfLoader()

features_gdf = loader.load(buffered_h3_geometry, tags)
tags = GEOFABRIK_LAYERS
loader = OSMPbfLoader()

features_gdf = loader.load(buffered_h3_geometry, tags)

/opt/hostedtoolcache/Python/3.10.13/x64/lib/python3.10/site-packages/srai/loaders/osm_loaders/pbf_file_downloader.py:154: UserWarning: Error occured (Expecting value: line 1 column 1 (char 0)). Auto-switching to 'geofabrik' download source.
  warnings.warn(
Finding matching extracts: 100%|██████████| 1/1 [00:00<00:00, 99.00it/s]
Filtering extracts: 100%|██████████| 1/1 [00:00<00:00, 555.02it/s]
/opt/hostedtoolcache/Python/3.10.13/x64/lib/python3.10/site-packages/srai/loaders/osm_loaders/openstreetmap_extracts.py:344: FutureWarning: `unary_union` returned None due to all-None GeoSeries. In future, `unary_union` will return 'GEOMETRYCOLLECTION EMPTY' instead.
  ].unary_union
dolnoslaskie.osm.pbf: 100%|██████████| 148M/148M [00:09<00:00, 17.1MiB/s]
osmconvert: 100%|██████████| 246k/246k [00:00<00:00, 556kiB/s]
Clipping PBF files: 100%|██████████| 1/1 [00:06<00:00,  6.30s/it]
[0] Counting pbf features: 3245755it [00:08, 394507.86it/s]
[0] Parsing pbf file #1: 100%|██████████| 3245755/3245755 [01:11<00:00, 45602.84it/s]
Grouping features: 100%|██████████| 28/28 [00:05<00:00,  4.84it/s]

Prepare the data for embedding¶

After downloading the data, we need to prepare it for embedding. In the previous step we have regionalized the selected area and buffered it, now we have to join the features with prepared regions.

In [7]:

Copied!

plot_regions(buffered_h3_regions, tiles_style="CartoDB positron")
plot_regions(buffered_h3_regions, tiles_style="CartoDB positron")

Out[7]:

Make this Notebook Trusted to load map: File -> Trust Notebook

In [8]:

Copied!

joiner = IntersectionJoiner()
joint_gdf = joiner.transform(buffered_h3_regions, features_gdf)
joint_gdf
joiner = IntersectionJoiner()
joint_gdf = joiner.transform(buffered_h3_regions, features_gdf)
joint_gdf

Out[8]:


region_id	feature_id
891e204183bffff	way/925740071
891e2041877ffff	way/925740071
891e204183bffff	way/1061385575
891e2041877ffff	way/1061385575
891e204183bffff	node/4105097025
...	...
891e204400bffff	way/1153377164
	way/1036878741
	way/1153377161
	way/1153377162
	way/360854244

456716 rows × 0 columns

GeoVex-Embedding¶

After preparing the data we can proceed with generating embeddings for the regions.

In [9]:

Copied!





neighbourhood = H3Neighbourhood(buffered_h3_regions)

embedder = GeoVexEmbedder(
    target_features=GEOFABRIK_LAYERS,
    batch_size=10,
    neighbourhood_radius=k_ring_buffer_radius,
    convolutional_layers=2,
    embedding_size=50,
)
neighbourhood = H3Neighbourhood(buffered_h3_regions)

embedder = GeoVexEmbedder(
    target_features=GEOFABRIK_LAYERS,
    batch_size=10,
    neighbourhood_radius=k_ring_buffer_radius,
    convolutional_layers=2,
    embedding_size=50,
)

In [10]:

Copied!





with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    embeddings = embedder.fit_transform(
        regions_gdf=buffered_h3_regions,
        features_gdf=features_gdf,
        joint_gdf=joint_gdf,
        neighbourhood=neighbourhood,
        trainer_kwargs={
            # "max_epochs": 20, # uncomment for a longer training
            "max_epochs": 5,
            "accelerator": (
                "cpu" if torch.backends.mps.is_available() else "auto"
            ),  # GeoVexEmbedder does not support MPS
        },
        learning_rate=0.001,
    )

embeddings.head()
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    embeddings = embedder.fit_transform(
        regions_gdf=buffered_h3_regions,
        features_gdf=features_gdf,
        joint_gdf=joint_gdf,
        neighbourhood=neighbourhood,
        trainer_kwargs={
            # "max_epochs": 20, # uncomment for a longer training
            "max_epochs": 5,
            "accelerator": (
                "cpu" if torch.backends.mps.is_available() else "auto"
            ),  # GeoVexEmbedder does not support MPS
        },
        learning_rate=0.001,
    )

embeddings.head()

100%|██████████| 4319/4319 [00:00<00:00, 5526.59it/s]
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 2.4 M 
1 | decoder | Sequential | 1.8 M 
2 | _loss   | GeoVeXLoss | 0     
---------------------------------------
4.2 M     Trainable params
0         Non-trainable params
4.2 M     Total params
16.673    Total estimated model params size (MB)

`Trainer.fit` stopped: `max_epochs=5` reached.

Warning: Some regions were not able to be encoded, as they don't have r=4 neighbors.

Out[10]:

	0	1	2	3	4	5	6	7	8	9	...	40	41	42	43	44	45	46	47	48	49
region_id
891e204183bffff	-13.974160	-16.418303	-20.196938	8.804747	14.584895	-4.893361	22.196383	18.547247	2.622602	-14.869571	...	-12.507459	-8.770598	-10.020007	23.294451	0.263832	7.707475	15.594930	-13.536656	-9.086084	-5.962478
891e2055627ffff	-25.789455	-20.886932	-15.164046	7.000266	21.472120	-4.135105	18.078911	13.064086	-5.276225	-12.488445	...	-12.125144	-6.436296	-10.196702	24.486149	-11.478381	-3.609332	14.637024	-20.513647	-15.173162	6.176047
891e204e547ffff	-11.932467	-21.011549	-6.589490	-4.712363	18.367599	-5.190247	22.252245	17.299946	-2.251908	-13.417464	...	-13.243283	-8.406216	-11.621208	27.240257	0.168340	-4.911313	18.569231	-15.657393	-9.788277	4.532925
891e2051b6fffff	-25.413239	-23.691917	-20.414238	11.212957	19.427988	-2.622771	18.819981	14.501032	-0.484304	-14.150456	...	-11.811519	-9.700583	-11.296882	23.303148	-10.651851	-5.139103	15.264569	-16.366791	-13.728007	4.370296
891e2040bcfffff	8.214122	-5.371178	23.361122	-4.485201	19.785112	6.374000	35.022743	25.302542	-0.789087	-22.764334	...	-12.493669	-10.056778	-16.884756	31.351204	-10.768524	-2.064900	24.290670	-26.965254	-2.367098	3.681940

5 rows × 50 columns

Hex2Vec Embedding¶

In [11]:

Copied!





neighbourhood = H3Neighbourhood(buffered_h3_regions)

hex2vec_embedder = Hex2VecEmbedder(
    encoder_sizes=[300, 150, 50],
)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    hex2vec_embeddings = hex2vec_embedder.fit_transform(
        regions_gdf=buffered_h3_regions,
        features_gdf=features_gdf,
        joint_gdf=joint_gdf,
        neighbourhood=neighbourhood,
        negative_sample_k_distance=2,
        batch_size=64,
        learning_rate=0.001,
        trainer_kwargs={
            # "max_epochs": 50, # uncomment for a longer training
            "max_epochs": 5,
            "accelerator": "auto",
        },
    )

hex2vec_embeddings.head()
neighbourhood = H3Neighbourhood(buffered_h3_regions)

hex2vec_embedder = Hex2VecEmbedder(
    encoder_sizes=[300, 150, 50],
)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    hex2vec_embeddings = hex2vec_embedder.fit_transform(
        regions_gdf=buffered_h3_regions,
        features_gdf=features_gdf,
        joint_gdf=joint_gdf,
        neighbourhood=neighbourhood,
        negative_sample_k_distance=2,
        batch_size=64,
        learning_rate=0.001,
        trainer_kwargs={
            # "max_epochs": 50, # uncomment for a longer training
            "max_epochs": 5,
            "accelerator": "auto",
        },
    )

hex2vec_embeddings.head()

100%|██████████| 4319/4319 [00:00<00:00, 31151.89it/s]
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 136 K 
---------------------------------------
136 K     Trainable params
0         Non-trainable params
136 K     Total params
0.544     Total estimated model params size (MB)

`Trainer.fit` stopped: `max_epochs=5` reached.

Out[11]:

	0	1	2	3	4	5	6	7	8	9	...	40	41	42	43	44	45	46	47	48	49
region_id
891e204183bffff	0.025501	-0.412463	-0.080646	0.293095	-0.426965	0.266867	0.280766	0.243965	0.175183	0.151771	...	-0.318394	0.249635	-0.059346	-0.310585	-0.576608	0.208978	0.367321	-0.322218	0.055463	0.181674
891e2055627ffff	0.105834	0.131315	0.030233	-0.055389	-0.061091	0.085414	-0.098394	-0.314005	-0.087649	-0.065255	...	0.077125	-0.352293	0.089327	0.153773	-0.173634	-0.170836	0.176308	0.341642	-0.055950	0.159905
891e204e547ffff	-0.134959	-0.380925	-0.344720	0.313990	-0.380668	0.196804	0.060416	0.610162	0.385788	0.626101	...	-0.330846	0.355080	0.231830	0.011853	-0.832743	0.161955	0.668382	-0.296195	-0.175420	-0.181936
891e2051b6fffff	-0.086926	0.027231	-0.101769	-0.200843	0.107405	0.049931	0.184848	0.062769	-0.068150	0.233873	...	-0.018954	-0.070953	-0.199584	0.142664	-0.295934	0.374827	0.221490	-0.388492	-0.442332	0.021022
891e2041127ffff	-0.151679	-0.032471	-0.177545	0.018834	0.252875	0.172773	-0.023538	0.095769	0.430205	-0.443768	...	0.504530	0.455436	-0.555820	0.274231	0.090535	-0.375809	-0.539327	-0.145607	0.056447	0.252030

5 rows × 50 columns

Comparing the Embeddings¶

GeoVex Embedding¶

PCA¶

In [12]:

Copied!





# do pca with three components and then cast to RGB
pca = PCA(n_components=3)

pca_embeddings = pca.fit_transform(embeddings)
# make the embeddings into a dataframe
pca_embeddings = pd.DataFrame(pca_embeddings, index=embeddings.index)

# convert to RGB
pca_embeddings = (
    (pca_embeddings - pca_embeddings.min()) / (pca_embeddings.max() - pca_embeddings.min()) * 255
).astype(int)

# make the rgb array into a string
pca_embeddings["rgb"] = pca_embeddings.apply(
    lambda row: f"rgb({row[0]}, {row[1]}, {row[2]})", axis=1
)


color_dict = dict(enumerate(base_h3_regions.index.map(pca_embeddings["rgb"].to_dict()).to_list()))
base_h3_regions.reset_index().reset_index().explore(
    column="index",
    tooltip="region_id",
    tiles="CartoDB positron",
    legend=False,
    cmap=lambda x: color_dict[x],
    style_kwds=dict(color="#444", opacity=0.0, fillOpacity=0.5),
)
# do pca with three components and then cast to RGB
pca = PCA(n_components=3)

pca_embeddings = pca.fit_transform(embeddings)
# make the embeddings into a dataframe
pca_embeddings = pd.DataFrame(pca_embeddings, index=embeddings.index)

# convert to RGB
pca_embeddings = (
    (pca_embeddings - pca_embeddings.min()) / (pca_embeddings.max() - pca_embeddings.min()) * 255
).astype(int)

# make the rgb array into a string
pca_embeddings["rgb"] = pca_embeddings.apply(
    lambda row: f"rgb({row[0]}, {row[1]}, {row[2]})", axis=1
)


color_dict = dict(enumerate(base_h3_regions.index.map(pca_embeddings["rgb"].to_dict()).to_list()))
base_h3_regions.reset_index().reset_index().explore(
    column="index",
    tooltip="region_id",
    tiles="CartoDB positron",
    legend=False,
    cmap=lambda x: color_dict[x],
    style_kwds=dict(color="#444", opacity=0.0, fillOpacity=0.5),
)

Out[12]:

Make this Notebook Trusted to load map: File -> Trust Notebook

Clustering¶

In [13]:

Copied!





clusterizer = KMeans(n_clusters=5, random_state=SEED)
clusterizer.fit(embeddings)
embeddings.index.name = "region_id"
embeddings["cluster"] = clusterizer.labels_
embeddings["cluster"]
clusterizer = KMeans(n_clusters=5, random_state=SEED)
clusterizer.fit(embeddings)
embeddings.index.name = "region_id"
embeddings["cluster"] = clusterizer.labels_
embeddings["cluster"]

/opt/hostedtoolcache/Python/3.10.13/x64/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)

Out[13]:

region_id
891e204183bffff    2
891e2055627ffff    1
891e204e547ffff    2
891e2051b6fffff    1
891e2040bcfffff    0
                  ..
891e2041d2fffff    4
891e2050bd3ffff    1
891e20406d7ffff    2
891e204626bffff    2
891e20452c3ffff    4
Name: cluster, Length: 3251, dtype: int32

In [14]:

Copied!

plot_numeric_data(base_h3_regions, "cluster", embeddings, tiles_style="CartoDB positron")
plot_numeric_data(base_h3_regions, "cluster", embeddings, tiles_style="CartoDB positron")

Out[14]:

Make this Notebook Trusted to load map: File -> Trust Notebook

Hex2Vec¶

PCA¶

In [15]:

Copied!





# do pca with three components and then cast to RGB
pca = PCA(n_components=3)

pca_embeddings = pca.fit_transform(hex2vec_embeddings)
# make the embeddings into a dataframe
pca_embeddings = pd.DataFrame(pca_embeddings, index=hex2vec_embeddings.index)

# convert to RGB
pca_embeddings = (
    (pca_embeddings - pca_embeddings.min()) / (pca_embeddings.max() - pca_embeddings.min()) * 255
).astype(int)

# make the rgb array into a string
pca_embeddings["rgb"] = pca_embeddings.apply(
    lambda row: f"rgb({row[0]}, {row[1]}, {row[2]})", axis=1
)


color_dict = dict(enumerate(base_h3_regions.index.map(pca_embeddings["rgb"].to_dict()).to_list()))
base_h3_regions.reset_index().reset_index().explore(
    column="index",
    tooltip="region_id",
    tiles="CartoDB positron",
    legend=False,
    cmap=lambda x: color_dict[x],
    style_kwds=dict(color="#444", opacity=0.0, fillOpacity=0.5),
)
# do pca with three components and then cast to RGB
pca = PCA(n_components=3)

pca_embeddings = pca.fit_transform(hex2vec_embeddings)
# make the embeddings into a dataframe
pca_embeddings = pd.DataFrame(pca_embeddings, index=hex2vec_embeddings.index)

# convert to RGB
pca_embeddings = (
    (pca_embeddings - pca_embeddings.min()) / (pca_embeddings.max() - pca_embeddings.min()) * 255
).astype(int)

# make the rgb array into a string
pca_embeddings["rgb"] = pca_embeddings.apply(
    lambda row: f"rgb({row[0]}, {row[1]}, {row[2]})", axis=1
)


color_dict = dict(enumerate(base_h3_regions.index.map(pca_embeddings["rgb"].to_dict()).to_list()))
base_h3_regions.reset_index().reset_index().explore(
    column="index",
    tooltip="region_id",
    tiles="CartoDB positron",
    legend=False,
    cmap=lambda x: color_dict[x],
    style_kwds=dict(color="#444", opacity=0.0, fillOpacity=0.5),
)

Out[15]:

Make this Notebook Trusted to load map: File -> Trust Notebook

Clustering¶

In [16]:

Copied!

clusterizer = KMeans(n_clusters=5, random_state=SEED)
clusterizer.fit(hex2vec_embeddings)

hex2vec_embeddings["cluster"] = clusterizer.labels_
hex2vec_embeddings["cluster"]
clusterizer = KMeans(n_clusters=5, random_state=SEED)
clusterizer.fit(hex2vec_embeddings)

hex2vec_embeddings["cluster"] = clusterizer.labels_
hex2vec_embeddings["cluster"]

/opt/hostedtoolcache/Python/3.10.13/x64/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)

Out[16]:

region_id
891e204183bffff    1
891e2055627ffff    2
891e204e547ffff    1
891e2051b6fffff    1
891e2041127ffff    4
                  ..
891e2050bd3ffff    2
891e20406d7ffff    0
891e204626bffff    1
891e20452c3ffff    4
891e204400bffff    0
Name: cluster, Length: 4319, dtype: int32

In [17]:

Copied!

plot_numeric_data(base_h3_regions, "cluster", hex2vec_embeddings, tiles_style="CartoDB positron")
plot_numeric_data(base_h3_regions, "cluster", hex2vec_embeddings, tiles_style="CartoDB positron")

Out[17]:

Make this Notebook Trusted to load map: File -> Trust Notebook