Geovex embedder
In [1]:
Copied!
import warnings
import pandas as pd
import torch
from pytorch_lightning import seed_everything
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from srai.embedders import GeoVexEmbedder, Hex2VecEmbedder
from srai.h3 import ring_buffer_h3_regions_gdf
from srai.joiners import IntersectionJoiner
from srai.loaders import OSMPbfLoader
from srai.loaders.osm_loaders.filters import GEOFABRIK_LAYERS
from srai.neighbourhoods import H3Neighbourhood
from srai.plotting import plot_numeric_data, plot_regions
from srai.regionalizers import H3Regionalizer, geocode_to_region_gdf
import warnings
import pandas as pd
import torch
from pytorch_lightning import seed_everything
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from srai.embedders import GeoVexEmbedder, Hex2VecEmbedder
from srai.h3 import ring_buffer_h3_regions_gdf
from srai.joiners import IntersectionJoiner
from srai.loaders import OSMPbfLoader
from srai.loaders.osm_loaders.filters import GEOFABRIK_LAYERS
from srai.neighbourhoods import H3Neighbourhood
from srai.plotting import plot_numeric_data, plot_regions
from srai.regionalizers import H3Regionalizer, geocode_to_region_gdf
In [2]:
Copied!
SEED = 71
seed_everything(SEED)
SEED = 71
seed_everything(SEED)
Seed set to 71
Out[2]:
71
Load data from OSM¶
First use geocoding to get the area
In [3]:
Copied!
area_gdf = geocode_to_region_gdf("Wrocław, Poland")
plot_regions(area_gdf, tiles_style="CartoDB positron")
area_gdf = geocode_to_region_gdf("Wrocław, Poland")
plot_regions(area_gdf, tiles_style="CartoDB positron")
Out[3]:
Make this Notebook Trusted to load map: File -> Trust Notebook
Buffer the Area¶
The GeoVex embedder requires a buffer around the area of interest, as the hexagon needs to have its radius k neighbors in the dataset as well. The buffer is defined in hexagon radius units, so a buffer of 1 means that the hexagon will have its 1-neighborhood in the dataset as well.
In [4]:
Copied!
area_gdf.head()
area_gdf.head()
Out[4]:
geometry | |
---|---|
region_id | |
Wrocław, Lower Silesian Voivodeship, Poland | POLYGON ((16.80734 51.13895, 16.80859 51.13887... |
In [5]:
Copied!
resolution = 9
k_ring_buffer_radius = 4
regionalizer = H3Regionalizer(resolution=resolution)
base_h3_regions = regionalizer.transform(area_gdf)
buffered_h3_regions = ring_buffer_h3_regions_gdf(base_h3_regions, distance=k_ring_buffer_radius)
buffered_h3_geometry = buffered_h3_regions.unary_union
print("Base regions:", len(base_h3_regions))
print("Buffered regions:", len(buffered_h3_regions))
resolution = 9
k_ring_buffer_radius = 4
regionalizer = H3Regionalizer(resolution=resolution)
base_h3_regions = regionalizer.transform(area_gdf)
buffered_h3_regions = ring_buffer_h3_regions_gdf(base_h3_regions, distance=k_ring_buffer_radius)
buffered_h3_geometry = buffered_h3_regions.unary_union
print("Base regions:", len(base_h3_regions))
print("Buffered regions:", len(buffered_h3_regions))
Base regions: 3168 Buffered regions: 4319
/tmp/ipykernel_3818/2274359378.py:8: DeprecationWarning: The 'unary_union' attribute is deprecated, use the 'union_all()' method instead. buffered_h3_geometry = buffered_h3_regions.unary_union
Download the Data¶
Next, download the data for the selected region and the specified tags.
In [6]:
Copied!
tags = GEOFABRIK_LAYERS
loader = OSMPbfLoader()
features_gdf = loader.load(buffered_h3_geometry, tags)
tags = GEOFABRIK_LAYERS
loader = OSMPbfLoader()
features_gdf = loader.load(buffered_h3_geometry, tags)
/opt/hostedtoolcache/Python/3.10.15/x64/lib/python3.10/site-packages/quackosm/osm_extracts/__init__.py:602: GeometryNotCoveredWarning: Skipping extract because of low IoU value (geofabrik_europe_poland_dolnoslaskie, 0.000151). warnings.warn(
Finished operation in 0:00:20
Prepare the data for embedding¶
After downloading the data, we need to prepare it for embedding. In the previous step we have regionalized the selected area and buffered it, now we have to join the features with prepared regions.
In [7]:
Copied!
plot_regions(buffered_h3_regions, tiles_style="CartoDB positron")
plot_regions(buffered_h3_regions, tiles_style="CartoDB positron")
Out[7]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [8]:
Copied!
joiner = IntersectionJoiner()
joint_gdf = joiner.transform(buffered_h3_regions, features_gdf)
joint_gdf
joiner = IntersectionJoiner()
joint_gdf = joiner.transform(buffered_h3_regions, features_gdf)
joint_gdf
Out[8]:
region_id | feature_id |
---|---|
891e2040003ffff | way/357597935 |
way/149745615 | |
way/308601123 | |
way/854890880 | |
way/357597934 | |
... | ... |
891e205aba7ffff | way/587585713 |
way/245190176 | |
way/407525323 | |
way/245190191 | |
way/794561919 |
473222 rows × 0 columns
GeoVex-Embedding¶
After preparing the data we can proceed with generating embeddings for the regions.
In [9]:
Copied!
neighbourhood = H3Neighbourhood(buffered_h3_regions)
embedder = GeoVexEmbedder(
target_features=GEOFABRIK_LAYERS,
batch_size=10,
neighbourhood_radius=k_ring_buffer_radius,
convolutional_layers=2,
embedding_size=50,
)
neighbourhood = H3Neighbourhood(buffered_h3_regions)
embedder = GeoVexEmbedder(
target_features=GEOFABRIK_LAYERS,
batch_size=10,
neighbourhood_radius=k_ring_buffer_radius,
convolutional_layers=2,
embedding_size=50,
)
In [10]:
Copied!
with warnings.catch_warnings():
warnings.simplefilter("ignore")
embeddings = embedder.fit_transform(
regions_gdf=buffered_h3_regions,
features_gdf=features_gdf,
joint_gdf=joint_gdf,
neighbourhood=neighbourhood,
trainer_kwargs={
# "max_epochs": 20, # uncomment for a longer training
"max_epochs": 5,
"accelerator": (
"cpu" if torch.backends.mps.is_available() else "auto"
), # GeoVexEmbedder does not support MPS
},
learning_rate=0.001,
)
embeddings.head()
with warnings.catch_warnings():
warnings.simplefilter("ignore")
embeddings = embedder.fit_transform(
regions_gdf=buffered_h3_regions,
features_gdf=features_gdf,
joint_gdf=joint_gdf,
neighbourhood=neighbourhood,
trainer_kwargs={
# "max_epochs": 20, # uncomment for a longer training
"max_epochs": 5,
"accelerator": (
"cpu" if torch.backends.mps.is_available() else "auto"
), # GeoVexEmbedder does not support MPS
},
learning_rate=0.001,
)
embeddings.head()
0%| | 0/4319 [00:00<?, ?it/s]
11%|█ | 462/4319 [00:00<00:00, 4614.44it/s]
24%|██▍ | 1048/4319 [00:00<00:00, 5331.15it/s]
37%|███▋ | 1582/4319 [00:00<00:00, 5056.38it/s]
48%|████▊ | 2090/4319 [00:00<00:00, 4974.72it/s]
64%|██████▍ | 2760/4319 [00:00<00:00, 5573.05it/s]
77%|███████▋ | 3320/4319 [00:00<00:00, 5269.15it/s]
89%|████████▉ | 3852/4319 [00:00<00:00, 3048.58it/s]
100%|██████████| 4319/4319 [00:01<00:00, 4199.20it/s]
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
| Name | Type | Params | Mode ----------------------------------------------- 0 | encoder | Sequential | 2.4 M | train 1 | decoder | Sequential | 1.8 M | train 2 | _loss | GeoVeXLoss | 0 | train ----------------------------------------------- 4.2 M Trainable params 0 Non-trainable params 4.2 M Total params 16.673 Total estimated model params size (MB) 26 Modules in train mode 0 Modules in eval mode
`Trainer.fit` stopped: `max_epochs=5` reached.
Warning: Some regions were not able to be encoded, as they don't have r=4 neighbors.
Out[10]:
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
region_id | |||||||||||||||||||||
891e2040003ffff | -4.919757 | 0.977214 | 15.540886 | 7.267544 | 0.375246 | -13.808127 | 9.966508 | 22.594244 | -7.383607 | -8.722119 | ... | -5.264326 | -6.032025 | -12.531649 | 33.042336 | -18.033167 | -1.549666 | 21.995079 | 4.041358 | 0.362814 | -2.429089 |
891e2040007ffff | 6.034072 | 1.500228 | 18.791761 | 2.711638 | -0.661604 | -14.759071 | 16.128145 | 28.910061 | -10.174938 | -12.496203 | ... | -6.727332 | -8.161683 | -15.778372 | 34.731655 | -22.907764 | -5.491254 | 27.430378 | 4.632978 | 3.774979 | 2.604064 |
891e204000bffff | -1.715762 | 1.870545 | 15.143153 | 7.560801 | 3.765955 | -12.740223 | 11.004804 | 22.616617 | -13.450274 | -5.554009 | ... | -4.166763 | -6.655728 | -12.826140 | 35.138317 | -18.381784 | -0.037034 | 21.692657 | 3.132348 | -6.410631 | -2.464615 |
891e204000fffff | -1.311792 | 2.971017 | 14.009445 | 6.109387 | -0.378318 | -11.469155 | 12.669106 | 23.884596 | -9.711639 | -7.825118 | ... | -2.429382 | -6.121771 | -10.380754 | 32.715111 | -18.698830 | -0.590125 | 21.607683 | 1.371498 | -4.446707 | -3.610208 |
891e2040013ffff | -5.932198 | -0.416908 | 15.708393 | 1.321368 | 3.426344 | -17.067482 | 10.307467 | 21.774134 | -7.855052 | -8.140857 | ... | -3.541734 | -9.155781 | -12.186864 | 34.199730 | -19.874952 | -1.979245 | 23.487093 | 5.526461 | -3.297323 | 2.389273 |
5 rows × 50 columns
Hex2Vec Embedding¶
In [11]:
Copied!
neighbourhood = H3Neighbourhood(buffered_h3_regions)
hex2vec_embedder = Hex2VecEmbedder(
encoder_sizes=[300, 150, 50],
)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
hex2vec_embeddings = hex2vec_embedder.fit_transform(
regions_gdf=buffered_h3_regions,
features_gdf=features_gdf,
joint_gdf=joint_gdf,
neighbourhood=neighbourhood,
negative_sample_k_distance=2,
batch_size=64,
learning_rate=0.001,
trainer_kwargs={
# "max_epochs": 50, # uncomment for a longer training
"max_epochs": 5,
"accelerator": "auto",
},
)
hex2vec_embeddings.head()
neighbourhood = H3Neighbourhood(buffered_h3_regions)
hex2vec_embedder = Hex2VecEmbedder(
encoder_sizes=[300, 150, 50],
)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
hex2vec_embeddings = hex2vec_embedder.fit_transform(
regions_gdf=buffered_h3_regions,
features_gdf=features_gdf,
joint_gdf=joint_gdf,
neighbourhood=neighbourhood,
negative_sample_k_distance=2,
batch_size=64,
learning_rate=0.001,
trainer_kwargs={
# "max_epochs": 50, # uncomment for a longer training
"max_epochs": 5,
"accelerator": "auto",
},
)
hex2vec_embeddings.head()
0%| | 0/4319 [00:00<?, ?it/s]
76%|███████▌ | 3278/4319 [00:00<00:00, 32776.05it/s]
100%|██████████| 4319/4319 [00:00<00:00, 32452.01it/s]
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
| Name | Type | Params | Mode ----------------------------------------------- 0 | encoder | Sequential | 136 K | train ----------------------------------------------- 136 K Trainable params 0 Non-trainable params 136 K Total params 0.544 Total estimated model params size (MB) 6 Modules in train mode 0 Modules in eval mode
`Trainer.fit` stopped: `max_epochs=5` reached.
Out[11]:
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
region_id | |||||||||||||||||||||
891e2040003ffff | -0.110676 | 0.089480 | 0.028877 | 0.120685 | -0.288726 | 0.453677 | -0.114213 | 0.002147 | -0.079172 | 0.012831 | ... | 0.079861 | 0.001937 | -0.228539 | 0.100645 | -0.224395 | 0.161915 | -0.070230 | -0.338976 | -0.387509 | 0.124790 |
891e2040007ffff | -0.063885 | -0.017521 | 0.097536 | 0.096900 | -0.204428 | 0.161754 | -0.094631 | 0.040657 | -0.238622 | 0.424944 | ... | 0.064661 | -0.178729 | -0.145094 | -0.328413 | -0.247036 | 0.178632 | 0.327338 | -0.191395 | 0.040639 | 0.128176 |
891e204000bffff | -0.114490 | 0.073995 | 0.002676 | 0.132338 | -0.261982 | 0.464466 | -0.115525 | -0.025258 | -0.076678 | 0.000670 | ... | 0.052283 | 0.020959 | -0.242463 | 0.062910 | -0.216534 | 0.162014 | -0.043497 | -0.290895 | -0.371664 | 0.099495 |
891e204000fffff | -0.100524 | 0.071662 | 0.039412 | 0.058283 | -0.305607 | 0.384675 | -0.081634 | -0.020329 | -0.069072 | -0.044051 | ... | 0.084282 | -0.032614 | -0.145873 | 0.084430 | -0.179812 | 0.230284 | -0.050195 | -0.400182 | -0.398195 | 0.142824 |
891e2040013ffff | 0.038788 | -0.087397 | -0.072665 | -0.216049 | -0.269323 | -0.021523 | 0.081442 | -0.178904 | -0.196169 | 0.347602 | ... | -0.134494 | -0.382217 | -0.303026 | -0.302419 | -0.203640 | 0.223172 | 0.390315 | -0.166923 | -0.166949 | 0.127539 |
5 rows × 50 columns
Comparing the Embeddings¶
GeoVex Embedding¶
PCA¶
In [12]:
Copied!
# do pca with three components and then cast to RGB
pca = PCA(n_components=3)
pca_embeddings = pca.fit_transform(embeddings)
# make the embeddings into a dataframe
pca_embeddings = pd.DataFrame(pca_embeddings, index=embeddings.index)
# convert to RGB
pca_embeddings = (
(pca_embeddings - pca_embeddings.min()) / (pca_embeddings.max() - pca_embeddings.min()) * 255
).astype(int)
# make the rgb array into a string
pca_embeddings["rgb"] = pca_embeddings.apply(
lambda row: f"rgb({row[0]}, {row[1]}, {row[2]})", axis=1
)
color_dict = dict(enumerate(base_h3_regions.index.map(pca_embeddings["rgb"].to_dict()).to_list()))
base_h3_regions.reset_index().reset_index().explore(
column="index",
tooltip="region_id",
tiles="CartoDB positron",
legend=False,
cmap=lambda x: color_dict[x],
style_kwds=dict(color="#444", opacity=0.0, fillOpacity=0.5),
)
# do pca with three components and then cast to RGB
pca = PCA(n_components=3)
pca_embeddings = pca.fit_transform(embeddings)
# make the embeddings into a dataframe
pca_embeddings = pd.DataFrame(pca_embeddings, index=embeddings.index)
# convert to RGB
pca_embeddings = (
(pca_embeddings - pca_embeddings.min()) / (pca_embeddings.max() - pca_embeddings.min()) * 255
).astype(int)
# make the rgb array into a string
pca_embeddings["rgb"] = pca_embeddings.apply(
lambda row: f"rgb({row[0]}, {row[1]}, {row[2]})", axis=1
)
color_dict = dict(enumerate(base_h3_regions.index.map(pca_embeddings["rgb"].to_dict()).to_list()))
base_h3_regions.reset_index().reset_index().explore(
column="index",
tooltip="region_id",
tiles="CartoDB positron",
legend=False,
cmap=lambda x: color_dict[x],
style_kwds=dict(color="#444", opacity=0.0, fillOpacity=0.5),
)
Out[12]:
Make this Notebook Trusted to load map: File -> Trust Notebook
Clustering¶
In [13]:
Copied!
clusterizer = KMeans(n_clusters=5, random_state=SEED)
clusterizer.fit(embeddings)
embeddings.index.name = "region_id"
embeddings["cluster"] = clusterizer.labels_
embeddings["cluster"]
clusterizer = KMeans(n_clusters=5, random_state=SEED)
clusterizer.fit(embeddings)
embeddings.index.name = "region_id"
embeddings["cluster"] = clusterizer.labels_
embeddings["cluster"]
Out[13]:
region_id 891e2040003ffff 2 891e2040007ffff 0 891e204000bffff 2 891e204000fffff 2 891e2040013ffff 4 .. 891e205a937ffff 2 891e205a93bffff 3 891e205a967ffff 3 891e205a9a7ffff 2 891e205a9afffff 2 Name: cluster, Length: 3251, dtype: int32
In [14]:
Copied!
plot_numeric_data(base_h3_regions, "cluster", embeddings, tiles_style="CartoDB positron")
plot_numeric_data(base_h3_regions, "cluster", embeddings, tiles_style="CartoDB positron")
Out[14]:
Make this Notebook Trusted to load map: File -> Trust Notebook
Hex2Vec¶
PCA¶
In [15]:
Copied!
# do pca with three components and then cast to RGB
pca = PCA(n_components=3)
pca_embeddings = pca.fit_transform(hex2vec_embeddings)
# make the embeddings into a dataframe
pca_embeddings = pd.DataFrame(pca_embeddings, index=hex2vec_embeddings.index)
# convert to RGB
pca_embeddings = (
(pca_embeddings - pca_embeddings.min()) / (pca_embeddings.max() - pca_embeddings.min()) * 255
).astype(int)
# make the rgb array into a string
pca_embeddings["rgb"] = pca_embeddings.apply(
lambda row: f"rgb({row[0]}, {row[1]}, {row[2]})", axis=1
)
color_dict = dict(enumerate(base_h3_regions.index.map(pca_embeddings["rgb"].to_dict()).to_list()))
base_h3_regions.reset_index().reset_index().explore(
column="index",
tooltip="region_id",
tiles="CartoDB positron",
legend=False,
cmap=lambda x: color_dict[x],
style_kwds=dict(color="#444", opacity=0.0, fillOpacity=0.5),
)
# do pca with three components and then cast to RGB
pca = PCA(n_components=3)
pca_embeddings = pca.fit_transform(hex2vec_embeddings)
# make the embeddings into a dataframe
pca_embeddings = pd.DataFrame(pca_embeddings, index=hex2vec_embeddings.index)
# convert to RGB
pca_embeddings = (
(pca_embeddings - pca_embeddings.min()) / (pca_embeddings.max() - pca_embeddings.min()) * 255
).astype(int)
# make the rgb array into a string
pca_embeddings["rgb"] = pca_embeddings.apply(
lambda row: f"rgb({row[0]}, {row[1]}, {row[2]})", axis=1
)
color_dict = dict(enumerate(base_h3_regions.index.map(pca_embeddings["rgb"].to_dict()).to_list()))
base_h3_regions.reset_index().reset_index().explore(
column="index",
tooltip="region_id",
tiles="CartoDB positron",
legend=False,
cmap=lambda x: color_dict[x],
style_kwds=dict(color="#444", opacity=0.0, fillOpacity=0.5),
)
Out[15]:
Make this Notebook Trusted to load map: File -> Trust Notebook
Clustering¶
In [16]:
Copied!
clusterizer = KMeans(n_clusters=5, random_state=SEED)
clusterizer.fit(hex2vec_embeddings)
hex2vec_embeddings["cluster"] = clusterizer.labels_
hex2vec_embeddings["cluster"]
clusterizer = KMeans(n_clusters=5, random_state=SEED)
clusterizer.fit(hex2vec_embeddings)
hex2vec_embeddings["cluster"] = clusterizer.labels_
hex2vec_embeddings["cluster"]
Out[16]:
region_id 891e2040003ffff 2 891e2040007ffff 2 891e204000bffff 2 891e204000fffff 2 891e2040013ffff 2 .. 891e205aba3ffff 1 891e205aba7ffff 3 891e205abafffff 1 891e205abb3ffff 1 891e205abb7ffff 1 Name: cluster, Length: 4319, dtype: int32
In [17]:
Copied!
plot_numeric_data(base_h3_regions, "cluster", hex2vec_embeddings, tiles_style="CartoDB positron")
plot_numeric_data(base_h3_regions, "cluster", hex2vec_embeddings, tiles_style="CartoDB positron")
Out[17]:
Make this Notebook Trusted to load map: File -> Trust Notebook