Hex2vec embedder
In [1]:
Copied!
from pytorch_lightning import seed_everything
from srai.embedders import Hex2VecEmbedder
from srai.joiners import IntersectionJoiner
from srai.loaders import OSMOnlineLoader
from srai.neighbourhoods import H3Neighbourhood
from srai.plotting import plot_numeric_data, plot_regions
from srai.regionalizers import H3Regionalizer, geocode_to_region_gdf
from pytorch_lightning import seed_everything
from srai.embedders import Hex2VecEmbedder
from srai.joiners import IntersectionJoiner
from srai.loaders import OSMOnlineLoader
from srai.neighbourhoods import H3Neighbourhood
from srai.plotting import plot_numeric_data, plot_regions
from srai.regionalizers import H3Regionalizer, geocode_to_region_gdf
In [2]:
Copied!
# One seed for the whole notebook: passed to seed_everything here and reused
# as KMeans' random_state in the clustering cell.
SEED = 71
seed_everything(SEED)
# One seed for the whole notebook: passed to seed_everything here and reused
# as KMeans' random_state in the clustering cell.
SEED = 71
seed_everything(SEED)
Seed set to 71
Out[2]:
71
Load data from OSM¶
First, use geocoding to get the area of interest.
In [3]:
Copied!
# Geocode the place name to get the study area, then plot it; the plot is the
# cell's last expression, so the map is rendered as the cell output.
area_gdf = geocode_to_region_gdf("Wrocław, Poland")
plot_regions(area_gdf, tiles_style="CartoDB positron")
# Geocode the place name to get the study area, then plot it; the plot is the
# cell's last expression, so the map is rendered as the cell output.
area_gdf = geocode_to_region_gdf("Wrocław, Poland")
plot_regions(area_gdf, tiles_style="CartoDB positron")
Out[3]:
Make this Notebook Trusted to load map: File -> Trust Notebook
Next, download the data for the selected region and the specified tags. We're using `OSMOnlineLoader` here, as it's faster for low numbers of tags. In a real-life scenario with more tags, you would likely want to use the `OSMPbfLoader`.
In [4]:
Copied!
# OSM tag filter passed to the loader: tag key -> wanted value, or a list of values.
tags = dict(
    leisure="park",
    landuse="forest",
    amenity=["bar", "restaurant", "cafe"],
    water="river",
    sport="soccer",
)

loader = OSMOnlineLoader()
features_gdf = loader.load(area_gdf, tags)

# Plot the area with a fully transparent colormap, then draw the downloaded
# features onto that same map via explore(m=...).
folium_map = plot_regions(
    area_gdf,
    colormap=["rgba(0,0,0,0)"],
    tiles_style="CartoDB positron",
)
features_gdf.explore(m=folium_map)
# OSM tag filter passed to the loader: tag key -> wanted value, or a list of values.
tags = dict(
    leisure="park",
    landuse="forest",
    amenity=["bar", "restaurant", "cafe"],
    water="river",
    sport="soccer",
)

loader = OSMOnlineLoader()
features_gdf = loader.load(area_gdf, tags)

# Plot the area with a fully transparent colormap, then draw the downloaded
# features onto that same map via explore(m=...).
folium_map = plot_regions(
    area_gdf,
    colormap=["rgba(0,0,0,0)"],
    tiles_style="CartoDB positron",
)
features_gdf.explore(m=folium_map)
Out[4]:
Make this Notebook Trusted to load map: File -> Trust Notebook
Prepare the data for embedding¶
After downloading the data, we need to prepare it for embedding. Namely, we need to regionalize the selected area and join the features with the regions.
In [5]:
Copied!
# Regionalize the area with H3 at resolution 9; regions_gdf is what the joiner,
# the neighbourhood and the embedder consume in the following cells.
regionalizer = H3Regionalizer(resolution=9)
regions_gdf = regionalizer.transform(area_gdf)
plot_regions(regions_gdf, tiles_style="CartoDB positron")
# Regionalize the area with H3 at resolution 9; regions_gdf is what the joiner,
# the neighbourhood and the embedder consume in the following cells.
regionalizer = H3Regionalizer(resolution=9)
regions_gdf = regionalizer.transform(area_gdf)
plot_regions(regions_gdf, tiles_style="CartoDB positron")
Out[5]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [6]:
Copied!
# Join regions with the downloaded features. As the output shows, the result
# carries only (region_id, feature_id) pairs and no data columns.
joiner = IntersectionJoiner()
joint_gdf = joiner.transform(regions_gdf, features_gdf)
joint_gdf
# Join regions with the downloaded features. As the output shows, the result
# carries only (region_id, feature_id) pairs and no data columns.
joiner = IntersectionJoiner()
joint_gdf = joiner.transform(regions_gdf, features_gdf)
joint_gdf
Out[6]:
region_id | feature_id |
---|---|
891e2040897ffff | node/280727473 |
891e2040d4bffff | node/300461026 |
node/300461036 | |
891e2040d5bffff | node/300461042 |
891e2040887ffff | node/300461045 |
... | |
way/1422748565 | |
891e2040d4bffff | way/1422748565 |
891e20454c3ffff | way/1422964443 |
891e20454c7ffff | way/1423396175 |
891e204055bffff | way/1426281734 |
4184 rows × 0 columns
Embedding¶
After preparing the data, we can proceed with generating embeddings for the regions.
In [7]:
Copied!
import warnings

neighbourhood = H3Neighbourhood(regions_gdf)

# Sizes handed to the embedder; the last one (10) matches the 10 embedding
# columns in the output. NOTE(review): presumably encoder layer sizes - confirm
# against the Hex2VecEmbedder documentation.
embedder = Hex2VecEmbedder([15, 10])

# Every warning raised while fitting is silenced to keep the cell output short.
# Remove the filter when debugging, since it hides real problems as well.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    embeddings = embedder.fit_transform(
        regions_gdf,
        features_gdf,
        joint_gdf,
        neighbourhood,
        # Short CPU-only training run so the example finishes quickly.
        trainer_kwargs={"max_epochs": 5, "accelerator": "cpu"},
        batch_size=100,
    )

# Kept outside the `with` block so the frame is the cell's displayed result.
embeddings
import warnings

neighbourhood = H3Neighbourhood(regions_gdf)

# Sizes handed to the embedder; the last one (10) matches the 10 embedding
# columns in the output. NOTE(review): presumably encoder layer sizes - confirm
# against the Hex2VecEmbedder documentation.
embedder = Hex2VecEmbedder([15, 10])

# Every warning raised while fitting is silenced to keep the cell output short.
# Remove the filter when debugging, since it hides real problems as well.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    embeddings = embedder.fit_transform(
        regions_gdf,
        features_gdf,
        joint_gdf,
        neighbourhood,
        # Short CPU-only training run so the example finishes quickly.
        trainer_kwargs={"max_epochs": 5, "accelerator": "cpu"},
        batch_size=100,
    )

# Kept outside the `with` block so the frame is the cell's displayed result.
embeddings
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
  | Name    | Type       | Params | Mode
-----------------------------------------------
0 | encoder | Sequential | 280    | train
-----------------------------------------------
280       Trainable params
0         Non-trainable params
280       Total params
0.001     Total estimated model params size (MB)
4         Modules in train mode
0         Modules in eval mode
`Trainer.fit` stopped: `max_epochs=5` reached.
Out[7]:
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
region_id | ||||||||||
891e2047303ffff | 0.341400 | -0.253870 | -0.016749 | 0.323208 | 0.238173 | -0.123129 | -0.279072 | -0.088379 | -0.194273 | 0.370400 |
891e20404c7ffff | 0.341400 | -0.253870 | -0.016749 | 0.323208 | 0.238173 | -0.123129 | -0.279072 | -0.088379 | -0.194273 | 0.370400 |
891e204212bffff | -0.329779 | 0.040839 | -0.070775 | -0.314209 | 0.056109 | 0.149004 | -0.064876 | 0.201306 | 0.015928 | -0.140280 |
891e204563bffff | -0.291778 | 0.046439 | -0.286547 | -0.423768 | 0.014743 | 0.219305 | -0.381652 | 0.303765 | -0.121125 | 0.008270 |
891e204754bffff | 0.341400 | -0.253870 | -0.016749 | 0.323208 | 0.238173 | -0.123129 | -0.279072 | -0.088379 | -0.194273 | 0.370400 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
891e204550fffff | -0.291778 | 0.046439 | -0.286547 | -0.423768 | 0.014743 | 0.219305 | -0.381652 | 0.303765 | -0.121125 | 0.008270 |
891e2040b13ffff | 0.173853 | 0.151073 | 0.402511 | 0.401253 | -0.121004 | -0.201290 | 0.662516 | -0.408048 | 0.422258 | -0.270984 |
891e2040da3ffff | 0.408749 | -0.309027 | 0.075073 | 0.203427 | 0.093649 | -0.139810 | 0.031921 | -0.251953 | -0.077973 | 0.095363 |
891e2051857ffff | -0.444953 | 0.212055 | -0.382394 | -0.606828 | -0.047674 | 0.245112 | -0.314173 | 0.380123 | -0.153552 | -0.145722 |
891e2040007ffff | 0.095493 | 0.123367 | 0.443478 | 0.595970 | -0.123755 | -0.044791 | 0.349595 | -0.360595 | 0.407458 | 0.128345 |
3168 rows × 10 columns
Visualizing the embeddings' similarity¶
In [8]:
Copied!
from sklearn.cluster import KMeans

# Fit on the embedding dimensions only. Dropping an already-present "cluster"
# column makes the cell safe to re-run: without it, a second run would feed the
# previous labels back into KMeans as an extra feature. On the first run the
# column does not exist yet and errors="ignore" turns the drop into a no-op.
embedding_features = embeddings.drop(columns="cluster", errors="ignore")

clusterizer = KMeans(n_clusters=5, random_state=SEED)
clusterizer.fit(embedding_features)

# Store the labels next to the embeddings; the plotting cell reads this column.
embeddings["cluster"] = clusterizer.labels_
embeddings
from sklearn.cluster import KMeans

# Fit on the embedding dimensions only. Dropping an already-present "cluster"
# column makes the cell safe to re-run: without it, a second run would feed the
# previous labels back into KMeans as an extra feature. On the first run the
# column does not exist yet and errors="ignore" turns the drop into a no-op.
embedding_features = embeddings.drop(columns="cluster", errors="ignore")

clusterizer = KMeans(n_clusters=5, random_state=SEED)
clusterizer.fit(embedding_features)

# Store the labels next to the embeddings; the plotting cell reads this column.
embeddings["cluster"] = clusterizer.labels_
embeddings
Out[8]:
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|
region_id | |||||||||||
891e2047303ffff | 0.341400 | -0.253870 | -0.016749 | 0.323208 | 0.238173 | -0.123129 | -0.279072 | -0.088379 | -0.194273 | 0.370400 | 0 |
891e20404c7ffff | 0.341400 | -0.253870 | -0.016749 | 0.323208 | 0.238173 | -0.123129 | -0.279072 | -0.088379 | -0.194273 | 0.370400 | 0 |
891e204212bffff | -0.329779 | 0.040839 | -0.070775 | -0.314209 | 0.056109 | 0.149004 | -0.064876 | 0.201306 | 0.015928 | -0.140280 | 2 |
891e204563bffff | -0.291778 | 0.046439 | -0.286547 | -0.423768 | 0.014743 | 0.219305 | -0.381652 | 0.303765 | -0.121125 | 0.008270 | 2 |
891e204754bffff | 0.341400 | -0.253870 | -0.016749 | 0.323208 | 0.238173 | -0.123129 | -0.279072 | -0.088379 | -0.194273 | 0.370400 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
891e204550fffff | -0.291778 | 0.046439 | -0.286547 | -0.423768 | 0.014743 | 0.219305 | -0.381652 | 0.303765 | -0.121125 | 0.008270 | 2 |
891e2040b13ffff | 0.173853 | 0.151073 | 0.402511 | 0.401253 | -0.121004 | -0.201290 | 0.662516 | -0.408048 | 0.422258 | -0.270984 | 1 |
891e2040da3ffff | 0.408749 | -0.309027 | 0.075073 | 0.203427 | 0.093649 | -0.139810 | 0.031921 | -0.251953 | -0.077973 | 0.095363 | 0 |
891e2051857ffff | -0.444953 | 0.212055 | -0.382394 | -0.606828 | -0.047674 | 0.245112 | -0.314173 | 0.380123 | -0.153552 | -0.145722 | 2 |
891e2040007ffff | 0.095493 | 0.123367 | 0.443478 | 0.595970 | -0.123755 | -0.044791 | 0.349595 | -0.360595 | 0.407458 | 0.128345 | 1 |
3168 rows × 11 columns
In [9]:
Copied!
# Plot the "cluster" column of `embeddings` (written by the KMeans cell) over the regions.
plot_numeric_data(regions_gdf, "cluster", embeddings)
# Plot the "cluster" column of `embeddings` (written by the KMeans cell) over the regions.
plot_numeric_data(regions_gdf, "cluster", embeddings)
Out[9]:
Make this Notebook Trusted to load map: File -> Trust Notebook