Hex2vec embedder
In [1]:
Copied!
import warnings

from pytorch_lightning import seed_everything
from sklearn.cluster import KMeans
from srai.embedders import Hex2VecEmbedder
from srai.joiners import IntersectionJoiner
from srai.loaders import OSMOnlineLoader
from srai.neighbourhoods import H3Neighbourhood
from srai.plotting import plot_numeric_data, plot_regions
from srai.regionalizers import H3Regionalizer, geocode_to_region_gdf
In [2]:
Copied!
# Single seed shared by the whole notebook so that a fresh
# "Restart & Run All" reproduces the same results.
SEED = 71

# Left as the last expression on purpose: the call's return value is what the
# cell displays.
seed_everything(SEED)
Seed set to 71
Out[2]:
71
Load data from OSM
First, use geocoding to get the area.
In [3]:
Copied!
# Resolve the place name to a region GeoDataFrame covering the study area.
area_gdf = geocode_to_region_gdf("Wrocław, Poland")

# Show the resulting area on an interactive map as a quick sanity check.
plot_regions(area_gdf, tiles_style="CartoDB positron")
Out[3]:
Make this Notebook Trusted to load map: File -> Trust Notebook
Next, download the data for the selected region and the specified tags. We're using `OSMOnlineLoader` here, as it's faster for a small number of tags. In a real-life scenario with more tags, you would likely want to use `OSMPbfLoader` instead.
In [4]:
Copied!
# OSM tag filter: key -> value, or key -> list of values.
# Every (key, value) pair is downloaded separately, so this filter produces
# seven downloads in total.
tags = {
    "leisure": "park",
    "landuse": "forest",
    "amenity": ["bar", "restaurant", "cafe"],
    "water": "river",
    "sport": "soccer",
}

# Download the matching OSM features for the study area.
loader = OSMOnlineLoader()
features_gdf = loader.load(area_gdf, tags)

# Draw the area with a fully transparent fill, then lay the downloaded
# features on top of that same map.
folium_map = plot_regions(
    area_gdf,
    colormap=["rgba(0,0,0,0)"],
    tiles_style="CartoDB positron",
)
features_gdf.explore(m=folium_map)
0%| | 0/7 [00:00<?, ?it/s]
Downloading leisure: park : 0%| | 0/7 [00:00<?, ?it/s]
Downloading leisure: park : 14%|████████████████████▋ | 1/7 [00:10<01:05, 10.89s/it]
Downloading landuse: forest : 14%|████████████████████▋ | 1/7 [00:10<01:05, 10.89s/it]
Downloading landuse: forest : 29%|█████████████████████████████████████████▍ | 2/7 [00:15<00:36, 7.39s/it]
Downloading amenity: bar : 29%|█████████████████████████████████████████▍ | 2/7 [00:15<00:36, 7.39s/it]
Downloading amenity: bar : 43%|██████████████████████████████████████████████████████████████▏ | 3/7 [00:17<00:18, 4.58s/it]
Downloading amenity: restaurant: 43%|██████████████████████████████████████████████████████████████▏ | 3/7 [00:17<00:18, 4.58s/it]
Downloading amenity: restaurant: 57%|██████████████████████████████████████████████████████████████████████████████████▊ | 4/7 [00:38<00:33, 11.07s/it]
Downloading amenity: cafe : 57%|██████████████████████████████████████████████████████████████████████████████████▊ | 4/7 [00:38<00:33, 11.07s/it]
Downloading amenity: cafe : 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 5/7 [00:39<00:15, 7.64s/it]
Downloading water: river : 71%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 5/7 [00:39<00:15, 7.64s/it]
Downloading water: river : 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 6/7 [00:42<00:05, 5.92s/it]
Downloading sport: soccer : 86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 6/7 [00:42<00:05, 5.92s/it]
Downloading sport: soccer : 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:44<00:00, 4.63s/it]
Downloading sport: soccer : 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:44<00:00, 6.32s/it]
Out[4]:
Make this Notebook Trusted to load map: File -> Trust Notebook
Prepare the data for embedding
After downloading the data, we need to prepare it for embedding. Namely, we need to regionalize the selected area and join the features with the regions.
In [5]:
Copied!
# Split the study area into H3 cells at resolution 9; each cell becomes one
# region to embed.
regionalizer = H3Regionalizer(resolution=9)
regions_gdf = regionalizer.transform(area_gdf)

# Visual check of the resulting grid.
plot_regions(regions_gdf, tiles_style="CartoDB positron")
Out[5]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [6]:
Copied!
# Match every region with the features it intersects. The result is indexed by
# (region_id, feature_id) pairs and carries no data columns of its own.
joiner = IntersectionJoiner()
joint_gdf = joiner.transform(regions_gdf, features_gdf)

# Display the region/feature pairs.
joint_gdf
Out[6]:
region_id | feature_id |
---|---|
891e20460dbffff | way/846151177 |
relation/5479020 | |
891e2040b73ffff | way/301083222 |
891e20429c3ffff | way/564288360 |
891e20475bbffff | way/360949794 |
... | ... |
891e20550cbffff | relation/2519767 |
891e2051b8fffff | relation/2950476 |
891e20456a7ffff | relation/7261537 |
way/454851047 | |
way/454851046 |
4020 rows × 0 columns
Embedding
After preparing the data, we can proceed with generating embeddings for the regions.
In [7]:
Copied!
# Neighbourhood structure over the H3 regions, passed to the embedder below.
neighbourhood = H3Neighbourhood(regions_gdf)

# NOTE(review): [15, 10] are presumably the encoder layer sizes - the resulting
# frame has 10 columns, matching the last value. Confirm against the
# Hex2VecEmbedder documentation before changing.
embedder = Hex2VecEmbedder([15, 10])

# Warnings raised during training are silenced only inside this block, so the
# rest of the notebook keeps the default warning behaviour.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    embeddings = embedder.fit_transform(
        regions_gdf,
        features_gdf,
        joint_gdf,
        neighbourhood,
        # Short CPU-only training run keeps the example quick to execute.
        trainer_kwargs={"max_epochs": 5, "accelerator": "cpu"},
        batch_size=100,
    )

# Last expression sits outside the `with` block so the notebook displays it:
# one row per region, one column per embedding dimension.
embeddings
0%| | 0/3168 [00:00<?, ?it/s]
50%|███████████████████████████████████████████████████████████████████████████████████▉ | 1573/3168 [00:00<00:00, 15723.75it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3168/3168 [00:00<00:00, 18556.53it/s]
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
| Name | Type | Params --------------------------------------- 0 | encoder | Sequential | 280 --------------------------------------- 280 Trainable params 0 Non-trainable params 280 Total params 0.001 Total estimated model params size (MB)
`Trainer.fit` stopped: `max_epochs=5` reached.
Out[7]:
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
---|---|---|---|---|---|---|---|---|---|---|
region_id | ||||||||||
891e20460dbffff | -0.439610 | 0.162985 | -0.399176 | -0.581462 | -0.019725 | 0.249176 | -0.328244 | 0.384069 | -0.112147 | -0.092024 |
891e2040b73ffff | 0.060024 | 0.186553 | 0.451140 | 0.584353 | -0.126542 | -0.032704 | 0.348927 | -0.382170 | 0.369541 | 0.092625 |
891e20429c3ffff | 0.060024 | 0.186553 | 0.451140 | 0.584353 | -0.126542 | -0.032704 | 0.348927 | -0.382170 | 0.369541 | 0.092625 |
891e20475bbffff | 0.335875 | -0.283459 | 0.218984 | 0.383375 | 0.204464 | -0.168726 | 0.085973 | -0.121118 | -0.021420 | 0.161742 |
891e2047343ffff | -0.630838 | 0.740119 | 0.472413 | 0.378317 | -0.440499 | 0.208015 | 0.446423 | -0.197289 | 0.737608 | -0.112968 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
891e20550cbffff | -0.482761 | 0.024029 | -0.363131 | -0.825913 | -0.112309 | 0.299447 | -0.169782 | 0.510711 | -0.168912 | -0.346754 |
891e2040bafffff | 0.334912 | -0.254242 | -0.015363 | 0.293516 | 0.227590 | -0.119095 | -0.264572 | -0.102402 | -0.214497 | 0.349670 |
891e204188bffff | 0.334912 | -0.254242 | -0.015363 | 0.293516 | 0.227590 | -0.119095 | -0.264572 | -0.102402 | -0.214497 | 0.349670 |
891e2051b8fffff | -0.319579 | 0.008740 | -0.308726 | -0.444802 | 0.017347 | 0.255475 | -0.413749 | 0.370286 | -0.084450 | 0.036234 |
891e20456a7ffff | -0.482761 | 0.024029 | -0.363131 | -0.825913 | -0.112309 | 0.299447 | -0.169782 | 0.510711 | -0.168912 | -0.346754 |
3168 rows × 10 columns
Visualizing the embeddings' similarity
In [8]:
Copied!
# Cluster on the embedding dimensions only. Dropping a "cluster" column left by
# an earlier run keeps this cell idempotent: without it, re-running the cell
# would feed the previous labels back into KMeans as an extra feature.
embedding_vectors = embeddings.drop(columns="cluster", errors="ignore")

# Reuse the notebook-wide seed so the cluster assignment is reproducible.
clusterizer = KMeans(n_clusters=5, random_state=SEED)
clusterizer.fit(embedding_vectors)

# Store the labels on `embeddings` itself, because the plotting cell reads the
# "cluster" column from this frame.
embeddings["cluster"] = clusterizer.labels_
embeddings
Out[8]:
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | cluster | |
---|---|---|---|---|---|---|---|---|---|---|---|
region_id | |||||||||||
891e20460dbffff | -0.439610 | 0.162985 | -0.399176 | -0.581462 | -0.019725 | 0.249176 | -0.328244 | 0.384069 | -0.112147 | -0.092024 | 4 |
891e2040b73ffff | 0.060024 | 0.186553 | 0.451140 | 0.584353 | -0.126542 | -0.032704 | 0.348927 | -0.382170 | 0.369541 | 0.092625 | 2 |
891e20429c3ffff | 0.060024 | 0.186553 | 0.451140 | 0.584353 | -0.126542 | -0.032704 | 0.348927 | -0.382170 | 0.369541 | 0.092625 | 2 |
891e20475bbffff | 0.335875 | -0.283459 | 0.218984 | 0.383375 | 0.204464 | -0.168726 | 0.085973 | -0.121118 | -0.021420 | 0.161742 | 1 |
891e2047343ffff | -0.630838 | 0.740119 | 0.472413 | 0.378317 | -0.440499 | 0.208015 | 0.446423 | -0.197289 | 0.737608 | -0.112968 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
891e20550cbffff | -0.482761 | 0.024029 | -0.363131 | -0.825913 | -0.112309 | 0.299447 | -0.169782 | 0.510711 | -0.168912 | -0.346754 | 0 |
891e2040bafffff | 0.334912 | -0.254242 | -0.015363 | 0.293516 | 0.227590 | -0.119095 | -0.264572 | -0.102402 | -0.214497 | 0.349670 | 1 |
891e204188bffff | 0.334912 | -0.254242 | -0.015363 | 0.293516 | 0.227590 | -0.119095 | -0.264572 | -0.102402 | -0.214497 | 0.349670 | 1 |
891e2051b8fffff | -0.319579 | 0.008740 | -0.308726 | -0.444802 | 0.017347 | 0.255475 | -0.413749 | 0.370286 | -0.084450 | 0.036234 | 4 |
891e20456a7ffff | -0.482761 | 0.024029 | -0.363131 | -0.825913 | -0.112309 | 0.299447 | -0.169782 | 0.510711 | -0.168912 | -0.346754 | 0 |
3168 rows × 11 columns
In [9]:
Copied!
# Map the regions using the "cluster" column of `embeddings` as the plotted
# value, so regions with similar embeddings can be compared visually.
plot_numeric_data(regions_gdf, "cluster", embeddings)
Out[9]:
Make this Notebook Trusted to load map: File -> Trust Notebook