GeoVex embedder

In [1]:

Copied!





import warnings

import pandas as pd
import torch
from pytorch_lightning import seed_everything
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from srai.embedders import GeoVexEmbedder, Hex2VecEmbedder
from srai.h3 import ring_buffer_h3_regions_gdf
from srai.joiners import IntersectionJoiner
from srai.loaders import OSMPbfLoader
from srai.loaders.osm_loaders.filters import GEOFABRIK_LAYERS
from srai.neighbourhoods import H3Neighbourhood
from srai.plotting import plot_numeric_data, plot_regions
from srai.regionalizers import H3Regionalizer, geocode_to_region_gdf
import warnings

import pandas as pd
import torch
from pytorch_lightning import seed_everything
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from srai.embedders import GeoVexEmbedder, Hex2VecEmbedder
from srai.h3 import ring_buffer_h3_regions_gdf
from srai.joiners import IntersectionJoiner
from srai.loaders import OSMPbfLoader
from srai.loaders.osm_loaders.filters import GEOFABRIK_LAYERS
from srai.neighbourhoods import H3Neighbourhood
from srai.plotting import plot_numeric_data, plot_regions
from srai.regionalizers import H3Regionalizer, geocode_to_region_gdf

In [2]:

Copied!

# Single source of truth for randomness: passed to `seed_everything` here and
# reused as `random_state` by the KMeans clustering cells further down.
SEED = 71
seed_everything(SEED)

Seed set to 71

Out[2]:

Load data from OSM¶

First use geocoding to get the area

In [3]:

Copied!

# Geocode the city name into a region GeoDataFrame and preview it on a map.
area_gdf = geocode_to_region_gdf("Wrocław, Poland")
plot_regions(area_gdf, tiles_style="CartoDB positron")

Out[3]:

Make this Notebook Trusted to load map: File -> Trust Notebook

Buffer the Area¶

The GeoVex embedder requires a buffer around the area of interest, because every hexagon needs all of its neighbours within radius k to be present in the dataset as well. The buffer is defined in hexagon rings, so a buffer of 1 means that each hexagon of the area has its complete 1-neighbourhood in the dataset.

In [4]:

Copied!

# Inspect the geocoded area (a GeoDataFrame indexed by `region_id`).
area_gdf.head()

Out[4]:

	geometry
region_id
Wrocław, Lower Silesian Voivodeship, Poland	POLYGON ((16.80734 51.13895, 16.80859 51.13887...

In [5]:

Copied!





# H3 resolution of the hexagons and the number of hexagon rings added around
# the area; the same radius is later passed to GeoVexEmbedder as
# `neighbourhood_radius`.
resolution = 9
k_ring_buffer_radius = 4

# Cover the geocoded area with H3 cells.
regionalizer = H3Regionalizer(resolution=resolution)
base_h3_regions = regionalizer.transform(area_gdf)

# Extend the base cells by `k_ring_buffer_radius` rings and merge the result
# into a single geometry, which is used to query OSM features.
buffered_h3_regions = ring_buffer_h3_regions_gdf(base_h3_regions, distance=k_ring_buffer_radius)
buffered_h3_geometry = buffered_h3_regions.unary_union

print("Base regions:", len(base_h3_regions))
print("Buffered regions:", len(buffered_h3_regions))

Base regions: 3168
Buffered regions: 4319

Download the Data¶

Next, download the data for the selected region and the specified tags.

In [6]:

Copied!

# Load OSM features matching the Geofabrik layer tags for the whole buffered
# geometry, so that the buffer hexagons have features too.
tags = GEOFABRIK_LAYERS
loader = OSMPbfLoader()

features_gdf = loader.load(buffered_h3_geometry, tags)

/root/development/srai/srai/loaders/osm_loaders/osm_pbf_loader.py:128: FutureWarning: Use `convert_geometry_to_geodataframe` instead. Deprecated since 0.8.1 version.
  features_gdf = pbf_reader.get_features_gdf_from_geometry(

  0%|                                               | 0.00/161M [00:00<?, ?B/s]

  0%|                                       | 37.9k/161M [00:00<09:49, 272kB/s]

  0%|                                       | 65.5k/161M [00:00<09:55, 269kB/s]

  0%|                                       | 93.2k/161M [00:00<10:00, 267kB/s]

  0%|                                        | 135k/161M [00:00<08:16, 323kB/s]

  0%|                                        | 168k/161M [00:00<08:26, 317kB/s]

  0%|                                        | 214k/161M [00:00<08:26, 317kB/s]

  0%|                                        | 251k/161M [00:00<08:05, 330kB/s]

  0%|                                        | 292k/161M [00:00<07:40, 348kB/s]

  0%|                                        | 341k/161M [00:01<06:57, 384kB/s]

  0%|                                        | 398k/161M [00:01<06:21, 420kB/s]

  0%|                                        | 452k/161M [00:01<05:55, 450kB/s]

  0%|▏                                       | 505k/161M [00:01<05:38, 473kB/s]

  0%|▏                                       | 562k/161M [00:01<05:18, 502kB/s]

  0%|▏                                       | 624k/161M [00:01<04:59, 534kB/s]

  0%|▏                                       | 685k/161M [00:01<04:47, 557kB/s]

  0%|▏                                       | 751k/161M [00:01<04:32, 586kB/s]

  1%|▏                                       | 820k/161M [00:01<04:22, 608kB/s]

  1%|▏                                       | 894k/161M [00:01<04:07, 645kB/s]

  1%|▏                                       | 972k/161M [00:02<03:53, 684kB/s]

  1%|▎                                      | 1.05M/161M [00:02<03:44, 711kB/s]

  1%|▎                                      | 1.13M/161M [00:02<03:35, 740kB/s]

  1%|▎                                      | 1.21M/161M [00:02<03:27, 766kB/s]

  1%|▎                                      | 1.31M/161M [00:02<03:29, 761kB/s]

  1%|▎                                      | 1.42M/161M [00:02<03:06, 854kB/s]

  1%|▎                                      | 1.52M/161M [00:02<03:01, 878kB/s]

  1%|▍                                      | 1.61M/161M [00:02<03:03, 865kB/s]

  1%|▍                                      | 1.71M/161M [00:02<02:53, 914kB/s]

  1%|▍                                      | 1.81M/161M [00:02<02:50, 933kB/s]

  1%|▍                                      | 1.91M/161M [00:03<02:43, 971kB/s]

  1%|▍                                      | 2.03M/161M [00:03<02:40, 987kB/s]

  1%|▌                                     | 2.14M/161M [00:03<02:35, 1.02MB/s]

  1%|▌                                     | 2.26M/161M [00:03<02:30, 1.05MB/s]

  1%|▌                                     | 2.37M/161M [00:03<02:26, 1.08MB/s]

  2%|▌                                     | 2.49M/161M [00:03<02:22, 1.11MB/s]

  2%|▌                                     | 2.61M/161M [00:03<02:18, 1.14MB/s]

  2%|▋                                     | 2.74M/161M [00:03<02:14, 1.18MB/s]

  2%|▋                                     | 2.87M/161M [00:03<02:10, 1.21MB/s]

  2%|▋                                     | 3.00M/161M [00:04<02:09, 1.22MB/s]

  2%|▋                                     | 3.13M/161M [00:04<02:04, 1.27MB/s]

  2%|▊                                     | 3.27M/161M [00:04<02:03, 1.28MB/s]

  2%|▊                                     | 3.41M/161M [00:04<01:59, 1.31MB/s]

  2%|▊                                     | 3.55M/161M [00:04<01:57, 1.34MB/s]

  2%|▊                                     | 3.69M/161M [00:04<01:55, 1.36MB/s]

  2%|▉                                     | 3.83M/161M [00:04<01:53, 1.38MB/s]

  2%|▉                                     | 3.99M/161M [00:04<01:50, 1.42MB/s]

  3%|▉                                     | 4.15M/161M [00:04<01:46, 1.47MB/s]

  3%|█                                     | 4.31M/161M [00:04<01:43, 1.50MB/s]

  3%|█                                     | 4.47M/161M [00:05<01:40, 1.55MB/s]

  3%|█                                     | 4.64M/161M [00:05<01:39, 1.57MB/s]

  3%|█▏                                    | 4.81M/161M [00:05<01:38, 1.58MB/s]

  3%|█▏                                    | 4.98M/161M [00:05<01:35, 1.63MB/s]

  3%|█▏                                    | 5.16M/161M [00:05<01:33, 1.67MB/s]

  3%|█▎                                    | 5.34M/161M [00:05<01:35, 1.63MB/s]

  3%|█▎                                    | 5.52M/161M [00:05<01:32, 1.68MB/s]

  4%|█▎                                    | 5.71M/161M [00:05<01:29, 1.73MB/s]

  4%|█▍                                    | 5.90M/161M [00:05<01:27, 1.77MB/s]

  4%|█▍                                    | 6.10M/161M [00:05<01:25, 1.81MB/s]

  4%|█▍                                    | 6.30M/161M [00:06<01:23, 1.85MB/s]

  4%|█▌                                    | 6.50M/161M [00:06<01:21, 1.89MB/s]

  4%|█▌                                    | 6.70M/161M [00:06<01:21, 1.89MB/s]

  4%|█▋                                    | 6.90M/161M [00:06<01:20, 1.92MB/s]

  4%|█▋                                    | 7.10M/161M [00:06<01:18, 1.95MB/s]

  5%|█▋                                    | 7.30M/161M [00:06<01:17, 1.97MB/s]

  5%|█▊                                    | 7.53M/161M [00:06<01:15, 2.03MB/s]

  5%|█▊                                    | 7.75M/161M [00:06<01:14, 2.06MB/s]

  5%|█▉                                    | 7.96M/161M [00:06<01:13, 2.09MB/s]

  5%|█▉                                    | 8.17M/161M [00:06<01:13, 2.07MB/s]

  5%|█▉                                    | 8.38M/161M [00:07<01:13, 2.07MB/s]

  5%|██                                    | 8.61M/161M [00:07<01:10, 2.14MB/s]

  5%|██                                    | 8.83M/161M [00:07<01:10, 2.14MB/s]

  6%|██▏                                   | 9.06M/161M [00:07<01:10, 2.15MB/s]

  6%|██▏                                   | 9.30M/161M [00:07<01:08, 2.21MB/s]

  6%|██▎                                   | 9.54M/161M [00:07<01:07, 2.24MB/s]

  6%|██▎                                   | 9.79M/161M [00:07<01:05, 2.31MB/s]

  6%|██▎                                   | 10.0M/161M [00:07<01:05, 2.32MB/s]

  6%|██▍                                   | 10.3M/161M [00:07<01:03, 2.35MB/s]

  7%|██▍                                   | 10.5M/161M [00:07<01:02, 2.38MB/s]

  7%|██▌                                   | 10.8M/161M [00:08<01:01, 2.43MB/s]

  7%|██▌                                   | 11.0M/161M [00:08<01:01, 2.44MB/s]

  7%|██▋                                   | 11.3M/161M [00:08<00:59, 2.49MB/s]

  7%|██▋                                   | 11.5M/161M [00:08<00:59, 2.52MB/s]

  7%|██▊                                   | 11.8M/161M [00:08<00:56, 2.62MB/s]

  8%|██▊                                   | 12.1M/161M [00:08<00:54, 2.70MB/s]

  8%|██▉                                   | 12.4M/161M [00:08<00:54, 2.74MB/s]

  8%|███                                   | 12.7M/161M [00:08<00:51, 2.87MB/s]

  8%|███                                   | 13.1M/161M [00:08<00:49, 2.99MB/s]

  8%|███▏                                  | 13.4M/161M [00:08<00:47, 3.10MB/s]

  9%|███▎                                  | 13.8M/161M [00:09<00:44, 3.28MB/s]

  9%|███▎                                  | 14.2M/161M [00:09<00:42, 3.43MB/s]

  9%|███▍                                  | 14.6M/161M [00:09<00:40, 3.56MB/s]

  9%|███▌                                  | 15.0M/161M [00:09<00:39, 3.72MB/s]

 10%|███▋                                  | 15.4M/161M [00:09<00:37, 3.84MB/s]

 10%|███▋                                  | 15.8M/161M [00:09<00:36, 4.01MB/s]

 10%|███▊                                  | 16.3M/161M [00:09<00:34, 4.16MB/s]

 10%|███▉                                  | 16.8M/161M [00:09<00:33, 4.27MB/s]

 11%|████                                  | 17.3M/161M [00:09<00:32, 4.45MB/s]

 11%|████▏                                 | 17.8M/161M [00:10<00:30, 4.65MB/s]

 11%|████▎                                 | 18.3M/161M [00:10<00:29, 4.87MB/s]

 12%|████▍                                 | 18.9M/161M [00:10<00:27, 5.06MB/s]

 12%|████▌                                 | 19.5M/161M [00:10<00:26, 5.30MB/s]

 12%|████▋                                 | 20.1M/161M [00:10<00:25, 5.51MB/s]

 13%|████▉                                 | 20.7M/161M [00:10<00:24, 5.63MB/s]

 13%|█████                                 | 21.3M/161M [00:10<00:23, 5.89MB/s]

 14%|█████▏                                | 22.0M/161M [00:10<00:22, 6.09MB/s]

 14%|█████▎                                | 22.7M/161M [00:10<00:21, 6.34MB/s]

 15%|█████▌                                | 23.4M/161M [00:10<00:21, 6.52MB/s]

 15%|█████▋                                | 24.1M/161M [00:11<00:20, 6.66MB/s]

 15%|█████▊                                | 24.8M/161M [00:11<00:19, 6.87MB/s]

 16%|██████                                | 25.6M/161M [00:11<00:19, 7.05MB/s]

 16%|██████▏                               | 26.4M/161M [00:11<00:18, 7.26MB/s]

 17%|██████▍                               | 27.2M/161M [00:11<00:17, 7.57MB/s]

 17%|██████▋                               | 28.1M/161M [00:11<00:16, 7.88MB/s]

 18%|██████▊                               | 28.9M/161M [00:11<00:16, 7.89MB/s]

 19%|███████                               | 29.8M/161M [00:11<00:15, 8.28MB/s]

 19%|███████▎                              | 30.7M/161M [00:11<00:15, 8.61MB/s]

 20%|███████▍                              | 31.7M/161M [00:11<00:14, 8.73MB/s]

 20%|███████▋                              | 32.5M/161M [00:12<00:14, 8.61MB/s]

 21%|███████▉                              | 33.5M/161M [00:12<00:14, 8.71MB/s]

 21%|████████▏                             | 34.5M/161M [00:12<00:13, 9.09MB/s]

 22%|████████▍                             | 35.6M/161M [00:12<00:12, 9.73MB/s]

 23%|████████▋                             | 36.7M/161M [00:12<00:12, 10.2MB/s]

 24%|████████▉                             | 37.9M/161M [00:12<00:11, 10.6MB/s]

 24%|█████████▎                            | 39.1M/161M [00:12<00:10, 11.1MB/s]

 25%|█████████▌                            | 40.4M/161M [00:12<00:10, 11.4MB/s]

 26%|█████████▊                            | 41.6M/161M [00:12<00:10, 11.6MB/s]

 27%|██████████▏                           | 42.9M/161M [00:12<00:09, 12.0MB/s]

 28%|██████████▍                           | 44.2M/161M [00:13<00:09, 12.4MB/s]

 28%|██████████▊                           | 45.5M/161M [00:13<00:09, 12.4MB/s]

 29%|███████████                           | 46.9M/161M [00:13<00:08, 12.9MB/s]

 30%|███████████▍                          | 48.3M/161M [00:13<00:08, 13.3MB/s]

 31%|███████████▊                          | 49.7M/161M [00:13<00:08, 13.5MB/s]

 32%|████████████▏                         | 51.3M/161M [00:13<00:07, 14.1MB/s]

 33%|████████████▍                         | 52.8M/161M [00:13<00:07, 14.3MB/s]

 34%|████████████▊                         | 54.4M/161M [00:13<00:07, 14.8MB/s]

 35%|█████████████▎                        | 56.0M/161M [00:13<00:06, 15.4MB/s]

 36%|█████████████▋                        | 57.7M/161M [00:13<00:06, 15.7MB/s]

 37%|██████████████                        | 59.4M/161M [00:14<00:06, 16.0MB/s]

 38%|██████████████▍                       | 61.1M/161M [00:14<00:06, 16.4MB/s]

 39%|██████████████▉                       | 62.9M/161M [00:14<00:05, 17.0MB/s]

 40%|███████████████▎                      | 64.8M/161M [00:14<00:05, 17.4MB/s]

 42%|███████████████▊                      | 66.7M/161M [00:14<00:05, 17.9MB/s]

 43%|████████████████▏                     | 68.6M/161M [00:14<00:05, 18.2MB/s]

 44%|████████████████▋                     | 70.5M/161M [00:14<00:04, 18.5MB/s]

 45%|█████████████████▏                    | 72.6M/161M [00:14<00:04, 19.3MB/s]

 46%|█████████████████▋                    | 74.6M/161M [00:14<00:04, 19.5MB/s]

 48%|██████████████████▏                   | 76.9M/161M [00:14<00:04, 20.2MB/s]

 49%|██████████████████▋                   | 78.9M/161M [00:15<00:04, 20.2MB/s]

 51%|███████████████████▏                  | 81.2M/161M [00:15<00:03, 21.0MB/s]

 52%|███████████████████▋                  | 83.3M/161M [00:15<00:03, 20.8MB/s]

 53%|████████████████████▎                 | 85.6M/161M [00:15<00:03, 21.6MB/s]

 55%|████████████████████▊                 | 87.7M/161M [00:15<00:03, 20.9MB/s]

 56%|█████████████████████▍                | 90.4M/161M [00:15<00:03, 22.6MB/s]

 58%|█████████████████████▉                | 92.9M/161M [00:15<00:02, 23.2MB/s]

 59%|██████████████████████▌               | 95.2M/161M [00:15<00:02, 22.3MB/s]

 61%|███████████████████████               | 97.5M/161M [00:15<00:02, 22.3MB/s]

 62%|███████████████████████▌              | 99.7M/161M [00:16<00:03, 19.0MB/s]

 64%|████████████████████████▊              | 102M/161M [00:16<00:02, 20.1MB/s]

 65%|█████████████████████████▎             | 104M/161M [00:16<00:02, 21.0MB/s]

 67%|██████████████████████████             | 107M/161M [00:16<00:02, 23.1MB/s]

 69%|██████████████████████████▊            | 110M/161M [00:16<00:01, 25.2MB/s]

 70%|███████████████████████████▍           | 113M/161M [00:16<00:02, 23.4MB/s]

 72%|████████████████████████████▏          | 116M/161M [00:16<00:01, 25.1MB/s]

 74%|████████████████████████████▊          | 119M/161M [00:16<00:01, 25.9MB/s]

 75%|█████████████████████████████▍         | 121M/161M [00:16<00:01, 25.3MB/s]

 77%|██████████████████████████████         | 124M/161M [00:16<00:01, 24.2MB/s]

 79%|██████████████████████████████▋        | 126M/161M [00:17<00:01, 24.4MB/s]

 80%|███████████████████████████████▎       | 129M/161M [00:17<00:01, 25.3MB/s]

 82%|███████████████████████████████▉       | 132M/161M [00:17<00:01, 25.7MB/s]

 84%|████████████████████████████████▌      | 134M/161M [00:17<00:01, 23.5MB/s]

 85%|█████████████████████████████████▏     | 137M/161M [00:17<00:00, 24.1MB/s]

 87%|█████████████████████████████████▊     | 139M/161M [00:17<00:00, 23.9MB/s]

 88%|██████████████████████████████████▍    | 142M/161M [00:17<00:00, 23.9MB/s]

 90%|███████████████████████████████████    | 144M/161M [00:17<00:00, 24.5MB/s]

 91%|███████████████████████████████████▋   | 147M/161M [00:17<00:00, 24.7MB/s]

 93%|████████████████████████████████████▎  | 150M/161M [00:18<00:00, 25.3MB/s]

 95%|████████████████████████████████████▉  | 152M/161M [00:18<00:00, 25.8MB/s]

 96%|█████████████████████████████████████▌ | 155M/161M [00:18<00:00, 24.3MB/s]

 98%|██████████████████████████████████████▏| 157M/161M [00:18<00:00, 23.6MB/s]

 99%|██████████████████████████████████████▊| 160M/161M [00:18<00:00, 23.5MB/s]

  0%|                                               | 0.00/161M [00:00<?, ?B/s]

100%|████████████████████████████████████████| 161M/161M [00:00<00:00, 130GB/s]

Finished operation in 0:01:30

Prepare the data for embedding¶

After downloading the data, we need to prepare it for embedding. In the previous steps we regionalized the selected area and buffered it; now we have to join the features with the prepared regions.

In [7]:

Copied!

# Preview the buffered H3 regions on a map.
plot_regions(buffered_h3_regions, tiles_style="CartoDB positron")

Out[7]:

Make this Notebook Trusted to load map: File -> Trust Notebook

In [8]:

Copied!

# Match every buffered region with the OSM features it intersects; the result
# is indexed by (region_id, feature_id) pairs.
joiner = IntersectionJoiner()
joint_gdf = joiner.transform(buffered_h3_regions, features_gdf)
joint_gdf

Out[8]:


region_id	feature_id
891e20415c3ffff	way/639173300
	way/639173305
	way/639173292
	way/639173298
	way/639173295
...	...
891e2042c5bffff	way/1221785078
	way/617247589
	node/12178691111
	way/1315743040
	way/338451967

470974 rows × 0 columns

GeoVex-Embedding¶

After preparing the data we can proceed with generating embeddings for the regions.

In [9]:

Copied!





# Neighbourhood lookup built over the buffered regions.
neighbourhood = H3Neighbourhood(buffered_h3_regions)

# `neighbourhood_radius` reuses the buffer radius so that it matches the
# number of rings added around the area of interest.
embedder = GeoVexEmbedder(
    target_features=GEOFABRIK_LAYERS,
    batch_size=10,
    neighbourhood_radius=k_ring_buffer_radius,
    convolutional_layers=2,
    embedding_size=50,
)

In [10]:

Copied!





# GeoVexEmbedder does not support MPS, so fall back to the CPU whenever an MPS
# device is available; otherwise let the trainer pick the accelerator.
geovex_accelerator = "cpu" if torch.backends.mps.is_available() else "auto"

# Train the GeoVex model and embed the regions. Warnings raised during
# training are deliberately silenced to keep the output readable.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    embeddings = embedder.fit_transform(
        regions_gdf=buffered_h3_regions,
        features_gdf=features_gdf,
        joint_gdf=joint_gdf,
        neighbourhood=neighbourhood,
        trainer_kwargs={
            # "max_epochs": 20, # uncomment for a longer training
            "max_epochs": 5,
            "accelerator": geovex_accelerator,
        },
        learning_rate=0.001,
    )

embeddings.head()

  0%|                                                                                                                                                                                       | 0/4319 [00:00<?, ?it/s]

  7%|███████████▊                                                                                                                                                               | 298/4319 [00:00<00:01, 2977.39it/s]

 14%|███████████████████████▌                                                                                                                                                   | 596/4319 [00:00<00:01, 2616.16it/s]

 21%|███████████████████████████████████▍                                                                                                                                       | 894/4319 [00:00<00:01, 2767.43it/s]

 29%|█████████████████████████████████████████████████                                                                                                                         | 1245/4319 [00:00<00:01, 3047.17it/s]

 40%|███████████████████████████████████████████████████████████████████▌                                                                                                      | 1718/4319 [00:00<00:00, 3636.30it/s]

 51%|███████████████████████████████████████████████████████████████████████████████████████▎                                                                                  | 2217/4319 [00:00<00:00, 4086.44it/s]

 61%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 2651/4319 [00:00<00:00, 4163.76it/s]

 71%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                 | 3070/4319 [00:00<00:00, 3661.98it/s]

 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                  | 3448/4319 [00:01<00:00, 1931.95it/s]

 88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                    | 3808/4319 [00:01<00:00, 2223.06it/s]

 99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 4263/4319 [00:01<00:00, 2686.72it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4319/4319 [00:01<00:00, 2868.51it/s]

GPU available: False, used: False

TPU available: False, using: 0 TPU cores

IPU available: False, using: 0 IPUs

HPU available: False, using: 0 HPUs

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 2.4 M 
1 | decoder | Sequential | 1.8 M 
2 | _loss   | GeoVeXLoss | 0     
---------------------------------------
4.2 M     Trainable params
0         Non-trainable params
4.2 M     Total params
16.673    Total estimated model params size (MB)

`Trainer.fit` stopped: `max_epochs=5` reached.

Warning: Some regions were not able to be encoded, as they don't have r=4 neighbors.

Out[10]:

	0	1	2	3	4	5	6	7	8	9	...	40	41	42	43	44	45	46	47	48	49
region_id
891e20415c3ffff	-7.166576	4.612884	5.810854	2.861640	21.997570	-9.436325	7.482526	18.923206	-9.867811	7.966309	...	28.322075	-6.364246	-13.277594	22.375540	-2.255809	-6.471719	12.232291	-13.322634	-9.269577	-37.739197
891e2041907ffff	3.973829	9.313976	1.895119	2.223809	11.112829	-10.830462	13.871595	33.959091	-0.438345	0.880967	...	8.808785	-12.741753	-22.844219	25.921379	5.494741	-6.990178	14.358145	-4.909728	-10.827537	-4.619166
891e2040ab7ffff	-0.444428	14.284081	4.531074	-8.027399	12.615927	-21.629021	17.915295	43.846535	-4.988875	-1.905399	...	-9.441130	-11.487149	-34.089729	34.881371	7.754468	17.645123	18.857574	-20.831202	-17.862314	18.929993
891e204428bffff	-3.834833	-1.319726	7.040741	1.511523	26.205547	-9.432720	10.759152	20.852016	-4.333705	4.477701	...	29.684891	-11.003342	-14.228016	24.174984	-8.291220	-11.648386	13.469184	-10.012638	-11.638663	-35.196800
891e2050a6fffff	-8.292417	2.250541	9.942144	-0.451594	26.428261	-9.327646	8.219390	18.725006	-5.673605	4.614204	...	31.654573	-9.077385	-13.744881	23.924671	-7.241611	-8.259130	13.753767	-11.648851	-9.042700	-41.507065

5 rows × 50 columns

Hex2Vec Embedding¶

In [11]:

Copied!





# Neighbourhood lookup built over the buffered regions.
neighbourhood = H3Neighbourhood(buffered_h3_regions)

# The last encoder layer has 50 units, the same as the `embedding_size` used
# for GeoVex above.
hex2vec_embedder = Hex2VecEmbedder(
    encoder_sizes=[300, 150, 50],
)

# Train Hex2Vec on the same regions and features as GeoVex. Warnings raised
# during training are deliberately silenced to keep the output readable.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    hex2vec_embeddings = hex2vec_embedder.fit_transform(
        regions_gdf=buffered_h3_regions,
        features_gdf=features_gdf,
        joint_gdf=joint_gdf,
        neighbourhood=neighbourhood,
        negative_sample_k_distance=2,
        batch_size=64,
        learning_rate=0.001,
        trainer_kwargs={
            # "max_epochs": 50, # uncomment for a longer training
            "max_epochs": 5,
            "accelerator": "auto",
        },
    )

hex2vec_embeddings.head()

  0%|                                                                                                                                                                                       | 0/4319 [00:00<?, ?it/s]

 57%|████████████████████████████████████████████████████████████████████████████████████████████████                                                                         | 2454/4319 [00:00<00:00, 24534.00it/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4319/4319 [00:00<00:00, 24970.47it/s]

GPU available: False, used: False

TPU available: False, using: 0 TPU cores

IPU available: False, using: 0 IPUs

HPU available: False, using: 0 HPUs

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 136 K 
---------------------------------------
136 K     Trainable params
0         Non-trainable params
136 K     Total params
0.547     Total estimated model params size (MB)

`Trainer.fit` stopped: `max_epochs=5` reached.

Out[11]:

	0	1	2	3	4	5	6	7	8	9	...	40	41	42	43	44	45	46	47	48	49
region_id
891e20415c3ffff	0.416962	0.035779	-0.186067	-0.120777	-0.071452	0.055338	-0.043889	-0.127631	-0.250136	0.196315	...	-0.151119	-0.004833	-0.187080	0.077156	-0.141285	-0.431161	0.328697	0.181021	-0.151026	0.080886
891e2041e9bffff	-0.124418	0.144055	0.225159	-0.165187	0.028201	0.013533	-0.417721	-0.126727	0.299988	-0.090582	...	-0.052293	0.002316	0.045462	0.047620	-0.092574	0.141624	0.093620	0.059649	-0.008359	-0.191902
891e2041907ffff	-0.410192	-0.376262	-0.243375	-0.112062	0.010848	0.089727	0.323338	0.093452	0.377860	-0.019473	...	0.285004	-0.208769	0.466243	0.115423	0.340272	0.318488	-0.403600	0.002987	0.227352	0.143953
891e2040ab7ffff	-0.390069	-0.286319	0.228089	-0.313441	0.096417	-0.842389	0.276795	0.403996	-0.932810	-0.324581	...	0.484384	0.471702	-0.541222	-0.480084	-0.227130	-0.516908	-0.466135	0.044409	0.073485	-0.228895
891e204428bffff	-0.115689	0.227325	0.458062	0.181985	-0.160654	0.251673	-0.166865	0.004040	0.069666	-0.231800	...	0.043838	0.195844	-0.106797	0.100985	0.013575	-0.029209	-0.110115	-0.312481	-0.118547	-0.026663

5 rows × 50 columns

Comparing the Embeddings¶

GeoVex Embedding¶

PCA¶

In [12]:

Copied!





# Visualise the GeoVex embeddings: project them onto three principal
# components and use those as the R, G and B channels of each hexagon.

# Drop the "cluster" label column if the clustering cell has already added it,
# so that re-running this cell never feeds cluster labels into the PCA.
geovex_features = embeddings.drop(columns="cluster", errors="ignore")

pca = PCA(n_components=3)
pca_embeddings = pd.DataFrame(
    pca.fit_transform(geovex_features), index=geovex_features.index
)

# Min-max scale every component to 0-255 and truncate to integers.
pca_embeddings = (
    (pca_embeddings - pca_embeddings.min()) / (pca_embeddings.max() - pca_embeddings.min()) * 255
).astype(int)

# Build one "rgb(r, g, b)" colour string per region.
pca_embeddings["rgb"] = pca_embeddings.apply(
    lambda row: f"rgb({row[0]}, {row[1]}, {row[2]})", axis=1
)

# The map is coloured by the positional "index" column created by the second
# `reset_index()`, so key the colour lookup by row position.
color_dict = dict(enumerate(base_h3_regions.index.map(pca_embeddings["rgb"].to_dict()).to_list()))
base_h3_regions.reset_index().reset_index().explore(
    column="index",
    tooltip="region_id",
    tiles="CartoDB positron",
    legend=False,
    cmap=lambda x: color_dict[x],
    style_kwds=dict(color="#444", opacity=0.0, fillOpacity=0.5),
)

Out[12]:

Make this Notebook Trusted to load map: File -> Trust Notebook

Clustering¶

In [13]:

Copied!





# Cluster the GeoVex embeddings into 5 groups.
# Fit on the embedding columns only: the "cluster" column written below must
# not become an input feature when this cell is executed more than once.
geovex_features = embeddings.drop(columns="cluster", errors="ignore")

clusterizer = KMeans(n_clusters=5, random_state=SEED)
clusterizer.fit(geovex_features)

embeddings.index.name = "region_id"
embeddings["cluster"] = clusterizer.labels_
embeddings["cluster"]

Out[13]:

region_id
891e20415c3ffff    1
891e2041907ffff    0
891e2040ab7ffff    4
891e204428bffff    1
891e2050a6fffff    1
                  ..
891e2043533ffff    3
891e2046213ffff    2
891e204763bffff    0
891e2040313ffff    2
891e2042c5bffff    2
Name: cluster, Length: 3251, dtype: int32

In [14]:

Copied!

# Colour the base (unbuffered) regions by their GeoVex cluster label.
plot_numeric_data(base_h3_regions, "cluster", embeddings, tiles_style="CartoDB positron")

Out[14]:

Make this Notebook Trusted to load map: File -> Trust Notebook

Hex2Vec¶

PCA¶

In [15]:

Copied!





# Visualise the Hex2Vec embeddings: project them onto three principal
# components and use those as the R, G and B channels of each hexagon.

# Drop the "cluster" label column if the clustering cell has already added it,
# so that re-running this cell never feeds cluster labels into the PCA.
hex2vec_features = hex2vec_embeddings.drop(columns="cluster", errors="ignore")

pca = PCA(n_components=3)
pca_embeddings = pd.DataFrame(
    pca.fit_transform(hex2vec_features), index=hex2vec_features.index
)

# Min-max scale every component to 0-255 and truncate to integers.
pca_embeddings = (
    (pca_embeddings - pca_embeddings.min()) / (pca_embeddings.max() - pca_embeddings.min()) * 255
).astype(int)

# Build one "rgb(r, g, b)" colour string per region.
pca_embeddings["rgb"] = pca_embeddings.apply(
    lambda row: f"rgb({row[0]}, {row[1]}, {row[2]})", axis=1
)

# The map is coloured by the positional "index" column created by the second
# `reset_index()`, so key the colour lookup by row position.
color_dict = dict(enumerate(base_h3_regions.index.map(pca_embeddings["rgb"].to_dict()).to_list()))
base_h3_regions.reset_index().reset_index().explore(
    column="index",
    tooltip="region_id",
    tiles="CartoDB positron",
    legend=False,
    cmap=lambda x: color_dict[x],
    style_kwds=dict(color="#444", opacity=0.0, fillOpacity=0.5),
)

Out[15]:

Make this Notebook Trusted to load map: File -> Trust Notebook

Clustering¶

In [16]:

Copied!

# Cluster the Hex2Vec embeddings into 5 groups.
# Fit on the embedding columns only: the "cluster" column written below must
# not become an input feature when this cell is executed more than once.
hex2vec_features = hex2vec_embeddings.drop(columns="cluster", errors="ignore")

clusterizer = KMeans(n_clusters=5, random_state=SEED)
clusterizer.fit(hex2vec_features)

hex2vec_embeddings["cluster"] = clusterizer.labels_
hex2vec_embeddings["cluster"]

Out[16]:

region_id
891e20415c3ffff    0
891e2041e9bffff    2
891e2041907ffff    1
891e2040ab7ffff    3
891e204428bffff    0
                  ..
891e2043533ffff    1
891e2046213ffff    2
891e204763bffff    1
891e2040313ffff    1
891e2042c5bffff    0
Name: cluster, Length: 4319, dtype: int32

In [17]:

Copied!

# Colour the base (unbuffered) regions by their Hex2Vec cluster label.
plot_numeric_data(base_h3_regions, "cluster", hex2vec_embeddings, tiles_style="CartoDB positron")

Out[17]:

Make this Notebook Trusted to load map: File -> Trust Notebook