House Sales in King County

House Sales in King County Dataset¶

This dataset contains house sale prices for King County, which includes Seattle, covering approximately 21,000 residential property sales recorded between May 2014 and May 2015. It provides geographic coordinates, physical property attributes (such as size, number of rooms, and condition), as well as contextual features like proximity to waterfronts.

Run this notebook in Google Colab:

Remember to install the srai library before running the notebook:

%pip install srai[all]

In [1]:

Copied!





# plotting imports
import contextily as cx
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

# dataset import
from srai.datasets import HouseSalesInKingCountyDataset
# plotting imports
import contextily as cx
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

# dataset import
from srai.datasets import HouseSalesInKingCountyDataset

In [2]:

Copied!

# Create the dataset wrapper object. Its train_gdf / test_gdf attributes are
# still unset at this point (they show up as NoneType until load() is called).
hskc_dataset = HouseSalesInKingCountyDataset()
hskc_dataset = HouseSalesInKingCountyDataset()

In [3]:

Copied!

# Inspect the types of the split attributes held by the dataset object
# before anything has been loaded.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)

Out[3]:

(NoneType, NoneType)

Load default version of dataset

In [4]:

Copied!

# Load the default version of the dataset and list the keys of the returned
# mapping (one entry per available split).
ds = hskc_dataset.load()
ds.keys()
ds = hskc_dataset.load()
ds.keys()

Out[4]:

dict_keys(['train', 'test'])

In [5]:

Copied!

# Re-check the split attributes after load(): they are now populated on the
# dataset object itself, not only in the returned mapping.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)

Out[5]:

(geopandas.geodataframe.GeoDataFrame, geopandas.geodataframe.GeoDataFrame)

In [6]:

Copied!

# Report the H3 resolution stored on the dataset object.
print("Aggregation H3 resolution:", hskc_dataset.resolution)
print("Aggregation H3 resolution:", hskc_dataset.resolution)

Aggregation H3 resolution: 8

In [7]:

Copied!

# Report the name of the prediction target column stored on the dataset object.
print("Prediction target:", hskc_dataset.target)
print("Prediction target:", hskc_dataset.target)

Prediction target: price

In [8]:

Copied!

# Pull the two splits out of the mapping returned by load() so the cells
# below can use them directly.
train_gdf, test_gdf = ds["train"], ds["test"]
train_gdf, test_gdf = ds["train"], ds["test"]

In [9]:

Copied!

# Preview the first rows of the training split.
train_gdf.head()
train_gdf.head()

Out[9]:

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	condition	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	sqft_living15	sqft_lot15	geometry
1	6414100192	20141209T000000	538000.0	3	2.25	2570	7242	2.0	3	7	2170	400	1951	1991	98125	1690	7639	POINT (-122.319 47.721)
3	1954400510	20150218T000000	510000.0	3	2.00	1680	8080	1.0	3	8	1680	0	1987	0	98074	1800	7503	POINT (-122.045 47.6168)
5	1321400060	20140627T000000	257500.0	3	2.25	1715	6819	2.0	3	7	1715	0	1995	0	98003	2238	6819	POINT (-122.327 47.3097)
6	2008000270	20150115T000000	291850.0	3	1.50	1060	9711	1.0	3	7	1060	0	1963	0	98198	1650	9711	POINT (-122.315 47.4095)
8	3793500160	20150312T000000	323000.0	3	2.50	1890	6560	2.0	3	7	1890	0	2003	0	98038	2390	7570	POINT (-122.031 47.3684)

In [10]:

Copied!

# Number of rows in the test split.
len(test_gdf)
len(test_gdf)

Out[10]:

In [11]:

Copied!





# Two stacked panels: a tall map of the individual sale points (top) and a
# shorter density plot of the target for each split (bottom).
fig, axes = plt.subplots(
    nrows=2,
    ncols=1,
    sharex=False,
    sharey=False,
    figsize=(12, 13),
    height_ratios=[3, 1],
)
ax_map, ax_dist = axes[0], axes[1]

# Split sizes and their share of all points, shown in the map title.
train_points, test_points = len(train_gdf), len(test_gdf)
total_points = train_points + test_points
train_pct = 100 * train_points / total_points
test_pct = 100 * test_points / total_points

# One row per split: (label, frame, colour). Order matters: train is drawn first.
splits = (
    ("train", train_gdf, "orange"),
    ("test", test_gdf, "royalblue"),
)

# --- top panel: points on a basemap ---
for split_name, split_gdf, split_color in splits:
    split_gdf.plot(color=split_color, markersize=0.3, ax=ax_map, label=split_name)

map_title = (
    f"King County data - points on a map"
    f" (Train: {train_points} ({train_pct:.2f}%),"
    f" Test: {test_points} ({test_pct:.2f}%))"
)
ax_map.set_title(map_title)

# Explicit legend handles, one round marker per split in the split's colour.
ax_map.legend(
    handles=[
        Line2D([], [], marker="o", color=split_color, linestyle="None")
        for _, _, split_color in splits
    ],
    labels=["Train", "Test"],
)
cx.add_basemap(ax_map, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11)
ax_map.set_axis_off()

# --- bottom panel: target distribution per split ---
for split_name, split_gdf, split_color in splits:
    sns.kdeplot(
        x=split_gdf[hskc_dataset.target],
        label=split_name,
        color=split_color,
        ax=ax_dist,
        fill=False,
        cut=0,
    )
ax_dist.set_title("King County data - target distribution")
ax_dist.legend()

fig.tight_layout()

plt.show()
# Two stacked panels: a tall map of the individual sale points (top) and a
# shorter density plot of the target for each split (bottom).
fig, axes = plt.subplots(
    nrows=2,
    ncols=1,
    sharex=False,
    sharey=False,
    figsize=(12, 13),
    height_ratios=[3, 1],
)
ax_map, ax_dist = axes[0], axes[1]

# Split sizes and their share of all points, shown in the map title.
train_points, test_points = len(train_gdf), len(test_gdf)
total_points = train_points + test_points
train_pct = 100 * train_points / total_points
test_pct = 100 * test_points / total_points

# One row per split: (label, frame, colour). Order matters: train is drawn first.
splits = (
    ("train", train_gdf, "orange"),
    ("test", test_gdf, "royalblue"),
)

# --- top panel: points on a basemap ---
for split_name, split_gdf, split_color in splits:
    split_gdf.plot(color=split_color, markersize=0.3, ax=ax_map, label=split_name)

map_title = (
    f"King County data - points on a map"
    f" (Train: {train_points} ({train_pct:.2f}%),"
    f" Test: {test_points} ({test_pct:.2f}%))"
)
ax_map.set_title(map_title)

# Explicit legend handles, one round marker per split in the split's colour.
ax_map.legend(
    handles=[
        Line2D([], [], marker="o", color=split_color, linestyle="None")
        for _, _, split_color in splits
    ],
    labels=["Train", "Test"],
)
cx.add_basemap(ax_map, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11)
ax_map.set_axis_off()

# --- bottom panel: target distribution per split ---
for split_name, split_gdf, split_color in splits:
    sns.kdeplot(
        x=split_gdf[hskc_dataset.target],
        label=split_name,
        color=split_color,
        ax=ax_dist,
        fill=False,
        cut=0,
    )
ax_dist.set_title("King County data - target distribution")
ax_dist.legend()

fig.tight_layout()

plt.show()

No description has been provided for this image

Getting the H3 cells with target values

In [12]:

Copied!

# Display the H3 resolution stored on the dataset object.
hskc_dataset.resolution
hskc_dataset.resolution

Out[12]:

In [13]:

Copied!

# Get the H3-cell frames with target values for the loaded splits. The call
# returns three values; the middle one is discarded here — presumably the
# validation split (TODO confirm against the srai documentation).
train_h3, _, test_h3 = hskc_dataset.get_h3_with_labels()
train_h3, _, test_h3 = hskc_dataset.get_h3_with_labels()

In [14]:

Copied!

# Preview the first H3 cells of the training split.
train_h3.head()
train_h3.head()

Out[14]:

	geometry	price
region_id
8828d4adedfffff	POLYGON ((-121.96689 47.21246, -121.96192 47.2...	251000.0
8828d55293fffff	POLYGON ((-122.35777 47.50313, -122.3528 47.50...	336150.0
8828d4255dfffff	POLYGON ((-121.98318 47.45347, -121.97819 47.4...	539000.0
8828d54a61fffff	POLYGON ((-122.06091 47.67404, -122.05591 47.6...	500000.0
8828d43581fffff	POLYGON ((-122.01239 47.36596, -122.00741 47.3...	341230.0

In [15]:

Copied!

# Preview the first H3 cells of the test split.
test_h3.head()
test_h3.head()

Out[15]:

	geometry	price
region_id
8828d54b53fffff	POLYGON ((-122.04252 47.6914, -122.03752 47.69...	692500.0
8828d54121fffff	POLYGON ((-122.14337 47.677, -122.13838 47.680...	661000.0
8828d5556dfffff	POLYGON ((-122.3193 47.56546, -122.31433 47.56...	424457.6
8828d5cb25fffff	POLYGON ((-122.13595 47.34675, -122.13098 47.3...	332550.0
8828d42f45fffff	POLYGON ((-121.78984 47.48696, -121.78483 47.4...	342612.5

In [16]:

Copied!





# Map the H3-aggregated target: train cells are colour-filled, test cells are
# colour-filled and additionally cross-hatched so the two splits can be told apart.
with plt.rc_context({"hatch.linewidth": 0.4}):
    # The first layer creates the figure/axes and the colorbar for the target.
    ax = train_h3.plot(
        hskc_dataset.target,
        cmap="spring_r",
        legend=True,
        legend_kwds=dict(
            location="right", shrink=0.9, pad=0.02, label=hskc_dataset.target
        ),
        figsize=(15, 9),
        alpha=0.5,
    )

    # Hide the axis frame of THIS map. The cell previously called
    # axes[0].set_axis_off(), which targeted the axes of the earlier points
    # figure (and raises NameError on a kernel where that cell was not run).
    ax.set_axis_off()

    # NOTE(review): this layer is colour-mapped without shared vmin/vmax, so the
    # colorbar above presumably reflects only the train value range — confirm
    # whether a common colour scale is wanted.
    test_h3.plot(hskc_dataset.target, cmap="spring_r", alpha=0.5, ax=ax)

    # Fully transparent fill with a hatch pattern: marks test cells without
    # hiding the colours drawn underneath.
    test_h3.plot(
        ax=ax, linewidth=0.4, color=(0, 0, 0, 0), edgecolor=(0, 0, 0, 0.4), hatch="+++"
    )

    ax.set_title("King County data aggregated to H3 cells")
    # Hand-built legend: plain outline = train, hatched outline = test.
    ax.legend(
        handles=[
            Patch(edgecolor=(0, 0, 0, 0.8), linewidth=0.1, facecolor=(0, 0, 0, 0)),
            Patch(
                edgecolor=(0, 0, 0, 0.8),
                linewidth=0.1,
                facecolor=(0, 0, 0, 0),
                hatch="+++",
            ),
        ],
        labels=["Train", "Test"],
        loc=2,
    )

    cx.add_basemap(ax, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11)

    # Lay out the figure that owns `ax`, not a `fig` left over from another cell.
    ax.figure.tight_layout()

    plt.show()
# Map the H3-aggregated target: train cells are colour-filled, test cells are
# colour-filled and additionally cross-hatched so the two splits can be told apart.
with plt.rc_context({"hatch.linewidth": 0.4}):
    # The first layer creates the figure/axes and the colorbar for the target.
    ax = train_h3.plot(
        hskc_dataset.target,
        cmap="spring_r",
        legend=True,
        legend_kwds=dict(
            location="right", shrink=0.9, pad=0.02, label=hskc_dataset.target
        ),
        figsize=(15, 9),
        alpha=0.5,
    )

    # Hide the axis frame of THIS map. The cell previously called
    # axes[0].set_axis_off(), which targeted the axes of the earlier points
    # figure (and raises NameError on a kernel where that cell was not run).
    ax.set_axis_off()

    # NOTE(review): this layer is colour-mapped without shared vmin/vmax, so the
    # colorbar above presumably reflects only the train value range — confirm
    # whether a common colour scale is wanted.
    test_h3.plot(hskc_dataset.target, cmap="spring_r", alpha=0.5, ax=ax)

    # Fully transparent fill with a hatch pattern: marks test cells without
    # hiding the colours drawn underneath.
    test_h3.plot(
        ax=ax, linewidth=0.4, color=(0, 0, 0, 0), edgecolor=(0, 0, 0, 0.4), hatch="+++"
    )

    ax.set_title("King County data aggregated to H3 cells")
    # Hand-built legend: plain outline = train, hatched outline = test.
    ax.legend(
        handles=[
            Patch(edgecolor=(0, 0, 0, 0.8), linewidth=0.1, facecolor=(0, 0, 0, 0)),
            Patch(
                edgecolor=(0, 0, 0, 0.8),
                linewidth=0.1,
                facecolor=(0, 0, 0, 0),
                hatch="+++",
            ),
        ],
        labels=["Train", "Test"],
        loc=2,
    )

    cx.add_basemap(ax, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11)

    # Lay out the figure that owns `ax`, not a `fig` left over from another cell.
    ax.figure.tight_layout()

    plt.show()

Load raw version of dataset

In [17]:

Copied!

# Load the version="all" variant of the dataset and list the keys of the
# returned mapping.
# NOTE: this rebinds `ds`, replacing the default-version mapping loaded earlier.
ds = hskc_dataset.load(version="all")
ds.keys()
ds = hskc_dataset.load(version="all")
ds.keys()

Out[17]:

dict_keys(['train'])

In [18]:

Copied!

# Check which split attributes are populated on the dataset object after
# loading version="all".
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)

Out[18]:

(geopandas.geodataframe.GeoDataFrame, NoneType)

In [19]:

Copied!

# Preview the first rows of the "train" entry of the version="all" mapping.
ds["train"].head()
ds["train"].head()

Out[19]:

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	condition	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	sqft_living15	sqft_lot15	geometry
1	6414100192	20141209T000000	538000.0	3	2.25	2570	7242	2.0	3	7	2170	400	1951	1991	98125	1690	7639	POINT (-122.319 47.721)
3	2487200875	20141209T000000	604000.0	4	3.00	1960	5000	1.0	5	7	1050	910	1965	0	98136	1360	5000	POINT (-122.393 47.5208)
4	1954400510	20150218T000000	510000.0	3	2.00	1680	8080	1.0	3	8	1680	0	1987	0	98074	1800	7503	POINT (-122.045 47.6168)
6	1321400060	20140627T000000	257500.0	3	2.25	1715	6819	2.0	3	7	1715	0	1995	0	98003	2238	6819	POINT (-122.327 47.3097)
7	2008000270	20150115T000000	291850.0	3	1.50	1060	9711	1.0	3	7	1060	0	1963	0	98198	1650	9711	POINT (-122.315 47.4095)

Creating your own train-test split: bucket regression (works similarly for spatial regression)

In [20]:

Copied!

# Build a custom train/test split on the dataset object: target column "price",
# 20% test share, H3 resolution 8, and a fixed random_state so the split is
# repeatable. n_bins=10 presumably sets the number of target-value buckets the
# split is balanced over — TODO confirm against the srai documentation.
train, test = hskc_dataset.train_test_split(
    target_column="price", test_size=0.2, resolution=8, n_bins=10, random_state=42
)
train, test = hskc_dataset.train_test_split(
    target_column="price", test_size=0.2, resolution=8, n_bins=10, random_state=42
)

Summary of the split:

  Train: 1514 H3 cells (13897 points)
  Test: 690 H3 cells (3474 points)

  Expected ratios: {'train': 0.8, 'validation': 0, 'test': 0.2}
  Actual ratios: {'train': 0.8, 'test': 0.2}
  Actual ratios difference: {'train': 0.0, 'test': 0.0}
   bucket  train_ratio  test_ratio  train_ratio_difference  \
0       0      0.80000     0.20000                 0.00000   
1       1      0.79989     0.20011                 0.00011   
2       2      0.79988     0.20012                 0.00012   
3       3      0.79988     0.20012                 0.00012   
4       4      0.80011     0.19989                -0.00011   
5       5      0.80012     0.19988                -0.00012   
6       6      0.80012     0.19988                -0.00012   
7       7      0.80023     0.19977                -0.00023   
8       8      0.80000     0.20000                 0.00000   
9       9      0.79988     0.20012                 0.00012   

   test_ratio_difference  train_points  test_points  
0                0.00000          1436          359  
1               -0.00011          1407          352  
2               -0.00012          1327          332  
3               -0.00012          1391          348  
4                0.00011          1465          366  
5                0.00012          1357          339  
6                0.00012          1345          336  
7                0.00023          1390          347  
8                0.00000          1396          349  
9               -0.00012          1383          346  
Created new train_gdf and test_gdf. Train len: 13897,test len: 3474

In [21]:

Copied!

# Keep the dataset's H3 resolution in a local variable.
resolution = hskc_dataset.resolution
resolution = hskc_dataset.resolution

In [22]:

Copied!

# Check the split attributes on the dataset object after train_test_split().
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)

Out[22]:

(geopandas.geodataframe.GeoDataFrame, geopandas.geodataframe.GeoDataFrame)

In [23]:

Copied!

# Preview the first rows of the custom training split.
train.head()
train.head()

Out[23]:

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	condition	grade	sqft_above	sqft_basement	yr_built	zipcode	sqft_living15	sqft_lot15	geometry
4	1954400510	20150218T000000	510000.0	3	2.00	1680	8080	1.0	3	8	1680	0	1987	98074	1800	7503	POINT (-122.045 47.6168)
6	1321400060	20140627T000000	257500.0	3	2.25	1715	6819	2.0	3	7	1715	0	1995	98003	2238	6819	POINT (-122.327 47.3097)
7	2008000270	20150115T000000	291850.0	3	1.50	1060	9711	1.0	3	7	1060	0	1963	98198	1650	9711	POINT (-122.315 47.4095)
9	3793500160	20150312T000000	323000.0	3	2.50	1890	6560	2.0	3	7	1890	0	2003	98038	2390	7570	POINT (-122.031 47.3684)
10	1736800520	20150403T000000	662500.0	3	2.50	3560	9796	1.0	3	8	1860	1700	1965	98007	2210	8925	POINT (-122.145 47.6007)

In [24]:

Copied!

# Preview the first rows of the custom test split.
test.head()
test.head()

Out[24]:

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	condition	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	sqft_living15	sqft_lot15	geometry
1	6414100192	20141209T000000	538000.0	3	2.25	2570	7242	2.0	3	7	2170	400	1951	1991	98125	1690	7639	POINT (-122.319 47.721)
3	2487200875	20141209T000000	604000.0	4	3.00	1960	5000	1.0	5	7	1050	910	1965	0	98136	1360	5000	POINT (-122.393 47.5208)
13	6054650070	20141007T000000	400000.0	3	1.75	1370	9680	1.0	4	7	1370	0	1977	0	98074	1370	10208	POINT (-122.045 47.6127)
24	3814700200	20141120T000000	329000.0	3	2.25	2450	6500	2.0	4	8	2450	0	1985	0	98030	2200	6865	POINT (-122.172 47.3739)
28	5101402488	20140624T000000	438000.0	3	1.75	1520	6380	1.0	3	7	790	730	1948	0	98115	1520	6235	POINT (-122.304 47.695)

In [ ]: