House sales in king county
House Sales in King County Dataset¶
This dataset contains house sale prices for King County, which includes Seattle, covering approximately 21,000 residential property sales recorded between May 2014 and May 2015. It provides geographic coordinates, physical property attributes (such as size, number of rooms, and condition), as well as contextual features like proximity to waterfronts.
In [1]:
Copied!
# plotting imports
import contextily as cx
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
# dataset import
from srai.datasets import HouseSalesInKingCountyDataset
# plotting imports
import contextily as cx
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
# dataset import
from srai.datasets import HouseSalesInKingCountyDataset
In [2]:
Copied!
hskc_dataset = HouseSalesInKingCountyDataset()
hskc_dataset = HouseSalesInKingCountyDataset()
In [3]:
Copied!
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
Out[3]:
(NoneType, NoneType)
Load default version of dataset
In [4]:
Copied!
# Load the default version of the dataset. The returned dict is keyed by split
# name ('train' and 'test' in the output below); the call also populates
# hskc_dataset.train_gdf / hskc_dataset.test_gdf, which were None before it.
ds = hskc_dataset.load()
ds.keys()
# Load the default version of the dataset. The returned dict is keyed by split
# name ('train' and 'test' in the output below); the call also populates
# hskc_dataset.train_gdf / hskc_dataset.test_gdf, which were None before it.
ds = hskc_dataset.load()
ds.keys()
Out[4]:
dict_keys(['train', 'test'])
In [5]:
Copied!
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
Out[5]:
(geopandas.geodataframe.GeoDataFrame, geopandas.geodataframe.GeoDataFrame)
In [6]:
Copied!
print("Aggregation H3 resolution:", hskc_dataset.resolution)
print("Aggregation H3 resolution:", hskc_dataset.resolution)
Aggregation H3 resolution: 8
In [7]:
Copied!
print("Prediction target:", hskc_dataset.target)
print("Prediction target:", hskc_dataset.target)
Prediction target: price
In [8]:
Copied!
train_gdf, test_gdf = ds["train"], ds["test"]
train_gdf, test_gdf = ds["train"], ds["test"]
In [9]:
Copied!
train_gdf.head()
train_gdf.head()
Out[9]:
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | sqft_living15 | sqft_lot15 | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 1690 | 7639 | POINT (-122.319 47.721) |
| 3 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 1800 | 7503 | POINT (-122.045 47.6168) |
| 5 | 1321400060 | 20140627T000000 | 257500.0 | 3 | 2.25 | 1715 | 6819 | 2.0 | 0 | 0 | 3 | 7 | 1715 | 0 | 1995 | 0 | 98003 | 2238 | 6819 | POINT (-122.327 47.3097) |
| 6 | 2008000270 | 20150115T000000 | 291850.0 | 3 | 1.50 | 1060 | 9711 | 1.0 | 0 | 0 | 3 | 7 | 1060 | 0 | 1963 | 0 | 98198 | 1650 | 9711 | POINT (-122.315 47.4095) |
| 8 | 3793500160 | 20150312T000000 | 323000.0 | 3 | 2.50 | 1890 | 6560 | 2.0 | 0 | 0 | 3 | 7 | 1890 | 0 | 2003 | 0 | 98038 | 2390 | 7570 | POINT (-122.031 47.3684) |
In [10]:
Copied!
len(test_gdf)
len(test_gdf)
Out[10]:
3461
In [11]:
Copied!
# Two stacked panels: a point map of the train/test split on top and the
# density of the prediction target for each split underneath.
fig, axes = plt.subplots(
    nrows=2,
    ncols=1,
    sharex=False,
    sharey=False,
    figsize=(12, 13),
    height_ratios=[3, 1],
)
ax_map, ax_dist = axes

# Split sizes and their share of all points, reported in the map title.
train_points, test_points = len(train_gdf), len(test_gdf)
total_points = train_points + test_points
train_pct = 100 * train_points / total_points
test_pct = 100 * test_points / total_points

# One entry per split: (label, data, colour); reused by both panels.
splits = (
    ("train", train_gdf, "orange"),
    ("test", test_gdf, "royalblue"),
)

# --- top panel: sale locations coloured by split ---
for split_label, split_gdf, split_color in splits:
    split_gdf.plot(color=split_color, markersize=0.3, ax=ax_map, label=split_label)

map_title = (
    "King County data - points on a map"
    f" (Train: {train_points} ({train_pct:.2f}%),"
    f" Test: {test_points} ({test_pct:.2f}%))"
)
ax_map.set_title(map_title)

# Explicit proxy handles: the plotted markers (size 0.3) are too small to
# read in a legend.
legend_handles = [
    Line2D([], [], marker="o", color=handle_color, linestyle="None")
    for handle_color in ("orange", "royalblue")
]
ax_map.legend(handles=legend_handles, labels=["Train", "Test"])

cx.add_basemap(
    ax_map,
    source=cx.providers.CartoDB.PositronNoLabels,
    crs=4326,
    zoom=11,
)
ax_map.set_axis_off()

# --- bottom panel: target distribution per split ---
for split_label, split_gdf, split_color in splits:
    sns.kdeplot(
        x=split_gdf[hskc_dataset.target],
        label=split_label,
        color=split_color,
        ax=ax_dist,
        fill=False,
        cut=0,
    )
ax_dist.set_title("King County data - target distribution")
ax_dist.legend()

fig.tight_layout()
plt.show()
# Two stacked panels: a point map of the train/test split on top and the
# density of the prediction target for each split underneath.
fig, axes = plt.subplots(
    nrows=2,
    ncols=1,
    sharex=False,
    sharey=False,
    figsize=(12, 13),
    height_ratios=[3, 1],
)
ax_map, ax_dist = axes

# Split sizes and their share of all points, reported in the map title.
train_points, test_points = len(train_gdf), len(test_gdf)
total_points = train_points + test_points
train_pct = 100 * train_points / total_points
test_pct = 100 * test_points / total_points

# One entry per split: (label, data, colour); reused by both panels.
splits = (
    ("train", train_gdf, "orange"),
    ("test", test_gdf, "royalblue"),
)

# --- top panel: sale locations coloured by split ---
for split_label, split_gdf, split_color in splits:
    split_gdf.plot(color=split_color, markersize=0.3, ax=ax_map, label=split_label)

map_title = (
    "King County data - points on a map"
    f" (Train: {train_points} ({train_pct:.2f}%),"
    f" Test: {test_points} ({test_pct:.2f}%))"
)
ax_map.set_title(map_title)

# Explicit proxy handles: the plotted markers (size 0.3) are too small to
# read in a legend.
legend_handles = [
    Line2D([], [], marker="o", color=handle_color, linestyle="None")
    for handle_color in ("orange", "royalblue")
]
ax_map.legend(handles=legend_handles, labels=["Train", "Test"])

cx.add_basemap(
    ax_map,
    source=cx.providers.CartoDB.PositronNoLabels,
    crs=4326,
    zoom=11,
)
ax_map.set_axis_off()

# --- bottom panel: target distribution per split ---
for split_label, split_gdf, split_color in splits:
    sns.kdeplot(
        x=split_gdf[hskc_dataset.target],
        label=split_label,
        color=split_color,
        ax=ax_dist,
        fill=False,
        cut=0,
    )
ax_dist.set_title("King County data - target distribution")
ax_dist.legend()

fig.tight_layout()
plt.show()
Getting the H3 cells with target values
In [12]:
Copied!
hskc_dataset.resolution
hskc_dataset.resolution
Out[12]:
8
In [13]:
Copied!
# get_h3_with_labels() returns three items; only the first and last are kept.
# The frames shown below are indexed by region_id and carry the cell geometry
# plus the target column.
# NOTE(review): the discarded middle item is presumably the validation split -
# confirm against the srai API documentation.
train_h3, _, test_h3 = hskc_dataset.get_h3_with_labels()
# get_h3_with_labels() returns three items; only the first and last are kept.
# The frames shown below are indexed by region_id and carry the cell geometry
# plus the target column.
# NOTE(review): the discarded middle item is presumably the validation split -
# confirm against the srai API documentation.
train_h3, _, test_h3 = hskc_dataset.get_h3_with_labels()
In [14]:
Copied!
train_h3.head()
train_h3.head()
Out[14]:
| geometry | price | |
|---|---|---|
| region_id | ||
| 8828d4adedfffff | POLYGON ((-121.96689 47.21246, -121.96192 47.2... | 251000.0 |
| 8828d55293fffff | POLYGON ((-122.35777 47.50313, -122.3528 47.50... | 336150.0 |
| 8828d4255dfffff | POLYGON ((-121.98318 47.45347, -121.97819 47.4... | 539000.0 |
| 8828d54a61fffff | POLYGON ((-122.06091 47.67404, -122.05591 47.6... | 500000.0 |
| 8828d43581fffff | POLYGON ((-122.01239 47.36596, -122.00741 47.3... | 341230.0 |
In [15]:
Copied!
test_h3.head()
test_h3.head()
Out[15]:
| geometry | price | |
|---|---|---|
| region_id | ||
| 8828d54b53fffff | POLYGON ((-122.04252 47.6914, -122.03752 47.69... | 692500.0 |
| 8828d54121fffff | POLYGON ((-122.14337 47.677, -122.13838 47.680... | 661000.0 |
| 8828d5556dfffff | POLYGON ((-122.3193 47.56546, -122.31433 47.56... | 424457.6 |
| 8828d5cb25fffff | POLYGON ((-122.13595 47.34675, -122.13098 47.3... | 332550.0 |
| 8828d42f45fffff | POLYGON ((-121.78984 47.48696, -121.78483 47.4... | 342612.5 |
In [16]:
Copied!
# Choropleth of the H3-aggregated target. Train and test cells share one
# colour map; test cells are additionally overlaid with a "+++" hatch so the
# two splits can be told apart.
# The whole cell, including plt.show(), runs inside the rc_context so the
# hatch.linewidth override applies to everything drawn here.
with plt.rc_context({"hatch.linewidth": 0.4}):
    ax = train_h3.plot(
        hskc_dataset.target,
        cmap="spring_r",
        legend=True,
        legend_kwds=dict(
            location="right", shrink=0.9, pad=0.02, label=hskc_dataset.target
        ),
        figsize=(15, 9),
        alpha=0.5,
    )
    # Fix: operate on the axes created by this cell. The previous
    # `axes[0].set_axis_off()` referred to the figure from the earlier
    # point-map cell and left this map's axis frame untouched.
    ax.set_axis_off()

    # Test cells: same colouring as train, then a transparent hatched overlay.
    test_h3.plot(hskc_dataset.target, cmap="spring_r", alpha=0.5, ax=ax)
    test_h3.plot(
        ax=ax,
        linewidth=0.4,
        color=(0, 0, 0, 0),
        edgecolor=(0, 0, 0, 0.4),
        hatch="+++",
    )

    ax.set_title("King County data aggregated to H3 cells")
    # Proxy patches for the legend: plain outline = train, hatched = test.
    ax.legend(
        handles=[
            Patch(edgecolor=(0, 0, 0, 0.8), linewidth=0.1, facecolor=(0, 0, 0, 0)),
            Patch(
                edgecolor=(0, 0, 0, 0.8),
                linewidth=0.1,
                facecolor=(0, 0, 0, 0),
                hatch="+++",
            ),
        ],
        labels=["Train", "Test"],
        loc=2,
    )

    cx.add_basemap(
        ax, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11
    )
    # Fix: lay out the figure that owns `ax` instead of the stale `fig`
    # variable left over from the earlier cell.
    ax.figure.tight_layout()
    plt.show()
# Choropleth of the H3-aggregated target. Train and test cells share one
# colour map; test cells are additionally overlaid with a "+++" hatch so the
# two splits can be told apart.
# The whole cell, including plt.show(), runs inside the rc_context so the
# hatch.linewidth override applies to everything drawn here.
with plt.rc_context({"hatch.linewidth": 0.4}):
    ax = train_h3.plot(
        hskc_dataset.target,
        cmap="spring_r",
        legend=True,
        legend_kwds=dict(
            location="right", shrink=0.9, pad=0.02, label=hskc_dataset.target
        ),
        figsize=(15, 9),
        alpha=0.5,
    )
    # Fix: operate on the axes created by this cell. The previous
    # `axes[0].set_axis_off()` referred to the figure from the earlier
    # point-map cell and left this map's axis frame untouched.
    ax.set_axis_off()

    # Test cells: same colouring as train, then a transparent hatched overlay.
    test_h3.plot(hskc_dataset.target, cmap="spring_r", alpha=0.5, ax=ax)
    test_h3.plot(
        ax=ax,
        linewidth=0.4,
        color=(0, 0, 0, 0),
        edgecolor=(0, 0, 0, 0.4),
        hatch="+++",
    )

    ax.set_title("King County data aggregated to H3 cells")
    # Proxy patches for the legend: plain outline = train, hatched = test.
    ax.legend(
        handles=[
            Patch(edgecolor=(0, 0, 0, 0.8), linewidth=0.1, facecolor=(0, 0, 0, 0)),
            Patch(
                edgecolor=(0, 0, 0, 0.8),
                linewidth=0.1,
                facecolor=(0, 0, 0, 0),
                hatch="+++",
            ),
        ],
        labels=["Train", "Test"],
        loc=2,
    )

    cx.add_basemap(
        ax, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11
    )
    # Fix: lay out the figure that owns `ax` instead of the stale `fig`
    # variable left over from the earlier cell.
    ax.figure.tight_layout()
    plt.show()
Load raw version of dataset
In [17]:
Copied!
# Reload with version="all". This rebinds `ds`, so the earlier train/test dict
# is no longer reachable through it. Per the outputs below, the result has a
# single 'train' entry and hskc_dataset.test_gdf is None afterwards.
ds = hskc_dataset.load(version="all")
ds.keys()
# Reload with version="all". This rebinds `ds`, so the earlier train/test dict
# is no longer reachable through it. Per the outputs below, the result has a
# single 'train' entry and hskc_dataset.test_gdf is None afterwards.
ds = hskc_dataset.load(version="all")
ds.keys()
Out[17]:
dict_keys(['train'])
In [18]:
Copied!
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
Out[18]:
(geopandas.geodataframe.GeoDataFrame, NoneType)
In [19]:
Copied!
ds["train"].head()
ds["train"].head()
Out[19]:
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | sqft_living15 | sqft_lot15 | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 1690 | 7639 | POINT (-122.319 47.721) |
| 3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | 5 | 7 | 1050 | 910 | 1965 | 0 | 98136 | 1360 | 5000 | POINT (-122.393 47.5208) |
| 4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 1800 | 7503 | POINT (-122.045 47.6168) |
| 6 | 1321400060 | 20140627T000000 | 257500.0 | 3 | 2.25 | 1715 | 6819 | 2.0 | 0 | 0 | 3 | 7 | 1715 | 0 | 1995 | 0 | 98003 | 2238 | 6819 | POINT (-122.327 47.3097) |
| 7 | 2008000270 | 20150115T000000 | 291850.0 | 3 | 1.50 | 1060 | 9711 | 1.0 | 0 | 0 | 3 | 7 | 1060 | 0 | 1963 | 0 | 98198 | 1650 | 9711 | POINT (-122.315 47.4095) |
Creating your own train-test split: bucket regression (works similarly for spatial regression)
In [20]:
Copied!
# Create a custom 80/20 split on the "price" column with a fixed random_state.
# The call prints a summary (H3 cell and point counts, per-bucket train/test
# ratios for buckets 0-9) and, per its own log line, replaces
# hskc_dataset.train_gdf / test_gdf with the new split.
# NOTE(review): n_bins=10 presumably controls how the target is bucketed for
# stratification, and resolution=8 the H3 cell size used to group points -
# confirm against the srai train_test_split documentation.
train, test = hskc_dataset.train_test_split(
target_column="price", test_size=0.2, resolution=8, n_bins=10, random_state=42
)
# Create a custom 80/20 split on the "price" column with a fixed random_state.
# The call prints a summary (H3 cell and point counts, per-bucket train/test
# ratios for buckets 0-9) and, per its own log line, replaces
# hskc_dataset.train_gdf / test_gdf with the new split.
# NOTE(review): n_bins=10 presumably controls how the target is bucketed for
# stratification, and resolution=8 the H3 cell size used to group points -
# confirm against the srai train_test_split documentation.
train, test = hskc_dataset.train_test_split(
target_column="price", test_size=0.2, resolution=8, n_bins=10, random_state=42
)
Summary of the split:
Train: 1514 H3 cells (13897 points)
Test: 690 H3 cells (3474 points)
Expected ratios: {'train': 0.8, 'validation': 0, 'test': 0.2}
Actual ratios: {'train': 0.8, 'test': 0.2}
Actual ratios difference: {'train': 0.0, 'test': 0.0}
bucket train_ratio test_ratio train_ratio_difference \
0 0 0.80000 0.20000 0.00000
1 1 0.79989 0.20011 0.00011
2 2 0.79988 0.20012 0.00012
3 3 0.79988 0.20012 0.00012
4 4 0.80011 0.19989 -0.00011
5 5 0.80012 0.19988 -0.00012
6 6 0.80012 0.19988 -0.00012
7 7 0.80023 0.19977 -0.00023
8 8 0.80000 0.20000 0.00000
9 9 0.79988 0.20012 0.00012
test_ratio_difference train_points test_points
0 0.00000 1436 359
1 -0.00011 1407 352
2 -0.00012 1327 332
3 -0.00012 1391 348
4 0.00011 1465 366
5 0.00012 1357 339
6 0.00012 1345 336
7 0.00023 1390 347
8 0.00000 1396 349
9 -0.00012 1383 346
Created new train_gdf and test_gdf. Train len: 13897,test len: 3474
In [21]:
Copied!
resolution = hskc_dataset.resolution
resolution = hskc_dataset.resolution
In [22]:
Copied!
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
Out[22]:
(geopandas.geodataframe.GeoDataFrame, geopandas.geodataframe.GeoDataFrame)
In [23]:
Copied!
train.head()
train.head()
Out[23]:
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | sqft_living15 | sqft_lot15 | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 1800 | 7503 | POINT (-122.045 47.6168) |
| 6 | 1321400060 | 20140627T000000 | 257500.0 | 3 | 2.25 | 1715 | 6819 | 2.0 | 0 | 0 | 3 | 7 | 1715 | 0 | 1995 | 0 | 98003 | 2238 | 6819 | POINT (-122.327 47.3097) |
| 7 | 2008000270 | 20150115T000000 | 291850.0 | 3 | 1.50 | 1060 | 9711 | 1.0 | 0 | 0 | 3 | 7 | 1060 | 0 | 1963 | 0 | 98198 | 1650 | 9711 | POINT (-122.315 47.4095) |
| 9 | 3793500160 | 20150312T000000 | 323000.0 | 3 | 2.50 | 1890 | 6560 | 2.0 | 0 | 0 | 3 | 7 | 1890 | 0 | 2003 | 0 | 98038 | 2390 | 7570 | POINT (-122.031 47.3684) |
| 10 | 1736800520 | 20150403T000000 | 662500.0 | 3 | 2.50 | 3560 | 9796 | 1.0 | 0 | 0 | 3 | 8 | 1860 | 1700 | 1965 | 0 | 98007 | 2210 | 8925 | POINT (-122.145 47.6007) |
In [24]:
Copied!
test.head()
test.head()
Out[24]:
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | sqft_living15 | sqft_lot15 | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 1690 | 7639 | POINT (-122.319 47.721) |
| 3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | 5 | 7 | 1050 | 910 | 1965 | 0 | 98136 | 1360 | 5000 | POINT (-122.393 47.5208) |
| 13 | 6054650070 | 20141007T000000 | 400000.0 | 3 | 1.75 | 1370 | 9680 | 1.0 | 0 | 0 | 4 | 7 | 1370 | 0 | 1977 | 0 | 98074 | 1370 | 10208 | POINT (-122.045 47.6127) |
| 24 | 3814700200 | 20141120T000000 | 329000.0 | 3 | 2.25 | 2450 | 6500 | 2.0 | 0 | 0 | 4 | 8 | 2450 | 0 | 1985 | 0 | 98030 | 2200 | 6865 | POINT (-122.172 47.3739) |
| 28 | 5101402488 | 20140624T000000 | 438000.0 | 3 | 1.75 | 1520 | 6380 | 1.0 | 0 | 0 | 3 | 7 | 790 | 730 | 1948 | 0 | 98115 | 1520 | 6235 | POINT (-122.304 47.695) |
In [ ]:
Copied!