House Sales in King County
House Sales in King County Dataset¶
This dataset contains house sale prices for King County, which includes Seattle, covering approximately 21,000 residential property sales recorded between May 2014 and May 2015. It provides geographic coordinates, physical property attributes (such as size, number of rooms, and condition), as well as contextual features like proximity to waterfronts.
In [1]:
Copied!
# plotting imports
import contextily as cx
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
# dataset import
from srai.datasets import HouseSalesInKingCountyDataset
# plotting imports
import contextily as cx
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
# dataset import
from srai.datasets import HouseSalesInKingCountyDataset
In [2]:
Copied!
# Create the dataset object. Nothing is loaded yet: the type check in the next
# cell reports NoneType for both train_gdf and test_gdf.
hskc_dataset = HouseSalesInKingCountyDataset()
# Create the dataset object. Nothing is loaded yet: the type check in the next
# cell reports NoneType for both train_gdf and test_gdf.
hskc_dataset = HouseSalesInKingCountyDataset()
In [3]:
Copied!
# Before load(): show the types of both split attributes (NoneType at this point).
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
# Before load(): show the types of both split attributes (NoneType at this point).
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
Out[3]:
(NoneType, NoneType)
Load default version of dataset
In [4]:
Copied!
# Load the default version of the dataset. The returned mapping is keyed by
# split name; the displayed keys are 'train' and 'test'.
ds = hskc_dataset.load()
ds.keys()
# Load the default version of the dataset. The returned mapping is keyed by
# split name; the displayed keys are 'train' and 'test'.
ds = hskc_dataset.load()
ds.keys()
Out[4]:
dict_keys(['train', 'test'])
In [5]:
Copied!
# After load(): both split attributes on the dataset object are now GeoDataFrames.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
# After load(): both split attributes on the dataset object are now GeoDataFrames.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
Out[5]:
(geopandas.geodataframe.GeoDataFrame, geopandas.geodataframe.GeoDataFrame)
In [6]:
Copied!
# Report the H3 resolution stored on the dataset object (8 for the default version).
print("Aggregation H3 resolution:", hskc_dataset.resolution)
# Report the H3 resolution stored on the dataset object (8 for the default version).
print("Aggregation H3 resolution:", hskc_dataset.resolution)
Aggregation H3 resolution: 8
In [7]:
Copied!
# Report the name of the target column ("price"); the plotting cells below
# index the GeoDataFrames with it.
print("Prediction target:", hskc_dataset.target)
# Report the name of the target column ("price"); the plotting cells below
# index the GeoDataFrames with it.
print("Prediction target:", hskc_dataset.target)
Prediction target: price
In [8]:
Copied!
# Pull both splits out of the mapping returned by load().
train_gdf, test_gdf = ds["train"], ds["test"]
# Pull both splits out of the mapping returned by load().
train_gdf, test_gdf = ds["train"], ds["test"]
In [9]:
Copied!
# Preview the first rows of the training split (attribute columns plus point geometry).
train_gdf.head()
# Preview the first rows of the training split (attribute columns plus point geometry).
train_gdf.head()
Out[9]:
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | sqft_living15 | sqft_lot15 | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 1690 | 7639 | POINT (-122.319 47.721) |
| 3 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 1800 | 7503 | POINT (-122.045 47.6168) |
| 5 | 1321400060 | 20140627T000000 | 257500.0 | 3 | 2.25 | 1715 | 6819 | 2.0 | 0 | 0 | 3 | 7 | 1715 | 0 | 1995 | 0 | 98003 | 2238 | 6819 | POINT (-122.327 47.3097) |
| 6 | 2008000270 | 20150115T000000 | 291850.0 | 3 | 1.50 | 1060 | 9711 | 1.0 | 0 | 0 | 3 | 7 | 1060 | 0 | 1963 | 0 | 98198 | 1650 | 9711 | POINT (-122.315 47.4095) |
| 8 | 3793500160 | 20150312T000000 | 323000.0 | 3 | 2.50 | 1890 | 6560 | 2.0 | 0 | 0 | 3 | 7 | 1890 | 0 | 2003 | 0 | 98038 | 2390 | 7570 | POINT (-122.031 47.3684) |
In [10]:
Copied!
# Number of rows in the test split.
len(test_gdf)
# Number of rows in the test split.
len(test_gdf)
Out[10]:
3461
In [11]:
Copied!
# Two stacked panels: the point map on top (3 parts of the height) and the
# target distribution below (1 part).
fig, axes = plt.subplots(
    nrows=2,
    ncols=1,
    sharex=False,
    sharey=False,
    figsize=(12, 13),
    height_ratios=[3, 1],
)
ax_map, ax_dist = axes[0], axes[1]

# Split sizes and their share of all points, reported in the map title.
train_points, test_points = len(train_gdf), len(test_gdf)
all_points = train_points + test_points
train_pct = 100 * train_points / all_points
test_pct = 100 * test_points / all_points

# (frame, colour, label) for each split, reused by both panels.
split_styles = (
    (train_gdf, "orange", "train"),
    (test_gdf, "royalblue", "test"),
)

# --- Top panel: sale locations on a basemap ---
for split_gdf, split_color, split_label in split_styles:
    split_gdf.plot(color=split_color, markersize=0.3, ax=ax_map, label=split_label)

map_title = (
    "King County data - points on a map"
    f" (Train: {train_points} ({train_pct:.2f}%),"
    f" Test: {test_points} ({test_pct:.2f}%))"
)
ax_map.set_title(map_title)

# The legend is built from explicit proxy markers, one per split colour.
legend_markers = [
    Line2D([], [], marker="o", color=marker_color, linestyle="None")
    for _split, marker_color, _name in split_styles
]
ax_map.legend(handles=legend_markers, labels=["Train", "Test"])

# The coordinates are passed to contextily as EPSG:4326.
cx.add_basemap(
    ax_map,
    source=cx.providers.CartoDB.PositronNoLabels,
    crs=4326,
    zoom=11,
)
ax_map.set_axis_off()

# --- Bottom panel: kernel density estimate of the target for each split ---
for split_gdf, split_color, split_label in split_styles:
    sns.kdeplot(
        x=split_gdf[hskc_dataset.target],
        label=split_label,
        color=split_color,
        ax=ax_dist,
        fill=False,
        cut=0,
    )
ax_dist.set_title("King County data - target distribution")
ax_dist.legend()

fig.tight_layout()
plt.show()
# Two stacked panels: the point map on top (3 parts of the height) and the
# target distribution below (1 part).
fig, axes = plt.subplots(
    nrows=2,
    ncols=1,
    sharex=False,
    sharey=False,
    figsize=(12, 13),
    height_ratios=[3, 1],
)
ax_map, ax_dist = axes[0], axes[1]

# Split sizes and their share of all points, reported in the map title.
train_points, test_points = len(train_gdf), len(test_gdf)
all_points = train_points + test_points
train_pct = 100 * train_points / all_points
test_pct = 100 * test_points / all_points

# (frame, colour, label) for each split, reused by both panels.
split_styles = (
    (train_gdf, "orange", "train"),
    (test_gdf, "royalblue", "test"),
)

# --- Top panel: sale locations on a basemap ---
for split_gdf, split_color, split_label in split_styles:
    split_gdf.plot(color=split_color, markersize=0.3, ax=ax_map, label=split_label)

map_title = (
    "King County data - points on a map"
    f" (Train: {train_points} ({train_pct:.2f}%),"
    f" Test: {test_points} ({test_pct:.2f}%))"
)
ax_map.set_title(map_title)

# The legend is built from explicit proxy markers, one per split colour.
legend_markers = [
    Line2D([], [], marker="o", color=marker_color, linestyle="None")
    for _split, marker_color, _name in split_styles
]
ax_map.legend(handles=legend_markers, labels=["Train", "Test"])

# The coordinates are passed to contextily as EPSG:4326.
cx.add_basemap(
    ax_map,
    source=cx.providers.CartoDB.PositronNoLabels,
    crs=4326,
    zoom=11,
)
ax_map.set_axis_off()

# --- Bottom panel: kernel density estimate of the target for each split ---
for split_gdf, split_color, split_label in split_styles:
    sns.kdeplot(
        x=split_gdf[hskc_dataset.target],
        label=split_label,
        color=split_color,
        ax=ax_dist,
        fill=False,
        cut=0,
    )
ax_dist.set_title("King County data - target distribution")
ax_dist.legend()

fig.tight_layout()
plt.show()
Getting the H3 cells with target values
In [12]:
Copied!
# Show the H3 resolution stored on the dataset object (8).
hskc_dataset.resolution
# Show the H3 resolution stored on the dataset object (8).
hskc_dataset.resolution
Out[12]:
8
In [13]:
Copied!
# Get the H3-cell frames for the train and test splits (indexed by region_id,
# with geometry and the target column, as the previews below show).
# The middle return value is discarded; presumably it is a validation split,
# which the default version does not provide (TODO confirm).
train_h3, _, test_h3 = hskc_dataset.get_h3_with_labels()
# Get the H3-cell frames for the train and test splits (indexed by region_id,
# with geometry and the target column, as the previews below show).
# The middle return value is discarded; presumably it is a validation split,
# which the default version does not provide (TODO confirm).
train_h3, _, test_h3 = hskc_dataset.get_h3_with_labels()
In [14]:
Copied!
# Preview the training H3 cells: one row per region_id with polygon geometry and price.
train_h3.head()
# Preview the training H3 cells: one row per region_id with polygon geometry and price.
train_h3.head()
Out[14]:
| geometry | price | |
|---|---|---|
| region_id | ||
| 8828d43581fffff | POLYGON ((-122.01239 47.36596, -122.00741 47.3... | 341230.000000 |
| 8828d43497fffff | POLYGON ((-122.11765 47.36418, -122.11268 47.3... | 308417.500000 |
| 8828d55283fffff | POLYGON ((-122.32989 47.49552, -122.32492 47.4... | 303056.250000 |
| 8828d55b17fffff | POLYGON ((-122.02268 47.57386, -122.01769 47.5... | 696500.000000 |
| 8828d55889fffff | POLYGON ((-122.11603 47.62192, -122.11104 47.6... | 507944.230769 |
In [15]:
Copied!
# Preview the test H3 cells: one row per region_id with polygon geometry and price.
test_h3.head()
# Preview the test H3 cells: one row per region_id with polygon geometry and price.
test_h3.head()
Out[15]:
| geometry | price | |
|---|---|---|
| region_id | ||
| 8828d4349dfffff | POLYGON ((-122.11288 47.35161, -122.10791 47.3... | 450000.0 |
| 8828d54e63fffff | POLYGON ((-122.2648 47.73244, -122.25981 47.73... | 827500.0 |
| 8828d55913fffff | POLYGON ((-122.01044 47.62377, -122.00545 47.6... | 698400.0 |
| 8828d436e1fffff | POLYGON ((-122.08904 47.28873, -122.08407 47.2... | 560000.0 |
| 8828d5ce33fffff | POLYGON ((-122.45898 47.39342, -122.45403 47.3... | 615000.0 |
In [16]:
Copied!
# Choropleth of the target aggregated to H3 cells. Train and test cells share
# one colour map; test cells additionally get a "+++" hatch overlay.
#
# The whole figure, including the final plt.show(), is built inside the
# rc_context so the thinner hatch line width is still active when the figure
# is rendered.
with plt.rc_context({"hatch.linewidth": 0.4}):
    # Train cells create the figure and the colour bar.
    ax = train_h3.plot(
        hskc_dataset.target,
        cmap="spring_r",
        legend=True,
        legend_kwds=dict(
            location="right", shrink=0.9, pad=0.02, label=hskc_dataset.target
        ),
        figsize=(15, 9),
        alpha=0.5,
    )
    # Hide the axis frame of THIS map. The previous code called
    # `axes[0].set_axis_off()`, which targets the axes array of an earlier
    # figure and fails with NameError on a kernel where that cell was not run.
    ax.set_axis_off()

    # Test cells: same colour scale, then a transparent hatched layer on top.
    test_h3.plot(hskc_dataset.target, cmap="spring_r", alpha=0.5, ax=ax)
    test_h3.plot(
        ax=ax,
        linewidth=0.4,
        color=(0, 0, 0, 0),
        edgecolor=(0, 0, 0, 0.4),
        hatch="+++",
    )

    ax.set_title("King County data aggregated to H3 cells")
    # Proxy patches for the legend: plain outline = train, hatched = test.
    ax.legend(
        handles=[
            Patch(edgecolor=(0, 0, 0, 0.8), linewidth=0.1, facecolor=(0, 0, 0, 0)),
            Patch(
                edgecolor=(0, 0, 0, 0.8),
                linewidth=0.1,
                facecolor=(0, 0, 0, 0),
                hatch="+++",
            ),
        ],
        labels=["Train", "Test"],
        loc=2,
    )

    cx.add_basemap(
        ax, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11
    )
    # Lay out the figure that owns `ax`, not a `fig` left over from another cell.
    ax.figure.tight_layout()
    plt.show()
# Choropleth of the target aggregated to H3 cells. Train and test cells share
# one colour map; test cells additionally get a "+++" hatch overlay.
#
# The whole figure, including the final plt.show(), is built inside the
# rc_context so the thinner hatch line width is still active when the figure
# is rendered.
with plt.rc_context({"hatch.linewidth": 0.4}):
    # Train cells create the figure and the colour bar.
    ax = train_h3.plot(
        hskc_dataset.target,
        cmap="spring_r",
        legend=True,
        legend_kwds=dict(
            location="right", shrink=0.9, pad=0.02, label=hskc_dataset.target
        ),
        figsize=(15, 9),
        alpha=0.5,
    )
    # Hide the axis frame of THIS map. The previous code called
    # `axes[0].set_axis_off()`, which targets the axes array of an earlier
    # figure and fails with NameError on a kernel where that cell was not run.
    ax.set_axis_off()

    # Test cells: same colour scale, then a transparent hatched layer on top.
    test_h3.plot(hskc_dataset.target, cmap="spring_r", alpha=0.5, ax=ax)
    test_h3.plot(
        ax=ax,
        linewidth=0.4,
        color=(0, 0, 0, 0),
        edgecolor=(0, 0, 0, 0.4),
        hatch="+++",
    )

    ax.set_title("King County data aggregated to H3 cells")
    # Proxy patches for the legend: plain outline = train, hatched = test.
    ax.legend(
        handles=[
            Patch(edgecolor=(0, 0, 0, 0.8), linewidth=0.1, facecolor=(0, 0, 0, 0)),
            Patch(
                edgecolor=(0, 0, 0, 0.8),
                linewidth=0.1,
                facecolor=(0, 0, 0, 0),
                hatch="+++",
            ),
        ],
        labels=["Train", "Test"],
        loc=2,
    )

    cx.add_basemap(
        ax, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11
    )
    # Lay out the figure that owns `ax`, not a `fig` left over from another cell.
    ax.figure.tight_layout()
    plt.show()
Load raw version of dataset
In [17]:
Copied!
# Load the unsplit ("all") version. The returned mapping only has a 'train'
# entry, and the next cell shows test_gdf on the dataset object is None again.
# Note: this rebinds `ds`, replacing the default-version mapping loaded earlier.
ds = hskc_dataset.load(version="all")
ds.keys()
# Load the unsplit ("all") version. The returned mapping only has a 'train'
# entry, and the next cell shows test_gdf on the dataset object is None again.
# Note: this rebinds `ds`, replacing the default-version mapping loaded earlier.
ds = hskc_dataset.load(version="all")
ds.keys()
Out[17]:
dict_keys(['train'])
In [18]:
Copied!
# After loading the "all" version: train_gdf is a GeoDataFrame, test_gdf is None.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
# After loading the "all" version: train_gdf is a GeoDataFrame, test_gdf is None.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
Out[18]:
(geopandas.geodataframe.GeoDataFrame, NoneType)
In [19]:
Copied!
# Preview the first rows of the unsplit data.
ds["train"].head()
# Preview the first rows of the unsplit data.
ds["train"].head()
Out[19]:
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | sqft_living15 | sqft_lot15 | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 1690 | 7639 | POINT (-122.319 47.721) |
| 3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | 5 | 7 | 1050 | 910 | 1965 | 0 | 98136 | 1360 | 5000 | POINT (-122.393 47.5208) |
| 4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 1800 | 7503 | POINT (-122.045 47.6168) |
| 6 | 1321400060 | 20140627T000000 | 257500.0 | 3 | 2.25 | 1715 | 6819 | 2.0 | 0 | 0 | 3 | 7 | 1715 | 0 | 1995 | 0 | 98003 | 2238 | 6819 | POINT (-122.327 47.3097) |
| 7 | 2008000270 | 20150115T000000 | 291850.0 | 3 | 1.50 | 1060 | 9711 | 1.0 | 0 | 0 | 3 | 7 | 1060 | 0 | 1963 | 0 | 98198 | 1650 | 9711 | POINT (-122.315 47.4095) |
Creating your own train - test split -> Bucket regression (works similarly for spatial regression)
In [20]:
Copied!
# Build a custom split of the loaded data: 20% test, target "price", H3
# resolution 8, 10 target bins, fixed random_state for a repeatable result.
# The call prints a summary with per-bucket train/test ratios and, per its own
# log line, also creates new train_gdf / test_gdf on the dataset object.
train, test = hskc_dataset.train_test_split(
target_column="price", test_size=0.2, resolution=8, n_bins=10, random_state=42
)
# Build a custom split of the loaded data: 20% test, target "price", H3
# resolution 8, 10 target bins, fixed random_state for a repeatable result.
# The call prints a summary with per-bucket train/test ratios and, per its own
# log line, also creates new train_gdf / test_gdf on the dataset object.
train, test = hskc_dataset.train_test_split(
target_column="price", test_size=0.2, resolution=8, n_bins=10, random_state=42
)
Summary of the split:
Train: 1514 H3 cells (13897 points)
Test: 690 H3 cells (3474 points)
Expected ratios: {'train': 0.8, 'validation': 0, 'test': 0.2}
Actual ratios: {'train': 0.8, 'test': 0.2}
Actual ratios difference: {'train': 0.0, 'test': 0.0}
bucket train_ratio test_ratio train_ratio_difference \
0 0 0.80000 0.20000 0.00000
1 1 0.79989 0.20011 0.00011
2 2 0.79988 0.20012 0.00012
3 3 0.79988 0.20012 0.00012
4 4 0.80011 0.19989 -0.00011
5 5 0.80012 0.19988 -0.00012
6 6 0.80012 0.19988 -0.00012
7 7 0.80023 0.19977 -0.00023
8 8 0.80000 0.20000 0.00000
9 9 0.79988 0.20012 0.00012
test_ratio_difference train_points test_points
0 0.00000 1436 359
1 -0.00011 1407 352
2 -0.00012 1327 332
3 -0.00012 1391 348
4 0.00011 1465 366
5 0.00012 1357 339
6 0.00012 1345 336
7 0.00023 1390 347
8 0.00000 1396 349
9 -0.00012 1383 346
Created new train_gdf and test_gdf. Train len: 13897,test len: 3474
In [21]:
Copied!
# Keep the dataset's H3 resolution in a variable (not read again in the cells shown here).
resolution = hskc_dataset.resolution
# Keep the dataset's H3 resolution in a variable (not read again in the cells shown here).
resolution = hskc_dataset.resolution
In [22]:
Copied!
# After train_test_split(): both split attributes are GeoDataFrames again.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
# After train_test_split(): both split attributes are GeoDataFrames again.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
Out[22]:
(geopandas.geodataframe.GeoDataFrame, geopandas.geodataframe.GeoDataFrame)
In [23]:
Copied!
# Preview the first rows of the custom training split.
train.head()
# Preview the first rows of the custom training split.
train.head()
Out[23]:
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | sqft_living15 | sqft_lot15 | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 1800 | 7503 | POINT (-122.045 47.6168) |
| 6 | 1321400060 | 20140627T000000 | 257500.0 | 3 | 2.25 | 1715 | 6819 | 2.0 | 0 | 0 | 3 | 7 | 1715 | 0 | 1995 | 0 | 98003 | 2238 | 6819 | POINT (-122.327 47.3097) |
| 7 | 2008000270 | 20150115T000000 | 291850.0 | 3 | 1.50 | 1060 | 9711 | 1.0 | 0 | 0 | 3 | 7 | 1060 | 0 | 1963 | 0 | 98198 | 1650 | 9711 | POINT (-122.315 47.4095) |
| 9 | 3793500160 | 20150312T000000 | 323000.0 | 3 | 2.50 | 1890 | 6560 | 2.0 | 0 | 0 | 3 | 7 | 1890 | 0 | 2003 | 0 | 98038 | 2390 | 7570 | POINT (-122.031 47.3684) |
| 10 | 1736800520 | 20150403T000000 | 662500.0 | 3 | 2.50 | 3560 | 9796 | 1.0 | 0 | 0 | 3 | 8 | 1860 | 1700 | 1965 | 0 | 98007 | 2210 | 8925 | POINT (-122.145 47.6007) |
In [24]:
Copied!
# Preview the first rows of the custom test split.
test.head()
# Preview the first rows of the custom test split.
test.head()
Out[24]:
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | sqft_living15 | sqft_lot15 | geometry | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 1690 | 7639 | POINT (-122.319 47.721) |
| 3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | 5 | 7 | 1050 | 910 | 1965 | 0 | 98136 | 1360 | 5000 | POINT (-122.393 47.5208) |
| 13 | 6054650070 | 20141007T000000 | 400000.0 | 3 | 1.75 | 1370 | 9680 | 1.0 | 0 | 0 | 4 | 7 | 1370 | 0 | 1977 | 0 | 98074 | 1370 | 10208 | POINT (-122.045 47.6127) |
| 24 | 3814700200 | 20141120T000000 | 329000.0 | 3 | 2.25 | 2450 | 6500 | 2.0 | 0 | 0 | 4 | 8 | 2450 | 0 | 1985 | 0 | 98030 | 2200 | 6865 | POINT (-122.172 47.3739) |
| 28 | 5101402488 | 20140624T000000 | 438000.0 | 3 | 1.75 | 1520 | 6380 | 1.0 | 0 | 0 | 3 | 7 | 790 | 730 | 1948 | 0 | 98115 | 1520 | 6235 | POINT (-122.304 47.695) |
In [ ]:
Copied!