House sales in King County
House Sales in King County Dataset
This dataset contains house sale prices for King County, which includes Seattle, covering approximately 21,000 residential property sales recorded between May 2014 and May 2015. It provides geographic coordinates, physical property attributes (such as size, number of rooms, and condition), as well as contextual features like proximity to waterfronts.
In [1]:
Copied!
# plotting imports
import contextily as cx
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
# dataset import
from srai.datasets import HouseSalesInKingCountyDataset
# plotting imports
import contextily as cx
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
# dataset import
from srai.datasets import HouseSalesInKingCountyDataset
In [2]:
Copied!
# Create the dataset object. Nothing is loaded at this point: train_gdf and
# test_gdf stay None until load() is called.
hskc_dataset = HouseSalesInKingCountyDataset()
hskc_dataset = HouseSalesInKingCountyDataset()
In [3]:
Copied!
# Before load(): both split attributes are still unset (None).
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
Out[3]:
(NoneType, NoneType)
Load default version of dataset
In [4]:
Copied!
# load() with default arguments returns a dict with "train" and "test" entries
# and also fills hskc_dataset.train_gdf / hskc_dataset.test_gdf.
ds = hskc_dataset.load()
ds.keys()
ds = hskc_dataset.load()
ds.keys()
Out[4]:
dict_keys(['train', 'test'])
In [5]:
Copied!
# After load(): both split attributes are now GeoDataFrames.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
Out[5]:
(geopandas.geodataframe.GeoDataFrame, geopandas.geodataframe.GeoDataFrame)
In [6]:
Copied!
# H3 resolution the dataset reports for aggregating points into cells.
print("Aggregation H3 resolution:", hskc_dataset.resolution)
print("Aggregation H3 resolution:", hskc_dataset.resolution)
Aggregation H3 resolution: 8
In [7]:
Copied!
# Name of the column the dataset treats as the prediction target; it is used
# below to select the column that gets plotted.
print("Prediction target:", hskc_dataset.target)
print("Prediction target:", hskc_dataset.target)
Prediction target: price
In [8]:
Copied!
# Pull the two splits out of the dict returned by load().
train_gdf, test_gdf = ds["train"], ds["test"]
train_gdf, test_gdf = ds["train"], ds["test"]
In [9]:
Copied!
# Preview the first rows of the train split (one row per sale, point geometry).
train_gdf.head()
train_gdf.head()
Out[9]:
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | sqft_living15 | sqft_lot15 | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 1690 | 7639 | POINT (-122.319 47.721) |
3 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 1800 | 7503 | POINT (-122.045 47.6168) |
5 | 1321400060 | 20140627T000000 | 257500.0 | 3 | 2.25 | 1715 | 6819 | 2.0 | 0 | 0 | 3 | 7 | 1715 | 0 | 1995 | 0 | 98003 | 2238 | 6819 | POINT (-122.327 47.3097) |
6 | 2008000270 | 20150115T000000 | 291850.0 | 3 | 1.50 | 1060 | 9711 | 1.0 | 0 | 0 | 3 | 7 | 1060 | 0 | 1963 | 0 | 98198 | 1650 | 9711 | POINT (-122.315 47.4095) |
8 | 3793500160 | 20150312T000000 | 323000.0 | 3 | 2.50 | 1890 | 6560 | 2.0 | 0 | 0 | 3 | 7 | 1890 | 0 | 2003 | 0 | 98038 | 2390 | 7570 | POINT (-122.031 47.3684) |
In [10]:
Copied!
# Number of rows in the test split.
len(test_gdf)
len(test_gdf)
Out[10]:
3461
In [11]:
Copied!
# Two stacked panels: a tall map of the sale locations (top) and a short
# density plot of the prediction target per split (bottom).
fig, axes = plt.subplots(
    nrows=2,
    ncols=1,
    sharex=False,
    sharey=False,
    figsize=(12, 13),
    height_ratios=[3, 1],
)
ax_map, ax_dist = axes

# Split sizes and their share of all points; both go into the map title.
train_points = len(train_gdf)
test_points = len(test_gdf)
total_points = train_points + test_points
train_pct = 100 * train_points / total_points
test_pct = 100 * test_points / total_points

# (label, frame, colour) for each split, in drawing order.
split_styles = [
    ("train", train_gdf, "orange"),
    ("test", test_gdf, "royalblue"),
]

# --- top panel: points on a basemap ---
for split_label, split_gdf, split_color in split_styles:
    split_gdf.plot(color=split_color, markersize=0.3, ax=ax_map, label=split_label)

map_title = (
    "King County data - points on a map"
    f" (Train: {train_points} ({train_pct:.2f}%),"
    f" Test: {test_points} ({test_pct:.2f}%))"
)
ax_map.set_title(map_title)

# Explicit legend handles are used instead of the plotted artists, which are
# drawn with markersize=0.3.
legend_markers = [
    Line2D([], [], marker="o", color=marker_color, linestyle="None")
    for _, _, marker_color in split_styles
]
ax_map.legend(handles=legend_markers, labels=["Train", "Test"])
cx.add_basemap(ax_map, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11)
ax_map.set_axis_off()

# --- bottom panel: target distribution per split ---
for split_label, split_gdf, split_color in split_styles:
    sns.kdeplot(
        x=split_gdf[hskc_dataset.target],
        label=split_label,
        color=split_color,
        ax=ax_dist,
        fill=False,
        cut=0,
    )
ax_dist.set_title("King County data - target distribution")
ax_dist.legend()

fig.tight_layout()
plt.show()
# Two stacked panels: a tall map of the sale locations (top) and a short
# density plot of the prediction target per split (bottom).
fig, axes = plt.subplots(
    nrows=2,
    ncols=1,
    sharex=False,
    sharey=False,
    figsize=(12, 13),
    height_ratios=[3, 1],
)
ax_map, ax_dist = axes

# Split sizes and their share of all points; both go into the map title.
train_points = len(train_gdf)
test_points = len(test_gdf)
total_points = train_points + test_points
train_pct = 100 * train_points / total_points
test_pct = 100 * test_points / total_points

# (label, frame, colour) for each split, in drawing order.
split_styles = [
    ("train", train_gdf, "orange"),
    ("test", test_gdf, "royalblue"),
]

# --- top panel: points on a basemap ---
for split_label, split_gdf, split_color in split_styles:
    split_gdf.plot(color=split_color, markersize=0.3, ax=ax_map, label=split_label)

map_title = (
    "King County data - points on a map"
    f" (Train: {train_points} ({train_pct:.2f}%),"
    f" Test: {test_points} ({test_pct:.2f}%))"
)
ax_map.set_title(map_title)

# Explicit legend handles are used instead of the plotted artists, which are
# drawn with markersize=0.3.
legend_markers = [
    Line2D([], [], marker="o", color=marker_color, linestyle="None")
    for _, _, marker_color in split_styles
]
ax_map.legend(handles=legend_markers, labels=["Train", "Test"])
cx.add_basemap(ax_map, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11)
ax_map.set_axis_off()

# --- bottom panel: target distribution per split ---
for split_label, split_gdf, split_color in split_styles:
    sns.kdeplot(
        x=split_gdf[hskc_dataset.target],
        label=split_label,
        color=split_color,
        ax=ax_dist,
        fill=False,
        cut=0,
    )
ax_dist.set_title("King County data - target distribution")
ax_dist.legend()

fig.tight_layout()
plt.show()
Getting the H3 cells with target values
In [12]:
Copied!
# Show the dataset's H3 resolution again (same value as printed earlier).
hskc_dataset.resolution
hskc_dataset.resolution
Out[12]:
8
In [13]:
Copied!
# get_h3_with_labels() returns three values; only the first (train) and last
# (test) are kept here.
# NOTE(review): the discarded middle value is presumably the validation split,
# which this notebook does not create - confirm against the srai API docs.
train_h3, _, test_h3 = hskc_dataset.get_h3_with_labels()
train_h3, _, test_h3 = hskc_dataset.get_h3_with_labels()
In [14]:
Copied!
# Preview the train H3 cells: indexed by region_id, with the cell polygon and
# the target value for that cell.
train_h3.head()
train_h3.head()
Out[14]:
geometry | price | |
---|---|---|
region_id | ||
8828d5425bfffff | POLYGON ((-122.31256 47.58042, -122.30759 47.5... | 404505.277778 |
8828d548c1fffff | POLYGON ((-122.12101 47.74922, -122.11601 47.7... | 559166.666667 |
8828d54ed5fffff | POLYGON ((-122.35688 47.76018, -122.3519 47.76... | 372116.666667 |
8828d55069fffff | POLYGON ((-122.21643 47.49248, -122.21145 47.4... | 264250.000000 |
8828d55351fffff | POLYGON ((-122.21022 47.45992, -122.20525 47.4... | 331714.285714 |
In [15]:
Copied!
# Preview the test H3 cells (same layout as the train cells).
test_h3.head()
test_h3.head()
Out[15]:
geometry | price | |
---|---|---|
region_id | ||
8828d54a51fffff | POLYGON ((-122.11543 47.66936, -122.11043 47.6... | 513666.666667 |
8828d54911fffff | POLYGON ((-122.01186 47.75857, -122.00685 47.7... | 760500.000000 |
8828d42aa3fffff | POLYGON ((-121.72716 47.48638, -121.72215 47.4... | 688000.000000 |
8828d426e1fffff | POLYGON ((-122.09873 47.42921, -122.09375 47.4... | 420500.000000 |
8828d42459fffff | POLYGON ((-122.07563 47.43407, -122.07065 47.4... | 405000.000000 |
In [16]:
Copied!
# Choropleth of the target aggregated to H3 cells. Train cells are drawn plain;
# test cells get the same colouring plus a "+++" hatch overlay.
target_column = hskc_dataset.target

# Share one colour scale between both splits so that the colourbar (built from
# the train layer) is also valid for the test cells. Without vmin/vmax each
# .plot() call normalises colours to its own frame's value range.
color_min = min(train_h3[target_column].min(), test_h3[target_column].min())
color_max = max(train_h3[target_column].max(), test_h3[target_column].max())

with plt.rc_context({"hatch.linewidth": 0.4}):
    ax = train_h3.plot(
        target_column,
        cmap="spring_r",
        vmin=color_min,
        vmax=color_max,
        legend=True,
        legend_kwds=dict(
            location="right", shrink=0.9, pad=0.02, label=target_column
        ),
        figsize=(15, 9),
        alpha=0.5,
    )
    # Fixed: this was `axes[0].set_axis_off()`, which acted on the axes of the
    # earlier train/test overview figure instead of the map created here.
    ax.set_axis_off()

    # Test cells: first the colour fill, then a transparent layer that only
    # contributes the hatch pattern and cell outline.
    test_h3.plot(
        target_column,
        cmap="spring_r",
        vmin=color_min,
        vmax=color_max,
        alpha=0.5,
        ax=ax,
    )
    test_h3.plot(
        ax=ax, linewidth=0.4, color=(0, 0, 0, 0), edgecolor=(0, 0, 0, 0.4), hatch="+++"
    )

    ax.set_title("King County data aggregated to H3 cells")
    ax.legend(
        handles=[
            Patch(edgecolor=(0, 0, 0, 0.8), linewidth=0.1, facecolor=(0, 0, 0, 0)),
            Patch(
                edgecolor=(0, 0, 0, 0.8),
                linewidth=0.1,
                facecolor=(0, 0, 0, 0),
                hatch="+++",
            ),
        ],
        labels=["Train", "Test"],
        loc=2,
    )
    cx.add_basemap(ax, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11)

    # Fixed: this was `fig.tight_layout()`, where `fig` was the figure of the
    # earlier cell; lay out the figure that owns `ax` instead.
    ax.figure.tight_layout()
    plt.show()
# Choropleth of the target aggregated to H3 cells. Train cells are drawn plain;
# test cells get the same colouring plus a "+++" hatch overlay.
target_column = hskc_dataset.target

# Share one colour scale between both splits so that the colourbar (built from
# the train layer) is also valid for the test cells. Without vmin/vmax each
# .plot() call normalises colours to its own frame's value range.
color_min = min(train_h3[target_column].min(), test_h3[target_column].min())
color_max = max(train_h3[target_column].max(), test_h3[target_column].max())

with plt.rc_context({"hatch.linewidth": 0.4}):
    ax = train_h3.plot(
        target_column,
        cmap="spring_r",
        vmin=color_min,
        vmax=color_max,
        legend=True,
        legend_kwds=dict(
            location="right", shrink=0.9, pad=0.02, label=target_column
        ),
        figsize=(15, 9),
        alpha=0.5,
    )
    # Fixed: this was `axes[0].set_axis_off()`, which acted on the axes of the
    # earlier train/test overview figure instead of the map created here.
    ax.set_axis_off()

    # Test cells: first the colour fill, then a transparent layer that only
    # contributes the hatch pattern and cell outline.
    test_h3.plot(
        target_column,
        cmap="spring_r",
        vmin=color_min,
        vmax=color_max,
        alpha=0.5,
        ax=ax,
    )
    test_h3.plot(
        ax=ax, linewidth=0.4, color=(0, 0, 0, 0), edgecolor=(0, 0, 0, 0.4), hatch="+++"
    )

    ax.set_title("King County data aggregated to H3 cells")
    ax.legend(
        handles=[
            Patch(edgecolor=(0, 0, 0, 0.8), linewidth=0.1, facecolor=(0, 0, 0, 0)),
            Patch(
                edgecolor=(0, 0, 0, 0.8),
                linewidth=0.1,
                facecolor=(0, 0, 0, 0),
                hatch="+++",
            ),
        ],
        labels=["Train", "Test"],
        loc=2,
    )
    cx.add_basemap(ax, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11)

    # Fixed: this was `fig.tight_layout()`, where `fig` was the figure of the
    # earlier cell; lay out the figure that owns `ax` instead.
    ax.figure.tight_layout()
    plt.show()
Load raw version of dataset
In [17]:
Copied!
# Load the unsplit version of the dataset: the returned dict only has a
# "train" entry.
# NOTE: this rebinds `ds`, so earlier code that reads ds["test"] would fail if
# re-run after this cell.
ds = hskc_dataset.load(version="all")
ds.keys()
ds = hskc_dataset.load(version="all")
ds.keys()
Out[17]:
dict_keys(['train'])
In [18]:
Copied!
# After load(version="all"): train_gdf holds the data and test_gdf is None again.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
Out[18]:
(geopandas.geodataframe.GeoDataFrame, NoneType)
In [19]:
Copied!
# Preview the first rows of the unsplit data (stored under the "train" key).
ds["train"].head()
ds["train"].head()
Out[19]:
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | sqft_living15 | sqft_lot15 | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 1690 | 7639 | POINT (-122.319 47.721) |
3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | 5 | 7 | 1050 | 910 | 1965 | 0 | 98136 | 1360 | 5000 | POINT (-122.393 47.5208) |
4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 1800 | 7503 | POINT (-122.045 47.6168) |
6 | 1321400060 | 20140627T000000 | 257500.0 | 3 | 2.25 | 1715 | 6819 | 2.0 | 0 | 0 | 3 | 7 | 1715 | 0 | 1995 | 0 | 98003 | 2238 | 6819 | POINT (-122.327 47.3097) |
7 | 2008000270 | 20150115T000000 | 291850.0 | 3 | 1.50 | 1060 | 9711 | 1.0 | 0 | 0 | 3 | 7 | 1060 | 0 | 1963 | 0 | 98198 | 1650 | 9711 | POINT (-122.315 47.4095) |
Creating your own train-test split: bucket regression (works similarly for spatial regression)
In [20]:
Copied!
# Build a custom split from the currently loaded data. The call prints a
# summary (H3 cells and points per split, expected vs. actual ratios, and a
# per-bucket table) and replaces hskc_dataset.train_gdf / test_gdf with the
# new frames, which are also returned.
# test_size=0.2 is the requested test share; n_bins=10 matches the ten buckets
# (0-9) listed in the printed summary; random_state fixes the seed.
train, test = hskc_dataset.train_test_split(
target_column="price", test_size=0.2, resolution=8, n_bins=10, random_state=42
)
train, test = hskc_dataset.train_test_split(
target_column="price", test_size=0.2, resolution=8, n_bins=10, random_state=42
)
Summary of the split: Train: 1514 H3 cells (13897 points) Test: 690 H3 cells (3474 points) Expected ratios: {'train': 0.8, 'validation': 0, 'test': 0.2} Actual ratios: {'train': 0.8, 'test': 0.2} Actual ratios difference: {'train': 0.0, 'test': 0.0} bucket train_ratio test_ratio train_ratio_difference \ 0 0 0.80000 0.20000 0.00000 1 1 0.79989 0.20011 0.00011 2 2 0.79988 0.20012 0.00012 3 3 0.79988 0.20012 0.00012 4 4 0.80011 0.19989 -0.00011 5 5 0.80012 0.19988 -0.00012 6 6 0.80012 0.19988 -0.00012 7 7 0.80023 0.19977 -0.00023 8 8 0.80000 0.20000 0.00000 9 9 0.79988 0.20012 0.00012 test_ratio_difference train_points test_points 0 0.00000 1436 359 1 -0.00011 1407 352 2 -0.00012 1327 332 3 -0.00012 1391 348 4 0.00011 1465 366 5 0.00012 1357 339 6 0.00012 1345 336 7 0.00023 1390 347 8 0.00000 1396 349 9 -0.00012 1383 346 Created new train_gdf and test_gdf. Train len: 13897,test len: 3474
In [21]:
Copied!
# Keep the dataset's H3 resolution in a local variable (not used again in the
# cells visible here).
resolution = hskc_dataset.resolution
resolution = hskc_dataset.resolution
In [22]:
Copied!
# After train_test_split(): both split attributes are GeoDataFrames again.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
Out[22]:
(geopandas.geodataframe.GeoDataFrame, geopandas.geodataframe.GeoDataFrame)
In [23]:
Copied!
# Preview the first rows of the newly created train split.
train.head()
train.head()
Out[23]:
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | sqft_living15 | sqft_lot15 | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 1800 | 7503 | POINT (-122.045 47.6168) |
6 | 1321400060 | 20140627T000000 | 257500.0 | 3 | 2.25 | 1715 | 6819 | 2.0 | 0 | 0 | 3 | 7 | 1715 | 0 | 1995 | 0 | 98003 | 2238 | 6819 | POINT (-122.327 47.3097) |
7 | 2008000270 | 20150115T000000 | 291850.0 | 3 | 1.50 | 1060 | 9711 | 1.0 | 0 | 0 | 3 | 7 | 1060 | 0 | 1963 | 0 | 98198 | 1650 | 9711 | POINT (-122.315 47.4095) |
9 | 3793500160 | 20150312T000000 | 323000.0 | 3 | 2.50 | 1890 | 6560 | 2.0 | 0 | 0 | 3 | 7 | 1890 | 0 | 2003 | 0 | 98038 | 2390 | 7570 | POINT (-122.031 47.3684) |
10 | 1736800520 | 20150403T000000 | 662500.0 | 3 | 2.50 | 3560 | 9796 | 1.0 | 0 | 0 | 3 | 8 | 1860 | 1700 | 1965 | 0 | 98007 | 2210 | 8925 | POINT (-122.145 47.6007) |
In [24]:
Copied!
# Preview the first rows of the newly created test split.
test.head()
test.head()
Out[24]:
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | sqft_living15 | sqft_lot15 | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 1690 | 7639 | POINT (-122.319 47.721) |
3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | 5 | 7 | 1050 | 910 | 1965 | 0 | 98136 | 1360 | 5000 | POINT (-122.393 47.5208) |
13 | 6054650070 | 20141007T000000 | 400000.0 | 3 | 1.75 | 1370 | 9680 | 1.0 | 0 | 0 | 4 | 7 | 1370 | 0 | 1977 | 0 | 98074 | 1370 | 10208 | POINT (-122.045 47.6127) |
24 | 3814700200 | 20141120T000000 | 329000.0 | 3 | 2.25 | 2450 | 6500 | 2.0 | 0 | 0 | 4 | 8 | 2450 | 0 | 1985 | 0 | 98030 | 2200 | 6865 | POINT (-122.172 47.3739) |
28 | 5101402488 | 20140624T000000 | 438000.0 | 3 | 1.75 | 1520 | 6380 | 1.0 | 0 | 0 | 3 | 7 | 790 | 730 | 1948 | 0 | 98115 | 1520 | 6235 | POINT (-122.304 47.695) |
In [ ]:
Copied!