House Sales in King County
House Sales in King County Dataset¶
This dataset contains house sale prices for King County, which includes Seattle, covering approximately 21,000 residential property sales recorded between May 2014 and May 2015. It provides geographic coordinates, physical property attributes (such as size, number of rooms, and condition), as well as contextual features like proximity to waterfronts.
In [1]:
Copied!
# plotting imports
import contextily as cx
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
# dataset import
from srai.datasets import HouseSalesInKingCountyDataset
# plotting imports
import contextily as cx
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
# dataset import
from srai.datasets import HouseSalesInKingCountyDataset
In [2]:
Copied!
# Create the King County house-sales dataset wrapper.
# Its train_gdf / test_gdf attributes are still None until load() is called.
hskc_dataset = HouseSalesInKingCountyDataset()
# Create the King County house-sales dataset wrapper.
# Its train_gdf / test_gdf attributes are still None until load() is called.
hskc_dataset = HouseSalesInKingCountyDataset()
In [3]:
Copied!
# Inspect the split attributes before any data has been loaded.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
# Inspect the split attributes before any data has been loaded.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
Out[3]:
(NoneType, NoneType)
Load default version of dataset
In [4]:
Copied!
# Load the default dataset version; the result is a dict keyed by split name.
ds = hskc_dataset.load()
ds.keys()
# Load the default dataset version; the result is a dict keyed by split name.
ds = hskc_dataset.load()
ds.keys()
Out[4]:
dict_keys(['train', 'test'])
In [5]:
Copied!
# After load(), check what the dataset object now holds for each split.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
# After load(), check what the dataset object now holds for each split.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
Out[5]:
(geopandas.geodataframe.GeoDataFrame, geopandas.geodataframe.GeoDataFrame)
In [6]:
Copied!
# Show the H3 resolution stored on the dataset object.
print("Aggregation H3 resolution:", hskc_dataset.resolution)
# Show the H3 resolution stored on the dataset object.
print("Aggregation H3 resolution:", hskc_dataset.resolution)
Aggregation H3 resolution: 8
In [7]:
Copied!
# Show the name of the column the dataset uses as prediction target.
print("Prediction target:", hskc_dataset.target)
# Show the name of the column the dataset uses as prediction target.
print("Prediction target:", hskc_dataset.target)
Prediction target: price
In [8]:
Copied!
# Pull the two splits out of the dict returned by load().
train_gdf, test_gdf = ds["train"], ds["test"]
# Pull the two splits out of the dict returned by load().
train_gdf, test_gdf = ds["train"], ds["test"]
In [9]:
Copied!
# Preview the first rows of the training split.
train_gdf.head()
# Preview the first rows of the training split.
train_gdf.head()
Out[9]:
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | sqft_living15 | sqft_lot15 | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 1690 | 7639 | POINT (-122.319 47.721) |
3 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 1800 | 7503 | POINT (-122.045 47.6168) |
5 | 1321400060 | 20140627T000000 | 257500.0 | 3 | 2.25 | 1715 | 6819 | 2.0 | 0 | 0 | 3 | 7 | 1715 | 0 | 1995 | 0 | 98003 | 2238 | 6819 | POINT (-122.327 47.3097) |
6 | 2008000270 | 20150115T000000 | 291850.0 | 3 | 1.50 | 1060 | 9711 | 1.0 | 0 | 0 | 3 | 7 | 1060 | 0 | 1963 | 0 | 98198 | 1650 | 9711 | POINT (-122.315 47.4095) |
8 | 3793500160 | 20150312T000000 | 323000.0 | 3 | 2.50 | 1890 | 6560 | 2.0 | 0 | 0 | 3 | 7 | 1890 | 0 | 2003 | 0 | 98038 | 2390 | 7570 | POINT (-122.031 47.3684) |
In [10]:
Copied!
# Number of rows in the test split.
len(test_gdf)
# Number of rows in the test split.
len(test_gdf)
Out[10]:
3461
In [11]:
Copied!
# Two-panel overview figure: train/test points on a basemap (top) and the
# target distribution of both splits (bottom).
fig, axes = plt.subplots(
    nrows=2,
    ncols=1,
    sharex=False,
    sharey=False,
    figsize=(12, 13),
    height_ratios=[3, 1],
)
ax_map, ax_dist = axes

# One (label, frame, colour) entry per split, reused by both panels.
split_styles = [
    ("train", train_gdf, "orange"),
    ("test", test_gdf, "royalblue"),
]

# Share of points in each split, shown in the map title.
train_points, test_points = len(train_gdf), len(test_gdf)
total_points = train_points + test_points
train_pct = 100 * train_points / total_points
test_pct = 100 * test_points / total_points

# --- top panel: point locations -------------------------------------------
for split_name, split_gdf, split_color in split_styles:
    split_gdf.plot(color=split_color, markersize=0.3, ax=ax_map, label=split_name)

map_title = (
    "King County data - points on a map"
    f" (Train: {train_points} ({train_pct:.2f}%),"
    f" Test: {test_points} ({test_pct:.2f}%))"
)
ax_map.set_title(map_title)

# Explicit proxy handles are passed to the legend instead of the plotted
# point collections (which use markersize=0.3).
legend_markers = [
    Line2D([], [], marker="o", color=marker_color, linestyle="None")
    for _, _, marker_color in split_styles
]
ax_map.legend(handles=legend_markers, labels=["Train", "Test"])

cx.add_basemap(ax_map, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11)
ax_map.set_axis_off()

# --- bottom panel: target distribution per split --------------------------
target_column = hskc_dataset.target
for split_name, split_gdf, split_color in split_styles:
    sns.kdeplot(
        x=split_gdf[target_column],
        label=split_name,
        color=split_color,
        ax=ax_dist,
        fill=False,
        cut=0,
    )
ax_dist.set_title("King County data - target distribution")
ax_dist.legend()

fig.tight_layout()
plt.show()
# Two-panel overview figure: train/test points on a basemap (top) and the
# target distribution of both splits (bottom).
fig, axes = plt.subplots(
    nrows=2,
    ncols=1,
    sharex=False,
    sharey=False,
    figsize=(12, 13),
    height_ratios=[3, 1],
)
ax_map, ax_dist = axes

# One (label, frame, colour) entry per split, reused by both panels.
split_styles = [
    ("train", train_gdf, "orange"),
    ("test", test_gdf, "royalblue"),
]

# Share of points in each split, shown in the map title.
train_points, test_points = len(train_gdf), len(test_gdf)
total_points = train_points + test_points
train_pct = 100 * train_points / total_points
test_pct = 100 * test_points / total_points

# --- top panel: point locations -------------------------------------------
for split_name, split_gdf, split_color in split_styles:
    split_gdf.plot(color=split_color, markersize=0.3, ax=ax_map, label=split_name)

map_title = (
    "King County data - points on a map"
    f" (Train: {train_points} ({train_pct:.2f}%),"
    f" Test: {test_points} ({test_pct:.2f}%))"
)
ax_map.set_title(map_title)

# Explicit proxy handles are passed to the legend instead of the plotted
# point collections (which use markersize=0.3).
legend_markers = [
    Line2D([], [], marker="o", color=marker_color, linestyle="None")
    for _, _, marker_color in split_styles
]
ax_map.legend(handles=legend_markers, labels=["Train", "Test"])

cx.add_basemap(ax_map, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11)
ax_map.set_axis_off()

# --- bottom panel: target distribution per split --------------------------
target_column = hskc_dataset.target
for split_name, split_gdf, split_color in split_styles:
    sns.kdeplot(
        x=split_gdf[target_column],
        label=split_name,
        color=split_color,
        ax=ax_dist,
        fill=False,
        cut=0,
    )
ax_dist.set_title("King County data - target distribution")
ax_dist.legend()

fig.tight_layout()
plt.show()
Getting the H3 cells with target values
In [12]:
Copied!
# Display the H3 resolution stored on the dataset object.
hskc_dataset.resolution
# Display the H3 resolution stored on the dataset object.
hskc_dataset.resolution
Out[12]:
8
In [13]:
Copied!
# Get the H3-aggregated frames with target values for train and test.
# The middle return value is discarded (presumably a validation split,
# which is not used in this notebook - TODO confirm).
train_h3, _, test_h3 = hskc_dataset.get_h3_with_labels()
# Get the H3-aggregated frames with target values for train and test.
# The middle return value is discarded (presumably a validation split,
# which is not used in this notebook - TODO confirm).
train_h3, _, test_h3 = hskc_dataset.get_h3_with_labels()
In [14]:
Copied!
# Preview the H3-aggregated training cells.
train_h3.head()
# Preview the H3-aggregated training cells.
train_h3.head()
Out[14]:
geometry | price | |
---|---|---|
region_id | ||
8828d5cb37fffff | POLYGON ((-122.17195 47.35951, -122.16698 47.3... | 304992.857143 |
8828d541a1fffff | POLYGON ((-122.20069 47.71221, -122.1957 47.71... | 415644.214286 |
8828d54e35fffff | POLYGON ((-122.27443 47.7574, -122.26944 47.76... | 510192.307692 |
8828d4264bfffff | POLYGON ((-122.07965 47.37897, -122.07467 47.3... | 332400.000000 |
8828d55863fffff | POLYGON ((-122.03563 47.59145, -122.03064 47.5... | 658566.428571 |
In [15]:
Copied!
# Preview the H3-aggregated test cells.
test_h3.head()
# Preview the H3-aggregated test cells.
test_h3.head()
Out[15]:
geometry | price | |
---|---|---|
region_id | ||
8828d40667fffff | POLYGON ((-121.63923 47.1514, -121.63423 47.15... | 380000.000000 |
8828d096cbfffff | POLYGON ((-121.96069 47.64092, -121.95569 47.6... | 560000.000000 |
8828d42595fffff | POLYGON ((-122.03831 47.51644, -122.03332 47.5... | 496662.500000 |
8828d5d887fffff | POLYGON ((-122.27476 47.2698, -122.2698 47.273... | 314500.000000 |
8828d54f51fffff | POLYGON ((-122.23007 47.73977, -122.22508 47.7... | 440601.923077 |
In [16]:
Copied!
# Choropleth of the H3-aggregated target: train cells plain, test cells
# overlaid with a "+++" hatch so both splits can be told apart on one map.
#
# The whole cell stays inside rc_context so the thinner hatch line width is
# still active when the figure is shown (assumed to matter at render time -
# confirm against the matplotlib version in use).
with plt.rc_context({"hatch.linewidth": 0.4}):
    # GeoDataFrame.plot creates a new figure here and returns its Axes.
    ax = train_h3.plot(
        hskc_dataset.target,
        cmap="spring_r",
        legend=True,
        legend_kwds=dict(
            location="right", shrink=0.9, pad=0.02, label=hskc_dataset.target
        ),
        figsize=(15, 9),
        alpha=0.5,
    )
    # Fix: hide the axis frame of THIS map. The previous code called
    # axes[0].set_axis_off(), which touched the axes of an earlier figure
    # and left this one with its tick frame.
    ax.set_axis_off()

    # NOTE(review): train and test are coloured by separate plot calls with no
    # shared vmin/vmax, so the colourbar (drawn for train) may not match the
    # test cells' colours exactly - confirm whether a shared scale is wanted.
    test_h3.plot(hskc_dataset.target, cmap="spring_r", alpha=0.5, ax=ax)
    # Transparent fill + hatch: marks test cells without hiding their colour.
    test_h3.plot(
        ax=ax, linewidth=0.4, color=(0, 0, 0, 0), edgecolor=(0, 0, 0, 0.4), hatch="+++"
    )

    ax.set_title("King County data aggregated to H3 cells")
    # Proxy patches: empty box for train, hatched box for test.
    ax.legend(
        handles=[
            Patch(edgecolor=(0, 0, 0, 0.8), linewidth=0.1, facecolor=(0, 0, 0, 0)),
            Patch(
                edgecolor=(0, 0, 0, 0.8),
                linewidth=0.1,
                facecolor=(0, 0, 0, 0),
                hatch="+++",
            ),
        ],
        labels=["Train", "Test"],
        loc=2,
    )

    cx.add_basemap(ax, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11)
    # Fix: lay out the figure that owns `ax`, not a `fig` variable left over
    # from a previous cell.
    ax.figure.tight_layout()
    plt.show()
# Choropleth of the H3-aggregated target: train cells plain, test cells
# overlaid with a "+++" hatch so both splits can be told apart on one map.
#
# The whole cell stays inside rc_context so the thinner hatch line width is
# still active when the figure is shown (assumed to matter at render time -
# confirm against the matplotlib version in use).
with plt.rc_context({"hatch.linewidth": 0.4}):
    # GeoDataFrame.plot creates a new figure here and returns its Axes.
    ax = train_h3.plot(
        hskc_dataset.target,
        cmap="spring_r",
        legend=True,
        legend_kwds=dict(
            location="right", shrink=0.9, pad=0.02, label=hskc_dataset.target
        ),
        figsize=(15, 9),
        alpha=0.5,
    )
    # Fix: hide the axis frame of THIS map. The previous code called
    # axes[0].set_axis_off(), which touched the axes of an earlier figure
    # and left this one with its tick frame.
    ax.set_axis_off()

    # NOTE(review): train and test are coloured by separate plot calls with no
    # shared vmin/vmax, so the colourbar (drawn for train) may not match the
    # test cells' colours exactly - confirm whether a shared scale is wanted.
    test_h3.plot(hskc_dataset.target, cmap="spring_r", alpha=0.5, ax=ax)
    # Transparent fill + hatch: marks test cells without hiding their colour.
    test_h3.plot(
        ax=ax, linewidth=0.4, color=(0, 0, 0, 0), edgecolor=(0, 0, 0, 0.4), hatch="+++"
    )

    ax.set_title("King County data aggregated to H3 cells")
    # Proxy patches: empty box for train, hatched box for test.
    ax.legend(
        handles=[
            Patch(edgecolor=(0, 0, 0, 0.8), linewidth=0.1, facecolor=(0, 0, 0, 0)),
            Patch(
                edgecolor=(0, 0, 0, 0.8),
                linewidth=0.1,
                facecolor=(0, 0, 0, 0),
                hatch="+++",
            ),
        ],
        labels=["Train", "Test"],
        loc=2,
    )

    cx.add_basemap(ax, source=cx.providers.CartoDB.PositronNoLabels, crs=4326, zoom=11)
    # Fix: lay out the figure that owns `ax`, not a `fig` variable left over
    # from a previous cell.
    ax.figure.tight_layout()
    plt.show()
Load raw version of dataset
In [17]:
Copied!
# Reload with version="all"; this rebinds `ds`, replacing the default-version
# dict loaded earlier.
ds = hskc_dataset.load(version="all")
ds.keys()
# Reload with version="all"; this rebinds `ds`, replacing the default-version
# dict loaded earlier.
ds = hskc_dataset.load(version="all")
ds.keys()
Out[17]:
dict_keys(['train'])
In [18]:
Copied!
# Check which split attributes are populated after loading version="all".
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
# Check which split attributes are populated after loading version="all".
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
Out[18]:
(geopandas.geodataframe.GeoDataFrame, NoneType)
In [19]:
Copied!
# Preview the first rows of the "train" entry returned for version="all".
ds["train"].head()
# Preview the first rows of the "train" entry returned for version="all".
ds["train"].head()
Out[19]:
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | sqft_living15 | sqft_lot15 | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 1690 | 7639 | POINT (-122.319 47.721) |
3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | 5 | 7 | 1050 | 910 | 1965 | 0 | 98136 | 1360 | 5000 | POINT (-122.393 47.5208) |
4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 1800 | 7503 | POINT (-122.045 47.6168) |
6 | 1321400060 | 20140627T000000 | 257500.0 | 3 | 2.25 | 1715 | 6819 | 2.0 | 0 | 0 | 3 | 7 | 1715 | 0 | 1995 | 0 | 98003 | 2238 | 6819 | POINT (-122.327 47.3097) |
7 | 2008000270 | 20150115T000000 | 291850.0 | 3 | 1.50 | 1060 | 9711 | 1.0 | 0 | 0 | 3 | 7 | 1060 | 0 | 1963 | 0 | 98198 | 1650 | 9711 | POINT (-122.315 47.4095) |
Creating your own train-test split: bucket regression (works similarly for spatial regression)
In [20]:
Copied!
# Build a custom train/test split on the "price" column: 20% test share,
# H3 resolution 8, 10 target bins, fixed random_state for reproducibility.
# The call returns the two new frames and prints a summary of the split.
train, test = hskc_dataset.train_test_split(
target_column="price", test_size=0.2, resolution=8, n_bins=10, random_state=42
)
# Build a custom train/test split on the "price" column: 20% test share,
# H3 resolution 8, 10 target bins, fixed random_state for reproducibility.
# The call returns the two new frames and prints a summary of the split.
train, test = hskc_dataset.train_test_split(
target_column="price", test_size=0.2, resolution=8, n_bins=10, random_state=42
)
Summary of the split: Train: 1514 H3 cells (13897 points) Test: 690 H3 cells (3474 points) Expected ratios: {'train': 0.8, 'validation': 0, 'test': 0.2} Actual ratios: {'train': 0.8, 'test': 0.2} Actual ratios difference: {'train': 0.0, 'test': 0.0} bucket train_ratio test_ratio train_ratio_difference \ 0 0 0.80000 0.20000 0.00000 1 1 0.79989 0.20011 0.00011 2 2 0.79988 0.20012 0.00012 3 3 0.79988 0.20012 0.00012 4 4 0.80011 0.19989 -0.00011 5 5 0.80012 0.19988 -0.00012 6 6 0.80012 0.19988 -0.00012 7 7 0.80023 0.19977 -0.00023 8 8 0.80000 0.20000 0.00000 9 9 0.79988 0.20012 0.00012 test_ratio_difference train_points test_points 0 0.00000 1436 359 1 -0.00011 1407 352 2 -0.00012 1327 332 3 -0.00012 1391 348 4 0.00011 1465 366 5 0.00012 1357 339 6 0.00012 1345 336 7 0.00023 1390 347 8 0.00000 1396 349 9 -0.00012 1383 346 Created new train_gdf and test_gdf. Train len: 13897,test len: 3474
In [21]:
Copied!
# Keep the dataset's current H3 resolution in a variable.
resolution = hskc_dataset.resolution
# Keep the dataset's current H3 resolution in a variable.
resolution = hskc_dataset.resolution
In [22]:
Copied!
# Check the split attributes on the dataset object after the custom split.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
# Check the split attributes on the dataset object after the custom split.
type(hskc_dataset.train_gdf), type(hskc_dataset.test_gdf)
Out[22]:
(geopandas.geodataframe.GeoDataFrame, geopandas.geodataframe.GeoDataFrame)
In [23]:
Copied!
# Preview the first rows of the custom training split.
train.head()
# Preview the first rows of the custom training split.
train.head()
Out[23]:
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | sqft_living15 | sqft_lot15 | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | 3 | 8 | 1680 | 0 | 1987 | 0 | 98074 | 1800 | 7503 | POINT (-122.045 47.6168) |
6 | 1321400060 | 20140627T000000 | 257500.0 | 3 | 2.25 | 1715 | 6819 | 2.0 | 0 | 0 | 3 | 7 | 1715 | 0 | 1995 | 0 | 98003 | 2238 | 6819 | POINT (-122.327 47.3097) |
7 | 2008000270 | 20150115T000000 | 291850.0 | 3 | 1.50 | 1060 | 9711 | 1.0 | 0 | 0 | 3 | 7 | 1060 | 0 | 1963 | 0 | 98198 | 1650 | 9711 | POINT (-122.315 47.4095) |
9 | 3793500160 | 20150312T000000 | 323000.0 | 3 | 2.50 | 1890 | 6560 | 2.0 | 0 | 0 | 3 | 7 | 1890 | 0 | 2003 | 0 | 98038 | 2390 | 7570 | POINT (-122.031 47.3684) |
10 | 1736800520 | 20150403T000000 | 662500.0 | 3 | 2.50 | 3560 | 9796 | 1.0 | 0 | 0 | 3 | 8 | 1860 | 1700 | 1965 | 0 | 98007 | 2210 | 8925 | POINT (-122.145 47.6007) |
In [24]:
Copied!
# Preview the first rows of the custom test split.
test.head()
# Preview the first rows of the custom test split.
test.head()
Out[24]:
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | sqft_living15 | sqft_lot15 | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | 3 | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 1690 | 7639 | POINT (-122.319 47.721) |
3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | 5 | 7 | 1050 | 910 | 1965 | 0 | 98136 | 1360 | 5000 | POINT (-122.393 47.5208) |
13 | 6054650070 | 20141007T000000 | 400000.0 | 3 | 1.75 | 1370 | 9680 | 1.0 | 0 | 0 | 4 | 7 | 1370 | 0 | 1977 | 0 | 98074 | 1370 | 10208 | POINT (-122.045 47.6127) |
24 | 3814700200 | 20141120T000000 | 329000.0 | 3 | 2.25 | 2450 | 6500 | 2.0 | 0 | 0 | 4 | 8 | 2450 | 0 | 1985 | 0 | 98030 | 2200 | 6865 | POINT (-122.172 47.3739) |
28 | 5101402488 | 20140624T000000 | 438000.0 | 3 | 1.75 | 1520 | 6380 | 1.0 | 0 | 0 | 3 | 7 | 790 | 730 | 1948 | 0 | 98115 | 1520 | 6235 | POINT (-122.304 47.695) |
In [ ]:
Copied!