
Practical workflows for reliable spatial predictions
Rome R Users Group
2025-11-27

Based on the goal of the analysis (incomplete lists)
Books




The geocompx community grows through shared curiosity and collaboration.
Website: geocompx.org
Here, we want to predict plant species richness (the number of different species present in a given area) for South America (a hard problem!)
library(sf)
library(terra)
library(CAST)
library(caret)
library(tmap)
tmap_options(scale = 1.5)
data(splotdata, package = "CAST")
south_america = rnaturalearth::ne_countries(continent = "South America")
# head(splotdata)
tm_shape(south_america) +
tm_borders() +
tm_shape(splotdata) +
tm_symbols(fill = "Species_richness", size = 0.5)
set.seed(21)
train_indices = sample(1:nrow(splotdata), size = 0.7 * nrow(splotdata))
splotdata_test = splotdata[-train_indices, ]
splotdata = splotdata[train_indices, ]
tm_shape(south_america) +
tm_borders() +
tm_shape(splotdata) +
tm_symbols(fill = "Species_richness", size = 0.5) +
tm_title("Training data")
70/30 random split into training and testing data
dir.create("data")
wc = geodata::worldclim_global(var = "bio", res = 10, path = "data/")
wc = wc[[c(1, 12, 14)]]
elev = geodata::elevation_global(res = 10, path = "data/")
predictors = crop(c(wc, elev), st_bbox(splotdata))
names(predictors) = c("bio_1", "bio_12", "bio_14", "elev")
tm_shape(predictors) +
tm_raster(col.scale = list(tm_scale_continuous(values = "cividis"),
tm_scale_continuous(values = "PuBuGn"),
tm_scale_continuous(values = "YlGnBu"),
tm_scale_continuous(values = "geyser")),
col.legend = tm_legend(orientation = "landscape")) +
tm_facets(ncol = 4)bio_1: Average monthly difference between daily high and low temperatures, bio_12: Annual Precipitation, bio_14: Precipitation of Driest Month, elev: Elevation
| Species_richness | bio_1 | bio_12 | bio_14 | elev | x | y |
|---|---|---|---|---|---|---|
| 40 | 15.000000 | 1199 | 53 | 2513 | -79.36602 | -5.343727 |
| 18 | 16.545834 | 1760 | 119 | 720 | -50.93534 | -28.870443 |
| 17 | 7.345833 | 992 | 15 | 3571 | -73.08057 | 5.908890 |
| 2 | 5.991667 | 1456 | 23 | 3868 | -74.15775 | 3.993870 |
| 53 | 20.020834 | 531 | 6 | 251 | -64.54472 | -29.964170 |
| 35 | 18.266666 | 2141 | 153 | 603 | -52.51585 | -26.773140 |
| 22 | 9.379167 | 1121 | 62 | 3384 | -79.03380 | -3.160160 |
| 104 | 16.954166 | 1156 | 17 | 1941 | -78.16670 | 0.883330 |
| 10 | 10.045834 | 2175 | 52 | 328 | -73.12974 | -39.845370 |
| 29 | 9.800000 | 1055 | 51 | 3442 | -79.31206 | -3.716683 |
| 92 | 26.608334 | 2080 | 82 | 380 | -74.83330 | 5.250000 |
| 48 | 17.270834 | 1660 | 83 | 680 | -49.04789 | -26.192380 |
| 110 | 20.208334 | 1291 | 27 | 1496 | -68.48700 | -14.988000 |
| 66 | 25.295834 | 2116 | 54 | 212 | -69.27120 | -12.329900 |
| 23 | 14.108334 | 1686 | 113 | 1249 | -49.81775 | -28.608093 |
| 103 | 25.441666 | 1927 | 48 | 194 | -68.80190 | -12.433400 |
| 22 | 7.537500 | 1212 | 70 | 3624 | -79.16920 | -2.880100 |
| 16 | 12.150000 | 1304 | 14 | 2842 | -72.94137 | 10.247530 |
Inverse Distance Weighting (IDW) estimates values at unsampled points using nearby samples – closer points have more influence on the results.
library(gstat)
m_idw = gstat(
formula = Species_richness ~ 1,
locations = ~ x + y,
data = splotdata_df,
set = list(idp = 2)
)
pred_idw = interpolate(predictor_space, m_idw, index = 1, debug.level = 0)
pred_idw = mask(pred_idw, predictor_space)
names(pred_idw) = "idw"
tm_shape(pred_idw) +
tm_raster(col.scale = tm_scale_continuous(values = "spectral", midpoint = 0))
m_ok = gstat(
formula = Species_richness ~ 1,
locations = ~ x + y,
data = splotdata_df,
model = fm_ok
)
pred_ok = interpolate(predictor_space, m_ok, index = 1, debug.level = 0)
pred_ok = mask(pred_ok, predictor_space)
names(pred_ok) = "ordinary kriging"
tm_shape(pred_ok) +
tm_raster(col.scale = tm_scale_continuous(values = "spectral", midpoint = 0)) +
tm_layout(scale = 1)
m_uk = gstat(
formula = Species_richness ~ bio_1 + bio_12 + elev,
locations = ~ x + y,
data = splotdata_df,
model = fm_uk
)
pred_uk = interpolate(predictors, m_uk, index = 1, debug.level = 0)
pred_uk = mask(pred_uk, predictor_space)
names(pred_uk) = "universal kriging"
tm_shape(pred_uk) +
tm_raster(col.scale = tm_scale_continuous(values = "spectral", midpoint = 0)) +
tm_layout(scale = 1)
Machine learning methods, such as Random Forests, can capture complex relationships between predictors and the target variable.
However, they do not explicitly account for spatial relationships in the data.
set.seed(10) # set seed to reproduce the model
model_rf = train(
splotdata_df[, names(predictors)],
splotdata_df$Species_richness,
method = "rf",
tuneGrid = data.frame("mtry" = 2),
importance = TRUE,
ntree = 50,
trControl = trainControl(method = "cv", number = 3, savePredictions = "final")
)
pred_rf = terra::predict(predictors, model_rf, na.rm = TRUE)
names(pred_rf) = "random forest"
tm_shape(pred_rf) +
tm_raster(col.scale = tm_scale_continuous(values = "spectral", midpoint = 0))
rmse = function(observed, predicted) {
sqrt(mean((observed - predicted)^2, na.rm = TRUE))
}
## idw
pred_idw_test = extract(pred_idw, splotdata_test)
rmse_idw = rmse(splotdata_test$Species_richness, pred_idw_test[, 2])
## kriging
pred_ok_test = extract(pred_ok, splotdata_test)
rmse_sk = rmse(splotdata_test$Species_richness, pred_ok_test[, 2])
## universal kriging
pred_uk_test = extract(pred_uk, splotdata_test)
rmse_uk = rmse(splotdata_test$Species_richness, pred_uk_test[, 2])
## machine learning
pred_rf_test = extract(pred_rf, splotdata_test)
rmse_rf = rmse(splotdata_test$Species_richness, pred_rf_test[, 2])
# which model is the best?
rmse_results = data.frame(
Method = c("IDW", "Ordinary Kriging", "Universal Kriging", "Random Forest"),
RMSE = round(c(rmse_idw, rmse_sk, rmse_uk, rmse_rf), 2)
)
pred_all = c(pred_idw, pred_ok, pred_uk, pred_rf)
pred_all_cent = st_centroid(st_as_sf(as.polygons(ext(pred_all))))
tm_shape(pred_all) +
tm_raster(col.scale = tm_scale_continuous(values = "spectral", midpoint = 0),
col.legend = tm_legend(title = "Species richness",
orientation = "landscape"),
col.free = FALSE) +
tm_credits(paste0("RMSE: ", rmse_results$RMSE), size = 2) +
tm_facets(ncol = 4)# issue no 1: there are some negative values
tm_shape(pred_all) +
tm_raster(col.scale = tm_scale_continuous(values = "spectral",
midpoint = 0,
values.range = c(0.2, 1)),
col.legend = tm_legend(title = "Species richness",
orientation = "landscape"),
col.free = FALSE) +
tm_facets(ncol = 4) +
tm_shape(splotdata) +
tm_dots(shape = 2, size = 0.2, lwd = 0.3)Some methods produce unrealistic negative predictions for species richness.

Our training data is clustered in space. Thus:

Our training data is clustered in space. Thus:
Our testing data is also spatially clustered, and not representative of the entire study area.
Thus, how reliable are our models’ evaluations? How can we address these issues?
Solution approach: Having a random training and testing sample of the whole prediction area. (Sadly, often impossible in real-world applications)
set.seed(2025-11-02)
random_sample_train = spatSample(predictor_space, size = 490, method = "random",
na.rm = TRUE, exact = TRUE,
as.points = TRUE, replace = FALSE)
random_sample_test = spatSample(predictor_space, size = 200, method = "random",
na.rm = TRUE, exact = TRUE,
as.points = TRUE, replace = FALSE)Solution approach: Identify areas where the environment is not well represented, making predictions less trustworthy (Area of Applicability – AoA, Meyer and Pebesma, 2021).
AOA = aoa(predictors,
train = splotdata_df[c("bio_1", "bio_12", "bio_14", "elev")],
verbose = FALSE)
tm_shape(AOA$DI) +
tm_raster(col.scale = tm_scale_continuous_sqrt(values = "batlow"),
col.legend = tm_legend(title = "")) +
tm_shape(splotdata) +
tm_dots(shape = 2, col = "white", size = 0.2, lwd = 0.3) +
tm_title("Dissimilarity to training data")
pred_all_aoa = mask(pred_all, AOA[[3]], maskvalues = 0)
tm_shape(predictor_space) +
tm_raster(col.scale = tm_scale(values = "black",
labels = "Different environment"),
col.legend = tm_legend(title = ""),
col_alpha = 0.1) +
tm_add_legend(labels = "Training data", fill = "black",
type = "symbols", shape = 2, size = 1) +
tm_shape(pred_all_aoa) +
tm_raster(col.scale = tm_scale_continuous(values = "spectral",
midpoint = 0),
col.legend = tm_legend(title = "Species richness",
orientation = "landscape"),
col.free = FALSE) +
tm_facets(ncol = 4) +
tm_shape(splotdata) +
tm_dots(shape = 2, size = 0.2, lwd = 0.3)
Our predictions are reliable for the areas which are similar to the training data (inside the AoA).
Clustered data: our previous RMSE estimates only reflect model performance in areas close to testing points.
However, we want to know how our model performs across the entire study area (except outside the AoA).
Solution approach: Use a prediction-domain adaptive validation method, e.g., kNNDM, Linnenbrink et al., 2024, to better reflect the prediction situation.
set.seed(11)
indices_knndm = knndm(splotdata, predictors, k = 5)
# plot(indices_knndm)
# indices_knndm2 = knndm(splotdata[, names(predictors)], predictors, k = 5, space = "feature")
# plot(indices_knndm2)
plot(geodist(splotdata, predictors, cvfolds = indices_knndm$indx_test)) +
scale_x_sqrt(labels = scales::label_number()) +
ggtitle("kNNDM CV: k-fold nearest-neighbour distance matching cross-validation")
kNNDM adapts cross-validation folds to better reflect the prediction situation

Here, we apply kNNDM approach (with a random forest method) to better reflect the prediction situation.
pred_rf2 = terra::predict(predictors, model_rf2, na.rm = TRUE)
names(pred_rf2) = "random forest2"
pred_rf2_aoa = mask(pred_rf2, AOA[[3]], maskvalues = 0)
tm_shape(predictor_space) +
tm_raster(col.scale = tm_scale(values = "black",
labels = "Different environment"),
col.legend = tm_legend(title = ""),
col_alpha = 0.1) +
tm_shape(pred_rf2_aoa) +
tm_raster(col.scale = tm_scale_continuous(values = "spectral",
midpoint = 0),
col.legend = tm_legend(title = "Species richness",
orientation = "landscape")) +
tm_title("Random Forest")
35.15
RMSE based on the prediction-domain adaptive cross-validation (kNNDM)
set.seed(2025-11-26)
random_sample_test_i = spatSample(predictor_space, size = 200, method = "random",
na.rm = TRUE, exact = TRUE,
as.points = TRUE, replace = FALSE)
random_sample_test_i$type = "test"
tm_shape(south_america) +
tm_borders() +
tm_shape(random_sample_test_i) +
tm_symbols(size = 0.5,
fill = "type",
fill.legend = tm_legend(title = "",
position = tm_pos_in("right", "bottom"))) +
tm_title("Independent testing data") +
tm_layout(scale = 2)
random_sample_train$type = "train"
random_sample_test$type = "test"
random_sample = rbind(random_sample_train, random_sample_test)
tm_shape(south_america) +
tm_borders() +
tm_shape(random_sample) +
tm_symbols(size = 0.5,
fill = "type",
fill.legend = tm_legend(title = "",
position = tm_pos_in("right", "bottom"))) +
tm_title("Random sample") +
tm_layout(scale = 2)
# pak::pak("JanLinnenbrink/CAST") # new feature: work-in-progress
splotdata2 = splotdata
knndm_folds = CAST::knndm(splotdata2, modeldomain = predictors,
prop_test = 0.3, tolerance = 0.1)
splotdata2$type = knndm_folds$clusters
tm_shape(south_america) +
tm_borders() +
tm_shape(splotdata2) +
tm_symbols(size = 0.5,
fill = "type",
fill.legend = tm_legend(title = "",
position = tm_pos_in("right", "bottom"))) +
tm_title("Clustered sample") +
tm_layout(scale = 2)
set.seed(2025-11-02)
splotdata3 = splotdata[sample(1:nrow(splotdata), size = 143), ]
indices_knndm2 = knndm(splotdata3, predictors, k = 5)
splotdata3$fold = as.factor(indices_knndm2$clusters)
tm_shape(south_america) +
tm_borders() +
tm_shape(splotdata3) +
tm_symbols(size = 0.5,
fill = "fold",
fill.legend = tm_legend(position = tm_pos_in("right", "bottom"))) +
tm_title("Small sample") +
tm_layout(scale = 2)
set.seed(2025-11-02)
splotdata4 = splotdata[sample(1:nrow(splotdata), size = 80), ]
indices_knndm3 = knndm(splotdata4, predictors, k = 5)
splotdata4$fold = as.factor(indices_knndm3$clusters)
tm_shape(south_america) +
tm_borders() +
tm_shape(splotdata4) +
tm_symbols(size = 0.5,
fill = "fold",
fill.legend = tm_legend(position = tm_pos_in("right", "bottom"))) +
tm_title("(Even) smaller sample") +
tm_layout(scale = 2)
CRAN Task View: Analysis of Spatial Data:
Website: https://jakubnowosad.com
Slides: https://jakubnowosad.com/rome2025
Various R developers
Contributors to the geocompx project
Members of the Remote Sensing and Spatial Modelling Research Group (especially Hanna Meyer and Jan Linnenbrink)
