library(varunayan)
library(dplyr)
library(ggplot2)
library(sf)
library(patchwork)

The problem

ERA5 and IMD give us data on regular grids (for example 0.25° × 0.25°). When we aggregate to administrative boundaries like districts, the smaller districts may contain no grid points at all, which leaves them with missing data.

districts_url <- "https://bharatviz.saketlab.org/India_LGD_districts.geojson"
districts_file <- "indian_districts.geojson"
if (!file.exists(districts_file)) {
  download.file(districts_url, districts_file, mode = "wb", timeout = 300)
}
districts_sf <- sf::st_read(districts_file, quiet = TRUE)

imd_tmax <- imd_temperature_geojson(
  request_id = "spatial_demo",
  start_year = 2023, end_year = 2023,
  geojson_file = districts_file,
  var_type = "tmax"
)

imd_spatial <- imd_tmax %>%
  group_by(longitude, latitude) %>%
  summarise(temp = mean(temperature, na.rm = TRUE), .groups = "drop")

era5_temp <- era5ify_geojson(
  request_id = "spatial_era5",
  variables = "2m_temperature",
  start_date = "2023-01-01",
  end_date = "2023-01-31",
  json_file = districts_file,
  frequency = "daily",
  resolution = 0.25
)
#>   |                                                                              |                                                                      |   0%  |                                                                              |=                                                                     |   2%  |                                                                              |===                                                                   |   4%  |                                                                              |=====                                                                 |   6%  |                                                                              |======                                                                |   9%  |                                                                              |========                                                              |  11%  |                                                                              |=========                                                             |  13%  |                                                                              |===========                                                           |  15%  |                                                                              |============                                                          |  17%  |                                                                              |==============                                                        |  20%  |                                                                              |===============                                                       |  22%  |                                                                              |=================                                                     |  24%  |                                                                              |==================                                                    |  26%  |                                                                              |====================                                                  |  28%  |                                                                              |=====================                                                 |  30%  |                                                                              |=======================                                               |  33%  |                                                                              |========================                                              |  35%  |                                                                              |==========================                                            |  37%  |                                                                              |===========================                                           |  39%  |                                                                              |=============================                                         |  41%  |                                                                              |==============================                                        |  43%  |                                                                              |================================                                      |  46%  |                                                                              |=================================                                     |  48%  |                                                                              |===================================                                   |  50%  |                                                                              |====================================                                  |  52%  |                                                                              |=========================================                             |  59%  |                                                                              |===========================================                           |  61%  |                                                                              |===============================================                       |  67%  |                                                                              |=================================================                     |  69%  |                                                                              |==================================================                    |  72%  |                                                                              |====================================================                  |  74%  |                                                                              |=====================================================                 |  76%  |                                                                              |==============================================================        |  89%  |                                                                              |================================================================      |  91%  |                                                                              |=================================================================     |  93%  |                                                                              |===================================================================   |  96%  |                                                                              |====================================================================  |  98%  |                                                                              |======================================================================| 100%

era5_spatial <- era5_temp %>%
  group_by(longitude, latitude) %>%
  summarise(temp = mean(value, na.rm = TRUE), .groups = "drop")

Visualizing grid coverage

We use compare_grids() to see how the grid points align with polygon boundaries.

compare_grids(
  IMD = imd_spatial,
  point_size = 1,
  polygons = districts_sf,
  title = "IMD grid vs district boundaries"
)

compare_grids(
  ERA5 = era5_spatial,
  colors = c(ERA5 = "#3498DB"),
  point_size = 0.5,
  polygons = districts_sf,
  title = "ERA5 grid vs district boundaries"
)

compare_grids(
  IMD = imd_spatial,
  ERA5 = era5_spatial,
  point_size = 0.5,
  polygons = districts_sf,
  title = "IMD vs ERA5 grid points"
)

Red circles are IMD grid points, blue circles are ERA5 grid points. Many small districts have no grid points within their boundaries.

Coverage statistics

IMD coverage

imd_coverage <- check_grid_coverage(imd_spatial, districts_sf, c("state_name", "district_name"))
cat("IMD Grid Coverage:\n")
#> IMD Grid Coverage:
cat("  Total districts:", imd_coverage$total_polygons, "\n")
#>   Total districts: 785
cat("  Grid points:", imd_coverage$total_grid_points, "\n")
#>   Grid points: 280
cat("  Districts with data:", imd_coverage$polygons_with_points, "\n")
#>   Districts with data: 249
cat("  Coverage:", imd_coverage$coverage_percent, "%\n")
#>   Coverage: 31.7 %

ERA5 coverage

era5_coverage <- check_grid_coverage(era5_spatial, districts_sf, c("state_name", "district_name"))
cat("ERA5 Grid Coverage:\n")
#> ERA5 Grid Coverage:
cat("  Total districts:", era5_coverage$total_polygons, "\n")
#>   Total districts: 785
cat("  Grid points:", era5_coverage$total_grid_points, "\n")
#>   Grid points: 4635
cat("  Districts with data:", era5_coverage$polygons_with_points, "\n")
#>   Districts with data: 752
cat("  Coverage:", era5_coverage$coverage_percent, "%\n")
#>   Coverage: 95.8 %

Aggregation methods

Point-in-polygon

We use a spatial join to find the grid points that fall within each polygon:

result_pip <- aggregate_to_polygons(
  imd_spatial, districts_sf, "temp",
  method = "point_in_polygon",
  polygon_id_cols = c("state_name", "district_name")
)

cat("Districts with data:", sum(!is.na(result_pip$value)), "/", nrow(result_pip), "\n")
#> Districts with data: 249 / 785

Nearest centroid

Here we assign each polygon the grid point nearest its centroid:

result_nearest <- aggregate_to_polygons(
  imd_spatial, districts_sf, "temp",
  method = "nearest_centroid",
  polygon_id_cols = c("state_name", "district_name")
)

cat("Districts with data:", sum(!is.na(result_nearest$value)), "/", nrow(result_nearest), "\n")
#> Districts with data: 785 / 785

Comparing results

make_map <- function(data, title) {
  map_sf <- districts_sf %>%
    left_join(data, by = c("state_name", "district_name"))

  ggplot(map_sf) +
    geom_sf(aes(fill = value), color = "white", linewidth = 0.05) +
    scale_fill_viridis_c(option = "plasma", na.value = "grey80", name = "°C") +
    labs(title = title) +
    theme_void() +
    theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 11))
}

p1 <- make_map(result_pip, "Point-in-Polygon")
p2 <- make_map(result_nearest, "Nearest Centroid")

p1 + p2 + plot_layout(guides = "collect")

Value comparison

For districts that have data in both methods:

comparison <- result_pip %>%
  rename(pip = value) %>%
  inner_join(
    result_nearest %>% rename(nearest = value),
    by = c("state_name", "district_name")
  ) %>%
  filter(!is.na(pip) & !is.na(nearest))

cor_val <- cor(comparison$pip, comparison$nearest, use = "complete.obs")

ggplot(comparison, aes(x = pip, y = nearest)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
  geom_point(alpha = 0.5, color = "#2C3E50") +
  coord_fixed() +
  labs(
    x = "Point-in-polygon (°C)",
    y = "Nearest centroid (°C)",
    subtitle = sprintf("r = %.3f (n = %d districts)", cor_val, nrow(comparison))
  ) +
  theme_minimal()

Spatial aggregation methods