Skip to content

IO

The dissmodel.io module provides a unified dataset abstraction for loading and saving geospatial data. Format detection is automatic — the correct backend (vector, raster, or Xarray) is selected based on the file extension or an explicit fmt argument.

For cloud deployments, s3:// URIs are resolved transparently via the configured MinIO/S3 client — no changes to model code are required.


load_dataset

from dissmodel.io import load_dataset

# Vector — returns (GeoDataFrame, checksum)
gdf, checksum = load_dataset("data/grid.gpkg")
gdf, checksum = load_dataset("data/grid.zip")
gdf, checksum = load_dataset("s3://bucket/grid.gpkg")

# Raster — returns ((RasterBackend, meta), checksum)
(backend, meta), checksum = load_dataset("data/output.tif", fmt="raster")

# Explicit format
gdf, checksum = load_dataset("data/grid.zip", fmt="vector")

The returned checksum is the SHA-256 of the raw file bytes. Inside an executor's load(), assign it to record.source.checksum:

def load(self, record: ExperimentRecord):
    """Load the record's source dataset and store its checksum on the record."""
    # load_dataset returns the data together with the checksum of the source.
    data, source_checksum = load_dataset(record.source.uri)
    record.source.checksum = source_checksum
    return data

save_dataset

from dissmodel.io import save_dataset

# Vector
checksum = save_dataset(gdf, "results/output.gpkg")
checksum = save_dataset(gdf, "s3://bucket/output.gpkg")

# Raster
checksum = save_dataset((backend, meta), "results/output.tif")

Returns the SHA-256 of the saved file. Inside an executor's save(), assign it to record.output_sha256:

def save(self, result, record: ExperimentRecord) -> ExperimentRecord:
    """Write ``result`` and record its location, checksum and final status."""
    # Fall back to a default file name when the record names no output path.
    target_uri = record.output_path or "output.gpkg"
    saved_checksum = save_dataset(result, target_uri)

    record.output_path = target_uri
    record.output_sha256 = saved_checksum
    record.status = "completed"
    return record

Format dispatch

| Extension | fmt | Backend | Returns |
|---|---|---|---|
| .gpkg, .shp, .geojson | "vector" | GeoPandas | (GeoDataFrame, checksum) |
| .zip (containing shapefile) | "vector" | GeoPandas | (GeoDataFrame, checksum) |
| .tif, .tiff | "raster" | rasterio + RasterBackend | ((backend, meta), checksum) |
| .zip (containing GeoTIFF) | "raster" | rasterio + RasterBackend | ((backend, meta), checksum) |
| (any supported extension) | None (default) | auto-detect | depends on extension |

vector_to_raster_backend

Rasterizes a GeoDataFrame into a RasterBackend. Used inside executor load() methods when the model requires a raster substrate but the input is a vector file.

from dissmodel.io.convert import vector_to_raster_backend

backend = vector_to_raster_backend(
    source      = gdf,
    resolution  = 100.0,      # metres
    attrs       = {"uso": 0, "alt": 0.0, "solo": 1},
    crs         = "EPSG:31984",
    nodata      = 0,
)

By default (add_mask=True) the resulting backend includes a "mask" band: 1.0 where a cell is covered by at least one geometry, 0.0 elsewhere.


API Reference

dissmodel.io._dispatch.load_dataset(uri, minio_client=None, fmt=None, **kwargs)

Load any supported dataset from a URI.

Format is inferred from the URI extension unless fmt is provided.

Supported formats

vector — .shp, .gpkg, .geojson, .zip (shapefile inside zip); raster — .tif, .tiff; xarray — .zarr, .nc (post-MVP — BDC/STAC integration)

Returns:

Type Description
(data, sha256_checksum)
Source code in dissmodel/io/_dispatch.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def load_dataset(uri: str, minio_client=None, fmt: str | None = None, **kwargs):
    """
    Load any supported dataset from a URI.

    Format is inferred from the URI extension unless fmt is provided.

    Supported formats
    -----------------
    vector  — .shp, .gpkg, .geojson, .zip (shapefile inside zip)
    raster  — .tif, .tiff
    xarray  — .zarr, .nc  (post-MVP — BDC/STAC integration)

    Returns
    -------
    (data, sha256_checksum)
    """
    resolved_fmt = fmt or detect_format(uri)

    if resolved_fmt == "vector":
        from dissmodel.io.vector import load_gdf
        return load_gdf(uri, minio_client=minio_client, **kwargs)

    if resolved_fmt == "raster":
        from dissmodel.io.raster import load_geotiff
        return load_geotiff(uri, minio_client=minio_client, **kwargs)

    if resolved_fmt == "xarray":
        from dissmodel.io._xarray import load_xarray
        return load_xarray(uri, minio_client=minio_client, **kwargs)

    raise ValueError(f"Unsupported format: '{resolved_fmt}'")

dissmodel.io._dispatch.save_dataset(data, uri, minio_client=None, fmt=None, **kwargs)

Save any supported dataset to a URI.

Format is inferred from the URI extension unless fmt is provided.

Returns:

Type Description
sha256_checksum of the saved file
Source code in dissmodel/io/_dispatch.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def save_dataset(data, uri: str, minio_client=None, fmt: str | None = None, **kwargs) -> str:
    """
    Save any supported dataset to a URI.

    Format is inferred from the URI extension unless fmt is provided.

    Returns
    -------
    sha256_checksum of the saved file
    """
    resolved_fmt = fmt or detect_format(uri)

    if resolved_fmt == "vector":
        from dissmodel.io.vector import save_gdf
        return save_gdf(data, uri, minio_client=minio_client, **kwargs)

    if resolved_fmt == "raster":
        from dissmodel.io.raster import save_geotiff
        return save_geotiff(data, uri, minio_client=minio_client, **kwargs)

    if resolved_fmt == "xarray":
        from dissmodel.io._xarray import save_xarray
        return save_xarray(data, uri, minio_client=minio_client, **kwargs)

    raise ValueError(f"Unsupported format: '{resolved_fmt}'")

dissmodel.io.convert.vector_to_raster_backend(source, resolution, attrs, crs=None, all_touched=False, nodata=0, nodata_value=None, add_mask=True)

Convert a vector source to a RasterBackend.

Accepts a file path (Shapefile, GeoJSON, GPKG, .zip) or an in-memory GeoDataFrame. Each requested attribute column is rasterized into a separate band.

Parameters:

Name Type Description Default
source str, Path, or GeoDataFrame

Vector source. File paths are read with GeoPandas; GeoDataFrames are used directly (a copy is made before any reprojection).

required
resolution float

Cell size in the units of the CRS.

required
attrs list[str] or dict[str, Any]

Columns to rasterize. A list uses nodata as fill for all columns; a dict maps column names to per-column fill defaults.

required
crs str, int, or None

Target CRS for reprojection. If None and source is a GeoDataFrame without a CRS, a ValueError is raised.

None
all_touched bool

If True, burn all cells touched by a geometry edge.

False
nodata int or float

Default fill for cells outside geometries. Default: 0.

0
nodata_value int or float or None

Sentinel for out-of-extent cells. Useful when 0 is a valid value (e.g. nodata_value=-1 for proportion arrays). Default: None.

None
add_mask bool

If True (default), adds a "mask" band — 1.0 where a cell is covered by at least one geometry, 0.0 elsewhere.

True

Returns:

Type Description
RasterBackend

Raises:

Type Description
ImportError

If geopandas or rasterio are not installed.

FileNotFoundError

If a file path does not exist.

ValueError

If attrs is empty, a requested column is missing, or the GeoDataFrame has no CRS and crs is also None.

Examples:

>>> # From file path
>>> b = vector_to_raster_backend(
...     "data/mangue_grid.shp", resolution=100, attrs=["uso", "alt"]
... )
>>> # From in-memory GeoDataFrame
>>> import geopandas as gpd
>>> gdf = gpd.read_file("data/mangue_grid.shp").to_crs("EPSG:31984")
>>> b = vector_to_raster_backend(gdf, resolution=100, attrs={"uso": -1})
Source code in dissmodel/io/convert.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def vector_to_raster_backend(
    source: str | pathlib.Path | Any,   # Any = gpd.GeoDataFrame
    resolution: float,
    attrs: list[str] | dict[str, Any],
    crs: str | int | None = None,
    all_touched: bool = False,
    nodata: int | float = 0,
    nodata_value: int | float | None = None,
    add_mask: bool = True,
) -> RasterBackend:
    """
    Convert a vector source to a RasterBackend.

    Accepts a file path (Shapefile, GeoJSON, GPKG, .zip) or an in-memory
    GeoDataFrame. Each requested attribute column is rasterized into a
    separate band on a grid covering the source's bounding box.

    Parameters
    ----------
    source : str, Path, or GeoDataFrame
        Vector source. File paths are read with GeoPandas; anything else is
        assumed to be a GeoDataFrame and is copied before use.
    resolution : float
        Requested cell size in the units of the CRS. Must be positive and
        finite. The grid is fitted to the bounding box, so the effective cell
        size can be slightly smaller than ``resolution`` when the extent is
        not an exact multiple of it.
    attrs : list[str] or dict[str, Any]
        Columns to rasterize. A list uses ``nodata`` as fill for all columns;
        a dict maps column names to per-column fill defaults.
    crs : str, int, or None
        Target CRS. A source that already has a CRS is reprojected to it; a
        source without a CRS has it assigned as-is (coordinates are assumed
        to already be expressed in ``crs``). If ``None``, the source CRS is
        kept; a source without a CRS then raises ValueError.
    all_touched : bool
        Passed to ``rasterio.features.rasterize``: if ``True``, burn all
        cells touched by a geometry, not only those whose centre is covered.
    nodata : int or float
        Per-column fill used when ``attrs`` is a list. Ignored when ``attrs``
        is a dict. Default: ``0``.
    nodata_value : int or float or None
        If not ``None``, written to every cell not covered by a geometry,
        overriding the per-column fill, and also passed to ``RasterBackend``.
        Useful when ``0`` is a valid value (e.g. ``nodata_value=-1`` for
        proportion arrays). Default: ``None``.
    add_mask : bool
        If ``True`` (default), adds a ``"mask"`` band — ``1.0`` where a cell
        is covered by at least one geometry, ``0.0`` elsewhere.

    Returns
    -------
    RasterBackend

    Raises
    ------
    ImportError
        If ``geopandas`` or ``rasterio`` are not installed.
    FileNotFoundError
        If a file path does not exist.
    ValueError
        If ``resolution`` is not a positive finite number, ``attrs`` is
        empty, a requested column is missing, the source has no CRS and
        ``crs`` is also ``None``, or the source extent is empty or has zero
        width/height.

    Examples
    --------
    >>> # From file path
    >>> b = vector_to_raster_backend(
    ...     "data/mangue_grid.shp", resolution=100, attrs=["uso", "alt"]
    ... )

    >>> # From in-memory GeoDataFrame
    >>> import geopandas as gpd
    >>> gdf = gpd.read_file("data/mangue_grid.shp").to_crs("EPSG:31984")
    >>> b = vector_to_raster_backend(gdf, resolution=100, attrs={"uso": -1})
    """
    # Reject unusable cell sizes up front: zero/negative/NaN/inf would
    # otherwise surface later as an obscure division or conversion error.
    if not np.isfinite(resolution) or resolution <= 0:
        raise ValueError(
            f"resolution must be a positive, finite number, got {resolution!r}"
        )

    try:
        import rasterio
        import rasterio.features
        import rasterio.transform
    except ImportError as err:
        raise ImportError("rasterio is required — pip install rasterio") from err

    try:
        import geopandas as gpd
    except ImportError as err:
        raise ImportError("geopandas is required — pip install geopandas") from err

    # ── resolve source → GeoDataFrame ────────────────────────────────────────
    if isinstance(source, (str, pathlib.Path)):
        path = pathlib.Path(source)
        # "zip://" sources are not plain filesystem paths, so the existence
        # check is skipped for them and left to the reader.
        if not str(source).startswith("zip://") and not path.exists():
            raise FileNotFoundError(f"File not found: {path}")
        gdf = gpd.read_file(str(source))
    else:
        # Assume GeoDataFrame — make a copy to avoid mutating the caller's data
        gdf = source.copy()

    # ── CRS validation and reprojection ───────────────────────────────────────
    if gdf.crs is None and crs is None:
        raise ValueError(
            "Source GeoDataFrame has no CRS and no target CRS was provided. "
            "Pass crs= to specify the coordinate reference system."
        )

    if crs is not None:
        if gdf.crs is None:
            # Naive geometries cannot be reprojected; label them with the
            # caller-supplied CRS instead, as the error message above advises.
            gdf = gdf.set_crs(crs)
        else:
            gdf = gdf.to_crs(crs)

    # ── resolve attrs → {column: fill_default} ────────────────────────────────
    if isinstance(attrs, list):
        attr_defaults: dict[str, Any] = {col: nodata for col in attrs}
    else:
        attr_defaults = dict(attrs)

    if not attr_defaults:
        raise ValueError("attrs must not be empty")

    missing = [col for col in attr_defaults if col not in gdf.columns]
    if missing:
        raise ValueError(f"Columns not found in source: {missing}")

    # ── compute grid from bounding box ────────────────────────────────────────
    xmin, ymin, xmax, ymax = gdf.total_bounds
    if not np.all(np.isfinite((xmin, ymin, xmax, ymax))):
        raise ValueError("Source has no geometries with a finite extent to rasterize")

    n_cols = int(np.ceil((xmax - xmin) / resolution))
    n_rows = int(np.ceil((ymax - ymin) / resolution))
    if n_cols < 1 or n_rows < 1:
        # A zero-width or zero-height extent (e.g. a single point) cannot be
        # fitted with a bounds-derived transform.
        raise ValueError(
            "Source extent has zero width or height; cannot build a raster grid "
            f"(bounds: {xmin}, {ymin}, {xmax}, {ymax})"
        )

    transform = rasterio.transform.from_bounds(
        xmin, ymin, xmax, ymax, n_cols, n_rows
    )

    backend = RasterBackend(
        shape        = (n_rows, n_cols),
        nodata_value = nodata_value,
        transform    = transform,
        crs          = gdf.crs,
    )

    # ── rasterize geometry coverage → "mask" band ─────────────────────────────
    valid_geoms = [geom for geom in gdf.geometry if geom is not None]
    coverage = rasterio.features.rasterize(
        shapes      = ((geom, 1) for geom in valid_geoms),
        out_shape   = (n_rows, n_cols),
        transform   = transform,
        fill        = 0,
        all_touched = all_touched,
        dtype       = np.uint8,
    )
    mask = coverage.astype(bool)

    if add_mask:
        backend.set("mask", mask.astype(np.float32))

    # ── rasterize each attribute column ───────────────────────────────────────
    for col, default in attr_defaults.items():
        values = gdf[col]

        # Integer columns keep an integer band; everything else becomes float32.
        dtype = np.int32 if np.issubdtype(values.dtype, np.integer) else np.float32

        arr = rasterio.features.rasterize(
            shapes = (
                (geom, float(val))
                for geom, val in zip(gdf.geometry, values)
                if geom is not None
            ),
            out_shape   = (n_rows, n_cols),
            transform   = transform,
            fill        = float(default),
            all_touched = all_touched,
            dtype       = dtype,
        )

        # Cells outside every geometry get the global sentinel when one is
        # given, otherwise the column's own fill default.
        sentinel = nodata_value if nodata_value is not None else default
        arr = np.where(mask, arr, sentinel).astype(dtype)
        backend.set(col, arr)

    # Coverage summary on stdout; n_total is non-zero thanks to the grid check.
    n_valid = int(mask.sum())
    n_total = n_rows * n_cols
    print(
        f"  rasterized: {n_valid:,} valid cells"
        f" / {n_total:,} total"
        f" ({100 * n_valid / n_total:.1f}% coverage)"
    )

    return backend