Skip to content

pre_proc_gdf

cellseg_gsontools.pre_proc_gdf(gdf, min_size=None)

Apply some light pre-processing of a geodataframe.

Namely, remove invalid polygons, empty geometries and add bounds to the gdf.

Parameters:

Name Type Description Default
gdf GeoDataFrame

Input geodataframe.

required
min_size int

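The minimum area of the polygons in pixels. Only polygons whose area is strictly greater than this value are kept. If None, no size filtering is applied.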

None

Returns:

Type Description
Union[GeoDataFrame, None]

gpd.GeoDataFrame: The pre-processed gdf. If the input gdf is empty or None, the input is returned as is.

Examples:

>>> from cellseg_gsontools import pre_proc_gdf
>>> gdf = pre_proc_gdf(gdf, min_size=100)
Source code in cellseg_gsontools/utils.py
def pre_proc_gdf(
    gdf: gpd.GeoDataFrame, min_size: Union[int, None] = None
) -> Union[gpd.GeoDataFrame, None]:
    """Apply some light pre-processing to a geodataframe.

    The steps, in order:

    1. Apply a zero-width buffer to every geometry and drop the geometries
       that are still invalid afterwards.
    2. Drop empty geometries.
    3. Explode multipolygons into their parts (the index is reset when this
       happens).
    4. Optionally drop geometries whose area is not strictly greater than
       `min_size`.
    5. Drop everything that is not a `Polygon`.
    6. Add integer bounding box columns `xmin`, `ymin`, `xmax`, `ymax`, where
       the max coords are incremented by one.

    The input gdf is not modified; a new geodataframe is returned.

    Parameters:
        gdf (gpd.GeoDataFrame):
            Input geodataframe.
        min_size (int, optional):
            The minimum size of the polygons in pixels. Only polygons with
            `area > min_size` are kept. If None, no size filtering is done.

    Returns:
        gpd.GeoDataFrame:
            The pre-processed gdf. If the input gdf is None or empty, the
            input itself is returned as is.

    Examples:
        >>> from cellseg_gsontools import pre_proc_gdf
        >>> gdf = pre_proc_gdf(gdf, min_size=100)
    """
    # The None check has to come first: `None.empty` raises AttributeError.
    if gdf is None or gdf.empty:
        return gdf

    # Work on a copy so the caller's geodataframe is left untouched.
    gdf = gdf.copy()

    # drop invalid geometries if there are any after buffer
    gdf.geometry = gdf.geometry.buffer(0)
    gdf = gdf[gdf.is_valid]

    # drop empty geometries
    gdf = gdf[~gdf.is_empty]

    # if there are multipolygon geometries, explode them
    if (gdf.geom_type == "MultiPolygon").any():
        gdf = gdf.explode(index_parts=False).reset_index(drop=True)

    # drop geometries that are less than min_size pixels
    if min_size is not None:
        gdf = gdf[gdf.area > min_size]

    # drop geometries that are not polygons; copy so that the column
    # assignments below write to an independent frame rather than to a
    # selection of the previous one
    gdf = gdf[gdf.geom_type == "Polygon"].copy()

    try:
        # add bounding box coords of the polygons to the gdfs
        # and correct for the max coords
        bounds = gdf.bounds  # compute once instead of once per column
        gdf["xmin"] = bounds["minx"].astype(int)
        gdf["ymin"] = bounds["miny"].astype(int)
        gdf["ymax"] = bounds["maxy"].astype(int) + 1
        gdf["xmax"] = bounds["maxx"].astype(int) + 1
    except Exception:
        # Best-effort: the filtered gdf is still returned without the
        # bounds columns.
        warnings.warn("Could not create bounds cols to gdf.", RuntimeWarning)

    return gdf