Skip to content

local_diversity

cellseg_gsontools.diversity.local_diversity(gdf, spatial_weights, val_col, id_col=None, metrics=('simpson_index',), scheme='FisherJenks', parallel=True, num_processes=-1, rm_nhood_cols=True, col_prefix=None, create_copy=True)

Compute the local diversity/heterogeneity metric for cell neighborhood.

Note

Allowed diversity metrics:

  • simpson_index - for both categorical and real valued neighborhoods
  • shannon_index - for both categorical and real valued neighborhoods
  • gini_index - for only real valued neighborhoods
  • theil_index - for only real valued neighborhoods
Note

If val_col is not categorical, the values are binned using mapclassify. The bins are then used to compute the diversity metrics. If val_col is categorical, the values are used directly.

Parameters:

Name Type Description Default
gdf GeoDataFrame

The input GeoDataFrame.

required
spatial_weights W

Libpysal spatial weights object.

required
val_col Union[str, Tuple[str, ...]]

The name of the column in the gdf for which the diversity is computed. You can also pass in a list of columns, in which case the diversity is computed for each column.

required
id_col str

The unique id column in the gdf. If None, this uses set_uid to set it. Defaults to None.

None
metrics Tuple[str, ...]

A Tuple/List of diversity metrics. Allowed metrics: "shannon_index", "simpson_index", "gini_index", "theil_index". Defaults to ("simpson_index",).

('simpson_index',)
scheme str

mapclassify classification scheme. Defaults to "FisherJenks".

'FisherJenks'
parallel bool

Flag whether to use parallel apply operations when computing the diversities. Defaults to True.

True
num_processes int, default=-1

The number of processes to use when parallel=True. If -1, this will use all available cores.

-1
rm_nhood_cols bool

Flag, whether to remove the extra neighborhood columns from the result gdf. Defaults to True.

True
col_prefix str

Prefix for the new column names. Defaults to None.

None
create_copy bool

Flag whether to create a copy of the input gdf or not. Defaults to True.

True

Raises:

Type Description
ValueError

If an illegal metric is given.

Returns:

Type Description
GeoDataFrame

gpd.GeoDataFrame: The input geodataframe with computed diversity metric columns added.

Examples:

Compute the simpson diversity of eccentricity values for each cell neighborhood

>>> from cellseg_gsontools.diversity import local_diversity
>>> from cellseg_gsontools.graphs import fit_graph
>>> w = fit_graph(gdf, type="distband", thres=75.0)
>>> local_diversity(
...     gdf,
...     spatial_weights=w,
...     val_col="eccentricity",
...     metrics=["simpson_index"],
... )
Source code in cellseg_gsontools/diversity.py
def local_diversity(
    gdf: gpd.GeoDataFrame,
    spatial_weights: W,
    val_col: Union[str, Tuple[str, ...]],
    id_col: str = None,
    metrics: Tuple[str, ...] = ("simpson_index",),
    scheme: str = "FisherJenks",
    parallel: bool = True,
    num_processes: int = -1,
    rm_nhood_cols: bool = True,
    col_prefix: str = None,
    create_copy: bool = True,
) -> gpd.GeoDataFrame:
    """Compute the local diversity/heterogeneity metric for cell neighborhood.

    Note:
        Allowed diversity metrics:

        - `simpson_index` - for both categorical and real valued neighborhoods
        - `shannon_index` - for both categorical and real valued neighborhoods
        - `gini_index` - for only real valued neighborhoods
        - `theil_index` - for only real valued neighborhoods

    Note:
        If `val_col` is not categorical, the values are binned using `mapclassify`.
        The bins are then used to compute the diversity metrics. If `val_col` is
        categorical, the values are used directly.

    Parameters:
        gdf (gpd.GeoDataFrame):
            The input GeoDataFrame.
        spatial_weights (libpysal.weights.W):
            Libpysal spatial weights object.
        val_col (Union[str, Tuple[str, ...]]):
            The name of the column in the gdf for which the diversity is computed.
            You can also pass in a tuple/list of columns, in which case the
            diversity is computed for each column.
        id_col (str):
            The unique id column in the gdf. If None, this uses `set_uid` to set it.
            Defaults to None.
        metrics (Tuple[str, ...]):
            A Tuple/List of diversity metrics. Allowed metrics: "shannon_index",
            "simpson_index", "gini_index", "theil_index". A single metric name
            given as a plain string is also accepted.
            Defaults to ("simpson_index",).
        scheme (str):
            `mapclassify` classification scheme. Defaults to "FisherJenks".
        parallel (bool):
            Flag whether to use parallel apply operations when computing the diversities.
            Defaults to True.
        num_processes (int, default=-1):
            The number of processes to use when parallel=True. If -1,
            this will use all available cores.
        rm_nhood_cols (bool):
            Flag, whether to remove the extra neighborhood columns from the result gdf.
            Defaults to True.
        col_prefix (str):
            Prefix for the new column names. Defaults to None.
        create_copy (bool):
            Flag whether to create a copy of the input gdf or not. Defaults to True.

    Raises:
        ValueError:
            If an illegal metric is given.

    Returns:
        gpd.GeoDataFrame:
            The input geodataframe with computed diversity metric columns added.

    Examples:
        Compute the simpson diversity of eccentricity values for each cell neighborhood
        >>> from cellseg_gsontools.diversity import local_diversity
        >>> from cellseg_gsontools.graphs import fit_graph
        >>> w = fit_graph(gdf, type="distband", thres=75.0)
        >>> local_diversity(
        ...     gdf,
        ...     spatial_weights=w,
        ...     val_col="eccentricity",
        ...     metrics=["simpson_index"],
        ... )
    """
    # A bare string would otherwise be iterated character by character and
    # rejected by the validation below.
    if isinstance(metrics, str):
        metrics = (metrics,)

    allowed = list(DIVERSITY_LOOKUP.keys())
    if not all(m in allowed for m in metrics):
        raise ValueError(
            f"Illegal metric in `metrics`. Got: {metrics}. Allowed metrics: {allowed}."
        )

    if create_copy:
        gdf = gdf.copy()

    # set uid
    if id_col is None:
        id_col = "uid"
        gdf = set_uid(gdf)

    # Shannon and Simpson indices operate on per-neighborhood class counts
    ret_counts = any(m in metrics for m in ("simpson_index", "shannon_index"))

    # Gini and Theil indices operate on the raw neighboring values
    gt = ("gini_index", "theil_index")
    ret_vals = any(m in metrics for m in gt)

    # Loop-invariant: resolve the output column prefix once
    prefix = "" if col_prefix is None else col_prefix

    # Get the immediate node neighborhood
    func = partial(neighborhood, spatial_weights=spatial_weights)
    gdf["nhood"] = gdf_apply(
        gdf,
        func,
        columns=[id_col],
        axis=1,
        parallel=parallel,
        num_processes=num_processes,
    )

    if isinstance(val_col, str):
        val_col = (val_col,)

    for col in val_col:
        values = gdf[col]
        counts_col = f"{col}_nhood_counts"
        vals_col = f"{col}_nhood_vals"

        # Track every temporary column created for this `col` so that all of
        # them (not just the one used by the last metric) can be removed.
        tmp_cols = []

        # Get the counts of the binned metric inside the neighborhoods
        if ret_counts:
            # Bins are only needed for the counts, and only if data is not
            # categorical
            bins = None
            if not is_categorical(values):
                bins = mapclassify.classify(values, scheme=scheme).bins

            func = partial(nhood_counts, values=values, bins=bins)
            gdf[counts_col] = gdf_apply(
                gdf,
                func,
                columns=["nhood"],
                axis=1,
                parallel=parallel,
                num_processes=num_processes,
            )
            tmp_cols.append(counts_col)

        if ret_vals:
            func = partial(nhood_vals, values=values)
            gdf[vals_col] = gdf_apply(
                gdf,
                func,
                columns=["nhood"],
                axis=1,
                parallel=parallel,
                num_processes=num_processes,
            )
            tmp_cols.append(vals_col)

        # Compute the diversity metrics from the matching neighborhood column
        for metric in metrics:
            colname = vals_col if metric in gt else counts_col

            gdf[f"{prefix}{col}_{metric}"] = gdf_apply(
                gdf,
                DIVERSITY_LOOKUP[metric],
                columns=[colname],
                parallel=parallel,
                num_processes=num_processes,
            )

        if rm_nhood_cols and tmp_cols:
            gdf = gdf.drop(labels=tmp_cols, axis=1)

    if rm_nhood_cols:
        gdf = gdf.drop(labels=["nhood"], axis=1)

    return gdf