Source code for cycombinepy.cluster

"""SOM clustering wrapper around FlowSOM_Python.

Port of ``create_som`` in ``R/02_batch_correct.R:210``. Unlike the R version
(which supports kohonen / FlowSOM / FuseSOM / kmeans backends), the Python port
exclusively uses saeyslab's ``FlowSOM`` package.
"""

from __future__ import annotations

from typing import Iterable

import numpy as np
from anndata import AnnData

from cycombinepy._utils import marker_matrix, resolve_markers



[docs]
def create_som(
    adata: AnnData,
    markers: Iterable[str] | None = None,
    xdim: int = 8,
    ydim: int = 8,
    n_clusters: int | None = None,
    seed: int = 473,
    rlen: int = 10,
    layer: str | None = None,
    label_key: str = "cycombine_som",
    copy: bool = False,
) -> AnnData | None:
    """Train a FlowSOM on the marker columns of ``adata`` and store cluster labels.

    Cells are mapped to either the raw SOM node (default) or a metacluster if
    ``n_clusters`` is supplied. The resulting integer labels are written to
    ``adata.obs[label_key]`` as a string-valued ``category`` whose categories
    are ordered numerically ("1", "2", ..., "10") rather than lexicographically.

    Parameters
    ----------
    adata
        AnnData to cluster.
    markers
        Var names used for clustering. Defaults to :func:`cycombinepy.get_markers`.
    xdim, ydim
        SOM grid dimensions. Default 8x8 matches cyCombine.
    n_clusters
        If set, consensus-metacluster the SOM nodes into this many clusters.
    seed
        Random seed passed to FlowSOM.
    rlen
        Number of training passes. (Kept for parity with R; forwarded if the
        installed FlowSOM version accepts it.)
    layer
        Read features from this layer rather than ``adata.X``.
    label_key
        Name of the ``adata.obs`` column to write labels into.
    copy
        If True, return a modified copy rather than mutating in place.

    Returns
    -------
    AnnData or None
        The labelled copy when ``copy`` is True; otherwise ``None`` (``adata``
        is modified in place).

    Raises
    ------
    ValueError
        If ``xdim`` or ``ydim`` is smaller than 1, or if ``n_clusters`` is not
        between 1 and ``xdim * ydim``.
    ImportError
        If the ``flowsom`` package is not installed.
    """
    # Validate cheap arguments first so misuse fails fast with a clear message
    # instead of surfacing from deep inside FlowSOM.
    if xdim < 1 or ydim < 1:
        raise ValueError(
            f"xdim and ydim must be positive integers, got xdim={xdim}, ydim={ydim}."
        )
    n_nodes = xdim * ydim
    if n_clusters is not None and not 1 <= n_clusters <= n_nodes:
        raise ValueError(
            f"n_clusters must be between 1 and xdim * ydim ({n_nodes}), "
            f"got {n_clusters}."
        )

    try:
        import flowsom as fs
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "create_som requires FlowSOM_Python. Install it with "
            "`pip install flowsom`."
        ) from exc

    import pandas as pd

    # Materialise once: the names are reused several times below, which would
    # silently break if a one-shot iterable were passed around.
    marker_names = list(resolve_markers(adata, markers))
    if copy:
        adata = adata.copy()

    X = marker_matrix(adata, marker_names, layer=layer)

    # FlowSOM_Python accepts an AnnData; build a minimal one with just the markers
    # we care about so that the SOM is trained on the correct columns.
    feat = AnnData(X=X.astype(np.float32), var={"marker": marker_names})
    feat.var_names = marker_names

    # FlowSOM is always given an ``n_clusters``; when the caller did not ask for
    # metaclustering, one metacluster per node is requested and the result is
    # ignored in favour of the raw node labels below.
    # NOTE(review): confirm the installed FlowSOM handles n_clusters == xdim * ydim.
    som_kwargs = {
        "cols_to_use": marker_names,
        "xdim": xdim,
        "ydim": ydim,
        "n_clusters": n_clusters if n_clusters is not None else n_nodes,
        "seed": seed,
    }
    try:
        fsom = fs.FlowSOM(feat, rlen=rlen, **som_kwargs)
    except TypeError:
        # Older versions may not accept ``rlen``.
        fsom = fs.FlowSOM(feat, **som_kwargs)

    label_column = "clustering" if n_clusters is None else "metaclustering"
    labels = np.asarray(fsom.get_cell_data().obs[label_column]).astype(int)

    # Store as 1-indexed categorical labels for parity with R.
    # NOTE(review): the shift is applied only when label 0 is actually observed;
    # this presumes FlowSOM labels are either 0- or 1-based — confirm upstream.
    if labels.size and labels.min() == 0:
        labels = labels + 1

    # Explicit categories keep the numeric order; a plain ``astype("category")``
    # on strings would sort them as "1", "10", "11", ..., "2".
    ordered_names = np.unique(labels).astype(str)
    adata.obs[label_key] = pd.Categorical(
        labels.astype(str), categories=ordered_names
    )

    return adata if copy else None