# Source code for cycombinepy.cluster
"""SOM clustering wrapper around FlowSOM_Python.
Port of ``create_som`` in ``R/02_batch_correct.R:210``. Unlike the R version
(which supports kohonen / FlowSOM / FuseSOM / kmeans backends), the Python port
exclusively uses saeyslab's ``FlowSOM`` package.
"""
from __future__ import annotations
from typing import Iterable
import numpy as np
from anndata import AnnData
from cycombinepy._utils import marker_matrix, resolve_markers
# [docs]
def create_som(
    adata: AnnData,
    markers: Iterable[str] | None = None,
    xdim: int = 8,
    ydim: int = 8,
    n_clusters: int | None = None,
    seed: int = 473,
    rlen: int = 10,
    layer: str | None = None,
    label_key: str = "cycombine_som",
    copy: bool = False,
) -> AnnData | None:
    """Train a FlowSOM on the marker columns of ``adata`` and store cluster labels.

    Cells are mapped to either the raw SOM node (default) or a metacluster if
    ``n_clusters`` is supplied. The labels are written to
    ``adata.obs[label_key]`` as a categorical column whose categories are the
    string form of the integer labels, listed in ascending numeric order.

    Parameters
    ----------
    adata
        AnnData to cluster.
    markers
        Var names used for clustering. Defaults to :func:`cycombinepy.get_markers`.
    xdim, ydim
        SOM grid dimensions. Default 8x8 matches cyCombine.
    n_clusters
        If set, consensus-metacluster the SOM nodes into this many clusters.
        Must lie between 1 and ``xdim * ydim``.
    seed
        Random seed passed to FlowSOM.
    rlen
        Number of training passes. (Kept for parity with R; if the FlowSOM
        constructor raises ``TypeError`` with it, the call is retried without.)
    layer
        Read features from this layer rather than ``adata.X``.
    label_key
        Name of the ``adata.obs`` column to write labels into.
    copy
        If True, return a modified copy rather than mutating in place.

    Returns
    -------
    The labelled copy when ``copy`` is True, otherwise ``None`` (``adata`` is
    modified in place).

    Raises
    ------
    ValueError
        If ``xdim`` or ``ydim`` is not positive, or ``n_clusters`` is outside
        ``1..xdim * ydim``.
    ImportError
        If the ``flowsom`` package is not installed.
    """
    # Reject impossible grids up front, before the optional import and before
    # any data is copied, so the caller gets a direct message.
    if xdim < 1 or ydim < 1:
        raise ValueError(
            f"xdim and ydim must be positive integers, got xdim={xdim}, ydim={ydim}."
        )
    n_nodes = xdim * ydim
    if n_clusters is not None and not 1 <= n_clusters <= n_nodes:
        raise ValueError(
            f"n_clusters must be between 1 and xdim * ydim ({n_nodes}), "
            f"got {n_clusters}."
        )

    try:
        import flowsom as fs
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "create_som requires FlowSOM_Python. Install it with "
            "`pip install flowsom`."
        ) from exc
    import pandas as pd

    markers = resolve_markers(adata, markers)
    marker_names = list(markers)
    if copy:
        adata = adata.copy()
    X = marker_matrix(adata, markers, layer=layer)

    # FlowSOM_Python accepts an AnnData; build a minimal one with just the markers
    # we care about so that the SOM is trained on the correct columns.
    feat = AnnData(X=X.astype(np.float32), var={"marker": marker_names})
    feat.var_names = marker_names

    # The constructor always receives a cluster count; without a requested
    # metaclustering it is simply the number of SOM nodes.
    som_kwargs = {
        "cols_to_use": list(marker_names),
        "xdim": xdim,
        "ydim": ydim,
        "n_clusters": n_nodes if n_clusters is None else n_clusters,
        "seed": seed,
    }
    try:
        fsom = fs.FlowSOM(feat, rlen=rlen, **som_kwargs)
    except TypeError:
        # Older versions may not accept ``rlen``. A TypeError with another
        # cause is raised again by this retry, so it is not hidden.
        fsom = fs.FlowSOM(feat, **som_kwargs)

    obs_column = "clustering" if n_clusters is None else "metaclustering"
    labels = np.asarray(fsom.get_cell_data().obs[obs_column]).astype(int)

    # Store as 1-indexed categorical labels for parity with R.
    # NOTE(review): the shift is decided from the observed minimum, so a run in
    # which no cell received label 0 is left unshifted. Presumably the backend
    # is 0-based; confirm, then make the shift unconditional.
    if labels.size and labels.min() == 0:
        labels = labels + 1

    # Spell out the categories in numeric order; letting pandas infer them from
    # the strings would sort lexicographically ("1", "10", "11", "2", ...).
    ordered_categories = [str(value) for value in np.unique(labels)]
    adata.obs[label_key] = pd.Categorical(
        labels.astype(str), categories=ordered_categories
    )
    return adata if copy else None