FeatureMAP on synthetic data of mixture-of-Gaussian with linear and one isolate pattern.
[1]:
import featuremap
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
import warnings
warnings.simplefilter("ignore", category=UserWarning)
warnings.simplefilter("ignore", category=FutureWarning)
warnings.simplefilter("ignore", category=DeprecationWarning)
/Users/uqyyao4/opt/anaconda3/envs/featmap/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
Data loading.
[2]:
import numpy as np
def sample_multidimensional_mixture(w, means, covariances, n_samples=1):
"""
Sample from a mixture of multivariate normal distributions.
Parameters:
w (list or np.array): Weights of the mixture components (should sum to 1).
means (list of np.array): List of mean vectors for each component.
covariances (list of np.array): List of covariance matrices for each component.
n_samples (int): Number of samples to generate.
Returns:
np.array: Samples drawn from the mixture distribution.
"""
# Ensure w is a numpy array
w = np.array(w)
# Check that the input is valid
assert len(w) == len(means) == len(covariances), "Lengths of w, means, and covariances must be equal."
assert np.isclose(np.sum(w), 1), "Weights w must sum to 1."
# Step 1: Sample indices from the categorical distribution defined by w
indices = np.random.choice(len(w), size=n_samples, p=w)
# Step 2: Sample from the multivariate normal distribution corresponding to the selected index
samples = np.array([np.random.multivariate_normal(means[i], covariances[i]) for i in indices])
return samples, indices
# Example usage
w = [0.25, 0.25, 0.25, 0.25] # Weights of the mixture components
# Linear
# Means for each of the 3 components (20-dimensional)
means = [
np.zeros(20), # Mean vector for the first component
np.ones(20) * 1, # Mean vector for the second component
np.ones(20) * 2, # Mean vector for the third component
np.ones(20) * 4 # Mean vector for the fourth component
]
# Covariance matrices for each of the 3 components (20x20 matrices)
covariances = [
np.eye(20) * 1, # Identity matrix as covariance for the first component
np.eye(20) * 1, # Scaled identity matrix for the second component
np.eye(20) * 1, # Scaled identity matrix for the third component
np.eye(20) * 1 # Scaled identity matrix for the third component
]
n_samples = 3000 # Number of samples to generate
samples, indices = sample_multidimensional_mixture(w, means, covariances, n_samples)
print(indices[:10]) # Print the indices of the first 10 samples
# Print the shape of the generated data
# print(samples.shape) # Should be (1000, 20)
# Print the first sample
# print(samples[0])
data = samples
data_pseudotime = np.array(indices)
[1 1 1 2 2 3 2 3 2 1]
[3]:
# statistics of data_pseudotime
import collections
counter=collections.Counter(data_pseudotime)
print(counter)
Counter({3: 784, 1: 749, 0: 749, 2: 718})
Visualize the data.
[4]:
import numpy as np
# import pca
from sklearn.decomposition import PCA
# plot the data by PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(data)
x_min, x_max = X_pca[:, 0].min() - 1, X_pca[:, 0].max() + 1
y_min, y_max = X_pca[:, 1].min() - 1, X_pca[:, 1].max() + 1
min_lim = min(x_min, y_min)
max_lim = max(x_max, y_max)
# contour plot of the data
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity
# Create a grid of points
x = np.linspace(min_lim, max_lim, 100)
y = np.linspace(min_lim, max_lim, 100)
X, Y = np.meshgrid(x, y)
# Stack the grid points to create a 2D input for the KDE model
xy = np.vstack([X.ravel(), Y.ravel()]).T
# Fit a KDE model to the data
kde = KernelDensity(bandwidth=0.5)
kde.fit(X_pca)
# Evaluate the KDE model on the grid
Z = np.exp(kde.score_samples(xy))
Z = Z.reshape(X.shape)
# Plot the contour plot
plt.figure(figsize=(10, 7))
# plt.contourf(X, Y, Z, cmap='Blues')
plt.contourf(X, Y, Z,levels=15, cmap='Blues',linewidths=0.5, )
# Add lines to mark level boundaries
contour_lines = plt.contour(X, Y, Z, levels=15, colors='blue', linewidths=0.5)
# plt.colorbar()
# plt.scatter(X_pca[:, 0], X_pca[:, 1], c=data_pseudotime, cmap='tab10')
plt.xlim(min_lim, max_lim)
plt.ylim(min_lim, max_lim)
plt.xlabel('First principal component')
plt.ylabel('Second principal component')
plt.title('Contour plot of the data')
plt.show()
FeatureMAP to visualize the data.
FeatureMAP expression embedding.
[5]:
from featuremap.featuremap_ import _preprocess_data
emb_svd, vh = _preprocess_data(data)
emb_featuremap = featuremap.FeatureMAP(
n_neighbors=30,
min_dist=0.3,
random_state=42,
n_epochs=400,
output_variation=False,
feat_gauge_coefficient=2,
verbose=True,
).fit(emb_svd)
FeatureMAP(feat_gauge_coefficient=2, min_dist=0.3, n_epochs=400, random_state=42, verbose=True)
Mon Feb 10 17:37:07 2025 Construct fuzzy simplicial set
Mon Feb 10 17:37:07 2025 Finding Nearest Neighbors
Mon Feb 10 17:37:07 2025 Building RP forest with 8 trees
Mon Feb 10 17:37:11 2025 NN descent for 12 iterations
1 / 12
2 / 12
3 / 12
Stopping threshold met -- exiting after 3 iterations
Mon Feb 10 17:37:24 2025 Finished Nearest Neighbor Search
Mon Feb 10 17:37:27 2025 Construct embedding
Mon Feb 10 17:37:27 2025 Computing tangent space
Mon Feb 10 17:37:31 2025 Local SVD time is 4.491661071777344
Mon Feb 10 17:37:31 2025Applying graph convolution for 5 iterations...
Mon Feb 10 17:37:32 2025Graph convolution completed in 0.81 seconds
Mon Feb 10 17:37:32 2025 Tangent_space_approximation time is 5.343469858169556
k is 10
Mon Feb 10 17:37:49 2025 Tangent space embedding
Mon Feb 10 17:37:49 2025 Start optimizing layout
Epochs completed: 100%| ██████████ 400/400 [00:25]
Mon Feb 10 17:38:15 2025 Optimize layout time is 26.005359172821045
Mon Feb 10 17:38:15 2025 Finished embedding
[6]:
from featuremap import features
import importlib
importlib.reload(features)
adata = features.create_adata(X=data, emb_featuremap=emb_featuremap)
# adata.var_names = mnist.data.columns.to_list()
adata.obsm['X_svd'] = emb_svd
adata.varm['svd_vh'] = vh.T
adata.obsm['X_featmap'] = emb_featuremap.embedding_
adata.obs['pseudotime'] = data_pseudotime
adata.obs['pseudotime'] = adata.obs['pseudotime'].astype(str)
# adata.obsm['X_umap'] = emb_umap
import scanpy as sc
sc.pl.embedding(adata, basis='X_featmap', color='pseudotime', legend_loc='on data')
mu is not added to adata
FeatureMAP variation embedding.
[7]:
emb_featuremap_v = featuremap.FeatureMAP(
n_neighbors=30,
random_state=42,
output_variation=True,
n_epochs=400,
).fit(emb_svd)
adata.obsm["X_featmap_v"] = emb_featuremap_v.embedding_
adata.obsm['variation_pc'] = emb_featuremap_v._featuremap_kwds['variation_pc']
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
plt.figure(dpi=100)
embedding_df = pd.DataFrame(adata.obsm["X_featmap_v"], index=adata.obs_names, columns=['dim_0', 'dim_1'])
embedding_df['pseudotime'] = data_pseudotime
sns.scatterplot(x='dim_0',y='dim_1', hue='pseudotime', data=embedding_df, palette='Spectral', s=10)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
[7]:
<matplotlib.legend.Legend at 0x7fa78a3af700>
Benchmark with other DR methods
[8]:
def plot_func(emb, label, method):
x_min, x_max = emb[:, 0].min(), emb[:, 0].max()
y_min, y_max = emb[:, 1].min(), emb[:, 1].max()
min_lim = min(x_min - (x_min+x_max)/2, y_min - (y_min+y_max)/2)
max_lim = max(x_max +(x_min+x_max)/2, y_max+ (y_min+y_max)/2)
plt.figure(figsize=(10, 7))
plt.scatter(emb[:, 0], emb[:, 1], c=label, cmap='tab10')
plt.xlim(min_lim, max_lim)
plt.ylim(min_lim, max_lim)
# plt.colorbar()
plt.xlabel(f'{method} 1')
plt.ylabel(f'{method} 2')
plt.title(f'{method} plot')
plt.show()
### Benchmark with other DR methods
# UMAP plot
import umap
emb_umap = umap.UMAP().fit_transform(data)
adata.obsm['X_umap'] = emb_umap
# plot the data by UMAP
plot_func(emb_umap, data_pseudotime, 'UMAP')
# DensMAP plot
import umap
emb_densmap = umap.UMAP(densmap=True).fit_transform(data)
adata.obsm['X_densmap'] = emb_densmap
# plot the data by DensMAP
plot_func(emb_densmap, data_pseudotime, 'DensMAP')
# plot the data by t-SNE
from sklearn.manifold import TSNE
emb_tsne = TSNE(n_components=2, random_state=42).fit_transform(data)
adata.obsm['X_tsne'] = emb_tsne
plot_func(emb_tsne, data_pseudotime, 't-SNE')
# plot the data by phate
import phate
phate_op = phate.PHATE()
emb_phate = phate_op.fit_transform(data)
adata.obsm['X_phate'] = emb_phate
plot_func(emb_phate, data_pseudotime, 'PHATE')
# plot data by PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(data)
adata.obsm['X_pca'] = X_pca
plot_func(X_pca, data_pseudotime, 'PCA')
Calculating PHATE...
Running PHATE on 3000 observations and 20 variables.
Calculating graph and diffusion operator...
Calculating KNN search...
Calculated KNN search in 0.41 seconds.
Calculating affinities...
Calculated affinities in 0.03 seconds.
Calculated graph and diffusion operator in 0.46 seconds.
Calculating landmark operator...
Calculating SVD...
Calculated SVD in 0.54 seconds.
Calculating KMeans...
Calculated KMeans in 19.55 seconds.
Calculated landmark operator in 21.43 seconds.
Calculating optimal t...
Automatically selected t = 10
Calculated optimal t in 21.93 seconds.
Calculating diffusion potential...
Calculated diffusion potential in 31.96 seconds.
Calculating metric MDS...
Calculated metric MDS in 7.88 seconds.
Calculated PHATE in 83.68 seconds.
[9]:
emb_phate_obj = phate_op.fit(data)
phate_graph = emb_phate_obj.graph.to_igraph()
# number of nodes
print(phate_graph.vcount())
# to networkx
phate_graph_nx = phate_graph.to_networkx()
Running PHATE on 3000 observations and 20 variables.
Calculating graph and diffusion operator...
Calculating KNN search...
Calculated KNN search in 0.27 seconds.
Calculating affinities...
Calculated affinities in 0.02 seconds.
Calculated graph and diffusion operator in 0.30 seconds.
Calculating landmark operator...
Calculating SVD...
Calculated SVD in 0.56 seconds.
Calculating KMeans...
Calculated KMeans in 17.01 seconds.
Calculated landmark operator in 19.89 seconds.
3000
[10]:
from featuremap.features import feature_variation, feature_variation_embedding
feature_variation(adata, threshold=0.9)
k is 10
Start matrix multiplication
Finish matrix multiplication in 0.030820846557617188
Finish norm calculation in 0.0027840137481689453
Transition and core states identification by density, curvature and betweenness centrality
Use density to define transition and core states.
[11]:
##################################
# Contour plot to show the density
######################################
from featuremap import core_transition_states
import importlib
importlib.reload(core_transition_states)
from featuremap.core_transition_states import plot_density
plot_density(adata)
#%%
#######################################################
# Compute core-states based on clusters
#########################################################
quantile_core = 0.5
quantile_trans = 0.2
from featuremap.core_transition_states import compute_density
compute_density(adata, quantile_core=quantile_core, quantile_trans=quantile_trans)
# import scanpy as sc
# sc.pl.embedding(adata, basis='X_featmap_v', color='core_trans_states', )
sc.pl.embedding(adata, 'featmap_v',legend_fontsize=6, s=10, legend_loc='on data', color='density')
# plot histogram of density
import seaborn as sns
import matplotlib.pyplot as plt
density = adata.obs['density']
plt.figure(dpi=100)
sns.histplot(density, bins=50)
plt.xlabel('Density')
plt.ylabel('Frequency')
plt.title('Histogram of Density')
threshold_core = density.quantile(quantile_core)
plt.axvline(threshold_core, color='red', linestyle='--', label='Core threshold')
threshold_trans = density.quantile(quantile_trans)
plt.axvline(threshold_trans, color='blue', linestyle='--', label='Transition threshold')
plt.legend()
plt.show()
<Figure size 640x480 with 0 Axes>
Use curvature to define transition and core states.
Curvature by variation
Consider a car driving along a curvy road. The tighter the curve, the more difficult the driving is. In math we have a number, the curvature, that describes this “tightness”. If the curvature is zero then the curve looks like a line near this point. While if the curvature is a large number, then the curve has a sharp bend.
[12]:
from featuremap import core_transition_states
import importlib
importlib.reload(core_transition_states)
quantile_core = 0.6
quantile_trans = 0.8
core_transition_states.compute_curvature(adata, emb_featuremap, quantile_core=quantile_core, quantile_trans=quantile_trans)
sc.pl.embedding(adata, 'featmap',legend_fontsize=6, s=10, legend_loc='on data', color='curvature')
# plot histogram of curvature
import seaborn as sns
import matplotlib.pyplot as plt
curvature = adata.obs['curvature']
plt.figure(dpi=100)
sns.histplot(curvature, bins=50)
plt.xlabel('Curvature')
plt.ylabel('Frequency')
plt.title('Histogram of Curvature')
threshold_core = curvature.quantile(quantile_core)
plt.axvline(threshold_core, color='red', linestyle='--', label=f'Core threshold')
threshold_trans = curvature.quantile(quantile_trans)
plt.axvline(threshold_trans, color='blue', linestyle='--', label=f'Transition threshold')
plt.legend()
plt.show()
Use betweeness centrality to define transition states.
[13]:
from featuremap import core_transition_states
import importlib
importlib.reload(core_transition_states)
quantile_trans = 0.8
quantile_core = 0.2
core_transition_states.compute_betweenness_centrality(adata, emb_featuremap, quantile_core=quantile_core, quantile_trans=quantile_trans)
betweenness_centrality = adata.obs['betweenness_centrality'].copy()
plt.hist(betweenness_centrality, bins=50)
threshold_core = betweenness_centrality.quantile(quantile_core)
plt.axvline(threshold_core, color='red', linestyle='--', label=f'Core threshold')
threshold_trans = betweenness_centrality.quantile(quantile_trans)
plt.axvline(threshold_trans, color='blue', linestyle='--', label=f'Transition threshold')
plt.legend()
plt.xlabel('Betweenness Centrality')
plt.ylabel('Frequency')
plt.title('Histogram of Betweenness Centrality')
plt.show()
Visualize the results of transition and core states by density, curvature and betweenness centrality.
[14]:
adata.obsm['X_FeatureMAP'] = adata.obsm['X_featmap']
adata.obsm['X_FeatureMAP_v'] = adata.obsm['X_featmap_v']
# set figure size
sc.set_figure_params(figsize=(4, 3),fontsize=18)
sc.pl.embedding(adata, basis='FeatureMAP', color=['density', 'density_core', 'density_transition'], cmap='jet', save='_linear_iso_density.png')
sc.pl.embedding(adata, basis='FeatureMAP_v', color=['curvature', 'curvature_core', 'curvature_transition'], cmap='jet', save='_linear_iso_curvature.png')
sc.pl.embedding(adata, basis='FeatureMAP_v', color=['betweenness_centrality', 'betweenness_centrality_core', 'betweenness_centrality_transition'],
cmap='jet',save='_linear_iso_betweenness_centrality.png')
WARNING: saving figure to file figures/FeatureMAP_linear_iso_density.png
WARNING: saving figure to file figures/FeatureMAP_v_linear_iso_curvature.png
WARNING: saving figure to file figures/FeatureMAP_v_linear_iso_betweenness_centrality.png
Union the results from density, curvature and betweenness centrality
[15]:
from featuremap import core_transition_states
import importlib
importlib.reload(core_transition_states)
core_transition_states.plot_core_transition_states(adata)
Compute the cluster state labels based on the percentage of core_states and transition_states for each cluster.
[16]:
import anndata as ad
adata_var = ad.AnnData(X=adata.obsm['variation_pc'], obs=adata.obs)
adata_var.obsm['X_featmap_v'] = adata.obsm['X_featmap_v']
# adata_var.obs['clusters'] = adata.obs['clusters']
# leiiden clustering on variation embedding
sc.pp.pca(adata_var)
sc.pp.neighbors(adata_var, n_neighbors=5,)
sc.tl.leiden(adata_var, resolution=0.6)
adata.obs['leiden_v'] = adata_var.obs['leiden']
from featuremap import core_transition_states
import importlib
importlib.reload(core_transition_states)
# plot the leiden clustering on the variation embedding
sc.pl.embedding(adata, basis='FeatureMAP_v', color='leiden_v', legend_loc='on data')
core_transition_states.compute_cluster_state_labels(adata)
[17]:
# visualize cluster_state_label in UMAP, DensMAP, PHATE, t-SNE, PCA
sc.pl.embedding(adata, basis='X_featmap_v', color='cluster_state_label', cmap='Blues_r', s=10, frameon=False)
sc.pl.embedding(adata, basis='X_umap', color='cluster_state_label', cmap='Blues_r', s=10, frameon=False)
sc.pl.embedding(adata, basis='X_densmap', color='cluster_state_label', cmap='Blues_r', s=10, frameon=False)
sc.pl.embedding(adata, basis='X_phate', color='cluster_state_label', cmap='Blues_r', s=10, frameon=False)
sc.pl.embedding(adata, basis='X_tsne', color='cluster_state_label', cmap='Blues_r', s=10, frameon=False)
sc.pl.embedding(adata, basis='X_pca', color='cluster_state_label', cmap='Blues_r', s=10,frameon=False)
Project the identified transition and core states back to the original data
[18]:
X_pca = adata.obsm['X_pca']
x_min, x_max = X_pca[:, 0].min() - 1, X_pca[:, 0].max() + 1
y_min, y_max = X_pca[:, 1].min() - 1, X_pca[:, 1].max() + 1
min_lim = min(x_min, y_min)
max_lim = max(x_max, y_max)
# contour plot of the data
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity
# Create a grid of points
x = np.linspace(min_lim, max_lim, 100)
y = np.linspace(min_lim, max_lim, 100)
X, Y = np.meshgrid(x, y)
# Stack the grid points to create a 2D input for the KDE model
xy = np.vstack([X.ravel(), Y.ravel()]).T
# Fit a KDE model to the data
kde = KernelDensity(bandwidth=0.5)
kde.fit(X_pca)
# Evaluate the KDE model on the grid
Z = np.exp(kde.score_samples(xy))
Z = Z.reshape(X.shape)
color = adata.obs['cluster_state_label'].cat.codes
# create color map by [#1f77b4, #ff7f0e]
from matplotlib.colors import ListedColormap
cmap = ListedColormap(['#1f77b4', '#ff7f0e'])
# # create a colormap by ['orange', 'blue']
# from matplotlib.colors import ListedColormap
# cmap = ListedColormap(['blue', 'orange'])
# Plot the contour plot
plt.figure(figsize=(10, 7))
plt.contourf(X, Y, Z,levels=15, cmap='Blues',linewidths=0.5, )
# Add lines to mark level boundaries
contour_lines = plt.contour(X, Y, Z, levels=15, colors='blue', linewidths=0.5)
# plt.colorbar()
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=color, cmap=cmap, alpha=0.2)
plt.xlim(min_lim, max_lim)
plt.ylim(min_lim, max_lim)
plt.xlabel('First principal component')
plt.ylabel('Second principal component')
plt.title('Contour plot of the data')
plt.show()
Compare the clustering coefficient in clustersing by expression and variation space
[19]:
from featuremap import core_transition_states
import networkx as nx
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Create graphs from feature maps
G_expr = nx.from_numpy_matrix(emb_featuremap.graph_)
G_var = nx.from_numpy_matrix(emb_featuremap_v._featuremap_kwds['graph_v'])
G_phate = phate_graph_nx
# Get clusters
clusters = adata.obs['leiden_v'].values.tolist()
# Compute weighted clustering coefficients
cluster_coefficients_var = core_transition_states.clustering_coefficient_by_cluster(G_var, clusters)
cluster_coefficients_expr = core_transition_states.clustering_coefficient_by_cluster(G_expr, clusters)
cluster_coefficients_phate = core_transition_states.clustering_coefficient_by_cluster(G_phate, clusters)
# Create DataFrames
df_var = pd.DataFrame({"Cluster": list(cluster_coefficients_var.keys()), "Coefficient": list(cluster_coefficients_var.values()), "type": 'variation_graph'})
df_expr = pd.DataFrame({"Cluster": list(cluster_coefficients_expr.keys()), "Coefficient": list(cluster_coefficients_expr.values()), "type": 'expression_graph'})
df_phate = pd.DataFrame({"Cluster": list(cluster_coefficients_phate.keys()), "Coefficient": list(cluster_coefficients_phate.values()), "type": 'phate_graph'})
# Combine DataFrames
df = pd.concat([df_var, df_expr, df_phate])
# Create a bar plot
plt.figure(figsize=(10, 6))
sns.barplot(x="Cluster", y="Coefficient", hue='type', palette='Set2', data=df)
# Add value on each bar
for index, row in df.iterrows():
plt.text(index, row.Coefficient, round(row.Coefficient, 2), color='black', ha="center")
plt.title("Cluster-wise weighted clustering coefficients")
plt.xlabel("Cluster")
plt.ylabel("Weighted clustering coefficient")
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
[24]:
import networkx as nx
from featuremap import core_transition_states
import importlib
importlib.reload(core_transition_states)
from sklearn.metrics import pairwise_distances, silhouette_score
from scipy.stats import ttest_ind, f_oneway
import matplotlib.pyplot as plt
import seaborn as sns
# Graph distance matrices
G_expr = nx.from_numpy_matrix(emb_featuremap.graph_)
G_var = nx.from_numpy_matrix(emb_featuremap_v._featuremap_kwds['graph_v'])
G_phate = phate_graph_nx
# Get the weighted adjacency matrix
adjacency_expr = nx.to_numpy_array(G_expr)
adjacency_var = nx.to_numpy_array(G_var)
adjacency_phate = nx.to_numpy_array(G_phate)
# Compute the distance matrices
dist_mat_expr = pairwise_distances(adjacency_expr, metric='euclidean')
dist_mat_var = pairwise_distances(adjacency_var, metric='euclidean')
dist_mat_phate = pairwise_distances(adjacency_phate, metric='euclidean')
labels = np.array(adata.obs['leiden_v'].values.tolist())
clusters = np.unique(labels)
ss_clusters_expr, ss_clusters_var, ss_clusters_phate = {}, {}, {}
for cluster in clusters:
cluster_indices = np.where(labels == cluster)[0]
ss_clusters_expr[cluster] = np.mean([core_transition_states.silhouette_score_one_point(dist_mat_expr, labels, idx) for idx in cluster_indices])
ss_clusters_var[cluster] = np.mean([core_transition_states.silhouette_score_one_point(dist_mat_var, labels, idx) for idx in cluster_indices])
ss_clusters_phate[cluster] = np.mean([core_transition_states.silhouette_score_one_point(dist_mat_phate, labels, idx) for idx in cluster_indices])
ss_all = []
def compute_silhouette_scores(state_type, ss_clusters):
cluster_state_dict = adata.uns['cluster_state_dict']
states = [cluster for cluster, state in cluster_state_dict.items() if state == state_type]
return [ss_clusters[cluster] for cluster in states]
ss_all.append(compute_silhouette_scores('transition_states', ss_clusters_expr))
ss_all.append(compute_silhouette_scores('core_states', ss_clusters_expr))
ss_all.append(compute_silhouette_scores('transition_states', ss_clusters_phate))
ss_all.append(compute_silhouette_scores('core_states', ss_clusters_phate))
ss_all.append(compute_silhouette_scores('transition_states', ss_clusters_var))
ss_all.append(compute_silhouette_scores('core_states', ss_clusters_var))
# T-test and ANOVA
f_stat_1, p_val_1 = f_oneway(ss_all[1], ss_all[3], ss_all[5])
print(f'ANOVA: f_stat: {f_stat}, p_val: {p_val}')
f_stat_2, p_val_2 = f_oneway(ss_all[0], ss_all[2], ss_all[4])
print(f'ANOVA: f_stat: {f_stat}, p_val: {p_val}')
# Plotting
def plot_silhouette_scores(data, labels, title, p_val):
plt.figure(figsize=(10, 6))
box = sns.boxplot(data=data, palette="tab10", showfliers=False)
hatch_patterns = ['/', '\\', 'x']
for patch, hatch in zip(box.patches, hatch_patterns):
patch.set_hatch(hatch)
sns.stripplot(data=data, color=".3", jitter=True, dodge=True)
plt.xticks(ticks=range(len(labels)), labels=labels)
plt.title(title)
plt.ylabel("Silhouette score")
plt.text(0.2, 0.8, f"p-value: {p_val:.2e}", ha='center', va='center', transform=plt.gca().transAxes)
plt.show()
plot_silhouette_scores([ss_all[1], ss_all[3], ss_all[5]], ['UMAP graph', 'PHATE graph', 'Variation graph'], "Silhouette score in core states", p_val_1)
plot_silhouette_scores([ss_all[0], ss_all[2], ss_all[4]], ['UMAP graph', 'PHATE graph', 'Variation graph'], "Silhouette score in transition states", p_val_2)
ANOVA: f_stat: 12.231996273568905, p_val: 0.001270287901695751
ANOVA: f_stat: 12.231996273568905, p_val: 0.001270287901695751
Compute Silhouette score of different methods.
[21]:
import networkx as nx
from featuremap import core_transition_states
import importlib
importlib.reload(core_transition_states)
from sklearn.metrics import pairwise_distances, silhouette_score
from scipy.stats import ttest_ind, f_oneway
import matplotlib.pyplot as plt
import seaborn as sns
# Graph distance matrices
G_expr = nx.from_numpy_matrix(emb_featuremap.graph_)
G_var = nx.from_numpy_matrix(emb_featuremap_v._featuremap_kwds['graph_v'])
G_phate = phate_graph_nx
# Get the weighted adjacency matrix
adjacency_expr = nx.to_numpy_array(G_expr)
adjacency_var = nx.to_numpy_array(G_var)
adjacency_phate = nx.to_numpy_array(G_phate)
# Compute the distance matrices
dist_mat_expr = pairwise_distances(adjacency_expr, metric='euclidean')
dist_mat_var = pairwise_distances(adjacency_var, metric='euclidean')
dist_mat_phate = pairwise_distances(adjacency_phate, metric='euclidean')
labels = np.array(adata.obs['leiden_v'].values.tolist())
clusters = np.unique(labels)
ss_clusters_expr, ss_clusters_var, ss_clusters_phate = {}, {}, {}
for cluster in clusters:
cluster_indices = np.where(labels == cluster)[0]
ss_clusters_expr[cluster] = np.mean([core_transition_states.silhouette_score_one_point(dist_mat_expr, labels, idx) for idx in cluster_indices])
ss_clusters_var[cluster] = np.mean([core_transition_states.silhouette_score_one_point(dist_mat_var, labels, idx) for idx in cluster_indices])
ss_clusters_phate[cluster] = np.mean([core_transition_states.silhouette_score_one_point(dist_mat_phate, labels, idx) for idx in cluster_indices])
ss_all = []
def compute_silhouette_scores(state_type, ss_clusters):
cluster_state_dict = adata.uns['cluster_state_dict']
states = [cluster for cluster, state in cluster_state_dict.items() if state == state_type]
return [ss_clusters[cluster] for cluster in states]
ss_all.append(compute_silhouette_scores('transition_states', ss_clusters_expr))
ss_all.append(compute_silhouette_scores('core_states', ss_clusters_expr))
ss_all.append(compute_silhouette_scores('transition_states', ss_clusters_phate))
ss_all.append(compute_silhouette_scores('core_states', ss_clusters_phate))
ss_all.append(compute_silhouette_scores('transition_states', ss_clusters_var))
ss_all.append(compute_silhouette_scores('core_states', ss_clusters_var))
# T test and ANOVA
t_stat, p_val = ttest_ind(ss_all[2], ss_all[3])
print(f'T-test: t_stat: {t_stat}, p_val: {p_val}')
f_stat, p_val = f_oneway(ss_all[4], ss_all[5], ss_all[0])
print(f'ANOVA: f_stat: {f_stat}, p_val: {p_val}')
# Plotting
def plot_silhouette_scores(data, labels, title, p_val):
plt.figure(figsize=(10, 6))
box = sns.boxplot(data=data, palette="tab10", showfliers=False)
hatch_patterns = ['/', '\\', 'x']
for patch, hatch in zip(box.patches, hatch_patterns):
patch.set_hatch(hatch)
sns.stripplot(data=data, color=".3", jitter=True, dodge=True)
plt.xticks(ticks=range(len(labels)), labels=labels)
plt.title(title)
plt.ylabel("Silhouette score")
plt.text(0.2, 0.8, f"p-value: {p_val:.2e}", ha='center', va='center', transform=plt.gca().transAxes)
plt.show()
plot_silhouette_scores([ss_all[4], ss_all[5], ss_all[0]], ['UMAP graph', 'PHATE graph', 'Variation graph'], "Silhouette score in core states", p_val)
plot_silhouette_scores([ss_all[0], ss_all[2], ss_all[4]], ['UMAP graph', 'PHATE graph', 'Variation graph'], "Silhouette score in transition states", p_val)
T-test: t_stat: 4.275210848071979, p_val: 0.0016236023022129006
ANOVA: f_stat: 12.231996273568905, p_val: 0.001270287901695751
[22]:
adata
[22]:
AnnData object with n_obs × n_vars = 3000 × 20
obs: 'pseudotime', 'intrinsic_dim', 'leiden', 'density', 'density_core', 'density_largest', 'density_transition', 'density_core_id', 'density_transition_id', 'curvature', 'curvature_core', 'curvature_transition', 'degree', 'betweenness_centrality', 'betweenness_centrality_transition', 'betweenness_centrality_core', 'core_states', 'transition_states', 'leiden_v', 'cluster_state_label'
uns: 'pseudotime_colors', 'neighbors', 'leiden', 'core_states_colors', 'transition_states_colors', 'leiden_v_colors', 'cluster_state_label_colors', 'cluster_state_dict'
obsm: 'X_featmap', '_knn_indices', 'U', 'Singular_value', 'vh_smoothed', 'variation_pc', 'variation_embedding', 'gauge_v1_emb', 'gauge_v2_emb', 'VH_embedding', 'R', 'mu_sum', 'X_embedding', 'X_svd', 'X_featmap_v', 'X_umap', 'X_densmap', 'X_tsne', 'X_phate', 'X_pca', 'pcVals_project_back', 'X_FeatureMAP', 'X_FeatureMAP_v'
varm: 'svd_vh'
layers: 'variation_feature'
obsp: 'distances', 'connectivities'
[23]:
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
# Define resolutions
resolutions = [1.5, 1.8, 2, 2.1, 2.2, 2.3, 2.4, 2.5, 3]
# Dictionaries to store core and transition state percentages
core_states_percentage_dict = {}
transition_states_percentage_dict = {}
for res in resolutions:
# Perform Leiden clustering for the given resolution
leiden_key = f'leiden_expr_{res}'
sc.tl.leiden(adata, resolution=res, key_added=leiden_key)
# Visualize clusters on FeatureMAP embedding
sc.pl.embedding(adata, basis='FeatureMAP_v', color=leiden_key, legend_loc='on data')
# Initialize storage for cluster-wise percentages
core_states_percentage = []
transition_states_percentage = []
# Compute cluster-wise percentages
clusters = adata.obs[leiden_key].unique()
for cluster in clusters:
cluster_cells = adata.obs[leiden_key] == cluster
total_cells = cluster_cells.sum()
core_cells = (adata.obs['core_states']=='1') & cluster_cells
transition_cells = (adata.obs['transition_states']=='1') & cluster_cells
core_percentage = (core_cells.sum() / total_cells) * 100
transition_percentage = (transition_cells.sum() / total_cells) * 100
core_states_percentage.append(core_percentage)
transition_states_percentage.append(transition_percentage)
# Store results in dictionaries
core_states_percentage_dict[res] = core_states_percentage
transition_states_percentage_dict[res] = transition_states_percentage
# Convert to DataFrame for plotting
df = pd.DataFrame({
"Cluster": clusters,
"Core States (%)": core_states_percentage,
"Transition States (%)": transition_states_percentage
})
# Plot for the current resolution
fig, ax = plt.subplots(figsize=(8, 5))
df.set_index("Cluster").plot(kind="bar", stacked=True, ax=ax, color=["blue", "orange"])
ax.set_xlabel("Clusters")
ax.set_ylabel("Percentage")
ax.set_title(f"Core and Transition State Percentages (Resolution = {res})")
ax.legend(["Core States", "Transition States"])
plt.xticks(rotation=45)
plt.show()
[ ]: