Source code for featuremap.featuremap_

# Authors: 
#
# License: 
#
# Adapts source code from: 
# Leland McInnes <leland.mcinnes@gmail.com>
#
# License: BSD 3 clause


from __future__ import print_function

import locale
from warnings import warn
import time
import warnings

from scipy.optimize import curve_fit
from sklearn.base import BaseEstimator
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize
from sklearn.neighbors import KDTree
# from sklearn.manifold import MDS, TSNE 
# from sklearn.decomposition import PCA


from umap.umap_ import raise_disconnected_warning, smooth_knn_dist, nearest_neighbors, compute_membership_strengths
from umap.umap_ import fuzzy_simplicial_set, make_epochs_per_sample, init_graph_transform, find_ab_params
import umap.distances as dist
import umap.sparse as sparse

from umap.utils import (
    submatrix,
    ts,
    csr_unique,
    fast_knn_indices,
)
from umap.spectral import spectral_layout


try:
    import joblib
except ImportError:
    # sklearn.externals.joblib is deprecated in 0.21, will be removed in 0.23
    from sklearn.externals import joblib

import numpy as np
import scipy.sparse
from scipy.sparse import tril as sparse_tril, triu as sparse_triu
import scipy.sparse.csgraph
import numba



# from featuremap.core_transition_state import kernel_density_estimate

from pynndescent import NNDescent
from pynndescent.distances import named_distances as pynn_named_distances
from pynndescent.sparse import sparse_named_distances as pynn_sparse_named_distances

locale.setlocale(locale.LC_NUMERIC, "C")

[docs] INT32_MIN = np.iinfo(np.int32).min + 1
[docs] INT32_MAX = np.iinfo(np.int32).max - 1
[docs] SMOOTH_K_TOLERANCE = 1e-5
[docs] MIN_K_DIST_SCALE = 1e-3
[docs] NPY_INFINITY = np.inf
[docs] DISCONNECTION_DISTANCES = { "correlation": 2, "cosine": 2, "hellinger": 1, "jaccard": 1, "dice": 1, }
# Preprocess the data by singular value decomposition def _preprocess_data(X): T1 = time.time() if X.shape[1] > 100 and X.shape[0] > 100: print("Performing SVD decomposition on the data") u, s, vh = scipy.sparse.linalg.svds(X, k=100, which='LM', random_state=42) X = np.matmul(u, np.diag(s)) elif X.shape[1] > 100 and X.shape[0] < 100: print(int(X.shape[0]-1)) u, s, vh = scipy.sparse.linalg.svds(X, k=int(X.shape[0]-1), which='LM', random_state=42) X = np.matmul(u, np.diag(s)) else: vh = np.eye(X.shape[1]) T2 = time.time() # print(f'SVD decomposition time is {T2-T1}') return X, vh from sklearn.neighbors import NearestNeighbors
[docs] def kernel_density_estimate(data, X, bw=0.5, min_radius=5, output_onlylogp=False, ): """ Density estimation for data points specified by X with kernel density estimation. Parameters ---------- data : array of shape (n_samples, n_features) 2D array including data points. Input to density estimation. X : array 2D array including multiple data points. Input to density estimation. output_onlylogp : bool If true, returns logp, else returns p, g, h, msu. Returns ------- p : array 1D array. Unnormalized probability density. The probability density is not normalized due to numerical stability. Exact log probability """ nbrs = NearestNeighbors(n_neighbors=min_radius + 1).fit(data) adaptive_bw = np.maximum(nbrs.kneighbors(data)[0][:, -1], bw) # the number of data points and the dimensionality n, d = data.shape from scipy.spatial.distance import cdist # compare euclidean distances between each pair of data and X D = cdist(data, X) # and evaluate the kernel at each distance # prevent numerical overflow due to large exponentials logc = -d * np.log(np.min(adaptive_bw)) - d / 2 * np.log(2 * np.pi) C = (adaptive_bw[:, np.newaxis] / np.min(adaptive_bw)) ** (-d) * \ np.exp(-1 / 2. * (D / adaptive_bw[:, np.newaxis]) ** 2) if output_onlylogp: # return the kernel density estimate return np.log(np.mean(C, axis=0).T) + logc else: return np.mean(C, axis=0).T
@numba.njit()
[docs] def local_svd( data, knn_index, neighbor_weights, n_neighbors=15, n_neighbors_in_guage=30, ): """ Local singular value decomposition (SVD) for each node in the data Parameters ----------- data: array of shape (n_samples, n_features) The source data to be embedded by FeatureMAP. knn_index: array of shape (n_samples, n_neighbors) The index of k-nearest neighbors for each node weight: array of shape (n_samples, n_samples) The weight matrix of the graph n_neighbors: int The number of nearest neighbors n_neighbors_in_guage: int The number of nearest neighbors for local SVD Returns -------- gauge_u: list of shape (n_neighbors_in_guage, n_neighbors_in_guage); store u singular_values: list of shape (n_neighbors_in_guage,); store single values for each frame gauge_vh: list of shape (n_neighbors_in_guage, d); store v """ gauge_u = [] singular_values = [] gauge_vh = [] for row_i in numba.prange(data.shape[0]): # choose up to n_neighbors_in_guage nearest neighbors, skipping self at 0 upper = n_neighbors_in_guage + 1 if upper > n_neighbors: upper = n_neighbors indices = knn_index[row_i, 1:upper] data_around_i = data[indices] - data[row_i] weights_around_i = neighbor_weights[row_i, 1:upper] # Normalize and apply as row-wise scaling without forming diagonal matrices wsum = np.sum(weights_around_i) if wsum == 0.0: wsum = 1.0 sqrt_w = np.sqrt(weights_around_i / wsum) weighted = data_around_i * sqrt_w[:, np.newaxis] u, s, vh = np.linalg.svd(weighted, full_matrices=False) gauge_u.append(u) singular_values.append(s) gauge_vh.append(vh) return gauge_u, singular_values, gauge_vh
import time
[docs] def graph_convolution(features, knn_index, num_iterations, verbose=False, memory_target_mb=256, backend="auto"): """ Perform iterative neighbor averaging as a simple graph convolution. Args: features (np.ndarray): Input feature matrix of shape (num_nodes, num_features, num_channels). knn_index (np.ndarray): Nearest neighbor indices of shape (num_nodes, num_neighbors). num_iterations (int, optional): Number of smoothing iterations. Default is 20. Returns: np.ndarray: Smoothed feature matrix after `num_iterations` iterations. dict: Dictionary storing the first averaged result under the key "VH". """ if verbose: print(ts() + f' Applying graph convolution for {num_iterations} iterations...') start_time = time.time() # Dictionary to store intermediate results featuremap_results = {} # Iterative neighbor averaging n_nodes, f_dim, c_dim = features.shape k = knn_index.shape[1] eps = 1e-12 use_sparse = False if backend == "sparse": use_sparse = True elif backend == "auto": # Prefer sparse backend for very large n, or if temporary gather would exceed target memory # Estimate gather tensor size per batch for batch approach target_bytes = max(1, int(memory_target_mb * 1024 * 1024)) denom = max(1, 4 * k * f_dim * c_dim) est_batch = int(target_bytes // denom) if n_nodes >= 50_000 or est_batch < 32: use_sparse = True if use_sparse: # Build a row-stochastic adjacency once, then apply repeated sparse matmuls # Filter invalid neighbors (-1) idx = knn_index.reshape(-1) rows = np.repeat(np.arange(n_nodes, dtype=np.int32), k) mask = idx >= 0 rows = rows[mask] cols = idx[mask].astype(np.int32, copy=False) # Row-normalized weights (mean over neighbors) data = np.full(rows.shape[0], 1.0 / float(k), dtype=np.float32) # Build CSR adjacency adj = scipy.sparse.csr_matrix((data, (rows, cols)), shape=(n_nodes, n_nodes)) for iteration in range(num_iterations): # Flatten features to (n_nodes, f_dim * c_dim) flat = features.reshape(n_nodes, f_dim * c_dim) flat = adj.dot(flat) smoothed = flat.reshape(n_nodes, f_dim, c_dim) # Normalize across channel dimension per feature row norms = np.linalg.norm(smoothed, axis=2, keepdims=True) norms = np.maximum(norms, eps) features = smoothed / norms if iteration == 0: featuremap_results["VH"] = features.astype(np.float32, copy=True) else: # Vectorized with batching and advanced indexing target_bytes = max(1, int(memory_target_mb * 1024 * 1024)) denom = max(1, 4 * k * f_dim * c_dim) batch_size = int(target_bytes // denom) if batch_size < 32: batch_size = 32 batch_size = min(batch_size, n_nodes) for iteration in range(num_iterations): smoothed_features = np.empty_like(features) for start in range(0, n_nodes, batch_size): end = min(n_nodes, start + batch_size) idx_batch = knn_index[start:end] # (B, k) # Gather neighbor features: (B, k, f_dim, c_dim) neighbor_features = features[idx_batch] # Mean over neighbors -> (B, f_dim, c_dim) mean_features = neighbor_features.mean(axis=1) # Normalize across channel dimension per feature row norms = np.linalg.norm(mean_features, axis=2, keepdims=True) norms = np.maximum(norms, eps) smoothed_features[start:end] = mean_features / norms features = smoothed_features if iteration == 0: featuremap_results["VH"] = features.astype(np.float32, copy=True) end_time = time.time() if verbose: print(ts() + f' Graph convolution completed in {end_time - start_time:.2f} seconds') return features, featuremap_results
# @numba.njit()
[docs] def tangent_space_approximation( data, graph, featuremap_kwds, ): """ Compute the origial gauge in high-dimensional data by local SVD Parameters ---------- data: array of shape (n_samples, n_features) The source data to be embedded by FeatureMAP. graph: sparse matrix The 1-skeleton of the high dimensional fuzzy simplicial set as represented by a graph for which we require a sparse matrix for the (weighted) adjacency matrix. featuremap_kwds: dict Key word arguments to be used by the FeatureMAP optimization. Returns ------- Local tangent space U, S, V """ import os # check if the data is sparse if scipy.sparse.issparse(data): data = data.toarray() n_neighbors = featuremap_kwds["n_neighbors"] n_neighbors_in_guage = int(featuremap_kwds["gauge_coefficient"] * n_neighbors) knn_index = featuremap_kwds["_knn_indices"].astype(np.int32) # clamp gauge neighbor count to available knn to avoid extra memory if n_neighbors_in_guage >= n_neighbors: n_neighbors_in_guage = n_neighbors - 1 if n_neighbors > 1 else 0 # Build neighbor weights for each row from sparse graph without densifying # neighbor_weights has shape (n_samples, n_neighbors); positions without edges are 0 n_samples = knn_index.shape[0] neighbor_weights = np.zeros((n_samples, n_neighbors), dtype=np.float32) G = graph.tocsr() for i in range(n_samples): inds = knn_index[i] # mask invalid indices (-1) for j in range(n_neighbors): nb = inds[j] if nb >= 0: # fetch scalar weight; G[i, nb] returns 1x1 sparse, use .A1 or .toarray() val = G[i, nb] # scipy sparse scalar -> np.matrix subclass or 0; convert w = float(val) if hasattr(val, "__float__") else float(val.toarray()[0, 0]) neighbor_weights[i, j] = w gauge_u = [] # list of shape (n_neighbors_in_guage, n_neighbors_in_guage); store u singular_values = [] # list of shape (n_neighbors_in_guage,); store single values for each frame gauge_vh = [] # list of shape (n_neighbors_in_guage, d); store v T1 = time.time() gauge_u, singular_values, gauge_vh = local_svd( data, knn_index, neighbor_weights, n_neighbors, n_neighbors_in_guage, ) # Avoid storing U to reduce RAM; keep singular values and VH only featuremap_kwds["Singular_value"] = np.array(singular_values).astype(np.float32, copy=True) featuremap_kwds["VH"] = np.array(gauge_vh).astype(np.float32, copy=True) T2 = time.time() if featuremap_kwds['verbose']: print(ts() + f' Local SVD time is {T2-T1}') # Before average gauge_vh = featuremap_kwds["VH"].copy() ########################################### # # Average over NNs in original space. (Message passing accross knn network) ######################################### # knn_index = featuremap_kwds["_knn_indices"] knn_index = knn_index gauge_vh = np.array(gauge_vh) num_iterations = featuremap_kwds["gcn_iterations"] if num_iterations is not None: num_iterations = num_iterations else: if data.shape[0] < 5000: num_iterations = int(np.log2(data.shape[0])/2) elif data.shape[0] < 10000: num_iterations = int(np.log2(data.shape[0])) else: num_iterations = int(np.log2(data.shape[0])*2) # num_iterations = 42 gauge_vh, featuremap_kwds["VH"] = graph_convolution(gauge_vh, knn_index, num_iterations=num_iterations, verbose=featuremap_kwds['verbose']) featuremap_kwds["vh_smoothed"] = np.array(gauge_vh).astype(np.float32, copy=True)
[docs] def project_gauge_to_knn_graph( data, graph, featuremap_kwds, scale=10, use_negative_cosines=False,): """ Project the gauge (rotation matrix) to the KNN graph and compute the transition probability matrix Modified from the scVelo: https://scvelo.readthedocs.io/en/stable/scvelo.utils.get_transition_matrix.html#scvelo.utils.get_transition_matrix Parameters ---------- data: array of shape (n_samples, n_features) The source data to be embedded by FeatureMAP. graph: sparse matrix The 1-skeleton of the high dimensional fuzzy simplicial set as represented by a graph for which we require a sparse matrix for the (weighted) adjacency matrix. featuremap_kwds: dict Key word arguments to be used by the FeatureMAP optimization. scale: float The scale factor for the transition probability matrix use_negative_cosines: bool Whether to use negative cosine similarity in the transition probability matrix Returns ------- transition_matrix: array of shape (n_samples, n_samples) The transition probability matrix """ # for each edge, compute the transition probability by the (normalized) cosine similarity between edge and the gauge gauge_v1 = featuremap_kwds["VH"][:,0,:] gauge_v2 = featuremap_kwds["VH"][:,1,:] # gauge_v1 = featuremap_kwds["vh_smoothed"][:,0,:] # gauge_v2 = featuremap_kwds["vh_smoothed"][:,1,:] head = graph.row tail = graph.col # Build sparse transition scores only on existing edges rows = [] cols = [] data_v1 = [] data_v2 = [] for i in range(len(head)): j = int(head[i]) k = int(tail[i]) edge_vector = (data[k] - data[j]).astype(np.float32, copy=False) gv1 = gauge_v1[j].astype(np.float32, copy=False) gv2 = gauge_v2[j].astype(np.float32, copy=False) denom1 = (np.linalg.norm(edge_vector) * np.linalg.norm(gv1)) denom2 = (np.linalg.norm(edge_vector) * np.linalg.norm(gv2)) c1 = 0.0 if denom1 == 0.0 else float(np.dot(edge_vector, gv1) / denom1) c2 = 0.0 if denom2 == 0.0 else float(np.dot(edge_vector, gv2) / denom2) rows.append(j) cols.append(k) data_v1.append(c1) data_v2.append(c2) rows = np.asarray(rows, dtype=np.int32) cols = np.asarray(cols, dtype=np.int32) v1 = np.asarray(data_v1, dtype=np.float32) v2 = np.asarray(data_v2, dtype=np.float32) # Split pos/neg and apply expm1 transform sparsely v1_pos = np.clip(v1, 0.0, 1.0) v1_neg = np.clip(v1, -1.0, 0.0) v2_pos = np.clip(v2, 0.0, 1.0) v2_neg = np.clip(v2, -1.0, 0.0) tv1 = np.expm1(v1_pos * scale) tv2 = np.expm1(v2_pos * scale) if use_negative_cosines: tv1 -= np.expm1(-v1_neg * scale) tv2 -= np.expm1(-v2_neg * scale) else: tv1 += np.expm1(v1_neg * scale) + 1.0 tv2 += np.expm1(v2_neg * scale) + 1.0 n = data.shape[0] T_v1 = scipy.sparse.csr_matrix((tv1, (rows, cols)), shape=(n, n)) T_v2 = scipy.sparse.csr_matrix((tv2, (rows, cols)), shape=(n, n)) # Row-normalize (avoid densifying) row_sums_v1 = np.asarray(T_v1.sum(axis=1)).ravel() row_sums_v1[row_sums_v1 == 0.0] = 1.0 row_sums_v2 = np.asarray(T_v2.sum(axis=1)).ravel() row_sums_v2[row_sums_v2 == 0.0] = 1.0 inv_v1 = scipy.sparse.diags(1.0 / row_sums_v1) inv_v2 = scipy.sparse.diags(1.0 / row_sums_v2) T_v1 = inv_v1.dot(T_v1) T_v2 = inv_v2.dot(T_v2) featuremap_kwds["T_v1"] = T_v1 featuremap_kwds["T_v2"] = T_v2
[docs] def recover_gauge_from_embedding( data_embedding, featuremap_kwds, ): """ Recover the gauge from the low-dimensional embedding Parameters ---------- data_embedding: array of shape (n_samples, n_components) The low-dimensional embedding of the data featuremap_kwds: dict Returns ------- gauge_v1_emb: array of shape (n_samples, n_components) The recovered gauge v1 in low dimesional space gauge_v2_emb: array of shape (n_samples, n_components) The recovered gauge v2 in low dimensional space """ # Example input data knn_indices = featuremap_kwds["_knn_indices"] n_neighbors = featuremap_kwds["n_neighbors"] T_v1 = featuremap_kwds["T_v1"] T_v2 = featuremap_kwds["T_v2"] # Gather per-row transition weights for knn neighbors without densifying whole matrices n_samples = data_embedding.shape[0] k = knn_indices.shape[1] T_v1_knn = np.zeros((n_samples, k), dtype=np.float32) T_v2_knn = np.zeros((n_samples, k), dtype=np.float32) if scipy.sparse.issparse(T_v1): T1 = T_v1.tocsr() T2 = T_v2.tocsr() for i in range(n_samples): idx = knn_indices[i] for j in range(k): nb = idx[j] if nb >= 0: T_v1_knn[i, j] = T1[i, nb] T_v2_knn[i, j] = T2[i, nb] else: # dense T_v1_knn = np.take_along_axis(T_v1, knn_indices, axis=1) T_v2_knn = np.take_along_axis(T_v2, knn_indices, axis=1) # Modify the transition weights by subtracting 1/n_neighbors T_v1_knn = T_v1_knn - (1.0 / n_neighbors) T_v2_knn = T_v2_knn - (1.0 / n_neighbors) # Compute displacement using knn neighbors # avoid negative indices by masking valid_mask = knn_indices >= 0 safe_knn = knn_indices.copy() safe_knn[~valid_mask] = 0 displacement = data_embedding[safe_knn] - data_embedding[:, np.newaxis, :] # shape (n_samples, k_neighbors, n_components) displacement_v1 = displacement # rotate the displacement by pi/2 displacement_v2 = np.array([displacement[:,:,1], -displacement[:,:,0]]).transpose(1,2,0) # Compute the gauge recovery # zero out contributions from invalid neighbors T_v1_knn = T_v1_knn * valid_mask.astype(np.float32) T_v2_knn = T_v2_knn * valid_mask.astype(np.float32) gauge_v1_emb = np.einsum('ij,ijk->ik', T_v1_knn, displacement_v1) # shape (n_samples, n_components) gauge_v2_emb = np.einsum('ij,ijk->ik', T_v2_knn, displacement_v2) # shape (n_samples, n_components) featuremap_kwds["gauge_v1_emb"] = gauge_v1_emb featuremap_kwds["gauge_v2_emb"] = gauge_v2_emb # stack along the second axis featuremap_kwds["VH_embedding"] = np.stack((gauge_v1_emb, gauge_v2_emb), axis=1)
[docs] def tangent_space_embedding( featuremap_kwds, ): """ Embedding the gauge (rotation matrix) to low dim space Parameters ---------- featuremap_kwds: dict Key word arguments to be used by the FeatureMAP optimization. """ ################################################################################ # Embedding the gauge (rotation matrix) to low dim space # Consider the distance within topological structure of KNN network ############################################################################### random_state = featuremap_kwds["random_state"] n_neighbors = featuremap_kwds["n_neighbors"] metric = featuremap_kwds["metric"] min_dist = featuremap_kwds["min_dist"] n_epochs = featuremap_kwds["n_epochs"] gauge_vh = featuremap_kwds["vh_smoothed"] gauge_vh_copy = np.array(gauge_vh, copy=True) # First largest PC # T1 = time.time() rotation_matrix = gauge_vh_copy[:,0,:] # from sklearn.decomposition import TruncatedSVD from sklearn.decomposition import PCA pca = PCA(n_components=min(rotation_matrix.shape[1], 50)) X_pca = pca.fit_transform(rotation_matrix) import umap # print('random_state', random_state) umap_embedding = umap.UMAP(n_neighbors=n_neighbors,random_state=random_state, metric=metric,min_dist=min_dist, n_epochs=n_epochs).fit_transform(X=X_pca) # umap_embedding = umap.UMAP(n_neighbors=30, metric='euclidean',min_dist=0.5).fit_transform(X=X_pca) featuremap_kwds['gauge_v1_emb'] = umap_embedding # T2 = time.time() # print(f'UMAP time is {T2-T1}') umap_embedding_norm = np.linalg.norm(umap_embedding, axis=1, keepdims=True) umap_embedding_norm[umap_embedding_norm == 0.0] = 1.0 # gauge_vh_embedding is embedding of VH gauge_vh_mean_embedding = np.zeros([umap_embedding.shape[0], umap_embedding.shape[1], 2]) normalized_embedding = umap_embedding / umap_embedding_norm gauge_vh_mean_embedding[:, 0, 0] = normalized_embedding[:, 0] gauge_vh_mean_embedding[:, 0, 1] = normalized_embedding[:, 1] # Second largest PC rotation_matrix = gauge_vh_copy[:,1,:] pca = PCA(n_components=min(rotation_matrix.shape[1], 50)) X_pca = pca.fit_transform(rotation_matrix) umap_embedding = umap.UMAP(n_neighbors=n_neighbors,random_state=random_state, metric=metric,min_dist=min_dist, n_epochs=n_epochs).fit_transform(X=X_pca) # umap_embedding = umap.UMAP(n_neighbors=30, metric='euclidean',min_dist=0.5).fit_transform(X=X_pca) featuremap_kwds['gauge_v2_emb'] = umap_embedding umap_embedding_norm = np.linalg.norm(umap_embedding, axis=1, keepdims=True) umap_embedding_norm[umap_embedding_norm == 0.0] = 1.0 normalized_embedding = umap_embedding / umap_embedding_norm # gauge_vh_embedding is embedding of VH gauge_vh_mean_embedding[:, 1, 0] = normalized_embedding[:, 0] gauge_vh_mean_embedding[:, 1, 1] = normalized_embedding[:, 1] featuremap_kwds["VH_embedding"] = np.array(gauge_vh_mean_embedding).astype(np.float32, copy=True) # VH_embedding after average
# # First largest PC # # T1 = time.time() # gauge_vh_copy = gauge_vh.copy() # rotation_matrix = np.array(gauge_vh_copy)[:,0,:] # # from sklearn.decomposition import TruncatedSVD # from sklearn.decomposition import PCA # pca = PCA(n_components=min(rotation_matrix.shape[1], 50)) # X_pca = pca.fit_transform(rotation_matrix) # # Spectral embedding # graph, _, _ = fuzzy_simplicial_set( # X_pca, # n_neighbors, # random_state, # metric, # ) # spectral_embedding = spectral_layout( # X_pca, # graph, # dim=2, # random_state=random_state, # metric=metric, # ) # featuremap_kwds['gauge_v1_emb'] = spectral_embedding # # T2 = time.time() # # print(f'UMAP time is {T2-T1}') # spectral_embedding_norm = np.linalg.norm(spectral_embedding, axis=1) # # gauge_vh_embedding is embedding of VH # gauge_vh_mean_embedding = np.zeros([spectral_embedding.shape[0], spectral_embedding.shape[1],2]) # gauge_vh_mean_embedding[:,0,0] = spectral_embedding[:, 0]/spectral_embedding_norm # gauge_vh_mean_embedding[:,0,1] = spectral_embedding[:, 1]/spectral_embedding_norm # # Second largest PC # rotation_matrix = np.array(gauge_vh_copy)[:,1,:] # pca = PCA(n_components=min(rotation_matrix.shape[1], 50)) # X_pca = pca.fit_transform(rotation_matrix) # # Spectral embedding # graph, _, _ = fuzzy_simplicial_set( # X_pca, # n_neighbors, # random_state, # metric, # ) # spectral_embedding = spectral_layout( # X_pca, # graph, # dim=2, # random_state=random_state, # metric=metric, # ) # featuremap_kwds['gauge_v2_emb'] = spectral_embedding # spectral_embedding_norm = np.linalg.norm(spectral_embedding, axis=1) # # gauge_vh_embedding is embedding of VH # gauge_vh_mean_embedding[:,1,0] = spectral_embedding[:, 0]/spectral_embedding_norm # gauge_vh_mean_embedding[:,1,1] = spectral_embedding[:, 1]/spectral_embedding_norm # featuremap_kwds["VH_embedding"] = np.array(gauge_vh_mean_embedding).astype(np.float32, copy=True) # VH_embedding after average
[docs] def variation_embedding( data, featuremap_kwds, ): """ Compute the variation embedding of the data Parameters ---------- data: array of shape (n_samples, n_features) The source data to be embedded by FeatureMAP. featuremap_kwds: dict Key word arguments to be used by the FeatureMAP optimization. Returns ------- emb_variation: array of shape (n_samples, n_components) The optimized of ``graph`` of variation into an ``n_components`` dimensional euclidean space. """ n_components=featuremap_kwds["n_components"] random_state=featuremap_kwds["random_state"] n_neighbors=featuremap_kwds["n_neighbors"] min_dist=featuremap_kwds["min_dist"] spread=featuremap_kwds["spread"] metric=featuremap_kwds["metric"] threshold=featuremap_kwds["threshold"] ########################################## # Compute feature variation ############################################ gauge_vh = featuremap_kwds["vh_smoothed"] singular_values_collection = featuremap_kwds["Singular_value"] # Compute intrinsic dimensionality locally def pc_accumulation(arr, threshold): arr_sum = float(np.sum(np.square(arr))) if arr_sum <= 0.0: return 0 temp_sum = 0.0 for i in range(arr.shape[0]): temp_sum += arr[i] * arr[i] if temp_sum > arr_sum * threshold: return i # If the threshold is never exceeded (e.g., threshold >= 1), fall back to the last index return max(arr.shape[0] - 1, 0) # threshold = 0.5 intrinsic_dim = np.zeros(data.shape[0]).astype(int) for i in range(data.shape[0]): intrinsic_dim[i] = pc_accumulation(singular_values_collection[i], threshold) # Compute the gene norm in top k PCs (norm of the arrow in biplot) k = int(np.median(intrinsic_dim)) if featuremap_kwds['verbose']: print(f'k is {k}') # gene_var_norm = np.linalg.norm(gauge_vh[:, :k, :], axis=1) gene_var_norm = np.sqrt(np.einsum('ijk, ijk->ik', gauge_vh[:, :k, :], gauge_vh[:, :k, :])) featuremap_kwds["variation_pc"] = np.array(gene_var_norm).astype(np.float32, copy=True) ###################################### # Variation embedding ######################################## import umap emb_variation = umap.UMAP(n_components=int(n_components), random_state=random_state, n_neighbors=n_neighbors,metric=metric,min_dist=min_dist, spread=spread).fit(gene_var_norm) featuremap_kwds["variation_embedding"] = emb_variation.embedding_ featuremap_kwds["graph_v"] = emb_variation.graph_ featuremap_kwds['gauge_v1_emb'] = emb_variation.embedding_[:, :2] # print(featuremap_kwds['gauge_v1_emb'].shape) featuremap_kwds['gauge_v2_emb'] = emb_variation.embedding_[:,[1,0]] featuremap_kwds['gauge_v2_emb'][:,0] = -featuremap_kwds['gauge_v2_emb'][:,0] # normalze the gauge_v1_emb and gauge_v2_emb gauge_v1_emb_norm = np.linalg.norm(featuremap_kwds['gauge_v1_emb'], axis=1, keepdims=True) gauge_v2_emb_norm = np.linalg.norm(featuremap_kwds['gauge_v2_emb'], axis=1, keepdims=True) gauge_v1_emb_norm[gauge_v1_emb_norm == 0.0] = 1.0 gauge_v2_emb_norm[gauge_v2_emb_norm == 0.0] = 1.0 featuremap_kwds['VH_embedding'] = np.stack(( featuremap_kwds['gauge_v1_emb'] / gauge_v1_emb_norm, featuremap_kwds['gauge_v2_emb'] / gauge_v2_emb_norm ), axis=1) return emb_variation.embedding_
""" Add gauge to 1-simplex; embed it with gauge in low-dimensional space """
[docs] def simplicial_set_embedding_with_tangent_space_embedding( data, graph, n_components, initial_alpha, a, b, gamma, negative_sample_rate, n_epochs, init, random_state, metric, metric_kwds, featuremap_kwds, output_feat, output_metric=dist.named_distances_with_gradients["euclidean"], output_metric_kwds={}, euclidean_output=True, parallel=False, verbose=False, tqdm_kwds=None, output_variation=False, # original_data_flag=True, # pca_vh=None ): """Perform a fuzzy simplicial set embedding, using a specified initialisation method and then minimizing the fuzzy set cross entropy between the 1-skeletons of the high and low dimensional fuzzy simplicial sets. Parameters ---------- data: array of shape (n_samples, n_features) The source data to be embedded by FeatureMAP. graph: sparse matrix The 1-skeleton of the high dimensional fuzzy simplicial set as represented by a graph for which we require a sparse matrix for the (weighted) adjacency matrix. n_components: int The dimensionality of the euclidean space into which to embed the data. initial_alpha: float Initial learning rate for the SGD. a: float Parameter of differentiable approximation of right adjoint functor b: float Parameter of differentiable approximation of right adjoint functor gamma: float Weight to apply to negative samples. negative_sample_rate: int (optional, default 5) The number of negative samples to select per positive sample in the optimization process. Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy. n_epochs: int (optional, default 0) The number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings. If 0 is specified a value will be selected based on the size of the input dataset (200 for large datasets, 500 for small). init: string How to initialize the low dimensional embedding. Options are: * 'spectral': use a spectral embedding of the fuzzy 1-skeleton * 'random': assign initial embedding positions at random. * A numpy array of initial embedding positions. random_state: numpy RandomState or equivalent A state capable being used as a numpy random state. metric: string or callable The metric used to measure distance in high dimensional space; used if multiple connected components need to be layed out. metric_kwds: dict Key word arguments to be passed to the metric function; used if multiple connected components need to be layed out. featuremap_kwds: dict Key word arguments to be used by the FeatureMAP optimization. output_metric: function Function returning the distance between two points in embedding space and the gradient of the distance wrt the first argument. output_metric_kwds: dict Key word arguments to be passed to the output_metric function. euclidean_output: bool Whether to use the faster code specialised for euclidean output metrics parallel: bool (optional, default False) Whether to run the computation using numba parallel. Running in parallel is non-deterministic, and is not used if a random seed has been set, to ensure reproducibility. verbose: bool (optional, default False) Whether to report information on the current progress of the algorithm. tqdm_kwds: dict Key word arguments to be used by the tqdm progress bar. Returns ------- embedding: array of shape (n_samples, n_components) The optimized of ``graph`` into an ``n_components`` dimensional euclidean space. """ graph = graph.tocoo() #TODO: tocoo or toarray? head = graph.row tail = graph.col # weight = graph.toarray() # weight = graph.data # print(np.count_nonzero(weight, axis=1)) graph.sum_duplicates() n_vertices = graph.shape[1] # For smaller datasets we can use more epochs if graph.shape[0] <= 10000: default_epochs = 400 else: default_epochs = 200 # Use more epochs for FeatureMAP # if featuremap: default_epochs += 200 if n_epochs is None: n_epochs = default_epochs if n_epochs > 10: graph.data[graph.data < (graph.data.max() / float(n_epochs))] = 0.0 else: graph.data[graph.data < (graph.data.max() / float(default_epochs))] = 0.0 graph.eliminate_zeros() """ Initialization """ if isinstance(init, str) and init == "random": embedding = random_state.uniform( low=-10.0, high=10.0, size=(graph.shape[0], n_components) ).astype(np.float32) elif isinstance(init, str) and init == "spectral": # We add a little noise to avoid local minima for optimization to come initialisation = spectral_layout( data, graph, n_components, random_state, metric=metric, metric_kwds=metric_kwds, ) expansion = 10.0 / np.abs(initialisation).max() embedding = (initialisation * expansion).astype( np.float32 ) + random_state.normal( scale=0.0001, size=[graph.shape[0], n_components] ).astype( np.float32 ) else: init_data = np.array(init) if len(init_data.shape) == 2: if np.unique(init_data, axis=0).shape[0] < init_data.shape[0]: tree = KDTree(init_data) dist, ind = tree.query(init_data, k=2) nndist = np.mean(dist[:, 1]) embedding = init_data + random_state.normal( scale=0.001 * nndist, size=init_data.shape ).astype(np.float32) else: embedding = init_data epochs_per_sample = make_epochs_per_sample(graph.data, n_epochs) rng_state = random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64) aux_data = {} # Compute the gauge in high-dimensional data if verbose: print(ts() + " Computing tangent space") T1 = time.time() n_neighbors=featuremap_kwds["n_neighbors"] n_neighbors_in_guage=int(featuremap_kwds["gauge_coefficient"] * n_neighbors) tangent_space_approximation(data, graph, featuremap_kwds=featuremap_kwds) T2 = time.time() if verbose: print(ts() + f' Tangent_space_approximation time is {T2-T1}') # if verbose: # print(ts() + " Project gauge to knn_graph") # # Project the gauge to KNN graph to compute the transition probability matrix # project_gauge_to_knn_graph(data, graph, featuremap_kwds=featuremap_kwds) ''' Variation embedding only ''' T1 = time.time() embedding = variation_embedding(data=data,featuremap_kwds=featuremap_kwds) T2 = time.time() if output_variation == True: if verbose: print(ts() + f' Variation_embedding time is {T2-T1}') return embedding else: # Embedding the gauge (rotation matrix) to low dim space if verbose: print(ts() + " Tangent space embedding") # T1 = time.time() # tangent_space_embedding(featuremap_kwds) # T2 = time.time() # if verbose: # print(ts() + f' Tangent_space_embedding time is {T2-T1}') head = graph.row tail = graph.col mu_sum = np.zeros(n_vertices, dtype=np.float32) # For each node, sum of edge existing probabilty of incident edges for i in range(len(head)): j = head[i] k = tail[i] mu = graph.data[i] # edge existing probability mu_sum[j] += mu mu_sum[k] += mu epsilon = 1e-8 # print('singular_values, ' + str(singular_values)) # Get the variance from singular values, corresponding to squared Euclidean distance singular_values = featuremap_kwds["Singular_value"].copy() ro_var = np.square(np.array(singular_values)) ro = np.log(epsilon + ro_var) # radius in each directions of the hyper-ellipsoid # print('ro, ' + str(ro)) std_ro = np.std(ro, axis=0) std_ro = np.where(std_ro == 0.0, 1.0, std_ro) R = (ro - np.mean(ro, axis=0)) / std_ro # normalization by column featuremap_kwds["R"] = R.astype(np.float32, copy=True) featuremap_kwds["mu"] = graph.data featuremap_kwds["mu_sum"] = mu_sum min_embed = np.min(embedding, 0) max_embed = np.max(embedding, 0) range_embed = max_embed - min_embed range_embed[range_embed == 0.0] = 1.0 embedding = ( 10.0 * (embedding - min_embed) / range_embed ).astype(np.float32, order="C") if verbose: print(ts() + ' Start optimizing layout') T1 = time.time() embedding = optimize_layout_euclidean_anisotropic_projection( embedding, embedding, head, tail, n_epochs, n_vertices, epochs_per_sample, a, b, rng_state, gamma, initial_alpha, negative_sample_rate, parallel=parallel, verbose=verbose, # featuremap=featuremap, featuremap_kwds=featuremap_kwds, tqdm_kwds=tqdm_kwds, move_other=True, ) T2 = time.time() if verbose: print(ts() + f' Optimize layout time is {T2-T1}') """ Compute the radius of two principal directions in embedding directions """ if output_feat: T1 = time.time() # if output_feat: if verbose: print(ts() + " Computing expression embedding densities") # scale the embedding coordinates # embedding = embedding * 100 # Compute graph in embedding (knn_indices, knn_dists, _,) = nearest_neighbors( embedding, featuremap_kwds["n_neighbors"], "euclidean", {}, False, random_state, verbose=False, ) emb_graph, _, _, emb_dists = fuzzy_simplicial_set( embedding, featuremap_kwds["n_neighbors"], random_state, "euclidean", {}, knn_indices, knn_dists, verbose=False, return_dists=True, ) emb_graph = emb_graph.tocoo() emb_graph.sum_duplicates() emb_graph.eliminate_zeros() n_vertices = emb_graph.shape[1] dim = embedding.shape[1] phi_sum = np.zeros(n_vertices, dtype=np.float32) re_sum = np.zeros([n_vertices, dim], dtype=np.float32) re_sum_without_log = np.zeros([n_vertices, dim], dtype=np.float32) VH_embedding = featuremap_kwds["VH_embedding"] head = emb_graph.row tail = emb_graph.col for i in range(len(head)): j = head[i] k = tail[i] current = embedding[j] other = embedding[k] current_VH = VH_embedding[j] # array shape of (dim, dim) other_VH = VH_embedding[k] vec_diff = other - current # D = emb_dists[j, k] phi = emb_graph.data[i] phi_sum[j] += phi phi_sum[k] += phi for d in range(dim): vec_proj_vh_j = np.dot(vec_diff, current_VH[d]) # project to d-th rotation direction vec_proj_vh_k = np.dot(vec_diff, other_VH[d]) re_sum[j,d] += phi * vec_proj_vh_j * vec_proj_vh_j re_sum[k,d] += phi * vec_proj_vh_k * vec_proj_vh_k for i in range(re_sum.shape[0]): re_sum_without_log[i] = epsilon + (re_sum[i] / phi_sum[i]) for i in range(re_sum.shape[0]): re_sum[i] = np.log(epsilon + (re_sum[i] / phi_sum[i])) featuremap_kwds["re_sum"] = np.sqrt(re_sum) # Variance of Gaussian distribution featuremap_kwds["re_sum_without_log"] = np.sqrt(re_sum_without_log) # Variance of Gaussian distribution featuremap_kwds["emb_graph"] = emb_graph featuremap_kwds["emb_dists"] = emb_dists featuremap_kwds["emb_knn_indices"] = knn_indices T2 = time.time() if verbose: print(ts() + f' Embedding radii computation time is {T2-T1}') return embedding
[docs] class FeatureMAP(BaseEstimator): """Feature-preserving Manifold Approximation and Projection Approximate the high dimensional manifold of the input data by low dimensional embedding, with feature-preserving property. The algorithm is based on the paper: Feature-preserving manifold approximation and projection to visulize single-cell data, Yang et al. 2023. The algorithm is based on the UMAP algorithm, and the main difference is that FeatureMAP uses the tangent space of the input data to compute the (embedding) gauge for low dimensional embedding, where the features are locally illustrated by the embedding tangent space (i.e., gauge) and data points are embedded to the low dimensional space with the gauge. Parameters ---------- n_neighbors: float (optional, default 15) The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation. Larger values result in more global views of the manifold, while smaller values result in more local data being preserved. In general values should be in the range 2 to 100. n_components: int (optional, default 2) The dimension of the space to embed into. This defaults to 2 to provide easy visualization, but can reasonably be set to any integer value in the range 2 to 100. metric: string (currently, only 'euclidean' is supported) The metric to use to compute distances in high dimensional space. n_epochs: int (optional, default None) The number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings. If None is specified a value will be selected based on the size of the input dataset (200 for large datasets, 500 for small). learning_rate: float (optional, default 1.0) The initial learning rate for the embedding optimization. init: string (optional, default 'spectral') How to initialize the low dimensional embedding. Options are: * 'spectral': use a spectral embedding of the fuzzy 1-skeleton * 'random': assign initial embedding positions at random. * A numpy array of initial embedding positions. min_dist: float (optional, default 0.5) The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result on a more even dispersal of points. The value should be set relative to the ``spread`` value, which determines the scale at which embedded points will be spread out. spread: float (optional, default 1.0) The effective scale of embedded points. In combination with ``min_dist`` this determines how clustered/clumped the embedded points are. low_memory: bool (optional, default True) For some datasets the nearest neighbor computation can consume a lot of memory. If you find that FeatureMAP is failing due to memory constraints consider setting this option to True. This approach is more computationally expensive, but avoids excessive memory use. set_op_mix_ratio: float (optional, default 1.0) Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial sets. Both fuzzy set operations use the product t-norm. The value of this parameter should be between 0.0 and 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy intersection. local_connectivity: int (optional, default 1) The local connectivity required -- i.e. the number of nearest neighbors that should be assumed to be connected at a local level. The higher this value the more connected the manifold becomes locally. In practice this should be not more than the local intrinsic dimension of the manifold. repulsion_strength: float (optional, default 1.0) Weighting applied to negative samples in low dimensional embedding optimization. Values higher than one will result in greater weight being given to negative samples. negative_sample_rate: int (optional, default 5) The number of negative samples to select per positive sample in the optimization process. Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy. a: float (optional, default None) More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``. b: float (optional, default None) More specific parameters controlling the embedding. If None these values are set automatically as determined by ``min_dist`` and ``spread``. random_state: int, RandomState instance or None, optional (default: None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. metric_kwds: dict (optional, default None) Arguments to pass on to the metric, such as the ``p`` value for Minkowski distance. If None then no arguments are passed on. angular_rp_forest: bool (optional, default False) Whether to use an angular random projection forest to initialise the approximate nearest neighbor search. This can be faster, but is mostly on useful for metric that use an angular style distance such as cosine, correlation etc. In the case of those metrics angular forests will be chosen automatically. transform_seed: int (optional, default 42) Random seed used for the stochastic aspects of the transform operation. This ensures consistency in transform operations. verbose: bool (optional, default False) Controls verbosity of logging. tqdm_kwds: dict (optional, defaul None) Key word arguments to be used by the tqdm progress bar. unique: bool (optional, default False) Controls if the rows of your data should be uniqued before being embedded. If you have more duplicates than you have n_neighbour you can have the identical data points lying in different regions of your space. It also violates the definition of a metric. For to map from internal structures back to your data use the variable _unique_inverse_. featuremap: bool (optional, default False) Specifies whether the featity-augmented objective of FeatureMAP should be used for optimization. Turning on this option generates an embedding where the local featities are encouraged to be correlated with those in the original space. Parameters below with the prefix 'feat' further control the behavior of this extension. feat_lambda: float (optional, default 2.0) Controls the regularization weight of the anisotropic density correlation term in FeatureMAP. Higher values prioritize density preservation over the FeatureMAP objective, and vice versa for values closer to zero. Setting this parameter to zero is equivalent to running the original UMAP algorithm. feat_frac: float (optional, default 0.3) Controls the fraction of epochs (between 0 and 1) where the feature-augmented objective is used in FeatureMAP. The first (1 - feat_frac) fraction of epochs optimize the original UMAP objective before introducing the anisotropic density correlation term. feat_gauge_coefficient: float (default 1.0) A coefficient multiplication of n_neighbors as number of neighbors in gauge construction. Larger coefficient means long distance information feat_var_shift: float (optional, default 0.1) A small constant added to the variance of local radii in the embedding when calculating the anisotropic density correlation objective to prevent numerical instability from dividing by a small number output_feat: bool (optional, default False) Whether to compute the embedding radius. If set to True, the algorithm will compute the embedding radius and return the embedding radius. disconnection_distance: float (optional, default np.inf or maximal value for bounded distances) Disconnect any vertices of distance greater than or equal to disconnection_distance when approximating the manifold via our k-nn graph. This is particularly useful in the case that you have a bounded metric. The UMAP assumption that we have a connected manifold can be problematic when you have points that are maximally different from all the rest of your data. The connected manifold assumption will make such points have perfect similarity to a random set of other points. Too many such points will artificially connect your space. output_variation: bool (optional, default True) Whether to compute the variation embedding only. If set to True, the algorithm will compute the variation embedding only and return the embedding. If set to False, the algorithm will compute the Feature-augmented embedding. threshold: float (optional, default 0.9) The threshold to compute the intrinsic dimensionality of the data. The intrinsic dimensionality is computed by the number of principal components that accumulates 90% of the variance of the data. """ def __init__( self, n_neighbors=15, n_components=2, metric="euclidean", metric_kwds=None, output_metric="euclidean", output_metric_kwds=None, n_epochs=None, learning_rate=1.0, init="spectral", min_dist=0.5, spread=1.0, low_memory=True, n_jobs=-1, set_op_mix_ratio=1.0, local_connectivity=1.0, repulsion_strength=1.0, negative_sample_rate=5, a=None, b=None, random_state=None, angular_rp_forest=False, transform_seed=42, transform_mode="embedding", force_approximation_algorithm=False, verbose=False, tqdm_kwds=None, unique=False, # featuremap=False, feat_lambda=0.5, feat_frac=0.3, feat_gauge_coefficient = 1.0, feat_var_shift=0.1, output_feat=False, disconnection_distance=None, output_variation=False, threshold=0.9, gcn_iterations=None # original_data_flag=True, # pca_vh=None ):
[docs] self.n_neighbors = n_neighbors
[docs] self.metric = metric
[docs] self.output_metric = output_metric
[docs] self.metric_kwds = metric_kwds
[docs] self.output_metric_kwds = output_metric_kwds
[docs] self.n_epochs = n_epochs
[docs] self.init = init
[docs] self.n_components = n_components
[docs] self.repulsion_strength = repulsion_strength
[docs] self.learning_rate = learning_rate
[docs] self.spread = spread
[docs] self.min_dist = min_dist
[docs] self.low_memory = low_memory
[docs] self.set_op_mix_ratio = set_op_mix_ratio
[docs] self.local_connectivity = local_connectivity
[docs] self.negative_sample_rate = negative_sample_rate
[docs] self.random_state = random_state
[docs] self.angular_rp_forest = angular_rp_forest
[docs] self.transform_seed = transform_seed
[docs] self.transform_mode = transform_mode
[docs] self.force_approximation_algorithm = force_approximation_algorithm
[docs] self.verbose = verbose
[docs] self.tqdm_kwds = tqdm_kwds
[docs] self.unique = unique
# self.featuremap = featuremap
[docs] self.feat_lambda = feat_lambda
[docs] self.feat_frac = feat_frac
[docs] self.feat_gauge_coefficient = feat_gauge_coefficient
[docs] self.feat_var_shift = feat_var_shift
[docs] self.output_feat = output_feat
[docs] self.disconnection_distance = disconnection_distance
[docs] self.output_variation = output_variation
[docs] self.threshold = threshold
[docs] self.gcn_iterations = gcn_iterations
# self.original_data_flag = original_data_flag, # self.pca_vh = pca_vh
[docs] self.n_jobs = n_jobs
[docs] self.a = a
[docs] self.b = b
def _validate_parameters(self): if self.set_op_mix_ratio < 0.0 or self.set_op_mix_ratio > 1.0: raise ValueError("set_op_mix_ratio must be between 0.0 and 1.0") if self.repulsion_strength < 0.0: raise ValueError("repulsion_strength cannot be negative") if self.min_dist > self.spread: raise ValueError("min_dist must be less than or equal to spread") if self.min_dist < 0.0: raise ValueError("min_dist cannot be negative") if not isinstance(self.init, str) and not isinstance(self.init, np.ndarray): raise ValueError("init must be a string or ndarray") if isinstance(self.init, str) and self.init not in ( "spectral", "random", ): raise ValueError('string init values must be "spectral" or "random"') if ( isinstance(self.init, np.ndarray) and self.init.shape[1] != self.n_components ): raise ValueError("init ndarray must match n_components value") if not isinstance(self.metric, str) and not callable(self.metric): raise ValueError("metric must be string or callable") if self.negative_sample_rate < 0: raise ValueError("negative sample rate must be positive") if self._initial_alpha < 0.0: raise ValueError("learning_rate must be positive") if self.n_neighbors < 2: raise ValueError("n_neighbors must be greater than 1") if not isinstance(self.n_components, int): if isinstance(self.n_components, str): raise ValueError("n_components must be an int") if self.n_components % 1 != 0: raise ValueError("n_components must be a whole number") try: # this will convert other types of int (eg. numpy int64) # to Python int self.n_components = int(self.n_components) except ValueError: raise ValueError("n_components must be an int") if self.n_components < 1: raise ValueError("n_components must be greater than 0") # if self.n_components larger than the number of features, it will raise an error if self.n_components >= self._raw_data.shape[1]: raise ValueError( "n_components must be less than or equal to the number of features" ) if self.n_epochs is not None and ( self.n_epochs < 0 or not isinstance(self.n_epochs, int) ): raise ValueError("n_epochs must be a nonnegative integer") if self.metric_kwds is None: self._metric_kwds = {} else: self._metric_kwds = self.metric_kwds if self.output_metric_kwds is None: self._output_metric_kwds = {} else: self._output_metric_kwds = self.output_metric_kwds # save repeated checks later on if scipy.sparse.isspmatrix_csr(self._raw_data): self._sparse_data = True else: self._sparse_data = False # set input distance metric & inverse_transform distance metric if callable(self.metric): in_returns_grad = self._check_custom_metric( self.metric, self._metric_kwds, self._raw_data ) if in_returns_grad: _m = self.metric @numba.njit(fastmath=True) def _dist_only(x, y, *kwds): return _m(x, y, *kwds)[0] self._input_distance_func = _dist_only self._inverse_distance_func = self.metric else: self._input_distance_func = self.metric self._inverse_distance_func = None warn( "custom distance metric does not return gradient; inverse_transform will be unavailable. " "To enable using inverse_transform method, define a distance function that returns a tuple " "of (distance [float], gradient [np.array])" ) elif self.metric == "precomputed": if self.unique: raise ValueError("unique is poorly defined on a precomputed metric") warn("using precomputed metric; inverse_transform will be unavailable") self._input_distance_func = self.metric self._inverse_distance_func = None elif self.metric == "hellinger" and self._raw_data.min() < 0: raise ValueError("Metric 'hellinger' does not support negative values") elif self.metric in dist.named_distances: if self._sparse_data: if self.metric in sparse.sparse_named_distances: self._input_distance_func = sparse.sparse_named_distances[ self.metric ] else: raise ValueError( "Metric {} is not supported for sparse data".format(self.metric) ) else: self._input_distance_func = dist.named_distances[self.metric] try: self._inverse_distance_func = dist.named_distances_with_gradients[ self.metric ] except KeyError: warn( "gradient function is not yet implemented for {} distance metric; " "inverse_transform will be unavailable".format(self.metric) ) self._inverse_distance_func = None elif self.metric in pynn_named_distances: if self._sparse_data: if self.metric in pynn_sparse_named_distances: self._input_distance_func = pynn_sparse_named_distances[self.metric] else: raise ValueError( "Metric {} is not supported for sparse data".format(self.metric) ) else: self._input_distance_func = pynn_named_distances[self.metric] warn( "gradient function is not yet implemented for {} distance metric; " "inverse_transform will be unavailable".format(self.metric) ) self._inverse_distance_func = None else: raise ValueError("metric is neither callable nor a recognised string") # set output distance metric if callable(self.output_metric): out_returns_grad = self._check_custom_metric( self.output_metric, self._output_metric_kwds ) if out_returns_grad: self._output_distance_func = self.output_metric else: raise ValueError( "custom output_metric must return a tuple of (distance [float], gradient [np.array])" ) elif self.output_metric == "precomputed": raise ValueError("output_metric cannnot be 'precomputed'") elif self.output_metric in dist.named_distances_with_gradients: self._output_distance_func = dist.named_distances_with_gradients[ self.output_metric ] elif self.output_metric in dist.named_distances: raise ValueError( "gradient function is not yet implemented for {}.".format( self.output_metric ) ) else: raise ValueError( "output_metric is neither callable nor a recognised string" ) # set angularity for NN search based on metric if self.metric in ( "cosine", "correlation", "dice", "jaccard", "ll_dirichlet", "hellinger", ): self.angular_rp_forest = True if self.n_jobs < -1 or self.n_jobs == 0: raise ValueError("n_jobs must be a postive integer, or -1 (for all cores)") if self.feat_lambda < 0.0: raise ValueError("feat_lambda cannot be negative") if self.feat_frac < 0.0 or self.feat_frac > 1.0: raise ValueError("feat_frac must be between 0.0 and 1.0") if self.feat_gauge_coefficient < 0.0: raise ValueError("feat_gauge_coefficient cannot be negative") if self.feat_var_shift < 0.0: raise ValueError("feat_var_shift cannot be negative") self._featuremap_kwds = { "lambda": self.feat_lambda, #if self.featuremap else 0.0, "frac": self.feat_frac, #if self.featuremap else 0.0, "gauge_coefficient": self.feat_gauge_coefficient, #if self.featuremap else 0.0, "var_shift": self.feat_var_shift, "n_neighbors": self.n_neighbors, "n_jobs": self.n_jobs, "random_state": self.random_state, "metric": self.metric, "min_dist": self.min_dist, "spread": self.spread, "n_components": self.n_components, "verbose": self.verbose, "n_epochs": self.n_epochs, "threshold": self.threshold, 'gcn_iterations': self.gcn_iterations, } # if self.output_variation: # self._featuremap_kwds['threshold'] = self.threshold # if self.featuremap: if self.output_metric not in ("euclidean", "l2"): raise ValueError( "Non-Euclidean output metric not supported for FeatureMAP." ) # This will be used to prune all edges of greater than a fixed value from our knn graph. # We have preset defaults described in DISCONNECTION_DISTANCES for our bounded measures. # Otherwise a user can pass in their own value. if self.disconnection_distance is None: self._disconnection_distance = DISCONNECTION_DISTANCES.get( self.metric, np.inf ) elif isinstance(self.disconnection_distance, int) or isinstance( self.disconnection_distance, float ): self._disconnection_distance = self.disconnection_distance else: raise ValueError("disconnection_distance must either be None or a numeric.") if self.tqdm_kwds is None: self.tqdm_kwds = {} else: if isinstance(self.tqdm_kwds, dict) is False: raise ValueError( "tqdm_kwds must be a dictionary. Please provide valid tqdm " "parameters as key value pairs. Valid tqdm parameters can be " "found here: https://github.com/tqdm/tqdm#parameters" ) if "desc" not in self.tqdm_kwds: self.tqdm_kwds["desc"] = "Epochs completed" if "bar_format" not in self.tqdm_kwds: bar_f = "{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]" self.tqdm_kwds["bar_format"] = bar_f def _check_custom_metric(self, metric, kwds, data=None): # quickly check to determine whether user-defined # self.metric/self.output_metric returns both distance and gradient if data is not None: # if checking the high-dimensional distance metric, test directly on # input data so we don't risk violating any assumptions potentially # hard-coded in the metric (e.g., bounded; non-negative) x, y = data[np.random.randint(0, data.shape[0], 2)] else: # if checking the manifold distance metric, simulate some data on a # reasonable interval with output dimensionality x, y = np.random.uniform(low=-10, high=10, size=(2, self.n_components)) if scipy.sparse.issparse(data): metric_out = metric(x.indices, x.data, y.indices, y.data, **kwds) else: metric_out = metric(x, y, **kwds) # True if metric returns iterable of length 2, False otherwise return hasattr(metric_out, "__iter__") and len(metric_out) == 2
[docs] def fit(self, X): """Fit X into an embedded space. Parameters ---------- X : array, shape (n_samples, n_features) or (n_samples, n_samples) If the metric is 'precomputed' X must be a square distance matrix. Otherwise it contains a sample per row. If the method is 'exact', X may be a sparse matrix of type 'csr', 'csc' or 'coo'. """ # # SVD decompostion of data # if X.shape[1] > 100: # if self.verbose: # print("Performing SVD decomposition on the data") # u, s, vh = scipy.sparse.linalg.svds(X, k=100, which='LM', random_state=42) # X = np.matmul(u, np.diag(s)) # else: # vh = np.eye(X.shape[1]) X = check_array(X, dtype=np.float32, accept_sparse="csr", order="C") self._raw_data = X # Handle all the optional arguments, setting default if self.a is None or self.b is None: self._a, self._b = find_ab_params(self.spread, self.min_dist) else: self._a = self.a self._b = self.b if isinstance(self.init, np.ndarray): init = check_array(self.init, dtype=np.float32, accept_sparse=False) else: init = self.init self._initial_alpha = self.learning_rate self._validate_parameters() # SVD decomposition of data # self._featuremap_kwds['svd_vh'] = vh.T if self.verbose: print(str(self)) self._original_n_threads = numba.get_num_threads() if self.n_jobs > 0 and self.n_jobs is not None: numba.set_num_threads(self.n_jobs) # Check if we should unique the data # We've already ensured that we aren't in the precomputed case if self.unique: # check if the matrix is feate if self._sparse_data: # Call a sparse unique function index, inverse, counts = csr_unique(X) else: index, inverse, counts = np.unique( X, return_index=True, return_inverse=True, return_counts=True, axis=0, )[1:4] if self.verbose: print( "Unique=True -> Number of data points reduced from ", X.shape[0], " to ", X[index].shape[0], ) most_common = np.argmax(counts) print( "Most common duplicate is", index[most_common], " with a count of ", counts[most_common], ) # We'll expose an inverse map when unique=True for users to map from our internal structures to their data self._unique_inverse_ = inverse # If we aren't asking for unique use the full index. # This will save special cases later. else: index = list(range(X.shape[0])) inverse = list(range(X.shape[0])) # print('inverse, ' + str(inverse)) # Error check n_neighbors based on data size if X[index].shape[0] <= self.n_neighbors: if X[index].shape[0] == 1: self.embedding_ = np.zeros( (1, self.n_components) ) # needed to sklearn comparability return self warn( "n_neighbors is larger than the dataset size; truncating to " "X.shape[0] - 1" ) self._n_neighbors = X[index].shape[0] - 1 # if self.featuremap: self._featuremap_kwds["n_neighbors"] = self._n_neighbors else: self._n_neighbors = self.n_neighbors # Note: unless it causes issues for setting 'index', could move this to # initial sparsity check above if self._sparse_data and not X.has_sorted_indices: X.sort_indices() random_state = check_random_state(self.random_state) if self.verbose: print(ts(), "Construct fuzzy simplicial set") """ Construct the fuzzy_simplicial_set """ if self.metric == "precomputed" and self._sparse_data: # For sparse precomputed distance matrices, we just argsort the rows to find # nearest neighbors. To make this easier, we expect matrices that are # symmetrical (so we can find neighbors by looking at rows in isolation, # rather than also having to consider that sample's column too). # print("Computing KNNs for sparse precomputed distances...") if sparse_tril(X).getnnz() != sparse_triu(X).getnnz(): raise ValueError( "Sparse precomputed distance matrices should be symmetrical!" ) if not np.all(X.diagonal() == 0): raise ValueError("Non-zero distances from samples to themselves!") self._knn_indices = np.zeros((X.shape[0], self.n_neighbors), dtype=np.int) self._knn_dists = np.zeros(self._knn_indices.shape, dtype=np.float) for row_id in range(X.shape[0]): # Find KNNs row-by-row row_data = X[row_id].data row_indices = X[row_id].indices if len(row_data) < self._n_neighbors: raise ValueError( "Some rows contain fewer than n_neighbors distances!" ) row_nn_data_indices = np.argsort(row_data)[: self._n_neighbors] self._knn_indices[row_id] = row_indices[row_nn_data_indices] self._knn_dists[row_id] = row_data[row_nn_data_indices] # Disconnect any vertices farther apart than _disconnection_distance disconnected_index = self._knn_dists >= self._disconnection_distance self._knn_indices[disconnected_index] = -1 self._knn_dists[disconnected_index] = np.inf edges_removed = disconnected_index.sum() ( self.graph_, self._sigmas, self._rhos, # self.graph_dists_, ) = fuzzy_simplicial_set( X[index], self.n_neighbors, random_state, "precomputed", self._metric_kwds, self._knn_indices, self._knn_dists, self.angular_rp_forest, self.set_op_mix_ratio, self.local_connectivity, True, self.verbose, # self.output_feat, ) # Report the number of vertices with degree 0 in our our self.graph_ # This ensures that they were properly disconnected. vertices_disconnected = np.sum( np.array(self.graph_.sum(axis=1)).flatten() == 0 ) raise_disconnected_warning( edges_removed, vertices_disconnected, self._disconnection_distance, self._raw_data.shape[0], verbose=self.verbose, ) # Handle small cases efficiently by computing all distances # TODO: get _knn_indices when less than 4096 elif X[index].shape[0] < 1000 and not self.force_approximation_algorithm: self._small_data = True try: # sklearn pairwise_distances fails for callable metric on sparse data _m = self.metric if self._sparse_data else self._input_distance_func dmat = pairwise_distances(X[index], metric=_m, **self._metric_kwds) except (ValueError, TypeError) as e: # metric is numba.jit'd or not supported by sklearn, # fallback to pairwise special if self._sparse_data: # Get a fresh metric since we are casting to feate if not callable(self.metric): _m = dist.named_distances[self.metric] dmat = dist.pairwise_special_metric( X[index].toarray(), metric=_m, kwds=self._metric_kwds, ) else: dmat = dist.pairwise_special_metric( X[index], metric=self._input_distance_func, kwds=self._metric_kwds, ) else: dmat = dist.pairwise_special_metric( X[index], metric=self._input_distance_func, kwds=self._metric_kwds, ) # set any values greater than disconnection_distance to be np.inf. # This will have no effect when _disconnection_distance is not set since it defaults to np.inf. edges_removed = np.sum(dmat >= self._disconnection_distance) dmat[dmat >= self._disconnection_distance] = np.inf # print(dmat.shape) # print(dmat[:2, :]) self._knn_dists = np.sort(dmat, axis=1) self._knn_indices = np.argsort(dmat, axis=1) # print( self._knn_dists[:2, :]) # print( self._knn_indices[:2, :]) ( self.graph_, self._sigmas, self._rhos, # self.graph_dists_, ) = fuzzy_simplicial_set( dmat, self._n_neighbors, random_state, "precomputed", self._metric_kwds, None, None, self.angular_rp_forest, self.set_op_mix_ratio, self.local_connectivity, True, self.verbose, # self.output_feat, ) # Report the number of vertices with degree 0 in our our self.graph_ # This ensures that they were properly disconnected. vertices_disconnected = np.sum( np.array(self.graph_.sum(axis=1)).flatten() == 0 ) raise_disconnected_warning( edges_removed, vertices_disconnected, self._disconnection_distance, self._raw_data.shape[0], verbose=self.verbose, ) else: # Standard case self._small_data = False # Standard case if self._sparse_data and self.metric in pynn_sparse_named_distances: nn_metric = self.metric elif not self._sparse_data and self.metric in pynn_named_distances: nn_metric = self.metric else: nn_metric = self._input_distance_func ( self._knn_indices, self._knn_dists, self._knn_search_index, ) = nearest_neighbors( X[index], self._n_neighbors, nn_metric, self._metric_kwds, self.angular_rp_forest, random_state, self.low_memory, use_pynndescent=True, n_jobs=self.n_jobs, verbose=self.verbose, ) # Disconnect any vertices farther apart than _disconnection_distance disconnected_index = self._knn_dists >= self._disconnection_distance self._knn_indices[disconnected_index] = -1 self._knn_dists[disconnected_index] = np.inf edges_removed = disconnected_index.sum() ( self.graph_, self._sigmas, self._rhos, # self.graph_dists_, ) = fuzzy_simplicial_set( X[index], self.n_neighbors, random_state, nn_metric, self._metric_kwds, self._knn_indices, self._knn_dists, self.angular_rp_forest, self.set_op_mix_ratio, self.local_connectivity, True, self.verbose, # self.output_feat, ) # Report the number of vertices with degree 0 in our our self.graph_ # This ensures that they were properly disconnected. vertices_disconnected = np.sum( np.array(self.graph_.sum(axis=1)).flatten() == 0 ) raise_disconnected_warning( edges_removed, vertices_disconnected, self._disconnection_distance, self._raw_data.shape[0], verbose=self.verbose, ) # if self.featuremap or self.output_feat: # self._featuremap_kwds["graph_dists"] = self.graph_dists_ self._featuremap_kwds["n_neighbors"] = self.n_neighbors # self._featuremap_kwds["n_neighbors_in_guage"] = self.n_neighbors # TODO: n_neighbors_in_guage same as n_neighbors or not self._featuremap_kwds["_knn_indices"] = self._knn_indices if self.verbose: print(ts(), "Construct embedding") if self.transform_mode == "embedding": self.embedding_ = self._fit_embed_data( self._raw_data[index], self.n_epochs, # self.featuremap, init, random_state, ) # print('aux_data, ' + str(aux_data)) # Assign any points that are fully disconnected from our manifold(s) to have embedding # coordinates of np.nan. These will be filtered by our plotting functions automatically. # They also prevent users from being deceived a distance query to one of these points. # Might be worth moving this into simplicial_set_embedding or _fit_embed_data disconnected_vertices = np.array(self.graph_.sum(axis=1)).flatten() == 0 if len(disconnected_vertices) > 0: self.embedding_[disconnected_vertices] = np.full( self.n_components, np.nan ) self.embedding_ = self.embedding_[inverse] # if self.output_feat: # self.rad_orig_ = aux_data["rad_orig"][inverse] # self.rad_emb_ = aux_data["rad_emb"][inverse] self._featuremap_kwds["X_embedding"] = self.embedding_ if self.verbose: print(ts() + " Finished embedding") numba.set_num_threads(self._original_n_threads) self._input_hash = joblib.hash(self._raw_data) return self
def _fit_embed_data(self, X, n_epochs, init, random_state): """A method wrapper for simplicial_set_embedding that can be replaced by subclasses. """ # if featuremap: return simplicial_set_embedding_with_tangent_space_embedding( X, self.graph_, self.n_components, self._initial_alpha, self._a, self._b, self.repulsion_strength, self.negative_sample_rate, n_epochs, init, random_state, self._input_distance_func, self._metric_kwds, # self.featuremap, self._featuremap_kwds, self.output_feat, self._output_distance_func, self._output_metric_kwds, self.output_metric in ("euclidean", "l2"), self.random_state is None, self.verbose, tqdm_kwds=self.tqdm_kwds, output_variation=self.output_variation, )
[docs] def fit_transform(self, X): """Fit X into an embedded space and return that transformed output. Parameters ---------- X : array, shape (n_samples, n_features) or (n_samples, n_samples) If the metric is 'precomputed' X must be a square distance matrix. Otherwise it contains a sample per row. Returns ------- X_new : array, shape (n_samples, n_components) Embedding of the training data in low-dimensional space. """ self.fit(X) if self.transform_mode == "embedding": if self.output_variation: return self.embedding_ # elif self.output_feat: # return self.embedding_, self.rad_orig_, self.rad_emb_ else: return self.embedding_ elif self.transform_mode == "graph": return self.graph_ else: raise ValueError( "Unrecognized transform mode {}; should be one of 'embedding' or 'graph'".format( self.transform_mode ) )
[docs] def __repr__(self): from sklearn.utils._pprint import _EstimatorPrettyPrinter import re pp = _EstimatorPrettyPrinter( compact=True, indent=1, indent_at_name=True, n_max_elements_to_show=50, ) pp._changed_only = True repr_ = pp.pformat(self) repr_ = re.sub("tqdm_kwds={.*},", "", repr_, flags=re.S) # remove empty lines repr_ = re.sub("\n *\n", "\n", repr_, flags=re.S) # remove extra whitespaces after a comma repr_ = re.sub(", +", ", ", repr_) return repr_
import numpy as np import numba import umap.distances as dist from umap.utils import tau_rand_int from tqdm.auto import tqdm @numba.njit()
[docs] def clip(val): """Standard clamping of a value into a fixed range (in this case -4.0 to 4.0) Parameters ---------- val: float The value to be clamped. Returns ------- The clamped value, now fixed to be in the range -4.0 to 4.0. """ if val > 4.0: return 4.0 elif val < -4.0: return -4.0 else: return val
@numba.njit( "f4(f4[::1],f4[::1])", fastmath=True, cache=True, locals={ "result": numba.types.float32, "diff": numba.types.float32, "dim": numba.types.intp, }, )
[docs] def rdist(x, y): """Reduced Euclidean distance. Parameters ---------- x: array of shape (embedding_dim,) y: array of shape (embedding_dim,) Returns ------- The squared euclidean distance between x and y """ result = 0.0 dim = x.shape[0] for i in range(dim): diff = x[i] - y[i] result += diff * diff return result
# def vdiff(x, y): # """ # Vector difference. # Parameters # ---------- # x : array of shape (embedding_dim,) # y : array of shape (embedding_dim,) # Returns # ------- # The vector difference between x and y # """ # result = y - x # return result def _optimize_layout_euclidean_single_epoch_grad( head_embedding, tail_embedding, head, tail, n_vertices, epochs_per_sample, a, b, rng_state, gamma, dim, move_other, alpha, epochs_per_negative_sample, epoch_of_next_negative_sample, epoch_of_next_sample, n, featuremap_flag, feat_phi_sum, feat_re_sum, feat_re_cov, feat_re_std, feat_re_mean, feat_lambda, feat_R, feat_VH_embedding, feat_mu, feat_mu_tot, ): # print('epochs_per_sample.shape[0]' + str(epochs_per_sample.shape[0])) for i in numba.prange(epochs_per_sample.shape[0]): if epoch_of_next_sample[i] <= n: j = head[i] k = tail[i] current = head_embedding[j] other = tail_embedding[k] # vec_diff = vdiff(current, other) vec_diff = other - current inner_product = np.dot(vec_diff, vec_diff) outer_product = np.outer(vec_diff, vec_diff) grad_d = np.zeros(dim, dtype=np.float32) #dim = 1 # random select a dimension from dim # d = tau_rand_int(rng_state) % dim # print('featuremap_flag' + str(featuremap_flag)) if featuremap_flag: current_VH = feat_VH_embedding[j] # rotation matrix embedding; other_VH = feat_VH_embedding[k] grad_cor_coeff = np.zeros(dim, dtype=np.float32) # TODO: focus on each d of dim for d in numba.prange(dim): phi = 1.0 / (1.0 + a * pow(inner_product, b)) dphi_term = ( 2 * a * b * pow(inner_product, b - 1) * vec_diff / (1.0 + a * pow(inner_product, b)) ) v_j = current_VH[d] v_k = other_VH[d] project_vec_j = np.dot(v_j, vec_diff) project_vec_k = np.dot(v_k, vec_diff) #TODO: check feat_phi_sum, feat_re_sum # Have changed the order of j, k q_jk = phi / feat_phi_sum[j] q_kj = phi / feat_phi_sum[k] # drj = q_jk * ( # project_vec_j * (2 * v_j - project_vec_j * dphi_term ) / np.exp(feat_re_sum[j,d]) + dphi_term # ) # drk = q_kj * ( # project_vec_k * (2 * v_k - project_vec_k * dphi_term ) / np.exp(feat_re_sum[k,d]) + dphi_term # ) drj = np.zeros(dim) drk = np.zeros(dim) for s in numba.prange(dim): drj[s] = q_jk * ( project_vec_j * (2 * v_j[s] - project_vec_j * dphi_term[s] ) / np.exp(feat_re_sum[j,d]) + dphi_term[s] ) drk[s] = q_kj * ( project_vec_k * (2 * v_k[s] - project_vec_k * dphi_term[s] ) / np.exp(feat_re_sum[k,d]) + dphi_term[s] ) # check feat_re_std: array shape (dim,) re_std_sq = feat_re_std[d] * feat_re_std[d] weight_j = ( feat_R[j,d] - feat_re_cov[d] * (feat_re_sum[j,d] - feat_re_mean[d]) / re_std_sq ) weight_k = ( feat_R[k,d] - feat_re_cov[d] * (feat_re_sum[k,d] - feat_re_mean[d]) / re_std_sq ) for s in numba.prange(dim): grad_cor_coeff[s] += (weight_j * drj[s] + weight_k * drk[s]) / feat_re_std[d] for s in numba.prange(dim): grad_cor_coeff[s] = ( grad_cor_coeff[s] * feat_lambda * feat_mu_tot / feat_mu[i] / n_vertices ) # grad_coeff = np.zeros(dim, dtype=np.float32) if inner_product > 0.0: # gradient of log Q_jk grad_coeff_term = (-2.0) * a * b * pow(inner_product, b - 1.0) grad_coeff_term = grad_coeff_term / (a * pow(inner_product, b) + 1.0) else: grad_coeff_term = 0 # gradient w.r.t y_j; sampling edge (j,k), Z_jk = y_k - y_j # grad_d = clip_arr(* vec_diff[d]) * vec_diff[d] * (-1.0) for d in numba.prange(dim): grad_d = clip(grad_coeff_term * vec_diff[d] * (-1.0)) # grad_d = grad_coeff[d] * (-1.0) * vec_diff[d] if featuremap_flag: # FIXME: grad_cor_coeff might be referenced before assignment grad_d += clip(grad_cor_coeff[d] * (-1.0)) # current[d] += grad_d * alpha if move_other: other[d] += -grad_d * alpha epoch_of_next_sample[i] += epochs_per_sample[i] n_neg_samples = int( (n - epoch_of_next_negative_sample[i]) / epochs_per_negative_sample[i] ) for p in numba.prange(n_neg_samples): k = tau_rand_int(rng_state) % n_vertices other = tail_embedding[k] # vec_diff = vdiff(current, other) vec_diff = other - current inner_product = np.dot(vec_diff, vec_diff) if inner_product > 0.0: grad_coeff_term = 2.0 * gamma * b #divisor = np.repeat((0.001 + inner_product) * (a * pow(inner_product, b) + 1), dim) grad_coeff_term = grad_coeff_term / ((0.001 + inner_product) * (a * pow(inner_product, b) + 1)) elif j == k: continue else: grad_coeff_term = 0.0 for d in numba.prange(dim): if grad_coeff_term > 0.0: grad_d = clip(grad_coeff_term * vec_diff[d] * (-1.0)) else: grad_d = 2.0 current[d] += grad_d * alpha epoch_of_next_negative_sample[i] += ( n_neg_samples * epochs_per_negative_sample[i] ) # Compute the variance in each direction of embedding under rotation VH def _optimize_layout_euclidean_featuremap_epoch_init_grad( head_embedding, tail_embedding, head, tail, # random_state, # gauge_vh, VH_embedding, # rotation_angle, a, b, re_sum, phi_sum, ): """ Compute the principal radius in the dim-dimensional embedding space. Parameter --------- re_sum: array of shape (n_vertices, dim) The principal radius in the embedding space phi_sum: array of shape (n_vertices,) For node i, the sum of edge existing probability incident to this node """ # VH_embedding.fill(0) # random_initial = random_state.randint(0, head_embedding.shape[0], 1).astype(np.int64) # vh_initial = gauge_vh[random_initial[0]][0] # for i in range(head_embedding.shape[0]): # vh_vector = gauge_vh[i][0] # # angle = angle_between(vh_initial, vh_vector) # angle = random_state.random() * 3.14 # # angle = random_state.random() * 3.14 # # angle = random_state.normal(0,1) # # rotation_angle[i] = angle # vh_embedding = np.array([[np.cos(angle), np.sin(angle)],[-np.sin(angle), np.cos(angle)]]) # # vh_embedding = np.identity(2) # VH_embedding[i] = vh_embedding re_sum.fill(0) phi_sum.fill(0) dim = head_embedding.shape[1] for i in numba.prange(head.size): j = head[i] k = tail[i] current = head_embedding[j] other = tail_embedding[k] # current_VH = VH[j][:,:dim] # projection matrix; keep first dim dimensions # other_VH = VH[k][:,:dim] # Restrict dim to 2 # theta_j = rotation_angle[j] # theta_k = rotation_angle[k] current_VH = VH_embedding[j] # array shape of (dim, dim) other_VH = VH_embedding[k] # current_VH = np.identity(2) # other_VH = np.identity(2) # vec_diff = vdiff(current, other) vec_diff = other - current # inner_product = np.dot(vec_diff, vec_diff) dist_squared = rdist(current, other) phi = 1.0 / (1.0 + a * pow(dist_squared, b)) phi_sum[j] += phi phi_sum[k] += phi for d in numba.prange(dim): vec_proj_vh_j = np.dot(vec_diff, current_VH[d]) # project to d-th rotation direction vec_proj_vh_k = np.dot(vec_diff, other_VH[d]) re_sum[j,d] += phi * vec_proj_vh_j * vec_proj_vh_j re_sum[k,d] += phi * vec_proj_vh_k * vec_proj_vh_k # vec_proj_vh_j = np.dot(current_VH, vec_diff) # project to rotation direction # vec_proj_vh_k = np.dot(other_VH, vec_diff) # re_sum[j] += phi * vec_proj_vh_j * vec_proj_vh_j # re_sum[k] += phi * vec_proj_vh_k * vec_proj_vh_k epsilon = 1e-8 for i in numba.prange(re_sum.shape[0]): re_sum[i] = np.log(epsilon + (re_sum[i] / phi_sum[i])) # for i in range(re_sum.shape[0]): # re_sum[i] = np.log(epsilon + np.sqrt((re_sum[i] / phi_sum[i])))
[docs] def optimize_layout_euclidean_anisotropic_projection( head_embedding, tail_embedding, head, tail, n_epochs, n_vertices, epochs_per_sample, a, b, rng_state, gamma=1.0, initial_alpha=1.0, negative_sample_rate=5.0, parallel=False, verbose=False, # featuremap=False, featuremap_kwds=None, tqdm_kwds=None, move_other=False, ): """Improve an embedding using stochastic gradient descent to minimize the fuzzy set cross entropy between the 1-skeletons of the high dimensional and low dimensional fuzzy simplicial sets. In practice this is done by sampling edges based on their membership strength (with the (1-p) terms coming from negative sampling similar to word2vec). Parameters ---------- head_embedding: array of shape (n_samples, n_components) The initial embedding to be improved by SGD. tail_embedding: array of shape (source_samples, n_components) The reference embedding of embedded points. If not embedding new previously unseen points with respect to an existing embedding this is simply the head_embedding (again); otherwise it provides the existing embedding to embed with respect to. head: array of shape (n_1_simplices) The indices of the heads of 1-simplices with non-zero membership. tail: array of shape (n_1_simplices) The indices of the tails of 1-simplices with non-zero membership. n_epochs: int The number of training epochs to use in optimization. n_vertices: int The number of vertices (0-simplices) in the dataset. epochs_per_sample: array of shape (n_1_simplices) A float value of the number of epochs per 1-simplex. 1-simplices with weaker membership strength will have more epochs between being sampled. a: float Parameter of differentiable approximation of right adjoint functor b: float Parameter of differentiable approximation of right adjoint functor rng_state: array of int64, shape (3,) The internal state of the rng gamma: float (optional, default 1.0) Weight to apply to negative samples. initial_alpha: float (optional, default 1.0) Initial learning rate for the SGD. negative_sample_rate: int (optional, default 5) Number of negative samples to use per positive sample. parallel: bool (optional, default False) Whether to run the computation using numba parallel. Running in parallel is non-deterministic, and is not used if a random seed has been set, to ensure reproducibility. verbose: bool (optional, default False) Whether to report information on the current progress of the algorithm. featuremap: bool (optional, default False) Whether to use the feature-augmented featuremap objective featuremap_kwds: dict (optional, default None) Auxiliary data for featuremap tqdm_kwds: dict (optional, default None) Keyword arguments for tqdm progress bar. move_other: bool (optional, default False) Whether to adjust tail_embedding alongside head_embedding Returns ------- embedding: array of shape (n_samples, n_components) The optimized embedding. """ dim = head_embedding.shape[1] alpha = initial_alpha epochs_per_negative_sample = epochs_per_sample / negative_sample_rate epoch_of_next_negative_sample = epochs_per_negative_sample.copy() epoch_of_next_sample = epochs_per_sample.copy() optimize_fn = numba.njit( _optimize_layout_euclidean_single_epoch_grad, fastmath=True, parallel=parallel, # nopython=True, cache=True ) # optimize_fn = _optimize_layout_euclidean_single_epoch_grad if featuremap_kwds is None: featuremap_kwds = {} if tqdm_kwds is None: tqdm_kwds = {} # if featuremap: feat_init_fn = numba.njit( _optimize_layout_euclidean_featuremap_epoch_init_grad, fastmath=True, parallel=parallel, # nopython=True, cache=True ) # feat_init_fn = _optimize_layout_euclidean_featuremap_epoch_init_grad feat_mu_tot = np.sum(featuremap_kwds["mu_sum"]) / 2 # sum of all edges' existing probability, float, shape of (1,) # should we modify lambda? Yes, we should #TODO: modify lambda feat_lambda = featuremap_kwds["lambda"] feat_R = featuremap_kwds["R"] # array shape of (n_vertices d) feat_VH = featuremap_kwds["VH"] # feat_VH_embedding = featuremap_kwds["VH_embedding"] # array of shape (n_vertices, dim) feat_VH_embedding = np.repeat(np.eye(2)[np.newaxis, :, :], head_embedding.shape[0], axis=0) # initialize vh_embedding by identity matrix feat_VH_embedding = feat_VH_embedding.astype(np.float32) # feat_rotation_angle = featuremap_kwds["rotation_angle"] feat_mu = featuremap_kwds["mu"] # edge probability feat_phi_sum = np.zeros(n_vertices, dtype=np.float32) # For each node i in embedding space, sum of edge existing probality incident to this node feat_re_sum = np.zeros([n_vertices, dim], dtype=np.float32) # Embedding radius in principal directions feat_var_shift = featuremap_kwds["var_shift"] # else: # feat_mu_tot = 0 # feat_lambda = 0 # feat_R = np.zeros(1, dtype=np.float32) # feat_VH = np.zeros(1, dtype=np.float32) # feat_VH_embedding = np.zeros(1, dtype=np.float32) # # feat_rotation_angle = np.zeros(1, dtype=np.float32) # feat_mu = np.zeros(1, dtype=np.float32) # feat_phi_sum = np.zeros(1, dtype=np.float32) # feat_re_sum = np.zeros(1, dtype=np.float32) if "disable" not in tqdm_kwds: tqdm_kwds["disable"] = not verbose for n in tqdm(range(n_epochs), **tqdm_kwds): featuremap_flag = ( # featuremap and (featuremap_kwds["lambda"] > 0) and (((n + 1) / float(n_epochs)) > (1 - featuremap_kwds["frac"])) ) if featuremap_flag: # Compute the initial embedding under rotation VH # T1 = time.time() feat_init_fn( head_embedding, tail_embedding, head, tail, # random_state, # feat_VH, feat_VH_embedding, a, b, feat_re_sum, feat_phi_sum, ) # T2 = time.time() # print(f'featuremap initialization time is {T2-T1}') # feat_init_fn.inspect_types() # FIXME: feat_var_shift might be referenced before assignment feat_re_std = np.sqrt(np.var(feat_re_sum, axis=0) + feat_var_shift) feat_re_mean = np.mean(feat_re_sum, axis=0) feat_re_sum_centered = np.subtract(feat_re_sum, feat_re_mean) product = np.diag(np.dot(feat_re_sum_centered.T, feat_R[:,:dim])) feat_re_cov = product / (n_vertices - 1) else: feat_re_std = np.zeros(dim, dtype=np.float32) feat_re_mean = np.zeros(dim, dtype=np.float32) feat_re_cov = np.zeros(dim, dtype=np.float32) # # recover the gauge from the low dimensional embedding # if featuremap_flag and n % 10 == 0: # # print('recover gauge from embedding') # recover_gauge_from_embedding(data_embedding=head_embedding, featuremap_kwds=featuremap_kwds) # feat_VH_embedding = featuremap_kwds["VH_embedding"].astype(np.float32) # T1 = time.time() optimize_fn( head_embedding, tail_embedding, head, tail, n_vertices, epochs_per_sample, a, b, rng_state, gamma, dim, move_other, alpha, epochs_per_negative_sample, epoch_of_next_negative_sample, epoch_of_next_sample, n, featuremap_flag, feat_phi_sum, feat_re_sum, feat_re_cov, feat_re_std, feat_re_mean, feat_lambda, feat_R, feat_VH_embedding, feat_mu, feat_mu_tot, ) # T2 = time.time() # print(f'Optimize_fn time is {T2-T1}') alpha = initial_alpha * (1.0 - (float(n) / float(n_epochs))) return head_embedding