Source code for featuremap.featuremap_

# Authors: 
#
# License: 
#
# Adapts source code from: 
# Leland McInnes <leland.mcinnes@gmail.com>
#
# License: BSD 3 clause


from __future__ import print_function

import locale
from warnings import warn
import time
import warnings

from scipy.optimize import curve_fit
from sklearn.base import BaseEstimator
from sklearn.utils import check_random_state, check_array
from sklearn.utils.validation import check_is_fitted
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize
from sklearn.neighbors import KDTree
# from sklearn.manifold import MDS, TSNE 
# from sklearn.decomposition import PCA


from umap.umap_ import raise_disconnected_warning, smooth_knn_dist, nearest_neighbors, compute_membership_strengths
from umap.umap_ import fuzzy_simplicial_set, make_epochs_per_sample, init_graph_transform, find_ab_params
import umap.distances as dist
import umap.sparse as sparse

from umap.utils import (
    submatrix,
    ts,
    csr_unique,
    fast_knn_indices,
)
from umap.spectral import spectral_layout


try:
    import joblib
except ImportError:
    # sklearn.externals.joblib is deprecated in 0.21, will be removed in 0.23
    from sklearn.externals import joblib

import numpy as np
import scipy.sparse
from scipy.sparse import tril as sparse_tril, triu as sparse_triu
import scipy.sparse.csgraph
import numba



# from featuremap.core_transition_state import kernel_density_estimate

from pynndescent import NNDescent
from pynndescent.distances import named_distances as pynn_named_distances
from pynndescent.sparse import sparse_named_distances as pynn_sparse_named_distances

locale.setlocale(locale.LC_NUMERIC, "C")


[docs]
INT32_MIN = np.iinfo(np.int32).min + 1


[docs]
INT32_MAX = np.iinfo(np.int32).max - 1



[docs]
SMOOTH_K_TOLERANCE = 1e-5


[docs]
MIN_K_DIST_SCALE = 1e-3


[docs]
NPY_INFINITY = np.inf



[docs]
DISCONNECTION_DISTANCES = {
    "correlation": 2,
    "cosine": 2,
    "hellinger": 1,
    "jaccard": 1,
    "dice": 1,
}



# Preprocess the data by singular value decomposition
def _preprocess_data(X): 
    T1 = time.time()
    if X.shape[1] > 100 and X.shape[0] > 100:
        print("Performing SVD decomposition on the data")
        u, s, vh = scipy.sparse.linalg.svds(X, k=100, which='LM', random_state=42)
        X = np.matmul(u, np.diag(s))
    elif X.shape[1] > 100 and X.shape[0] < 100:
        print(int(X.shape[0]-1))
        u, s, vh = scipy.sparse.linalg.svds(X, k=int(X.shape[0]-1), which='LM', random_state=42)
        X = np.matmul(u, np.diag(s))
    else:    
        vh = np.eye(X.shape[1])
    T2 = time.time()
    # print(f'SVD decomposition time is {T2-T1}')
    return X, vh

from sklearn.neighbors import NearestNeighbors

[docs]
def kernel_density_estimate(data, X, bw=0.5, min_radius=5, output_onlylogp=False, ):
        """
        Density estimation for data points specified by X with kernel density estimation.

        Parameters
        ----------
        data : array of shape (n_samples, n_features)
            2D array including data points. Input to density estimation.
       
        X : array
            2D array including multiple data points. Input to density estimation.
        output_onlylogp : bool
            If true, returns logp, else returns p, g, h, msu.

        Returns
        -------
        p : array
            1D array.  Unnormalized probability density. The probability density
            is not normalized due to numerical stability. Exact log probability
        """
        nbrs = NearestNeighbors(n_neighbors=min_radius + 1).fit(data)
        adaptive_bw = np.maximum(nbrs.kneighbors(data)[0][:, -1], bw)

        # the number of data points and the dimensionality
        n, d = data.shape

        from scipy.spatial.distance import cdist
        # compare euclidean distances between each pair of data and X
        D = cdist(data, X)
        

        # and evaluate the kernel at each distance
        # prevent numerical overflow due to large exponentials
        logc = -d * np.log(np.min(adaptive_bw)) - d / 2 * np.log(2 * np.pi)
        C = (adaptive_bw[:, np.newaxis] / np.min(adaptive_bw)) ** (-d) * \
            np.exp(-1 / 2. * (D / adaptive_bw[:, np.newaxis]) ** 2)

        if output_onlylogp:
            # return the kernel density estimate
            return np.log(np.mean(C, axis=0).T) + logc
        else:
            return np.mean(C, axis=0).T



@numba.njit()

[docs]
def local_svd(
        data,
        knn_index,
        neighbor_weights,
        n_neighbors=15,
        n_neighbors_in_guage=30,
        ):
    """
    Local singular value decomposition (SVD) for each node in the data

    Parameters
    -----------
    data: array of shape (n_samples, n_features)
        The source data to be embedded by FeatureMAP.
    knn_index: array of shape (n_samples, n_neighbors)
        The index of k-nearest neighbors for each node
    weight: array of shape (n_samples, n_samples)
        The weight matrix of the graph
    n_neighbors: int
        The number of nearest neighbors
    n_neighbors_in_guage: int
        The number of nearest neighbors for local SVD
    
    Returns
    --------
    gauge_u: list of shape (n_neighbors_in_guage, n_neighbors_in_guage); store u
    singular_values: list of shape (n_neighbors_in_guage,); store single values for each frame
    gauge_vh: list of shape (n_neighbors_in_guage, d); store v

    """
    
    gauge_u = []
    singular_values = []
    gauge_vh = []
    
    for row_i in numba.prange(data.shape[0]):
        # choose up to n_neighbors_in_guage nearest neighbors, skipping self at 0
        upper = n_neighbors_in_guage + 1
        if upper > n_neighbors:
            upper = n_neighbors
        indices = knn_index[row_i, 1:upper]

        data_around_i = data[indices] - data[row_i]

        weights_around_i = neighbor_weights[row_i, 1:upper]
        # Normalize and apply as row-wise scaling without forming diagonal matrices
        wsum = np.sum(weights_around_i)
        if wsum == 0.0:
            wsum = 1.0
        sqrt_w = np.sqrt(weights_around_i / wsum)
        weighted = data_around_i * sqrt_w[:, np.newaxis]

        u, s, vh = np.linalg.svd(weighted, full_matrices=False)

        gauge_u.append(u)
        singular_values.append(s)
        gauge_vh.append(vh)

    return gauge_u, singular_values, gauge_vh




import time

[docs]
def graph_convolution(features, knn_index, num_iterations, verbose=False, memory_target_mb=256, backend="auto"):   
    """
    Perform iterative neighbor averaging as a simple graph convolution.

    Args:
        features (np.ndarray): Input feature matrix of shape (num_nodes, num_features, num_channels).
        knn_index (np.ndarray): Nearest neighbor indices of shape (num_nodes, num_neighbors).
        num_iterations (int, optional): Number of smoothing iterations. Default is 20.

    Returns:
        np.ndarray: Smoothed feature matrix after `num_iterations` iterations.
        dict: Dictionary storing the first averaged result under the key "VH".
    """
    if verbose:
        print(ts() + f' Applying graph convolution for {num_iterations} iterations...')
    start_time = time.time()

    # Dictionary to store intermediate results
    featuremap_results = {}

    # Iterative neighbor averaging
    n_nodes, f_dim, c_dim = features.shape
    k = knn_index.shape[1]
    eps = 1e-12

    use_sparse = False
    if backend == "sparse":
        use_sparse = True
    elif backend == "auto":
        # Prefer sparse backend for very large n, or if temporary gather would exceed target memory
        # Estimate gather tensor size per batch for batch approach
        target_bytes = max(1, int(memory_target_mb * 1024 * 1024))
        denom = max(1, 4 * k * f_dim * c_dim)
        est_batch = int(target_bytes // denom)
        if n_nodes >= 50_000 or est_batch < 32:
            use_sparse = True

    if use_sparse:
        # Build a row-stochastic adjacency once, then apply repeated sparse matmuls
        # Filter invalid neighbors (-1)
        idx = knn_index.reshape(-1)
        rows = np.repeat(np.arange(n_nodes, dtype=np.int32), k)
        mask = idx >= 0
        rows = rows[mask]
        cols = idx[mask].astype(np.int32, copy=False)
        # Row-normalized weights (mean over neighbors)
        data = np.full(rows.shape[0], 1.0 / float(k), dtype=np.float32)
        # Build CSR adjacency
        adj = scipy.sparse.csr_matrix((data, (rows, cols)), shape=(n_nodes, n_nodes))

        for iteration in range(num_iterations):
            # Flatten features to (n_nodes, f_dim * c_dim)
            flat = features.reshape(n_nodes, f_dim * c_dim)
            flat = adj.dot(flat)
            smoothed = flat.reshape(n_nodes, f_dim, c_dim)
            # Normalize across channel dimension per feature row
            norms = np.linalg.norm(smoothed, axis=2, keepdims=True)
            norms = np.maximum(norms, eps)
            features = smoothed / norms
            if iteration == 0:
                featuremap_results["VH"] = features.astype(np.float32, copy=True)
    else:
        # Vectorized with batching and advanced indexing
        target_bytes = max(1, int(memory_target_mb * 1024 * 1024))
        denom = max(1, 4 * k * f_dim * c_dim)
        batch_size = int(target_bytes // denom)
        if batch_size < 32:
            batch_size = 32
        batch_size = min(batch_size, n_nodes)

        for iteration in range(num_iterations):
            smoothed_features = np.empty_like(features)

            for start in range(0, n_nodes, batch_size):
                end = min(n_nodes, start + batch_size)
                idx_batch = knn_index[start:end]  # (B, k)
                # Gather neighbor features: (B, k, f_dim, c_dim)
                neighbor_features = features[idx_batch]
                # Mean over neighbors -> (B, f_dim, c_dim)
                mean_features = neighbor_features.mean(axis=1)
                # Normalize across channel dimension per feature row
                norms = np.linalg.norm(mean_features, axis=2, keepdims=True)
                norms = np.maximum(norms, eps)
                smoothed_features[start:end] = mean_features / norms

            features = smoothed_features
            if iteration == 0:
                featuremap_results["VH"] = features.astype(np.float32, copy=True)

    end_time = time.time()
    if verbose:
        print(ts() + f' Graph convolution completed in {end_time - start_time:.2f} seconds')

    return features, featuremap_results


                 

# @numba.njit()

[docs]
def tangent_space_approximation(
        data,
        graph,
        featuremap_kwds,
        ):
    """
    Compute the origial gauge in high-dimensional data by local SVD

    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The source data to be embedded by FeatureMAP.
    graph: sparse matrix
        The 1-skeleton of the high dimensional fuzzy simplicial set as
        represented by a graph for which we require a sparse matrix for the
        (weighted) adjacency matrix.
    featuremap_kwds: dict
        Key word arguments to be used by the FeatureMAP optimization.
            
    Returns
    -------
    Local tangent space U, S, V
    """
    
    import os
    # check if the data is sparse
    if scipy.sparse.issparse(data):
        data = data.toarray()

    n_neighbors = featuremap_kwds["n_neighbors"]
    n_neighbors_in_guage = int(featuremap_kwds["gauge_coefficient"] * n_neighbors)
    knn_index = featuremap_kwds["_knn_indices"].astype(np.int32)
    # clamp gauge neighbor count to available knn to avoid extra memory
    if n_neighbors_in_guage >= n_neighbors:
        n_neighbors_in_guage = n_neighbors - 1 if n_neighbors > 1 else 0

    # Build neighbor weights for each row from sparse graph without densifying
    # neighbor_weights has shape (n_samples, n_neighbors); positions without edges are 0
    n_samples = knn_index.shape[0]
    neighbor_weights = np.zeros((n_samples, n_neighbors), dtype=np.float32)
    G = graph.tocsr()
    for i in range(n_samples):
        inds = knn_index[i]
        # mask invalid indices (-1)
        for j in range(n_neighbors):
            nb = inds[j]
            if nb >= 0:
                # fetch scalar weight; G[i, nb] returns 1x1 sparse, use .A1 or .toarray()
                val = G[i, nb]
                # scipy sparse scalar -> np.matrix subclass or 0; convert
                w = float(val) if hasattr(val, "__float__") else float(val.toarray()[0, 0])
                neighbor_weights[i, j] = w

    gauge_u = [] # list of shape (n_neighbors_in_guage, n_neighbors_in_guage); store u
    singular_values = [] # list of shape (n_neighbors_in_guage,); store single values for each frame
    gauge_vh = [] # list of shape (n_neighbors_in_guage, d); store v
    
    T1 = time.time()
    gauge_u, singular_values, gauge_vh = local_svd(
            data,
            knn_index,
            neighbor_weights,
            n_neighbors,
            n_neighbors_in_guage,
            )
    
    # Avoid storing U to reduce RAM; keep singular values and VH only
    featuremap_kwds["Singular_value"] = np.array(singular_values).astype(np.float32, copy=True)
    featuremap_kwds["VH"] = np.array(gauge_vh).astype(np.float32, copy=True)
    T2 = time.time()
    if featuremap_kwds['verbose']:
        print(ts() + f' Local SVD time is {T2-T1}')

     # Before average 
    gauge_vh = featuremap_kwds["VH"].copy()
    ###########################################
    # # Average over NNs in original space. (Message passing accross knn network)
    #########################################
    # knn_index = featuremap_kwds["_knn_indices"]
    knn_index = knn_index
    gauge_vh = np.array(gauge_vh)

    num_iterations = featuremap_kwds["gcn_iterations"]

    if num_iterations is not None:
        num_iterations = num_iterations
    else:
        if data.shape[0] < 5000:
            num_iterations = int(np.log2(data.shape[0])/2)
        elif data.shape[0] < 10000:
            num_iterations = int(np.log2(data.shape[0])) 
        else:
            num_iterations = int(np.log2(data.shape[0])*2)
        # num_iterations  = 42

    gauge_vh, featuremap_kwds["VH"] = graph_convolution(gauge_vh, knn_index, num_iterations=num_iterations, verbose=featuremap_kwds['verbose'])

    featuremap_kwds["vh_smoothed"] = np.array(gauge_vh).astype(np.float32, copy=True)






[docs]
def project_gauge_to_knn_graph( 
        data,
        graph,
        featuremap_kwds,
        scale=10,
        use_negative_cosines=False,):
    """
    Project the gauge (rotation matrix) to the KNN graph and compute the transition probability matrix
    Modified from the scVelo: https://scvelo.readthedocs.io/en/stable/scvelo.utils.get_transition_matrix.html#scvelo.utils.get_transition_matrix

    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The source data to be embedded by FeatureMAP.
    graph: sparse matrix
        The 1-skeleton of the high dimensional fuzzy simplicial set as
        represented by a graph for which we require a sparse matrix for the
        (weighted) adjacency matrix.
    featuremap_kwds: dict
        Key word arguments to be used by the FeatureMAP optimization.
    scale: float
        The scale factor for the transition probability matrix
    use_negative_cosines: bool
        Whether to use negative cosine similarity in the transition probability matrix

    Returns
    -------
    transition_matrix: array of shape (n_samples, n_samples)
        The transition probability matrix
    """
    # for each edge, compute the transition probability by the (normalized) cosine similarity between edge and the gauge
    gauge_v1 = featuremap_kwds["VH"][:,0,:]
    gauge_v2 = featuremap_kwds["VH"][:,1,:]

    # gauge_v1 = featuremap_kwds["vh_smoothed"][:,0,:]
    # gauge_v2 = featuremap_kwds["vh_smoothed"][:,1,:]

    head = graph.row
    tail = graph.col

    # Build sparse transition scores only on existing edges
    rows = []
    cols = []
    data_v1 = []
    data_v2 = []

    for i in range(len(head)):
        j = int(head[i])
        k = int(tail[i])
        edge_vector = (data[k] - data[j]).astype(np.float32, copy=False)
        gv1 = gauge_v1[j].astype(np.float32, copy=False)
        gv2 = gauge_v2[j].astype(np.float32, copy=False)
        denom1 = (np.linalg.norm(edge_vector) * np.linalg.norm(gv1))
        denom2 = (np.linalg.norm(edge_vector) * np.linalg.norm(gv2))
        c1 = 0.0 if denom1 == 0.0 else float(np.dot(edge_vector, gv1) / denom1)
        c2 = 0.0 if denom2 == 0.0 else float(np.dot(edge_vector, gv2) / denom2)
        rows.append(j)
        cols.append(k)
        data_v1.append(c1)
        data_v2.append(c2)

    rows = np.asarray(rows, dtype=np.int32)
    cols = np.asarray(cols, dtype=np.int32)
    v1 = np.asarray(data_v1, dtype=np.float32)
    v2 = np.asarray(data_v2, dtype=np.float32)

    # Split pos/neg and apply expm1 transform sparsely
    v1_pos = np.clip(v1, 0.0, 1.0)
    v1_neg = np.clip(v1, -1.0, 0.0)
    v2_pos = np.clip(v2, 0.0, 1.0)
    v2_neg = np.clip(v2, -1.0, 0.0)

    tv1 = np.expm1(v1_pos * scale)
    tv2 = np.expm1(v2_pos * scale)
    if use_negative_cosines:
        tv1 -= np.expm1(-v1_neg * scale)
        tv2 -= np.expm1(-v2_neg * scale)
    else:
        tv1 += np.expm1(v1_neg * scale) + 1.0
        tv2 += np.expm1(v2_neg * scale) + 1.0

    n = data.shape[0]
    T_v1 = scipy.sparse.csr_matrix((tv1, (rows, cols)), shape=(n, n))
    T_v2 = scipy.sparse.csr_matrix((tv2, (rows, cols)), shape=(n, n))

    # Row-normalize (avoid densifying)
    row_sums_v1 = np.asarray(T_v1.sum(axis=1)).ravel()
    row_sums_v1[row_sums_v1 == 0.0] = 1.0
    row_sums_v2 = np.asarray(T_v2.sum(axis=1)).ravel()
    row_sums_v2[row_sums_v2 == 0.0] = 1.0
    inv_v1 = scipy.sparse.diags(1.0 / row_sums_v1)
    inv_v2 = scipy.sparse.diags(1.0 / row_sums_v2)
    T_v1 = inv_v1.dot(T_v1)
    T_v2 = inv_v2.dot(T_v2)

    featuremap_kwds["T_v1"] = T_v1
    featuremap_kwds["T_v2"] = T_v2



   

[docs]
def recover_gauge_from_embedding(
        data_embedding,
        featuremap_kwds,
        ):
    """
    Recover the gauge from the low-dimensional embedding

    Parameters
    ----------
    data_embedding: array of shape (n_samples, n_components)
        The low-dimensional embedding of the data
    featuremap_kwds: dict

    Returns
    -------
    gauge_v1_emb: array of shape (n_samples, n_components)
        The recovered gauge v1 in low dimesional space
    gauge_v2_emb: array of shape (n_samples, n_components)
        The recovered gauge v2 in low dimensional space
    """
    # Example input data
    knn_indices = featuremap_kwds["_knn_indices"]
    n_neighbors = featuremap_kwds["n_neighbors"]

    T_v1 = featuremap_kwds["T_v1"]
    T_v2 = featuremap_kwds["T_v2"]

    # Gather per-row transition weights for knn neighbors without densifying whole matrices
    n_samples = data_embedding.shape[0]
    k = knn_indices.shape[1]
    T_v1_knn = np.zeros((n_samples, k), dtype=np.float32)
    T_v2_knn = np.zeros((n_samples, k), dtype=np.float32)
    if scipy.sparse.issparse(T_v1):
        T1 = T_v1.tocsr()
        T2 = T_v2.tocsr()
        for i in range(n_samples):
            idx = knn_indices[i]
            for j in range(k):
                nb = idx[j]
                if nb >= 0:
                    T_v1_knn[i, j] = T1[i, nb]
                    T_v2_knn[i, j] = T2[i, nb]
    else:
        # dense
        T_v1_knn = np.take_along_axis(T_v1, knn_indices, axis=1)
        T_v2_knn = np.take_along_axis(T_v2, knn_indices, axis=1)
    
    # Modify the transition weights by subtracting 1/n_neighbors
    T_v1_knn = T_v1_knn - (1.0 / n_neighbors)
    T_v2_knn = T_v2_knn - (1.0 / n_neighbors)

    # Compute displacement using knn neighbors
    # avoid negative indices by masking
    valid_mask = knn_indices >= 0
    safe_knn = knn_indices.copy()
    safe_knn[~valid_mask] = 0
    displacement = data_embedding[safe_knn] - data_embedding[:, np.newaxis, :]  # shape (n_samples, k_neighbors, n_components)
    displacement_v1 = displacement
    # rotate the displacement by pi/2 
    displacement_v2 = np.array([displacement[:,:,1], -displacement[:,:,0]]).transpose(1,2,0)

    # Compute the gauge recovery
    # zero out contributions from invalid neighbors
    T_v1_knn = T_v1_knn * valid_mask.astype(np.float32)
    T_v2_knn = T_v2_knn * valid_mask.astype(np.float32)
    gauge_v1_emb = np.einsum('ij,ijk->ik', T_v1_knn, displacement_v1)  # shape (n_samples, n_components)
    gauge_v2_emb = np.einsum('ij,ijk->ik', T_v2_knn, displacement_v2)  # shape (n_samples, n_components)

    featuremap_kwds["gauge_v1_emb"] = gauge_v1_emb
    featuremap_kwds["gauge_v2_emb"] = gauge_v2_emb

    # stack along the second axis
    featuremap_kwds["VH_embedding"] = np.stack((gauge_v1_emb, gauge_v2_emb), axis=1)





[docs]
def tangent_space_embedding(
        featuremap_kwds,
        ):    
    """
    Embedding the gauge (rotation matrix) to low dim space

    Parameters
    ----------
    featuremap_kwds: dict
        Key word arguments to be used by the FeatureMAP optimization.  
    """
    
    ################################################################################
    # Embedding the gauge (rotation matrix) to low dim space
    # Consider the distance within topological structure of KNN network
    ###############################################################################
    random_state = featuremap_kwds["random_state"]
    n_neighbors = featuremap_kwds["n_neighbors"]
    metric = featuremap_kwds["metric"]
    min_dist = featuremap_kwds["min_dist"]
    n_epochs = featuremap_kwds["n_epochs"]

    gauge_vh = featuremap_kwds["vh_smoothed"]
    gauge_vh_copy = np.array(gauge_vh, copy=True)


    # First largest PC
    # T1 = time.time()
    rotation_matrix = gauge_vh_copy[:,0,:]
    
    # from sklearn.decomposition import TruncatedSVD
    from sklearn.decomposition import PCA
    pca = PCA(n_components=min(rotation_matrix.shape[1], 50))
    X_pca = pca.fit_transform(rotation_matrix)
    
    import umap
    # print('random_state', random_state)
    umap_embedding = umap.UMAP(n_neighbors=n_neighbors,random_state=random_state, metric=metric,min_dist=min_dist, n_epochs=n_epochs).fit_transform(X=X_pca)
    # umap_embedding = umap.UMAP(n_neighbors=30, metric='euclidean',min_dist=0.5).fit_transform(X=X_pca)
    featuremap_kwds['gauge_v1_emb'] = umap_embedding
    # T2 = time.time()
    # print(f'UMAP time is {T2-T1}')
    
    umap_embedding_norm = np.linalg.norm(umap_embedding, axis=1, keepdims=True)
    umap_embedding_norm[umap_embedding_norm == 0.0] = 1.0

    # gauge_vh_embedding is embedding of VH
    gauge_vh_mean_embedding = np.zeros([umap_embedding.shape[0], umap_embedding.shape[1], 2])
    normalized_embedding = umap_embedding / umap_embedding_norm
    gauge_vh_mean_embedding[:, 0, 0] = normalized_embedding[:, 0]
    gauge_vh_mean_embedding[:, 0, 1] = normalized_embedding[:, 1]
    
    
    # Second largest PC
    rotation_matrix = gauge_vh_copy[:,1,:]
    pca = PCA(n_components=min(rotation_matrix.shape[1], 50))
    X_pca = pca.fit_transform(rotation_matrix)
    
    
    umap_embedding = umap.UMAP(n_neighbors=n_neighbors,random_state=random_state, metric=metric,min_dist=min_dist, n_epochs=n_epochs).fit_transform(X=X_pca)
    # umap_embedding = umap.UMAP(n_neighbors=30,  metric='euclidean',min_dist=0.5).fit_transform(X=X_pca)
    featuremap_kwds['gauge_v2_emb'] = umap_embedding
    
    umap_embedding_norm = np.linalg.norm(umap_embedding, axis=1, keepdims=True)
    umap_embedding_norm[umap_embedding_norm == 0.0] = 1.0

    normalized_embedding = umap_embedding / umap_embedding_norm

    # gauge_vh_embedding is embedding of VH
    gauge_vh_mean_embedding[:, 1, 0] = normalized_embedding[:, 0]
    gauge_vh_mean_embedding[:, 1, 1] = normalized_embedding[:, 1]

    featuremap_kwds["VH_embedding"] = np.array(gauge_vh_mean_embedding).astype(np.float32, copy=True) # VH_embedding after average

  
      
    # # First largest PC
    # # T1 = time.time()
    # gauge_vh_copy = gauge_vh.copy()
    # rotation_matrix = np.array(gauge_vh_copy)[:,0,:]
    
    # # from sklearn.decomposition import TruncatedSVD
    # from sklearn.decomposition import PCA
    # pca = PCA(n_components=min(rotation_matrix.shape[1], 50))
    # X_pca = pca.fit_transform(rotation_matrix)

    # # Spectral embedding
    # graph, _, _ = fuzzy_simplicial_set(
    #                 X_pca,
    #                 n_neighbors,
    #                 random_state,
    #                 metric,
    #                 )

    # spectral_embedding = spectral_layout(
    #         X_pca,
    #         graph,
    #         dim=2,
    #         random_state=random_state,
    #         metric=metric,
    #     )
    
    # featuremap_kwds['gauge_v1_emb'] = spectral_embedding
    # # T2 = time.time()
    # # print(f'UMAP time is {T2-T1}')
    
    # spectral_embedding_norm = np.linalg.norm(spectral_embedding, axis=1)
        
    # # gauge_vh_embedding is embedding of VH
    # gauge_vh_mean_embedding = np.zeros([spectral_embedding.shape[0], spectral_embedding.shape[1],2])
    # gauge_vh_mean_embedding[:,0,0] = spectral_embedding[:, 0]/spectral_embedding_norm
    # gauge_vh_mean_embedding[:,0,1] = spectral_embedding[:, 1]/spectral_embedding_norm
    
    
    # # Second largest PC
    # rotation_matrix = np.array(gauge_vh_copy)[:,1,:]
    # pca = PCA(n_components=min(rotation_matrix.shape[1], 50))
    # X_pca = pca.fit_transform(rotation_matrix)
    
    #  # Spectral embedding
    # graph, _, _ = fuzzy_simplicial_set(
    #                 X_pca,
    #                 n_neighbors,
    #                 random_state,
    #                 metric,
    #                 )

    # spectral_embedding = spectral_layout(
    #         X_pca,
    #         graph,
    #         dim=2,
    #         random_state=random_state,
    #         metric=metric,
    #     )
    # featuremap_kwds['gauge_v2_emb'] = spectral_embedding
    
    # spectral_embedding_norm = np.linalg.norm(spectral_embedding, axis=1)
    
    # # gauge_vh_embedding is embedding of VH
    # gauge_vh_mean_embedding[:,1,0] = spectral_embedding[:, 0]/spectral_embedding_norm
    # gauge_vh_mean_embedding[:,1,1] = spectral_embedding[:, 1]/spectral_embedding_norm

    # featuremap_kwds["VH_embedding"] = np.array(gauge_vh_mean_embedding).astype(np.float32, copy=True) # VH_embedding after average
  
   

    

[docs]
def variation_embedding(
        data,
        featuremap_kwds,
        ):        
    """
    Compute the variation embedding of the data

    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The source data to be embedded by FeatureMAP.
    featuremap_kwds: dict
        Key word arguments to be used by the FeatureMAP optimization.
    
    Returns
    -------
    emb_variation: array of shape (n_samples, n_components)
        The optimized of ``graph`` of variation into an ``n_components`` dimensional
        euclidean space.

    """

    n_components=featuremap_kwds["n_components"]
    random_state=featuremap_kwds["random_state"]
    n_neighbors=featuremap_kwds["n_neighbors"]
    min_dist=featuremap_kwds["min_dist"]
    spread=featuremap_kwds["spread"]
    metric=featuremap_kwds["metric"]
    threshold=featuremap_kwds["threshold"]
      
    ##########################################
    # Compute feature variation
    ############################################
    gauge_vh = featuremap_kwds["vh_smoothed"]
    singular_values_collection = featuremap_kwds["Singular_value"]
    
    # Compute intrinsic dimensionality locally
    def pc_accumulation(arr, threshold):
        arr_sum = float(np.sum(np.square(arr)))
        if arr_sum <= 0.0:
            return 0

        temp_sum = 0.0
        for i in range(arr.shape[0]):
            temp_sum += arr[i] * arr[i]
            if temp_sum > arr_sum * threshold:
                return i

        # If the threshold is never exceeded (e.g., threshold >= 1), fall back to the last index
        return max(arr.shape[0] - 1, 0)
    
    # threshold = 0.5
    intrinsic_dim = np.zeros(data.shape[0]).astype(int)
    for i in range(data.shape[0]):            
        intrinsic_dim[i] = pc_accumulation(singular_values_collection[i], threshold)
    # Compute the gene norm in top k PCs (norm of the arrow in biplot)
    k = int(np.median(intrinsic_dim))
    if featuremap_kwds['verbose']:
        print(f'k is {k}')
    
    # gene_var_norm = np.linalg.norm(gauge_vh[:, :k, :], axis=1)
    gene_var_norm = np.sqrt(np.einsum('ijk, ijk->ik', gauge_vh[:, :k, :], gauge_vh[:, :k, :]))

    featuremap_kwds["variation_pc"] = np.array(gene_var_norm).astype(np.float32, copy=True)
    
    
    ######################################
    # Variation embedding
    ######################################## 
    import umap
    emb_variation = umap.UMAP(n_components=int(n_components), random_state=random_state, 
                              n_neighbors=n_neighbors,metric=metric,min_dist=min_dist, spread=spread).fit(gene_var_norm)
    featuremap_kwds["variation_embedding"] = emb_variation.embedding_
    featuremap_kwds["graph_v"] = emb_variation.graph_
    
    featuremap_kwds['gauge_v1_emb'] = emb_variation.embedding_[:, :2]
    # print(featuremap_kwds['gauge_v1_emb'].shape)
    featuremap_kwds['gauge_v2_emb'] = emb_variation.embedding_[:,[1,0]]
    featuremap_kwds['gauge_v2_emb'][:,0] = -featuremap_kwds['gauge_v2_emb'][:,0]

    # normalze the gauge_v1_emb and gauge_v2_emb
    gauge_v1_emb_norm = np.linalg.norm(featuremap_kwds['gauge_v1_emb'], axis=1, keepdims=True)
    gauge_v2_emb_norm = np.linalg.norm(featuremap_kwds['gauge_v2_emb'], axis=1, keepdims=True)

    gauge_v1_emb_norm[gauge_v1_emb_norm == 0.0] = 1.0
    gauge_v2_emb_norm[gauge_v2_emb_norm == 0.0] = 1.0

    featuremap_kwds['VH_embedding'] = np.stack((
        featuremap_kwds['gauge_v1_emb'] / gauge_v1_emb_norm,
        featuremap_kwds['gauge_v2_emb'] / gauge_v2_emb_norm
    ), axis=1)

    return emb_variation.embedding_



"""
Add gauge to 1-simplex;
embed it with gauge in low-dimensional space
"""

[docs]
def simplicial_set_embedding_with_tangent_space_embedding(
    data,
    graph,
    n_components,
    initial_alpha,
    a,
    b,
    gamma,
    negative_sample_rate,
    n_epochs,
    init,
    random_state,
    metric,
    metric_kwds,
    featuremap_kwds,
    output_feat,
    output_metric=dist.named_distances_with_gradients["euclidean"],
    output_metric_kwds={},
    euclidean_output=True,
    parallel=False,
    verbose=False,
    tqdm_kwds=None,
    output_variation=False,
    # original_data_flag=True,
    # pca_vh=None
):
    """Perform a fuzzy simplicial set embedding, using a specified
    initialisation method and then minimizing the fuzzy set cross entropy
    between the 1-skeletons of the high and low dimensional fuzzy simplicial
    sets.

    Parameters
    ----------
    data: array of shape (n_samples, n_features)
        The source data to be embedded by FeatureMAP.

    graph: sparse matrix
        The 1-skeleton of the high dimensional fuzzy simplicial set as
        represented by a graph for which we require a sparse matrix for the
        (weighted) adjacency matrix.

    n_components: int
        The dimensionality of the euclidean space into which to embed the data.

    initial_alpha: float
        Initial learning rate for the SGD.

    a: float
        Parameter of differentiable approximation of right adjoint functor

    b: float
        Parameter of differentiable approximation of right adjoint functor

    gamma: float
        Weight to apply to negative samples.

    negative_sample_rate: int (optional, default 5)
        The number of negative samples to select per positive sample
        in the optimization process. Increasing this value will result
        in greater repulsive force being applied, greater optimization
        cost, but slightly more accuracy.

    n_epochs: int (optional, default 0)
        The number of training epochs to be used in optimizing the
        low dimensional embedding. Larger values result in more accurate
        embeddings. If 0 is specified a value will be selected based on
        the size of the input dataset (200 for large datasets, 500 for small).

    init: string
        How to initialize the low dimensional embedding. Options are:
            * 'spectral': use a spectral embedding of the fuzzy 1-skeleton
            * 'random': assign initial embedding positions at random.
            * A numpy array of initial embedding positions.

    random_state: numpy RandomState or equivalent
        A state capable being used as a numpy random state.

    metric: string or callable
        The metric used to measure distance in high dimensional space; used if
        multiple connected components need to be layed out.

    metric_kwds: dict
        Key word arguments to be passed to the metric function; used if
        multiple connected components need to be layed out.

    featuremap_kwds: dict
        Key word arguments to be used by the FeatureMAP optimization.


    output_metric: function
        Function returning the distance between two points in embedding space and
        the gradient of the distance wrt the first argument.

    output_metric_kwds: dict
        Key word arguments to be passed to the output_metric function.

    euclidean_output: bool
        Whether to use the faster code specialised for euclidean output metrics

    parallel: bool (optional, default False)
        Whether to run the computation using numba parallel.
        Running in parallel is non-deterministic, and is not used
        if a random seed has been set, to ensure reproducibility.

    verbose: bool (optional, default False)
        Whether to report information on the current progress of the algorithm.

    tqdm_kwds: dict
        Key word arguments to be used by the tqdm progress bar.

    Returns
    -------
    embedding: array of shape (n_samples, n_components)
        The optimized of ``graph`` into an ``n_components`` dimensional
        euclidean space.
    """
    graph = graph.tocoo() #TODO: tocoo or toarray?
    head = graph.row
    tail = graph.col
    # weight = graph.toarray() 
    # weight = graph.data
    # print(np.count_nonzero(weight, axis=1))

    graph.sum_duplicates()
    n_vertices = graph.shape[1]

    # For smaller datasets we can use more epochs
    if graph.shape[0] <= 10000:
        default_epochs = 400
    else:
        default_epochs = 200

    # Use more epochs for FeatureMAP
    # if featuremap:
    default_epochs += 200
    

    if n_epochs is None:
        n_epochs = default_epochs

    if n_epochs > 10:
        graph.data[graph.data < (graph.data.max() / float(n_epochs))] = 0.0
    else:
        graph.data[graph.data < (graph.data.max() / float(default_epochs))] = 0.0

    graph.eliminate_zeros()

    """
    Initialization
    """
    if isinstance(init, str) and init == "random":
        embedding = random_state.uniform(
            low=-10.0, high=10.0, size=(graph.shape[0], n_components)
        ).astype(np.float32)
    elif isinstance(init, str) and init == "spectral":
        # We add a little noise to avoid local minima for optimization to come
        initialisation = spectral_layout(
            data,
            graph,
            n_components,
            random_state,
            metric=metric,
            metric_kwds=metric_kwds,
        )
        expansion = 10.0 / np.abs(initialisation).max()
        embedding = (initialisation * expansion).astype(
            np.float32
        ) + random_state.normal(
            scale=0.0001, size=[graph.shape[0], n_components]
        ).astype(
            np.float32
        )
    else:
        init_data = np.array(init)
        if len(init_data.shape) == 2:
            if np.unique(init_data, axis=0).shape[0] < init_data.shape[0]:
                tree = KDTree(init_data)
                dist, ind = tree.query(init_data, k=2)
                nndist = np.mean(dist[:, 1])
                embedding = init_data + random_state.normal(
                    scale=0.001 * nndist, size=init_data.shape
                ).astype(np.float32)
            else:
                embedding = init_data

    epochs_per_sample = make_epochs_per_sample(graph.data, n_epochs)

   
    rng_state = random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64)

    aux_data = {}
    
    # Compute the gauge in high-dimensional data
    if verbose:
        print(ts() + " Computing tangent space")
    T1 = time.time()
    n_neighbors=featuremap_kwds["n_neighbors"]
    n_neighbors_in_guage=int(featuremap_kwds["gauge_coefficient"] * n_neighbors)
    tangent_space_approximation(data, graph, featuremap_kwds=featuremap_kwds)
    T2 = time.time()
    if verbose:
        print(ts() + f' Tangent_space_approximation time is {T2-T1}')
    
    # if verbose:
    #     print(ts() + " Project gauge to knn_graph")
    # # Project the gauge to KNN graph to compute the transition probability matrix
    # project_gauge_to_knn_graph(data, graph, featuremap_kwds=featuremap_kwds)
   
    '''
    Variation embedding only
    '''
    
    T1 = time.time()
    embedding = variation_embedding(data=data,featuremap_kwds=featuremap_kwds)  
    T2 = time.time()

    if output_variation == True:
        if verbose:
            print(ts() + f' Variation_embedding time is {T2-T1}')

        return embedding
    else:    

        # Embedding the gauge (rotation matrix) to low dim space
        if verbose:
            print(ts() + " Tangent space embedding")
        # T1 = time.time()
        # tangent_space_embedding(featuremap_kwds)
        # T2 = time.time()
        # if verbose:
        #     print(ts() + f' Tangent_space_embedding time is {T2-T1}')


        head = graph.row
        tail = graph.col
        
        mu_sum = np.zeros(n_vertices, dtype=np.float32) # For each node, sum of edge existing probabilty of incident edges
        for i in range(len(head)):
            j = head[i]
            k = tail[i]
            mu = graph.data[i] # edge existing probability

            mu_sum[j] += mu
            mu_sum[k] += mu
        
        epsilon = 1e-8
        # print('singular_values, ' + str(singular_values))
        
        # Get the variance from singular values, corresponding to squared Euclidean distance
        singular_values = featuremap_kwds["Singular_value"].copy()
        ro_var = np.square(np.array(singular_values))
        ro = np.log(epsilon + ro_var) # radius in each directions of the hyper-ellipsoid
        # print('ro, ' + str(ro))
        
        std_ro = np.std(ro, axis=0)
        std_ro = np.where(std_ro == 0.0, 1.0, std_ro)
        R = (ro - np.mean(ro, axis=0)) / std_ro # normalization by column
        featuremap_kwds["R"] = R.astype(np.float32, copy=True)
        featuremap_kwds["mu"] = graph.data
        featuremap_kwds["mu_sum"] = mu_sum


        min_embed = np.min(embedding, 0)
        max_embed = np.max(embedding, 0)
        range_embed = max_embed - min_embed
        range_embed[range_embed == 0.0] = 1.0

        embedding = (
            10.0
            * (embedding - min_embed)
            / range_embed
        ).astype(np.float32, order="C")
        
        if verbose:
            print(ts() + ' Start optimizing layout')
        T1 = time.time()
        embedding = optimize_layout_euclidean_anisotropic_projection(
            embedding,
            embedding,
            head,
            tail,
            n_epochs,
            n_vertices,
            epochs_per_sample,
            a,
            b,
            rng_state,
            gamma,
            initial_alpha,
            negative_sample_rate,
            parallel=parallel,
            verbose=verbose,
            # featuremap=featuremap,
            featuremap_kwds=featuremap_kwds,
            tqdm_kwds=tqdm_kwds,
            move_other=True,
        )
        T2 = time.time()
        if verbose:
            print(ts() + f' Optimize layout time is {T2-T1}')


        
    """
    Compute the radius of two principal directions in embedding directions 
    """
    if output_feat:
        T1 = time.time()
        # if output_feat:
        if verbose:
            print(ts() + " Computing expression embedding densities")
        
        # scale the embedding coordinates
        # embedding = embedding * 100        
        # Compute graph in embedding
        (knn_indices, knn_dists, _,) = nearest_neighbors(
            embedding,
            featuremap_kwds["n_neighbors"],
            "euclidean",
            {},
            False,
            random_state,
            verbose=False,
        )
    
        emb_graph, _, _, emb_dists = fuzzy_simplicial_set(
            embedding,
            featuremap_kwds["n_neighbors"],
            random_state,
            "euclidean",
            {},
            knn_indices,
            knn_dists,
            verbose=False,
            return_dists=True,
        )
    
        
        emb_graph = emb_graph.tocoo()
        emb_graph.sum_duplicates()
        emb_graph.eliminate_zeros()
    
        n_vertices = emb_graph.shape[1]
        dim = embedding.shape[1]
    
        phi_sum = np.zeros(n_vertices, dtype=np.float32)
        re_sum = np.zeros([n_vertices, dim], dtype=np.float32)
        re_sum_without_log = np.zeros([n_vertices, dim], dtype=np.float32)
    
        
        VH_embedding = featuremap_kwds["VH_embedding"]
    
        head = emb_graph.row
        tail = emb_graph.col
        
        for i in range(len(head)):
            j = head[i]
            k = tail[i]
            
            current = embedding[j]
            other = embedding[k]
            
            current_VH = VH_embedding[j] # array shape of (dim, dim)
            other_VH = VH_embedding[k]
            
            vec_diff = other - current
    
            # D = emb_dists[j, k]
            phi = emb_graph.data[i]
            phi_sum[j] += phi
            phi_sum[k] += phi
            
            for d in range(dim):
                vec_proj_vh_j = np.dot(vec_diff, current_VH[d]) # project to d-th rotation direction
                vec_proj_vh_k = np.dot(vec_diff, other_VH[d])        
         
                re_sum[j,d] += phi * vec_proj_vh_j * vec_proj_vh_j
                re_sum[k,d] += phi * vec_proj_vh_k * vec_proj_vh_k
                
        for i in range(re_sum.shape[0]):
            re_sum_without_log[i] = epsilon + (re_sum[i] / phi_sum[i])
    
        for i in range(re_sum.shape[0]):
            re_sum[i] = np.log(epsilon + (re_sum[i] / phi_sum[i]))
            
        featuremap_kwds["re_sum"] = np.sqrt(re_sum) # Variance of Gaussian distribution
        featuremap_kwds["re_sum_without_log"] = np.sqrt(re_sum_without_log) # Variance of Gaussian distribution
        featuremap_kwds["emb_graph"] = emb_graph
        featuremap_kwds["emb_dists"] = emb_dists
        featuremap_kwds["emb_knn_indices"] = knn_indices
         
        T2 = time.time()
        if verbose:
            print(ts() + f' Embedding radii computation time is {T2-T1}')
        
    return embedding

        






[docs]
class FeatureMAP(BaseEstimator):
    """Feature-preserving Manifold Approximation and Projection

    Approximate the high dimensional manifold of the input data by low dimensional embedding,
    with feature-preserving property. The algorithm is based on the paper:
    Feature-preserving manifold approximation and projection to visulize single-cell data, Yang et al. 2023.

    The algorithm is based on the UMAP algorithm, and the main difference is that FeatureMAP 
    uses the tangent space of the input data to compute the (embedding) gauge for low dimensional embedding,
    where the features are locally illustrated by the embedding tangent space (i.e., gauge) and data points are
    embedded to the low dimensional space with the gauge.


    Parameters
    ----------
    n_neighbors: float (optional, default 15)
        The size of local neighborhood (in terms of number of neighboring
        sample points) used for manifold approximation. Larger values
        result in more global views of the manifold, while smaller
        values result in more local data being preserved. In general
        values should be in the range 2 to 100.

    n_components: int (optional, default 2)
        The dimension of the space to embed into. This defaults to 2 to
        provide easy visualization, but can reasonably be set to any
        integer value in the range 2 to 100.

    metric: string  (currently, only 'euclidean' is supported)
        The metric to use to compute distances in high dimensional space.

    n_epochs: int (optional, default None)
        The number of training epochs to be used in optimizing the
        low dimensional embedding. Larger values result in more accurate
        embeddings. If None is specified a value will be selected based on
        the size of the input dataset (200 for large datasets, 500 for small).

    learning_rate: float (optional, default 1.0)
        The initial learning rate for the embedding optimization.

    init: string (optional, default 'spectral')
        How to initialize the low dimensional embedding. Options are:
            * 'spectral': use a spectral embedding of the fuzzy 1-skeleton
            * 'random': assign initial embedding positions at random.
            * A numpy array of initial embedding positions.

    min_dist: float (optional, default 0.5)
        The effective minimum distance between embedded points. Smaller values
        will result in a more clustered/clumped embedding where nearby points
        on the manifold are drawn closer together, while larger values will
        result on a more even dispersal of points. The value should be set
        relative to the ``spread`` value, which determines the scale at which
        embedded points will be spread out.

    spread: float (optional, default 1.0)
        The effective scale of embedded points. In combination with ``min_dist``
        this determines how clustered/clumped the embedded points are.

    low_memory: bool (optional, default True)
        For some datasets the nearest neighbor computation can consume a lot of
        memory. If you find that FeatureMAP is failing due to memory constraints
        consider setting this option to True. This approach is more
        computationally expensive, but avoids excessive memory use.

    set_op_mix_ratio: float (optional, default 1.0)
        Interpolate between (fuzzy) union and intersection as the set operation
        used to combine local fuzzy simplicial sets to obtain a global fuzzy
        simplicial sets. Both fuzzy set operations use the product t-norm.
        The value of this parameter should be between 0.0 and 1.0; a value of
        1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy
        intersection.

    local_connectivity: int (optional, default 1)
        The local connectivity required -- i.e. the number of nearest
        neighbors that should be assumed to be connected at a local level.
        The higher this value the more connected the manifold becomes
        locally. In practice this should be not more than the local intrinsic
        dimension of the manifold.

    repulsion_strength: float (optional, default 1.0)
        Weighting applied to negative samples in low dimensional embedding
        optimization. Values higher than one will result in greater weight
        being given to negative samples.

    negative_sample_rate: int (optional, default 5)
        The number of negative samples to select per positive sample
        in the optimization process. Increasing this value will result
        in greater repulsive force being applied, greater optimization
        cost, but slightly more accuracy.

    a: float (optional, default None)
        More specific parameters controlling the embedding. If None these
        values are set automatically as determined by ``min_dist`` and
        ``spread``.
    b: float (optional, default None)
        More specific parameters controlling the embedding. If None these
        values are set automatically as determined by ``min_dist`` and
        ``spread``.

    random_state: int, RandomState instance or None, optional (default: None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    metric_kwds: dict (optional, default None)
        Arguments to pass on to the metric, such as the ``p`` value for
        Minkowski distance. If None then no arguments are passed on.

    angular_rp_forest: bool (optional, default False)
        Whether to use an angular random projection forest to initialise
        the approximate nearest neighbor search. This can be faster, but is
        mostly on useful for metric that use an angular style distance such
        as cosine, correlation etc. In the case of those metrics angular forests
        will be chosen automatically.

    transform_seed: int (optional, default 42)
        Random seed used for the stochastic aspects of the transform operation.
        This ensures consistency in transform operations.

    verbose: bool (optional, default False)
        Controls verbosity of logging.

    tqdm_kwds: dict (optional, defaul None)
        Key word arguments to be used by the tqdm progress bar.

    unique: bool (optional, default False)
        Controls if the rows of your data should be uniqued before being
        embedded.  If you have more duplicates than you have n_neighbour
        you can have the identical data points lying in different regions of
        your space.  It also violates the definition of a metric.
        For to map from internal structures back to your data use the variable
        _unique_inverse_.

    featuremap: bool (optional, default False)
        Specifies whether the featity-augmented objective of FeatureMAP
        should be used for optimization. Turning on this option generates
        an embedding where the local featities are encouraged to be correlated
        with those in the original space. Parameters below with the prefix 'feat'
        further control the behavior of this extension.

    feat_lambda: float (optional, default 2.0)
        Controls the regularization weight of the anisotropic density correlation term
        in FeatureMAP. Higher values prioritize density preservation over the
        FeatureMAP objective, and vice versa for values closer to zero. Setting this
        parameter to zero is equivalent to running the original UMAP algorithm.

    feat_frac: float (optional, default 0.3)
        Controls the fraction of epochs (between 0 and 1) where the
        feature-augmented objective is used in FeatureMAP. The first
        (1 - feat_frac) fraction of epochs optimize the original UMAP objective
        before introducing the anisotropic density correlation term.

    feat_gauge_coefficient: float (default 1.0)
        A coefficient multiplication of n_neighbors as number of neighbors in 
        gauge construction. Larger coefficient means long distance information
    
    feat_var_shift: float (optional, default 0.1)
        A small constant added to the variance of local radii in the
        embedding when calculating the anisotropic density correlation objective to
        prevent numerical instability from dividing by a small number
    
    output_feat: bool (optional, default False)
        Whether to compute the embedding radius. 
        If set to True, the algorithm will compute the embedding radius
        and return the embedding radius.

    disconnection_distance: float (optional, default np.inf or maximal value for bounded distances)
        Disconnect any vertices of distance greater than or equal to disconnection_distance when approximating the
        manifold via our k-nn graph. This is particularly useful in the case that you have a bounded metric.  The
        UMAP assumption that we have a connected manifold can be problematic when you have points that are maximally
        different from all the rest of your data.  The connected manifold assumption will make such points have perfect
        similarity to a random set of other points.  Too many such points will artificially connect your space.
    
    output_variation: bool (optional, default True)
        Whether to compute the variation embedding only. If set to True, the algorithm will compute the variation embedding
        only and return the embedding. If set to False, the algorithm will compute the Feature-augmented embedding.
    
    threshold: float (optional, default 0.9)
        The threshold to compute the intrinsic dimensionality of the data. The intrinsic dimensionality is computed
        by the number of principal components that accumulates 90% of the variance of the data.
       
    """

    def __init__(
        self,
        n_neighbors=15,
        n_components=2,
        metric="euclidean",
        metric_kwds=None,
        output_metric="euclidean",
        output_metric_kwds=None,
        n_epochs=None,
        learning_rate=1.0,
        init="spectral",
        min_dist=0.5,
        spread=1.0,
        low_memory=True,
        n_jobs=-1,
        set_op_mix_ratio=1.0,
        local_connectivity=1.0,
        repulsion_strength=1.0,
        negative_sample_rate=5,
        a=None,
        b=None,
        random_state=None,
        angular_rp_forest=False,
        transform_seed=42,
        transform_mode="embedding",
        force_approximation_algorithm=False,
        verbose=False,
        tqdm_kwds=None,
        unique=False,
        # featuremap=False,
        feat_lambda=0.5,
        feat_frac=0.3,
        feat_gauge_coefficient = 1.0,
        feat_var_shift=0.1,
        output_feat=False,
        disconnection_distance=None,
        output_variation=False,
        threshold=0.9,
        gcn_iterations=None
        # original_data_flag=True,
        # pca_vh=None
    ):

[docs]
        self.n_neighbors = n_neighbors


[docs]
        self.metric = metric


[docs]
        self.output_metric = output_metric


[docs]
        self.metric_kwds = metric_kwds


[docs]
        self.output_metric_kwds = output_metric_kwds


[docs]
        self.n_epochs = n_epochs


[docs]
        self.init = init


[docs]
        self.n_components = n_components


[docs]
        self.repulsion_strength = repulsion_strength


[docs]
        self.learning_rate = learning_rate



[docs]
        self.spread = spread


[docs]
        self.min_dist = min_dist


[docs]
        self.low_memory = low_memory


[docs]
        self.set_op_mix_ratio = set_op_mix_ratio


[docs]
        self.local_connectivity = local_connectivity


[docs]
        self.negative_sample_rate = negative_sample_rate


[docs]
        self.random_state = random_state


[docs]
        self.angular_rp_forest = angular_rp_forest


[docs]
        self.transform_seed = transform_seed


[docs]
        self.transform_mode = transform_mode


[docs]
        self.force_approximation_algorithm = force_approximation_algorithm


[docs]
        self.verbose = verbose


[docs]
        self.tqdm_kwds = tqdm_kwds


[docs]
        self.unique = unique


        # self.featuremap = featuremap

[docs]
        self.feat_lambda = feat_lambda


[docs]
        self.feat_frac = feat_frac


[docs]
        self.feat_gauge_coefficient = feat_gauge_coefficient


[docs]
        self.feat_var_shift = feat_var_shift


[docs]
        self.output_feat = output_feat


[docs]
        self.disconnection_distance = disconnection_distance

        

[docs]
        self.output_variation = output_variation


[docs]
        self.threshold = threshold


[docs]
        self.gcn_iterations = gcn_iterations

        # self.original_data_flag = original_data_flag,
        # self.pca_vh = pca_vh
        
        

[docs]
        self.n_jobs = n_jobs



[docs]
        self.a = a


[docs]
        self.b = b


    def _validate_parameters(self):
        if self.set_op_mix_ratio < 0.0 or self.set_op_mix_ratio > 1.0:
            raise ValueError("set_op_mix_ratio must be between 0.0 and 1.0")
        if self.repulsion_strength < 0.0:
            raise ValueError("repulsion_strength cannot be negative")
        if self.min_dist > self.spread:
            raise ValueError("min_dist must be less than or equal to spread")
        if self.min_dist < 0.0:
            raise ValueError("min_dist cannot be negative")
        if not isinstance(self.init, str) and not isinstance(self.init, np.ndarray):
            raise ValueError("init must be a string or ndarray")
        if isinstance(self.init, str) and self.init not in (
            "spectral",
            "random",
        ):
            raise ValueError('string init values must be "spectral" or "random"')
        if (
            isinstance(self.init, np.ndarray)
            and self.init.shape[1] != self.n_components
        ):
            raise ValueError("init ndarray must match n_components value")
        if not isinstance(self.metric, str) and not callable(self.metric):
            raise ValueError("metric must be string or callable")
        if self.negative_sample_rate < 0:
            raise ValueError("negative sample rate must be positive")
        if self._initial_alpha < 0.0:
            raise ValueError("learning_rate must be positive")
        if self.n_neighbors < 2:
            raise ValueError("n_neighbors must be greater than 1")
        if not isinstance(self.n_components, int):
            if isinstance(self.n_components, str):
                raise ValueError("n_components must be an int")
            if self.n_components % 1 != 0:
                raise ValueError("n_components must be a whole number")
            try:
                # this will convert other types of int (eg. numpy int64)
                # to Python int
                self.n_components = int(self.n_components)
            except ValueError:
                raise ValueError("n_components must be an int")
        if self.n_components < 1:
            raise ValueError("n_components must be greater than 0")
        # if self.n_components larger than the number of features, it will raise an error
        if self.n_components >= self._raw_data.shape[1]:
            raise ValueError(
                "n_components must be less than or equal to the number of features"
            )
        if self.n_epochs is not None and (
            self.n_epochs < 0 or not isinstance(self.n_epochs, int)
        ):
            raise ValueError("n_epochs must be a nonnegative integer")
        if self.metric_kwds is None:
            self._metric_kwds = {}
        else:
            self._metric_kwds = self.metric_kwds
        if self.output_metric_kwds is None:
            self._output_metric_kwds = {}
        else:
            self._output_metric_kwds = self.output_metric_kwds
        # save repeated checks later on
        if scipy.sparse.isspmatrix_csr(self._raw_data):
            self._sparse_data = True
        else:
            self._sparse_data = False
        # set input distance metric & inverse_transform distance metric
        if callable(self.metric):
            in_returns_grad = self._check_custom_metric(
                self.metric, self._metric_kwds, self._raw_data
            )
            if in_returns_grad:
                _m = self.metric

                @numba.njit(fastmath=True)
                def _dist_only(x, y, *kwds):
                    return _m(x, y, *kwds)[0]

                self._input_distance_func = _dist_only
                self._inverse_distance_func = self.metric
            else:
                self._input_distance_func = self.metric
                self._inverse_distance_func = None
                warn(
                    "custom distance metric does not return gradient; inverse_transform will be unavailable. "
                    "To enable using inverse_transform method, define a distance function that returns a tuple "
                    "of (distance [float], gradient [np.array])"
                )
        elif self.metric == "precomputed":
            if self.unique:
                raise ValueError("unique is poorly defined on a precomputed metric")
            warn("using precomputed metric; inverse_transform will be unavailable")
            self._input_distance_func = self.metric
            self._inverse_distance_func = None
        elif self.metric == "hellinger" and self._raw_data.min() < 0:
            raise ValueError("Metric 'hellinger' does not support negative values")
        elif self.metric in dist.named_distances:
            if self._sparse_data:
                if self.metric in sparse.sparse_named_distances:
                    self._input_distance_func = sparse.sparse_named_distances[
                        self.metric
                    ]
                else:
                    raise ValueError(
                        "Metric {} is not supported for sparse data".format(self.metric)
                    )
            else:
                self._input_distance_func = dist.named_distances[self.metric]
            try:
                self._inverse_distance_func = dist.named_distances_with_gradients[
                    self.metric
                ]
            except KeyError:
                warn(
                    "gradient function is not yet implemented for {} distance metric; "
                    "inverse_transform will be unavailable".format(self.metric)
                )
                self._inverse_distance_func = None
        elif self.metric in pynn_named_distances:
            if self._sparse_data:
                if self.metric in pynn_sparse_named_distances:
                    self._input_distance_func = pynn_sparse_named_distances[self.metric]
                else:
                    raise ValueError(
                        "Metric {} is not supported for sparse data".format(self.metric)
                    )
            else:
                self._input_distance_func = pynn_named_distances[self.metric]

            warn(
                "gradient function is not yet implemented for {} distance metric; "
                "inverse_transform will be unavailable".format(self.metric)
            )
            self._inverse_distance_func = None
        else:
            raise ValueError("metric is neither callable nor a recognised string")
        # set output distance metric
        if callable(self.output_metric):
            out_returns_grad = self._check_custom_metric(
                self.output_metric, self._output_metric_kwds
            )
            if out_returns_grad:
                self._output_distance_func = self.output_metric
            else:
                raise ValueError(
                    "custom output_metric must return a tuple of (distance [float], gradient [np.array])"
                )
        elif self.output_metric == "precomputed":
            raise ValueError("output_metric cannnot be 'precomputed'")
        elif self.output_metric in dist.named_distances_with_gradients:
            self._output_distance_func = dist.named_distances_with_gradients[
                self.output_metric
            ]
        elif self.output_metric in dist.named_distances:
            raise ValueError(
                "gradient function is not yet implemented for {}.".format(
                    self.output_metric
                )
            )
        else:
            raise ValueError(
                "output_metric is neither callable nor a recognised string"
            )
        # set angularity for NN search based on metric
        if self.metric in (
            "cosine",
            "correlation",
            "dice",
            "jaccard",
            "ll_dirichlet",
            "hellinger",
        ):
            self.angular_rp_forest = True

        if self.n_jobs < -1 or self.n_jobs == 0:
            raise ValueError("n_jobs must be a postive integer, or -1 (for all cores)")

        if self.feat_lambda < 0.0:
            raise ValueError("feat_lambda cannot be negative")
        if self.feat_frac < 0.0 or self.feat_frac > 1.0:
            raise ValueError("feat_frac must be between 0.0 and 1.0")
        if self.feat_gauge_coefficient < 0.0:
            raise ValueError("feat_gauge_coefficient cannot be negative")
        if self.feat_var_shift < 0.0:
            raise ValueError("feat_var_shift cannot be negative")

        self._featuremap_kwds = {
            "lambda": self.feat_lambda, #if self.featuremap else 0.0,
            "frac": self.feat_frac, #if self.featuremap else 0.0,
            "gauge_coefficient": self.feat_gauge_coefficient, #if self.featuremap else 0.0,
            "var_shift": self.feat_var_shift,
            "n_neighbors": self.n_neighbors,
            "n_jobs": self.n_jobs,
            "random_state": self.random_state,
            "metric": self.metric,
            "min_dist": self.min_dist,
            "spread": self.spread,
            "n_components": self.n_components,
            "verbose": self.verbose,
            "n_epochs": self.n_epochs,
            "threshold": self.threshold,   
            'gcn_iterations': self.gcn_iterations,
            
        }
        # if self.output_variation:
        #     self._featuremap_kwds['threshold'] = self.threshold

        # if self.featuremap:
        if self.output_metric not in ("euclidean", "l2"):
            raise ValueError(
                "Non-Euclidean output metric not supported for FeatureMAP."
            )

        # This will be used to prune all edges of greater than a fixed value from our knn graph.
        # We have preset defaults described in DISCONNECTION_DISTANCES for our bounded measures.
        # Otherwise a user can pass in their own value.
        if self.disconnection_distance is None:
            self._disconnection_distance = DISCONNECTION_DISTANCES.get(
                self.metric, np.inf
            )
        elif isinstance(self.disconnection_distance, int) or isinstance(
            self.disconnection_distance, float
        ):
            self._disconnection_distance = self.disconnection_distance
        else:
            raise ValueError("disconnection_distance must either be None or a numeric.")

        if self.tqdm_kwds is None:
            self.tqdm_kwds = {}
        else:
            if isinstance(self.tqdm_kwds, dict) is False:
                raise ValueError(
                    "tqdm_kwds must be a dictionary. Please provide valid tqdm "
                    "parameters as key value pairs. Valid tqdm parameters can be "
                    "found here: https://github.com/tqdm/tqdm#parameters"
                )
        if "desc" not in self.tqdm_kwds:
            self.tqdm_kwds["desc"] = "Epochs completed"
        if "bar_format" not in self.tqdm_kwds:
            bar_f = "{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]"
            self.tqdm_kwds["bar_format"] = bar_f

    def _check_custom_metric(self, metric, kwds, data=None):
        # quickly check to determine whether user-defined
        # self.metric/self.output_metric returns both distance and gradient
        if data is not None:
            # if checking the high-dimensional distance metric, test directly on
            # input data so we don't risk violating any assumptions potentially
            # hard-coded in the metric (e.g., bounded; non-negative)
            x, y = data[np.random.randint(0, data.shape[0], 2)]
        else:
            # if checking the manifold distance metric, simulate some data on a
            # reasonable interval with output dimensionality
            x, y = np.random.uniform(low=-10, high=10, size=(2, self.n_components))

        if scipy.sparse.issparse(data):
            metric_out = metric(x.indices, x.data, y.indices, y.data, **kwds)
        else:
            metric_out = metric(x, y, **kwds)
        # True if metric returns iterable of length 2, False otherwise
        return hasattr(metric_out, "__iter__") and len(metric_out) == 2
    



[docs]
    def fit(self, X):
        """Fit X into an embedded space.

        Parameters
        ----------
        X : array, shape (n_samples, n_features) or (n_samples, n_samples)
            If the metric is 'precomputed' X must be a square distance
            matrix. Otherwise it contains a sample per row. If the method
            is 'exact', X may be a sparse matrix of type 'csr', 'csc'
            or 'coo'.
        """
  
        # # SVD decompostion of data 
        # if X.shape[1] > 100:
        #     if self.verbose:
        #         print("Performing SVD decomposition on the data")
        #     u, s, vh = scipy.sparse.linalg.svds(X, k=100, which='LM', random_state=42)
        #     X = np.matmul(u, np.diag(s))
        # else:
        #     vh = np.eye(X.shape[1])

        X = check_array(X, dtype=np.float32, accept_sparse="csr", order="C")
        self._raw_data = X

        # Handle all the optional arguments, setting default
        if self.a is None or self.b is None:
            self._a, self._b = find_ab_params(self.spread, self.min_dist)
        else:
            self._a = self.a
            self._b = self.b

        if isinstance(self.init, np.ndarray):
            init = check_array(self.init, dtype=np.float32, accept_sparse=False)
        else:
            init = self.init

        self._initial_alpha = self.learning_rate

        self._validate_parameters()

        # SVD decomposition of data
        # self._featuremap_kwds['svd_vh'] = vh.T

        if self.verbose:
            print(str(self))

        self._original_n_threads = numba.get_num_threads()
        if self.n_jobs > 0 and self.n_jobs is not None:
            numba.set_num_threads(self.n_jobs)

        # Check if we should unique the data
        # We've already ensured that we aren't in the precomputed case
        if self.unique:
            # check if the matrix is feate
            if self._sparse_data:
                # Call a sparse unique function
                index, inverse, counts = csr_unique(X)
            else:
                index, inverse, counts = np.unique(
                    X,
                    return_index=True,
                    return_inverse=True,
                    return_counts=True,
                    axis=0,
                )[1:4]
            if self.verbose:
                print(
                    "Unique=True -> Number of data points reduced from ",
                    X.shape[0],
                    " to ",
                    X[index].shape[0],
                )
                most_common = np.argmax(counts)
                print(
                    "Most common duplicate is",
                    index[most_common],
                    " with a count of ",
                    counts[most_common],
                )
            # We'll expose an inverse map when unique=True for users to map from our internal structures to their data
            self._unique_inverse_ = inverse
        # If we aren't asking for unique use the full index.
        # This will save special cases later.
        else:
            index = list(range(X.shape[0]))
            inverse = list(range(X.shape[0]))
        # print('inverse, ' +  str(inverse))
        # Error check n_neighbors based on data size
        if X[index].shape[0] <= self.n_neighbors:
            if X[index].shape[0] == 1:
                self.embedding_ = np.zeros(
                    (1, self.n_components)
                )  # needed to sklearn comparability
                return self

            warn(
                "n_neighbors is larger than the dataset size; truncating to "
                "X.shape[0] - 1"
            )
            self._n_neighbors = X[index].shape[0] - 1
            # if self.featuremap:
            self._featuremap_kwds["n_neighbors"] = self._n_neighbors
        else:
            self._n_neighbors = self.n_neighbors

        # Note: unless it causes issues for setting 'index', could move this to
        # initial sparsity check above
        if self._sparse_data and not X.has_sorted_indices:
            X.sort_indices()

        random_state = check_random_state(self.random_state)

        if self.verbose:
            print(ts(), "Construct fuzzy simplicial set")

        """
        Construct the fuzzy_simplicial_set
        """    
        if self.metric == "precomputed" and self._sparse_data:
            # For sparse precomputed distance matrices, we just argsort the rows to find
            # nearest neighbors. To make this easier, we expect matrices that are
            # symmetrical (so we can find neighbors by looking at rows in isolation,
            # rather than also having to consider that sample's column too).
            # print("Computing KNNs for sparse precomputed distances...")
            if sparse_tril(X).getnnz() != sparse_triu(X).getnnz():
                raise ValueError(
                    "Sparse precomputed distance matrices should be symmetrical!"
                )
            if not np.all(X.diagonal() == 0):
                raise ValueError("Non-zero distances from samples to themselves!")
            self._knn_indices = np.zeros((X.shape[0], self.n_neighbors), dtype=np.int)
            self._knn_dists = np.zeros(self._knn_indices.shape, dtype=np.float)
            for row_id in range(X.shape[0]):
                # Find KNNs row-by-row
                row_data = X[row_id].data
                row_indices = X[row_id].indices
                if len(row_data) < self._n_neighbors:
                    raise ValueError(
                        "Some rows contain fewer than n_neighbors distances!"
                    )
                row_nn_data_indices = np.argsort(row_data)[: self._n_neighbors]
                self._knn_indices[row_id] = row_indices[row_nn_data_indices]
                self._knn_dists[row_id] = row_data[row_nn_data_indices]

            # Disconnect any vertices farther apart than _disconnection_distance
            disconnected_index = self._knn_dists >= self._disconnection_distance
            self._knn_indices[disconnected_index] = -1
            self._knn_dists[disconnected_index] = np.inf
            edges_removed = disconnected_index.sum()

            (
                self.graph_,
                self._sigmas,
                self._rhos,
                # self.graph_dists_,
            ) = fuzzy_simplicial_set(
                X[index],
                self.n_neighbors,
                random_state,
                "precomputed",
                self._metric_kwds,
                self._knn_indices,
                self._knn_dists,
                self.angular_rp_forest,
                self.set_op_mix_ratio,
                self.local_connectivity,
                True,
                self.verbose,
                # self.output_feat,

            )
            # Report the number of vertices with degree 0 in our our self.graph_
            # This ensures that they were properly disconnected.
            vertices_disconnected = np.sum(
                np.array(self.graph_.sum(axis=1)).flatten() == 0
            )
            raise_disconnected_warning(
                edges_removed,
                vertices_disconnected,
                self._disconnection_distance,
                self._raw_data.shape[0],
                verbose=self.verbose,
            )
        # Handle small cases efficiently by computing all distances
        # TODO: get _knn_indices when less than 4096 
        elif X[index].shape[0] < 1000 and not self.force_approximation_algorithm:
            self._small_data = True
            try:
                # sklearn pairwise_distances fails for callable metric on sparse data
                _m = self.metric if self._sparse_data else self._input_distance_func
                dmat = pairwise_distances(X[index], metric=_m, **self._metric_kwds)
            except (ValueError, TypeError) as e:
                # metric is numba.jit'd or not supported by sklearn,
                # fallback to pairwise special

                if self._sparse_data:
                    # Get a fresh metric since we are casting to feate
                    if not callable(self.metric):
                        _m = dist.named_distances[self.metric]
                        dmat = dist.pairwise_special_metric(
                            X[index].toarray(),
                            metric=_m,
                            kwds=self._metric_kwds,
                        )
                    else:
                        dmat = dist.pairwise_special_metric(
                            X[index],
                            metric=self._input_distance_func,
                            kwds=self._metric_kwds,
                        )
                else:
                    dmat = dist.pairwise_special_metric(
                        X[index],
                        metric=self._input_distance_func,
                        kwds=self._metric_kwds,
                    )
            # set any values greater than disconnection_distance to be np.inf.
            # This will have no effect when _disconnection_distance is not set since it defaults to np.inf.
            edges_removed = np.sum(dmat >= self._disconnection_distance)
            dmat[dmat >= self._disconnection_distance] = np.inf
            # print(dmat.shape)
            # print(dmat[:2, :])
            self._knn_dists = np.sort(dmat, axis=1)
            self._knn_indices = np.argsort(dmat, axis=1)
            # print( self._knn_dists[:2, :])
            # print( self._knn_indices[:2, :])
            
            (
                self.graph_,
                self._sigmas,
                self._rhos,
                # self.graph_dists_,
            ) = fuzzy_simplicial_set(
                dmat,
                self._n_neighbors,
                random_state,
                "precomputed",
                self._metric_kwds,
                None,
                None,
                self.angular_rp_forest,
                self.set_op_mix_ratio,
                self.local_connectivity,
                True,
                self.verbose,
                # self.output_feat,

            )
            # Report the number of vertices with degree 0 in our our self.graph_
            # This ensures that they were properly disconnected.
            vertices_disconnected = np.sum(
                np.array(self.graph_.sum(axis=1)).flatten() == 0
            )
            raise_disconnected_warning(
                edges_removed,
                vertices_disconnected,
                self._disconnection_distance,
                self._raw_data.shape[0],
                verbose=self.verbose,
            )
        else:
            # Standard case
            self._small_data = False
            # Standard case
            if self._sparse_data and self.metric in pynn_sparse_named_distances:
                nn_metric = self.metric
            elif not self._sparse_data and self.metric in pynn_named_distances:
                nn_metric = self.metric
            else:
                nn_metric = self._input_distance_func

            (
                self._knn_indices,
                self._knn_dists,
                self._knn_search_index,
            ) = nearest_neighbors(
                X[index],
                self._n_neighbors,
                nn_metric,
                self._metric_kwds,
                self.angular_rp_forest,
                random_state,
                self.low_memory,
                use_pynndescent=True,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
            )

            # Disconnect any vertices farther apart than _disconnection_distance
            disconnected_index = self._knn_dists >= self._disconnection_distance
            self._knn_indices[disconnected_index] = -1
            self._knn_dists[disconnected_index] = np.inf
            edges_removed = disconnected_index.sum()

            (
                self.graph_,
                self._sigmas,
                self._rhos,
                # self.graph_dists_,
            ) = fuzzy_simplicial_set(
                X[index],
                self.n_neighbors,
                random_state,
                nn_metric,
                self._metric_kwds,
                self._knn_indices,
                self._knn_dists,
                self.angular_rp_forest,
                self.set_op_mix_ratio,
                self.local_connectivity,
                True,
                self.verbose,
                # self.output_feat,

            )
            # Report the number of vertices with degree 0 in our our self.graph_
            # This ensures that they were properly disconnected.
            vertices_disconnected = np.sum(
                np.array(self.graph_.sum(axis=1)).flatten() == 0
            )
            raise_disconnected_warning(
                edges_removed,
                vertices_disconnected,
                self._disconnection_distance,
                self._raw_data.shape[0],
                verbose=self.verbose,
            )

        # if self.featuremap or self.output_feat:
        # self._featuremap_kwds["graph_dists"] = self.graph_dists_
        self._featuremap_kwds["n_neighbors"] = self.n_neighbors
        # self._featuremap_kwds["n_neighbors_in_guage"] = self.n_neighbors # TODO: n_neighbors_in_guage same as n_neighbors or not
        self._featuremap_kwds["_knn_indices"] = self._knn_indices

        if self.verbose:
            print(ts(), "Construct embedding")

        if self.transform_mode == "embedding":
            self.embedding_ = self._fit_embed_data(
                self._raw_data[index],
                self.n_epochs,
                # self.featuremap,
                init,
                random_state,  
            )
            # print('aux_data, ' + str(aux_data))
            # Assign any points that are fully disconnected from our manifold(s) to have embedding
            # coordinates of np.nan.  These will be filtered by our plotting functions automatically.
            # They also prevent users from being deceived a distance query to one of these points.
            # Might be worth moving this into simplicial_set_embedding or _fit_embed_data
            disconnected_vertices = np.array(self.graph_.sum(axis=1)).flatten() == 0
            if len(disconnected_vertices) > 0:
                self.embedding_[disconnected_vertices] = np.full(
                    self.n_components, np.nan
                )

            self.embedding_ = self.embedding_[inverse]
            # if self.output_feat:
            #     self.rad_orig_ = aux_data["rad_orig"][inverse]
            #     self.rad_emb_ = aux_data["rad_emb"][inverse]

        self._featuremap_kwds["X_embedding"] = self.embedding_
        if self.verbose:
            print(ts() + " Finished embedding")

        numba.set_num_threads(self._original_n_threads)
        self._input_hash = joblib.hash(self._raw_data)

        return self


    def _fit_embed_data(self, X, n_epochs, init, random_state):
        """A method wrapper for simplicial_set_embedding that can be
        replaced by subclasses.
        """
        # if featuremap:
        return simplicial_set_embedding_with_tangent_space_embedding(
            X,
            self.graph_,
            self.n_components,
            self._initial_alpha,
            self._a,
            self._b,
            self.repulsion_strength,
            self.negative_sample_rate,
            n_epochs,
            init,
            random_state,
            self._input_distance_func,
            self._metric_kwds,
            # self.featuremap,
            self._featuremap_kwds,
            self.output_feat,
            self._output_distance_func,
            self._output_metric_kwds,
            self.output_metric in ("euclidean", "l2"),
            self.random_state is None,
            self.verbose,
            tqdm_kwds=self.tqdm_kwds,
            output_variation=self.output_variation,
        )
        
            


[docs]
    def fit_transform(self, X):
        """Fit X into an embedded space and return that transformed
        output.

        Parameters
        ----------
        X : array, shape (n_samples, n_features) or (n_samples, n_samples)
            If the metric is 'precomputed' X must be a square distance
            matrix. Otherwise it contains a sample per row.

        Returns
        -------
        X_new : array, shape (n_samples, n_components)
            Embedding of the training data in low-dimensional space.

        """
        self.fit(X)
        if self.transform_mode == "embedding":
            if self.output_variation:
                return self.embedding_
            # elif self.output_feat:
            #     return self.embedding_, self.rad_orig_,  self.rad_emb_
            else:
                return self.embedding_
        elif self.transform_mode == "graph":
            return self.graph_
        else:
            raise ValueError(
                "Unrecognized transform mode {}; should be one of 'embedding' or 'graph'".format(
                    self.transform_mode
                )
            )


    



[docs]
    def __repr__(self):
        from sklearn.utils._pprint import _EstimatorPrettyPrinter
        import re
        pp = _EstimatorPrettyPrinter(
            compact=True,
            indent=1,
            indent_at_name=True,
            n_max_elements_to_show=50,
        )
        pp._changed_only = True
        repr_ = pp.pformat(self)
        repr_ = re.sub("tqdm_kwds={.*},", "", repr_, flags=re.S)
        # remove empty lines
        repr_ = re.sub("\n *\n", "\n", repr_, flags=re.S)
        # remove extra whitespaces after a comma
        repr_ = re.sub(", +", ", ", repr_)
        return repr_





import numpy as np
import numba
import umap.distances as dist
from umap.utils import tau_rand_int
from tqdm.auto import tqdm

@numba.njit()

[docs]
def clip(val):
    """Standard clamping of a value into a fixed range (in this case -4.0 to
    4.0)

    Parameters
    ----------
    val: float
        The value to be clamped.

    Returns
    -------
    The clamped value, now fixed to be in the range -4.0 to 4.0.
    """
    if val > 4.0:
        return 4.0
    elif val < -4.0:
        return -4.0
    else:
        return val





@numba.njit(
    "f4(f4[::1],f4[::1])",
    fastmath=True,
    cache=True,
    locals={
        "result": numba.types.float32,
        "diff": numba.types.float32,
        "dim": numba.types.intp,
    },
)

[docs]
def rdist(x, y):
    """Reduced Euclidean distance.

    Parameters
    ----------
    x: array of shape (embedding_dim,)
    y: array of shape (embedding_dim,)

    Returns
    -------
    The squared euclidean distance between x and y
    """
    result = 0.0
    dim = x.shape[0]
    for i in range(dim):
        diff = x[i] - y[i]
        result += diff * diff

    return result


# def vdiff(x, y):
#     """    
#     Vector difference.
    
#     Parameters
#     ----------
#     x : array of shape (embedding_dim,)
#     y : array of shape (embedding_dim,)
    
#     Returns
#     -------
#     The vector difference between x and y 

#     """
#     result = y - x
#     return result



def _optimize_layout_euclidean_single_epoch_grad(
    head_embedding,
    tail_embedding,
    head,
    tail,
    n_vertices,
    epochs_per_sample,
    a,
    b,
    rng_state,
    gamma,
    dim,
    move_other,
    alpha,
    epochs_per_negative_sample,
    epoch_of_next_negative_sample,
    epoch_of_next_sample,
    n,
    featuremap_flag,
    feat_phi_sum,
    feat_re_sum,
    feat_re_cov,
    feat_re_std,
    feat_re_mean,
    feat_lambda,
    feat_R,
    feat_VH_embedding,
    feat_mu,
    feat_mu_tot,
    
):  
    # print('epochs_per_sample.shape[0]' + str(epochs_per_sample.shape[0]))
    for i in numba.prange(epochs_per_sample.shape[0]):
        if epoch_of_next_sample[i] <= n:
            j = head[i]
            k = tail[i]

            current = head_embedding[j]
            other = tail_embedding[k]
            
            # vec_diff = vdiff(current, other)
            vec_diff = other - current

            inner_product = np.dot(vec_diff, vec_diff)
            outer_product = np.outer(vec_diff, vec_diff)
            
            grad_d = np.zeros(dim, dtype=np.float32)
            
            #dim = 1
            # random select a dimension from dim
            # d = tau_rand_int(rng_state) % dim
            # print('featuremap_flag' + str(featuremap_flag))
            if featuremap_flag:
                current_VH = feat_VH_embedding[j] # rotation matrix embedding;  
                other_VH = feat_VH_embedding[k]
                
                grad_cor_coeff = np.zeros(dim, dtype=np.float32)
                # TODO: focus on each d of dim
                for d in numba.prange(dim):
                    phi = 1.0 / (1.0 + a * pow(inner_product, b))
                    dphi_term = (
                        2 * a * b * pow(inner_product, b - 1) * vec_diff / 
                        (1.0 + a * pow(inner_product, b))
                    )

                    v_j = current_VH[d]
                    v_k = other_VH[d]
                    project_vec_j = np.dot(v_j, vec_diff)
                    project_vec_k = np.dot(v_k, vec_diff)

                    #TODO: check feat_phi_sum, feat_re_sum
                    # Have changed the order of j, k
                    q_jk = phi / feat_phi_sum[j]
                    q_kj = phi / feat_phi_sum[k]
                    
                   
                    # drj = q_jk * (
                    #     project_vec_j * (2 * v_j -  project_vec_j * dphi_term ) / np.exp(feat_re_sum[j,d]) + dphi_term
                    # )
                    # drk = q_kj * (
                    #     project_vec_k * (2 * v_k -  project_vec_k * dphi_term ) / np.exp(feat_re_sum[k,d]) + dphi_term
                    # )
                    drj = np.zeros(dim)
                    drk = np.zeros(dim)
                    for s in numba.prange(dim):
                        drj[s] = q_jk * (
                            project_vec_j * (2 * v_j[s] -  project_vec_j * dphi_term[s] ) / np.exp(feat_re_sum[j,d]) + dphi_term[s]
                        )
                        drk[s] = q_kj * (
                            project_vec_k * (2 * v_k[s] -  project_vec_k * dphi_term[s] ) / np.exp(feat_re_sum[k,d]) + dphi_term[s]
                        )
                    
                    # check feat_re_std: array shape (dim,)
                    re_std_sq = feat_re_std[d] * feat_re_std[d]
                    
         
                    weight_j = (
                        feat_R[j,d]
                        - feat_re_cov[d] * (feat_re_sum[j,d] - feat_re_mean[d]) / re_std_sq
                    )
                    weight_k = (
                        feat_R[k,d]
                        - feat_re_cov[d] * (feat_re_sum[k,d] - feat_re_mean[d]) / re_std_sq
                    )
                    for s in numba.prange(dim):
                        grad_cor_coeff[s] += (weight_j * drj[s] + weight_k * drk[s]) / feat_re_std[d]
    
                for s in numba.prange(dim):
                    grad_cor_coeff[s] = (
                        grad_cor_coeff[s]
                        * feat_lambda
                        * feat_mu_tot
                        / feat_mu[i]
                        / n_vertices
                    )

            # grad_coeff = np.zeros(dim, dtype=np.float32)
            if inner_product > 0.0:
                # gradient of log Q_jk 
                grad_coeff_term = (-2.0) * a * b * pow(inner_product, b - 1.0) 
                grad_coeff_term = grad_coeff_term / (a * pow(inner_product, b) + 1.0)
            else:
                grad_coeff_term = 0
                  
            # gradient w.r.t y_j; sampling edge (j,k), Z_jk = y_k - y_j 
            # grad_d = clip_arr(* vec_diff[d]) * vec_diff[d] * (-1.0)
            for d in numba.prange(dim):
                grad_d = clip(grad_coeff_term * vec_diff[d] * (-1.0))
                # grad_d = grad_coeff[d] * (-1.0) * vec_diff[d]

                if featuremap_flag:
                    # FIXME: grad_cor_coeff might be referenced before assignment
                    grad_d += clip(grad_cor_coeff[d] * (-1.0))
                
                # 
                current[d] += grad_d * alpha
                if move_other:
                    other[d] += -grad_d * alpha

            epoch_of_next_sample[i] += epochs_per_sample[i]

            n_neg_samples = int(
                (n - epoch_of_next_negative_sample[i]) / epochs_per_negative_sample[i]
            )

            for p in numba.prange(n_neg_samples):
                k = tau_rand_int(rng_state) % n_vertices

                other = tail_embedding[k]

                # vec_diff = vdiff(current, other)
                vec_diff = other - current
                inner_product = np.dot(vec_diff, vec_diff)
                
                if inner_product > 0.0:
                    grad_coeff_term = 2.0 * gamma * b
                    #divisor = np.repeat((0.001 + inner_product) * (a * pow(inner_product, b) + 1), dim)
                    grad_coeff_term = grad_coeff_term / ((0.001 + inner_product) * (a * pow(inner_product, b) + 1))
                elif j == k:
                    continue
                else:
                    grad_coeff_term = 0.0
                

                for d in numba.prange(dim):
                    if grad_coeff_term > 0.0:
                        grad_d = clip(grad_coeff_term * vec_diff[d] * (-1.0))
                    else:
                        grad_d = 2.0 
                    current[d] += grad_d * alpha

            epoch_of_next_negative_sample[i] += (
                n_neg_samples * epochs_per_negative_sample[i]
            )


# Compute the variance in each direction of embedding under rotation VH 
def _optimize_layout_euclidean_featuremap_epoch_init_grad(
    head_embedding,
    tail_embedding,
    head,
    tail,
    # random_state,
    # gauge_vh,
    VH_embedding,
    # rotation_angle,
    a,
    b,
    re_sum,
    phi_sum,
):
    """
    Compute the principal radius in the dim-dimensional embedding space.
    
    Parameter
    ---------
    re_sum: array of shape (n_vertices, dim) 
        The principal radius in the embedding space
    phi_sum: array of shape (n_vertices,)
        For node i, the sum of edge existing probability incident to this node
    """
    
    # VH_embedding.fill(0)
    # random_initial = random_state.randint(0, head_embedding.shape[0], 1).astype(np.int64)
    # vh_initial = gauge_vh[random_initial[0]][0]
    # for i in range(head_embedding.shape[0]):
    #     vh_vector = gauge_vh[i][0]
    #     # angle = angle_between(vh_initial, vh_vector)
    #     angle = random_state.random() * 3.14
    #     # angle = random_state.random() * 3.14
    #     # angle = random_state.normal(0,1)

    #     # rotation_angle[i] = angle
    #     vh_embedding = np.array([[np.cos(angle), np.sin(angle)],[-np.sin(angle), np.cos(angle)]])
    #     # vh_embedding = np.identity(2)
    #     VH_embedding[i] = vh_embedding
 
    re_sum.fill(0)
    phi_sum.fill(0)
    
    dim = head_embedding.shape[1]

    for i in numba.prange(head.size):
        j = head[i]
        k = tail[i]

        current = head_embedding[j]
        other = tail_embedding[k]
        
        # current_VH = VH[j][:,:dim] # projection matrix; keep first dim dimensions 
        # other_VH = VH[k][:,:dim]
        
        # Restrict dim to 2
        # theta_j = rotation_angle[j]
        # theta_k = rotation_angle[k]
        
        current_VH = VH_embedding[j] # array shape of (dim, dim)
        other_VH = VH_embedding[k]
        # current_VH = np.identity(2)
        # other_VH = np.identity(2)
        
        # vec_diff = vdiff(current, other)
        vec_diff = other - current
        # inner_product = np.dot(vec_diff, vec_diff)
        dist_squared = rdist(current, other)
        
        phi = 1.0 / (1.0 + a * pow(dist_squared, b))
        phi_sum[j] += phi
        phi_sum[k] += phi
        
        for d in numba.prange(dim):
            vec_proj_vh_j = np.dot(vec_diff, current_VH[d]) # project to d-th rotation direction
            vec_proj_vh_k = np.dot(vec_diff, other_VH[d])        
     
            re_sum[j,d] += phi * vec_proj_vh_j * vec_proj_vh_j
            re_sum[k,d] += phi * vec_proj_vh_k * vec_proj_vh_k
        
        # vec_proj_vh_j = np.dot(current_VH, vec_diff) # project to rotation direction
        # vec_proj_vh_k = np.dot(other_VH, vec_diff)     
        
        # re_sum[j] += phi * vec_proj_vh_j * vec_proj_vh_j
        # re_sum[k] += phi * vec_proj_vh_k * vec_proj_vh_k
         
    epsilon = 1e-8
    for i in numba.prange(re_sum.shape[0]):
        re_sum[i] = np.log(epsilon + (re_sum[i] / phi_sum[i]))
    
    # for i in range(re_sum.shape[0]):
    #     re_sum[i] = np.log(epsilon + np.sqrt((re_sum[i] / phi_sum[i])))
   
        




[docs]
def optimize_layout_euclidean_anisotropic_projection(
    head_embedding,
    tail_embedding,
    head,
    tail,
    n_epochs,
    n_vertices,
    epochs_per_sample,
    a,
    b,
    rng_state,
    gamma=1.0,
    initial_alpha=1.0,
    negative_sample_rate=5.0,
    parallel=False,
    verbose=False,
    # featuremap=False,
    featuremap_kwds=None,
    tqdm_kwds=None,
    move_other=False,
):
    """Improve an embedding using stochastic gradient descent to minimize the
    fuzzy set cross entropy between the 1-skeletons of the high dimensional
    and low dimensional fuzzy simplicial sets. In practice this is done by
    sampling edges based on their membership strength (with the (1-p) terms
    coming from negative sampling similar to word2vec).
    Parameters
    ----------
    head_embedding: array of shape (n_samples, n_components)
        The initial embedding to be improved by SGD.
    tail_embedding: array of shape (source_samples, n_components)
        The reference embedding of embedded points. If not embedding new
        previously unseen points with respect to an existing embedding this
        is simply the head_embedding (again); otherwise it provides the
        existing embedding to embed with respect to.
    head: array of shape (n_1_simplices)
        The indices of the heads of 1-simplices with non-zero membership.
    tail: array of shape (n_1_simplices)
        The indices of the tails of 1-simplices with non-zero membership.
    n_epochs: int
        The number of training epochs to use in optimization.
    n_vertices: int
        The number of vertices (0-simplices) in the dataset.
    epochs_per_sample: array of shape (n_1_simplices)
        A float value of the number of epochs per 1-simplex. 1-simplices with
        weaker membership strength will have more epochs between being sampled.
    a: float
        Parameter of differentiable approximation of right adjoint functor
    b: float
        Parameter of differentiable approximation of right adjoint functor
    rng_state: array of int64, shape (3,)
        The internal state of the rng
    gamma: float (optional, default 1.0)
        Weight to apply to negative samples.
    initial_alpha: float (optional, default 1.0)
        Initial learning rate for the SGD.
    negative_sample_rate: int (optional, default 5)
        Number of negative samples to use per positive sample.
    parallel: bool (optional, default False)
        Whether to run the computation using numba parallel.
        Running in parallel is non-deterministic, and is not used
        if a random seed has been set, to ensure reproducibility.
    verbose: bool (optional, default False)
        Whether to report information on the current progress of the algorithm.
    featuremap: bool (optional, default False)
        Whether to use the feature-augmented featuremap objective
    featuremap_kwds: dict (optional, default None)
        Auxiliary data for featuremap
    tqdm_kwds: dict (optional, default None)
        Keyword arguments for tqdm progress bar.
    move_other: bool (optional, default False)
        Whether to adjust tail_embedding alongside head_embedding
    Returns
    -------
    embedding: array of shape (n_samples, n_components)
        The optimized embedding.
    """

    dim = head_embedding.shape[1]
    alpha = initial_alpha

    epochs_per_negative_sample = epochs_per_sample / negative_sample_rate
    epoch_of_next_negative_sample = epochs_per_negative_sample.copy()
    epoch_of_next_sample = epochs_per_sample.copy()

    optimize_fn = numba.njit(
           _optimize_layout_euclidean_single_epoch_grad, 
           fastmath=True, 
           parallel=parallel,
         # nopython=True,
           cache=True
    )
    # optimize_fn = _optimize_layout_euclidean_single_epoch_grad
    
    if featuremap_kwds is None:
        featuremap_kwds = {}
    if tqdm_kwds is None:
        tqdm_kwds = {}
    
    # if featuremap:
    feat_init_fn = numba.njit(
         _optimize_layout_euclidean_featuremap_epoch_init_grad,
         fastmath=True,
         parallel=parallel,
        #  nopython=True,
         cache=True
    )
    
    
    # feat_init_fn = _optimize_layout_euclidean_featuremap_epoch_init_grad

    feat_mu_tot = np.sum(featuremap_kwds["mu_sum"]) / 2  # sum of all edges' existing probability, float, shape of (1,)
    # should we modify lambda? Yes, we should
    #TODO: modify lambda
    feat_lambda = featuremap_kwds["lambda"] 
    feat_R = featuremap_kwds["R"] # array shape of (n_vertices d)
    feat_VH = featuremap_kwds["VH"]
    # feat_VH_embedding = featuremap_kwds["VH_embedding"] # array of shape (n_vertices, dim)
    feat_VH_embedding = np.repeat(np.eye(2)[np.newaxis, :, :], head_embedding.shape[0], axis=0) # initialize vh_embedding by identity matrix
    feat_VH_embedding = feat_VH_embedding.astype(np.float32)

    # feat_rotation_angle = featuremap_kwds["rotation_angle"]
    feat_mu = featuremap_kwds["mu"] # edge probability
    feat_phi_sum = np.zeros(n_vertices, dtype=np.float32) # For each node i in embedding space, sum of edge existing probality incident to this node
    feat_re_sum = np.zeros([n_vertices, dim], dtype=np.float32) # Embedding radius in principal directions
    feat_var_shift = featuremap_kwds["var_shift"]
    # else: 
    #     feat_mu_tot = 0
    #     feat_lambda = 0
    #     feat_R = np.zeros(1, dtype=np.float32)
    #     feat_VH = np.zeros(1, dtype=np.float32)
    #     feat_VH_embedding = np.zeros(1, dtype=np.float32)
    #     # feat_rotation_angle = np.zeros(1, dtype=np.float32)
    #     feat_mu = np.zeros(1, dtype=np.float32)
    #     feat_phi_sum = np.zeros(1, dtype=np.float32)
    #     feat_re_sum = np.zeros(1, dtype=np.float32)
        
    if "disable" not in tqdm_kwds:
        tqdm_kwds["disable"] = not verbose
    
    for n in tqdm(range(n_epochs), **tqdm_kwds):
        featuremap_flag = (
            # featuremap and 
            (featuremap_kwds["lambda"] > 0)
            and (((n + 1) / float(n_epochs)) > (1 - featuremap_kwds["frac"]))
        )

        if featuremap_flag:
            # Compute the initial embedding under rotation VH
            # T1 = time.time()
            feat_init_fn(
                head_embedding,
                tail_embedding,
                head,
                tail,
                # random_state,
                # feat_VH,
                feat_VH_embedding,
                a,
                b,
                feat_re_sum,
                feat_phi_sum,
            )
            # T2 = time.time()
            # print(f'featuremap initialization time is {T2-T1}')
              
            # feat_init_fn.inspect_types()
            
            # FIXME: feat_var_shift might be referenced before assignment
            feat_re_std = np.sqrt(np.var(feat_re_sum, axis=0) + feat_var_shift)
            feat_re_mean = np.mean(feat_re_sum, axis=0)
            feat_re_sum_centered = np.subtract(feat_re_sum, feat_re_mean)

            product = np.diag(np.dot(feat_re_sum_centered.T, feat_R[:,:dim]))
            feat_re_cov = product / (n_vertices - 1)
        else:
            feat_re_std = np.zeros(dim, dtype=np.float32)
            feat_re_mean = np.zeros(dim, dtype=np.float32)
            feat_re_cov = np.zeros(dim, dtype=np.float32)

        # # recover the gauge from the low dimensional embedding
        # if featuremap_flag and n % 10 == 0:
        #     # print('recover gauge from embedding')
        #     recover_gauge_from_embedding(data_embedding=head_embedding, featuremap_kwds=featuremap_kwds)
        #     feat_VH_embedding = featuremap_kwds["VH_embedding"].astype(np.float32)

        # T1 = time.time()
        optimize_fn(
            head_embedding,
            tail_embedding,
            head,
            tail,
            n_vertices,
            epochs_per_sample,
            a,
            b,
            rng_state,
            gamma,
            dim,
            move_other,
            alpha,
            epochs_per_negative_sample,
            epoch_of_next_negative_sample,
            epoch_of_next_sample,
            n,
            featuremap_flag,
            feat_phi_sum,
            feat_re_sum,
            feat_re_cov,
            feat_re_std,
            feat_re_mean,
            feat_lambda,
            feat_R,
            feat_VH_embedding,
            feat_mu,
            feat_mu_tot,
        )

        # T2 = time.time()
        # print(f'Optimize_fn time is {T2-T1}')
        alpha = initial_alpha * (1.0 - (float(n) / float(n_epochs)))
        
    return head_embedding