Source code for scbean.model.visgp

import os
import pandas as pd
import numpy as np
from sklearn import preprocessing
import multiprocessing
from tqdm import tqdm
#import fastlmm.util.stats.quadform as qf
import fastlmmclib.quadform as qf
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3"
import tensorflow as tf
import tensorflow_probability as tfp

tfb = tfp.bijectors
tfd = tfp.distributions
tfk = tfp.math.psd_kernels
dtype = np.float64

os.environ["CUDA_VISIBLE_DEVICES"] = '-1'  # not GPU

[docs]class VISGP(object): """ Initialize VISGP object Parameters ---------- adata: anndata, (default: None) The input data for the model, including gene expression levels (adata.X, genes*spots), spatial location coordinates (adata.var) and genes name (adata.obs). inducing_points: int, optional (default: 20) The number of inducing points. iters: int, optional (default: 1000) The number of iters. processes: int, optional (default: 1) The number of concurrent processes. """ def __init__(self, adata=None, inducing_points=20, iters=1000, processes=1): self.adata = adata self.inducing_points = inducing_points self.iters = iters self.processes = processes self.acc = 1e-7
[docs] def covariance_matrix(self, length): """ Calculate the covariance matrix. Parameters ---------- length: int kernel parameter Returns ---------- :class:`~Numpy array(s)` Covariance matrix """ Xsq = np.sum(np.square(self.adata.var), 1) R2 = -2. * np.dot(self.adata.var, self.adata.var.T) + (Xsq[:, None] + Xsq[None, :]) R2 = np.clip(R2, 1e-12, np.inf) K = np.exp(-R2 / (2 * length ** 2)) return K
[docs] def score_test(self, K, y): """ Score statistics test. Parameters ---------- K: Covariance matrix y: A vector representing the expression value of a gene Returns ---------- :class:`~float` P value of a gene """ n = self.adata.n_vars # Perform the eigendecomposition eigenvalues = np.linalg.eigvalsh(K) # Round to zero eigenvalues[eigenvalues < 10e-5] = 0 eigenvalues = np.sort(eigenvalues)[::-1] # Calculate k = rank(K) k = int(sum(eigenvalues > 0)) # Calculate q = dim(ker(K) & col(I)) q = int(n - sum(eigenvalues > 10e-5)) r = n * np.dot(np.dot(y.T, K), y) / np.square(y).sum() alphars = np.concatenate((eigenvalues[:k] - float(r) / n, np.ones(q) * -float(r) / n), axis=0) return qf.qf(0, alphars, acc=self.acc)[0]
[docs] def qvalue(self, pv): """ Calculate Q values using BH adjustment. Parameters ---------- pv: P values of all genes Returns ---------- :class:`~Numpy array(s)` Q values of all genes """ original_shape = pv.shape pv = pv.ravel() m = float(len(pv)) p_ordered = np.argsort(pv) pv = pv[p_ordered] qv = np.zeros_like(pv) qv[-1] = pv[-1] for i in range(len(pv) - 2, -1, -1): qv[i] = min(m * pv[i] / (i + 1), pv[i + 1]) qv_temp = qv.copy() qv = np.zeros_like(qv) qv[p_ordered] = qv_temp qv = qv.reshape(original_shape) return qv
[docs] def build(self, k, y): """ Build and training model. Parameters ---------- k: int Used to mark a gene y: A vector representing the expression value of a gene Returns ---------- :class:`~int` and `~float` k and p-value """ # Create kernel parameters, and observation noise variance variable amplitude = tfp.util.TransformedVariable(1., tfb.Softplus(), dtype=dtype, name='amplitude') length_scale = tfp.util.TransformedVariable(1., tfb.Softplus(), dtype=dtype, name='length_scale') # k(x, y) = amplitude**2 * exp(-||x - y||**2 / (2 * length_scale**2)) kernel = tfk.ExponentiatedQuadratic(amplitude=amplitude, length_scale=length_scale) observation_noise_variance = tfp.util.TransformedVariable( 1., tfb.Softplus(), dtype=dtype, name='observation_noise_variance') # Create trainable inducing point locations and variational parameters. num_inducing_points_ = self.inducing_points inducing_mean = np.array([0, 0]) inducing_conv = np.array([[1, 0.0], [0.0, 1]]) inducing_index_points = tf.Variable( np.random.multivariate_normal(mean=inducing_mean, cov=inducing_conv, size=num_inducing_points_), dtype=dtype, name='inducing_index_points') variational_inducing_observations_loc = tf.Variable( np.zeros([num_inducing_points_], dtype=dtype), name='variational_inducing_observations_loc') variational_inducing_observations_scale = tf.Variable( np.eye(num_inducing_points_, dtype=dtype), name='variational_inducing_observations_scale') index_points_ = self.adata.var.values num_points_ = index_points_.shape[0] # Construct our variational GP Distribution instance. vgp = tfd.VariationalGaussianProcess( kernel, index_points=index_points_, inducing_index_points=inducing_index_points, variational_inducing_observations_loc=variational_inducing_observations_loc, variational_inducing_observations_scale=variational_inducing_observations_scale, observation_noise_variance=observation_noise_variance) optimizer = tf.keras.optimizers.Adam(learning_rate=.1) @tf.function def optimize(x_train_batch, y_train_batch): with tf.GradientTape() as tape: # Create the loss function we want to optimize. loss = vgp.variational_loss( observations=y_train_batch, observation_index_points=x_train_batch, kl_weight=float(y.shape[0]) / float(num_points_)) grads = tape.gradient(loss, vgp.trainable_variables) optimizer.apply_gradients(zip(grads, vgp.trainable_variables)) return loss for i in range(self.iters): optimize(index_points_, y) K = self.covariance_matrix(length_scale.numpy()) p_value = self.score_test(K, y) return k, p_value
[docs] def run(self): """ Run VISGP. Returns ---------- :class:`~pandas.core.frame.DataFrame` results, including ['gene', 'p_value', 'q_value'] """ names = self.adata.obs y_all_genes = self.adata.X # genes*spots(ndarray) y_all_genes = y_all_genes.T y_all_genes = preprocessing.scale(y_all_genes) y_all_genes = y_all_genes.T num_genes = self.adata.n_obs results = pd.DataFrame(columns=['gene', 'p_value', 'q_value']) if self.processes == 1: for k in tqdm(range(num_genes)): k, p_value = self.build(k, y_all_genes[k]) results.loc[k, 'gene'] = names.iloc[k, 0] results.loc[k, 'p_value'] = p_value else: args = [] pool = multiprocessing.Pool(self.processes) for k in range(num_genes): args.append((k, y_all_genes[k])) args = tqdm(args) result = pool.starmap(self.build, args) pool.close() pool.join() print("Results....................") for k in tqdm(range(num_genes)): results.loc[k, 'gene'] = names.iloc[result[k][0], 0] results.loc[k, 'p_value'] = result[k][1] q_value = self.qvalue(results['p_value'].values) for j in range(num_genes): results.loc[j, 'q_value'] = q_value[j] return results