Source code for local_fit_numba

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May  5 22:17:11 2022

@author: hill103

this script implement the optimization of theta and sigma^2 using Poisson log-normal distribution + heavy-tail

use Numba parallel for best performance

we calculate the likelihood for each gene sequentially, utilizing Numba's parallel capabilities to accelerate the computation within each gene's calculation.

NOTE: simplifying the loss-computation code and removing unneeded nested wrappers yielded a speedup

and print not supported in Numba function

functions for likelihood calculation:
    
    calc_hv_numba: Numba function so parallel computing by Numba; calculate likelihoods given an array of y and mu then return; NOT related with cache dict
    
    hv_comb: return sum of negative log-likelihoods + gradient of w; used for theta optimization; directly call hv_numba
    
    hv_wrapper: return sum of negative log-likelihoods; used for sigma^2 optimization; directly call hv_numba
    
    hv_numba: call calc_hv_numba and return array of likelihoods; support caching (DELETED)
    
    
sequence of function calls for parameter optimization:
    
    update_theta -> objective_loss_theta -> hv_comb -> calc_hv_numba
    
    update_sigma2 -> objective_loss_sigma2 -> hv_wrapper -> calc_hv_numba
    
    calcHVBaseModelLoss -> hv_wrapper in hyper parameter tunning
"""



import numpy as np
from scipy.optimize import minimize, basinhopping
import scipy.stats
import numba as na
import math
from config import min_val, min_theta, min_sigma2, N_z, gamma, print, theta_eps, sigma2_eps



################################# code related to global optimization #################################
[docs] class RandomDisplacementBounds(object): """A class to perform random displacement with bounds on parameters during optimization. This step function is designed to modify the basinhopping algorithm's step-taking behavior by ensuring that new positions remain within specified bounds. This implementation uses a direct calculation to determine the minimum and maximum allowable steps rather than using acceptance-rejection sampling, which can be found in the more general approach discussed on StackOverflow (https://stackoverflow.com/a/21967888/2320035). The approach has been modified for enhanced performance with specific bounds. ref: https://stackoverflow.com/questions/47055970/how-can-i-write-bounds-of-parameters-by-using-basinhopping Attributes: xmin (np.ndarray or list): The lower bounds of the parameters. None indicates unbounded. xmax (np.ndarray or list): The upper bounds of the parameters. None indicates unbounded. stepsize (float): The maximum step size to take in any parameter direction. """ def __init__(self, xmin, xmax, stepsize=0.5): """ Initializes the RandomDisplacementBounds with specified bounds and step size. Args: xmin (np.ndarray or list): The lower bounds of the parameters. xmax (np.ndarray or list): The upper bounds of the parameters. stepsize (float, optional): The maximum step size to take in any parameter direction. Defaults to 0.5. """ self.xmin = xmin self.xmax = xmax self.stepsize = stepsize def __call__(self, x): """ Calculates a new position for the optimization algorithm by taking a random step within the defined bounds. This method ensures that the new position does not violate the specified bounds. Args: x (np.ndarray): The current position of the parameters in the optimization algorithm. Returns: np.ndarray: The new position after taking a random step within the bounds. """ # define a custom function to consider None in bound def calcMinStep(xmin, x, stepsize): if xmin is None: return -stepsize else: return np.maximum(xmin - x, -stepsize) def calcMaxStep(xmax, x, stepsize): if xmax is None: return stepsize else: return np.minimum(xmax - x, stepsize) min_step = np.array([calcMinStep(this_xmin, this_x, self.stepsize) for (this_xmin, this_x) in zip(self.xmin, x)]) max_step = np.array([calcMaxStep(this_xmax, this_x, self.stepsize) for (this_xmax, this_x) in zip(self.xmax, x)]) random_step = np.random.uniform(low=min_step, high=max_step, size=x.shape) xnew = x + random_step return xnew
################################# code related to Poisson heavy-tail density calculation in Python ################################# # parameters related to heavy tail, just for standard N(0, 1) # a and c are chosen so that density is continuously differentiable at the boundary (x=3) a = 4/9 * np.exp(-3**2/2) / np.sqrt(2*np.pi) c = 7/3 # C is a normalizing constant making the density integrate to 1 C = 1 / ((a/(3-c) - scipy.stats.norm(0, 1).cdf(-3))*2 + 1) # generate data points to calculate integral z_hv = np.array(range(-N_z, N_z+1)) * gamma
[docs] def generate_log_heavytail_array(x, sigma): ''' generate a array of log density values of standard normal distribution + heavy-tail given a specific sigma we assume the normal distribution is N(0, sigma^2), and transform to N(0, 1) by divided by sigma, and calculate the heavy-tail density values, then transform back to N(0, sigma^2) and take log. We can re-use these pre-calculated values for all genes and spots after divided by sigma, the normal + heavy-tail integrate to 1, i.e. sum(p*gamma)=1, gamma is the interval Parameters ---------- x : 1-D numpy array data points served as x for calculation of probability density values sigma : float SD of normal distribution, corresponding to the dispertion in GLRM. All genes and all spots share the same SD Returns ------- 1-D numpy array log density values of normal distribution N(0, sigma^2) + heavy-tail. ''' if not math.isfinite(sigma): raise Exception('Error: sigma2 optimization in local model of all spots returns a non-finite value!') # change to N(0, 1) tmp_x = x / sigma p = np.empty(tmp_x.shape) tmp_idx = abs(tmp_x) < 3 not_idx = np.logical_not(tmp_idx) # normal part p[tmp_idx] = C/np.sqrt(2*np.pi) * np.exp(-0.5*(tmp_x[tmp_idx]**2)) # heavy tail part p[not_idx] = C*a / ((abs(tmp_x[not_idx])-c) ** 2) assert((p>=0).all()) # change back to N(0, sigma^2) and take log return np.log(p / sigma + min_val)
############ code related to Numba functions to calculate negative log-likelihood values ############ # use Numba to speed up Python and NumPy code # check in Reference Manual to make sure all Python and Numpy features in the code are supported # based on test, Numba parallel on one gene DO NOT improve speed much; on ALL genes DOES # set nopython True/False, fastmath True/False DO NOT affect speed # see the ref https://stackoverflow.com/a/64613366/13752320 for advices of using Numba efficiently # UPDATE: we modified it for better Numba handling
[docs] @na.jit(nopython=True, parallel=True, fastmath=False, error_model="numpy", cache=False) def calc_hv_numba(MU, y_vec, hv_x, hv_log_p, output, gamma): ''' calculate the likelihood value given the distribution parameters considering heavy-tail and assign them to the output variable note that add a small value to avoid log(0) Parameters ---------- MU : 1-D numpy array mean value of log-normal distribution for all genes. y_vec : 1-D numpy array observed gene counts of one spot. hv_x : 1-D numpy array, optional data points served as x for calculation of probability density values. Only used for heavy-tail. hv_log_p : 1-D numpy array, optional log density values of normal distribution N(0, sigma^2) + heavy-tail. Only used for heavy-tail. output : 1-D numpy array empty array used to store the calculated values. gamma : float increment to calculate the heavy-tail probabilities. Returns ------- 1-D numpy array likelihood of input genes in one spot. ''' G = MU.shape[0] Z = hv_x.shape[0] for i in na.prange(G): # parallel ONLY here mu_i = MU[i] yi = y_vec[i] lg = math.lgamma(yi + 1.0) # invariant over j acc = 0.0 for j in range(Z): # <- plain range (not prange) tmp = mu_i + hv_x[j] # two exps is unavoidable here; still hoisted invariants help acc += math.exp(yi*tmp + hv_log_p[j] - math.exp(tmp) - lg) output[i] = acc * gamma # single write return output
[docs] def hv_comb(w_vec, y_vec, mu, gamma_g, sigma2, hv_x, hv_log_p, N): ''' a function to calculate the negative likelihood value given the distribution parameters considering heavy-tail plus corresponding gradient vector also return 1st order derivative vector of w this function will be used as target function for theta optimization we assume: y ~ Poisson(N*lambda) ln(lambda) follow epsilon's distribution ~ N(mu, sigma2) then ln(N*lambda) ~ N(mu+ln(N), sigma2) N is sequencing depth for this spot mu is alpha + log(theta*marker gene) + gamma Now we also consider the heavy-tail instead of just normal distribution Parameters ---------- w_vec : 1-D numpy array e_alpha (spot-specific effect) * theta (celltype proportion) of one spot. y_vec : 1-D numpy array observed gene counts of one spot. mu : 2-D numpy matrix celltype specific marker genes (celltypes * genes) gamma_g : 1-D numpy array gene-specific platform effect for all genes. sigma2 : float variance paramter of the lognormal distribution of ln(lambda). All gene share the same variance. hv_x : 1-D numpy array, optional data points served as x for calculation of probability density values. Only used for heavy-tail. hv_log_p : 1-D numpy array, optional log density values of normal distribution N(0, sigma^2) + heavy-tail. Only used for heavy-tail. N : float sequencing depth for this spot. If is None, use sum(y_vec) instead. Returns ------- tuple of (float, 1-D numpy array) sum of negative log-likelihood across all genes in one spot + gradient vector of w. ''' # return both negative log-likelihoods and 1st order derivative if N is None: N = np.sum(y_vec) # The @ operator performs matrix multiplication, w_vec@mu produces a 1-D numpy array with length as number of genes MU = np.log(w_vec@mu+min_val) + gamma_g + np.log(N) this_lambda = (w_vec@mu) * np.exp(gamma_g) * N likelihoods = calc_hv_numba(MU, y_vec, hv_x, hv_log_p, np.zeros(MU.shape), gamma) loss = -np.sum(np.log(likelihoods + min_val)) # for gradient of w likelihoods_y_add_1 = calc_hv_numba(MU, y_vec+1, hv_x, hv_log_p, np.zeros(MU.shape), gamma) # element-wise operation tmp = (y_vec*likelihoods - (y_vec+1)*likelihoods_y_add_1 + min_val) / (this_lambda*likelihoods + min_val) der = [] # sum over genes, get result related to cell-types for i in range(mu.shape[0]): der.append(-np.sum(tmp * np.exp(gamma_g) * N * mu[i, :].flatten())) return loss, np.array(der)
[docs] def hv_wrapper(w_vec, y_vec, mu, gamma_g, sigma2, hv_x, hv_log_p, N): ''' a wrapper to calculate the negative log-likelihood value given the distribution parameters considering heavy-tail this wrapper will be used as target function for sigma^2 optimization we assume: y ~ Poisson(N*lambda) ln(lambda) follow epsilon's distribution ~ N(mu, sigma2) then ln(N*lambda) ~ N(mu+ln(N), sigma2) N is sequencing depth for this spot mu is alpha + log(theta*marker gene) + gamma Now we also consider the heavy-tail instead of just normal distribution Parameters ---------- w_vec : 1-D numpy array e_alpha (spot-specific effect) * theta (celltype proportion) of one spot. y_vec : 1-D numpy array observed gene counts of one spot. mu : 2-D numpy matrix celltype specific marker genes (celltypes * genes) gamma_g : 1-D numpy array gene-specific platform effect for all genes. sigma2 : float variance paramter of the lognormal distribution of ln(lambda). All gene share the same variance. hv_x : 1-D numpy array, optional data points served as x for calculation of probability density values. Only used for heavy-tail. hv_log_p : 1-D numpy array, optional log density values of normal distribution N(0, sigma^2) + heavy-tail. Only used for heavy-tail. N : float sequencing depth for this spot. If is None, use sum(y_vec) instead. Returns ------- float sum of negative log-likelihood across all genes in one spot. ''' if N is None: N = np.sum(y_vec) MU = np.log(w_vec@mu+min_val) + gamma_g + np.log(N) likelihoods = calc_hv_numba(MU, y_vec, hv_x, hv_log_p, np.zeros(MU.shape), gamma) loss = -np.sum(np.log(likelihoods + min_val)) return loss
################################# code related to update theta #################################
[docs] def objective_loss_theta(w_vec, y_vec, mu, gamma_g, sigma2, nu_vec, rho, lambda_r, lasso_weight_vec, lambda_l2, hybrid_version, hv_x, hv_log_p, N): ''' calculate loss function for updating theta (celltype proportion) the loss function contains four parts (defined for each spot separately) also return gradient of w 1. negative log-likelihood of the base model given observed data and initial parameter value. It sums across all genes 2. a loss of ADMM to make theta equals theta_hat (used for regularization/penalty; optional, controlled by nu_vec and rho) 1/(2*rho) (||w/sum(w) - theta_hat + u||_2)^2 u is the scaled dual variables to make theta = theta_hat and nu = theta_hat - u and we did re-parametrization w = e^alpha * theta, so theta = w / sum(w) 3. a loss of Adaptive Lasso to shrink theta (optional, controlled by lambda_r and lasso_weight_vec) lambda_r * (inner product(w/sum(w), lasso_weight)) 4. a loss of L2 penalty to shrink theta (optional, controlled by lambda_l2) lambda_l2 * (sum(squared(w/sum(w)))) Parameters ---------- w_vec : 1-D numpy array e_alpha (spot-specific effect) * theta (celltype proportion) of one spot. y_vec : 1-D numpy array observed gene counts of one spot. mu : 2-D numpy matrix celltype specific marker genes (celltypes * genes) gamma_g : 1-D numpy array gene-specific platform effect for all genes. sigma2 : float variance paramter of the lognormal distribution of ln(lambda). All gene share the same variance. nu_vec : 1-D numpy array variable for ADMM penalty of one spot in 3 part ADMM nu = theta_hat (used for regularization/penalty) - u (scaled dual variables to make theta = theta_hat) rho : float parameter for the strength of ADMM loss to make theta equals theta_hat lambda_r : float parameter for the strength of Adaptive Lasso loss to shrink theta lasso_weight_vec : 1-D numpy array weight of Adaptive Lasso, 1 ./ theta lambda_l2 : float parameter for the strength of L2 panealty to shrink theta hybrid_version : bool, optional if True, use the hybrid_version of GLRM, i.e. in ADMM local model loss function optimization for w but adaptive lasso constrain on theta. If False, local model loss function optimization and adaptive lasso will on the same w. The default is True. hv_x : 1-D numpy array, optional data points served as x for calculation of probability density values. Only used for heavy-tail. hv_log_p : 1-D numpy array, optional log density values of normal distribution N(0, sigma^2) + heavy-tail. Only used for heavy-tail. N : float sequencing depth for this spot. If is None, use sum(y_vec) instead. Returns ------- a tuple (float, 1-D numpy array) the loss function (base model loss + ADMM loss + Adaptive Lasso loss + L2 loss) of one spot to update w (e_alpha*theta) + gradient vector ''' def admm_penalty_loss(): ''' calculate the loss for ADMM of making theta=theta_hat 1/(2*rho) (||w/sum(w) - theta_hat + u||_2)^2 and nu = theta_hat - u Returns ------- a tuple (float, 1-D numpy array) ADMM loss + gradient ''' if rho is None or nu_vec is None: return (0, np.zeros(w_vec.shape)) else: if hybrid_version: # ADMM loss on theta tmp_sum = np.sum(w_vec) return (np.sum(((w_vec/tmp_sum - nu_vec)**2))/(2*rho), 1/rho * ((w_vec*tmp_sum-np.sum(np.square(w_vec)))/(tmp_sum**3) - (nu_vec*tmp_sum-np.inner(nu_vec, w_vec))/(tmp_sum**2))) else: # ADMM loss on w return (np.sum(((w_vec-nu_vec)**2))/(2*rho), (w_vec-nu_vec)/rho) def adaptive_lasso_loss(): ''' calculate the loss for Adaptive Lasso lambda_r * (inner product(w/sum(w), lasso_weight)) Returns ------- a tuple (float, 1-D numpy array) Adaptive Lasso loss + gradient ''' if lambda_r is None or lasso_weight_vec is None: return (0, np.zeros(w_vec.shape)) else: if hybrid_version: # Adaptive Lasso loss on theta tmp_sum = np.sum(w_vec) return (lambda_r * np.inner(w_vec/tmp_sum, lasso_weight_vec), lambda_r * (lasso_weight_vec*tmp_sum-np.inner(w_vec, lasso_weight_vec)) / (tmp_sum**2)) else: # Adaptive Lasso loss on w return (lambda_r * np.inner(w_vec, lasso_weight_vec), lambda_r*lasso_weight_vec) def l2_loss(): ''' calculate the L2 penalty lambda_l2 * (sum(squared(w/sum(w)))) Returns ------- a tuple (float, 1-D numpy array) L2 penalty + gradient ''' if lambda_l2 is None: return (0, np.zeros(w_vec.shape)) else: if hybrid_version: # L2 loss on theta tmp_sum = np.sum(w_vec) return (lambda_l2 * np.sum(np.square(w_vec/tmp_sum)), lambda_l2 * 2 * (w_vec*tmp_sum - np.sum(np.square(w_vec))) / (tmp_sum**3)) else: # L2 loss on w return (lambda_l2 * np.inner(np.square(w_vec)), lambda_l2*2*w_vec) # combine loss and gradient on w separately return tuple([a+b+c+d for a,b,c,d in zip(hv_comb(w_vec, y_vec, mu, gamma_g, sigma2, hv_x, hv_log_p, N), admm_penalty_loss(), adaptive_lasso_loss(), l2_loss())])
[docs] def optimize_one_theta(mu, y_vec, N, this_warm_start_theta, this_warm_start_e_alpha, gamma_g, sigma2, spot_name, nu_vec=None, rho=None, lambda_r=None, lasso_weight_vec=None, lambda_l2=None, global_optimize=False, hybrid_version=True, opt_method='L-BFGS-B', hv_x=None, hv_log_p=None, this_theta_mask=None, skip_opt=True, verbose=False): ''' update theta (celltype proportion) sigma2 (variance paramter of the log-normal distribution) and gamma_g (gene-specific platform effect) by MLE in ONE SPOT Parameters ---------- mu : 2-D numpy matrix matrix of celltype specific marker gene expression (celltypes * genes). y_vec : 1-D numpy array spatial gene expression (length #genes). N : int or None sequencing depth of this spot. If it's None, use sum of observed marker gene expressions as sequencing depth. spot_name : string name of this spot. this_warm_start_theta : 1-D numpy array (length #celltypes) initial guess of theta (celltype proportion). this_warm_start_e_alpha : float initial guess of e_alpha (spot-specific effect). gamma_g : 1-D numpy array gene-specific platform effect for all genes. sigma2 : float variance paramter of the lognormal distribution of ln(lambda). All gene share the same variance. nu_vec : 1-D numpy array (length #celltypes), optional variable for ADMM penalty of all spots in 3 part ADMM nu = theta_hat (used for regularization/penalty) - u (scaled dual variables to make theta = theta_hat) rho : float, optional parameter for the strength of ADMM loss to make theta equals theta_hat lambda_r : float, optional parameter for the strength of Adaptive Lasso loss to shrink theta lasso_weight_vec : 1-D numpy array (length #celltypes), optional weight of Adaptive Lasso, 1 ./ theta lambda_l2 : float parameter for the strength of L2 panealty to shrink theta global_optimize : bool, optional if is True, use basin-hopping algorithm to find the global minimum. The default is False. hybrid_version : bool, optional if True, use the hybrid_version of GLRM, i.e. in ADMM local model loss function optimization for w but adaptive lasso constrain on theta. If False, local model loss function optimization and adaptive lasso will on the same w. The default is True. opt_method : string, optional specify method used in scipy.optimize.minimize for local model fitting. The default is 'L-BFGS-B', a default method in scipy for optimization with bounds. Another choice would be 'SLSQP', a default method in scipy for optimization with constrains and bounds. hv_x : 1-D numpy array, optional data points served as x for calculation of probability density values. Only used for heavy-tail. hv_log_p : 1-D numpy array, optional log density values of normal distribution N(0, sigma^2) + heavy-tail. Only used for heavy-tail. this_theta_mask : 1-D numpy array (length #celltypes), optional mask for cell-type proportions (1: present, 0: not present). skip_opt : bool, optional if True, when only one cell type present, we skip optimization and directly return theta. verbose : bool, optional if True, print more information. Returns ------- w_result : 1-D numpy array (length #celltypes) updated w, i.e. theta (celltype proportion) * e_alpha. ''' n_celltype = len(this_warm_start_theta) if skip_opt: if np.isclose(this_warm_start_theta, 1.0, atol=1e-7).any(): assert np.sum(this_warm_start_theta) == 1 #print(f'skip optimization for spot {spot_name} as only one celltype present') return this_warm_start_theta # Prepare variables based on whether sparsity considered via theta mask if this_theta_mask is None: this_present_celltype_index = None else: this_present_celltype_index = this_theta_mask==1 # only one cell-type present, we can directly determine the proportion as 1 if skip_opt: if np.sum(this_present_celltype_index) == 1: simple_sol = np.zeros((n_celltype,)) simple_sol[this_present_celltype_index] = 1 #print(f'skip optimization for spot {spot_name} as only one celltype present in mask') return simple_sol if this_present_celltype_index is not None: # extract only marker gene expressions for presented cell-types mu = mu[this_present_celltype_index, :] if nu_vec is not None: nu_vec = nu_vec[this_present_celltype_index] this_warm_start_theta = this_warm_start_theta[this_present_celltype_index] if lasso_weight_vec is not None: lasso_weight_vec = lasso_weight_vec[this_present_celltype_index] # re-parametrization warm_start_w = this_warm_start_theta * this_warm_start_e_alpha # start optimization # call minimize function to solve w (e_alpha*theta) # bounds : tuple of tuples # sequence of (min, max) pairs for each element in w_vec # min not set as 0 to avoid divided by 0 or log(0) # if jac is a Boolean and is True, fun is assumed to return a tuple (f, g) containing the objective function and the gradient #from time import time #start_time = time() bounds = (((min_theta, None),) * len(warm_start_w)) if global_optimize: bounded_step = RandomDisplacementBounds(np.array([b[0] for b in bounds]), np.array([b[1] for b in bounds])) # use the basin-hopping algorithm to find the global minimum sol = basinhopping(objective_loss_theta, warm_start_w, niter=10, T=1.0, take_step=bounded_step, minimizer_kwargs={'method': opt_method, 'args': (y_vec, mu, gamma_g, sigma2, nu_vec, rho, lambda_r, lasso_weight_vec, lambda_l2, hybrid_version, hv_x, hv_log_p, N), 'bounds': bounds, 'options': {'maxiter': 250, 'eps': theta_eps}, 'jac': True }, disp=False) else: sol = minimize(objective_loss_theta, warm_start_w, args=(y_vec, mu, gamma_g, sigma2, nu_vec, rho, lambda_r, lasso_weight_vec, lambda_l2, hybrid_version, hv_x, hv_log_p, N), method=opt_method, bounds=bounds, options={'disp': False, 'maxiter': 250, 'eps': theta_eps}, jac=True) #print(f'spot {spot_name} optimization finished in {sol.nit} iterations. Elapsed time: {time()-start_time:.2f} seconds.') if not global_optimize: if not sol.success: if verbose: # Ref: status 2 - Maximum number of iterations has been exceeded print(f'[WARNING] w optimization in local model of spot {spot_name} not successful! Caused by: {sol.message}') solve_fail_flag = False if sum(sol.x) == 0: print(f'###### [Error] w optimization in local model of spot {spot_name} returns all 0s! ######') solve_fail_flag = True if np.any(np.isnan(sol.x)): print(f'###### [Error] w optimization in local model of spot {spot_name} returns NaN value! ######') solve_fail_flag = True if np.any(np.isinf(sol.x)): print(f'###### [Error] w optimization in local model of spot {spot_name} returns Infinite value! ######') solve_fail_flag = True if solve_fail_flag: # replace NaN or Infinite as 0 this_sol = np.nan_to_num(sol.x, nan=0.0, posinf=0.0, neginf=0.0) # NOTE this is w, no need to sum to 1 print('repalce non-numeric value as 0') if sum(this_sol) == 0: # reset w to all elements equal tmp_len = this_sol.shape[0] this_sol = np.full((tmp_len,), 1.0/tmp_len) * this_warm_start_e_alpha print('[WARNING] reset w to all elements identical!') else: this_sol = sol.x # set w values at boundary to 0; note to allow a tolerance this_sol[this_sol<=(min_theta*2)] = 0 # transform back w to original dimension, adding non-present values if this_theta_mask is None: w_result = this_sol else: w_result = np.zeros((n_celltype,)) w_result[this_present_celltype_index] = this_sol return w_result
[docs] def update_theta(data, warm_start_theta, warm_start_e_alpha, gamma_g, sigma2, nu=None, rho=None, lambda_r=None, lasso_weight=None, lambda_l2=None, global_optimize=False, hybrid_version=True, opt_method='L-BFGS-B', hv_x=None, hv_log_p=None, theta_mask=None, verbose=False): ''' update theta (celltype proportion) and e_alpha (spot-specific effect) given sigma2 (variance paramter of the log-normal distribution) and gamma_g (gene-specific platform effect) by MLE we assume ln(lambda) = alpha + gamma_g + ln(sum(theta*mu_X)) + epsilon subject to sum(theta)=1, theta>=0 mu_X is marker genes from data['X'] then the mean parameter of the lognormal distribution of ln(lambda) is alpha + gamma_g + ln(sum(theta*mu_X)) we did re-parametrization w = e^alpha * theta, then ln(lambda) = gamma_g + ln(sum([e^alpha*theta]*mu_X)) + epsilon subject to w>=0, it will imply sum(theta)=1 and theta>=0 the steps to update theta and e_alpha: 1. dimension change of theta, theta_hat, u from 3-D (spots * celltypes * 1) to 1-D (celltypes), and do re-parametrization to get w 2. solve w for each spot in parallel 3. extract updated theta and e_alpha from w, and change the dimension of updated theta, theta_hat, u back Parameters ---------- data : Dict a Dict contains all info need for modeling: X: a 2-D numpy matrix of celltype specific marker gene expression (celltypes * genes).\n Y: a 2-D numpy matrix of spatial gene expression (spots * genes).\n A: a 2-D numpy matrix of Adjacency matrix (spots * spots), or is None. Adjacency matrix of spatial sptots (1: connected / 0: disconnected). All 0 in diagonal.\n N: a 1-D numpy array of sequencing depth of all spots (length #spots). If it's None, use sum of observed marker gene expressions as sequencing depth.\n non_zero_mtx: If it's None, then do not filter zeros during regression. If it's a bool 2-D numpy matrix (spots * genes) as False means genes whose nUMI=0 while True means genes whose nUMI>0 in corresponding spots. The bool indicators can be calculated based on either observerd raw nUMI counts in spatial data, or CVAE transformed nUMI counts.\n spot_names: a list of string of spot barcodes. Only keep spots passed filtering.\n gene_names: a list of string of gene symbols. Only keep actually used marker genes.\n celltype_names: a list of string of celltype names. warm_start_theta : 3-D numpy array (spots * celltypes * 1) initial guess of theta (celltype proportion). warm_start_e_alpha : 1-D numpy array initial guess of e_alpha (spot-specific effect). gamma_g : 1-D numpy array gene-specific platform effect for all genes. sigma2 : float variance paramter of the lognormal distribution of ln(lambda). All gene share the same variance. nu : 3-D numpy array (spots * celltypes * 1), optional variable for ADMM penalty of all spots in 3 part ADMM nu = theta_hat (used for regularization/penalty) - u (scaled dual variables to make theta = theta_hat) rho : float, optional parameter for the strength of ADMM loss to make theta equals theta_hat lambda_r : float, optional parameter for the strength of Adaptive Lasso loss to shrink theta lasso_weight : 3-D numpy array (spots * celltypes * 1), optional weight of Adaptive Lasso, 1 ./ theta lambda_l2 : float parameter for the strength of L2 panealty to shrink theta global_optimize : bool, optional if is True, use basin-hopping algorithm to find the global minimum. The default is False. hybrid_version : bool, optional if True, use the hybrid_version of GLRM, i.e. in ADMM local model loss function optimization for w but adaptive lasso constrain on theta. If False, local model loss function optimization and adaptive lasso will on the same w. The default is True. opt_method : string, optional specify method used in scipy.optimize.minimize for local model fitting. The default is 'L-BFGS-B', a default method in scipy for optimization with bounds. Another choice would be 'SLSQP', a default method in scipy for optimization with constrains and bounds. hv_x : 1-D numpy array, optional data points served as x for calculation of probability density values. Only used for heavy-tail. hv_log_p : 1-D numpy array, optional log density values of normal distribution N(0, sigma^2) + heavy-tail. Only used for heavy-tail. theta_mask : 3-D numpy array (spots * celltypes * 1), optional mask for cell-type proportions (1: present, 0: not present). Only used for stage 2 theta optmization. verbose : bool, optional if True, print more information. Returns ------- theta_results : 3-D numpy array (spots * celltypes * 1) updated theta (celltype proportion). e_alpha_results : 1-D numpy array updated e_alpha (spot-specific effect). ''' n_celltype = data["X"].shape[0] n_spot = data["Y"].shape[0] # NOTE input warm_start_theta, warm_start_e_alpha will NOT be changed inside the function # skip optimization if the initial theta corresponding to only one cell-type skip_opt = True # prepare parameter tuples for parallel computing results = [] for i in range(n_spot): this_spot_name = data["spot_names"][i] this_warm_start_theta = warm_start_theta[i, :, :].copy().flatten() this_warm_start_e_alpha = warm_start_e_alpha[i] if theta_mask is None: this_theta_mask = None else: this_theta_mask = theta_mask[i, :, :].copy().flatten() y_vec = data["Y"][i, :].copy() mu = data["X"].copy() if nu is None: nu_vec = None else: nu_vec = nu[i, :, :].copy().flatten() if lasso_weight is None: lasso_weight_vec = None else: lasso_weight_vec = lasso_weight[i, :, :].copy().flatten() if data["N"] is None: N = None else: N = data["N"][i] # filter zero genes if data['non_zero_mtx'] is None: this_y_vec = y_vec this_gamma_g = gamma_g.copy() this_mu = mu else: non_zero_gene_ind = data['non_zero_mtx'][i, :] #print(f'total {np.sum(non_zero_gene_ind)} non-zero genes ({np.sum(non_zero_gene_ind)/len(non_zero_gene_ind):.2%}) for spot {this_spot_name}') this_y_vec = y_vec[non_zero_gene_ind] this_gamma_g = gamma_g[non_zero_gene_ind].copy() this_mu = mu[:, non_zero_gene_ind] # call optimization function results.append(optimize_one_theta(this_mu, this_y_vec, N, this_warm_start_theta, this_warm_start_e_alpha, this_gamma_g, sigma2, this_spot_name, nu_vec=nu_vec, rho=rho, lambda_r=lambda_r, lasso_weight_vec=lasso_weight_vec, lambda_l2=lambda_l2, global_optimize=global_optimize, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=hv_x, hv_log_p=hv_log_p, this_theta_mask=this_theta_mask, skip_opt=skip_opt, verbose=verbose)) # collect results: theta and e_alpha theta_results = np.zeros((n_spot, n_celltype, 1)) e_alpha_results = [] for i, this_result in enumerate(results): # extract theta and e_alpha tmp_e_alpha = np.sum(this_result) tmp_theta = this_result / tmp_e_alpha # change dimension back theta_results[i, :, :] = np.reshape(tmp_theta, (n_celltype, 1)) e_alpha_results.append(tmp_e_alpha) e_alpha_results = np.array(e_alpha_results) return theta_results, e_alpha_results
################################# code related to update sigma2 #################################
[docs] def update_sigma2(data, theta, e_alpha, gamma_g, sigma2, opt_method='L-BFGS-B', global_optimize=False, hv_x=None, verbose=False): ''' update sigma2 (variance paramter of the lognormal distribution) given theta (celltype proportion), e_alpha (spot-specific effect) and gamma_g (gene-specific platform effect) by MLE we assume ln(lambda) = alpha + gamma_g + ln(sum(theta*mu_X)) + epsilon subject to sum(theta)=1, theta>=0 mu_X is marker genes from data['X'] then the mean parameter of the log-normal distribution of ln(lambda) is alpha + gamma_g + ln(sum(theta*mu_X)) we did re-parametrization w = e^alpha * theta, then ln(lambda) = gamma_g + ln(sum([e^alpha*theta]*mu_X)) + epsilon subject to w>=0, it will imply sum(theta)=1 and theta>=0 currently optimization of sigma2 DO NOT support parallel computing Parameters ---------- data : Dict a Dict contains all info need for modeling: X: a 2-D numpy matrix of celltype specific marker gene expression (celltypes * genes).\n Y: a 2-D numpy matrix of spatial gene expression (spots * genes).\n A: a 2-D numpy matrix of Adjacency matrix (spots * spots), or is None. Adjacency matrix of spatial sptots (1: connected / 0: disconnected). All 0 in diagonal.\n N: a 1-D numpy array of sequencing depth of all spots (length #spots). If it's None, use sum of observed marker gene expressions as sequencing depth.\n non_zero_mtx: If it's None, then do not filter zeros during regression. If it's a bool 2-D numpy matrix (spots * genes) as False means genes whose nUMI=0 while True means genes whose nUMI>0 in corresponding spots. The bool indicators can be calculated based on either observerd raw nUMI counts in spatial data, or CVAE transformed nUMI counts.\n spot_names: a list of string of spot barcodes. Only keep spots passed filtering.\n gene_names: a list of string of gene symbols. Only keep actually used marker genes.\n celltype_names: a list of string of celltype names. theta : 3-D numpy array (spots * celltypes * 1) theta (celltype proportion). e_alpha : 1-D numpy array e_alpha (spot-specific effect). gamma_g : 1-D numpy array gene-specific platform effect for all genes. sigma2 : float initial guess of variance paramter of the lognormal distribution of ln(lambda). All genes and spots share the same variance. opt_method : string, optional specify method used in scipy.optimize.minimize for local model fitting. The default is 'L-BFGS-B', a default method in scipy for optimization with bounds. Another choice would be 'SLSQP', a default method in scipy for optimization with constrains and bounds. global_optimize : bool, optional if is True, use basin-hopping algorithm to find the global minimum. The default is False. hv_x : 1-D numpy array, optional data points served as x for calculation of probability density values. Only used for heavy-tail. verbose : bool, optional if True, print more information. Returns ------- float updated sigma2 ''' def objective_loss_sigma2(sigma2, data, theta, e_alpha, gamma_g, hv_x): ''' calculate loss function for updating sigma2 (variance paramter of the log-normal distribution of ln(lambda)). All spots and genes share the same variance. the loss function is a sum of negative log-likelihood of the base model of each spot, and the so-called basemodel loss is the same as that for updating theta (celltype proportion) and we did re-parametrization w = e^alpha * theta, so theta = w / sum(w) we calculate the basemodel loss of each spot in a parallel way by Numba WARNING: this input sigma2 inside the loss function is actually a numpy array Returns ------- float the loss function (sum of base model loss across all spots) to update sigma2 ''' n_spot = data["Y"].shape[0] this_sigma2 = sigma2[0] # update density values of heavy-tail with current sigma^2 hv_log_p = generate_log_heavytail_array(hv_x, np.sqrt(this_sigma2)) #from time import time #start_time = time() results = 0.0 for i in range(n_spot): y_vec = data["Y"][i, :] this_theta = theta[i, :, :].flatten() this_e_alpha = e_alpha[i] # re-parametrization this_w = this_theta * this_e_alpha # sequencing depth if data["N"] is None: N = None else: N = data["N"][i] # filter zero genes if data['non_zero_mtx'] is None: this_y_vec = y_vec this_gamma_g = gamma_g this_mu = data["X"] else: non_zero_gene_ind = data['non_zero_mtx'][i, :] #print(f'total {np.sum(non_zero_gene_ind)} non-zero genes ({np.sum(non_zero_gene_ind)/len(non_zero_gene_ind):.2%}) for spot {i:d}') this_y_vec = y_vec[non_zero_gene_ind] this_gamma_g = gamma_g[non_zero_gene_ind] this_mu = data["X"][:, non_zero_gene_ind] results += hv_wrapper(this_w, this_y_vec, this_mu, this_gamma_g, this_sigma2, hv_x, hv_log_p, N) #print(f'after summing up spot {i:d}, current loss is {results:.6f}') #print(f'One round of summing up loss across all {n_spot:d} spots for sigma^2 optimization. Elapsed time: {time()-start_time:.2f} seconds.') return results # the min value for clip sigma^2 should be larger bounds = ((min_sigma2, None),) if global_optimize: bounded_step = RandomDisplacementBounds(np.array([b[0] for b in bounds]), np.array([b[1] for b in bounds])) # use the basin-hopping algorithm to find the global minimum sol = basinhopping(objective_loss_sigma2, sigma2, niter=10, T=1.0, take_step=bounded_step, minimizer_kwargs={'method': opt_method, 'args': (data, theta, e_alpha, gamma_g, hv_x), 'bounds': bounds, 'options': {'maxiter': 250, 'eps': sigma2_eps} }, disp=False) else: sol = minimize(objective_loss_sigma2, sigma2, args=(data, theta, e_alpha, gamma_g, hv_x), method=opt_method, bounds=bounds, options={'disp': False, 'maxiter': 250, 'eps': sigma2_eps}) if not global_optimize: if not sol.success: if verbose: print(f'[WARNING] sigma2 optimization in local model of all spots not successful! Caused by: {sol.message}') if not math.isfinite(sol.x[0]): raise Exception('[Error] sigma2 optimization in local model of all spots returns a non-finite value!') # the solution in x is a numpy array return sol.x[0]
################################# code related to update theta_tilde by Adaptive Lasso in ADMM #################################
[docs] def adaptive_lasso(nu, rho, lambda_r=1.0, lasso_weight=None): ''' update theta_tilde by Adaptive Lasso and ADMM loss theta_tilde = argmin lambda_r*lasso_weight*theta_tilde + 1/(2*rho) * (||theta_tilde-theta_hat+u_tilde||_2)^2 that is - if theta_tilde>=0, theta_tilde = theta_hat - u_tilde - lambda_r*rho*lasso_weight - if theta_tilde<0, theta_tilde = theta_hat - u_tilde + lambda_r*rho*lasso_weight change it to - if theta_hat-u_tilde-lambda_r*rho*lasso_weight>=0, theta_tilde = theta_hat - u_tilde - lambda_r*rho*lasso_weight - if theta_hat-u_tilde+lambda_r*rho*lasso_weight<=0, theta_tilde = theta_hat - u_tilde + lambda_r*rho*lasso_weight - if theta_hat-u_tilde-lambda_r*rho*lasso_weight<0 or theta_hat-u_tilde+lambda_r*rho*lasso_weight>0, not defined, let theta_tilde = 0 and nu = theta_hat - u_tilde WANRING: negative theta tilde value observed, so also clip it to >= 0 Parameters ---------- nu : 3-D numpy array (spots * celltypes * 1) variable for ADMM penalty of all spots in 3 part ADMM nu = theta_hat (used for regularization/penalty) - u_tilde (scaled dual variables to make theta_tilde = theta_hat) rho : float parameter for the strength of ADMM loss to make theta_tilde equals theta_hat lambda_r : float, optional strength for Adaptive Lasso penalty. The default is 1.0. lasso_weight : 3-D numpy array (spots * celltypes * 1), optional calculated weight for adaptive lasso. The default is None. Returns ------- 3-D numpy array (spots * celltypes * 1) updated theta_tilde. ''' if lasso_weight is None: lasso_weight = np.ones(nu.shape) result = np.maximum(nu - rho*lambda_r*lasso_weight, 0) - np.maximum(-nu - rho*lambda_r*lasso_weight, 0) # avoid negative values result[result<min_theta] = min_theta return result
################################# code related to update theta by in Stage Two without ADMM #################################
[docs] def fit_base_model_plus_laplacian(data, L, theta, e_alpha, gamma_g, sigma2, lambda_r=None, lasso_weight=None, hv_x=None, hv_log_p=None, theta_mask=None, opt_method='L-BFGS-B', global_optimize=False, hybrid_version=True, verbose=False): """ fit base model plus Laplacian penalty, this function is to replace the whole ADMM framework in GLRM stage two Note: To keep consistent, the output of the fit result is still 3-Dimensional. The dimension transform will be performed outside this function if needed Parameters ---------- data : Dict a Dict contains all info need for modeling: X: a 2-D numpy matrix of celltype specific marker gene expression (celltypes * genes).\n Y: a 2-D numpy matrix of spatial gene expression (spots * genes).\n A: a 2-D numpy matrix of Adjacency matrix (spots * spots), or is None. Adjacency matrix of spatial sptots (1: connected / 0: disconnected). All 0 in diagonal.\n N: a 1-D numpy array of sequencing depth of all spots (length #spots). If it's None, use sum of observed marker gene expressions as sequencing depth.\n non_zero_mtx: If it's None, then do not filter zeros during regression. If it's a bool 2-D numpy matrix (spots * genes) as False means genes whose nUMI=0 while True means genes whose nUMI>0 in corresponding spots. The bool indicators can be calculated based on either observerd raw nUMI counts in spatial data, or CVAE transformed nUMI counts.\n spot_names: a list of string of spot barcodes. Only keep spots passed filtering.\n gene_names: a list of string of gene symbols. Only keep actually used marker genes.\n celltype_names: a list of string of celltype names.\n initial_guess: initial guess of cell-type proportions of spatial spots. L : scipy sparse matrix (spots * spots) Laplacian matrix. Note the strenth lambda_g is already absorbed in the L theta : 3-D numpy array (spots * celltypes * 1) initial guess of theta (celltype proportion). e_alpha : 1-D numpy array initial guess of e_alpha (spot-specific effect). gamma_g : 1-D numpy array gene-specific platform effect for all genes. sigma2 : float initial guess of variance paramter of the lognormal distribution of ln(lambda). All genes and spots share the same variance. it may not be updated during this optimization, i.e. sigma2 is treated as an already optimized value. lambda_r : float, optional strength for Adaptive Lasso penalty. The default is None here. lasso_weight : 3-D numpy array (spots * celltypes * 1), optional calculated weight for adaptive lasso. The default is None. hv_x : 1-D numpy array, optional data points served as x for calculation of probability density values. Only used for heavy-tail. hv_log_p : 1-D numpy array, optional log density values of normal distribution N(0, sigma^2) + heavy-tail. Only used for heavy-tail. theta_mask : 3-D numpy array (spots * celltypes * 1), optional mask for cell-type proportions (1: present, 0: not present). Only used for stage 2 theta optmization. opt_method : string, optional specify method used in scipy.optimize.minimize for local model fitting. The default is 'L-BFGS-B', a default method in scipy for optimization with bounds. Another choice would be 'SLSQP', a default method in scipy for optimization with constrains and bounds. global_optimize : bool, optional if is True, use basin-hopping algorithm to find the global minimum. The default is False. hybrid_version : bool, optional if True, use the hybrid_version of GLRM, i.e. in ADMM local model loss function optimization for w but adaptive lasso constrain on theta. If False, local model loss function optimization and adaptive lasso will on the same w. The default is True. verbose : bool, optional if True, print more information. Returns ------- Dict estimated model coefficients, including: theta : celltype proportions (#spots * #celltypes * 1)\n e_alpha : spot-specific effect (1-D array with length #spot)\n sigma2 : variance paramter of the lognormal distribution (float)\n gamma_g : gene-specific platform effect for all genes (1-D array with length #gene) """ n_celltype = data['X'].shape[0] n_spot = data['Y'].shape[0] from time import time start_time = time() # NOTE input theta, e_alpha will NOT be changed inside the function # initialize x array for calculation of heavy-tail if hv_x is None: from local_fit_numba import z_hv hv_x = z_hv.copy() if hv_log_p is None: # initialize density values of heavy-tail with initial sigma^2 hv_log_p = generate_log_heavytail_array(hv_x, np.sqrt(sigma2)) assert theta_mask is not None if verbose: print('[HIGHLIGHT] set tighter bounds for proportion w values of NOT present cell types determined in Stage 1') assert lambda_r is None assert lasso_weight is None assert hybrid_version is True # the optimization is on w, but the constrain is on theta # prepare w, and 3D -> 1D w0_list = [] bounds = [] for i in range(n_spot): # take the i-th spot this_warm_start_theta = theta[i, :, 0] # shape (n_celltype,) this_theta_mask = theta_mask[i, :, 0] assert np.isclose(np.sum(this_warm_start_theta), 1.0, atol=1e-7), f"Spot {i}: sum={np.sum(this_warm_start_theta):.4g}" assert len(this_warm_start_theta) == n_celltype assert len(this_theta_mask) == n_celltype # per-cell-type-proportion bounds # avoid [[min_theta, None]] * n_celltype, which will creates shared inner lists, when change one, all will changed simultaneously # None means “no upper bound” this_bounds = [[min_theta, None] for _ in range(n_celltype)] # for cell type not presented, we set a very tight bounds for j in range(n_celltype): if this_theta_mask[j] == 0: # this cell type not present this_bounds[j][1] = min_theta * 2 bounds.extend(this_bounds) # re-parametrization: multiply θ_i by e_alpha[i] to get w_i w0_list.append(this_warm_start_theta * e_alpha[i]) w0 = np.concatenate(w0_list, axis=0) # shape (n_spot * n_celltype,) bounds = tuple([tuple(x) for x in bounds]) # Ensure symmetric Laplacian without densifying if (L - L.T).nnz != 0: raise ValueError("Laplacian L must be symmetric!") # loss function def loss_theta_L(w_vec, Y, mu, gamma_g, sigma2, hybrid_version, hv_x, hv_log_p, Ns, L): ''' calculate loss function for updating theta (celltype proportion) the loss function contains two parts, sum of base model, defined for each spot separately; and the graph Laplacian penalty also return gradient respect to w 1. for each spot, negative log-likelihood of the base model given observed data and initial parameter value. It sums across all genes. Then we sum it over all spots 2. graph Laplacian penalty Parameters ---------- w_vec : 1-D numpy array e_alpha (spot-specific effect) * theta (celltype proportion) of all spots, flattened to 1D array. Y : 2-D numpy array observed gene counts of all spots (spots * genes). mu : 2-D numpy matrix celltype specific marker genes (celltypes * genes) gamma_g : 1-D numpy array gene-specific platform effect for all genes. sigma2 : float variance paramter of the lognormal distribution of ln(lambda). All gene share the same variance. hybrid_version : bool, optional if True, use the hybrid_version of GLRM, i.e. in ADMM local model loss function optimization for w but adaptive lasso constrain on theta. If False, local model loss function optimization and adaptive lasso will on the same w. The default is True. hv_x : 1-D numpy array, optional data points served as x for calculation of probability density values. Only used for heavy-tail. hv_log_p : 1-D numpy array, optional log density values of normal distribution N(0, sigma^2) + heavy-tail. Only used for heavy-tail. Ns : 1-D numpy array sequencing depth of all spots (length #spots). If is None, use sum(y_vec) instead. Returns ------- a tuple (float, 1-D numpy array) the loss function (base model loss + graph Laplacian loss) over all spots to update w (e_alpha*theta) + gradient vector ''' # first revert the w to spot * cell type n_spot = Y.shape[0] n_celltype = mu.shape[0] assert len(w_vec) == n_spot * n_celltype W = w_vec.reshape((n_spot, n_celltype)) # row-wise reshape base_model_loss = 0 grad_W = np.zeros((n_spot, n_celltype)) # preallocate gradient matrix # re-construct theta from w, theta = w / sum(w) Theta = np.zeros((n_spot, n_celltype)) for i in range(n_spot): this_w = W[i, :] this_y = Y[i, :] # sequencing depth if Ns is None: N = None else: N = Ns[i] # return the loss function value and gradient for w in this spot i cur_loss, cur_gradient = objective_loss_theta(this_w, this_y, mu, gamma_g, sigma2, nu_vec=None, rho=None, lambda_r=None, lasso_weight_vec=None, lambda_l2=None, hybrid_version=hybrid_version, hv_x=hv_x, hv_log_p=hv_log_p, N=N) # total loss is sum over all spots base_model_loss += cur_loss # since each spot is independent in base model, the the gradient for this spot is only related to this spot, we can directly use it grad_W[i, :] = cur_gradient # re-construct theta, we are sure sum will >0, so no need to add a smalle value to avoid divided by zero; rather add a small value will break the invariance in gradient calculation, leading to incorrect results tmp_sum = np.sum(this_w) Theta[i, :] = this_w / tmp_sum assert hybrid_version is True # constrain on theta # Note L is a SciPy sparse matrix, and lambda_g is already absorbed in the L # @ is matrix multiply, * is Element-wise Multiplication # Theta: n_spot * n_celltype # loss is tr(Θ^⊤*L*Θ) graph_loss = np.sum(Theta * (L @ Theta)) # == tr(Theta.T @ L @ Theta) # we need to return gradient respect to w, so use chain rule df/dw = df/dθ * dθ/dw # df/dθ = 2LΘ, dθ/dw = 1/sum(w) - w/(sum(w)^2) # for each row i: # ∇_{w_i} f = g_i / s_i - ( (w_i · g_i) / s_i^2 ) * 1 g = 2 * L @ Theta # (n,k) s = np.sum(W, axis=1, keepdims=True) # (n,1) # rowwise corrections wg = np.sum(W * g, axis=1, keepdims=True) # (n,1), row-wise inner product w·g graph_grad_W = (g / s) - (wg / (s ** 2)) # broadcast-safe, (n,k) loss = base_model_loss + graph_loss grad = grad_W + graph_grad_W # reshape grad to vector grad_vec = grad.flatten(order='C') # row-wise return loss, grad_vec # start optimization # call minimize function to solve w (e_alpha*theta) # bounds : tuple of tuples # sequence of (min, max) pairs for each element in w_vec # min not set as 0 to avoid divided by 0 or log(0) # if jac is a Boolean and is True, fun is assumed to return a tuple (f, g) containing the objective function and the gradient sol = minimize(loss_theta_L, w0, args=(data["Y"], data["X"], gamma_g, sigma2, hybrid_version, hv_x, hv_log_p, data["N"], L), method=opt_method, bounds=bounds, options={'disp': False, 'maxiter': 250, 'eps': theta_eps}, jac=True) # sol would be (nk,) array, change to n*k if sol.success: if verbose: print(f'w optimization in Stage 2 successful in {sol.nit} iterations') # Ref: status 2 - Maximum number of iterations has been exceeded else: if verbose: print(f'[WARNING] w optimization in Stage 2 not successful! Caused by: {sol.message}') sol_W = np.reshape(sol.x, (n_spot, n_celltype)) # row-wise reshape sol_W_cor = np.zeros((n_spot, n_celltype)) for i in range(n_spot): solve_fail_flag = False this_sol = sol_W[i, ].copy() if sum(this_sol) == 0: if verbose: print(f'###### [Error] w optimization in STAGE TWO of spot {data["spot_names"][i]} returns all 0s! ######') solve_fail_flag = True if np.any(np.isnan(this_sol)): if verbose: print(f'###### [Error] w optimization in STAGE TWO of spot {data["spot_names"][i]} returns NaN value! ######') solve_fail_flag = True if np.any(np.isinf(this_sol)): if verbose: print(f'###### [Error] w optimization in STAGE TWO of spot {data["spot_names"][i]} returns Infinite value! ######') solve_fail_flag = True if solve_fail_flag: # replace NaN or Infinite as 0 this_sol = np.nan_to_num(this_sol, nan=0.0, posinf=0.0, neginf=0.0) if verbose: print('repalce non-numeric value as 0') # NOTE this is w, no need to sum to 1 if sum(this_sol) == 0: # reset w to all elements equal tmp_len = this_sol.shape[0] this_sol = np.full((tmp_len,), 1.0/tmp_len) * e_alpha[i] if verbose: print('[WARNING] reset w to all elements identical!') # set w values at boundary to 0; note to allow a tolerance tmp_ind = (this_sol > 0) & (this_sol <= min_theta * 5) if tmp_ind.any(): this_sol[tmp_ind] = 0 # NOTE this is w, no need to sum to 1 sol_W_cor[i, ] = this_sol # collect results: theta and e_alpha theta_results = np.zeros((n_spot, n_celltype, 1)) e_alpha_results = [] for i in range(n_spot): # extract theta and e_alpha this_w_result = sol_W_cor[i, ] tmp_e_alpha = np.sum(this_w_result) tmp_theta = this_w_result / tmp_e_alpha # change dimension back theta_results[i, :, :] = np.reshape(tmp_theta, (n_celltype, 1)) e_alpha_results.append(tmp_e_alpha) e_alpha_results = np.array(e_alpha_results) # construct result, DO NOT change theta back to 2-D array # the dimension transforming is performed outside this function is needed result = { 'theta': theta_results, 'e_alpha': e_alpha_results, 'sigma2': sigma2, 'gamma_g': gamma_g } if verbose: print(f'Optimization with graph Laplacian penalty finished. Elapsed time: {(time()-start_time)/60.0:.2f} minutes.\n') return result