Source code for local_fit_numba

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu May  5 22:17:11 2022

@author: hill103

this script implement the optimization of theta and sigma^2 using Poisson log-normal distribution + heavy-tail

use Numba parallel for best performance

we calculate the likelihood for each gene sequentially, utilizing Numba's parallel capabilities to accelerate the computation within each gene's calculation.

NOTE: simplifying the loss-computation code and removing unneeded nested wrappers yielded a speedup

and print not supported in Numba function

functions for likelihood calculation:
    
    calc_hv_numba: Numba function so parallel computing by Numba; calculate likelihoods given an array of y and mu then return; NOT related with cache dict
    
    hv_comb: return sum of negative log-likelihoods + gradient of w; used for theta optimization; directly call hv_numba
    
    hv_wrapper: return sum of negative log-likelihoods; used for sigma^2 optimization; directly call hv_numba
    
    hv_numba: call calc_hv_numba and return array of likelihoods; support caching (DELETED)
    
    
sequence of function calls for parameter optimization:
    
    update_theta -> objective_loss_theta -> hv_comb -> calc_hv_numba
    
    update_sigma2 -> objective_loss_sigma2 -> hv_wrapper -> calc_hv_numba
    
    calcHVBaseModelLoss -> hv_wrapper in hyper parameter tunning
"""



import numpy as np
from scipy.optimize import minimize, basinhopping
import scipy.stats
import numba as na
import math
from config import min_val, min_theta, min_sigma2, N_z, gamma, print, theta_eps, sigma2_eps



################################# code related to global optimization #################################

[docs]
class RandomDisplacementBounds(object):
    """A class to perform random displacement with bounds on parameters during optimization.

    This step function is designed to modify the basinhopping algorithm's step-taking behavior by 
    ensuring that new positions remain within specified bounds. This implementation uses a direct 
    calculation to determine the minimum and maximum allowable steps rather than using
    acceptance-rejection sampling, which can be found in the more general approach discussed on 
    StackOverflow (https://stackoverflow.com/a/21967888/2320035). The approach has been modified 
    for enhanced performance with specific bounds.
    
    ref: https://stackoverflow.com/questions/47055970/how-can-i-write-bounds-of-parameters-by-using-basinhopping

    Attributes:
        xmin (np.ndarray or list): The lower bounds of the parameters. None indicates unbounded.
        xmax (np.ndarray or list): The upper bounds of the parameters. None indicates unbounded.
        stepsize (float): The maximum step size to take in any parameter direction.

    """
    def __init__(self, xmin, xmax, stepsize=0.5):
        """
        Initializes the RandomDisplacementBounds with specified bounds and step size.

        Args:
            xmin (np.ndarray or list): The lower bounds of the parameters.
            xmax (np.ndarray or list): The upper bounds of the parameters.
            stepsize (float, optional): The maximum step size to take in any parameter direction. Defaults to 0.5.
        """
        self.xmin = xmin
        self.xmax = xmax
        self.stepsize = stepsize

    def __call__(self, x):
        """
        Calculates a new position for the optimization algorithm by taking a random step
        within the defined bounds.

        This method ensures that the new position does not violate the specified bounds.

        Args:
            x (np.ndarray): The current position of the parameters in the optimization algorithm.

        Returns:
            np.ndarray: The new position after taking a random step within the bounds.
        """
        # define a custom function to consider None in bound
        def calcMinStep(xmin, x, stepsize):
            if xmin is None:
                return -stepsize
            else:
                return np.maximum(xmin - x, -stepsize)
            
        def calcMaxStep(xmax, x, stepsize):
            if xmax is None:
                return stepsize
            else:
                return np.minimum(xmax - x, stepsize)
        
        min_step = np.array([calcMinStep(this_xmin, this_x, self.stepsize) for (this_xmin, this_x) in zip(self.xmin, x)])
        max_step = np.array([calcMaxStep(this_xmax, this_x, self.stepsize) for (this_xmax, this_x) in zip(self.xmax, x)])
        
        random_step = np.random.uniform(low=min_step, high=max_step, size=x.shape)
        xnew = x + random_step
        
        return xnew




################################# code related to Poisson heavy-tail density calculation in Python #################################

# parameters related to heavy tail, just for standard N(0, 1)
# a and c are chosen so that density is continuously differentiable at the boundary (x=3)
a = 4/9 * np.exp(-3**2/2) / np.sqrt(2*np.pi)
c = 7/3
# C is a normalizing constant making the density integrate to 1
C = 1 / ((a/(3-c) - scipy.stats.norm(0, 1).cdf(-3))*2 + 1)

# generate data points to calculate integral
z_hv = np.array(range(-N_z, N_z+1)) * gamma



[docs]
def generate_log_heavytail_array(x, sigma):
    '''
    generate a array of log density values of standard normal distribution + heavy-tail given a specific sigma
    
    we assume the normal distribution is N(0, sigma^2), and transform to N(0, 1) by divided by sigma, and calculate the heavy-tail density values, then transform back to N(0, sigma^2) and take log. We can re-use these pre-calculated values for all genes and spots
    
    after divided by sigma, the normal + heavy-tail integrate to 1, i.e. sum(p*gamma)=1, gamma is the interval

    Parameters
    ----------
    x : 1-D numpy array
        data points served as x for calculation of probability density values
    sigma : float
        SD of normal distribution, corresponding to the dispertion in GLRM. All genes and all spots share the same SD

    Returns
    -------
    1-D numpy array
        log density values of normal distribution N(0, sigma^2) + heavy-tail.

    '''
    
    if not math.isfinite(sigma):
        raise Exception('[Error] sigma2 optimization in local model of all spots returns a non-finite value!')
    
    # change to N(0, 1)
    tmp_x = x / sigma
    p = np.empty(tmp_x.shape)
    tmp_idx = abs(tmp_x) < 3
    not_idx = np.logical_not(tmp_idx)
    # normal part
    p[tmp_idx] = C/np.sqrt(2*np.pi) * np.exp(-0.5*(tmp_x[tmp_idx]**2))
    # heavy tail part
    p[not_idx] = C*a / ((abs(tmp_x[not_idx])-c) ** 2)
    
    assert((p>=0).all())
    
    # change back to N(0, sigma^2) and take log
    return np.log(p / sigma + min_val)




############ code related to Numba functions to calculate negative log-likelihood values  ############

# use Numba to speed up Python and NumPy code
# check in Reference Manual to make sure all Python and Numpy features in the code are supported
# based on test, Numba parallel on one gene DO NOT improve speed much; on ALL genes DOES
# set nopython True/False, fastmath True/False DO NOT affect speed
# see the ref https://stackoverflow.com/a/64613366/13752320 for advices of using Numba efficiently
# UPDATE: we modified it for better Numba handling

[docs]
@na.jit(nopython=True, parallel=True, fastmath=False, error_model="numpy", cache=False)
def calc_hv_numba(MU, y_vec, hv_x, hv_log_p, output, gamma):
    '''
    calculate the likelihood value given the distribution parameters considering heavy-tail and assign them to the output variable
    
    note that add a small value to avoid log(0)
    
    Parameters
    ----------
    MU : 1-D numpy array
        mean value of log-normal distribution for all genes.
    y_vec : 1-D numpy array
        observed gene counts of one spot.
    hv_x : 1-D numpy array, optional
        data points served as x for calculation of probability density values. Only used for heavy-tail.
    hv_log_p : 1-D numpy array, optional
        log density values of normal distribution N(0, sigma^2) + heavy-tail. Only used for heavy-tail.
    output : 1-D numpy array
        empty array used to store the calculated values.
    gamma : float
        increment to calculate the heavy-tail probabilities.
    
    Returns
    -------
    1-D numpy array
        likelihood of input genes in one spot.
    '''
    
    G = MU.shape[0]
    Z = hv_x.shape[0]
    for i in na.prange(G):                # parallel ONLY here
        mu_i = MU[i]
        yi   = y_vec[i]
        lg   = math.lgamma(yi + 1.0)   # invariant over j
        acc  = 0.0
        for j in range(Z):             # <- plain range (not prange)
            tmp = mu_i + hv_x[j]
            # two exps is unavoidable here; still hoisted invariants help
            acc += math.exp(yi*tmp + hv_log_p[j] - math.exp(tmp) - lg)
        output[i] = acc * gamma        # single write
    return output





[docs]
def hv_comb(w_vec, y_vec, mu, gamma_g, sigma2, hv_x, hv_log_p, N):
    '''
    a function to calculate the negative likelihood value given the distribution parameters considering heavy-tail plus corresponding gradient vector
    
    also return 1st order derivative vector of w
    
    this function will be used as target function for theta optimization
    
    we assume:
        
        y ~ Poisson(N*lambda)

        ln(lambda) follow epsilon's distribution ~ N(mu, sigma2)
        
    then ln(N*lambda) ~ N(mu+ln(N), sigma2)
    
    N is sequencing depth for this spot
    mu is alpha + log(theta*marker gene) + gamma
    
    Now we also consider the heavy-tail instead of just normal distribution
    
    Parameters
    ----------
    w_vec : 1-D numpy array
        e_alpha (spot-specific effect) * theta (celltype proportion) of one spot.
    y_vec : 1-D numpy array
        observed gene counts of one spot.
    mu : 2-D numpy matrix
        celltype specific marker genes (celltypes * genes)
    gamma_g : 1-D numpy array
        gene-specific platform effect for all genes.
    sigma2 : float
        variance paramter of the lognormal distribution of ln(lambda). All gene share the same variance.
    hv_x : 1-D numpy array, optional
        data points served as x for calculation of probability density values. Only used for heavy-tail.
    hv_log_p : 1-D numpy array, optional
        log density values of normal distribution N(0, sigma^2) + heavy-tail. Only used for heavy-tail.
    N : float
        sequencing depth for this spot. If is None, use sum(y_vec) instead.

    Returns
    -------
    tuple of (float, 1-D numpy array)
        sum of negative log-likelihood across all genes in one spot + gradient vector of w.
    '''
    # return both negative log-likelihoods and 1st order derivative
    if N is None:
        N = np.sum(y_vec)
    
    # The @ operator performs matrix multiplication, w_vec@mu produces a 1-D numpy array with length as number of genes
    MU = np.log(w_vec@mu+min_val) + gamma_g + np.log(N)
    this_lambda = (w_vec@mu) * np.exp(gamma_g) * N
    
    likelihoods = calc_hv_numba(MU, y_vec, hv_x, hv_log_p, np.zeros(MU.shape), gamma)
    
    loss = -np.sum(np.log(likelihoods + min_val))
    
    # for gradient of w
    likelihoods_y_add_1 = calc_hv_numba(MU, y_vec+1, hv_x, hv_log_p, np.zeros(MU.shape), gamma)
    
    # element-wise operation
    tmp = (y_vec*likelihoods - (y_vec+1)*likelihoods_y_add_1 + min_val) / (this_lambda*likelihoods + min_val)
    der = []
    # sum over genes, get result related to cell-types
    for i in range(mu.shape[0]):
        der.append(-np.sum(tmp * np.exp(gamma_g) * N * mu[i, :].flatten()))
    
    return loss, np.array(der)





[docs]
def hv_wrapper(w_vec, y_vec, mu, gamma_g, sigma2, hv_x, hv_log_p, N):
    '''
    a wrapper to calculate the negative log-likelihood value given the distribution parameters considering heavy-tail
    
    this wrapper will be used as target function for sigma^2 optimization
    
    we assume:
        
        y ~ Poisson(N*lambda)
    
        ln(lambda) follow epsilon's distribution ~ N(mu, sigma2)
    
    then ln(N*lambda) ~ N(mu+ln(N), sigma2)
    
    N is sequencing depth for this spot
    mu is alpha + log(theta*marker gene) + gamma
    
    Now we also consider the heavy-tail instead of just normal distribution
    
    Parameters
    ----------
    w_vec : 1-D numpy array
        e_alpha (spot-specific effect) * theta (celltype proportion) of one spot.
    y_vec : 1-D numpy array
        observed gene counts of one spot.
    mu : 2-D numpy matrix
        celltype specific marker genes (celltypes * genes)
    gamma_g : 1-D numpy array
        gene-specific platform effect for all genes.
    sigma2 : float
        variance paramter of the lognormal distribution of ln(lambda). All gene share the same variance.
    hv_x : 1-D numpy array, optional
        data points served as x for calculation of probability density values. Only used for heavy-tail.
    hv_log_p : 1-D numpy array, optional
        log density values of normal distribution N(0, sigma^2) + heavy-tail. Only used for heavy-tail.
    N : float
        sequencing depth for this spot. If is None, use sum(y_vec) instead.

    Returns
    -------
    float
        sum of negative log-likelihood across all genes in one spot.
    '''
    
    if N is None:
        N = np.sum(y_vec)
    
    MU = np.log(w_vec@mu+min_val) + gamma_g + np.log(N)
    likelihoods = calc_hv_numba(MU, y_vec, hv_x, hv_log_p, np.zeros(MU.shape), gamma)
    loss = -np.sum(np.log(likelihoods + min_val))
    
    return loss




################################# code related to update theta #################################


[docs]
def objective_loss_theta(w_vec, y_vec, mu, gamma_g, sigma2, nu_vec, rho, lambda_r, lasso_weight_vec, lambda_l2, hybrid_version, hv_x, hv_log_p, N):
    '''
    calculate loss function for updating theta (celltype proportion)
    
    the loss function contains four parts (defined for each spot separately)
    
    also return gradient of w
    
    1. negative log-likelihood of the base model given observed data and initial parameter value. It sums across all genes
    
    2. a loss of ADMM to make theta equals theta_hat (used for regularization/penalty; optional, controlled by nu_vec and rho)
                    
        1/(2*rho) (||w/sum(w) - theta_hat + u||_2)^2
    
    u is the scaled dual variables to make theta = theta_hat and nu = theta_hat - u
    
    and we did re-parametrization w = e^alpha * theta, so theta = w / sum(w)
    
    3. a loss of Adaptive Lasso to shrink theta (optional, controlled by lambda_r and lasso_weight_vec)
    
        lambda_r * (inner product(w/sum(w), lasso_weight))
                        
    4. a loss of L2 penalty to shrink theta (optional, controlled by lambda_l2)
    
        lambda_l2 * (sum(squared(w/sum(w))))
    
    Parameters
    ----------
    w_vec : 1-D numpy array
        e_alpha (spot-specific effect) * theta (celltype proportion) of one spot.
    y_vec : 1-D numpy array
        observed gene counts of one spot.
    mu : 2-D numpy matrix
        celltype specific marker genes (celltypes * genes)
    gamma_g : 1-D numpy array
        gene-specific platform effect for all genes.
    sigma2 : float
        variance paramter of the lognormal distribution of ln(lambda). All gene share the same variance.
    nu_vec : 1-D numpy array
        variable for ADMM penalty of one spot
        in 3 part ADMM nu = theta_hat (used for regularization/penalty) - u (scaled dual variables to make theta = theta_hat)
    rho : float
        parameter for the strength of ADMM loss to make theta equals theta_hat
    lambda_r : float
        parameter for the strength of Adaptive Lasso loss to shrink theta
    lasso_weight_vec : 1-D numpy array
        weight of Adaptive Lasso, 1 ./ theta
    lambda_l2 : float
        parameter for the strength of L2 panealty to shrink theta
    hybrid_version : bool, optional
        if True, use the hybrid_version of GLRM, i.e. in ADMM local model loss function optimization for w but adaptive lasso constrain on theta. If False, local model loss function optimization and adaptive lasso will on the same w. The default is True.
    hv_x : 1-D numpy array, optional
        data points served as x for calculation of probability density values. Only used for heavy-tail.
    hv_log_p : 1-D numpy array, optional
        log density values of normal distribution N(0, sigma^2) + heavy-tail. Only used for heavy-tail.
    N : float
        sequencing depth for this spot. If is None, use sum(y_vec) instead.
    
    Returns
    -------
    a tuple (float, 1-D numpy array)
        the loss function (base model loss + ADMM loss + Adaptive Lasso loss + L2 loss) of one spot to update w (e_alpha*theta) + gradient vector
    '''

    
    def admm_penalty_loss():
        '''
        calculate the loss for ADMM of making theta=theta_hat
        
                       1/(2*rho) (||w/sum(w) - theta_hat + u||_2)^2
                       
                       and nu = theta_hat - u

        Returns
        -------
        a tuple (float, 1-D numpy array)
            ADMM loss + gradient
        '''
        
        if rho is None or nu_vec is None:
            return (0, np.zeros(w_vec.shape))
        else:
            if hybrid_version:
                # ADMM loss on theta
                tmp_sum = np.sum(w_vec)
                return (np.sum(((w_vec/tmp_sum - nu_vec)**2))/(2*rho), 1/rho * ((w_vec*tmp_sum-np.sum(np.square(w_vec)))/(tmp_sum**3) - (nu_vec*tmp_sum-np.inner(nu_vec, w_vec))/(tmp_sum**2)))
            else:
                # ADMM loss on w
                return (np.sum(((w_vec-nu_vec)**2))/(2*rho), (w_vec-nu_vec)/rho)
        
    
    def adaptive_lasso_loss():
        '''
        calculate the loss for Adaptive Lasso
        
                       lambda_r * (inner product(w/sum(w), lasso_weight))

        Returns
        -------
        a tuple (float, 1-D numpy array)
            Adaptive Lasso loss + gradient
        '''
        
        if lambda_r is None or lasso_weight_vec is None:
            return (0, np.zeros(w_vec.shape))
        else:
            if hybrid_version:
                # Adaptive Lasso loss on theta
                tmp_sum = np.sum(w_vec)
                return (lambda_r * np.inner(w_vec/tmp_sum, lasso_weight_vec), lambda_r * (lasso_weight_vec*tmp_sum-np.inner(w_vec, lasso_weight_vec)) / (tmp_sum**2))
            else:
                # Adaptive Lasso loss on w
                return (lambda_r * np.inner(w_vec, lasso_weight_vec), lambda_r*lasso_weight_vec)
    
    
    def l2_loss():
        '''
        calculate the L2 penalty
        
                        lambda_l2 * (sum(squared(w/sum(w))))

        Returns
        -------
        a tuple (float, 1-D numpy array)
            L2 penalty + gradient

        '''
        
        if lambda_l2 is None:
            return (0, np.zeros(w_vec.shape))
        else:
            if hybrid_version:
                # L2 loss on theta
                tmp_sum = np.sum(w_vec)
                return (lambda_l2 * np.sum(np.square(w_vec/tmp_sum)), lambda_l2 * 2 * (w_vec*tmp_sum - np.sum(np.square(w_vec))) / (tmp_sum**3))
            else:
                # L2 loss on w
                return (lambda_l2 * np.inner(np.square(w_vec)), lambda_l2*2*w_vec)
    
    # combine loss and gradient on w separately
    return tuple([a+b+c+d for a,b,c,d in zip(hv_comb(w_vec, y_vec, mu, gamma_g, sigma2, hv_x, hv_log_p, N), admm_penalty_loss(), adaptive_lasso_loss(), l2_loss())])





[docs]
def optimize_one_theta(mu, y_vec, N, this_warm_start_theta, this_warm_start_e_alpha, gamma_g, sigma2, spot_name, nu_vec=None, rho=None, lambda_r=None, lasso_weight_vec=None, lambda_l2=None, global_optimize=False, hybrid_version=True, opt_method='L-BFGS-B', hv_x=None, hv_log_p=None, this_theta_mask=None, skip_opt=True, verbose=False):
    '''
    update theta (celltype proportion) sigma2 (variance paramter of the log-normal distribution) and gamma_g (gene-specific platform effect) by MLE in ONE SPOT

    Parameters
    ----------
    mu : 2-D numpy matrix
        matrix of celltype specific marker gene expression (celltypes * genes).
    y_vec : 1-D numpy array
        spatial gene expression (length #genes).
    N : int or None
        sequencing depth of this spot. If it's None, use sum of observed marker gene expressions as sequencing depth.
    spot_name : string
        name of this spot.
    this_warm_start_theta : 1-D numpy array (length #celltypes)
        initial guess of theta (celltype proportion).
    this_warm_start_e_alpha : float
        initial guess of e_alpha (spot-specific effect).
    gamma_g : 1-D numpy array
        gene-specific platform effect for all genes.
    sigma2 : float
        variance paramter of the lognormal distribution of ln(lambda). All gene share the same variance.
    nu_vec : 1-D numpy array (length #celltypes), optional
        variable for ADMM penalty of all spots
        in 3 part ADMM nu = theta_hat (used for regularization/penalty) - u (scaled dual variables to make theta = theta_hat)
    rho : float, optional
        parameter for the strength of ADMM loss to make theta equals theta_hat
    lambda_r : float, optional
        parameter for the strength of Adaptive Lasso loss to shrink theta
    lasso_weight_vec : 1-D numpy array (length #celltypes), optional
        weight of Adaptive Lasso, 1 ./ theta
    lambda_l2 : float
        parameter for the strength of L2 panealty to shrink theta
    global_optimize : bool, optional
        if is True, use basin-hopping algorithm to find the global minimum. The default is False.
    hybrid_version : bool, optional
        if True, use the hybrid_version of GLRM, i.e. in ADMM local model loss function optimization for w but adaptive lasso constrain on theta. If False, local model loss function optimization and adaptive lasso will on the same w. The default is True.
    opt_method : string, optional
        specify method used in scipy.optimize.minimize for local model fitting. The default is 'L-BFGS-B', a default method in scipy for optimization with bounds. Another choice would be 'SLSQP', a default method in scipy for optimization with constrains and bounds.
    hv_x : 1-D numpy array, optional
        data points served as x for calculation of probability density values. Only used for heavy-tail.
    hv_log_p : 1-D numpy array, optional
        log density values of normal distribution N(0, sigma^2) + heavy-tail. Only used for heavy-tail.
    this_theta_mask : 1-D numpy array (length #celltypes), optional
        mask for cell-type proportions (1: present, 0: not present).
    skip_opt : bool, optional
        if True, when only one cell type present, we skip optimization and directly return theta.
    verbose : bool, optional
        if True, print more information.
        
    Returns
    -------
    w_result : 1-D numpy array (length #celltypes)
        updated w, i.e. theta (celltype proportion) * e_alpha.
    '''


    n_celltype = len(this_warm_start_theta)
    
    if skip_opt:
        if np.isclose(this_warm_start_theta, 1.0, atol=1e-7).any():
            assert np.sum(this_warm_start_theta) == 1
            #print(f'skip optimization for spot {spot_name} as only one celltype present')
            return this_warm_start_theta
    
    # Prepare variables based on whether sparsity considered via theta mask
    if this_theta_mask is None:
        this_present_celltype_index = None
    else:
        this_present_celltype_index = this_theta_mask==1
    
        # only one cell-type present, we can directly determine the proportion as 1
        if skip_opt:
            if np.sum(this_present_celltype_index) == 1:
                simple_sol = np.zeros((n_celltype,))
                simple_sol[this_present_celltype_index] = 1
                #print(f'skip optimization for spot {spot_name} as only one celltype present in mask')
                return simple_sol

    if this_present_celltype_index is not None:
        # extract only marker gene expressions for presented cell-types
        mu = mu[this_present_celltype_index, :]
        
        if nu_vec is not None:
            nu_vec = nu_vec[this_present_celltype_index]
    
        this_warm_start_theta = this_warm_start_theta[this_present_celltype_index]
    
        if lasso_weight_vec is not None:
            lasso_weight_vec = lasso_weight_vec[this_present_celltype_index]
            
    # re-parametrization
    warm_start_w = this_warm_start_theta * this_warm_start_e_alpha
    
        
    # start optimization
    # call minimize function to solve w (e_alpha*theta)
    # bounds : tuple of tuples
    #    sequence of (min, max) pairs for each element in w_vec
    # min not set as 0 to avoid divided by 0 or log(0)
    # if jac is a Boolean and is True, fun is assumed to return a tuple (f, g) containing the objective function and the gradient
    
    #from time import time
    #start_time = time()
    
    bounds = (((min_theta, None),) * len(warm_start_w))
    
    if global_optimize:
        bounded_step = RandomDisplacementBounds(np.array([b[0] for b in bounds]), np.array([b[1] for b in bounds]))
        # use the basin-hopping algorithm to find the global minimum
        sol = basinhopping(objective_loss_theta, warm_start_w, niter=10, T=1.0, take_step=bounded_step,
                           minimizer_kwargs={'method': opt_method,
                                             'args': (y_vec, mu, gamma_g, sigma2, nu_vec, rho, lambda_r, lasso_weight_vec, lambda_l2, hybrid_version, hv_x, hv_log_p, N),
                                             'bounds': bounds,
                                             'options': {'maxiter': 250, 'eps': theta_eps},
                                             'jac': True
                                             },
                           disp=False)
    else:
        sol = minimize(objective_loss_theta, warm_start_w, args=(y_vec, mu, gamma_g, sigma2, nu_vec, rho, lambda_r, lasso_weight_vec, lambda_l2, hybrid_version, hv_x, hv_log_p, N),
                   method=opt_method,
                   bounds=bounds, options={'disp': False, 'maxiter': 250, 'eps': theta_eps}, jac=True)
    
    #print(f'spot {spot_name} optimization finished in {sol.nit} iterations. Elapsed time: {time()-start_time:.2f} seconds.')
    
    if not global_optimize:
        if not sol.success:
            if verbose:
                # Ref: status 2 - Maximum number of iterations has been exceeded
                print(f'[WARNING] w optimization in local model of spot {spot_name} not successful! Caused by: {sol.message}')
    
    solve_fail_flag = False
    
    if sum(sol.x) == 0:
        print(f'###### [Error] w optimization in local model of spot {spot_name} returns all 0s! ######')
        solve_fail_flag = True
        
    if np.any(np.isnan(sol.x)):
        print(f'###### [Error] w optimization in local model of spot {spot_name} returns NaN value! ######')
        solve_fail_flag = True
       
    if np.any(np.isinf(sol.x)):
        print(f'###### [Error] w optimization in local model of spot {spot_name} returns Infinite value! ######')
        solve_fail_flag = True
        
    if solve_fail_flag:
        # replace NaN or Infinite as 0
        this_sol = np.nan_to_num(sol.x, nan=0.0, posinf=0.0, neginf=0.0)
        # NOTE this is w, no need to sum to 1
        print('repalce non-numeric value as 0')
        
        if sum(this_sol) == 0:
            # reset w to all elements equal
            tmp_len = this_sol.shape[0]
            this_sol = np.full((tmp_len,), 1.0/tmp_len) * this_warm_start_e_alpha
            print('[WARNING] reset w to all elements identical!')

    else:
        this_sol = sol.x
        
    # set w values at boundary to 0; note to allow a tolerance
    this_sol[this_sol<=(min_theta*2)] = 0
    
    
    # transform back w to original dimension, adding non-present values
    if this_theta_mask is None:
        w_result = this_sol
    else:
        w_result = np.zeros((n_celltype,))
        w_result[this_present_celltype_index] = this_sol
    
    return w_result 




[docs]
def update_theta(data, warm_start_theta, warm_start_e_alpha, gamma_g, sigma2, nu=None, rho=None, lambda_r=None, lasso_weight=None, lambda_l2=None, global_optimize=False, hybrid_version=True, opt_method='L-BFGS-B', hv_x=None, hv_log_p=None, theta_mask=None, verbose=False):
    '''
    update theta (celltype proportion) and e_alpha (spot-specific effect) given sigma2 (variance paramter of the log-normal distribution) and gamma_g (gene-specific platform effect) by MLE
    
    we assume 
    
        ln(lambda) = alpha + gamma_g + ln(sum(theta*mu_X)) + epsilon
    
        subject to sum(theta)=1, theta>=0
    
    mu_X is marker genes from data['X']
    
    then the mean parameter of the lognormal distribution of ln(lambda) is alpha + gamma_g + ln(sum(theta*mu_X))
    
    we did re-parametrization w = e^alpha * theta, then
    
        ln(lambda) = gamma_g + ln(sum([e^alpha*theta]*mu_X)) + epsilon
    
        subject to w>=0, it will imply sum(theta)=1 and theta>=0
    
    the steps to update theta and e_alpha:
        1. dimension change of theta, theta_hat, u from 3-D (spots * celltypes * 1) to 1-D (celltypes), and do re-parametrization to get w
        2. solve w for each spot in parallel
        3. extract updated theta and e_alpha from w, and change the dimension of updated theta, theta_hat, u back

    Parameters
    ----------
    data : Dict
        a Dict contains all info need for modeling:
            X: a 2-D numpy matrix of celltype specific marker gene expression (celltypes * genes).\n
            Y: a 2-D numpy matrix of spatial gene expression (spots * genes).\n
            A: a 2-D numpy matrix of Adjacency matrix (spots * spots), or is None. Adjacency matrix of spatial sptots (1: connected / 0: disconnected). All 0 in diagonal.\n
            N: a 1-D numpy array of sequencing depth of all spots (length #spots). If it's None, use sum of observed marker gene expressions as sequencing depth.\n
            non_zero_mtx: If it's None, then do not filter zeros during regression. If it's a bool 2-D numpy matrix (spots * genes) as False means genes whose nUMI=0 while True means genes whose nUMI>0 in corresponding spots. The bool indicators can be calculated based on either observerd raw nUMI counts in spatial data, or CVAE transformed nUMI counts.\n
            spot_names: a list of string of spot barcodes. Only keep spots passed filtering.\n
            gene_names: a list of string of gene symbols. Only keep actually used marker genes.\n
            celltype_names: a list of string of celltype names.
    warm_start_theta : 3-D numpy array (spots * celltypes * 1)
        initial guess of theta (celltype proportion).
    warm_start_e_alpha : 1-D numpy array
        initial guess of e_alpha (spot-specific effect).
    gamma_g : 1-D numpy array
        gene-specific platform effect for all genes.
    sigma2 : float
        variance paramter of the lognormal distribution of ln(lambda). All gene share the same variance.
    nu : 3-D numpy array (spots * celltypes * 1), optional
        variable for ADMM penalty of all spots
        in 3 part ADMM nu = theta_hat (used for regularization/penalty) - u (scaled dual variables to make theta = theta_hat)
    rho : float, optional
        parameter for the strength of ADMM loss to make theta equals theta_hat
    lambda_r : float, optional
        parameter for the strength of Adaptive Lasso loss to shrink theta
    lasso_weight : 3-D numpy array (spots * celltypes * 1), optional
        weight of Adaptive Lasso, 1 ./ theta
    lambda_l2 : float
        parameter for the strength of L2 panealty to shrink theta
    global_optimize : bool, optional
        if is True, use basin-hopping algorithm to find the global minimum. The default is False.
    hybrid_version : bool, optional
        if True, use the hybrid_version of GLRM, i.e. in ADMM local model loss function optimization for w but adaptive lasso constrain on theta. If False, local model loss function optimization and adaptive lasso will on the same w. The default is True.
    opt_method : string, optional
        specify method used in scipy.optimize.minimize for local model fitting. The default is 'L-BFGS-B', a default method in scipy for optimization with bounds. Another choice would be 'SLSQP', a default method in scipy for optimization with constrains and bounds.
    hv_x : 1-D numpy array, optional
        data points served as x for calculation of probability density values. Only used for heavy-tail.
    hv_log_p : 1-D numpy array, optional
        log density values of normal distribution N(0, sigma^2) + heavy-tail. Only used for heavy-tail.
    theta_mask : 3-D numpy array (spots * celltypes * 1), optional
        mask for cell-type proportions (1: present, 0: not present). Only used for stage 2 theta optmization.
    verbose : bool, optional
        if True, print more information.
        
    Returns
    -------
    theta_results : 3-D numpy array (spots * celltypes * 1)
        updated theta (celltype proportion).
    e_alpha_results : 1-D numpy array
        updated e_alpha (spot-specific effect).
    '''


    n_celltype = data["X"].shape[0]
    n_spot = data["Y"].shape[0]
    
    # NOTE input warm_start_theta, warm_start_e_alpha will NOT be changed inside the function
    
    # skip optimization if the initial theta corresponding to only one cell-type
    skip_opt = True
    
    # prepare parameter tuples for parallel computing
    results = []
    for i in range(n_spot):
        
        this_spot_name = data["spot_names"][i]
        this_warm_start_theta = warm_start_theta[i, :, :].copy().flatten()
        this_warm_start_e_alpha = warm_start_e_alpha[i]

        if theta_mask is None:
            this_theta_mask = None
        else:
            this_theta_mask = theta_mask[i, :, :].copy().flatten()

        y_vec = data["Y"][i, :].copy()
        mu = data["X"].copy()

        if nu is None:
            nu_vec = None
        else:
            nu_vec = nu[i, :, :].copy().flatten()

        if lasso_weight is None:
            lasso_weight_vec = None
        else:
            lasso_weight_vec = lasso_weight[i, :, :].copy().flatten()
       
        if data["N"] is None:
            N = None
        else:
            N = data["N"][i]

        # filter zero genes
        if data['non_zero_mtx'] is None:
            this_y_vec = y_vec
            this_gamma_g = gamma_g.copy()
            this_mu = mu
        else:
            non_zero_gene_ind = data['non_zero_mtx'][i, :]
            #print(f'total {np.sum(non_zero_gene_ind)} non-zero genes ({np.sum(non_zero_gene_ind)/len(non_zero_gene_ind):.2%}) for spot {this_spot_name}')
            this_y_vec = y_vec[non_zero_gene_ind]
            this_gamma_g = gamma_g[non_zero_gene_ind].copy()
            this_mu = mu[:, non_zero_gene_ind]
            
        # call optimization function
        results.append(optimize_one_theta(this_mu, this_y_vec, N, this_warm_start_theta, this_warm_start_e_alpha, this_gamma_g, sigma2, this_spot_name, nu_vec=nu_vec, rho=rho, lambda_r=lambda_r, lasso_weight_vec=lasso_weight_vec, lambda_l2=lambda_l2, global_optimize=global_optimize, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=hv_x, hv_log_p=hv_log_p, this_theta_mask=this_theta_mask, skip_opt=skip_opt, verbose=verbose))
        
    
    # collect results: theta and e_alpha
    theta_results = np.zeros((n_spot, n_celltype, 1))
    e_alpha_results = []
    
    for i, this_result in enumerate(results):
        # extract theta and e_alpha
        tmp_e_alpha = np.sum(this_result)
        tmp_theta = this_result / tmp_e_alpha
        # change dimension back
        theta_results[i, :, :] = np.reshape(tmp_theta, (n_celltype, 1))
        e_alpha_results.append(tmp_e_alpha)
    
    e_alpha_results = np.array(e_alpha_results)
    
    return theta_results, e_alpha_results




################################# code related to update sigma2 #################################


[docs]
def update_sigma2(data, theta, e_alpha, gamma_g, sigma2, opt_method='L-BFGS-B', global_optimize=False, hv_x=None, verbose=False):
    '''
    update sigma2 (variance paramter of the lognormal distribution) given theta (celltype proportion), e_alpha (spot-specific effect) and gamma_g (gene-specific platform effect) by MLE
    
    we assume
    
        ln(lambda) = alpha + gamma_g + ln(sum(theta*mu_X)) + epsilon
        
        subject to sum(theta)=1, theta>=0
    
    mu_X is marker genes from data['X']
    
    then the mean parameter of the log-normal distribution of ln(lambda) is alpha + gamma_g + ln(sum(theta*mu_X))
    
    we did re-parametrization w = e^alpha * theta, then
    
        ln(lambda) = gamma_g + ln(sum([e^alpha*theta]*mu_X)) + epsilon
        
        subject to w>=0, it will imply sum(theta)=1 and theta>=0
              
    currently optimization of sigma2 DO NOT support parallel computing

    Parameters
    ----------
    data : Dict
        a Dict contains all info need for modeling:
            X: a 2-D numpy matrix of celltype specific marker gene expression (celltypes * genes).\n
            Y: a 2-D numpy matrix of spatial gene expression (spots * genes).\n
            A: a 2-D numpy matrix of Adjacency matrix (spots * spots), or is None. Adjacency matrix of spatial sptots (1: connected / 0: disconnected). All 0 in diagonal.\n
            N: a 1-D numpy array of sequencing depth of all spots (length #spots). If it's None, use sum of observed marker gene expressions as sequencing depth.\n
            non_zero_mtx: If it's None, then do not filter zeros during regression. If it's a bool 2-D numpy matrix (spots * genes) as False means genes whose nUMI=0 while True means genes whose nUMI>0 in corresponding spots. The bool indicators can be calculated based on either observerd raw nUMI counts in spatial data, or CVAE transformed nUMI counts.\n
            spot_names: a list of string of spot barcodes. Only keep spots passed filtering.\n
            gene_names: a list of string of gene symbols. Only keep actually used marker genes.\n
            celltype_names: a list of string of celltype names.
    theta : 3-D numpy array (spots * celltypes * 1)
        theta (celltype proportion).
    e_alpha : 1-D numpy array
        e_alpha (spot-specific effect).
    gamma_g : 1-D numpy array
        gene-specific platform effect for all genes.
    sigma2 : float
        initial guess of variance paramter of the lognormal distribution of ln(lambda). All genes and spots share the same variance.
    opt_method : string, optional
        specify method used in scipy.optimize.minimize for local model fitting. The default is 'L-BFGS-B', a default method in scipy for optimization with bounds. Another choice would be 'SLSQP', a default method in scipy for optimization with constrains and bounds.
    global_optimize : bool, optional
        if is True, use basin-hopping algorithm to find the global minimum. The default is False.
    hv_x : 1-D numpy array, optional
        data points served as x for calculation of probability density values. Only used for heavy-tail.
    verbose : bool, optional
        if True, print more information.
        
    Returns
    -------
    float
        updated sigma2
    '''
  
    def objective_loss_sigma2(sigma2, data, theta, e_alpha, gamma_g, hv_x):
        '''
        calculate loss function for updating sigma2 (variance paramter of the log-normal distribution of ln(lambda)). All spots and genes share the same variance.
        
        the loss function is a sum of negative log-likelihood of the base model of each spot, and the so-called basemodel loss is the same as that for updating theta (celltype proportion)
        
        and we did re-parametrization w = e^alpha * theta, so theta = w / sum(w)
        
        we calculate the basemodel loss of each spot in a parallel way by Numba
        
        WARNING: this input sigma2 inside the loss function is actually a numpy array
        
        Returns
        -------
        float
            the loss function (sum of base model loss across all spots) to update sigma2
        '''
    
        n_spot = data["Y"].shape[0]
        
        this_sigma2 = sigma2[0]
        
        if not math.isfinite(this_sigma2):
            raise Exception(f'[Error] sigma2 optimization in local model of all spots returns a non-finite value {this_sigma2}!')
        
        # update density values of heavy-tail with current sigma^2
        hv_log_p = generate_log_heavytail_array(hv_x, np.sqrt(this_sigma2))
        
        #from time import time
        #start_time = time()
        
        results = 0.0
        
        for i in range(n_spot):
            y_vec = data["Y"][i, :]
            this_theta = theta[i, :, :].flatten()
            this_e_alpha = e_alpha[i]
            # re-parametrization
            this_w = this_theta * this_e_alpha
            
            # sequencing depth
            if data["N"] is None:
                N = None
            else:
                N = data["N"][i]
                
            # filter zero genes
            if data['non_zero_mtx'] is None:
                this_y_vec = y_vec
                this_gamma_g = gamma_g
                this_mu = data["X"]
            else:
                non_zero_gene_ind = data['non_zero_mtx'][i, :]
                #print(f'total {np.sum(non_zero_gene_ind)} non-zero genes ({np.sum(non_zero_gene_ind)/len(non_zero_gene_ind):.2%}) for spot {i:d}')
                this_y_vec = y_vec[non_zero_gene_ind]
                this_gamma_g = gamma_g[non_zero_gene_ind]
                this_mu = data["X"][:, non_zero_gene_ind]
        
            results += hv_wrapper(this_w, this_y_vec, this_mu, this_gamma_g, this_sigma2, hv_x, hv_log_p, N)
            
            #print(f'after summing up spot {i:d}, current loss is {results:.6f}')
            
        #print(f'One round of summing up loss across all {n_spot:d} spots for sigma^2 optimization. Elapsed time: {time()-start_time:.2f} seconds.')
  
        return results
    
    
    # the min value for clip sigma^2 should be larger
    bounds = ((min_sigma2, None),)
    
    try:
        if global_optimize:
            bounded_step = RandomDisplacementBounds(np.array([b[0] for b in bounds]), np.array([b[1] for b in bounds]))
            # use the basin-hopping algorithm to find the global minimum
            sol = basinhopping(objective_loss_sigma2, sigma2, niter=10, T=1.0, take_step=bounded_step,
                               minimizer_kwargs={'method': opt_method,
                                                 'args': (data, theta, e_alpha, gamma_g, hv_x),
                                                 'bounds': bounds,
                                                 'options': {'maxiter': 250, 'eps': sigma2_eps}
                                                },
                               disp=False)

        else:
            sol = minimize(objective_loss_sigma2, sigma2, args=(data, theta, e_alpha, gamma_g, hv_x),
                           method=opt_method,
                           bounds=bounds, options={'disp': False, 'maxiter': 250, 'eps': sigma2_eps})
    
    except Exception as e:
        print(e)
        print(f'reset sigma2 to minimum value {min_sigma2}')
        return min_sigma2
    
    if not global_optimize:
        if not sol.success:
            if verbose:
                print(f'[WARNING] sigma2 optimization in local model of all spots not successful! Caused by: {sol.message}')
    
    if not math.isfinite(sol.x[0]):
        if not sol.success:
            print(f'[WARNING] sigma2 optimization in local model of all spots not successful! Caused by: {sol.message}')
        else:
            print(f'sigma2 optimization success {sol.success} with {sol.message}')
        print(f'[Error] sigma2 optimization in local model of all spots returns a non-finite value {sol.x[0]}!')
        print(f'reset sigma2 to minimum value {min_sigma2}')
        return min_sigma2
        
    
    # the solution in x is a numpy array
    return sol.x[0]




################################# code related to update theta_tilde by Adaptive Lasso in ADMM #################################


[docs]
def adaptive_lasso(nu, rho, lambda_r=1.0, lasso_weight=None):
    '''
    update theta_tilde by Adaptive Lasso and ADMM loss
    
    theta_tilde = argmin lambda_r*lasso_weight*theta_tilde + 1/(2*rho) * (||theta_tilde-theta_hat+u_tilde||_2)^2
    
    that is
    
        - if theta_tilde>=0, theta_tilde = theta_hat - u_tilde - lambda_r*rho*lasso_weight
        
        - if theta_tilde<0, theta_tilde = theta_hat - u_tilde + lambda_r*rho*lasso_weight
            
    change it to
    
        - if theta_hat-u_tilde-lambda_r*rho*lasso_weight>=0, theta_tilde = theta_hat - u_tilde - lambda_r*rho*lasso_weight
        
        - if theta_hat-u_tilde+lambda_r*rho*lasso_weight<=0, theta_tilde = theta_hat - u_tilde + lambda_r*rho*lasso_weight
        
        - if theta_hat-u_tilde-lambda_r*rho*lasso_weight<0 or theta_hat-u_tilde+lambda_r*rho*lasso_weight>0, not defined, let theta_tilde = 0
            
    and nu = theta_hat - u_tilde
    
    WANRING: negative theta tilde value observed, so also clip it to >= 0

    Parameters
    ----------
    nu : 3-D numpy array (spots * celltypes * 1)
        variable for ADMM penalty of all spots
        in 3 part ADMM nu = theta_hat (used for regularization/penalty) - u_tilde (scaled dual variables to make theta_tilde = theta_hat)
    rho : float
        parameter for the strength of ADMM loss to make theta_tilde equals theta_hat
    lambda_r : float, optional
        strength for Adaptive Lasso penalty. The default is 1.0.
    lasso_weight : 3-D numpy array (spots * celltypes * 1), optional
        calculated weight for adaptive lasso. The default is None.

    Returns
    -------
    3-D numpy array (spots * celltypes * 1)
        updated theta_tilde.
    '''

    if lasso_weight is None:
        lasso_weight = np.ones(nu.shape)
    
    result = np.maximum(nu - rho*lambda_r*lasso_weight, 0) - np.maximum(-nu - rho*lambda_r*lasso_weight, 0)
    
    # avoid negative values
    result[result<min_theta] = min_theta
    
    return result




################################# code related to update theta by in Stage Two without ADMM #################################


[docs]
def fit_base_model_plus_laplacian(data, L, theta, e_alpha, gamma_g, sigma2,
                                  lambda_r=None, lasso_weight=None,
                                  hv_x=None, hv_log_p=None, theta_mask=None,
                                  opt_method='L-BFGS-B', global_optimize=False, hybrid_version=True,
                                  verbose=False):
    """
    fit base model plus Laplacian penalty, this function is to replace the whole ADMM framework in GLRM stage two
    
    Note: To keep consistent, the output of the fit result is still 3-Dimensional. The dimension transform will be performed outside this function if needed
    
    Parameters
    ----------
    data : Dict
        a Dict contains all info need for modeling:
            X: a 2-D numpy matrix of celltype specific marker gene expression (celltypes * genes).\n
            Y: a 2-D numpy matrix of spatial gene expression (spots * genes).\n
            A: a 2-D numpy matrix of Adjacency matrix (spots * spots), or is None. Adjacency matrix of spatial sptots (1: connected / 0: disconnected). All 0 in diagonal.\n
            N: a 1-D numpy array of sequencing depth of all spots (length #spots). If it's None, use sum of observed marker gene expressions as sequencing depth.\n
            non_zero_mtx: If it's None, then do not filter zeros during regression. If it's a bool 2-D numpy matrix (spots * genes) as False means genes whose nUMI=0 while True means genes whose nUMI>0 in corresponding spots. The bool indicators can be calculated based on either observerd raw nUMI counts in spatial data, or CVAE transformed nUMI counts.\n
            spot_names: a list of string of spot barcodes. Only keep spots passed filtering.\n
            gene_names: a list of string of gene symbols. Only keep actually used marker genes.\n
            celltype_names: a list of string of celltype names.\n
            initial_guess: initial guess of cell-type proportions of spatial spots.
    L : scipy sparse matrix (spots * spots)
        Laplacian matrix. Note the strenth lambda_g is already absorbed in the L
    theta : 3-D numpy array (spots * celltypes * 1)
        initial guess of theta (celltype proportion).
    e_alpha : 1-D numpy array
        initial guess of e_alpha (spot-specific effect).
    gamma_g : 1-D numpy array
        gene-specific platform effect for all genes.
    sigma2 : float
        initial guess of variance paramter of the lognormal distribution of ln(lambda). All genes and spots share the same variance.
        it may not be updated during this optimization, i.e. sigma2 is treated as an already optimized value.
    lambda_r : float, optional
        strength for Adaptive Lasso penalty. The default is None here.
    lasso_weight : 3-D numpy array (spots * celltypes * 1), optional
        calculated weight for adaptive lasso. The default is None.
    hv_x : 1-D numpy array, optional
        data points served as x for calculation of probability density values. Only used for heavy-tail.
    hv_log_p : 1-D numpy array, optional
        log density values of normal distribution N(0, sigma^2) + heavy-tail. Only used for heavy-tail.
    theta_mask : 3-D numpy array (spots * celltypes * 1), optional
        mask for cell-type proportions (1: present, 0: not present). Only used for stage 2 theta optmization.
    opt_method : string, optional
        specify method used in scipy.optimize.minimize for local model fitting. The default is 'L-BFGS-B', a default method in scipy for optimization with bounds. Another choice would be 'SLSQP', a default method in scipy for optimization with constrains and bounds.
    global_optimize : bool, optional
        if is True, use basin-hopping algorithm to find the global minimum. The default is False.
    hybrid_version : bool, optional
        if True, use the hybrid_version of GLRM, i.e. in ADMM local model loss function optimization for w but adaptive lasso constrain on theta. If False, local model loss function optimization and adaptive lasso will on the same w. The default is True.
    verbose : bool, optional
        if True, print more information.

    Returns
    -------
    Dict
        estimated model coefficients, including:
            theta : celltype proportions (#spots * #celltypes * 1)\n
            e_alpha : spot-specific effect (1-D array with length #spot)\n
            sigma2 : variance paramter of the lognormal distribution (float)\n
            gamma_g : gene-specific platform effect for all genes (1-D array with length #gene)
    """
    
    n_celltype = data['X'].shape[0]
    n_spot = data['Y'].shape[0]
    
    from time import time
    start_time = time()
    
    # NOTE input theta, e_alpha will NOT be changed inside the function
    
    # initialize x array for calculation of heavy-tail
    if hv_x is None:
        from local_fit_numba import z_hv
        hv_x = z_hv.copy()
    
    if hv_log_p is None:
        # initialize density values of heavy-tail with initial sigma^2
        hv_log_p = generate_log_heavytail_array(hv_x, np.sqrt(sigma2))

    assert theta_mask is not None
    if verbose:
        print('[HIGHLIGHT] set tighter bounds for proportion w values of NOT present cell types determined in Stage 1')
    
    assert lambda_r is None
    assert lasso_weight is None
    assert hybrid_version is True  # the optimization is on w, but the constrain is on theta
    
    
    # prepare w, and 3D -> 1D
    w0_list = []
    bounds = []
    
    for i in range(n_spot):
        # take the i-th spot
        this_warm_start_theta = theta[i, :, 0]  # shape (n_celltype,)
        this_theta_mask = theta_mask[i, :, 0]
        
        assert np.isclose(np.sum(this_warm_start_theta), 1.0, atol=1e-7), f"Spot {i}: sum={np.sum(this_warm_start_theta):.4g}"
        assert len(this_warm_start_theta) == n_celltype
        assert len(this_theta_mask) == n_celltype
        
        # per-cell-type-proportion bounds
        # avoid [[min_theta, None]] * n_celltype, which will creates shared inner lists, when change one, all will changed simultaneously
        # None means “no upper bound”
        this_bounds = [[min_theta, None] for _ in range(n_celltype)]
        
        # for cell type not presented, we set a very tight bounds
        for j in range(n_celltype):
            if this_theta_mask[j] == 0:
                # this cell type not present
                this_bounds[j][1] = min_theta * 2
        bounds.extend(this_bounds)
        
        # re-parametrization: multiply θ_i by e_alpha[i] to get w_i
        w0_list.append(this_warm_start_theta * e_alpha[i])
    
    w0 = np.concatenate(w0_list, axis=0)  # shape (n_spot * n_celltype,)
    bounds = tuple([tuple(x) for x in bounds])
    
    # Ensure symmetric Laplacian without densifying
    if (L - L.T).nnz != 0:
        raise ValueError("Laplacian L must be symmetric!")
    
    
    # loss function
    def loss_theta_L(w_vec, Y, mu, gamma_g, sigma2, hybrid_version, hv_x, hv_log_p, Ns, L):
        '''
        calculate loss function for updating theta (celltype proportion)
        
        the loss function contains two parts, sum of base model, defined for each spot separately; and the graph Laplacian penalty
        
        also return gradient respect to w
        
        1. for each spot, negative log-likelihood of the base model given observed data and initial parameter value. It sums across all genes. Then we sum it over all spots
        
        2. graph Laplacian penalty
        
        Parameters
        ----------
        w_vec : 1-D numpy array
            e_alpha (spot-specific effect) * theta (celltype proportion) of all spots, flattened to 1D array.
        Y : 2-D numpy array
            observed gene counts of all spots (spots * genes).
        mu : 2-D numpy matrix
            celltype specific marker genes (celltypes * genes)
        gamma_g : 1-D numpy array
            gene-specific platform effect for all genes.
        sigma2 : float
            variance paramter of the lognormal distribution of ln(lambda). All gene share the same variance.
        hybrid_version : bool, optional
            if True, use the hybrid_version of GLRM, i.e. in ADMM local model loss function optimization for w but adaptive lasso constrain on theta. If False, local model loss function optimization and adaptive lasso will on the same w. The default is True.
        hv_x : 1-D numpy array, optional
            data points served as x for calculation of probability density values. Only used for heavy-tail.
        hv_log_p : 1-D numpy array, optional
            log density values of normal distribution N(0, sigma^2) + heavy-tail. Only used for heavy-tail.
        Ns : 1-D numpy array
            sequencing depth of all spots (length #spots). If is None, use sum(y_vec) instead.
        
        Returns
        -------
        a tuple (float, 1-D numpy array)
            the loss function (base model loss + graph Laplacian loss) over all spots to update w (e_alpha*theta) + gradient vector
        '''
        
        # first revert the w to spot * cell type
        n_spot = Y.shape[0]
        n_celltype = mu.shape[0]
        
        assert len(w_vec) == n_spot * n_celltype
        
        W = w_vec.reshape((n_spot, n_celltype)) # row-wise reshape
        
        base_model_loss = 0
        grad_W = np.zeros((n_spot, n_celltype))   # preallocate gradient matrix
        # re-construct theta from w, theta = w / sum(w)
        Theta = np.zeros((n_spot, n_celltype))
        
        for i in range(n_spot):
            this_w = W[i, :]
            this_y = Y[i, :]
                
            # sequencing depth
            if Ns is None:
                N = None
            else:
                N = Ns[i]
            
            # return the loss function value and gradient for w in this spot i
            cur_loss, cur_gradient = objective_loss_theta(this_w, this_y, mu, gamma_g, sigma2, nu_vec=None, rho=None, lambda_r=None, lasso_weight_vec=None, lambda_l2=None, hybrid_version=hybrid_version, hv_x=hv_x, hv_log_p=hv_log_p, N=N)
            
            # total loss is sum over all spots
            base_model_loss += cur_loss
            
            # since each spot is independent in base model, the the gradient for this spot is only related to this spot, we can directly use it
            grad_W[i, :] = cur_gradient
            
            # re-construct theta, we are sure sum will >0, so no need to add a smalle value to avoid divided by zero; rather add a small value will break the invariance in gradient calculation, leading to incorrect results
            tmp_sum = np.sum(this_w)
            Theta[i, :] = this_w / tmp_sum
        
        
        assert hybrid_version is True  # constrain on theta

        # Note L is a SciPy sparse matrix, and lambda_g is already absorbed in the L
        # @ is matrix multiply, * is Element-wise Multiplication
        # Theta: n_spot * n_celltype
        # loss is tr(Θ^⊤*L*Θ)
        graph_loss = np.sum(Theta * (L @ Theta))  # == tr(Theta.T @ L @ Theta)
        
        # we need to return gradient respect to w, so use chain rule df/dw = df/dθ * dθ/dw
        # df/dθ = 2LΘ, dθ/dw = 1/sum(w) - w/(sum(w)^2)
        # for each row i:
        # ∇_{w_i} f = g_i / s_i  -  ( (w_i · g_i) / s_i^2 ) * 1
        g = 2 * L @ Theta # (n,k)
        s = np.sum(W, axis=1, keepdims=True)  # (n,1)
        # rowwise corrections
        wg = np.sum(W * g, axis=1, keepdims=True)  # (n,1), row-wise inner product w·g
        graph_grad_W = (g / s) - (wg / (s ** 2))  # broadcast-safe, (n,k)
        
        loss = base_model_loss + graph_loss
        grad = grad_W + graph_grad_W
        # reshape grad to vector
        grad_vec = grad.flatten(order='C') # row-wise
        
        return loss, grad_vec

    
    # start optimization
    # call minimize function to solve w (e_alpha*theta)
    # bounds : tuple of tuples
    #    sequence of (min, max) pairs for each element in w_vec
    # min not set as 0 to avoid divided by 0 or log(0)
    # if jac is a Boolean and is True, fun is assumed to return a tuple (f, g) containing the objective function and the gradient
    sol = minimize(loss_theta_L, w0, args=(data["Y"], data["X"], gamma_g, sigma2, hybrid_version, hv_x, hv_log_p, data["N"], L),
               method=opt_method,
               bounds=bounds,
               options={'disp': False, 'maxiter': 250, 'eps': theta_eps}, jac=True)
    
    
    # sol would be (nk,) array, change to n*k
    if sol.success:
        if verbose:
            print(f'w optimization in Stage 2 successful in {sol.nit} iterations')
            # Ref: status 2 - Maximum number of iterations has been exceeded
    else:
        if verbose:
            print(f'[WARNING] w optimization in Stage 2 not successful! Caused by: {sol.message}')
    
    sol_W = np.reshape(sol.x, (n_spot, n_celltype)) # row-wise reshape
    sol_W_cor = np.zeros((n_spot, n_celltype))
    
    for i in range(n_spot):
        solve_fail_flag = False
        this_sol = sol_W[i, ].copy()
    
        if sum(this_sol) == 0:
            if verbose:
                print(f'###### [Error] w optimization in STAGE TWO of spot {data["spot_names"][i]} returns all 0s! ######')
            solve_fail_flag = True
        
        if np.any(np.isnan(this_sol)):
            if verbose:
                print(f'###### [Error] w optimization in STAGE TWO of spot {data["spot_names"][i]} returns NaN value! ######')
            solve_fail_flag = True
       
        if np.any(np.isinf(this_sol)):
            if verbose:
                print(f'###### [Error] w optimization in STAGE TWO of spot {data["spot_names"][i]} returns Infinite value! ######')
            solve_fail_flag = True
        
        if solve_fail_flag:
            # replace NaN or Infinite as 0
            this_sol = np.nan_to_num(this_sol, nan=0.0, posinf=0.0, neginf=0.0)
            if verbose:
                print('repalce non-numeric value as 0')
            # NOTE this is w, no need to sum to 1
            
            if sum(this_sol) == 0:
                # reset w to all elements equal
                tmp_len = this_sol.shape[0]
                this_sol = np.full((tmp_len,), 1.0/tmp_len) * e_alpha[i]
                if verbose:
                    print('[WARNING] reset w to all elements identical!')
    
        # set w values at boundary to 0; note to allow a tolerance
        tmp_ind = (this_sol > 0) & (this_sol <= min_theta * 5)
        if tmp_ind.any():
            this_sol[tmp_ind] = 0
            # NOTE this is w, no need to sum to 1
    
        sol_W_cor[i, ] = this_sol


    # collect results: theta and e_alpha
    theta_results = np.zeros((n_spot, n_celltype, 1))
    e_alpha_results = []

    for i in range(n_spot):
        # extract theta and e_alpha
        this_w_result = sol_W_cor[i, ]
        tmp_e_alpha = np.sum(this_w_result)
        tmp_theta = this_w_result / tmp_e_alpha
        # change dimension back
        theta_results[i, :, :] = np.reshape(tmp_theta, (n_celltype, 1))
        e_alpha_results.append(tmp_e_alpha)

    e_alpha_results = np.array(e_alpha_results)
    
    
    # construct result, DO NOT change theta back to 2-D array
    # the dimension transforming is performed outside this function is needed
    result = {
            'theta': theta_results,
            'e_alpha': e_alpha_results,
            'sigma2': sigma2,
            'gamma_g': gamma_g
            }
    
    if verbose:
        print(f'Optimization with graph Laplacian penalty finished. Elapsed time: {(time()-start_time)/60.0:.2f} minutes.\n')
    
    return result