Source code for model_fit

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May  7 22:36:00 2022

@author: hill103

this script stores functions of model fitting
currently we use a two-stage implement to fit GLRM
"""



import numpy as np
from time import time
from local_fit_numba import update_theta, update_sigma2, fit_base_model_plus_laplacian, optimize_one_theta
from admm_fit import one_admm_fit
import scipy.sparse as sparse
from utils import reparameterTheta
from hyper_parameter_optimization import cv_find_lambda_r, cv_find_lambda_g, BIC_find_lambda_r_one_spot, BIC_find_theta_subset_one_spot
from config import min_val, print




[docs]
def fit_base_model(data, gamma_g=None, global_optimize=False, hybrid_version=True, opt_method='L-BFGS-B', verbose=False, use_initial_guess=False):
    '''
    fit local or base model without any Adaptive Lasso constrain or Graph Laplacian constrain
    
    this fitting is only used for gamma_g estimation and MLE theta estimation in GLRM stage 1
    
    when fitting the base model, we will perform fitting iteratively until the sigma^2 change <= 0.001
    
    Note: the output of the fit result is 3-Dimensional. The dimension transform will be performed outside this function if needed

    Parameters
    ----------
    data : Dict
        a Dict contains all info need for modeling:
            X: a 2-D numpy matrix of celltype specific marker gene expression (celltypes * genes).\n
            Y: a 2-D numpy matrix of spatial gene expression (spots * genes).\n
            A: a 2-D numpy matrix of Adjacency matrix (spots * spots), or is None. Adjacency matrix of spatial sptots (1: connected / 0: disconnected). All 0 in diagonal.\n
            N: a 1-D numpy array of sequencing depth of all spots (length #spots). If it's None, use sum of observed marker gene expressions as sequencing depth.\n
            non_zero_mtx: If it's None, then do not filter zeros during regression. If it's a bool 2-D numpy matrix (spots * genes) as False means genes whose nUMI=0 while True means genes whose nUMI>0 in corresponding spots. The bool indicators can be calculated based on either observerd raw nUMI counts in spatial data, or CVAE transformed nUMI counts.\n
            spot_names: a list of string of spot barcodes. Only keep spots passed filtering.\n
            gene_names: a list of string of gene symbols. Only keep actually used marker gene symbols.\n
            celltype_names: a list of string of celltype names.\n
            initial_guess: initial guess of cell-type proportions of spatial spots.
    gamma_g : 1-D numpy array
        gene-specific platform effect for all genes.
    global_optimize : bool, optional
        if is True, use basin-hopping algorithm to find the global minimum. The default is False.
    hybrid_version : bool, optional
        if True, use the hybrid_version of GLRM, i.e. in ADMM local model loss function optimization for w but adaptive lasso constrain on theta. If False, local model loss function optimization and adaptive lasso will on the same w. The default is True.
    opt_method : string, optional
        specify method used in scipy.optimize.minimize for local model fitting. The default is 'L-BFGS-B', a default method in scipy for optimization with bounds. Another choice would be 'SLSQP', a default method in scipy for optimization with constrains and bounds.
    verbose : bool, optional
        if True, print more information in program running.
    use_initial_guess : bool, optional
        if True, use initial guess instead of uniform distribution for theta initialization.

    Returns
    -------
    tuple
        a tuple of estimated theta, e_alpha and sigma2:
            theta : celltype proportions (3-D numpy array (spots * celltypes * 1))\n
            e_alpha : spot-specific effect (1-D array with length #spot)\n
            sigma2 : variance paramter of the lognormal distribution (float)
    '''

    start_time = time()
    
    n_celltype, n_gene = data['X'].shape
    n_spot = data['Y'].shape[0]
    
    if verbose:
        print('\nGLRM model initialization...')
    
    # Initialization
    if use_initial_guess and (data['initial_guess'] is not None):
        print('[HIGHLIGHT] use initial guess derived from CVAE rather than uniform distribution for theta initialization')
        # NOTE: some spots may be excluded in filtering
        assert data['initial_guess'].columns.to_list() == data['celltype_names']
        
        assert data['initial_guess'].shape[0] >= len(data['spot_names'])
        if data['initial_guess'].shape[0] == len(data['spot_names']):
            assert data['initial_guess'].index.to_list() == data['spot_names']
            theta = data['initial_guess'].values
        else:
            # use included spots only
            theta = data['initial_guess'].loc[data['spot_names']].values
            
        # note the shape is (#spots, #cell-types, 1)
        # use np.newaxis to add a new third dimension
        theta = theta[:, :, np.newaxis]
    else:
        theta = np.full((n_spot, n_celltype, 1), 1.0/n_celltype)
        
    e_alpha = np.full((n_spot,), 1.0)
    
    sigma2 = 1.0
   
    if gamma_g is None:
        gamma_g = np.zeros((n_gene,))
    
    # initialize x array for calculation of heavy-tail
    from local_fit_numba import z_hv, generate_log_heavytail_array
    # initialize density values of heavy-tail with initial sigma^2
    log_p_hv = generate_log_heavytail_array(z_hv, np.sqrt(sigma2))
    
    if global_optimize:
        if verbose:
            print('[CAUTION] global optimization turned on!')
    
    if verbose:
        print('calculate MLE theta and sigma^2 iteratively...')
    
    if verbose:
        print(f'{"iter" : >6} | {"time_opt": >8} | {"time_sig": >8} | {"sigma2": >10}')
    
    t = 0
    pre_sigma2 = sigma2
    
    while True:
        # update theta and e_alpha
        tmp_start_in_loop = time()
        theta, e_alpha = update_theta(data, theta, e_alpha, gamma_g.copy(), sigma2,
                                      global_optimize=global_optimize, hybrid_version=hybrid_version,  opt_method=opt_method,
                                      hv_x=z_hv, hv_log_p=log_p_hv)
        time_local_opt = time() - tmp_start_in_loop
        
        # update sigma^2
        tmp_start_in_loop = time()
        sigma2 = update_sigma2(data, theta, e_alpha, gamma_g.copy(), sigma2,
                                opt_method=opt_method, global_optimize=global_optimize,
                                hv_x=z_hv)
       
        # update density values of heavy-tail with current sigma^2
        log_p_hv = generate_log_heavytail_array(z_hv, np.sqrt(sigma2))
        time_sigma2 = time() - tmp_start_in_loop
    
        if verbose:
            print(f'{t : >6} | {time_local_opt:8.3f} | {time_sigma2:8.3f} | {sigma2:10.6f}')
        
        if abs(sigma2 - pre_sigma2) <= 1e-6:
            break
        else:
            t += 1
            pre_sigma2 = sigma2
    
    if verbose:
        print(f'MLE theta and sigma^2 calculation finished. Elapsed time: {(time()-start_time)/60.0:.2f} minutes.')
        
    return theta, e_alpha, sigma2





[docs]
def estimating_gamma_g(data, hybrid_version=True, opt_method='L-BFGS-B', verbose=False):
    """
    estimate platform effect gamma_g by fit model on pseudo-bulk measurement

    Parameters
    ----------
    data : Dict
        a Dict contains all info need for modeling:
            X: a 2-D numpy matrix of celltype specific marker gene expression (celltypes * genes).\n
            Y: a 2-D numpy matrix of spatial gene expression (spots * genes).\n
            A: a 2-D numpy matrix of Adjacency matrix (spots * spots), or is None. Adjacency matrix of spatial sptots (1: connected / 0: disconnected). All 0 in diagonal.\n
            N: a 1-D numpy array of sequencing depth of all spots (length #spots). If it's None, use sum of observed marker gene expressions as sequencing depth.\n
            non_zero_mtx: If it's None, then do not filter zeros during regression. If it's a bool 2-D numpy matrix (spots * genes) as False means genes whose nUMI=0 while True means genes whose nUMI>0 in corresponding spots. The bool indicators can be calculated based on either observerd raw nUMI counts in spatial data, or CVAE transformed nUMI counts.\n
            spot_names: a list of string of spot barcodes. Only keep spots passed filtering.\n
            gene_names: a list of string of gene symbols. Only keep actually used marker gene symbols.\n
            celltype_names: a list of string of celltype names.\n
            initial_guess: initial guess of cell-type proportions of spatial spots.
    hybrid_version : bool, optional
        if True, use the hybrid_version of GLRM, i.e. in ADMM local model loss function optimization for w but adaptive lasso constrain on theta. If False, local model loss function optimization and adaptive lasso will on the same w. The default is True.
    opt_method : string, optional
        specify method used in scipy.optimize.minimize for local model fitting. The default is 'L-BFGS-B', a default method in scipy for optimization with bounds. Another choice would be 'SLSQP', a default method in scipy for optimization with constrains and bounds.
    verbose : bool, optional
        if True, print more information in program running
    
    Returns
    -------
    1-D numpy array (#genes)
        estimated gamma_g
    """
    
    start_time = time()
    
    if verbose:
        print(f'estimate platform effect gammg_g by {opt_method} and basinhopping')
    
    # combine all spots into one spots then build a new data dict
    tmp_data = {'X': data['X'].copy(),
                'Y': np.sum(data['Y'], axis=0, keepdims=True)}
    # sequencing depth is sum across all spots and genes
    tmp_data['N'] = np.array([np.sum(data['N'])])
    # genes with nUMI>0
    if data['non_zero_mtx'] is None:
        tmp_data['non_zero_mtx'] = None
    else:
        tmp_data['non_zero_mtx'] = np.sum(data['non_zero_mtx'], axis=0, keepdims=True) > 0
        
    tmp_data['spot_names'] = ['comb_spot']  # only one spot
    
    # fit base model
    theta, e_alpha, _ = fit_base_model(tmp_data, global_optimize=True, hybrid_version=hybrid_version, opt_method=opt_method, verbose=verbose, use_initial_guess=False)
    
    
    assert(len(e_alpha)==1)
    # calculate gamma_g
    # note theta is 3-Dimensional
    w = np.squeeze(theta) * e_alpha[0]
    
    gamma_g = np.log(np.squeeze(tmp_data['Y'])) - np.log(np.sum(tmp_data['Y']) * w@tmp_data['X'] + min_val)
    
    print(f'gamma_g estimation finished. Elapsed time: {(time()-start_time)/60.0:.2f} minutes.')
    
    return gamma_g





[docs]
def fit_stage1_spotwise_lambda_r(data, warm_start_theta, warm_start_e_alpha, gamma_g, sigma2, lambda_r, lasso_weight, global_optimize=False, hybrid_version=True, opt_method='L-BFGS-B', hv_x=None, hv_log_p=None, verbose=False, diagnosis=False):
    '''
    update theta (celltype proportion) and e_alpha (spot-specific effect) given sigma2 (variance paramter of the log-normal distribution) and gamma_g (gene-specific platform effect) by MLE
    
    UPDATE: here we assume the hyperparameter lambda_r for Adaptive Lasso is spot-wise, i.e. for each spot, we search for the optimal lambda_r, then fit the model
    
    UPDATE2: we apply Nested Subset Selection Strategy, thus no lambda_r anymore
    
    we assume 
    
        ln(lambda) = alpha + gamma_g + ln(sum(theta*mu_X)) + epsilon
    
        subject to sum(theta)=1, theta>=0
    
    mu_X is marker genes from data['X']
    
    then the mean parameter of the lognormal distribution of ln(lambda) is alpha + gamma_g + ln(sum(theta*mu_X))
    
    we did re-parametrization w = e^alpha * theta, then
    
        ln(lambda) = gamma_g + ln(sum([e^alpha*theta]*mu_X)) + epsilon
    
        subject to w>=0, it will imply sum(theta)=1 and theta>=0
    
    the steps to update theta and e_alpha:
        1. dimension change of theta from 3-D (spots * celltypes * 1) to 1-D (celltypes), and do re-parametrization to get w
        2. solve w for each spot in parallel
        3. extract updated theta and e_alpha from w, and change the dimension of updated theta back

    Parameters
    ----------
    data : Dict
        a Dict contains all info need for modeling:
            X: a 2-D numpy matrix of celltype specific marker gene expression (celltypes * genes).\n
            Y: a 2-D numpy matrix of spatial gene expression (spots * genes).\n
            A: a 2-D numpy matrix of Adjacency matrix (spots * spots), or is None. Adjacency matrix of spatial sptots (1: connected / 0: disconnected). All 0 in diagonal.\n
            N: a 1-D numpy array of sequencing depth of all spots (length #spots). If it's None, use sum of observed marker gene expressions as sequencing depth.\n
            non_zero_mtx: If it's None, then do not filter zeros during regression. If it's a bool 2-D numpy matrix (spots * genes) as False means genes whose nUMI=0 while True means genes whose nUMI>0 in corresponding spots. The bool indicators can be calculated based on either observerd raw nUMI counts in spatial data, or CVAE transformed nUMI counts.\n
            spot_names: a list of string of spot barcodes. Only keep spots passed filtering.\n
            gene_names: a list of string of gene symbols. Only keep actually used marker genes.\n
            celltype_names: a list of string of celltype names.
    warm_start_theta : 3-D numpy array (spots * celltypes * 1)
        initial guess of theta (celltype proportion).
    warm_start_e_alpha : 1-D numpy array
        initial guess of e_alpha (spot-specific effect).
    gamma_g : 1-D numpy array
        gene-specific platform effect for all genes.
    sigma2 : float
        variance paramter of the lognormal distribution of ln(lambda). All gene share the same variance.
    lambda_r : float or 1-D numpy array
        parameter for the strength of Adaptive Lasso loss to shrink theta
    lasso_weight : 3-D numpy array (spots * celltypes * 1)
        weight of Adaptive Lasso, 1 ./ theta
    global_optimize : bool, optional
        if is True, use basin-hopping algorithm to find the global minimum. The default is False.
    hybrid_version : bool, optional
        if True, use the hybrid_version of GLRM, i.e. in ADMM local model loss function optimization for w but adaptive lasso constrain on theta. If False, local model loss function optimization and adaptive lasso will on the same w. The default is True.
    opt_method : string, optional
        specify method used in scipy.optimize.minimize for local model fitting. The default is 'L-BFGS-B', a default method in scipy for optimization with bounds. Another choice would be 'SLSQP', a default method in scipy for optimization with constrains and bounds.
    hv_x : 1-D numpy array, optional
        data points served as x for calculation of probability density values. Only used for heavy-tail.
    hv_log_p : 1-D numpy array, optional
        log density values of normal distribution N(0, sigma^2) + heavy-tail. Only used for heavy-tail.
    verbose : bool, optional
        if True, print more information.
    diagnosis : bool, optional
        if True print more information for diagnosis of hyper-parameter selection.
        
    Returns
    -------
    theta_results : 3-D numpy array (spots * celltypes * 1)
        updated theta (celltype proportion).
    e_alpha_results : 1-D numpy array
        updated e_alpha (spot-specific effect).
    '''
    
    assert lasso_weight is not None
    
    n_celltype = data["X"].shape[0]
    n_spot = data["Y"].shape[0]
    
    # strategy includes AIC, BIC or cross-validation for lambda_r tunning, or directly subset selection by AIC or BIC
    strategy = 'AIC'
    
    if strategy == 'BIC':
        print('[HIGHLIGHT] use BIC to select hyperparameter lambda_r in adaptive Lasso')
        use_AIC = False
    elif strategy == 'AIC':
        print('[HIGHLIGHT] use AIC to select hyperparameter lambda_r in adaptive Lasso')
        use_AIC = True
    elif strategy == 'CV':
        print('[HIGHLIGHT] use cross-validation to select hyperparameter lambda_r in adaptive Lasso')
    elif strategy == 'subset-BIC':
        print('[HIGHLIGHT] use Nested Subset Selection Strategy with BIC for presented cell type identification')
        use_AIC = False
    elif strategy == 'subset-AIC':
        print('[HIGHLIGHT] use Nested Subset Selection Strategy with AIC for presented cell type identification')
        use_AIC = True
    else:
        raise Exception(f'Unknown strategy in Stage 1: {strategy}!')
    
    # NOTE input warm_start_theta, warm_start_e_alpha will NOT be changed inside the function
    
    # prepare parameter tuples for parallel computing
    theta_results = []
    e_alpha_results = []
    
    if isinstance(lambda_r, list) and diagnosis:
        lambda_r_list = []
    
    prev_percent = -1
    
    for i in range(n_spot):
        
        progress = int(i / n_spot * 100)   # 0–100%
        # print only when hitting multiples of 10
        if progress // 10 > prev_percent:
            prev_percent = progress // 10
            print(f"{progress}%...", end='')
        
        this_spot_name = data["spot_names"][i]
        this_warm_start_theta = warm_start_theta[i, :, :].copy().flatten()
        this_warm_start_e_alpha = warm_start_e_alpha[i]

        y_vec = data["Y"][i, :].copy()
        mu = data["X"].copy()
        
        lasso_weight_vec = lasso_weight[i, :, :].copy().flatten()
       
        if data["N"] is None:
            N = None
        else:
            N = data["N"][i]

        # filter zero genes
        if data['non_zero_mtx'] is None:
            this_y_vec = y_vec
            this_gamma_g = gamma_g.copy()
            this_mu = mu
        else:
            non_zero_gene_ind = data['non_zero_mtx'][i, :]
            #print(f'total {np.sum(non_zero_gene_ind)} non-zero genes ({np.sum(non_zero_gene_ind)/len(non_zero_gene_ind):.2%}) for spot {this_spot_name}')
            this_y_vec = y_vec[non_zero_gene_ind]
            this_gamma_g = gamma_g[non_zero_gene_ind].copy()
            this_mu = mu[:, non_zero_gene_ind]
        
        this_theta_result = None
        this_e_alpha_result = None
        
        # first determine whether need to tune hyperparameter lambda_r
        if isinstance(lambda_r, list):
            if verbose:
                print(f'{i}th spot {this_spot_name} hyper-parameter for Adaptive Lasso: use {strategy} criteria to find the optimal value from {len(lambda_r)} candidates...')
                
            if (strategy == 'BIC') or (strategy == 'AIC'):
                # NOTE returned theta NOT w
                best_lambda_r, tmp_theta = BIC_find_lambda_r_one_spot(this_mu.copy(), this_y_vec.copy(), N, this_spot_name, this_warm_start_theta.copy(), this_warm_start_e_alpha, this_gamma_g, sigma2, lasso_weight_vec.copy(), lambda_r, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=hv_x, hv_log_p=hv_log_p, use_AIC=use_AIC, reinitialize_theta=False, verbose=verbose)
                
                if isinstance(lambda_r, list) and diagnosis:
                    lambda_r_list.append({'spot': this_spot_name, 'optimal_lambda_r': best_lambda_r})
                
                # get non-zero cell types then refit; NOTE we set a min_theta in optimization to avoid theta as 0
                # and before return result, we already set them back to 0
                # binary theta by threshold to get a mask (1: present, 0: not present)
                tmp_theta_mask = np.zeros(tmp_theta.shape, dtype='int')
                tmp_theta_mask[tmp_theta > 0] = 1
                
                # refit
                this_w = optimize_one_theta(this_mu.copy(), this_y_vec.copy(), N, this_warm_start_theta.copy(), this_warm_start_e_alpha, this_gamma_g, sigma2, this_spot_name, nu_vec=None, rho=None, lambda_r=None, lasso_weight_vec=None, lambda_l2=None, global_optimize=global_optimize, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=hv_x, hv_log_p=hv_log_p, this_theta_mask=tmp_theta_mask, skip_opt=True, verbose=False)
                # get theta and e_alpha
                this_e_alpha_result = np.sum(this_w)
                this_theta_result = this_w / this_e_alpha_result
                
            
            elif (strategy == 'subset-BIC') or (strategy == 'subset-AIC'):
                # start Nested Subset Selection Strategy
                # NOTE returned results are theta and e_alpha separately
                this_theta_result, this_e_alpha_result = BIC_find_theta_subset_one_spot(this_mu.copy(), this_y_vec.copy(), N, this_spot_name, this_warm_start_theta.copy(), this_warm_start_e_alpha, this_gamma_g, sigma2, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=hv_x, hv_log_p=hv_log_p, use_AIC=use_AIC, reinitialize_theta=False, verbose=verbose)
            
            
            elif strategy == 'CV':
                # construct data dict with only one spot
                if data['non_zero_mtx'] is None:
                    this_spot_nonzero_mtx = None
                else:
                    this_spot_nonzero_mtx = data['non_zero_mtx'][i, :]
                    this_spot_nonzero_mtx = this_spot_nonzero_mtx.reshape((1, len(this_spot_nonzero_mtx)))
                
                onespot_data = {'X': this_mu,
                                'Y': this_y_vec.reshape((1, len(this_y_vec))),
                                'N': np.array([N]),
                                'non_zero_mtx': this_spot_nonzero_mtx,
                                'spot_names': [this_spot_name]
                                }
                reshaped_theta = this_warm_start_theta.reshape((1, n_celltype, 1))
                reshaped_e_alpha = np.array([this_warm_start_e_alpha])
                reshaped_lasso_weight = lasso_weight_vec.reshape((1, n_celltype, 1))

                best_lambda_r = cv_find_lambda_r(onespot_data, reshaped_theta.copy(),
                                                 reshaped_e_alpha.copy(), gamma_g.copy(), sigma2,
                                                 reshaped_lasso_weight.copy(), lambda_r,
                                                 hybrid_version=hybrid_version, opt_method=opt_method,
                                                 hv_x=hv_x, hv_log_p=hv_log_p,
                                                 use_admm=False, use_likelihood=True,
                                                 k=5, diagnosis=False, verbose=verbose)
                
                if isinstance(lambda_r, list) and diagnosis:
                    lambda_r_list.append({'spot': this_spot_name, 'optimal_lambda_r': best_lambda_r})
                
                # fit with best_lambda_r
                tmp_w = optimize_one_theta(this_mu.copy(), this_y_vec.copy(), N, this_warm_start_theta.copy(), this_warm_start_e_alpha, this_gamma_g, sigma2, this_spot_name, nu_vec=None, rho=None, lambda_r=best_lambda_r, lasso_weight_vec=lasso_weight_vec.copy(), lambda_l2=None, global_optimize=global_optimize, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=hv_x, hv_log_p=hv_log_p, this_theta_mask=None, skip_opt=True, verbose=False)
                # get theta and e_alpha
                tmp_theta = tmp_w / np.sum(tmp_w)
                
                # get non-zero cell types then refit; NOTE we set a min_theta in optimization to avoid theta as 0
                # and before return result, we already set them back to 0
                # binary theta by threshold to get a mask (1: present, 0: not present)
                tmp_theta_mask = np.zeros(tmp_theta.shape, dtype='int')
                tmp_theta_mask[tmp_theta > 0] = 1
                
                # refit
                this_w = optimize_one_theta(this_mu.copy(), this_y_vec.copy(), N, this_warm_start_theta.copy(), this_warm_start_e_alpha, this_gamma_g, sigma2, this_spot_name, nu_vec=None, rho=None, lambda_r=None, lasso_weight_vec=None, lambda_l2=None, global_optimize=global_optimize, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=hv_x, hv_log_p=hv_log_p, this_theta_mask=tmp_theta_mask, skip_opt=True, verbose=False)
                # get theta and e_alpha
                this_e_alpha_result = np.sum(this_w)
                this_theta_result = this_w / this_e_alpha_result
                
        
        else:
            # directly run optimization with specified lambda_r
            tmp_w = optimize_one_theta(this_mu.copy(), this_y_vec.copy(), N, this_warm_start_theta.copy(), this_warm_start_e_alpha, this_gamma_g, sigma2, this_spot_name, nu_vec=None, rho=None, lambda_r=lambda_r, lasso_weight_vec=lasso_weight_vec.copy(), lambda_l2=None, global_optimize=global_optimize, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=hv_x, hv_log_p=hv_log_p, this_theta_mask=None, skip_opt=True, verbose=False)
            
            # get theta and e_alpha
            tmp_theta = tmp_w / np.sum(tmp_w)
            
            # get non-zero cell types then refit; NOTE we set a min_theta in optimization to avoid theta as 0
            # and before return result, we already set them back to 0
            # binary theta by threshold to get a mask (1: present, 0: not present)
            tmp_theta_mask = np.zeros(tmp_theta.shape, dtype='int')
            tmp_theta_mask[tmp_theta > 0] = 1
        
            this_w = optimize_one_theta(this_mu.copy(), this_y_vec.copy(), N, this_warm_start_theta.copy(), this_warm_start_e_alpha, this_gamma_g, sigma2, this_spot_name, nu_vec=None, rho=None, lambda_r=None, lasso_weight_vec=None, lambda_l2=None, global_optimize=global_optimize, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=hv_x, hv_log_p=hv_log_p, this_theta_mask=tmp_theta_mask, skip_opt=True, verbose=False)
            # get theta and e_alpha
            this_e_alpha_result = np.sum(this_w)
            this_theta_result = this_w / this_e_alpha_result
        

        # collect results from each spot
        assert this_theta_result is not None
        assert this_e_alpha_result is not None
        theta_results.append(this_theta_result)
        e_alpha_results.append(this_e_alpha_result)
    
    print('100%\n')
    
    # change theta dimension back to 3D
    theta_results_return = np.zeros((n_spot, n_celltype, 1))
    
    for i, this_result in enumerate(theta_results):
        theta_results_return[i, :, :] = np.reshape(theta_results[i], (n_celltype, 1))
    
    e_alpha_results = np.array(e_alpha_results)
    
    if strategy in ['BIC', 'AIC', 'CV']:
        if isinstance(lambda_r, list) and diagnosis:
            # output optimal lambda_r
            import pandas as pd
            import os
            from config import diagnosis_path
            
            # need to create subfolders first, otherwise got FileNotFoundError
            os.makedirs(os.path.join(diagnosis_path, 'GLRM_params_tuning'), exist_ok=True)
            
            tmp_df = pd.DataFrame(lambda_r_list)
            tmp_df.to_csv(os.path.join(diagnosis_path, 'GLRM_params_tuning', 'optimal_lambda_r.csv.gz'), index=False, compression='gzip')
            
            # save optimal lambda_r histogram
            from diagnosis_plots import diagnosisParamsSpotwiseLambdarTuning
            diagnosisParamsSpotwiseLambdarTuning(tmp_df['optimal_lambda_r'].to_list(), strategy)
    
    return theta_results_return, e_alpha_results





[docs]
def fit_model_two_stage(data, gamma_g=None, lambda_r=None, lambda_g=None, global_optimize=False, hybrid_version=True, opt_method='L-BFGS-B', verbose=False, diagnosis=False, use_admm=False):
    """
    fit Graph Laplacian Regularized Stratified Model (GLRM) in a two-stage way
    
    Parameters
    ----------
    data : Dict
        a Dict contains all info need for modeling:
            X: a 2-D numpy matrix of celltype specific marker gene expression (celltypes * genes).\n
            Y: a 2-D numpy matrix of spatial gene expression (spots * genes).\n
            A: a 2-D numpy matrix of Adjacency matrix (spots * spots), or is None. Adjacency matrix of spatial sptots (1: connected / 0: disconnected). All 0 in diagonal.\n
            N: a 1-D numpy array of sequencing depth of all spots (length #spots). If it's None, use sum of observed marker gene expressions as sequencing depth.\n
            non_zero_mtx: If it's None, then do not filter zeros during regression. If it's a bool 2-D numpy matrix (spots * genes) as False means genes whose nUMI=0 while True means genes whose nUMI>0 in corresponding spots. The bool indicators can be calculated based on either observerd raw nUMI counts in spatial data, or CVAE transformed nUMI counts.\n
            spot_names: a list of string of spot barcodes. Only keep spots passed filtering.\n
            gene_names: a list of string of gene symbols. Only keep actually used marker genes.\n
            celltype_names: a list of string of celltype names.\n
            initial_guess: initial guess of cell-type proportions of spatial spots.
    gamma_g : 1-D numpy array
        gene-specific platform effect for all genes.
    lambda_r : float
        strength for Adaptive Lasso penalty. If None, use cross-validation to determine optimal lambda_r
    lambda_g : float
        edge weight for graph, and will affect the strength of Graph Laplacian constrain. If None, use cross-validation to determine optimal graph_weight.
    global_optimize : bool, optional
        if is True, use basin-hopping algorithm to find the global minimum. The default is False.
    hybrid_version : bool, optional
        if True, use the hybrid_version of GLRM, i.e. in ADMM local model loss function optimization for w but adaptive lasso constrain on theta. If False, local model loss function optimization and adaptive lasso will on the same w. The default is True.
    opt_method : string, optional
        specify method used in scipy.optimize.minimize for local model fitting. The default is '', a default method in scipy for optimization with bounds. Another choice would be 'SLSQP', a default method in scipy for optimization with constrains and bounds.
    verbose : bool, optional
        if True, print more information in program running.
    diagnosis : bool, optional
        if True save more information to files for diagnosis CVAE and hyper-parameter selection.
    use_admm : bool, optinal
        if True, use ADMM framework in optimization, default is False.

    Returns
    -------
    Dict
        estimated model coefficients, including:
            theta : celltype proportions (#spots * #celltypes * 1)\n
            e_alpha : spot-specific effect (1-D array with length #spot)\n
            sigma2 : variance paramter of the lognormal distribution (float)\n
            gamma_g : gene-specific platform effect for all genes (1-D array with length #gene)
    """
    
    print('\n\nStart GLRM fitting...\n')
    
    start_time = time()
    
    n_celltype, n_gene = data['X'].shape
    n_spot = data['Y'].shape[0]
    
    
    print('first estimate MLE theta and corresponding e^alpha and sigma^2...')
    
    tmp_start_time = time()
    
    theta, e_alpha, sigma2 = fit_base_model(data, gamma_g=gamma_g, global_optimize=global_optimize, hybrid_version=hybrid_version, opt_method=opt_method, verbose=True, use_initial_guess=True)
    
    print(f'MLE theta estimation finished. Elapsed time: {(time()-tmp_start_time)/60.0:.2f} minutes.')
    
    # initialize x array for calculation of heavy-tail
    from local_fit_numba import z_hv, generate_log_heavytail_array
    # initialize density values of heavy-tail with initial sigma^2
    log_p_hv = generate_log_heavytail_array(z_hv, np.sqrt(sigma2))
    
    # calculate the weight for adaptive lasso based on MLE theta
    print('\ncalculate weights of Adaptive Lasso...')
    weight_threshold = 1e-3
    print(f'clip max weight to {1/weight_threshold:.0f} to avoid huge weights derived from tiny MLE theta')
    
    tmp_theta = theta.copy()
    # also avoid to divided by 0
    if hybrid_version:
        # adaptive lasso constrain on theta
        tmp_theta[tmp_theta<weight_threshold] = weight_threshold
        lasso_weight = 1.0 / tmp_theta
    else:
        # adaptive lasso constrain on w=theta*e^alpha
        tmp_w = reparameterTheta(tmp_theta, e_alpha)
        tmp_w[tmp_w<weight_threshold] = weight_threshold
        lasso_weight = 1.0 / tmp_w
    
    del tmp_theta

    # stage 1
    tmp_start_time = time()
    
    print('\nStage 1: variable selection using Adaptive Lasso starts with the MLE theta and e^alpha, using already estimated sigma^2 and gamma_g...')
    
    spotwise_lambda_r = False
    if spotwise_lambda_r:
        print('\n[HIGHLIGHT] use spot-wise hyper-parameter lambda_r instead of an overall lambda_r value for all spots\n')
    
    print(f'specified hyper-parameter for Adaptive Lasso is: {lambda_r}\n')
    
    
    if lambda_r == 0:
        print('[WARNING] User specified lambda_r as 0, meaning disable Adaptive Lasso. Stage 1 will be skiped\n')
        # directly copy MLE estimates to Stage 1 results
        stage1_result = {}
        stage1_result['theta'] = theta.copy()
        stage1_result['e_alpha'] = e_alpha.copy()
    
    elif isinstance(lambda_r, list):
        # we need tune hpyerparameter lambda_r
        if spotwise_lambda_r:
            print(f'Start tuning spot-wise hyper-parameter for Adaptive Lasso for each spot separately: find the optimal value from {len(lambda_r)} candidates...')
            stage1_result = {}
            stage1_result['theta'], stage1_result['e_alpha'] = fit_stage1_spotwise_lambda_r(data, theta.copy(), e_alpha.copy(), gamma_g.copy(), sigma2, lambda_r, lasso_weight.copy(), global_optimize=False, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=z_hv, hv_log_p=log_p_hv, verbose=True, diagnosis=diagnosis)
        
        else:
        
            print(f'hyper-parameter for Adaptive Lasso: use cross-validation to find the an overall optimal value for all spots from {len(lambda_r)} candidates...')
            best_lambda_r = cv_find_lambda_r(data, theta.copy(), e_alpha.copy(), gamma_g.copy(), sigma2, lasso_weight, lambda_r, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=z_hv, hv_log_p=log_p_hv, use_admm=False, use_likelihood=True, diagnosis=diagnosis)
    
            print(f'\nStart optimization with found optimal lambda_r: {best_lambda_r}\n')
    
            if use_admm:
                # stage 1 with determined lambda_r, but use ADMM framework
                # Laplacian Matrix is all 0 in stage 1
                # transform it to a scipy sparse matrix to be consistent with Laplacian Matrix derived from graph object
                L = sparse.csr_matrix(np.zeros((n_spot, n_spot)))
                stage1_result = one_admm_fit(data, L, theta.copy(), e_alpha.copy(), gamma_g.copy(),
                                sigma2, lambda_r=best_lambda_r, lasso_weight=lasso_weight.copy(),
                                hv_x=z_hv, hv_log_p=log_p_hv, theta_mask=None,
                                opt_method=opt_method, global_optimize=global_optimize,
                                hybrid_version=hybrid_version, verbose=verbose)
            else:
                # stage 1 with determined lambda_r, use base model loss + Adaptive Lasso loss
                stage1_result = {}
                stage1_result['theta'], stage1_result['e_alpha'] = update_theta(data, theta.copy(),
                                    e_alpha.copy(), gamma_g.copy(), sigma2,
                                    lambda_r=best_lambda_r, lasso_weight=lasso_weight.copy(),
                                    hv_x=z_hv, hv_log_p=log_p_hv, theta_mask=None,
                                    global_optimize=global_optimize, hybrid_version=hybrid_version,
                                    opt_method=opt_method, verbose=verbose)
    
    else:
        print(f'\nStart optimization with user specified optimal lambda_r: {lambda_r}\n')
        
        if spotwise_lambda_r:
            stage1_result = {}
            stage1_result['theta'], stage1_result['e_alpha'] = fit_stage1_spotwise_lambda_r(data, theta.copy(), e_alpha.copy(), gamma_g.copy(), sigma2, lambda_r, lasso_weight.copy(), global_optimize=False, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=z_hv, hv_log_p=log_p_hv, verbose=False, diagnosis=diagnosis)
        
        else:
            
            if use_admm:
                # stage 1 with determined lambda_r, but use ADMM framework
                # Laplacian Matrix is all 0 in stage 1
                # transform it to a scipy sparse matrix to be consistent with Laplacian Matrix derived from graph object
                L = sparse.csr_matrix(np.zeros((n_spot, n_spot)))
                stage1_result = one_admm_fit(data, L, theta.copy(), e_alpha.copy(), gamma_g.copy(),
                                sigma2, lambda_r=lambda_r, lasso_weight=lasso_weight.copy(),
                                hv_x=z_hv, hv_log_p=log_p_hv, theta_mask=None,
                                opt_method=opt_method, global_optimize=global_optimize,
                                hybrid_version=hybrid_version, verbose=verbose)
            else:
                # stage 1 with determined lambda_r, use base model loss + Adaptive Lasso loss
                # compared with spotwise-related function, just without re-fit step
                stage1_result = {}
                stage1_result['theta'], stage1_result['e_alpha'] = update_theta(data, theta.copy(),
                                    e_alpha.copy(), gamma_g.copy(), sigma2,
                                    lambda_r=lambda_r, lasso_weight=lasso_weight.copy(),
                                    hv_x=z_hv, hv_log_p=log_p_hv, theta_mask=None,
                                    global_optimize=global_optimize, hybrid_version=hybrid_version,
                                    opt_method=opt_method, verbose=verbose)


    print(f'Stage 1 variable selection finished. Elapsed time: {(time()-tmp_start_time)/60.0:.2f} minutes.')
    
    theta = stage1_result['theta']
    # binary theta by threshold to get a mask (1: present, 0: not present)
    theta_mask = np.zeros(theta.shape, dtype='int')
    # NOTE we already re-set w values at boundary to 0; so small theta values will be directly 0
    theta_mask[theta > 0] = 1
    
    
    # stage 2
    tmp_start_time = time()
    
    print('\nStage 2: final theta estimation with Graph Laplacian Constrain using already estimated sigma^2 and gamma_g')
    
    # re-initialize theta and e_alpha for only present cell-types
    # update: reuse the result from stage 1
    '''
    theta = np.zeros(theta.shape)
    for i in range(n_spot):
        theta[i, theta_mask[i,:,:]==1] = 1.0/np.sum(theta_mask[i,:,:])
    
    e_alpha = np.full((n_spot,), 1.0)
    '''
    
    # NO need to do it
    '''
    # set small values to 0
    # note theta shape is (#spots * #celltypes * 1)
    theta[theta < weight_threshold] = 0
    tmp_row_sums = theta.sum(axis=1)  # Sum along the second axis, got matrix (#spots * 1)
    theta = theta / tmp_row_sums[:, np.newaxis] # broadcasting row sum to (#spots * 1 * 1) then performs element-wise division
    assert theta.shape == theta_mask.shape
    for i in range(theta.shape[0]):
        assert abs(np.sum(theta[i,:,:]) - 1) < 1e-8
    '''
    
    # reuse e_alpha, as it related to spot not cell-types
    e_alpha = stage1_result['e_alpha']
    
    print('[HIGHLIGHT] reuse estimated theta and e^alpha in stage 1 as initial value')
    
    print(f'specified hyper-parameter for Graph Laplacian Constrain is: {lambda_g}\n')
    
    
    if data['A'] is None:
        
        print('[WARNING] No Adjacency Matrix of spots specified! Skip stage 2')
        result = {}
        result['theta'] = theta.copy()
        result['e_alpha'] = e_alpha.copy()
    
    elif lambda_g == 0:
        
        print('[WARNING] User specified lambda_g as 0, meaning disable graph Laplacian. Stage 2 will be skiped')
        result = {}
        result['theta'] = theta.copy()
        result['e_alpha'] = e_alpha.copy()
    
    else:
        
        # considering Laplacian Constrain
        # manually calculate Laplacian matrix L = D - W
        def calcLaplacian(W):
            # consistent with nx.laplacian_matrix(G), which return a SciPy sparse matrix
            deg = W.sum(axis=1)  # degree (row-sum of weights)
            D   = np.diag(deg)
            return D - W  # dense Laplacian
        
        # transform it to a scipy sparse matrix to be consistent with Laplacian Matrix derived from graph object
        L = sparse.csr_matrix(calcLaplacian(data['A']))
    
        if isinstance(lambda_g, list):
            print(f'hyper-parameter for Graph Laplacian Constrain: use cross-validation to find the optimal value from {len(lambda_g)} candidates...')
            # NOTE to use deep copy of Laplacian matrix to avoid modify it unexpectedly
            # UPDATE: NOT use ADMM for hyperparameter tuning
            best_lambda_g = cv_find_lambda_g(data, L.copy(), theta.copy(), e_alpha.copy(), theta_mask, gamma_g.copy(), sigma2, lambda_g, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=z_hv, hv_log_p=log_p_hv, use_admm=False, use_likelihood=True, diagnosis=diagnosis)
        
            print(f'\nStart optimization with found optimal lambda_g: {best_lambda_g}\n')
        
        else:
            best_lambda_g = lambda_g
            print(f'\nStart optimization with user specified optimal lambda_g: {best_lambda_g}\n')
        
        # UPDATE: multiple the hyperparameter with Laplacian matrix get 𝜆L
        lambda_gL = L * best_lambda_g
        
        if use_admm:
            # stage 2 ADMM iterations
            result = one_admm_fit(data, lambda_gL, theta.copy(), e_alpha.copy(), gamma_g.copy(), sigma2,
                                  lambda_r=0, lasso_weight=None,
                                  hv_x=z_hv, hv_log_p=log_p_hv, theta_mask=theta_mask,
                                  opt_method=opt_method, global_optimize=global_optimize,
                                  hybrid_version=hybrid_version, verbose=verbose)
        else:
            result = fit_base_model_plus_laplacian(data, lambda_gL, theta.copy(), e_alpha.copy(),
                                               gamma_g.copy(), sigma2,
                                               lambda_r=None, lasso_weight=None,
                                               hv_x=z_hv, hv_log_p=log_p_hv, theta_mask=theta_mask,
                                               opt_method=opt_method, global_optimize=global_optimize,
                                               hybrid_version=hybrid_version, verbose=verbose)
    
    print(f'\nstage 2 finished. Elapsed time: {(time()-tmp_start_time)/60.0:.2f} minutes.\n')
    
    print(f'GLRM fitting finished. Elapsed time: {(time()-start_time)/60.0:.2f} minutes.\n')
    
    # add sigma2
    result['sigma2'] = sigma2
    
    return result