#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat May 7 22:36:00 2022
@author: hill103
this script stores functions of model fitting
currently we use a two-stage implement to fit GLRM
"""
import numpy as np
from time import time
from local_fit_numba import update_theta, update_sigma2, fit_base_model_plus_laplacian, optimize_one_theta
from admm_fit import one_admm_fit
import scipy.sparse as sparse
from utils import reparameterTheta
from hyper_parameter_optimization import cv_find_lambda_r, cv_find_lambda_g, BIC_find_lambda_r_one_spot, BIC_find_theta_subset_one_spot
from config import min_val, print
[docs]
def fit_base_model(data, gamma_g=None, global_optimize=False, hybrid_version=True, opt_method='L-BFGS-B', verbose=False, use_initial_guess=False):
'''
fit local or base model without any Adaptive Lasso constrain or Graph Laplacian constrain
this fitting is only used for gamma_g estimation and MLE theta estimation in GLRM stage 1
when fitting the base model, we will perform fitting iteratively until the sigma^2 change <= 0.001
Note: the output of the fit result is 3-Dimensional. The dimension transform will be performed outside this function if needed
Parameters
----------
data : Dict
a Dict contains all info need for modeling:
X: a 2-D numpy matrix of celltype specific marker gene expression (celltypes * genes).\n
Y: a 2-D numpy matrix of spatial gene expression (spots * genes).\n
A: a 2-D numpy matrix of Adjacency matrix (spots * spots), or is None. Adjacency matrix of spatial sptots (1: connected / 0: disconnected). All 0 in diagonal.\n
N: a 1-D numpy array of sequencing depth of all spots (length #spots). If it's None, use sum of observed marker gene expressions as sequencing depth.\n
non_zero_mtx: If it's None, then do not filter zeros during regression. If it's a bool 2-D numpy matrix (spots * genes) as False means genes whose nUMI=0 while True means genes whose nUMI>0 in corresponding spots. The bool indicators can be calculated based on either observerd raw nUMI counts in spatial data, or CVAE transformed nUMI counts.\n
spot_names: a list of string of spot barcodes. Only keep spots passed filtering.\n
gene_names: a list of string of gene symbols. Only keep actually used marker gene symbols.\n
celltype_names: a list of string of celltype names.\n
initial_guess: initial guess of cell-type proportions of spatial spots.
gamma_g : 1-D numpy array
gene-specific platform effect for all genes.
global_optimize : bool, optional
if is True, use basin-hopping algorithm to find the global minimum. The default is False.
hybrid_version : bool, optional
if True, use the hybrid_version of GLRM, i.e. in ADMM local model loss function optimization for w but adaptive lasso constrain on theta. If False, local model loss function optimization and adaptive lasso will on the same w. The default is True.
opt_method : string, optional
specify method used in scipy.optimize.minimize for local model fitting. The default is 'L-BFGS-B', a default method in scipy for optimization with bounds. Another choice would be 'SLSQP', a default method in scipy for optimization with constrains and bounds.
verbose : bool, optional
if True, print more information in program running.
use_initial_guess : bool, optional
if True, use initial guess instead of uniform distribution for theta initialization.
Returns
-------
tuple
a tuple of estimated theta, e_alpha and sigma2:
theta : celltype proportions (3-D numpy array (spots * celltypes * 1))\n
e_alpha : spot-specific effect (1-D array with length #spot)\n
sigma2 : variance paramter of the lognormal distribution (float)
'''
start_time = time()
n_celltype, n_gene = data['X'].shape
n_spot = data['Y'].shape[0]
if verbose:
print('\nGLRM model initialization...')
# Initialization
if use_initial_guess and (data['initial_guess'] is not None):
print('[HIGHLIGHT] use initial guess derived from CVAE rather than uniform distribution for theta initialization')
# NOTE: some spots may be excluded in filtering
assert data['initial_guess'].columns.to_list() == data['celltype_names']
assert data['initial_guess'].shape[0] >= len(data['spot_names'])
if data['initial_guess'].shape[0] == len(data['spot_names']):
assert data['initial_guess'].index.to_list() == data['spot_names']
theta = data['initial_guess'].values
else:
# use included spots only
theta = data['initial_guess'].loc[data['spot_names']].values
# note the shape is (#spots, #cell-types, 1)
# use np.newaxis to add a new third dimension
theta = theta[:, :, np.newaxis]
else:
theta = np.full((n_spot, n_celltype, 1), 1.0/n_celltype)
e_alpha = np.full((n_spot,), 1.0)
sigma2 = 1.0
if gamma_g is None:
gamma_g = np.zeros((n_gene,))
# initialize x array for calculation of heavy-tail
from local_fit_numba import z_hv, generate_log_heavytail_array
# initialize density values of heavy-tail with initial sigma^2
log_p_hv = generate_log_heavytail_array(z_hv, np.sqrt(sigma2))
if global_optimize:
if verbose:
print('[CAUTION] global optimization turned on!')
if verbose:
print('calculate MLE theta and sigma^2 iteratively...')
if verbose:
print(f'{"iter" : >6} | {"time_opt": >8} | {"time_sig": >8} | {"sigma2": >10}')
t = 0
pre_sigma2 = sigma2
while True:
# update theta and e_alpha
tmp_start_in_loop = time()
theta, e_alpha = update_theta(data, theta, e_alpha, gamma_g.copy(), sigma2,
global_optimize=global_optimize, hybrid_version=hybrid_version, opt_method=opt_method,
hv_x=z_hv, hv_log_p=log_p_hv)
time_local_opt = time() - tmp_start_in_loop
# update sigma^2
tmp_start_in_loop = time()
sigma2 = update_sigma2(data, theta, e_alpha, gamma_g.copy(), sigma2,
opt_method=opt_method, global_optimize=global_optimize,
hv_x=z_hv)
# update density values of heavy-tail with current sigma^2
log_p_hv = generate_log_heavytail_array(z_hv, np.sqrt(sigma2))
time_sigma2 = time() - tmp_start_in_loop
if verbose:
print(f'{t : >6} | {time_local_opt:8.3f} | {time_sigma2:8.3f} | {sigma2:10.6f}')
if abs(sigma2 - pre_sigma2) <= 1e-6:
break
else:
t += 1
pre_sigma2 = sigma2
if verbose:
print(f'MLE theta and sigma^2 calculation finished. Elapsed time: {(time()-start_time)/60.0:.2f} minutes.')
return theta, e_alpha, sigma2
[docs]
def estimating_gamma_g(data, hybrid_version=True, opt_method='L-BFGS-B', verbose=False):
"""
estimate platform effect gamma_g by fit model on pseudo-bulk measurement
Parameters
----------
data : Dict
a Dict contains all info need for modeling:
X: a 2-D numpy matrix of celltype specific marker gene expression (celltypes * genes).\n
Y: a 2-D numpy matrix of spatial gene expression (spots * genes).\n
A: a 2-D numpy matrix of Adjacency matrix (spots * spots), or is None. Adjacency matrix of spatial sptots (1: connected / 0: disconnected). All 0 in diagonal.\n
N: a 1-D numpy array of sequencing depth of all spots (length #spots). If it's None, use sum of observed marker gene expressions as sequencing depth.\n
non_zero_mtx: If it's None, then do not filter zeros during regression. If it's a bool 2-D numpy matrix (spots * genes) as False means genes whose nUMI=0 while True means genes whose nUMI>0 in corresponding spots. The bool indicators can be calculated based on either observerd raw nUMI counts in spatial data, or CVAE transformed nUMI counts.\n
spot_names: a list of string of spot barcodes. Only keep spots passed filtering.\n
gene_names: a list of string of gene symbols. Only keep actually used marker gene symbols.\n
celltype_names: a list of string of celltype names.\n
initial_guess: initial guess of cell-type proportions of spatial spots.
hybrid_version : bool, optional
if True, use the hybrid_version of GLRM, i.e. in ADMM local model loss function optimization for w but adaptive lasso constrain on theta. If False, local model loss function optimization and adaptive lasso will on the same w. The default is True.
opt_method : string, optional
specify method used in scipy.optimize.minimize for local model fitting. The default is 'L-BFGS-B', a default method in scipy for optimization with bounds. Another choice would be 'SLSQP', a default method in scipy for optimization with constrains and bounds.
verbose : bool, optional
if True, print more information in program running
Returns
-------
1-D numpy array (#genes)
estimated gamma_g
"""
start_time = time()
if verbose:
print(f'estimate platform effect gammg_g by {opt_method} and basinhopping')
# combine all spots into one spots then build a new data dict
tmp_data = {'X': data['X'].copy(),
'Y': np.sum(data['Y'], axis=0, keepdims=True)}
# sequencing depth is sum across all spots and genes
tmp_data['N'] = np.array([np.sum(data['N'])])
# genes with nUMI>0
if data['non_zero_mtx'] is None:
tmp_data['non_zero_mtx'] = None
else:
tmp_data['non_zero_mtx'] = np.sum(data['non_zero_mtx'], axis=0, keepdims=True) > 0
tmp_data['spot_names'] = ['comb_spot'] # only one spot
# fit base model
theta, e_alpha, _ = fit_base_model(tmp_data, global_optimize=True, hybrid_version=hybrid_version, opt_method=opt_method, verbose=verbose, use_initial_guess=False)
assert(len(e_alpha)==1)
# calculate gamma_g
# note theta is 3-Dimensional
w = np.squeeze(theta) * e_alpha[0]
gamma_g = np.log(np.squeeze(tmp_data['Y'])) - np.log(np.sum(tmp_data['Y']) * w@tmp_data['X'] + min_val)
print(f'gamma_g estimation finished. Elapsed time: {(time()-start_time)/60.0:.2f} minutes.')
return gamma_g
[docs]
def fit_stage1_spotwise_lambda_r(data, warm_start_theta, warm_start_e_alpha, gamma_g, sigma2, lambda_r, lasso_weight, global_optimize=False, hybrid_version=True, opt_method='L-BFGS-B', hv_x=None, hv_log_p=None, verbose=False, diagnosis=False):
'''
update theta (celltype proportion) and e_alpha (spot-specific effect) given sigma2 (variance paramter of the log-normal distribution) and gamma_g (gene-specific platform effect) by MLE
UPDATE: here we assume the hyperparameter lambda_r for Adaptive Lasso is spot-wise, i.e. for each spot, we search for the optimal lambda_r, then fit the model
UPDATE2: we apply Nested Subset Selection Strategy, thus no lambda_r anymore
we assume
ln(lambda) = alpha + gamma_g + ln(sum(theta*mu_X)) + epsilon
subject to sum(theta)=1, theta>=0
mu_X is marker genes from data['X']
then the mean parameter of the lognormal distribution of ln(lambda) is alpha + gamma_g + ln(sum(theta*mu_X))
we did re-parametrization w = e^alpha * theta, then
ln(lambda) = gamma_g + ln(sum([e^alpha*theta]*mu_X)) + epsilon
subject to w>=0, it will imply sum(theta)=1 and theta>=0
the steps to update theta and e_alpha:
1. dimension change of theta from 3-D (spots * celltypes * 1) to 1-D (celltypes), and do re-parametrization to get w
2. solve w for each spot in parallel
3. extract updated theta and e_alpha from w, and change the dimension of updated theta back
Parameters
----------
data : Dict
a Dict contains all info need for modeling:
X: a 2-D numpy matrix of celltype specific marker gene expression (celltypes * genes).\n
Y: a 2-D numpy matrix of spatial gene expression (spots * genes).\n
A: a 2-D numpy matrix of Adjacency matrix (spots * spots), or is None. Adjacency matrix of spatial sptots (1: connected / 0: disconnected). All 0 in diagonal.\n
N: a 1-D numpy array of sequencing depth of all spots (length #spots). If it's None, use sum of observed marker gene expressions as sequencing depth.\n
non_zero_mtx: If it's None, then do not filter zeros during regression. If it's a bool 2-D numpy matrix (spots * genes) as False means genes whose nUMI=0 while True means genes whose nUMI>0 in corresponding spots. The bool indicators can be calculated based on either observerd raw nUMI counts in spatial data, or CVAE transformed nUMI counts.\n
spot_names: a list of string of spot barcodes. Only keep spots passed filtering.\n
gene_names: a list of string of gene symbols. Only keep actually used marker genes.\n
celltype_names: a list of string of celltype names.
warm_start_theta : 3-D numpy array (spots * celltypes * 1)
initial guess of theta (celltype proportion).
warm_start_e_alpha : 1-D numpy array
initial guess of e_alpha (spot-specific effect).
gamma_g : 1-D numpy array
gene-specific platform effect for all genes.
sigma2 : float
variance paramter of the lognormal distribution of ln(lambda). All gene share the same variance.
lambda_r : float or 1-D numpy array
parameter for the strength of Adaptive Lasso loss to shrink theta
lasso_weight : 3-D numpy array (spots * celltypes * 1)
weight of Adaptive Lasso, 1 ./ theta
global_optimize : bool, optional
if is True, use basin-hopping algorithm to find the global minimum. The default is False.
hybrid_version : bool, optional
if True, use the hybrid_version of GLRM, i.e. in ADMM local model loss function optimization for w but adaptive lasso constrain on theta. If False, local model loss function optimization and adaptive lasso will on the same w. The default is True.
opt_method : string, optional
specify method used in scipy.optimize.minimize for local model fitting. The default is 'L-BFGS-B', a default method in scipy for optimization with bounds. Another choice would be 'SLSQP', a default method in scipy for optimization with constrains and bounds.
hv_x : 1-D numpy array, optional
data points served as x for calculation of probability density values. Only used for heavy-tail.
hv_log_p : 1-D numpy array, optional
log density values of normal distribution N(0, sigma^2) + heavy-tail. Only used for heavy-tail.
verbose : bool, optional
if True, print more information.
diagnosis : bool, optional
if True print more information for diagnosis of hyper-parameter selection.
Returns
-------
theta_results : 3-D numpy array (spots * celltypes * 1)
updated theta (celltype proportion).
e_alpha_results : 1-D numpy array
updated e_alpha (spot-specific effect).
'''
assert lasso_weight is not None
n_celltype = data["X"].shape[0]
n_spot = data["Y"].shape[0]
# strategy includes AIC, BIC or cross-validation for lambda_r tunning, or directly subset selection by AIC or BIC
strategy = 'AIC'
if strategy == 'BIC':
print('[HIGHLIGHT] use BIC to select hyperparameter lambda_r in adaptive Lasso')
use_AIC = False
elif strategy == 'AIC':
print('[HIGHLIGHT] use AIC to select hyperparameter lambda_r in adaptive Lasso')
use_AIC = True
elif strategy == 'CV':
print('[HIGHLIGHT] use cross-validation to select hyperparameter lambda_r in adaptive Lasso')
elif strategy == 'subset-BIC':
print('[HIGHLIGHT] use Nested Subset Selection Strategy with BIC for presented cell type identification')
use_AIC = False
elif strategy == 'subset-AIC':
print('[HIGHLIGHT] use Nested Subset Selection Strategy with AIC for presented cell type identification')
use_AIC = True
else:
raise Exception(f'Unknown strategy in Stage 1: {strategy}!')
# NOTE input warm_start_theta, warm_start_e_alpha will NOT be changed inside the function
# prepare parameter tuples for parallel computing
theta_results = []
e_alpha_results = []
if isinstance(lambda_r, list) and diagnosis:
lambda_r_list = []
prev_percent = -1
for i in range(n_spot):
progress = int(i / n_spot * 100) # 0–100%
# print only when hitting multiples of 10
if progress // 10 > prev_percent:
prev_percent = progress // 10
print(f"{progress}%...", end='')
this_spot_name = data["spot_names"][i]
this_warm_start_theta = warm_start_theta[i, :, :].copy().flatten()
this_warm_start_e_alpha = warm_start_e_alpha[i]
y_vec = data["Y"][i, :].copy()
mu = data["X"].copy()
lasso_weight_vec = lasso_weight[i, :, :].copy().flatten()
if data["N"] is None:
N = None
else:
N = data["N"][i]
# filter zero genes
if data['non_zero_mtx'] is None:
this_y_vec = y_vec
this_gamma_g = gamma_g.copy()
this_mu = mu
else:
non_zero_gene_ind = data['non_zero_mtx'][i, :]
#print(f'total {np.sum(non_zero_gene_ind)} non-zero genes ({np.sum(non_zero_gene_ind)/len(non_zero_gene_ind):.2%}) for spot {this_spot_name}')
this_y_vec = y_vec[non_zero_gene_ind]
this_gamma_g = gamma_g[non_zero_gene_ind].copy()
this_mu = mu[:, non_zero_gene_ind]
this_theta_result = None
this_e_alpha_result = None
# first determine whether need to tune hyperparameter lambda_r
if isinstance(lambda_r, list):
if verbose:
print(f'{i}th spot {this_spot_name} hyper-parameter for Adaptive Lasso: use {strategy} criteria to find the optimal value from {len(lambda_r)} candidates...')
if (strategy == 'BIC') or (strategy == 'AIC'):
# NOTE returned theta NOT w
best_lambda_r, tmp_theta = BIC_find_lambda_r_one_spot(this_mu.copy(), this_y_vec.copy(), N, this_spot_name, this_warm_start_theta.copy(), this_warm_start_e_alpha, this_gamma_g, sigma2, lasso_weight_vec.copy(), lambda_r, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=hv_x, hv_log_p=hv_log_p, use_AIC=use_AIC, reinitialize_theta=False, verbose=verbose)
if isinstance(lambda_r, list) and diagnosis:
lambda_r_list.append({'spot': this_spot_name, 'optimal_lambda_r': best_lambda_r})
# get non-zero cell types then refit; NOTE we set a min_theta in optimization to avoid theta as 0
# and before return result, we already set them back to 0
# binary theta by threshold to get a mask (1: present, 0: not present)
tmp_theta_mask = np.zeros(tmp_theta.shape, dtype='int')
tmp_theta_mask[tmp_theta > 0] = 1
# refit
this_w = optimize_one_theta(this_mu.copy(), this_y_vec.copy(), N, this_warm_start_theta.copy(), this_warm_start_e_alpha, this_gamma_g, sigma2, this_spot_name, nu_vec=None, rho=None, lambda_r=None, lasso_weight_vec=None, lambda_l2=None, global_optimize=global_optimize, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=hv_x, hv_log_p=hv_log_p, this_theta_mask=tmp_theta_mask, skip_opt=True, verbose=False)
# get theta and e_alpha
this_e_alpha_result = np.sum(this_w)
this_theta_result = this_w / this_e_alpha_result
elif (strategy == 'subset-BIC') or (strategy == 'subset-AIC'):
# start Nested Subset Selection Strategy
# NOTE returned results are theta and e_alpha separately
this_theta_result, this_e_alpha_result = BIC_find_theta_subset_one_spot(this_mu.copy(), this_y_vec.copy(), N, this_spot_name, this_warm_start_theta.copy(), this_warm_start_e_alpha, this_gamma_g, sigma2, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=hv_x, hv_log_p=hv_log_p, use_AIC=use_AIC, reinitialize_theta=False, verbose=verbose)
elif strategy == 'CV':
# construct data dict with only one spot
if data['non_zero_mtx'] is None:
this_spot_nonzero_mtx = None
else:
this_spot_nonzero_mtx = data['non_zero_mtx'][i, :]
this_spot_nonzero_mtx = this_spot_nonzero_mtx.reshape((1, len(this_spot_nonzero_mtx)))
onespot_data = {'X': this_mu,
'Y': this_y_vec.reshape((1, len(this_y_vec))),
'N': np.array([N]),
'non_zero_mtx': this_spot_nonzero_mtx,
'spot_names': [this_spot_name]
}
reshaped_theta = this_warm_start_theta.reshape((1, n_celltype, 1))
reshaped_e_alpha = np.array([this_warm_start_e_alpha])
reshaped_lasso_weight = lasso_weight_vec.reshape((1, n_celltype, 1))
best_lambda_r = cv_find_lambda_r(onespot_data, reshaped_theta.copy(),
reshaped_e_alpha.copy(), gamma_g.copy(), sigma2,
reshaped_lasso_weight.copy(), lambda_r,
hybrid_version=hybrid_version, opt_method=opt_method,
hv_x=hv_x, hv_log_p=hv_log_p,
use_admm=False, use_likelihood=True,
k=5, diagnosis=False, verbose=verbose)
if isinstance(lambda_r, list) and diagnosis:
lambda_r_list.append({'spot': this_spot_name, 'optimal_lambda_r': best_lambda_r})
# fit with best_lambda_r
tmp_w = optimize_one_theta(this_mu.copy(), this_y_vec.copy(), N, this_warm_start_theta.copy(), this_warm_start_e_alpha, this_gamma_g, sigma2, this_spot_name, nu_vec=None, rho=None, lambda_r=best_lambda_r, lasso_weight_vec=lasso_weight_vec.copy(), lambda_l2=None, global_optimize=global_optimize, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=hv_x, hv_log_p=hv_log_p, this_theta_mask=None, skip_opt=True, verbose=False)
# get theta and e_alpha
tmp_theta = tmp_w / np.sum(tmp_w)
# get non-zero cell types then refit; NOTE we set a min_theta in optimization to avoid theta as 0
# and before return result, we already set them back to 0
# binary theta by threshold to get a mask (1: present, 0: not present)
tmp_theta_mask = np.zeros(tmp_theta.shape, dtype='int')
tmp_theta_mask[tmp_theta > 0] = 1
# refit
this_w = optimize_one_theta(this_mu.copy(), this_y_vec.copy(), N, this_warm_start_theta.copy(), this_warm_start_e_alpha, this_gamma_g, sigma2, this_spot_name, nu_vec=None, rho=None, lambda_r=None, lasso_weight_vec=None, lambda_l2=None, global_optimize=global_optimize, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=hv_x, hv_log_p=hv_log_p, this_theta_mask=tmp_theta_mask, skip_opt=True, verbose=False)
# get theta and e_alpha
this_e_alpha_result = np.sum(this_w)
this_theta_result = this_w / this_e_alpha_result
else:
# directly run optimization with specified lambda_r
tmp_w = optimize_one_theta(this_mu.copy(), this_y_vec.copy(), N, this_warm_start_theta.copy(), this_warm_start_e_alpha, this_gamma_g, sigma2, this_spot_name, nu_vec=None, rho=None, lambda_r=lambda_r, lasso_weight_vec=lasso_weight_vec.copy(), lambda_l2=None, global_optimize=global_optimize, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=hv_x, hv_log_p=hv_log_p, this_theta_mask=None, skip_opt=True, verbose=False)
# get theta and e_alpha
tmp_theta = tmp_w / np.sum(tmp_w)
# get non-zero cell types then refit; NOTE we set a min_theta in optimization to avoid theta as 0
# and before return result, we already set them back to 0
# binary theta by threshold to get a mask (1: present, 0: not present)
tmp_theta_mask = np.zeros(tmp_theta.shape, dtype='int')
tmp_theta_mask[tmp_theta > 0] = 1
this_w = optimize_one_theta(this_mu.copy(), this_y_vec.copy(), N, this_warm_start_theta.copy(), this_warm_start_e_alpha, this_gamma_g, sigma2, this_spot_name, nu_vec=None, rho=None, lambda_r=None, lasso_weight_vec=None, lambda_l2=None, global_optimize=global_optimize, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=hv_x, hv_log_p=hv_log_p, this_theta_mask=tmp_theta_mask, skip_opt=True, verbose=False)
# get theta and e_alpha
this_e_alpha_result = np.sum(this_w)
this_theta_result = this_w / this_e_alpha_result
# collect results from each spot
assert this_theta_result is not None
assert this_e_alpha_result is not None
theta_results.append(this_theta_result)
e_alpha_results.append(this_e_alpha_result)
print('100%\n')
# change theta dimension back to 3D
theta_results_return = np.zeros((n_spot, n_celltype, 1))
for i, this_result in enumerate(theta_results):
theta_results_return[i, :, :] = np.reshape(theta_results[i], (n_celltype, 1))
e_alpha_results = np.array(e_alpha_results)
if strategy in ['BIC', 'AIC', 'CV']:
if isinstance(lambda_r, list) and diagnosis:
# output optimal lambda_r
import pandas as pd
import os
from config import diagnosis_path
# need to create subfolders first, otherwise got FileNotFoundError
os.makedirs(os.path.join(diagnosis_path, 'GLRM_params_tuning'), exist_ok=True)
tmp_df = pd.DataFrame(lambda_r_list)
tmp_df.to_csv(os.path.join(diagnosis_path, 'GLRM_params_tuning', 'optimal_lambda_r.csv.gz'), index=False, compression='gzip')
# save optimal lambda_r histogram
from diagnosis_plots import diagnosisParamsSpotwiseLambdarTuning
diagnosisParamsSpotwiseLambdarTuning(tmp_df['optimal_lambda_r'].to_list(), strategy)
return theta_results_return, e_alpha_results
[docs]
def fit_model_two_stage(data, gamma_g=None, lambda_r=None, lambda_g=None, global_optimize=False, hybrid_version=True, opt_method='L-BFGS-B', verbose=False, diagnosis=False, use_admm=False):
"""
fit Graph Laplacian Regularized Stratified Model (GLRM) in a two-stage way
Parameters
----------
data : Dict
a Dict contains all info need for modeling:
X: a 2-D numpy matrix of celltype specific marker gene expression (celltypes * genes).\n
Y: a 2-D numpy matrix of spatial gene expression (spots * genes).\n
A: a 2-D numpy matrix of Adjacency matrix (spots * spots), or is None. Adjacency matrix of spatial sptots (1: connected / 0: disconnected). All 0 in diagonal.\n
N: a 1-D numpy array of sequencing depth of all spots (length #spots). If it's None, use sum of observed marker gene expressions as sequencing depth.\n
non_zero_mtx: If it's None, then do not filter zeros during regression. If it's a bool 2-D numpy matrix (spots * genes) as False means genes whose nUMI=0 while True means genes whose nUMI>0 in corresponding spots. The bool indicators can be calculated based on either observerd raw nUMI counts in spatial data, or CVAE transformed nUMI counts.\n
spot_names: a list of string of spot barcodes. Only keep spots passed filtering.\n
gene_names: a list of string of gene symbols. Only keep actually used marker genes.\n
celltype_names: a list of string of celltype names.\n
initial_guess: initial guess of cell-type proportions of spatial spots.
gamma_g : 1-D numpy array
gene-specific platform effect for all genes.
lambda_r : float
strength for Adaptive Lasso penalty. If None, use cross-validation to determine optimal lambda_r
lambda_g : float
edge weight for graph, and will affect the strength of Graph Laplacian constrain. If None, use cross-validation to determine optimal graph_weight.
global_optimize : bool, optional
if is True, use basin-hopping algorithm to find the global minimum. The default is False.
hybrid_version : bool, optional
if True, use the hybrid_version of GLRM, i.e. in ADMM local model loss function optimization for w but adaptive lasso constrain on theta. If False, local model loss function optimization and adaptive lasso will on the same w. The default is True.
opt_method : string, optional
specify method used in scipy.optimize.minimize for local model fitting. The default is '', a default method in scipy for optimization with bounds. Another choice would be 'SLSQP', a default method in scipy for optimization with constrains and bounds.
verbose : bool, optional
if True, print more information in program running.
diagnosis : bool, optional
if True save more information to files for diagnosis CVAE and hyper-parameter selection.
use_admm : bool, optinal
if True, use ADMM framework in optimization, default is False.
Returns
-------
Dict
estimated model coefficients, including:
theta : celltype proportions (#spots * #celltypes * 1)\n
e_alpha : spot-specific effect (1-D array with length #spot)\n
sigma2 : variance paramter of the lognormal distribution (float)\n
gamma_g : gene-specific platform effect for all genes (1-D array with length #gene)
"""
print('\n\nStart GLRM fitting...\n')
start_time = time()
n_celltype, n_gene = data['X'].shape
n_spot = data['Y'].shape[0]
print('first estimate MLE theta and corresponding e^alpha and sigma^2...')
tmp_start_time = time()
theta, e_alpha, sigma2 = fit_base_model(data, gamma_g=gamma_g, global_optimize=global_optimize, hybrid_version=hybrid_version, opt_method=opt_method, verbose=True, use_initial_guess=True)
print(f'MLE theta estimation finished. Elapsed time: {(time()-tmp_start_time)/60.0:.2f} minutes.')
# initialize x array for calculation of heavy-tail
from local_fit_numba import z_hv, generate_log_heavytail_array
# initialize density values of heavy-tail with initial sigma^2
log_p_hv = generate_log_heavytail_array(z_hv, np.sqrt(sigma2))
# calculate the weight for adaptive lasso based on MLE theta
print('\ncalculate weights of Adaptive Lasso...')
weight_threshold = 1e-3
print(f'clip max weight to {1/weight_threshold:.0f} to avoid huge weights derived from tiny MLE theta')
tmp_theta = theta.copy()
# also avoid to divided by 0
if hybrid_version:
# adaptive lasso constrain on theta
tmp_theta[tmp_theta<weight_threshold] = weight_threshold
lasso_weight = 1.0 / tmp_theta
else:
# adaptive lasso constrain on w=theta*e^alpha
tmp_w = reparameterTheta(tmp_theta, e_alpha)
tmp_w[tmp_w<weight_threshold] = weight_threshold
lasso_weight = 1.0 / tmp_w
del tmp_theta
# stage 1
tmp_start_time = time()
print('\nStage 1: variable selection using Adaptive Lasso starts with the MLE theta and e^alpha, using already estimated sigma^2 and gamma_g...')
spotwise_lambda_r = False
if spotwise_lambda_r:
print('\n[HIGHLIGHT] use spot-wise hyper-parameter lambda_r instead of an overall lambda_r value for all spots\n')
print(f'specified hyper-parameter for Adaptive Lasso is: {lambda_r}\n')
if lambda_r == 0:
print('[WARNING] User specified lambda_r as 0, meaning disable Adaptive Lasso. Stage 1 will be skiped\n')
# directly copy MLE estimates to Stage 1 results
stage1_result = {}
stage1_result['theta'] = theta.copy()
stage1_result['e_alpha'] = e_alpha.copy()
elif isinstance(lambda_r, list):
# we need tune hpyerparameter lambda_r
if spotwise_lambda_r:
print(f'Start tuning spot-wise hyper-parameter for Adaptive Lasso for each spot separately: find the optimal value from {len(lambda_r)} candidates...')
stage1_result = {}
stage1_result['theta'], stage1_result['e_alpha'] = fit_stage1_spotwise_lambda_r(data, theta.copy(), e_alpha.copy(), gamma_g.copy(), sigma2, lambda_r, lasso_weight.copy(), global_optimize=False, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=z_hv, hv_log_p=log_p_hv, verbose=True, diagnosis=diagnosis)
else:
print(f'hyper-parameter for Adaptive Lasso: use cross-validation to find the an overall optimal value for all spots from {len(lambda_r)} candidates...')
best_lambda_r = cv_find_lambda_r(data, theta.copy(), e_alpha.copy(), gamma_g.copy(), sigma2, lasso_weight, lambda_r, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=z_hv, hv_log_p=log_p_hv, use_admm=False, use_likelihood=True, diagnosis=diagnosis)
print(f'\nStart optimization with found optimal lambda_r: {best_lambda_r}\n')
if use_admm:
# stage 1 with determined lambda_r, but use ADMM framework
# Laplacian Matrix is all 0 in stage 1
# transform it to a scipy sparse matrix to be consistent with Laplacian Matrix derived from graph object
L = sparse.csr_matrix(np.zeros((n_spot, n_spot)))
stage1_result = one_admm_fit(data, L, theta.copy(), e_alpha.copy(), gamma_g.copy(),
sigma2, lambda_r=best_lambda_r, lasso_weight=lasso_weight.copy(),
hv_x=z_hv, hv_log_p=log_p_hv, theta_mask=None,
opt_method=opt_method, global_optimize=global_optimize,
hybrid_version=hybrid_version, verbose=verbose)
else:
# stage 1 with determined lambda_r, use base model loss + Adaptive Lasso loss
stage1_result = {}
stage1_result['theta'], stage1_result['e_alpha'] = update_theta(data, theta.copy(),
e_alpha.copy(), gamma_g.copy(), sigma2,
lambda_r=best_lambda_r, lasso_weight=lasso_weight.copy(),
hv_x=z_hv, hv_log_p=log_p_hv, theta_mask=None,
global_optimize=global_optimize, hybrid_version=hybrid_version,
opt_method=opt_method, verbose=verbose)
else:
print(f'\nStart optimization with user specified optimal lambda_r: {lambda_r}\n')
if spotwise_lambda_r:
stage1_result = {}
stage1_result['theta'], stage1_result['e_alpha'] = fit_stage1_spotwise_lambda_r(data, theta.copy(), e_alpha.copy(), gamma_g.copy(), sigma2, lambda_r, lasso_weight.copy(), global_optimize=False, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=z_hv, hv_log_p=log_p_hv, verbose=False, diagnosis=diagnosis)
else:
if use_admm:
# stage 1 with determined lambda_r, but use ADMM framework
# Laplacian Matrix is all 0 in stage 1
# transform it to a scipy sparse matrix to be consistent with Laplacian Matrix derived from graph object
L = sparse.csr_matrix(np.zeros((n_spot, n_spot)))
stage1_result = one_admm_fit(data, L, theta.copy(), e_alpha.copy(), gamma_g.copy(),
sigma2, lambda_r=lambda_r, lasso_weight=lasso_weight.copy(),
hv_x=z_hv, hv_log_p=log_p_hv, theta_mask=None,
opt_method=opt_method, global_optimize=global_optimize,
hybrid_version=hybrid_version, verbose=verbose)
else:
# stage 1 with determined lambda_r, use base model loss + Adaptive Lasso loss
# compared with spotwise-related function, just without re-fit step
stage1_result = {}
stage1_result['theta'], stage1_result['e_alpha'] = update_theta(data, theta.copy(),
e_alpha.copy(), gamma_g.copy(), sigma2,
lambda_r=lambda_r, lasso_weight=lasso_weight.copy(),
hv_x=z_hv, hv_log_p=log_p_hv, theta_mask=None,
global_optimize=global_optimize, hybrid_version=hybrid_version,
opt_method=opt_method, verbose=verbose)
print(f'Stage 1 variable selection finished. Elapsed time: {(time()-tmp_start_time)/60.0:.2f} minutes.')
theta = stage1_result['theta']
# binary theta by threshold to get a mask (1: present, 0: not present)
theta_mask = np.zeros(theta.shape, dtype='int')
# NOTE we already re-set w values at boundary to 0; so small theta values will be directly 0
theta_mask[theta > 0] = 1
# stage 2
tmp_start_time = time()
print('\nStage 2: final theta estimation with Graph Laplacian Constrain using already estimated sigma^2 and gamma_g')
# re-initialize theta and e_alpha for only present cell-types
# update: reuse the result from stage 1
'''
theta = np.zeros(theta.shape)
for i in range(n_spot):
theta[i, theta_mask[i,:,:]==1] = 1.0/np.sum(theta_mask[i,:,:])
e_alpha = np.full((n_spot,), 1.0)
'''
# NO need to do it
'''
# set small values to 0
# note theta shape is (#spots * #celltypes * 1)
theta[theta < weight_threshold] = 0
tmp_row_sums = theta.sum(axis=1) # Sum along the second axis, got matrix (#spots * 1)
theta = theta / tmp_row_sums[:, np.newaxis] # broadcasting row sum to (#spots * 1 * 1) then performs element-wise division
assert theta.shape == theta_mask.shape
for i in range(theta.shape[0]):
assert abs(np.sum(theta[i,:,:]) - 1) < 1e-8
'''
# reuse e_alpha, as it related to spot not cell-types
e_alpha = stage1_result['e_alpha']
print('[HIGHLIGHT] reuse estimated theta and e^alpha in stage 1 as initial value')
print(f'specified hyper-parameter for Graph Laplacian Constrain is: {lambda_g}\n')
if data['A'] is None:
print('[WARNING] No Adjacency Matrix of spots specified! Skip stage 2')
result = {}
result['theta'] = theta.copy()
result['e_alpha'] = e_alpha.copy()
elif lambda_g == 0:
print('[WARNING] User specified lambda_g as 0, meaning disable graph Laplacian. Stage 2 will be skiped')
result = {}
result['theta'] = theta.copy()
result['e_alpha'] = e_alpha.copy()
else:
# considering Laplacian Constrain
# manually calculate Laplacian matrix L = D - W
def calcLaplacian(W):
# consistent with nx.laplacian_matrix(G), which return a SciPy sparse matrix
deg = W.sum(axis=1) # degree (row-sum of weights)
D = np.diag(deg)
return D - W # dense Laplacian
# transform it to a scipy sparse matrix to be consistent with Laplacian Matrix derived from graph object
L = sparse.csr_matrix(calcLaplacian(data['A']))
if isinstance(lambda_g, list):
print(f'hyper-parameter for Graph Laplacian Constrain: use cross-validation to find the optimal value from {len(lambda_g)} candidates...')
# NOTE to use deep copy of Laplacian matrix to avoid modify it unexpectedly
# UPDATE: NOT use ADMM for hyperparameter tuning
best_lambda_g = cv_find_lambda_g(data, L.copy(), theta.copy(), e_alpha.copy(), theta_mask, gamma_g.copy(), sigma2, lambda_g, hybrid_version=hybrid_version, opt_method=opt_method, hv_x=z_hv, hv_log_p=log_p_hv, use_admm=False, use_likelihood=True, diagnosis=diagnosis)
print(f'\nStart optimization with found optimal lambda_g: {best_lambda_g}\n')
else:
best_lambda_g = lambda_g
print(f'\nStart optimization with user specified optimal lambda_g: {best_lambda_g}\n')
# UPDATE: multiple the hyperparameter with Laplacian matrix get 𝜆L
lambda_gL = L * best_lambda_g
if use_admm:
# stage 2 ADMM iterations
result = one_admm_fit(data, lambda_gL, theta.copy(), e_alpha.copy(), gamma_g.copy(), sigma2,
lambda_r=0, lasso_weight=None,
hv_x=z_hv, hv_log_p=log_p_hv, theta_mask=theta_mask,
opt_method=opt_method, global_optimize=global_optimize,
hybrid_version=hybrid_version, verbose=verbose)
else:
result = fit_base_model_plus_laplacian(data, lambda_gL, theta.copy(), e_alpha.copy(),
gamma_g.copy(), sigma2,
lambda_r=None, lasso_weight=None,
hv_x=z_hv, hv_log_p=log_p_hv, theta_mask=theta_mask,
opt_method=opt_method, global_optimize=global_optimize,
hybrid_version=hybrid_version, verbose=verbose)
print(f'\nstage 2 finished. Elapsed time: {(time()-tmp_start_time)/60.0:.2f} minutes.\n')
print(f'GLRM fitting finished. Elapsed time: {(time()-start_time)/60.0:.2f} minutes.\n')
# add sigma2
result['sigma2'] = sigma2
return result