Source code for run_model

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 10 04:52:02 2022

@author: hill103

this script stores function to define a Graph Laplacian Regularized Stratified Model (GLRM) and call functions to fit the coefficients

It receieve the input data, build graph, build GLRM and fit coefficients
"""



import os
import numpy as np
from model_fit import fit_model_two_stage, estimating_gamma_g
from config import print



[docs] def run_GLRM(data, lambda_r=None, lambda_g=None, estimate_gamma_g=True, n_jobs=1, threshold=0, diagnosis=False, verbose=False): """ run GLRM, and fit coefficients Parameters ---------- data : Dict a Dict contains all info need for modeling: X: a 2-D numpy matrix of celltype specific marker gene expression (celltypes * genes).\n Y: a 2-D numpy matrix of spatial gene expression (spots * genes).\n A: a 2-D numpy matrix of Adjacency matrix (spots * spots), or is None. Adjacency matrix of spatial sptots (1: connected / 0: disconnected). All 0 in diagonal.\n N: a 1-D numpy array of sequencing depth of all spots (length #spots). If it's None, use sum of observed marker gene expressions as sequencing depth.\n non_zero_mtx: If it's None, then do not filter zeros during regression. If it's a bool 2-D numpy matrix (spots * genes) as False means genes whose nUMI=0 while True means genes whose nUMI>0 in corresponding spots. The bool indicators can be calculated based on either observerd raw nUMI counts in spatial data, or CVAE transformed nUMI counts.\n spot_names: a list of string of spot barcodes. Only keep spots passed filtering.\n gene_names: a list of string of gene symbols. Only keep actually used marker genes.\n celltype_names: a list of string of celltype names.\n initial_guess: initial guess of cell-type proportions of spatial spots. lambda_r : float, optional strength for Adaptive Lasso penalty. The default is None, i.e. use cross-validation to determine optimal value lambda_g : float, optional edge weight for graph, and will affect the strength of Graph Laplacian constrain. The default is None, i.e. use cross-validation to determine optimal value estimate_gamma_g : bool, optional whether to estimate gamma_g (gene-specific platform effect). The default is True. n_jobs : int, optional number of jobs to spawn. The default is 1. threshold : float, optional threshold value for hard thresholding estimated cell-type proportion theta. The default is 0, i.e. no hard thresholding. diagnosis : bool if True save more information to files for diagnosis CVAE and hyper-parameter selection verbose : bool, optional if True, print variables in each ADMM loop. The default is True. Returns ------- result : Dict estimated model coefficients, including (note theta dimension changed back to 2-D): theta : celltype proportions (#spots * #celltypes)\n e_alpha : spot-specific effect (1-D array with length #spot)\n sigma2 : variance paramter of the lognormal distribution (float)\n gamma_g : gene-specific platform effect for all genes (1-D array with length #gene) """ assert(data['Y'].shape[1] == data['X'].shape[1]) if data['A'] is not None: assert(data['A'].shape[0] == data['Y'].shape[0]) assert(data['A'].shape[1] == data['Y'].shape[0]) n_celltype, n_gene = data['X'].shape print('\n\n######### Start GLRM modeling... #########\n') print('GLRM settings:') # specify some program running parameters opt_method = 'L-BFGS-B' print('use SciPy minimize method: ', opt_method) global_optimize = False if global_optimize: print('use basin-hopping algorithm to find the global minimum') else: print('global optimization turned off, local minimum will be used in GLRM') hybrid_version = True if hybrid_version: print('use hybrid version of GLRM') else: print('[CAUTION] use w version in GLRM!') use_admm = False if use_admm: print('use ADMM optimization framework') else: print('[HIGHLIGHT] ADMM framework NOT used') import numba as na na.set_num_threads(n_jobs) print("\nNumba version:", na.__version__) try: import llvmlite print("llvmlite:", llvmlite.__version__) except Exception: print("llvmlite: <unknown>") print("THREADING_LAYER:", os.environ.get("NUMBA_THREADING_LAYER", "<auto>")) print(f'Numba detected total {na.config.NUMBA_NUM_THREADS:d} available CPU cores. Use {na.get_num_threads():d} CPU cores') # BLAS threads (oversubscription?) try: from threadpoolctl import threadpool_info for lib in threadpool_info(): name = (lib.get("internal_api","") + " " + lib.get("filename","")).lower() if "blas" in name or "openblas" in name or "mkl" in name: print("BLAS:", lib.get("filename","?"), "| threads:", lib.get("num_threads","?")) except Exception as e: print("threadpoolctl not available:", e) from utils import check_numba, numba_info numba_info() check_numba() print('\n') from local_fit_numba import z_hv print(f'use {len(z_hv):d} points to calculate the heavy-tail density') tmp_values = set(data['Y'].flatten()) print(f'total {len(tmp_values)} unique nUMIs, min: {min(tmp_values)}, max: {max(tmp_values)}') # define graph from adjacency matrix # UPDATE: we directly work on spatial weight matrix instead of graph object! if data['A'] is not None: n_spot = data['Y'].shape[0] print(f'\nSpatial Weight Matrix: {n_spot:.0f} spots; {np.count_nonzero(data["A"])/2:.0f} non-zero weights\n') # estimate gamma_g if estimate_gamma_g: print('\nEstimate gene-specific platform effect gamma_g...') gamma_g = estimating_gamma_g(data, hybrid_version=hybrid_version, opt_method=opt_method, verbose=True) else: gamma_g = np.zeros((n_gene,)) print('\nestimation of gene-specific platform effect gamma_g is skipped as already using CVAE to adjust platform effect') # start fitting model # use two-stage implement result = fit_model_two_stage(data, gamma_g=gamma_g, lambda_r=lambda_r, lambda_g=lambda_g, global_optimize=global_optimize, hybrid_version=hybrid_version, opt_method=opt_method, verbose=verbose, diagnosis=diagnosis, use_admm=use_admm) # change dimension of theta back to 2-D result['theta'] = np.squeeze(result['theta']) # post-process theta to set theta<XXXX as 0 then re-normalize remaining theta to sum to 1 print('\nPost-processing estimated cell-type proportion theta...') print(f'hard thresholding small theta values with threshold {threshold}') for i in range(result['theta'].shape[0]): if (np.all(result['theta'][i,:]<threshold)): print(f'[WARNING] Spot {i:d} all cell type proportion theta values<{threshold:.6f}! thresholding skipped') continue tmp_ind = (result['theta'][i,:]>0) & (result['theta'][i,:]<threshold) if tmp_ind.any(): print(f'set small theta values<{threshold:.6f} to 0 and renorm remaining theta for spot {i:d}') result['theta'][i, tmp_ind] = 0 result['theta'][i, :] = result['theta'][i, :] / np.sum(result['theta'][i, :]) # add gamma_g result result['gamma_g'] = gamma_g return result