Source code for cvaeglrm

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 10 03:15:36 2022

@author: hill103

this script is the main function of the whole CVAE-GLRM pipeline

pipeline steps:
    
    1. receive and parse the command line parameters and then pass the parameters to CVAE-GLRM model
    2. pre-process
    3. building CVAE (if available)
    4. GLRM model fitting
    5. post-process and save the results
"""



[docs] def main(): ''' main function Parameters ---------- None. Returns ------- None. ''' from config import print, cur_version, output_path import os os.environ['PYTHONHASHSEED'] = '0' os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' import warnings warnings.filterwarnings("ignore") print(f'\nSDePER (Spatial Deconvolution method with Platform Effect Removal) v{cur_version}\n') import pandas as pd from time import time from parse_opt import parseOpt from preprocess import preprocess from run_model import run_GLRM start_time = time() paramdict = parseOpt() # Configure a new global `tensorflow` session for reproductivity import tensorflow as tf tf.keras.utils.set_random_seed(paramdict['seed']) tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=paramdict['n_cores'], inter_op_parallelism_threads=paramdict['n_cores']) sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf) tf.compat.v1.keras.backend.set_session(sess) print('\n\n######### Preprocessing... #########\n') data = preprocess(paramdict['spatial_file'], paramdict['ref_file'], paramdict['ref_celltype_file'], paramdict['marker_file'], paramdict['A_file'], paramdict['use_cvae'], paramdict['n_hv_gene'], paramdict['n_marker_per_cmp'], paramdict['n_pseudo_spot'], paramdict['pseudo_spot_min_cell'], paramdict['pseudo_spot_max_cell'], paramdict['seq_depth_scaler'], paramdict['cvae_input_scaler'], paramdict['cvae_init_lr'], paramdict['num_hidden_layer'], paramdict['use_batch_norm'], paramdict['cvae_train_epoch'], paramdict['use_spatial_pseudo'], paramdict['redo_de'], paramdict['use_fdr'], paramdict['p_val_cutoff'], paramdict['fc_cutoff'], paramdict['pct1_cutoff'], paramdict['pct2_cutoff'], paramdict['sortby_fc'], paramdict['filter_cell'], paramdict['filter_gene'], paramdict['diagnosis']) # whether to estimate gamma_g, if CVAE is used then disable gamma_g estimation if paramdict['use_cvae']: estimate_gamma_g = False else: estimate_gamma_g = True # release RAM before modeling #import gc #import psutil #print(f'before gc RAM usage: {psutil.Process().memory_info().rss/1024**2:.2f} MB') #gc.collect() #print(f'after gc RAM usage: {psutil.Process().memory_info().rss/1024**2:.2f} MB') result = run_GLRM(data, lambda_r=paramdict['lambda_r'], lambda_g=paramdict['lambda_g'], estimate_gamma_g=estimate_gamma_g, n_jobs=paramdict['n_cores'], threshold=paramdict['threshold'], diagnosis=paramdict['diagnosis'], verbose=paramdict['verbose']) # save result save_all = False if save_all: # save all related estimations to xlsx file output_file = os.path.join(output_path, 'celltype_proportions.xlsx') with pd.ExcelWriter(output_file) as writer: # theta pd.DataFrame(result['theta'], index=data['spot_names'], columns=data['celltype_names']).to_excel(writer, sheet_name='theta') pd.DataFrame(result['e_alpha'], index=data['spot_names'], columns=['e_alpha']).to_excel(writer, sheet_name='e_alpha') pd.DataFrame(result['gamma_g'], index=data['gene_names'], columns=['gamma_g']).to_excel(writer, sheet_name='gamma_g') pd.DataFrame([result['sigma2']], index=None, columns=['sigma2']).to_excel(writer, sheet_name='sigma2', index=False) else: # only save theta to a csv file output_file = os.path.join(output_path, 'celltype_proportions.csv') pd.DataFrame(result['theta'], index=data['spot_names'], columns=data['celltype_names']).to_csv(output_file) print(f'\n\ncell type deconvolution finished. Estimate results saved in {output_file}. Elapsed time: {(time()-start_time)/3600.0:.2f} hours.') # imputation if paramdict['use_imputation']: from imputation import do_imputation print('\n\n######### Start imputation #########') for x in paramdict['impute_diameter']: print(f'\n\nimputation for {x} µm ...') impute_start = time() # we now totally discard the transforming from integer coordinates to pixels # we use stepsize = impute_diameter / diameter inside imputation function result = do_imputation(paramdict['loc_file'], output_file, paramdict['spatial_file'], float(x)/paramdict['diameter'], paramdict['hole_min_spots'], paramdict['preserve_shape'], diagnosis=paramdict['diagnosis']) # return imputed spot locations, cell type proportions and gene expressions result[0].to_csv(os.path.join(output_path, f'impute_diameter_{x}_spot_loc.csv')) result[1].to_csv(os.path.join(output_path, f'impute_diameter_{x}_spot_celltype_prop.csv')) result[2].to_csv(os.path.join(output_path, f'impute_diameter_{x}_spot_gene_norm_exp.csv')) print(f'imputation for {x} µm finished. Elapsed time: {(time()-impute_start)/60.0:.2f} minutes') else: print('\n\n######### No imputation #########') print(f'\n\nwhole pipeline finished. Total elapsed time: {(time()-start_time)/3600.0:.2f} hours.')
if __name__ == '__main__': main()