#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 10 03:15:36 2022
@author: hill103
this script is the main function of the whole CVAE-GLRM pipeline
pipeline steps:
1. receive and parse the command line parameters and then pass the parameters to CVAE-GLRM model
2. pre-process
3. building CVAE (if available)
4. GLRM model fitting
5. post-process and save the results
"""
[docs]
def main():
'''
main function
Parameters
----------
None.
Returns
-------
None.
'''
from config import print, cur_version, output_path
import os
os.environ['PYTHONHASHSEED'] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import warnings
warnings.filterwarnings("ignore")
print(f'\nSDePER (Spatial Deconvolution method with Platform Effect Removal) v{cur_version}\n')
import pandas as pd
from time import time
from parse_opt import parseOpt
from preprocess import preprocess
from run_model import run_GLRM
start_time = time()
paramdict = parseOpt()
# Configure a new global `tensorflow` session for reproductivity
import tensorflow as tf
tf.keras.utils.set_random_seed(paramdict['seed'])
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=paramdict['n_cores'], inter_op_parallelism_threads=paramdict['n_cores'])
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
tf.compat.v1.keras.backend.set_session(sess)
print('\n\n######### Preprocessing... #########\n')
data = preprocess(paramdict['spatial_file'], paramdict['ref_file'], paramdict['ref_celltype_file'], paramdict['marker_file'], paramdict['A_file'], paramdict['use_cvae'], paramdict['n_hv_gene'], paramdict['n_marker_per_cmp'], paramdict['n_pseudo_spot'], paramdict['pseudo_spot_min_cell'], paramdict['pseudo_spot_max_cell'], paramdict['seq_depth_scaler'], paramdict['cvae_input_scaler'], paramdict['cvae_init_lr'], paramdict['num_hidden_layer'], paramdict['use_batch_norm'], paramdict['cvae_train_epoch'], paramdict['use_spatial_pseudo'], paramdict['redo_de'], paramdict['use_fdr'], paramdict['p_val_cutoff'], paramdict['fc_cutoff'], paramdict['pct1_cutoff'], paramdict['pct2_cutoff'], paramdict['sortby_fc'], paramdict['filter_cell'], paramdict['filter_gene'], paramdict['diagnosis'])
# whether to estimate gamma_g, if CVAE is used then disable gamma_g estimation
if paramdict['use_cvae']:
estimate_gamma_g = False
else:
estimate_gamma_g = True
# release RAM before modeling
#import gc
#import psutil
#print(f'before gc RAM usage: {psutil.Process().memory_info().rss/1024**2:.2f} MB')
#gc.collect()
#print(f'after gc RAM usage: {psutil.Process().memory_info().rss/1024**2:.2f} MB')
result = run_GLRM(data, lambda_r=paramdict['lambda_r'], lambda_g=paramdict['lambda_g'], estimate_gamma_g=estimate_gamma_g, n_jobs=paramdict['n_cores'], threshold=paramdict['threshold'], diagnosis=paramdict['diagnosis'], verbose=paramdict['verbose'])
# save result
save_all = False
if save_all:
# save all related estimations to xlsx file
output_file = os.path.join(output_path, 'celltype_proportions.xlsx')
with pd.ExcelWriter(output_file) as writer:
# theta
pd.DataFrame(result['theta'], index=data['spot_names'], columns=data['celltype_names']).to_excel(writer, sheet_name='theta')
pd.DataFrame(result['e_alpha'], index=data['spot_names'], columns=['e_alpha']).to_excel(writer, sheet_name='e_alpha')
pd.DataFrame(result['gamma_g'], index=data['gene_names'], columns=['gamma_g']).to_excel(writer, sheet_name='gamma_g')
pd.DataFrame([result['sigma2']], index=None, columns=['sigma2']).to_excel(writer, sheet_name='sigma2', index=False)
else:
# only save theta to a csv file
output_file = os.path.join(output_path, 'celltype_proportions.csv')
pd.DataFrame(result['theta'], index=data['spot_names'], columns=data['celltype_names']).to_csv(output_file)
print(f'\n\ncell type deconvolution finished. Estimate results saved in {output_file}. Elapsed time: {(time()-start_time)/3600.0:.2f} hours.')
# imputation
if paramdict['use_imputation']:
from imputation import do_imputation
print('\n\n######### Start imputation #########')
for x in paramdict['impute_diameter']:
print(f'\n\nimputation for {x} µm ...')
impute_start = time()
# we now totally discard the transforming from integer coordinates to pixels
# we use stepsize = impute_diameter / diameter inside imputation function
result = do_imputation(paramdict['loc_file'], output_file, paramdict['spatial_file'], float(x)/paramdict['diameter'], paramdict['hole_min_spots'], paramdict['preserve_shape'], diagnosis=paramdict['diagnosis'])
# return imputed spot locations, cell type proportions and gene expressions
result[0].to_csv(os.path.join(output_path, f'impute_diameter_{x}_spot_loc.csv'))
result[1].to_csv(os.path.join(output_path, f'impute_diameter_{x}_spot_celltype_prop.csv'))
result[2].to_csv(os.path.join(output_path, f'impute_diameter_{x}_spot_gene_norm_exp.csv'))
print(f'imputation for {x} µm finished. Elapsed time: {(time()-impute_start)/60.0:.2f} minutes')
else:
print('\n\n######### No imputation #########')
print(f'\n\nwhole pipeline finished. Total elapsed time: {(time()-start_time)/3600.0:.2f} hours.')
if __name__ == '__main__':
main()