Source code for preprocess

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 10 22:13:23 2022

@author: hill103

this script stores functions related to pre-processing including:
    
    1. read data from CSV files
    
    2. determine whether to perform DE for cell-type specific marker genes:
        if user specified marker file, then DE step will be skipped
        
    3. determine whether to build CVAE to adjust the platform effect:
        if user specified use_cvae as True and also provide original scRNA-seq data and corresponding cell-type annotation, then CVAE will be built
        
    4. filter genes and spots before GLRM modeling
    
    5. check datasets to avoid some mistakes
"""



import pandas as pd
from cvae import build_CVAE_whole
from utils import read_spatial_data, run_DE_only, read_csv_file
from config import print




[docs]
def preprocess(spatial_file, ref_file, ref_anno_file, marker_file, A_file, use_cvae, n_hv_gene, n_marker_per_cmp, n_pseudo_spot, pseudo_spot_min_cell, pseudo_spot_max_cell, seq_depth_scaler, cvae_input_scaler, cvae_init_lr, num_hidden_layer, use_batch_norm, cvae_train_epoch, use_spatial_pseudo, redo_de, use_fdr, p_val_cutoff, fc_cutoff, pct1_cutoff, pct2_cutoff, sortby_fc, filter_cell, filter_gene, diagnosis):
    '''
    preprocess files

    Parameters
    ----------
    spatial_file : string
        full path of input csv file of raw nUMI counts in spatial transcriptomic data (spots * genes).
    ref_file : string
        full path of input csv file of raw nUMI counts in scRNA-seq data (cells * genes).
    ref_anno_file : string
        full path of input csv file of cell-type annotations for all cells in scRNA-seq data.
    marker_file : string
        full path of input csv file of cell-typee marker gene expression (cell-types * genes).
    A_file : string
        full path of input csv file of Adjacency Matrix of spots in spatial transcriptomic data (spots * spots).
    use_cvae : bool
        whether to build CVAE to adjust platform effect.
    n_hv_gene : int
        number of highly variable genes for CVAE.
    n_marker_per_cmp : int
        number of TOP marker genes for each comparison in DE.
    n_pseudo_spot : int
        number of pseudo-spots.
    pseudo_spot_min_cell : int
        minimum value of cells in pseudo-spot.
    pseudo_spot_max_cell : int
        maximum value of cells in pseudo-spot.
    seq_depth_scaler : int
        a scaler of scRNA-seq sequencing depth.
    cvae_input_scaler : int
        maximum value of the scaled input for CVAE.
    cvae_init_lr : float
        initial learning rate for training CVAE.
    num_hidden_layer : int
        number of hidden layers in encoder and decoder.
    use_batch_norm : bool
        whether to use Batch Normalization.
    cvae_train_epoch : int
        max number of training epochs for the CVAE.
    use_spatial_pseudo : int
        whether to generate "pseudo-spots" in spatial condition.
    redo_de : bool
        whether to redo DE after CVAE transformation.
    use_fdr : bool
        whether to use FDR adjusted p value for filtering and sorting.
    p_val_cutoff : float
        threshold of p value (or FDR if --use_fdr is true) in marker genes filtering.
    fc_cutoff : float
        threshold of fold change (without log transform!) in marker genes filtering.
    pct1_cutoff : float
        threshold of pct.1 in marker genes filtering.
    pct2_cutoff : float
        threshold of pct.2 in marker genes filtering.
    sortby_fc : bool
        whether to sort marker genes by fold change.
    diagnosis : bool
        if True save more information to files for diagnosis CVAE and hyper-parameter selection.
    filter_cell : bool
        whether to filter cells before DE.
    filter_gene : bool
        whether to filter genes before DE.

    Returns
    -------
    data : Dict
        a Dict contains all info need for modeling:
            X: a 2-D numpy matrix of celltype specific marker gene expression (celltypes * genes).\n
            Y: a 2-D numpy matrix of spatial gene expression (spots * genes).\n
            A: a 2-D numpy matrix of Adjacency matrix (spots * spots), or is None. Adjacency matrix of spatial sptots (1: connected / 0: disconnected). All 0 in diagonal.\n
            N: a 1-D numpy array of sequencing depth of all spots (length #spots). If it's None, use sum of observed marker gene expressions as sequencing depth.\n
            non_zero_mtx: If it's None, then do not filter zeros during regression. If it's a bool 2-D numpy matrix (spots * genes) as False means genes whose nUMI=0 while True means genes whose nUMI>0 in corresponding spots. The bool indicators can be calculated based on either observerd raw nUMI counts in spatial data, or CVAE transformed nUMI counts.\n
            spot_names: a list of string of spot barcodes. Only keep spots passed filtering.\n
            gene_names: a list of string of gene symbols. Only keep actually used marker gene symbols.\n
            celltype_names: a list of string of cell-type names.\n
            initial_guess: initial guess of cell-type proportions of spatial spots.
    '''
    
    # first determine whether to build CVAE
    if use_cvae:
        if ref_file is None or ref_anno_file is None:
            raise Exception('ERROR: building CVAE requires both reference scRNA-seq data and corresponding cell-type annotation specified! But at least one of them is not specified!')
            
        print('first build CVAE...\n')
        # build CVAE, and return the data dict including transformed spatial data and reference gene expression
        spatial_df, cvae_marker_df, new_markers, cvae_pred = build_CVAE_whole(spatial_file, ref_file, ref_anno_file, marker_file, n_hv_gene, n_marker_per_cmp, n_pseudo_spot, pseudo_spot_min_cell, pseudo_spot_max_cell, seq_depth_scaler, cvae_input_scaler, cvae_init_lr, num_hidden_layer, use_batch_norm, cvae_train_epoch, use_spatial_pseudo, redo_de, use_fdr, p_val_cutoff, fc_cutoff, pct1_cutoff, pct2_cutoff, sortby_fc, diagnosis, filter_cell, filter_gene)
        
        # calculate squencing depth, sum also works on sparse dataframe
        N = spatial_df.sum(axis=1)
    
    else:
        
        print('building CVAE skipped...\n')
        
        # read spatial data
        spatial_obj, spatial_df, N = read_spatial_data(spatial_file, filter_gene)
        
        new_markers = None
        cvae_pred = None
    
        
    # use marker genes from original scRNA-seq or CVAE transformed data
    if new_markers is None:
        
        # whether to perform DE
        if marker_file is not None:
            # directly use provide marker gene expression
            # the default mangle_dupe_cols=True will handle the duplicated columns
            marker_df = read_csv_file(marker_file)
            print('user provided marker gene profile, DE will be skipped...\n')
            print(f'read {marker_df.shape[1]} marker genes from user specified marker gene file')
            
            # check whether cell name are unique
            if len(set(marker_df.index.to_list())) < marker_df.shape[0]:
                raise Exception('cell-type names in user provided marker gene profile are not unique!')
            
            # extract marker gene overlapped with spatial data
            marker_genes = sorted(list(set(spatial_df.columns) & set(marker_df.columns)))
            print(f'from user specified marker gene expression use {len(marker_genes)} marker genes overlapped with spatial data')
            # if len(marker_genes) < spatial_df.shape[1]:
            #     print(f'{spatial_df.shape[1]-len(marker_genes)} genes in overlapped gene list between spatial data but not found in user provided marker gene expression: {", ".join(set(spatial_df.columns).difference(set(marker_genes)))}\n')
            
        else:
            # perform DE, return the marker gene expression. The identified markers but not in spatial data has already been removed
            print('no marker gene profile provided. Perform DE to get cell-type marker genes on scRNA-seq data...\n')
            tmp_scrna_obj, marker_df = run_DE_only(ref_file, ref_anno_file, spatial_df.columns.tolist(), n_marker_per_cmp, use_fdr, p_val_cutoff, fc_cutoff, pct1_cutoff, pct2_cutoff, sortby_fc, diagnosis, filter_cell, filter_gene)
            # note dataframe index and columns returns a RangeIndex object rather than list
            marker_genes = marker_df.columns.to_list()
    
    else:
        
        print('use the marker genes derived from CVAE transformed scRNA-seq for downstream regression!')
        marker_genes = new_markers
    
    
    if use_cvae:
        # replace the marker gene profile with the one transformed by CVAE
        marker_df = cvae_marker_df
        
    
    nUMI_threshold = 5
    
    # exclude genes which are all zeros in spatial data
    print('\ngene filtering before modeling...')
    tmp = spatial_df[marker_genes].sum(axis=0)
    all_zero_genes = tmp[tmp < nUMI_threshold].index.to_list()
    if len(all_zero_genes) > 0:
        print(f'{len(all_zero_genes)} genes with nUMIs<{nUMI_threshold} in all spatial spots and need to be excluded')
    
        marker_genes = sorted(list(set(marker_genes) - set(all_zero_genes)))
        print(f'finally use {len(marker_genes)} genes for modeling')
    else:
        print('all genes passed filtering')
    
    # subset genes
    spatial_df = spatial_df[marker_genes]
    marker_df = marker_df[marker_genes]
    
    # reorder cell-type orders in marker gene profile for consistency
    celltype_order = sorted(marker_df.index.to_list())
    if cvae_pred is not None:
        assert celltype_order == cvae_pred.columns.to_list()
    marker_df = marker_df.loc[celltype_order, :]
    
    if A_file is None:
        A_df = None
    else:
        A_df = read_csv_file(A_file)
        
    # filter spots based on sum of nUMI
    print('\nspot filtering before modeling...')
    exclude_spots = spatial_df.index[spatial_df.sum(axis=1) < nUMI_threshold].tolist()
    if len(exclude_spots) > 0:
        print(f'{len(exclude_spots):d} spots will be excluded as sum of nUMI < {nUMI_threshold:d}')
        spatial_df.drop(exclude_spots, inplace=True)
        if A_df is not None:
            A_df.drop(exclude_spots, inplace=True)
            A_df.drop(exclude_spots, axis=1, inplace=True)
    else:
        print('all spots passed filtering')
        
        
    if diagnosis and not use_cvae:
        # plot UMAP for raw input gene expressions here
        import scanpy as sc
        from diagnosis_plots import defineColor, rawInputUMAP
        # get raw input of scRNA-seq cells
        if 'tmp_scrna_obj' not in locals():
            if ref_file is not None and ref_anno_file is not None:
                from utils import read_scRNA_data
                print('\nread reference scRNA-seq data for plot only')
                tmp_scrna_obj = read_scRNA_data(ref_file, ref_anno_file, filter_cell, filter_gene)
            else:
                tmp_scrna_obj = None
        
        if tmp_scrna_obj is not None:
            tmp_scrna_celltype = sc.get.obs_df(tmp_scrna_obj, keys='celltype').to_frame()
            tmp_scrna_celltype['celltype'] = tmp_scrna_celltype['celltype'].astype(str)
            plot_colors = defineColor(spatial_df.shape[0], tmp_scrna_celltype)
            rawInputUMAP(spatial_df, sc.get.obs_df(tmp_scrna_obj, keys=marker_genes), tmp_scrna_celltype, plot_colors)

    
    # record the zeros in spatial data if filtering zeros is turned on
    filter_zero_gene = False
    use_original_nUMI = True
    
    if filter_zero_gene:
        print('\n[CAUTION] filtering genes with nUMI=0 in spot before regression!')
        if use_original_nUMI:
            print('[CAUTION] gene nUMI=0 determined by original spatial raw nUMI counts')
            # reload the spatial data, and find out genes with nUMI>0
            non_zero_df = pd.concat(chunk.astype('Sparse[int]') for chunk in read_csv_file(spatial_file, chunksize=1e4))
            non_zero_df = non_zero_df.loc[spatial_df.index, spatial_df.columns]
            non_zero_df = non_zero_df > 0
        else:
            print('[CAUTION] gene nUMI=0 determined by current CVAE transformed nUMI counts')
            # just use the current spatial data, and find out genes with nUMI>0
            non_zero_df = spatial_df > 0
        non_zero_mtx = non_zero_df.values
    else:
        non_zero_mtx = None
    
    
    # final check to avoid mistakes

    # check the spot barcodes is the same order
    if A_df is not None:
        assert((spatial_df.index==A_df.index).all())
        assert((A_df.index==A_df.columns).all())
        
    # check the gene names is the same order
    assert((spatial_df.columns==marker_df.columns).all())
    
    if A_df is not None:
        A = A_df.values
    else:
        A = None
    
    # return dense values
    return {'X': marker_df.values,
            'Y': spatial_df.values,
            'A': A,
            'N': N.values,
            'non_zero_mtx': non_zero_mtx,
            'spot_names': spatial_df.index.to_list(),
            'gene_names': spatial_df.columns.to_list(),
            'celltype_names': marker_df.index.to_list(),
            'initial_guess': cvae_pred}