Source code for diagnosis_plots

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 06:02:34 2024

@author: hill103

We move all functions for generating diagnosis plots to here.
"""



import os
from config import print, diagnosis_path
#from tensorflow.keras.utils import plot_model
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.colors import Normalize
import seaborn as sns
sns.set()
import umap
from distinctipy import distinctipy
from scipy.interpolate import griddata
from sklearn.decomposition import PCA




[docs]
def plotCVAELoss(history):
    '''
    plot training and validation loss in CVAE training
    
    NOTE validation loss may be unavailable if no validation data in training

    Parameters
    ----------
    history : History object
        History object returned by Keras Model.fit in CVAE training.

    Returns
    -------
    None.
    '''
    
    # need to create subfolders first, otherwise got FileNotFoundError
    os.makedirs(os.path.join(diagnosis_path, 'CVAE_training'), exist_ok=True)
    
    
    # plot total loss
    plt.figure(figsize=(6.4*2, 4.8*2))
    plt.plot(history.history['loss'], label='Training')
    if 'val_loss' in history.history:
        plt.plot(history.history['val_loss'], label='Validation')
    plt.ylabel('Total Loss')
    plt.xlabel('Epoch')
    plt.legend(loc='upper right')
    plt.savefig(os.path.join(diagnosis_path, 'CVAE_training', 'CVAE_total_loss_in_training.png'))
    plt.close()
    
    
    # plot components of loss
    plt.figure(figsize=(6.4*2, 4.8*2))
    
    # Training KL Loss
    plt.plot(history.history['KL_loss'], color='#1f77b4', marker='o', label='Training KL Loss')
    # Validation KL Loss
    if 'val_KL_loss' in history.history:
        plt.plot(history.history['val_KL_loss'], color='#ff7f0e', marker='o', label='Validation KL Loss')

    # Training Reconstruction Loss
    plt.plot(history.history['reconstruction_loss'], color='#1f77b4', marker='s', linestyle='dashed', label='Training Reconstruction Loss')
    # Validation Reconstruction Loss
    if 'val_reconstruction_loss' in history.history:
        plt.plot(history.history['val_reconstruction_loss'], color='#ff7f0e', marker='s', linestyle='dashed', label='Validation Reconstruction Loss')
    
    # Adding labels
    plt.xlabel('Epoch')
    plt.ylabel('Model Loss')
    # Add a legend
    plt.legend(loc='upper right')
    plt.savefig(os.path.join(diagnosis_path, 'CVAE_training', 'CVAE_loss_components_in_training.png'))
    plt.close()





[docs]
def defineColor(n_spatial_spot, scRNA_celltype):
    '''
    generate n visually distinct colours for cell-types. Use these colours across whole pipeline for consistency

    Parameters
    ----------
    n_spatial_spot : int
        number of spatial spots.
    scRNA_celltype : dataframe
        cell-type annotations for cells in scRNA-seq data. Only 1 column named <celltype>.

    Returns
    -------
    plot_colors : dict
        generated color palette, keys are cell-types together with the number of cells with this cell-type, and values are RGB colors.
    '''
    
    celltype_order = sorted(list(scRNA_celltype.celltype.unique()))
    celltype_count_dict = scRNA_celltype.celltype.value_counts().to_dict()
    
    plot_colors = {}
    for one_celltype, one_color in zip([f'spatial ({n_spatial_spot})']+[f'{x} ({celltype_count_dict[x]})' for x in celltype_order], distinctipy.get_colors(len(celltype_order)+1)):
        plot_colors[one_celltype] = one_color
    # assign pseudo spots as gray80
    plot_colors['pseudo'] = '#cccccc'
    
    return plot_colors





[docs]
def rawInputUMAP(spatial_df, scRNA_df, scRNA_celltype, plot_colors):
    '''
    generate UMAP of spatial spots together with scRNA-seq cells

    Parameters
    ----------
    spatial_df : dataframe
        normalized gene expression in spatial transcriptomic data (spots * genes).
    scRNA_df : dataframe
        normalized gene expression in scRNA-seq data (cells * genes).
    scRNA_celltype : dataframe
        cell-type annotations for cells in scRNA-seq data. Only 1 column named <celltype>.
    plot_colors : dict
        color palette for plot.

    Returns
    -------
    None.
    '''
    
    assert((scRNA_df.index == scRNA_celltype.index).all())
    assert((spatial_df.columns == scRNA_df.columns).all())
    
    # need to create subfolders first, otherwise got FileNotFoundError
    os.makedirs(os.path.join(diagnosis_path, 'raw_input_data'), exist_ok=True)
    
    celltype_count_dict = scRNA_celltype.celltype.value_counts().to_dict()
    
    # take average within cell-types to get cell-type specific marker gene profile
    scRNA_avg_df = scRNA_df.copy()
    scRNA_avg_df['celltype'] = scRNA_celltype.celltype
    scRNA_avg_df = scRNA_avg_df.groupby(['celltype']).mean()
    
    # UMAP of raw input spatial and scRNA-seq cell gene expression
    # we also add marker gene expression profile here
    # the order will affect the point overlay, first row draw first
    # umap has embeded seed (default 42), by specify random_state, umap will use special mode to keep reproducibility
    # note here both gene expressions are sequencing normalized values, without log transform and scaling
    tmp_df = pd.concat([spatial_df, scRNA_df, scRNA_avg_df], ignore_index=True)
    
    all_umap = umap.UMAP(random_state=42).fit_transform(tmp_df.values)
    
    # add cell/spot count in the annotation
    plot_df = pd.DataFrame({'UMAP1': all_umap[:, 0],
                            'UMAP2': all_umap[:, 1],
                            'celltype': [f'spatial ({spatial_df.shape[0]})']*spatial_df.shape[0] +
                                        [f'{x} ({celltype_count_dict[x]})' for x in scRNA_celltype.celltype.to_list()] +
                                        [f'{x} ({celltype_count_dict[x]})' for x in scRNA_avg_df.index],
                            'dataset': ['spatial']*spatial_df.shape[0] +
                                       ['scRNA-seq']*scRNA_df.shape[0] +
                                       ['scRNA-seq']*scRNA_avg_df.shape[0],
                            'datatype': ['cell/spot']*spatial_df.shape[0] +
                                        ['cell/spot']*scRNA_df.shape[0] +
                                        ['marker']*scRNA_avg_df.shape[0]
                            },
                           index = spatial_df.index.to_list() +
                                   scRNA_df.index.to_list() +
                                   [f'{x}-marker' for x in scRNA_avg_df.index])
    
    plot_sizes = {'cell/spot': 20, 'marker': 200}
    plot_markers = {'cell/spot': 'o', 'marker': 'X'}
    # color for plot already defined

    sns.set_style("darkgrid")
    
    # relplot return a FacetGrid object
    # specify figure size by Height (in inches) of each facet, and Aspect ratio of each facet
    fgrid = sns.relplot(data=plot_df, x='UMAP1', y='UMAP2', hue='celltype', size='datatype', style='datatype', sizes=plot_sizes, markers=plot_markers, palette=plot_colors, kind='scatter', col='dataset', col_order=['scRNA-seq', 'spatial'], height=4.8*2, aspect=6.4/4.8)
    fgrid.set(xlabel='UMAP 1', ylabel='UMAP 2')
    # Put the legend out of the figure
    #ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    
    # add cell-type annotations around marker coordinates
    # adjustText do not support seaborn relplot
    #from adjustText import adjust_text
    # fgrid.axes return an array of all axes in the figure
    ax = fgrid.axes[0, 0]
    texts = []
    for one_row in scRNA_avg_df.index:
        texts.append(ax.text(plot_df.at[one_row+'-marker', 'UMAP1'], plot_df.at[one_row+'-marker', 'UMAP2'], one_row, weight='bold'))
    #adjust_text(texts)
    
    plt.savefig(os.path.join(diagnosis_path, 'raw_input_data', 'UMAP_raw_input_color_by_celltype.png'))
    plt.close()
    
    
    # also save UMAP coordinates
    plot_df[['UMAP1', 'UMAP2']].to_csv(os.path.join(diagnosis_path, 'raw_input_data', 'UMAP_coordinates_raw_input.csv.gz'), compression='gzip')





[docs]
def diagnosisCVAE(cvae, encoder, decoder, spatial_embed, spatial_transformed_df, spatial_transformed_numi, pseudo_spatial_embed, scRNA_celltype, celltype_order, celltype_count_dict, scrna_cell_celltype_prop, scRNA_embed, scrna_n_cell, pseudo_spots_celltype_prop, n_cell_in_spot, pseudo_spot_embed, scRNA_decode_df, scRNA_decode_avg_df, new_markers, plot_colors):
    '''
    save CVAE related Keras models to h5 file, generate figures to diagnosis the training of CVAE

    Parameters
    ----------
    cvae : Keras model
        already trained CVAE model
    encoder: Keras model
        encoder part extract from CVAE model
    decoder : Keras model
        a separated decoder whose weights are already updated, i.e. it should give the same decoded output with CVAE
    spatial_embed : 2-D numpy array
        mu in latent space of spatial spots (spots * latent neurons)
    spatial_transformed_df : dataframe
        CVAE transformed (platform effect adjusted) spatial spot gene normalized expressions (spots * genes)
    spatial_transformed_numi : dataframe
        CVAE transformed (platform effect adjusted) spatial spot gene raw nUMI counts (spots * genes)
    pseudo_spatial_embed : dataframe
        mu in latent space of spatial pseudo-spots (spatial pseudo-spots * latent neurons). It will have 0 rows if no spatial pseudo-spots generated
    scRNA_celltype : dataframe
        cell-type annotations for cells in scRNA-seq data. Only 1 column named <celltype>
    celltype_order : list
        already sorted unique cell-types. Its order matters, and will be the order in pseudo_spots_celltype (columns) and cell-type gene expression profile (rows)
    celltype_count_dict : dict
        number of cells in reference scRNA-seq data for each cell-type
    scrna_cell_celltype_prop : dataframe
        scRNA-seq cells cell-type proportions (cells * cell-types)
    scRNA_embed : 2-D numpy array
        mu in latent space of scRNA-seq cells (cells * latent neurons)
    scrna_n_cell : list
        number of cells in scRNA-seq data. Note augmented single cells also included
    pseudo_spots_celltype_prop : dataframe
        pseudo-spot cell-type proportions (pseudo-spots * cell-types; NO scRNA-seq cells included)
    n_cell_in_spot : list
        number of cells in pseudo-spots (no scRNA-seq cells included)
    pseudo_spot_embed : dataframe
        mu in latent space of pseudo spots (pseudo-spots * latent neurons); NO scRNA-seq cells included. It will have 0 rows if no scRNA-seq pseudo-spots generated
    scRNA_decode_df : dataframe
        CVAE decodered gene expression (normalized) of scRNA-seq cells (cells * genes)
    scRNA_decode_avg_df : dataframe
        CVAE decodered average gene expression (normalized) of cell-types in scRNA-seq data (cell-types * genes)
    new_markers : list or None
        marker genes from re-run DE on CVAE transformed scRNA-seq data. It will be None if not re-run DE (rerun_DE=False)
    plot_colors : dict
        color palette for plot
        
    Returns
    -------
    None.
    '''
    
    print('\nsave variables related to CVAE to files!')
    
    # need to create subfolders first, otherwise got FileNotFoundError
    os.makedirs(os.path.join(diagnosis_path, 'CVAE_model'), exist_ok=True)
    os.makedirs(os.path.join(diagnosis_path, 'CVAE_transformed_data'), exist_ok=True)
    os.makedirs(os.path.join(diagnosis_path, 'CVAE_latent_space'), exist_ok=True)
    
    '''
    # plot model
    # need Graphviz binaries (https://graphviz.gitlab.io/, install in OS) and pydot python package
    # we do not plot it in package implementation
    plot_model(cvae, to_file=os.path.join(diagnosis_path, 'CVAE_model', 'CVAE_model.png'), show_shapes=True, show_layer_names=True)
    '''
    
    # save model
    cvae.save(os.path.join(diagnosis_path, 'CVAE_model', 'CVAE_whole.h5'))
    encoder.save(os.path.join(diagnosis_path, 'CVAE_model', 'CVAE_encoder.h5'))
    decoder.save(os.path.join(diagnosis_path, 'CVAE_model', 'CVAE_decoder.h5'))
    
    
    # save transformed data, which are used in GLRM modeling
    if new_markers is None:
        spatial_transformed_numi.to_csv(os.path.join(diagnosis_path, 'CVAE_transformed_data', 'spatial_spots_transformToscRNA_decoded.csv.gz'), compression='gzip')
    else:
        # save only marker genes used in downstream anlaysis
        spatial_transformed_numi[new_markers].to_csv(os.path.join(diagnosis_path, 'CVAE_transformed_data', 'spatial_spots_transformToscRNA_decoded.csv.gz'), compression='gzip')
    
    if new_markers is None:
        scRNA_decode_avg_df.to_csv(os.path.join(diagnosis_path, 'CVAE_transformed_data', 'scRNA_decoded_avg_exp_bycelltypes.csv.gz'), compression='gzip')
    else:
        # save only marker genes used in downstream anlaysis
        scRNA_decode_avg_df[new_markers].to_csv(os.path.join(diagnosis_path, 'CVAE_transformed_data', 'scRNA_decoded_avg_exp_bycelltypes.csv.gz'), compression='gzip')
        
    
    # plot variance of mu of spatial spots and scRNA-seq cells
    latent_var = np.concatenate((spatial_embed, scRNA_embed), axis=0).var(axis=0)
    plt.figure()
    ax = sns.histplot(x=np.log10(latent_var))
    ax.set(xlabel='log10(var(mu))')
    plt.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'histogram_latent_neurons_variance_spatial_spots_and_scRNA-seq_cells.png'))
    plt.close()
    
    
    # plot histogram of transformed spots nUMI sum
    plt.figure()
    ax = sns.histplot(x=np.sum(spatial_transformed_numi.values, axis=1))
    ax.set(xlabel='Sum of nUMI per spatial spot after transformation')
    plt.savefig(os.path.join(diagnosis_path, 'CVAE_transformed_data', 'histogram_nUMI_sum_per_spatial_spot_after_transformation.png'))
    plt.close()
    
    
    # plot histogram of transformed spots nUMI
    # exclude zeros since too many zeros
    plt.figure()
    tmp_values = spatial_transformed_numi.stack().values
    ax = sns.histplot(x=tmp_values[tmp_values>0])
    ax.set(xlabel='non-zero nUMI of all genes and all spatial spots after transformation')
    plt.savefig(os.path.join(diagnosis_path, 'CVAE_transformed_data', 'histogram_non-zero_nUMI_all_gene_all_spatial_spot_after_transformation.png'))
    plt.close()
    
    
    # plot zero percentage of marker genes in spatial spots after transformation
    plt.figure()
    if new_markers is None:
        tmp_mtx = spatial_transformed_numi.values
    else:
        tmp_mtx = spatial_transformed_numi.loc[:, new_markers].values
    ax = sns.histplot(x=np.sum(tmp_mtx==0, axis=1)/tmp_mtx.shape[1])
    ax.set(xlabel=f'Zero percentage of {tmp_mtx.shape[1]} genes per spot after transformation')
    ax.set_xlim(xmin=0, xmax=1)
    plt.savefig(os.path.join(diagnosis_path, 'CVAE_transformed_data', 'histogram_zero_percentage_per_spatial_spot_after_transformation.png'))
    plt.close()
    
    
    # plot umap of latent space of spatial spots and scRNA-seq cells plus pseudo spots
    # there is no way to add marker gene expression profile here, since the marker gene average expression is calculated on decoded values, it has no meaning even if we feed it to encoder
    # the order will affect the point overlay, first row draw first
    # umap has embeded seed (default 42), by specify random_state, umap will use special mode to keep reproducibility
    all_umap = umap.UMAP(random_state=42).fit_transform(np.concatenate((pseudo_spot_embed, pseudo_spatial_embed, scRNA_embed, spatial_embed), axis=0))
    
    # add cell/spot count in the annotation
    plot_df = pd.DataFrame({'UMAP1': all_umap[:, 0],
                            'UMAP2': all_umap[:, 1],
                            'celltype': ['pseudo']*pseudo_spot_embed.shape[0] +
                                        ['pseudo']*pseudo_spatial_embed.shape[0] + 
                                        [f'{x} ({celltype_count_dict[x]})' for x in scRNA_celltype.celltype.to_list()] +
                                        [f'spatial ({spatial_embed.shape[0]})']*spatial_embed.shape[0],
                            'dataset': ['scRNA-seq']*pseudo_spot_embed.shape[0] +
                                       ['spatial']*pseudo_spatial_embed.shape[0] +
                                       ['scRNA-seq']*scRNA_embed.shape[0] +
                                       ['spatial']*spatial_embed.shape[0],
                            'datatype': ['scrna-pseudo']*pseudo_spot_embed.shape[0] +
                                        ['spatial-pseudo']*pseudo_spatial_embed.shape[0] +
                                        ['scrna-cell']*scRNA_embed.shape[0] +
                                        ['spatial-spot']*spatial_embed.shape[0],
                            },
                           index = [f'scrna_pseudo{x}' for x in range(pseudo_spot_embed.shape[0])] +
                                   [f'spatial_pseudo{x}' for x in range(pseudo_spatial_embed.shape[0])] +
                                   scRNA_decode_df.index.to_list() +
                                   spatial_transformed_df.index.to_list())
    
    # plot colors already defined
    sns.set_style("darkgrid")
    
    # relplot return a FacetGrid object
    # specify figure size by Height (in inches) of each facet, and Aspect ratio of each facet
    fgrid = sns.relplot(data=plot_df, x='UMAP1', y='UMAP2', hue='celltype', s=20, marker='o', palette=plot_colors, kind='scatter', col='dataset', col_order=['scRNA-seq', 'spatial'], height=4.8*2, aspect=6.4/4.8)
    fgrid.set(xlabel='UMAP 1', ylabel='UMAP 2')
    # Put the legend out of the figure
    #ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    
    plt.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'UMAP_latent_mu_embedding_color_by_celltype.png'))
    plt.close()
    
    
    # save UMAP coordinates of latent space
    plot_df.loc[plot_df['datatype']=='spatial-spot', ['UMAP1', 'UMAP2']].to_csv(os.path.join(diagnosis_path, 'CVAE_latent_space', 'UMAP_coordinates_latent_mu_embedding_spatial_spots.csv.gz'), compression='gzip')
    # save scRNA-seq cells UMAP coordinates
    plot_df.loc[plot_df['datatype']=='scrna-cell', ['UMAP1', 'UMAP2']].to_csv(os.path.join(diagnosis_path, 'CVAE_latent_space', 'UMAP_coordinates_latent_mu_embedding_scRNA-seq_cells.csv.gz'), compression='gzip')
    
    
    # NOTE: based on UMAP
    # plot distribution of number of cells in pseudo-spots
    # add number of cells in pseudo-spot to dataframe
    plot_df['n_cell_in_spot'] = np.nan
    plot_df.loc[plot_df['datatype']=='scrna-cell', 'n_cell_in_spot'] = scrna_n_cell
    plot_df.loc[plot_df['datatype']=='scrna-pseudo', 'n_cell_in_spot'] = n_cell_in_spot
    
    sns.set_style("darkgrid")
 
    # generate a colormap with a specified color for NA (spatial spots), but not work for relplot...
    # show the full legend of colorbar in relplot, otherwise it will only show a sample of evenly spaced values (The FacetGrid hue is categorical, not continuous)
    # finally use plt.subplots instead
    fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, sharex=True, figsize=(6.4*4, 4.8*2))
    # left panel: scatter plot of pseudo-spots + scRNA-seq cells
    tmp_df = plot_df.loc[plot_df['dataset']=='scRNA-seq']
    sc = ax1.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], c=tmp_df['n_cell_in_spot'], cmap='cubehelix', s=10, marker='o')
    ax1.set_title('dataset = scRNA-seq')
    ax1.set_xlabel('UMAP 1')
    ax1.set_ylabel('UMAP 2')
    
    # right panel: scatter plot of spatial spots + spatial pseudo-spots
    # first draw spatial pseudo-spots, which lay at the bottom
    tmp_df = plot_df.loc[plot_df['datatype']=='spatial-pseudo']
    ax2.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], color=plot_colors['pseudo'], s=10, marker='o')
    tmp_df = plot_df.loc[plot_df['datatype']=='spatial-spot']
    ax2.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], color=plot_colors[f'spatial ({spatial_embed.shape[0]})'], s=10, marker='o')
    ax2.set_title('dataset = spatial')
    ax2.set_xlabel('UMAP 1')
    ax2.set_ylabel('UMAP 2')
    
    # add colorbar with title to the most right (https://stackoverflow.com/questions/13784201/how-to-have-one-colorbar-for-all-subplots, conflict with tight_layout)
    cbar = fig.colorbar(sc, ax=ax2)
    cbar.ax.set_title('#cell in spot')
    
    fig.tight_layout()
    fig.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'UMAP_latent_mu_embedding_color_by_num_cell_in_spot.png'))
    plt.close()
    
    
    # NOTE: based on UMAP
    # plot distribution of cell-type proportions of each cell-type
    # create a proportion dataframe with the same row order
    prop_df = pd.concat([pseudo_spots_celltype_prop,
                         pd.DataFrame(0, index=['spatial_pseudo' + str(idx) for idx in range(pseudo_spatial_embed.shape[0])], columns=pseudo_spots_celltype_prop.columns),
                         scrna_cell_celltype_prop,
                         pd.DataFrame(0, index=spatial_transformed_df.index, columns=pseudo_spots_celltype_prop.columns)
                        ], ignore_index=False)
    assert prop_df.shape[0] == plot_df.shape[0]
    assert (prop_df.index == plot_df.index).all()
    
    # save multiple figures into one PDF
    with PdfPages(os.path.join(diagnosis_path, 'CVAE_latent_space', 'UMAP_latent_mu_embedding_color_by_celltype_proportion.pdf')) as pdf:
        for this_celltype in celltype_order:
        
            # start to plot
            fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, sharex=True, figsize=(6.4*4, 4.8*2))
            
            # left panel: scatter plot of pseudo-spots + scRNA-seq cells
            tmp_df = plot_df.loc[plot_df['dataset']=='scRNA-seq']
            # don't forget the .values
            tmp_df = tmp_df.assign(proportion = prop_df.loc[plot_df['dataset']=='scRNA-seq', this_celltype].values)
            
            sc = ax1.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], c=tmp_df['proportion'], cmap='cubehelix', s=10, marker='o', norm=Normalize(vmin=0, vmax=1))
       
            ax1.set_title('dataset = scRNA-seq')
            ax1.set_xlabel('UMAP 1')
            ax1.set_ylabel('UMAP 2')
        
            # right panel: scatter plot of spatial spots + spatial pseudo-spots
            # we also interpolate the grid and draw a contour plot of cell type proportions in right panel
            grid_x, grid_y = np.mgrid[tmp_df['UMAP1'].min():tmp_df['UMAP1'].max():0.025, tmp_df['UMAP2'].min():tmp_df['UMAP2'].max():0.025]
            grid_z = griddata(tmp_df.loc[:, ['UMAP1', 'UMAP2']].values, tmp_df['proportion'].values, (grid_x, grid_y), method='linear',  fill_value=np.nan)
        
            try:
                ax2.contourf(grid_x, grid_y, grid_z, cmap='cubehelix', norm=Normalize(vmin=0, vmax=1), alpha=0.3)
            except:
                pass
        
            # first draw spatial pseudo-spots, which lay at the bottom
            tmp_df = plot_df.loc[plot_df['datatype']=='spatial-pseudo']
            ax2.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], color=plot_colors['pseudo'], s=10, marker='o')
            tmp_df = plot_df.loc[plot_df['datatype']=='spatial-spot']
            ax2.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], color=plot_colors[f'spatial ({spatial_embed.shape[0]})'], s=10, marker='o')
        
            ax2.set_title('dataset = spatial')
            ax2.set_xlabel('UMAP 1')
            ax2.set_ylabel('UMAP 2')
        
            # add colorbar with title
            cbar = fig.colorbar(sc, ax=ax2)
            cbar.ax.set_title('proportion')
        
            fig.suptitle(this_celltype)
        
            fig.tight_layout()
            
            pdf.savefig(fig)  # Save the current figure to pdf
            plt.close(fig)  # Close the figure to free memory
    
    
    # plot PCA of latent space of spatial spots and scRNA-seq cells plus pseudo spots
    # the order will affect the point overlay, first row draw first
    all_pca = PCA(n_components=2).fit_transform(np.concatenate((pseudo_spot_embed, pseudo_spatial_embed, scRNA_embed, spatial_embed), axis=0))
    
    # add cell/spot count in the annotation
    plot_df = pd.DataFrame({'PC1': all_pca[:, 0],
                            'PC2': all_pca[:, 1],
                            'celltype': ['pseudo']*pseudo_spot_embed.shape[0] +
                                        ['pseudo']*pseudo_spatial_embed.shape[0] + 
                                        [f'{x} ({celltype_count_dict[x]})' for x in scRNA_celltype.celltype.to_list()] +
                                        [f'spatial ({spatial_embed.shape[0]})']*spatial_embed.shape[0],
                            'dataset': ['scRNA-seq']*pseudo_spot_embed.shape[0] +
                                       ['spatial']*pseudo_spatial_embed.shape[0] +
                                       ['scRNA-seq']*scRNA_embed.shape[0] +
                                       ['spatial']*spatial_embed.shape[0],
                            'datatype': ['scrna-pseudo']*pseudo_spot_embed.shape[0] +
                                        ['spatial-pseudo']*pseudo_spatial_embed.shape[0] +
                                        ['scrna-cell']*scRNA_embed.shape[0] +
                                        ['spatial-spot']*spatial_embed.shape[0],
                            },
                           index = [f'scrna_pseudo{x}' for x in range(pseudo_spot_embed.shape[0])] +
                                   [f'spatial_pseudo{x}' for x in range(pseudo_spatial_embed.shape[0])] +
                                   scRNA_decode_df.index.to_list() +
                                   spatial_transformed_df.index.to_list())
    
    # plot colors already defined
    sns.set_style("darkgrid")
    
    # relplot return a FacetGrid object
    # specify figure size by Height (in inches) of each facet, and Aspect ratio of each facet
    fgrid = sns.relplot(data=plot_df, x='PC1', y='PC2', hue='celltype', s=20, marker='o', palette=plot_colors, kind='scatter', col='dataset', col_order=['scRNA-seq', 'spatial'], height=4.8*2, aspect=6.4/4.8)
    fgrid.set(xlabel='PC 1', ylabel='PC 2')
    # Put the legend out of the figure
    #ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    
    plt.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_latent_mu_embedding_color_by_celltype.png'))
    plt.close()
    
    
    # save PCA coordinates of latent space
    plot_df.loc[plot_df['datatype']=='spatial-spot', ['PC1', 'PC2']].to_csv(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_coordinates_latent_mu_embedding_spatial_spots.csv.gz'), compression='gzip')
    # save scRNA-seq cells PCA coordinates
    plot_df.loc[plot_df['datatype']=='scrna-cell', ['PC1', 'PC2']].to_csv(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_coordinates_latent_mu_embedding_scRNA-seq_cells.csv.gz'), compression='gzip')
    
    
    # Plot PCA density
    # for safe to avoid program exit when density estimation failed
    try:
        plt.figure(figsize=(6.4*2, 4.8*2))
        sns.set_style('whitegrid')
        ax = sns.kdeplot(data=plot_df, x='PC1', y='PC2', fill=True)
        ax.set(xlabel='PC 1', ylabel='PC 2')
        plt.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_latent_mu_embedding_color_by_density.png'))
        plt.close()
    except:
        pass
    
    
    # UMAP of decoded spatial and scRNA-seq cell gene expression
    # now we can add marker gene expression profile here
    # the order will affect the point overlay, first row draw first
    # umap has embeded seed (default 42), by specify random_state, umap will use special mode to keep reproducibility
    # note we use normalized value rather than raw nUMI for spatial spots
    tmp_df = pd.concat([spatial_transformed_df, scRNA_decode_df, scRNA_decode_avg_df], ignore_index=True)
    # only use marker genes
    if new_markers is not None:
        tmp_df = tmp_df[new_markers]
    
    all_umap = umap.UMAP(random_state=42).fit_transform(tmp_df.values)
    
    # add cell/spot count in the annotation
    plot_df = pd.DataFrame({'UMAP1': all_umap[:, 0],
                            'UMAP2': all_umap[:, 1],
                            'celltype': [f'spatial ({spatial_transformed_df.shape[0]})']*spatial_transformed_df.shape[0] +
                                        [f'{x} ({celltype_count_dict[x]})' for x in scRNA_celltype.celltype.to_list()] +
                                        [f'{x} ({celltype_count_dict[x]})' for x in scRNA_decode_avg_df.index],
                            'dataset': ['spatial']*spatial_transformed_df.shape[0] +
                                       ['scRNA-seq']*scRNA_decode_df.shape[0] +
                                       ['scRNA-seq']*scRNA_decode_avg_df.shape[0],
                            'datatype': ['cell/spot']*spatial_transformed_df.shape[0] +
                                        ['cell/spot']*scRNA_decode_df.shape[0] +
                                        ['marker']*scRNA_decode_avg_df.shape[0]
                            },
                           index = spatial_transformed_df.index.to_list() +
                                   scRNA_decode_df.index.to_list() +
                                   [f'{x}-marker' for x in scRNA_decode_avg_df.index])
    
    plot_sizes = {'cell/spot': 20, 'marker': 200}
    plot_markers = {'cell/spot': 'o', 'marker': 'X'}
    # color for plot already defined

    sns.set_style("darkgrid")
    
    # relplot return a FacetGrid object
    # specify figure size by Height (in inches) of each facet, and Aspect ratio of each facet
    fgrid = sns.relplot(data=plot_df, x='UMAP1', y='UMAP2', hue='celltype', size='datatype', style='datatype', sizes=plot_sizes, markers=plot_markers, palette=plot_colors, kind='scatter', col='dataset', col_order=['scRNA-seq', 'spatial'], height=4.8*2, aspect=6.4/4.8)
    fgrid.set(xlabel='UMAP 1', ylabel='UMAP 2')
    # Put the legend out of the figure
    #ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    
    # add cell-type annotations around marker coordinates
    # adjustText do not support seaborn relplot
    #from adjustText import adjust_text
    # fgrid.axes return an array of all axes in the figure
    ax = fgrid.axes[0, 0]
    texts = []
    for one_row in scRNA_decode_avg_df.index:
        texts.append(ax.text(plot_df.at[one_row+'-marker', 'UMAP1'], plot_df.at[one_row+'-marker', 'UMAP2'], one_row, weight='bold'))
    #adjust_text(texts)
    
    plt.savefig(os.path.join(diagnosis_path, 'CVAE_transformed_data', 'UMAP_decoded_value_color_by_celltype.png'))
    plt.close()
    
    
    # also save UMAP coordinates
    plot_df[['UMAP1', 'UMAP2']].to_csv(os.path.join(diagnosis_path, 'CVAE_transformed_data', 'UMAP_coordinates_decoded_value.csv.gz'), compression='gzip')





[docs]
def diagnosisParamsTuning(x, y, optimal_idx, x_label, y_label):
    '''
    generate scatter plot show performance in hyper parameter tuning by cross-validation

    Parameters
    ----------
    x : list
        a list of candidates of hyper parameter
    y : list
        corresponding performance for each hyper parameter candidate. NOTE not all candidates have corresponding performance due to early stop
    optimal_idx : int
        index of list to indicate the optimal hyper parameter value which achieve the best performance
    x_label : string
        label of x axis in the plot, representing the number of hyper parameter
    y_label : string
        label of y axis in the plot, representing the performance metric used

    Returns
    -------
    None.
    '''
    
    # need to create subfolders first, otherwise got FileNotFoundError
    os.makedirs(os.path.join(diagnosis_path, 'GLRM_params_tuning'), exist_ok=True)
    
    sns.set()
    
    fig, ax = plt.subplots(figsize=(6.4*2, 4))
    ax.plot(x[:len(y)], y, marker='o')
    
    # if the range of the x-axis exceeds 100 (where the ratio of the largest to smallest value is greater than 100), we utilize a logarithmic scale for the x-axis in the plot.
    # log scale do not support 0, so use symlog
    # "linear threshold" determines the range around zero within which the scale will be linear ranther than instead of logarithmic to avoid distortion that occurs with logarithmic scaling near zero and make the plot more readable
    if x[-1] / (x[0] + 1e-6) > 100:
        ax.set_xscale('symlog', linthresh=x[1])
    
    ax.set_xticks(x)
    ax.set_xticklabels(x, rotation=45)
    ax.axvline(x[optimal_idx], color='red', linestyle='--')
        
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
        
    fig.tight_layout()
    fig.savefig(os.path.join(diagnosis_path, 'GLRM_params_tuning', f'tuning_{x_label}_by_cv.png'))
    plt.close()





[docs]
def diagnosisParamsSpotwiseLambdarTuning(x, strategy):
    '''
    generate histogram plot show optimal lambda_r in hyper parameter tuning by BIC

    Parameters
    ----------
    x : list
        a list of optimal lambda_r
    strategy : string
        the used strategy for tuning lambda_r

    Returns
    -------
    None.
    '''
    
    # need to create subfolders first, otherwise got FileNotFoundError
    os.makedirs(os.path.join(diagnosis_path, 'GLRM_params_tuning'), exist_ok=True)
    
    sns.set()
    
    fig, ax = plt.subplots()
    sns.histplot(x, ax=ax)

    ax.set_xlabel('Optimal lambda_r')
    ax.set_ylabel('Count')
        
    fig.tight_layout()
    fig.savefig(os.path.join(diagnosis_path, 'GLRM_params_tuning', f'tuning_lambda_r_by_{strategy}.png'))
    plt.close()





[docs]
def plot_imputation(df, grid_df, contours, hierarchy, figname, figsize=(6.4, 4.8)):
    '''
    draw scatter plot of spatial spots and imputed spots for diagnosis.

    Parameters
    ----------
    df : dataframe
        dataframe of spatial spots at original resolution, with columns 'x', 'y', 'border'.
    grid_df : dataframe
        dataframe of generated grid points at higher resolution, with columns 'x', 'y'.
    contours : tuple
        contours variable returned by cv2.findContours, used for creating grid.
    hierarchy : 3-D numpy array (1 * #contours * 4)
        hierarchy variable returned by cv2.findContours, used for creating grid.
    figname: string
        name of figure.
    figsize : tuple, optional
        figure size. The default is (6.4, 4.8).

    Returns
    -------
    None.
    '''
    
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set_theme()
    sns.set_style('white')
    
    plt.figure(figsize=figsize)
    
    # scatter plot of original spatial spots, highlighting edge spots
    sns.scatterplot(data=df, x='x', y='y', hue='border', s=100, alpha=0.5, legend=False)
    
    outer_edges = [] # each element is a single outer edge with several points
    inner_edges = [] # each element is a single inner edge with several points
    
    # Go through all contours and hierarchy
    for i, (cnt, hrc) in enumerate(zip(contours, hierarchy[0])):
        # Convert contour points back to original coordinates
        cnt = cnt.reshape(-1, 2)
        
        # Check if it's an outer or inner edge (hierarchy: [Next, Previous, First Child, Parent])
        if hrc[3] == -1:  # it's an outer edge if it has no parent
            outer_edges.append(cnt)
        else:  # it's an inner edge if it has a parent
            inner_edges.append(cnt)
    
    # add edges
    # connect the last point to the first one
    if len(outer_edges) > 0:
        for tmp_plot in outer_edges:
            tmp_plot = np.append(tmp_plot, [tmp_plot[0]], axis=0)
            plt.plot(tmp_plot[:, 0], tmp_plot[:, 1], 'g--')
    
    if len(inner_edges) > 0:
        for tmp_plot in inner_edges:
            tmp_plot = np.append(tmp_plot, [tmp_plot[0]], axis=0)
            plt.plot(tmp_plot[:, 0], tmp_plot[:, 1], 'r--')
            
    # add generated high resolution grid
    sns.scatterplot(data=grid_df, x='x', y='y', marker='X', color='k')
    
    # need to create subfolders first, otherwise got FileNotFoundError
    os.makedirs(os.path.join(diagnosis_path, 'imputation'), exist_ok=True)
    
    plt.savefig(os.path.join(diagnosis_path, 'imputation', figname))
    
    plt.close()