#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 25 06:02:34 2024
@author: hill103
We move all functions for generating diagnosis plots to here.
"""
import os
from config import print, diagnosis_path
#from tensorflow.keras.utils import plot_model
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.colors import Normalize
import seaborn as sns
sns.set()
import umap
from distinctipy import distinctipy
from scipy.interpolate import griddata
from sklearn.decomposition import PCA
[docs]
def plotCVAELoss(history):
'''
plot training and validation loss in CVAE training
NOTE validation loss may be unavailable if no validation data in training
Parameters
----------
history : History object
History object returned by Keras Model.fit in CVAE training.
Returns
-------
None.
'''
# need to create subfolders first, otherwise got FileNotFoundError
os.makedirs(os.path.join(diagnosis_path, 'CVAE_training'), exist_ok=True)
# plot total loss
plt.figure(figsize=(6.4*2, 4.8*2))
plt.plot(history.history['loss'], label='Training')
if 'val_loss' in history.history:
plt.plot(history.history['val_loss'], label='Validation')
plt.ylabel('Total Loss')
plt.xlabel('Epoch')
plt.legend(loc='upper right')
plt.savefig(os.path.join(diagnosis_path, 'CVAE_training', 'CVAE_total_loss_in_training.png'))
plt.close()
# plot components of loss
plt.figure(figsize=(6.4*2, 4.8*2))
# Training KL Loss
plt.plot(history.history['KL_loss'], color='#1f77b4', marker='o', label='Training KL Loss')
# Validation KL Loss
if 'val_KL_loss' in history.history:
plt.plot(history.history['val_KL_loss'], color='#ff7f0e', marker='o', label='Validation KL Loss')
# Training Reconstruction Loss
plt.plot(history.history['reconstruction_loss'], color='#1f77b4', marker='s', linestyle='dashed', label='Training Reconstruction Loss')
# Validation Reconstruction Loss
if 'val_reconstruction_loss' in history.history:
plt.plot(history.history['val_reconstruction_loss'], color='#ff7f0e', marker='s', linestyle='dashed', label='Validation Reconstruction Loss')
# Adding labels
plt.xlabel('Epoch')
plt.ylabel('Model Loss')
# Add a legend
plt.legend(loc='upper right')
plt.savefig(os.path.join(diagnosis_path, 'CVAE_training', 'CVAE_loss_components_in_training.png'))
plt.close()
[docs]
def defineColor(n_spatial_spot, scRNA_celltype):
'''
generate n visually distinct colours for cell-types. Use these colours across whole pipeline for consistency
Parameters
----------
n_spatial_spot : int
number of spatial spots.
scRNA_celltype : dataframe
cell-type annotations for cells in scRNA-seq data. Only 1 column named <celltype>.
Returns
-------
plot_colors : dict
generated color palette, keys are cell-types together with the number of cells with this cell-type, and values are RGB colors.
'''
celltype_order = sorted(list(scRNA_celltype.celltype.unique()))
celltype_count_dict = scRNA_celltype.celltype.value_counts().to_dict()
plot_colors = {}
for one_celltype, one_color in zip([f'spatial ({n_spatial_spot})']+[f'{x} ({celltype_count_dict[x]})' for x in celltype_order], distinctipy.get_colors(len(celltype_order)+1)):
plot_colors[one_celltype] = one_color
# assign pseudo spots as gray80
plot_colors['pseudo'] = '#cccccc'
return plot_colors
[docs]
def diagnosisCVAE(cvae, encoder, decoder, spatial_embed, spatial_transformed_df, spatial_transformed_numi, pseudo_spatial_embed, scRNA_celltype, celltype_order, celltype_count_dict, scrna_cell_celltype_prop, scRNA_embed, scrna_n_cell, pseudo_spots_celltype_prop, n_cell_in_spot, pseudo_spot_embed, scRNA_decode_df, scRNA_decode_avg_df, new_markers, plot_colors):
'''
save CVAE related Keras models to h5 file, generate figures to diagnosis the training of CVAE
Parameters
----------
cvae : Keras model
already trained CVAE model
encoder: Keras model
encoder part extract from CVAE model
decoder : Keras model
a separated decoder whose weights are already updated, i.e. it should give the same decoded output with CVAE
spatial_embed : 2-D numpy array
mu in latent space of spatial spots (spots * latent neurons)
spatial_transformed_df : dataframe
CVAE transformed (platform effect adjusted) spatial spot gene normalized expressions (spots * genes)
spatial_transformed_numi : dataframe
CVAE transformed (platform effect adjusted) spatial spot gene raw nUMI counts (spots * genes)
pseudo_spatial_embed : dataframe
mu in latent space of spatial pseudo-spots (spatial pseudo-spots * latent neurons). It will have 0 rows if no spatial pseudo-spots generated
scRNA_celltype : dataframe
cell-type annotations for cells in scRNA-seq data. Only 1 column named <celltype>
celltype_order : list
already sorted unique cell-types. Its order matters, and will be the order in pseudo_spots_celltype (columns) and cell-type gene expression profile (rows)
celltype_count_dict : dict
number of cells in reference scRNA-seq data for each cell-type
scrna_cell_celltype_prop : dataframe
scRNA-seq cells cell-type proportions (cells * cell-types)
scRNA_embed : 2-D numpy array
mu in latent space of scRNA-seq cells (cells * latent neurons)
scrna_n_cell : list
number of cells in scRNA-seq data. Note augmented single cells also included
pseudo_spots_celltype_prop : dataframe
pseudo-spot cell-type proportions (pseudo-spots * cell-types; NO scRNA-seq cells included)
n_cell_in_spot : list
number of cells in pseudo-spots (no scRNA-seq cells included)
pseudo_spot_embed : dataframe
mu in latent space of pseudo spots (pseudo-spots * latent neurons); NO scRNA-seq cells included. It will have 0 rows if no scRNA-seq pseudo-spots generated
scRNA_decode_df : dataframe
CVAE decodered gene expression (normalized) of scRNA-seq cells (cells * genes)
scRNA_decode_avg_df : dataframe
CVAE decodered average gene expression (normalized) of cell-types in scRNA-seq data (cell-types * genes)
new_markers : list or None
marker genes from re-run DE on CVAE transformed scRNA-seq data. It will be None if not re-run DE (rerun_DE=False)
plot_colors : dict
color palette for plot
Returns
-------
None.
'''
print('\nsave variables related to CVAE to files!')
# need to create subfolders first, otherwise got FileNotFoundError
os.makedirs(os.path.join(diagnosis_path, 'CVAE_model'), exist_ok=True)
os.makedirs(os.path.join(diagnosis_path, 'CVAE_transformed_data'), exist_ok=True)
os.makedirs(os.path.join(diagnosis_path, 'CVAE_latent_space'), exist_ok=True)
'''
# plot model
# need Graphviz binaries (https://graphviz.gitlab.io/, install in OS) and pydot python package
# we do not plot it in package implementation
plot_model(cvae, to_file=os.path.join(diagnosis_path, 'CVAE_model', 'CVAE_model.png'), show_shapes=True, show_layer_names=True)
'''
# save model
cvae.save(os.path.join(diagnosis_path, 'CVAE_model', 'CVAE_whole.h5'))
encoder.save(os.path.join(diagnosis_path, 'CVAE_model', 'CVAE_encoder.h5'))
decoder.save(os.path.join(diagnosis_path, 'CVAE_model', 'CVAE_decoder.h5'))
# save transformed data, which are used in GLRM modeling
if new_markers is None:
spatial_transformed_numi.to_csv(os.path.join(diagnosis_path, 'CVAE_transformed_data', 'spatial_spots_transformToscRNA_decoded.csv.gz'), compression='gzip')
else:
# save only marker genes used in downstream anlaysis
spatial_transformed_numi[new_markers].to_csv(os.path.join(diagnosis_path, 'CVAE_transformed_data', 'spatial_spots_transformToscRNA_decoded.csv.gz'), compression='gzip')
if new_markers is None:
scRNA_decode_avg_df.to_csv(os.path.join(diagnosis_path, 'CVAE_transformed_data', 'scRNA_decoded_avg_exp_bycelltypes.csv.gz'), compression='gzip')
else:
# save only marker genes used in downstream anlaysis
scRNA_decode_avg_df[new_markers].to_csv(os.path.join(diagnosis_path, 'CVAE_transformed_data', 'scRNA_decoded_avg_exp_bycelltypes.csv.gz'), compression='gzip')
# plot variance of mu of spatial spots and scRNA-seq cells
latent_var = np.concatenate((spatial_embed, scRNA_embed), axis=0).var(axis=0)
plt.figure()
ax = sns.histplot(x=np.log10(latent_var))
ax.set(xlabel='log10(var(mu))')
plt.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'histogram_latent_neurons_variance_spatial_spots_and_scRNA-seq_cells.png'))
plt.close()
# plot histogram of transformed spots nUMI sum
plt.figure()
ax = sns.histplot(x=np.sum(spatial_transformed_numi.values, axis=1))
ax.set(xlabel='Sum of nUMI per spatial spot after transformation')
plt.savefig(os.path.join(diagnosis_path, 'CVAE_transformed_data', 'histogram_nUMI_sum_per_spatial_spot_after_transformation.png'))
plt.close()
# plot histogram of transformed spots nUMI
# exclude zeros since too many zeros
plt.figure()
tmp_values = spatial_transformed_numi.stack().values
ax = sns.histplot(x=tmp_values[tmp_values>0])
ax.set(xlabel='non-zero nUMI of all genes and all spatial spots after transformation')
plt.savefig(os.path.join(diagnosis_path, 'CVAE_transformed_data', 'histogram_non-zero_nUMI_all_gene_all_spatial_spot_after_transformation.png'))
plt.close()
# plot zero percentage of marker genes in spatial spots after transformation
plt.figure()
if new_markers is None:
tmp_mtx = spatial_transformed_numi.values
else:
tmp_mtx = spatial_transformed_numi.loc[:, new_markers].values
ax = sns.histplot(x=np.sum(tmp_mtx==0, axis=1)/tmp_mtx.shape[1])
ax.set(xlabel=f'Zero percentage of {tmp_mtx.shape[1]} genes per spot after transformation')
ax.set_xlim(xmin=0, xmax=1)
plt.savefig(os.path.join(diagnosis_path, 'CVAE_transformed_data', 'histogram_zero_percentage_per_spatial_spot_after_transformation.png'))
plt.close()
# plot umap of latent space of spatial spots and scRNA-seq cells plus pseudo spots
# there is no way to add marker gene expression profile here, since the marker gene average expression is calculated on decoded values, it has no meaning even if we feed it to encoder
# the order will affect the point overlay, first row draw first
# umap has embeded seed (default 42), by specify random_state, umap will use special mode to keep reproducibility
all_umap = umap.UMAP(random_state=42).fit_transform(np.concatenate((pseudo_spot_embed, pseudo_spatial_embed, scRNA_embed, spatial_embed), axis=0))
# add cell/spot count in the annotation
plot_df = pd.DataFrame({'UMAP1': all_umap[:, 0],
'UMAP2': all_umap[:, 1],
'celltype': ['pseudo']*pseudo_spot_embed.shape[0] +
['pseudo']*pseudo_spatial_embed.shape[0] +
[f'{x} ({celltype_count_dict[x]})' for x in scRNA_celltype.celltype.to_list()] +
[f'spatial ({spatial_embed.shape[0]})']*spatial_embed.shape[0],
'dataset': ['scRNA-seq']*pseudo_spot_embed.shape[0] +
['spatial']*pseudo_spatial_embed.shape[0] +
['scRNA-seq']*scRNA_embed.shape[0] +
['spatial']*spatial_embed.shape[0],
'datatype': ['scrna-pseudo']*pseudo_spot_embed.shape[0] +
['spatial-pseudo']*pseudo_spatial_embed.shape[0] +
['scrna-cell']*scRNA_embed.shape[0] +
['spatial-spot']*spatial_embed.shape[0],
},
index = [f'scrna_pseudo{x}' for x in range(pseudo_spot_embed.shape[0])] +
[f'spatial_pseudo{x}' for x in range(pseudo_spatial_embed.shape[0])] +
scRNA_decode_df.index.to_list() +
spatial_transformed_df.index.to_list())
# plot colors already defined
sns.set_style("darkgrid")
# relplot return a FacetGrid object
# specify figure size by Height (in inches) of each facet, and Aspect ratio of each facet
fgrid = sns.relplot(data=plot_df, x='UMAP1', y='UMAP2', hue='celltype', s=20, marker='o', palette=plot_colors, kind='scatter', col='dataset', col_order=['scRNA-seq', 'spatial'], height=4.8*2, aspect=6.4/4.8)
fgrid.set(xlabel='UMAP 1', ylabel='UMAP 2')
# Put the legend out of the figure
#ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'UMAP_latent_mu_embedding_color_by_celltype.png'))
plt.close()
# save UMAP coordinates of latent space
plot_df.loc[plot_df['datatype']=='spatial-spot', ['UMAP1', 'UMAP2']].to_csv(os.path.join(diagnosis_path, 'CVAE_latent_space', 'UMAP_coordinates_latent_mu_embedding_spatial_spots.csv.gz'), compression='gzip')
# save scRNA-seq cells UMAP coordinates
plot_df.loc[plot_df['datatype']=='scrna-cell', ['UMAP1', 'UMAP2']].to_csv(os.path.join(diagnosis_path, 'CVAE_latent_space', 'UMAP_coordinates_latent_mu_embedding_scRNA-seq_cells.csv.gz'), compression='gzip')
# NOTE: based on UMAP
# plot distribution of number of cells in pseudo-spots
# add number of cells in pseudo-spot to dataframe
plot_df['n_cell_in_spot'] = np.nan
plot_df.loc[plot_df['datatype']=='scrna-cell', 'n_cell_in_spot'] = scrna_n_cell
plot_df.loc[plot_df['datatype']=='scrna-pseudo', 'n_cell_in_spot'] = n_cell_in_spot
sns.set_style("darkgrid")
# generate a colormap with a specified color for NA (spatial spots), but not work for relplot...
# show the full legend of colorbar in relplot, otherwise it will only show a sample of evenly spaced values (The FacetGrid hue is categorical, not continuous)
# finally use plt.subplots instead
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, sharex=True, figsize=(6.4*4, 4.8*2))
# left panel: scatter plot of pseudo-spots + scRNA-seq cells
tmp_df = plot_df.loc[plot_df['dataset']=='scRNA-seq']
sc = ax1.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], c=tmp_df['n_cell_in_spot'], cmap='cubehelix', s=10, marker='o')
ax1.set_title('dataset = scRNA-seq')
ax1.set_xlabel('UMAP 1')
ax1.set_ylabel('UMAP 2')
# right panel: scatter plot of spatial spots + spatial pseudo-spots
# first draw spatial pseudo-spots, which lay at the bottom
tmp_df = plot_df.loc[plot_df['datatype']=='spatial-pseudo']
ax2.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], color=plot_colors['pseudo'], s=10, marker='o')
tmp_df = plot_df.loc[plot_df['datatype']=='spatial-spot']
ax2.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], color=plot_colors[f'spatial ({spatial_embed.shape[0]})'], s=10, marker='o')
ax2.set_title('dataset = spatial')
ax2.set_xlabel('UMAP 1')
ax2.set_ylabel('UMAP 2')
# add colorbar with title to the most right (https://stackoverflow.com/questions/13784201/how-to-have-one-colorbar-for-all-subplots, conflict with tight_layout)
cbar = fig.colorbar(sc, ax=ax2)
cbar.ax.set_title('#cell in spot')
fig.tight_layout()
fig.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'UMAP_latent_mu_embedding_color_by_num_cell_in_spot.png'))
plt.close()
# NOTE: based on UMAP
# plot distribution of cell-type proportions of each cell-type
# create a proportion dataframe with the same row order
prop_df = pd.concat([pseudo_spots_celltype_prop,
pd.DataFrame(0, index=['spatial_pseudo' + str(idx) for idx in range(pseudo_spatial_embed.shape[0])], columns=pseudo_spots_celltype_prop.columns),
scrna_cell_celltype_prop,
pd.DataFrame(0, index=spatial_transformed_df.index, columns=pseudo_spots_celltype_prop.columns)
], ignore_index=False)
assert prop_df.shape[0] == plot_df.shape[0]
assert (prop_df.index == plot_df.index).all()
# save multiple figures into one PDF
with PdfPages(os.path.join(diagnosis_path, 'CVAE_latent_space', 'UMAP_latent_mu_embedding_color_by_celltype_proportion.pdf')) as pdf:
for this_celltype in celltype_order:
# start to plot
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, sharex=True, figsize=(6.4*4, 4.8*2))
# left panel: scatter plot of pseudo-spots + scRNA-seq cells
tmp_df = plot_df.loc[plot_df['dataset']=='scRNA-seq']
# don't forget the .values
tmp_df = tmp_df.assign(proportion = prop_df.loc[plot_df['dataset']=='scRNA-seq', this_celltype].values)
sc = ax1.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], c=tmp_df['proportion'], cmap='cubehelix', s=10, marker='o', norm=Normalize(vmin=0, vmax=1))
ax1.set_title('dataset = scRNA-seq')
ax1.set_xlabel('UMAP 1')
ax1.set_ylabel('UMAP 2')
# right panel: scatter plot of spatial spots + spatial pseudo-spots
# we also interpolate the grid and draw a contour plot of cell type proportions in right panel
grid_x, grid_y = np.mgrid[tmp_df['UMAP1'].min():tmp_df['UMAP1'].max():0.025, tmp_df['UMAP2'].min():tmp_df['UMAP2'].max():0.025]
grid_z = griddata(tmp_df.loc[:, ['UMAP1', 'UMAP2']].values, tmp_df['proportion'].values, (grid_x, grid_y), method='linear', fill_value=np.nan)
try:
ax2.contourf(grid_x, grid_y, grid_z, cmap='cubehelix', norm=Normalize(vmin=0, vmax=1), alpha=0.3)
except:
pass
# first draw spatial pseudo-spots, which lay at the bottom
tmp_df = plot_df.loc[plot_df['datatype']=='spatial-pseudo']
ax2.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], color=plot_colors['pseudo'], s=10, marker='o')
tmp_df = plot_df.loc[plot_df['datatype']=='spatial-spot']
ax2.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], color=plot_colors[f'spatial ({spatial_embed.shape[0]})'], s=10, marker='o')
ax2.set_title('dataset = spatial')
ax2.set_xlabel('UMAP 1')
ax2.set_ylabel('UMAP 2')
# add colorbar with title
cbar = fig.colorbar(sc, ax=ax2)
cbar.ax.set_title('proportion')
fig.suptitle(this_celltype)
fig.tight_layout()
pdf.savefig(fig) # Save the current figure to pdf
plt.close(fig) # Close the figure to free memory
# plot PCA of latent space of spatial spots and scRNA-seq cells plus pseudo spots
# the order will affect the point overlay, first row draw first
all_pca = PCA(n_components=2).fit_transform(np.concatenate((pseudo_spot_embed, pseudo_spatial_embed, scRNA_embed, spatial_embed), axis=0))
# add cell/spot count in the annotation
plot_df = pd.DataFrame({'PC1': all_pca[:, 0],
'PC2': all_pca[:, 1],
'celltype': ['pseudo']*pseudo_spot_embed.shape[0] +
['pseudo']*pseudo_spatial_embed.shape[0] +
[f'{x} ({celltype_count_dict[x]})' for x in scRNA_celltype.celltype.to_list()] +
[f'spatial ({spatial_embed.shape[0]})']*spatial_embed.shape[0],
'dataset': ['scRNA-seq']*pseudo_spot_embed.shape[0] +
['spatial']*pseudo_spatial_embed.shape[0] +
['scRNA-seq']*scRNA_embed.shape[0] +
['spatial']*spatial_embed.shape[0],
'datatype': ['scrna-pseudo']*pseudo_spot_embed.shape[0] +
['spatial-pseudo']*pseudo_spatial_embed.shape[0] +
['scrna-cell']*scRNA_embed.shape[0] +
['spatial-spot']*spatial_embed.shape[0],
},
index = [f'scrna_pseudo{x}' for x in range(pseudo_spot_embed.shape[0])] +
[f'spatial_pseudo{x}' for x in range(pseudo_spatial_embed.shape[0])] +
scRNA_decode_df.index.to_list() +
spatial_transformed_df.index.to_list())
# plot colors already defined
sns.set_style("darkgrid")
# relplot return a FacetGrid object
# specify figure size by Height (in inches) of each facet, and Aspect ratio of each facet
fgrid = sns.relplot(data=plot_df, x='PC1', y='PC2', hue='celltype', s=20, marker='o', palette=plot_colors, kind='scatter', col='dataset', col_order=['scRNA-seq', 'spatial'], height=4.8*2, aspect=6.4/4.8)
fgrid.set(xlabel='PC 1', ylabel='PC 2')
# Put the legend out of the figure
#ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_latent_mu_embedding_color_by_celltype.png'))
plt.close()
# save PCA coordinates of latent space
plot_df.loc[plot_df['datatype']=='spatial-spot', ['PC1', 'PC2']].to_csv(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_coordinates_latent_mu_embedding_spatial_spots.csv.gz'), compression='gzip')
# save scRNA-seq cells PCA coordinates
plot_df.loc[plot_df['datatype']=='scrna-cell', ['PC1', 'PC2']].to_csv(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_coordinates_latent_mu_embedding_scRNA-seq_cells.csv.gz'), compression='gzip')
# Plot PCA density
# for safe to avoid program exit when density estimation failed
try:
plt.figure(figsize=(6.4*2, 4.8*2))
sns.set_style('whitegrid')
ax = sns.kdeplot(data=plot_df, x='PC1', y='PC2', fill=True)
ax.set(xlabel='PC 1', ylabel='PC 2')
plt.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_latent_mu_embedding_color_by_density.png'))
plt.close()
except:
pass
# UMAP of decoded spatial and scRNA-seq cell gene expression
# now we can add marker gene expression profile here
# the order will affect the point overlay, first row draw first
# umap has embeded seed (default 42), by specify random_state, umap will use special mode to keep reproducibility
# note we use normalized value rather than raw nUMI for spatial spots
tmp_df = pd.concat([spatial_transformed_df, scRNA_decode_df, scRNA_decode_avg_df], ignore_index=True)
# only use marker genes
if new_markers is not None:
tmp_df = tmp_df[new_markers]
all_umap = umap.UMAP(random_state=42).fit_transform(tmp_df.values)
# add cell/spot count in the annotation
plot_df = pd.DataFrame({'UMAP1': all_umap[:, 0],
'UMAP2': all_umap[:, 1],
'celltype': [f'spatial ({spatial_transformed_df.shape[0]})']*spatial_transformed_df.shape[0] +
[f'{x} ({celltype_count_dict[x]})' for x in scRNA_celltype.celltype.to_list()] +
[f'{x} ({celltype_count_dict[x]})' for x in scRNA_decode_avg_df.index],
'dataset': ['spatial']*spatial_transformed_df.shape[0] +
['scRNA-seq']*scRNA_decode_df.shape[0] +
['scRNA-seq']*scRNA_decode_avg_df.shape[0],
'datatype': ['cell/spot']*spatial_transformed_df.shape[0] +
['cell/spot']*scRNA_decode_df.shape[0] +
['marker']*scRNA_decode_avg_df.shape[0]
},
index = spatial_transformed_df.index.to_list() +
scRNA_decode_df.index.to_list() +
[f'{x}-marker' for x in scRNA_decode_avg_df.index])
plot_sizes = {'cell/spot': 20, 'marker': 200}
plot_markers = {'cell/spot': 'o', 'marker': 'X'}
# color for plot already defined
sns.set_style("darkgrid")
# relplot return a FacetGrid object
# specify figure size by Height (in inches) of each facet, and Aspect ratio of each facet
fgrid = sns.relplot(data=plot_df, x='UMAP1', y='UMAP2', hue='celltype', size='datatype', style='datatype', sizes=plot_sizes, markers=plot_markers, palette=plot_colors, kind='scatter', col='dataset', col_order=['scRNA-seq', 'spatial'], height=4.8*2, aspect=6.4/4.8)
fgrid.set(xlabel='UMAP 1', ylabel='UMAP 2')
# Put the legend out of the figure
#ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
# add cell-type annotations around marker coordinates
# adjustText do not support seaborn relplot
#from adjustText import adjust_text
# fgrid.axes return an array of all axes in the figure
ax = fgrid.axes[0, 0]
texts = []
for one_row in scRNA_decode_avg_df.index:
texts.append(ax.text(plot_df.at[one_row+'-marker', 'UMAP1'], plot_df.at[one_row+'-marker', 'UMAP2'], one_row, weight='bold'))
#adjust_text(texts)
plt.savefig(os.path.join(diagnosis_path, 'CVAE_transformed_data', 'UMAP_decoded_value_color_by_celltype.png'))
plt.close()
# also save UMAP coordinates
plot_df[['UMAP1', 'UMAP2']].to_csv(os.path.join(diagnosis_path, 'CVAE_transformed_data', 'UMAP_coordinates_decoded_value.csv.gz'), compression='gzip')
[docs]
def diagnosisParamsTuning(x, y, optimal_idx, x_label, y_label):
'''
generate scatter plot show performance in hyper parameter tuning by cross-validation
Parameters
----------
x : list
a list of candidates of hyper parameter
y : list
corresponding performance for each hyper parameter candidate. NOTE not all candidates have corresponding performance due to early stop
optimal_idx : int
index of list to indicate the optimal hyper parameter value which achieve the best performance
x_label : string
label of x axis in the plot, representing the number of hyper parameter
y_label : string
label of y axis in the plot, representing the performance metric used
Returns
-------
None.
'''
# need to create subfolders first, otherwise got FileNotFoundError
os.makedirs(os.path.join(diagnosis_path, 'GLRM_params_tuning'), exist_ok=True)
sns.set()
fig, ax = plt.subplots(figsize=(6.4*2, 4))
ax.plot(x[:len(y)], y, marker='o')
# if the range of the x-axis exceeds 100 (where the ratio of the largest to smallest value is greater than 100), we utilize a logarithmic scale for the x-axis in the plot.
# log scale do not support 0, so use symlog
# "linear threshold" determines the range around zero within which the scale will be linear ranther than instead of logarithmic to avoid distortion that occurs with logarithmic scaling near zero and make the plot more readable
if x[-1] / (x[0] + 1e-6) > 100:
ax.set_xscale('symlog', linthresh=x[1])
ax.set_xticks(x)
ax.set_xticklabels(x, rotation=45)
ax.axvline(x[optimal_idx], color='red', linestyle='--')
ax.set_xlabel(x_label)
ax.set_ylabel(y_label)
fig.tight_layout()
fig.savefig(os.path.join(diagnosis_path, 'GLRM_params_tuning', f'tuning_{x_label}_by_cv.png'))
plt.close()
[docs]
def diagnosisParamsSpotwiseLambdarTuning(x, strategy):
'''
generate histogram plot show optimal lambda_r in hyper parameter tuning by BIC
Parameters
----------
x : list
a list of optimal lambda_r
strategy : string
the used strategy for tuning lambda_r
Returns
-------
None.
'''
# need to create subfolders first, otherwise got FileNotFoundError
os.makedirs(os.path.join(diagnosis_path, 'GLRM_params_tuning'), exist_ok=True)
sns.set()
fig, ax = plt.subplots()
sns.histplot(x, ax=ax)
ax.set_xlabel('Optimal lambda_r')
ax.set_ylabel('Count')
fig.tight_layout()
fig.savefig(os.path.join(diagnosis_path, 'GLRM_params_tuning', f'tuning_lambda_r_by_{strategy}.png'))
plt.close()
[docs]
def plot_imputation(df, grid_df, contours, hierarchy, figname, figsize=(6.4, 4.8)):
'''
draw scatter plot of spatial spots and imputed spots for diagnosis.
Parameters
----------
df : dataframe
dataframe of spatial spots at original resolution, with columns 'x', 'y', 'border'.
grid_df : dataframe
dataframe of generated grid points at higher resolution, with columns 'x', 'y'.
contours : tuple
contours variable returned by cv2.findContours, used for creating grid.
hierarchy : 3-D numpy array (1 * #contours * 4)
hierarchy variable returned by cv2.findContours, used for creating grid.
figname: string
name of figure.
figsize : tuple, optional
figure size. The default is (6.4, 4.8).
Returns
-------
None.
'''
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
sns.set_style('white')
plt.figure(figsize=figsize)
# scatter plot of original spatial spots, highlighting edge spots
sns.scatterplot(data=df, x='x', y='y', hue='border', s=100, alpha=0.5, legend=False)
outer_edges = [] # each element is a single outer edge with several points
inner_edges = [] # each element is a single inner edge with several points
# Go through all contours and hierarchy
for i, (cnt, hrc) in enumerate(zip(contours, hierarchy[0])):
# Convert contour points back to original coordinates
cnt = cnt.reshape(-1, 2)
# Check if it's an outer or inner edge (hierarchy: [Next, Previous, First Child, Parent])
if hrc[3] == -1: # it's an outer edge if it has no parent
outer_edges.append(cnt)
else: # it's an inner edge if it has a parent
inner_edges.append(cnt)
# add edges
# connect the last point to the first one
if len(outer_edges) > 0:
for tmp_plot in outer_edges:
tmp_plot = np.append(tmp_plot, [tmp_plot[0]], axis=0)
plt.plot(tmp_plot[:, 0], tmp_plot[:, 1], 'g--')
if len(inner_edges) > 0:
for tmp_plot in inner_edges:
tmp_plot = np.append(tmp_plot, [tmp_plot[0]], axis=0)
plt.plot(tmp_plot[:, 0], tmp_plot[:, 1], 'r--')
# add generated high resolution grid
sns.scatterplot(data=grid_df, x='x', y='y', marker='X', color='k')
# need to create subfolders first, otherwise got FileNotFoundError
os.makedirs(os.path.join(diagnosis_path, 'imputation'), exist_ok=True)
plt.savefig(os.path.join(diagnosis_path, 'imputation', figname))
plt.close()