Statistics of the snRNA-seq IPF dataset
#only Mesenchymal cell lineage
import scanpy as sc
import matplotlib.pyplot as plt
import numpy as np
import scipy
import matplotlib.gridspec as gridspec
def func(pct, allvals):
absolute = int(pct/100.*np.sum(allvals))
return "{:.1f}%\n{:d}".format(pct, absolute)
#figure 2
mes = 'dataset.h5ad'
species = (
"Mesenchymal"
)
cells = [[] for _ in range(4)]
dataset = [mes]
sf = {}
n_patients = []
for i, each in enumerate(dataset):
adata = sc.read_h5ad(each)
for each in adata.obs['Surface.Density'].unique().tolist():
if i == 0:
sf[each] = []
ids = adata.obs[adata.obs['Surface.Density'] == each].index.tolist()
sf[each].append(len(ids))
for stage in adata.obs['stage'].unique():
stageadata = adata[adata.obs['stage'] == stage]
cells[int(stage)].append(stageadata.n_obs)
n_patients.append(len(stageadata.obs['Sample.ID'].unique()))
weight_counts = {
"Control": np.array(cells[0]),
"Stage 1": np.array(cells[1]),
"Stage 2": np.array(cells[2]),
"Stage 3": np.array(cells[3]),
}
width = 0.5
fig = plt.figure(figsize=(10, 9),dpi=300)
gs = gridspec.GridSpec(3, 10)
ax1 = fig.add_subplot(gs[0, :3])
ax2 = fig.add_subplot(gs[0, 3:])
ax3 = fig.add_subplot(gs[1, :])
ax4 = fig.add_subplot(gs[2, :4])
ax5 = fig.add_subplot(gs[2, 5:])
####panel a
ipf_sample = 0
control_sample = 0
sample_cells = {}
ipf_cells = 0
control_cells = 0
for each in ['Control', 'IPF']:#['Sample.ID']
condition_data = adata[adata.obs['Disease.Ident'] == each]
if each == 'IPF':
ipf_sample = len(list(condition_data.obs['Sample.ID'].unique()))
ipf_cells = len(condition_data)
else:
control_sample = len(list(condition_data.obs['Sample.ID'].unique()))
control_cells = len(condition_data)
for each in list(adata.obs['Sample.ID'].unique()):
sample_cells[each] = len(adata[adata.obs['Sample.ID'] == each])
ax1.pie([ipf_sample,control_sample],labels=['IPF','Control'],autopct=lambda pct: func(pct, [ipf_sample,control_sample]),center=(-3,0))
ax1.set_title('Sample Proportion')
###panel b
# fig, axs = plt.subplots(1,2,dpi=300,figsize= (8,4))
ssf = {key: sf[key] for key in sorted(sf,reverse=True)}
temp = {"Mesenchymal":[]}
temp2 = {}
for each in list(ssf.keys()):
temp2[each] = np.sum(ssf[each])
temp['Mesenchymal'].append(ssf[each][0])
x = np.arange(len(ssf)) # the label locations
width = 0.5 # the width of the bars
multiplier = 0
for attribute, measurement in temp.items():
rects = ax2.bar(tuple(x), measurement, width, label=attribute)
surface_density = list(ssf.keys())
#use .4f to round to 4 decimal places
surface_density = [f"{float(i):.4f}" for i in surface_density]
ax2.plot(0,0, "<k", transform=ax2.get_yaxis_transform(), clip_on=False)
# ax2.plot(-1,1, "^k", transform=ax2.get_xaxis_transform(), clip_on=False)
# ax2.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
ax2.legend(loc = 'best',prop={'size': 8})
ax2.spines['top'].set_visible(False)
ax2.spines['right'].set_visible(False)
ax2.set_xticks(x, surface_density,rotation=90,fontsize=4)
ax2.set_ylabel('Number of Cells')
ax2.set_xlabel('Surface Density')
ax2.set_title('Cells vs. Surface Density')
from scipy.stats import norm
def extractSDandID(adata,setofSD):
SD={}
for each in setofSD:
SD[each] = adata.obs[adata.obs['Surface.Density'] == each].index.tolist()
return SD
def extractControlandIPF(adata):
control=adata.obs[adata.obs['Disease.Ident'] == "Control"]#.index.tolist()
IPF= adata.obs[adata.obs['Disease.Ident'] == "IPF"]#.index.tolist()
controlSD=set(control['Surface.Density'])
IPFSD=set(IPF['Surface.Density'])
##control=extractSDandID(control)
#IPF=extractSDandID(IPF)
return controlSD, IPFSD
#SDdict = extractSDandID(adata)
control, IPF=extractControlandIPF(adata)
import matplotlib.pyplot as plt
#split datasets
a = []
b = []
c = []
d = []
def getAlpha(mu, sigma,data):
if norm.cdf(data,mu,sigma) == 0:
return 0.0000000000000000000000001
elif norm.cdf(data,mu,sigma) == 1:
return 0.0000000000000000000000001
elif norm.cdf(data,mu,sigma) <=0.5:
return float(norm.cdf(data,mu,sigma))
else:
return float(1-norm.cdf(data,mu,sigma))
def splitIPF_back(mu1, sigma1, mu2, sigma2, mu3, sigma3,data):
Stage_1 = []
Stage_2 = []
Stage_3 = []
IPF4 = []
for each in data:
#dis1 = abs(each-mu1)
#dis2 = abs(each-mu2)
alpha1 = getAlpha(mu1,sigma1, each)
alpha2 = getAlpha(mu2,sigma2, each)
score3 = getAlpha(mu3,sigma3, each)
#print(alpha1,alpha2)
score1 = alpha1#dis1/alpha1
score2 = alpha2#dis2/alpha2
totalScore = score1+score2+score3
Stage_1.append(score1/totalScore)
Stage_2.append(score2/totalScore)
Stage_3.append(score3/totalScore)
return np.array(Stage_1), np.array(Stage_2), np.array(Stage_3)
val = -1*np.array(list(IPF)+list(control))
IPF = -1*np.array(list(IPF))
control= -1*np.array(list(control))
mu1,sigma1,mu2,sigma2,mu3,sigma3 = [0.0045000000000000005, 0.0016733333333333335, 0.008666666666666666, 0.0006444444444444444, 0.0115, 0.001118888888888889]#[0.0045000000000000005, 0.0016733333333333335, 0.008666666666666666, 0.0006444444444444444, 0.0115, 0.001118888888888889]
# sigma4 = sigmas
mu4,sigma4 = [0.01378078078078078, 0.0011888888888888889]
Stage_1,Stage_2,Stage_3 = splitIPF_back(-1*mu1, sigma1, -1*mu2, sigma2,-1*mu3, sigma3, IPF)
k = np.array([Stage_1,Stage_2,Stage_3])
val_label = np.argmax(k,axis=0)
for i, item in enumerate(val_label):
if item == 0:
a.append(IPF[i])
elif item == 1:
b.append(IPF[i])
elif item ==2:
c.append(IPF[i])
a = np.array(a).reshape(-1,)
b = np.array(b).reshape(-1,)
c = np.array(c).reshape(-1,)
control = control.reshape(-1,)
n, bins, patches = ax3.hist([c,b,a,control], bins=len(val), label=['Stage 1','Stage 2','Stage 3','Control'], range=(val.min(),val.max()),alpha = 0.3,stacked=True,density=False)
l7 = ax3.plot(val, np.zeros_like(val)+0.1, 'x',color='black', label='Samples')
ax3.set_xlabel('Surface Density')
# ax3.legend(bbox_to_anchor=(1.08, 1), loc='upper left', borderaxespad=0.)
ax3.legend(loc='best')
ax3.set_ylabel('Number of Samples')
#print(len(val))
ax3_1 = ax3.twinx()
values = np.linspace(-0.01534466,0, 1000)
y1 = norm.pdf(values, -1*mu1, sigma1).reshape(-1,)
y2 = norm.pdf(values, -1*mu2, sigma2).reshape(-1,)
y3 = norm.pdf(values, -1*mu3, sigma3).reshape(-1,)
y4 = norm.pdf(values, -1*mu4, sigma4).reshape(-1,)
l4 = ax3_1.plot(values, y3,label='Stage 1',color='tab:blue')
l3 = ax3_1.plot(values, y2,label='Stage 2',color='tab:orange')
l2 = ax3_1.plot(values, y1,label='Stage 3',color='tab:green')
l5 = ax3_1.plot(values, y4,label='Control',color='tab:red')
l1 = ax3_1.plot(values, y1+y2+y3+y4,'--', c='tab:pink',label='Mixture')
ax3_1.legend(loc='best', bbox_to_anchor=(0.85, 0.98),borderaxespad=0.1)
ax3_1.set_ylabel('Probability Density')
ax3.spines['top'].set_visible(False)
ax3_1.spines['top'].set_visible(False)
###panel d
size = 0.4
vals = np.array(cells).T
cmap = plt.colormaps["tab20c"]
outer_colors = cmap(np.arange(3)*4)
inner_colors = cmap([0,1, 2,3,4,5, 6,7,8, 9, 10,11])
panel4 = vals.flatten()
panel4 =[panel4[0]/n_patients[0], panel4[1]/n_patients[1],panel4[2]/n_patients[2], panel4[3]/n_patients[3] ]
bc = ax5.barh([0.2,0.4,0.6,0.8],panel4[::-1],0.2, left=0,color= ['tab:red','tab:blue','tab:orange','tab:green'][::-1])
panel4 = [f"{float(i):.2f}" for i in panel4]
ax5.bar_label(bc, labels=panel4[::-1],label_type='center',fontsize=9)
ax5.set_yticks([0.2,0.4,0.6,0.8])
ax5.set_yticklabels(['Stage 3', 'Stage 2', 'Stage 1','Control '])
ax5.set_title('Number of Cells per Patient at Each Stage')
ax5.get_xaxis().set_ticks([])
ax5.spines['top'].set_visible(False)
ax5.spines['right'].set_visible(False)
ax5.spines['bottom'].set_visible(False)
ax5.spines['left'].set_visible(False)
###panel e
width = .2
bottom = 1
x = np.arange(len(species))
multiplier = 0
# fig,ax = plt.subplots(figsize=(6, 4),dpi=120)
c = ['C0', 'C1', 'C2','C4']
stages = []
bc = ax4.barh([0.2,0.4,0.6,0.8],vals.flatten()[::-1], width, left = 0, label='Mesenchymal',color=['tab:red','tab:blue','tab:orange','tab:green'][::-1])
ax4.bar_label(bc, labels=vals.flatten()[::-1],label_type='center',fontsize=9)
ax4.set_yticks([0.2,0.4,0.6,0.8])
ax4.set_yticklabels(['Stage 3', 'Stage 2', 'Stage 1','Control '])
# ax5.set_yticks(x+width, species,rotation=90)
ax4.annotate('Number of Cells at Each Stage',xy=(0.1,0.2683), xycoords='figure fraction')
for tick in ax4.yaxis.get_majorticklabels():
tick.set_horizontalalignment("right")
ax4.tick_params(axis='both', length=0, pad=7)
ax4.get_xaxis().set_ticks([])
ax4.spines['top'].set_visible(False)
ax4.spines['right'].set_visible(False)
ax4.spines['bottom'].set_visible(False)
ax4.spines['left'].set_visible(False)
plt.tight_layout()
plt.show()
Dendrograms for individual IPF stages
import scanpy as sc
import pickle
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
adata = sc.read_h5ad('dataset.h5ad')
adata.uns = pickle.load(open('attribute.pkl','rb'))
def plot_hc_dendrogram(adata,stage):
sch.dendrogram(adata.uns['hcmarkers'][str(stage)]['Z'],no_plot=True)
#replace the labels of the leaves with the labels of the original data
leaves = sch.leaves_list(adata.uns['hcmarkers'][str(stage)]['Z'])
adata.obs['leiden_celltype'] = adata.obs['leiden'].astype(str) +'_'+ adata.obs['ident'].astype(str)
stage0 = adata[adata.obs['stage'] == str(stage)]
sc.pl.umap(stage0,color='ident')
new_leaves = []
stage0.obs['leiden'] == stage0.obs['leiden'].astype(str)
for each in leaves:
temp = stage0[stage0.obs['leiden'] == str(each)]
new_leaves.append(temp.obs['leiden_celltype'].unique()[0])
plt.figure(figsize=(10,10),dpi=300)
ax = plt.gca()
sch.dendrogram(adata.uns['hcmarkers'][str(stage)]['Z'],ax=ax)
ax.set_xticklabels(new_leaves)
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig('dendrogram_stage_%d.pdf'%(stage))
plt.show()
for each in list(adata.obs['stage'].unique()):
plot_hc_dendrogram(adata,int(each))
Heatmap of level 0 hierarchical static markers in the Fibroblast Adventitial cluster at the control stage.
import pickle
import pandas as pd
import os
import matplotlib.pyplot as plt
import json
import numpy as np
import scanpy as sc
adata = sc.read_h5ad('dataset.h5ad')
uns = pickle.load(open('attribute.pkl', 'rb'))
adata.uns = uns
protein_coding_genes = []
for each in adata.var.index:
if each[:2] != 'AC' and each[:2] != 'AL' and each[:2] != 'AP' and each[:4] != 'LINC' and '.' not in each:
protein_coding_genes.append(each)
adata = adata[:,protein_coding_genes]
sc.set_figure_params(scanpy=True, dpi=80,dpi_save=300)
def clustertype40(adata):
'''
annotate the cluster with cells >40% if no one >40%, annotate with the highest one
args:
adata: anndata of one cluster
return: the most common cell types in the cluster
'''
dic = {}
total = 0
for each in adata.obs['name.simple']:
if each not in dic.keys():
dic[each]=1
else:
dic[each]+=1
total+=1
#print(dic)
anootate = ''
flag = False #flag to see if there are more than 1 cell types > 40%
for each in list(dic.keys()):
if dic[each] > total*0.5:
if flag == False:
anootate+=each
flag = True
else:
anootate+='/'+each
if flag == False:
for each in list(dic.keys()):
if dic[each] > total*0.4:
if flag == False:
anootate+=each
flag = True
else:
anootate+='/'+each
if flag == False:
for each in list(dic.keys()):
if dic[each] > total*0.3:
if flag == False:
anootate+=each
flag = True
else:
anootate+='/'+each
if flag == False:
for each in list(dic.keys()):
if dic[each] > total*0.2:
if flag == False:
anootate+=each
flag = True
else:
anootate+='/'+each
if flag == False:
anootate = 'NA'
return anootate
adata.obs['stage'] = adata.obs['stage'].astype(str)
adata = adata[adata.obs['stage'] == '0']
temp_count = 0
temp_clustertype = {}
choice = []
scope = []
scopes_cellids = []
adata.obs['leiden'] = adata.obs['leiden'].astype(str)
for xd, each in enumerate(adata.uns['clusterType']['0']):
if each =='FibroblastAdventitial':
choice.append(str(xd))
else:
temp_clustertype[str(xd)] = each
scope.append(str(xd))
choice_adata = adata[adata.obs['leiden'].isin(choice)]
scope_adata = adata[adata.obs['leiden'].isin(scope)]
choice_adata.obs['test'] = 'FibroblastAdventitial'
scope_adata.obs['test'] = 'FibroblastAlveolar'
apat = choice_adata.concatenate(scope_adata)
apat.layers['scaled'] = sc.pp.scale(apat, copy=True).X
marker_genes = []
pos = []
for i in range(len(temp_clustertype.keys())):
temp = [i*10,i*10+9]
pos.append(tuple(temp))
sc.tl.rank_genes_groups(apat,'test', method='wilcoxon', n_genes=100)
marker_genes = apat.uns['rank_genes_groups']['names']['FibroblastAdventitial']
sc.pl.rank_genes_groups_heatmap(apat,groupby=['ident','leiden'],layer='scaled', min_logfoldchange=1,n_genes=50,swap_axes=True, use_raw=False,show_gene_labels=True,vmin=-0.5, vmax=0.5, cmap='RdBu_r',var_group_positions=pos,dendrogram=False, var_group_labels=[str(i) for i in range(len(apat.obs['test'].keys()))],var_group_rotation=0,figsize=(20, 16),save='_sf3.pdf')
Heatmap of level 4 hierarchical static markers in the Fibroblast Adventitial cluster at the control stage.
import pickle
import pandas as pd
import os
import matplotlib.pyplot as plt
import json
import numpy as np
import scanpy as sc
adata = sc.read_h5ad('dataset.h5ad')
uns = pickle.load(open('attribute.pkl', 'rb'))
adata.uns = uns
protein_coding_genes = []
for each in adata.var.index:
if each[:2] != 'AC' and each[:2] != 'AL' and each[:2] != 'AP' and each[:4] != 'LINC' and '.' not in each:
protein_coding_genes.append(each)
adata = adata[:,protein_coding_genes]
sc.set_figure_params(scanpy=True, dpi=80,dpi_save=300)
def clustertype40(adata):
'''
annotate the cluster with cells >40% if no one >40%, annotate with the highest one
args:
adata: anndata of one cluster
return: the most common cell types in the cluster
'''
dic = {}
total = 0
for each in adata.obs['name.simple']:
if each not in dic.keys():
dic[each]=1
else:
dic[each]+=1
total+=1
#print(dic)
anootate = ''
flag = False #flag to see if there are more than 1 cell types > 40%
for each in list(dic.keys()):
if dic[each] > total*0.5:
if flag == False:
anootate+=each
flag = True
else:
anootate+='/'+each
if flag == False:
for each in list(dic.keys()):
if dic[each] > total*0.4:
if flag == False:
anootate+=each
flag = True
else:
anootate+='/'+each
if flag == False:
for each in list(dic.keys()):
if dic[each] > total*0.3:
if flag == False:
anootate+=each
flag = True
else:
anootate+='/'+each
if flag == False:
for each in list(dic.keys()):
if dic[each] > total*0.2:
if flag == False:
anootate+=each
flag = True
else:
anootate+='/'+each
if flag == False:
anootate = 'NA'
return anootate
adata.obs['stage'] = adata.obs['stage'].astype(str)
adata = adata[adata.obs['stage'] == '0']
temp_count = 0
temp_clustertype = {}
choice = []
scope = []
# choice = 17
# scope = [17,4,7]
scopes_cellids = []
adata.obs['leiden'] = adata.obs['leiden'].astype(str)
for xd, each in enumerate(adata.uns['clusterType']['0']):
if each =='FibroblastAdventitial':
choice.append(str(xd))
elif each =='FibroblastAlveolar':
temp_clustertype[str(xd)] = each
scope.append(str(xd))
choice_adata = adata[adata.obs['leiden'].isin(choice)]
scope_adata = adata[adata.obs['leiden'].isin(scope)]
choice_adata.obs['test'] = 'FibroblastAdventitial'
scope_adata.obs['test'] = 'FibroblastAlveolar'
apat = choice_adata.concatenate(scope_adata)
apat.layers['scaled'] = sc.pp.scale(apat, copy=True).X
marker_genes = []
pos = []
for i in range(len(temp_clustertype.keys())):
temp = [i*10,i*10+9]
pos.append(tuple(temp))
sc.tl.rank_genes_groups(apat,'test', method='wilcoxon', n_genes=100)
marker_genes = apat.uns['rank_genes_groups']['names']['FibroblastAdventitial']
sc.pl.heatmap(apat, marker_genes[:25], groupby=['test','leiden'],layer='scaled',swap_axes=True,vmin=-0.5, vmax=0.5, cmap='RdBu_r',figsize=(20, 16))#,save='inter_heatmap.pdf')#,var_group_labels=[str(i) for i in range(len(apat.obs['test'].keys()))])
sc.pl.rank_genes_groups_heatmap(apat,groupby=['test','leiden'],layer='scaled', min_logfoldchange=1,n_genes=50,swap_axes=True, use_raw=False,show_gene_labels=True,vmin=-0.5, vmax=0.5, cmap='RdBu_r',var_group_positions=pos,dendrogram=False, var_group_labels=[str(i) for i in range(len(apat.obs['test'].keys()))],var_group_rotation=0,figsize=(20, 16),save='_sf4.pdf')
Visualizing the overlapping between dynamic proteins and markers using the Venn plot
import pandas as pd
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
import pickle as pkl
protein = pd.read_csv('protein_validation.csv')
protein = protein.set_index('gene')
all_protein = protein.index.tolist()
protein = protein.loc[(protein['padj'] <= 0.01)] #remove unsignificant proteins
protein_genes = protein.index.unique().tolist()
import pickle
adata = sc.read_h5ad('dataset.h5ad')
adata.uns = pickle.load(open('attribute.pkl', 'rb'))
adata_genes = adata.var_names.tolist()
from UNAGI.getProgressionTopMarkers import runGetProgressionMarker_one_dist
import numpy as np
background = np.load('progressionmarker_background.npy',allow_pickle=True).tolist()
one_dist = []
for each in background.keys():
one_dist.append(np.array(background[each]))
one_dist = np.array(one_dist).reshape(-1, 2484)
dpm = runGetProgressionMarker_one_dist('idremVizCluster',size=2484,background = background,cutoff=0.01)
intersection = np.intersect1d(protein_genes, adata_genes)
print('protein genes in adata: ', len(intersection))
intersection_protein = protein.loc[intersection]
all_intersection = np.intersect1d(all_protein, adata_genes)
from scipy.stats import hypergeom, binom
#M: 151 significant protein overlapped with IPF data adjusted_pval <0.05
#N: 2482; M 151; n track markers (pval<0.05); m track intersection
def hypergeometric_test(N, M, n, m):
rv = hypergeom(N, M, n)
probability = rv.pmf(m)
return probability
all_markers = []
significances = []
temp = {}
temp['Dynamic Markers'] = []
temp['Dynamic Protein Coding Genes'] = []
pvals = []
for track in dpm.keys():
track_markers = []
for direction in dpm[track].keys():
df = pd.DataFrame.from_dict(dpm[track][direction])
track_markers+=list(df['gene'].values)
all_markers+=list(df['gene'].values)
# print(attribute['progressionMarkers'][track][direction]['qval'])
track_markers = np.intersect1d(track_markers, adata_genes)
temp['Dynamic Markers'].append(len(track_markers))
track_intersection = np.intersect1d(intersection, track_markers)
temp['Dynamic Protein Coding Genes'].append(len(track_intersection))
print('track: ', track, 'intersection: ', len(track_intersection), 'track markers: ', len(track_markers))
significance = hypergeometric_test(len(adata_genes), len(intersection), len(track_markers), len(track_intersection))
#bionomial test
# print(len(track_intersection), len(track_markers), len(intersection),len(adata_genes))
# significance = 1-binom.cdf(len(track_intersection)-1, len(track_markers), len(intersection)/len(all_intersection))
significances.append(significance)
print('significance: ', significance)
pvals.append(significance)
all_markers = np.intersect1d(list(set(all_markers)),adata_genes)
all_track_intersection = np.intersect1d(intersection, all_markers)
all_track_intersection = list(set(all_track_intersection))
print('all track intersection: ', len(all_track_intersection), 'all markers: ', len(all_markers))
significance = hypergeometric_test(len(adata_genes), len(intersection), len(all_markers), len(all_track_intersection))
# significance = binom.cdf(len(all_track_intersection)-1, len(all_markers), len(intersection)/len(all_intersection))
print('significance: ', significance)
significances.append(significance)
temp['Dynamic Markers'].append(len(all_markers))
temp['Dynamic Protein Coding Genes'].append(len(all_track_intersection))
import matplotlib.pyplot as plt
x = np.arange(len(significances)) # the label locations
width = 0.5 # the width of the bars
multiplier = 0
fig, ax = plt.subplots(figsize=(10, 5),dpi=200)
flag = True
od = np.argsort(significances)
track_names = list(dpm.keys())
track_names.append('All Tracks')
track_names = ['FibAlv-4','All Tracks','VEven-2','LymEnd-19','VEcap-1','FibAdv-17','VEaero-3','SMC-18','VEaero-0','PerAlv-12','VEcap-6']
i=0
labels = []
from statsmodels.stats.multitest import fdrcorrection
for each in np.array(significances)[od]:
if each <0.05 and each >= 0.01:
labels.append('*')
elif each >=0.001 and each < 0.01:
labels.append('**')
elif each < 0.001:
labels.append('***')
else:
labels.append('ns')
for att, measurement in temp.items():
measurement = np.array(measurement).flatten()[od]
rects = ax.bar(x, measurement, width, label=att)
i+=1
if flag:
ax.bar_label(rects,labels,padding=0.5)
flag=False
plt.legend()
plt.ylabel('Number of Genes')
plt.xticks(x, np.array(track_names),rotation = 90)
plt.tight_layout()
plt.savefig('sf dynamic marker and protein overlapping.pdf',dpi=300)
plt.show()
from matplotlib_venn import venn3, venn3_circles
set1 = set(adata_genes)
set2 = set(protein_genes)
set3 = set(all_markers)
plt.figure(figsize=(10,10),dpi=300)
v = venn3([set1,set2,set3],('','',''))
v.get_patch_by_id('100').set_color('blue')
v.get_patch_by_id('010').set_color('green')
v.get_patch_by_id('110').set_color('tab:blue')
v.get_patch_by_id('110').set_alpha(1)
v.get_patch_by_id('111').set_color('tab:orange')
v.get_patch_by_id('111').set_alpha(1)
plt.title('Dynamic Markers vs Protein')
plt.savefig('sf_venn_protein_and_markers.pdf',dpi=300)
plt.show()
UMAPs of the whole snRNA-seq IPF dataset based on the learned latent representation ‘Z’
import scanpy as sc
sc.set_figure_params(dpi_save=300)
adata = sc.read_h5ad('dataset.h5ad')
adata.obs['ident'] = adata.obs['ident'].astype(str)
del adata.obsm['X_umap']
del adata.obsm['umap']
sc.pp.neighbors(adata,use_rep='z')
sc.tl.umap(adata,min_dist=0.3)
sc.pl.umap(adata,color='stage')#,save='_all_stages.pdf')
sc.pl.umap(adata,color='ident')#,save='_all_cell_type.pdf')
UMAPs of the whole snRNA-seq IPF dataset based on the PCA embeddings
import scanpy as sc
sc.set_figure_params(dpi_save=300)
adata = sc.read_h5ad('dataset.h5ad')
adata.obs['ident'] = adata.obs['ident'].astype(str)
del adata.obsm
sc.pp.neighbors(adata)
sc.tl.umap(adata,min_dist=0.3)
sc.pl.umap(adata,color='stage',save='_all_stages.pdf')
sc.pl.umap(adata,color='ident',save='_all_cell_type.pdf')
Compare the in-silico and real perturbation outcomes on top differentially expressed genes
NIF and NIN reconstruction of top 100 treatment markers and ECM organization
import scanpy as sc
import numpy as np
import torch
from scipy.sparse import issparse
from torch.utils.data import DataLoader
from sklearn.neighbors import kneighbors_graph
from UNAGI.perturbation_general import *
from UNAGI.ppi import getCompleteNodes,getCompleteNodesPCA,runPPIthreads,getPPINetworkDict
import anndata
from zig_my import *
# sc.set_figure_params(dpi=180)
np.random.seed(0)
adata0 = sc.read_h5ad('../pcls/0.h5ad')
adata1 = sc.read_h5ad('../pcls/1.h5ad')
adata0 = adata0[adata0.obs['condition2'] == 'Control']
adata1 = adata1[adata1.obs['condition2'] == 'Fibrotic_Cocktail']
adata_nif = sc.read_h5ad('../3.h5ad')
adata_nin = sc.read_h5ad('../2.h5ad')
def getImpactFactors(adata,gene,PPINetworkDict): ##new merged
ids = adata
PCA = getCompleteNodesPCA(ids,0,ids)
genenames = adata.var.index.tolist()
#genenames = genenames.tolist()
genetable = {}
for i, each in enumerate(genenames):
genetable[each] = i
#k=np.load('./PPINetworkDict.npy',allow_pickle=True)
PPIDict=PPINetworkDict
table = runPPIthreads(ids,PCA,genetable,PPIDict,gene)
return table
def getZandZc(adata,impactfactor=None):
clusterAdata = adata
input = clusterAdata.X
if issparse(input):
input = input.toarray()
data = input
loadModelDict = './pcls/model_save/zig_0.pth'
vae = VAE(data.shape[1], 256 , 1024,64)
vae.load_state_dict(torch.load(loadModelDict,map_location=torch.device('cpu')))
vae.eval()
recons = []
print(data.shape)
if impactfactor is not None:
cell_loader = DataLoader(data.astype('float32'), batch_size=len(data), shuffle=False, num_workers=0)
else:
cell_loader = DataLoader(data.astype('float32'), batch_size=len(data), shuffle=False, num_workers=0)
if 'gcn_connectivities' not in clusterAdata.obsp.keys():
adj = kneighbors_graph(adata.obsm['X_pca'], 24, mode='connectivity', include_self=False,n_jobs=20)
adj.setdiag(25)
adj = torch.tensor(adj.toarray().astype('float32'))
else:
adj = torch.tensor(clusterAdata.obsp['gcn_connectivities'].toarray().astype('float32'))
for x in cell_loader:
if impactfactor is not None:
impactfactor = impactfactor.astype('float32')
x = x+impactfactor
print(x.shape)
recons = recons.cpu().detach().numpy().tolist()
# recons += vae.get_latent_representation(x,adj).cpu().detach().numpy().tolist()#vae.generate(x,adj,1).cpu().detach().numpy().tolist()
recons = np.array(recons)
return recons
fibrosis_mean = np.mean(np.array(adata1.X.toarray()),axis=0)
nif_mean = np.mean(np.array(adata_nif.X.toarray()),axis=0)
differences = nif_mean - fibrosis_mean
impactfactors = differences.reshape(1,-1)
adata1_perturbed = adata1.copy()
adata1_perturbed_z = getZandZc(adata1_perturbed,impactfactors)
adata1_perturbed = anndata.AnnData(X = adata1_perturbed_z,obs=adata1_perturbed.obs,var=adata1_perturbed.var)
nin_mean = np.mean(np.array(adata_nin.X.toarray()),axis=0)
differences = nin_mean - fibrosis_mean
impactfactors = differences.reshape(1,-1)
adata2_perturbed = adata1.copy()
adata2_perturbed_z = getZandZc(adata2_perturbed,impactfactors)
adata2_perturbed = anndata.AnnData(X = adata2_perturbed_z,obs=adata2_perturbed.obs,var = adata2_perturbed.var)
adatanif_z = getZandZc(adata_nif)
adatanif = anndata.AnnData(X = adatanif_z,obs=adata_nif.obs,var = adata_nif.var)
adata1_z = getZandZc(adata1)
adata1_z = anndata.AnnData(X = adata1_z,obs=adata1.obs,var = adata1.var)
adata0_z = getZandZc(adata0)
adata0_z = anndata.AnnData(X = adata0_z,obs=adata0.obs, var = adata0.var)
adata2_z = getZandZc(adata_nin)
adata2_z = anndata.AnnData(X = adata2_z,obs=adata_nin.obs, var = adata0.var)
adata0_z.obs['test'] = 'healthy'
adata1_z.obs['test'] = 'fibrosis'
adata2_z.obs['test'] = 'intedanib'
adatanif.obs['test'] = 'Nifedipine'
adata1_perturbed.obs['condition2'] = 'Fibrotic_Cocktail_Nifedipine_Perturbed'
adata2_perturbed.obs['condition2'] = 'Fibrotic_Cocktail_Nintedanib_Perturbed'
asdf = [adata1_z,adata2_z,adatanif,adata2_perturbed,adata1_perturbed]
final = anndata.concat(asdf)
sc.pp.neighbors(final,use_rep='X')
sc.tl.umap(final)
sc.pl.umap(final,color='condition2',palette={
"Fibrotic_Cocktail_Nifedipine_Perturbed": "tab:cyan",
"Fibrotic_Cocktail_Nintedanib_Perturbed": "tab:pink",
"Fibrotic_Cocktail": "tab:green",
"Fibrotic_Cocktail_Nintedanib": "tab:brown",
"Fibrotic_Cocktail_Nifedipine": "tab:purple"
})
nif_perturbed = adata1_perturbed
nin_perturbed = adata2_perturbed
nif_perturbed.obs['condition3'] = 'nifedipine_in_silico'
adata_nif.obs['condition3'] = 'nifedipine'
adata1.obs['condition3'] = 'fibrosis'
nin_perturbed.obs['condition3'] = 'nintedanib_in_silico'
adata_nin.obs['condition3'] = 'nintedanib'
final = anndata.concat([adata1, adata_nif,nif_perturbed])
# final = anndata.concat([adata1, adata_nin,nin_perturbed])
sc.tl.rank_genes_groups(final, 'condition3', method='wilcoxon',rankby_abs=True, reference='fibrosis')
def my_logfold_change(adata1, adata2,topN,log_fold_change_cutoff=None,abs = False):
genenames = adata1.var_names.tolist()
mean1 = np.mean(adata1.X.toarray(),axis=0)
mean2 = np.mean(adata2.X.toarray(),axis=0)
logfold_change = mean2 - mean1
logfold_change_dict = {}
for i, each in enumerate(genenames):
logfold_change_dict[each] = logfold_change[i]
if abs ==True:
temp = {k: v for k, v in sorted(logfold_change_dict.items(), key=lambda item: np.abs(item[1]),reverse=True)}
else:
temp = {k: v for k, v in sorted(logfold_change_dict.items(), key=lambda item: item[1],reverse=True)}
df = pd.DataFrame()
df['logfoldchange'] = temp.values()
df['genes'] = temp.keys()
if log_fold_change_cutoff is not None:
df = df[np.abs(df['logfoldchange'])>log_fold_change_cutoff]
topDEG = list(df[:topN]['genes'].values)
return topDEG,df
from scipy.stats import f_oneway, ks_2samp, mannwhitneyu, wilcoxon, ttest_ind,ranksums,wasserstein_distance,chisquare,pearsonr,energy_distance
from scipy.stats import linregress
from sklearn.metrics import r2_score
def calculate_r_squared(y_true, y_pred):
ssr = np.sum((y_true - y_pred) ** 2)
sst = np.sum((y_true - np.mean(y_true)) ** 2)
r_squared = 1 - (ssr / sst)
return r_squared
def calculate_adjusted_r_squared(r_squared, n, k):
adjusted_r_squared = 1 - (1 - r_squared) * ((n - 1) / (n - k - 1))
return adjusted_r_squared
sc.set_figure_params(dpi_save=300,figsize=[20,3])
def cosine_similarity(v1, v2):
# Compute the dot product of v1 and v2
dot_product = np.dot(v1, v2)
# Compute the L2 norm of v1
norm_v1 = np.linalg.norm(v1)
# Compute the L2 norm of v2
norm_v2 = np.linalg.norm(v2)
# Compute cosine similarity
similarity = dot_product / (norm_v1 * norm_v2)
return similarity
def adjusted_r2(r2, n, k):
"""
Calculate the adjusted R-squared.
Parameters:
- r2: The regular R-squared value
- n: The number of observations
- k: The number of predictors (excluding the intercept)
Returns:
- The adjusted R-squared value
"""
return 1 - ((1 - r2) * (n - 1) / (n - k - 1))
# Example usage:
def draw_real_recons_heatmap(adata, adata1, adata2, genes,gorupby, enmax_palette, ymax, topN=20,use_zero=False,save_fig = False,cut = False,similarity=False):
temp = []
pval = []
for idx, each in enumerate(genes):
if cut:
if np.median(adata1[:,each].X.toarray()) <1:
temp.append(0)
pval.append(0)
continue
else:
print(np.median(adata1[:,each].X.toarray()))
obs_nif = adata1[:,each].X.toarray().flatten()
obs_nif_perturbed = adata2[:,each].X.toarray().flatten()
if use_zero:
pair1 = energy_distance(obs_nif_perturbed,obs_nif)
else:
pair1 = ks_2samp(obs_nif_perturbed[obs_nif_perturbed>0],obs_nif[obs_nif>0],alternative='two-sided')[1]
pval.append(ks_2samp(obs_nif_perturbed[obs_nif_perturbed>0],obs_nif[obs_nif>0],alternative='two-sided')[1])
temp.append(pair1)
#z_score norm
# adata.layers['scaled'] = sc.pp.scale(adata, copy=True).X
print(np.array(pval)[np.argsort(temp)[::-1]][:topN])
protencoding = [each for each in genes if 'AC' != each[:2] and 'AL' != each[:2] and 'AP' != each[:2] and 'LINC' != each[:4] ]
# sc.tl.rank_genes_groups(adata, gorupby, method='wilcoxon',rankby_abs=True, reference='perturbed')
if similarity == False:
df = sc.get.obs_df(adata, list(np.array(protencoding)[:topN])+['condition3'])
else:
df = sc.get.obs_df(adata, list(np.array(genes)[np.argsort(temp)[::-1]][:topN])+['condition3'])
custom_dict = {}
for i ,each in enumerate(list(np.array(genes)[np.argsort(temp)[::-1]][:topN])):
custom_dict[each] = i
df = df.set_index('condition3').stack().reset_index()
df.columns = ['condition3', 'gene', 'value']
df.sort_values(by=['gene'], key=lambda x: x.map(custom_dict))
print(df['gene'])
# df = df[df['value']>0]
#sort df based on gene order
#calcualte the mean expression of all genes in each condition
print(list(np.array(genes)[np.argsort(temp)[::-1]][:topN]))
original_mean = np.mean(np.array(adata1[:,np.array(genes)[np.argsort(temp)[::-1]]].X.toarray()),axis=0)
perturbed_mean = np.mean(np.array(adata2[:,np.array(genes)[np.argsort(temp)[::-1]]].X.toarray()),axis=0)
#calculate pearson correlation between original and perturbed
print('pearsonr correlation:',pearsonr(original_mean,perturbed_mean))
print('cosine similarity:',cosine_similarity(original_mean,perturbed_mean))
# Generate some sample data
np.random.seed(0)
x = original_mean
y = perturbed_mean
# Perform linear regression
print('raw:', x)
print('perturbed:',y)
rsquare = calculate_r_squared(x,y)
rsquare = adjusted_r2(rsquare,len(x),1)
print("R^2 value:",rsquare)
slope, intercept, r_value, p_value, std_err = linregress(x, y,alternative='greater')
adjusted_r2_val = adjusted_r2(r_value,len(x),1)
print('adjusted R^2:',adjusted_r2_val)
# Create scatter plot and plot regression line
fig,ax = plt.subplots(figsize=(3,3),dpi=100)
ax.scatter(x, y, label='Data', color='blue')
ax.plot(x, slope * x + intercept, label=f'Fit (R^2 = {rsquare:.2f})', color='red')
ax.legend(bbox_to_anchor=(1, 1))
# ax.grid(False)
plt.xlabel('original')
plt.ylabel('perturbed')
plt.title('$R^2$ value')
plt.savefig('R^2'+save_fig,dpi=300)
print('R-val:', r_value)
print(f"P-value: {p_value:.4f}")
if save_fig == False:
fig,ax = plt.subplots(figsize=(15,3),dpi=100)
sns.boxplot(data=df, x='gene', y='value', hue='condition3', palette=enmax_palette)
# sns.violinplot(data=df, x='gene', y='value', hue='condition3', split=True, palette=enmax_palette,scale='area')
ax.grid(False)
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
plt.ylim(0,ymax)
plt.show()
else:
fig,ax = plt.subplots(figsize=(15,3),dpi=100)
sns.boxplot(data=df, x='gene', y='value', hue='condition3', palette=enmax_palette)
# sns.violinplot(data=df, x='gene', y='value', hue='condition3', split=True, palette=enmax_palette,scale='area')
ax.grid(False)
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1),fontsize=7)
#rotate xticklabels 90 degree
for item in ax.get_xticklabels():
item.set_rotation(90)
#make x ticks label smaller
ax.tick_params(axis='x', which='major', labelsize=7)
plt.ylim(0,ymax)
#squize the figure
plt.tight_layout()
plt.savefig(save_fig,dpi=300)
plt.show()
def rankgene_logfold_change(adata1, adata2,gene_list,abs = False):
genenames = adata1.var_names.tolist()
mean1 = np.mean(adata1.X.toarray(),axis=0)
mean2 = np.mean(adata2.X.toarray(),axis=0)
logfold_change = mean2 - mean1
logfold_change_dict = {}
for i, each in enumerate(genenames):
logfold_change_dict[each] = logfold_change[i]
if abs ==True:
temp = {k: v for k, v in sorted(logfold_change_dict.items(), key=lambda item: np.abs(item[1]),reverse=True)}
else:
temp = {k: v for k, v in sorted(logfold_change_dict.items(), key=lambda item: item[1],reverse=True)}
df = pd.DataFrame()
df['logfoldchange'] = temp.values()
df['genes'] = temp.keys()
ranked_gene_list = []
for each in temp.keys():
if each in gene_list:
ranked_gene_list.append(each)
return ranked_gene_list
import seaborn as sns
nif = [adata_nif,nif_perturbed]
nif = anndata.concat(nif)
test_gene,df = my_logfold_change(adata1, adata_nif,100)
enmax_palette = sns.color_palette(["#00A087", "#4DBBD5"])
draw_real_recons_heatmap(nif, adata_nif, nif_perturbed, test_gene,'condition3',enmax_palette, ymax=4,similarity=False, topN=100,use_zero=False,cut=True,save_fig='box_nif_topN100_100.pdf')
ecm_genes = ['MUSK', 'TLL1', 'VCAN', 'CDH1', 'TNC', 'LTBP1', 'ELN', 'LAMC3', 'COL23A1', 'LAMA3', 'LAMC2', 'COL11A1', 'COL17A1', 'NTN4', 'FBLN1', 'ITGA8', 'COL5A3', 'COL4A4', 'CEACAM6', 'ICAM1', 'P3H2', 'LAMB1', 'ITGA6', 'TGFB2', 'MMP11', 'MMP9', 'TIMP1', 'COMP', 'ITGB8', 'SERPINE1', 'ICAM2', 'COL1A1', 'VWF', 'COL12A1', 'SPARC', 'COL7A1', 'ITGB6', 'FN1', 'TNR', 'SPP1', 'LTBP2', 'MMP19', 'COL10A1', 'SDC4', 'COL21A1', 'BMP2', 'KDR', 'COL5A1', 'MATN3', 'COL4A2', 'ADAMTS8', 'CAPN9', 'CTSV', 'MMP7', 'THBS1', 'ITGA11', 'LOXL4', 'ADAMTS14', 'FGF2', 'FBN2', 'LUM', 'ADAMTS18', 'ITGA9', 'COL8A1', 'ADAMTS16', 'HAPLN1', 'ADAM12', 'NCAM1', 'PLOD2', 'ADAMTS5', 'MMP16', 'ADAMTS3', 'ACAN', 'VCAM1', 'CAPN13', 'COL6A3', 'ADAMTS9', 'ITGA2', 'COL1A2', 'FBN1', 'MMP10', 'BMP1', 'COL3A1', 'COL4A3', 'COL22A1', 'COL24A1', 'FGG', 'FGA', 'NRXN1', 'BGN', 'COL4A1', 'COL14A1', 'COL25A1', 'SPOCK3', 'LAMA2', 'MMP1', 'COL27A1', 'LAMB3', 'COL4A6', 'MFAP5', 'DMD', 'CAPN8', 'COL5A2', 'COL15A1', 'COL6A6', 'ITGA1', 'COL28A1']
ecm_genes = sorted(list(set(adata_nif.var_names.tolist() ) & set(ecm_genes)))
# col = adata_nif.var_names[adata_nif.var_names.str.startswith('COL')]
ecm_genes = rankgene_logfold_change(adata1, adata_nif,ecm_genes,abs=True)
draw_real_recons_heatmap(nif, adata_nif, nif_perturbed, ecm_genes,'condition3',enmax_palette,ymax=4.5,topN=len(ecm_genes),similarity=False,use_zero=False,save_fig='nif_ecm_sup.pdf')
nin = [adata_nin,nin_perturbed]
enmax_palette = sns.color_palette(["#EE4C97","#fccde5"])
nin = anndata.concat(nin)
test_gene,df = my_logfold_change(adata1, adata_nin,100)
draw_real_recons_heatmap(nin, adata_nin, nin_perturbed, test_gene,'condition3',enmax_palette,ymax=4, topN=100,use_zero=False,similarity=False,cut=True,save_fig='box_nin_topN100_100.pdf')
enmax_palette = sns.color_palette(["#EE4C97", "#fccde5"])
nin = anndata.concat(nin)
ecm_genes = rankgene_logfold_change(adata1, adata_nin,ecm_genes,abs=True)
draw_real_recons_heatmap(nin, adata_nin, nin_perturbed, ecm_genes,'condition3',enmax_palette,ymax=4.5,topN=len(ecm_genes),similarity=False,use_zero=False,save_fig='nin_ecm_sup.pdf')#,save_fig='nin_neg_topN10_500.pdf')
COVID-19 cell type composition
import pickle
import pandas as pd
import os
import matplotlib as mpl
import json
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc
adata = sc.read_h5ad('../covid/dataset.h5ad')
uns = pickle.load(open('../covid/attribute.pkl', 'rb'))
all_types = adata.obs['ident'].unique().tolist()
all_types = sorted(all_types)
stage_types = {'Stage 3':[],'Stage 2':[],'Stage 1':[],'Control':[]}
stage_keys = ['Stage 3', 'Stage 2','Stage 1','Control']
for i in range(4):
stage_adata = adata[adata.obs['stage'] == str(i)]
stage_cells = len(stage_adata)
for each in all_types:
stage_types[stage_keys[3-i]].append(len(stage_adata[stage_adata.obs['ident'] == each])/stage_cells)
multiplier = 0
width = 0.3
plt,ax = plt.subplots(figsize=(15, 3),dpi=300)
#transform the type of list to array
# y_pos = [0, 1]
for i in stage_types.keys():
stage_types[i] = np.array(stage_types[i])
bottom = 0
data = np.array(list(stage_types.values()))
data_cum = data.cumsum(axis=1)
category_colors = mpl.colormaps['Spectral'](
np.linspace(0.1, 1, data.shape[1]))
for j, (types,c) in enumerate(zip(all_types,category_colors)):
widths = data[:,j]
starts = data_cum[:, j] - widths
# labels = list(stage_types.keys())
# labels = [labels[v] if val > 0.05 else "" for v, val in enumerate(widths)]
bc = ax.barh(list(stage_types.keys()), widths, left = starts, label=types,color=c,height=0.8)
# labels = [f'%.2f%%'%(v*100) if v > 0.04 else "" for v in widths]
labels = ["" if v > 0.04 else "" for v in widths]
# ax.bar_label(c, labels=labels, label_type="center",fmt='%.2f')
# bottom += weight_count
# multiplier += 1
ax.bar_label(bc, labels=labels,label_type='center',fmt='%.2f',fontsize=9,color='black')
ax.legend(ncol=len(all_types)//2+1, bbox_to_anchor=(0.1, 1.3),
loc='upper left', fontsize='x-small')
# threshold = 0.15
for c in ax.containers:#comments out to disable percentage on the plot
labels = [f'%.2f%%'%(v*100) if v > 0.05 else "" for v in c.datavalues] #comments out to disable percentage on the plot
ax.bar_label(c, labels=labels, label_type="center",fmt='%.2f')#comments out to disable percentage on the plot
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.get_xaxis().set_ticks([])
ax.grid(False)
plt.tight_layout()
plt.savefig('harmony_cell_composition_precentage.pdf')
Dotplots of COVID-19 cell type marekrs
import scanpy as sc
adata = sc.read('../covid/dataset.h5ad')
adata1 = sc.read('../data/haniffa21.processed.h5ad')
for i in list(adata.obs['stage'].unique()):
adata_temp = adata[adata.obs['stage']==i]
stage2_raw = adata1[adata_temp.obs.index.tolist()]
stage2_raw.obs = adata_temp.obs
temp = ['MS4A1',
'CD79AA',
'IGHM','CD3D',
'IL7R',
'LTB',
'CD8A',
'CD8B',
'CD14',
'S100A8',
'FCER1G',
'FCGR3A',
'CST3',
'MS4A7',
'HLA-DQA1',
'CD1C',
'CD34',
'SPINK2',
'MKI67',
'GNLY',
'GZMB',
'IGHG1',
'IGKC',
'CD38',
'PF4',
'РРВР',
'HBA2',
'HBB',
'LILRA4',
'IL3RA',
'IRF7',
]
sc.set_figure_params(dpi_save=300)
to_draw = []
for each in temp:
if each in stage2_raw.var.index.tolist():
to_draw.append(each)
stage2_raw.layers['scaled'] = sc.pp.scale(stage2_raw, copy=True).X
sc.pl.dotplot(stage2_raw, groupby='ident',layer='scaled', var_names=to_draw,vmax=2,vmin=0,swap_axes=False,save='harmony_stage%s_cellType_markers.pdf'%str(i))
Dendrograms of the COVID-19 dataset
import scanpy as sc
import pickle
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
adata = sc.read_h5ad('../covid/dataset.h5ad')
adata.uns = pickle.load(open('../covid/attribute.pkl','rb'))
def plot_hc_dendrogram(adata,stage):
sch.dendrogram(adata.uns['hcmarkers'][str(stage)]['Z'],no_plot=True)
#replace the labels of the leaves with the labels of the original data
leaves = sch.leaves_list(adata.uns['hcmarkers'][str(stage)]['Z'])
adata.obs['leiden_celltype'] = adata.obs['leiden'].astype(str) +'_'+ adata.obs['ident'].astype(str)
stage0 = adata[adata.obs['stage'] == str(stage)]
sc.pl.umap(stage0,color='ident')
new_leaves = []
stage0.obs['leiden'] == stage0.obs['leiden'].astype(str)
for each in leaves:
temp = stage0[stage0.obs['leiden'] == str(each)]
new_leaves.append(temp.obs['leiden_celltype'].unique()[0])
plt.figure(figsize=(10,10),dpi=300)
ax = plt.gca()
sch.dendrogram(adata.uns['hcmarkers'][str(stage)]['Z'],ax=ax)
ax.set_xticklabels(new_leaves)
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig('dendrogram_stage_%d.pdf'%(stage))
plt.show()
for each in list(adata.obs['stage'].unique()):
plot_hc_dendrogram(adata,int(each))
Dot plots of the gene-weight markers in the IPF dataset and enrichment analysis of top weighted gene markers
import scanpy as sc
import pickle
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
adata = sc.read_h5ad('dataset.h5ad')
adata.uns = pickle.load(open('attribute.pkl','rb'))
protein_coding_genes = []
for each in adata.var.index:
if each[:2] != 'AC' and each[:2] != 'AL' and each[:2] != 'AP' and each[:4] != 'LINC' and '.' not in each:
protein_coding_genes.append(each)
adata = adata[:,protein_coding_genes]
adata.obs['cluster_type'] = adata.obs['leiden'].astype(str) +'_'+ adata.obs['ident'].astype(str)
for each in list(adata.obs['stage'].unique()):
if int(each) == 0:
continue
stageadata = adata[adata.obs['stage'] == each]
del stageadata.uns
sc.tl.rank_genes_groups(stageadata, 'cluster_type', method='wilcoxon',layer='geneWeight',n_genes=100)
sc.pl.rank_genes_groups_dotplot(stageadata,n_genes=5,groupby='cluster_type',layer='geneWeight',cmap='RdBu_r',standard_scale='var',save='_IPF_geneWeight_markers_stage%s.pdf'%(str(each)))
# print(stageadata.uns['rank_genes_groups']['names']['10_FibroblastFibrotic'])
all_go_terms = ['Matrisome',
'Extracellular matrix organization',
'ECM glycoproteins',
'Collagen chain trimerization',
'Collagens',
'Assembly of collagen fibrils \nand other multimeric structures',
'Collagen biosynthesis and \nmodifying enzymes',
'Alphavbeta3 integrin signaling pathway',
'FGFR2b ligand binding and activation',
'Collagen formation',
'Molecules associated with elastic fibres']
fdr_bhs = [1.640E-7, 9.787E-4, 1.719E-3,6.613E-3, 8.262E-3, 1.036E-2, 2.757E-2, 3.825E-2, 4.045E-2, 4.045E-2,4.820E-2]
fig ,ax = plt.subplots(dpi = 100,figsize=(15,3))
ax.bar(all_go_terms,-np.log10(fdr_bhs),color='#7BBADF')#3C5488, 4DBBD5, 00A087, 91D1C2, 8491B4
# ax.set_xticks(all_go_terms,rotation=90)
ax.set_xticklabels(labels=all_go_terms,rotation=90)
ax.set_ylabel('-log10(FDR)')
ax.set_xlabel('Pathways')
#set the font size of yticks
# ax.tick_params(axis='y', which='major', labelsize=7)
#remove grid
ax.grid(False)
plt.tight_layout()
plt.savefig('10_fibrotic_enrichment_geneweightsMarkers.pdf',dpi=300)
plt.show()
Visualizing the benchmarking results using UMAPs
import scanpy as sc
import numpy as np
import pandas as pd
import pickle
sc.set_figure_params(dpi_save=300)
def plot_with_colormap(values,color_dict):
color_list = [[0.36862745, 0.30980392, 0.63529412, 1. ],'tab:pink','tab:olive','tab:cyan','gold', 'springgreen','coral','skyblue','tab:blue','tab:orange','tab:green','tab:red','tab:purple','tab:brown','yellow','aqua', 'turquoise','orangered', 'lightblue','darkorchid', 'fuchsia','royalblue','slategray', 'silver', 'teal', 'fuchsia','grey','indigo','khaki','magenta','tab:gray']
# random.shuffle(color_list)
values = list(set(values))
values = sorted(values)
print(values)
for i, value in enumerate(values):
if value not in list(color_dict.keys()):
color_dict[value] = color_list[(len(list(color_dict.keys()))+1)]
# color_dict[value] = generate_hex_color()#cmap(i / len(set(values)))
# color_dict = {value: cmap(i) for i, value in enumerate(set(values))}
# colors = [color_dict[value] for value in values]
return color_dict
adata = sc.read_h5ad('dataset.h5ad')
del adata.obsm
color_dict_unagi = {}
for each in list(adata.obs['stage'].unique()):
stageadata = adata[adata.obs['stage'] == each]
sorted_list = sorted(list(stageadata.obs['name.simple'].unique()))
color_dict_unagi = plot_with_colormap(sorted_list,color_dict_unagi)
sc.tl.pca(stageadata)#,n_comps=64)
sc.pp.neighbors(stageadata,use_rep='X_pca')
sc.tl.leiden(stageadata,resolution=0.5+int(each)*0.07)
sc.tl.umap(stageadata,min_dist=0)
sc.pl.umap(stageadata,color='name.simple',palette=color_dict_unagi,save='_scanpy_stage%s_taylor.pdf'%(str(each)))
sc.pl.umap(stageadata, color = 'leiden',save = '_scanpy_stage%s_leiden.pdf'%(str(each)))
sc.tl.pca(adata,n_comps=64)
sc.pp.neighbors(adata)
sc.tl.umap(adata,min_dist=0)
sc.pl.umap(adata,color='name.simple',palette=color_dict_unagi,save='_scanpy_taylor.pdf')