{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Statistics of the snRNA-seq IPF dataset " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#only Mesenchymal cell lineage\n", "import scanpy as sc\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import scipy\n", "import matplotlib.gridspec as gridspec\n", "def func(pct, allvals):\n", " absolute = int(pct/100.*np.sum(allvals))\n", " return \"{:.1f}%\\n{:d}\".format(pct, absolute)\n", "#figure 2\n", "mes = 'dataset.h5ad'\n", "species = (\n", " \"Mesenchymal\"\n", ")\n", "cells = [[] for _ in range(4)]\n", "dataset = [mes]\n", "sf = {}\n", "n_patients = []\n", "for i, each in enumerate(dataset):\n", " adata = sc.read_h5ad(each)\n", " for each in adata.obs['Surface.Density'].unique().tolist():\n", " if i == 0:\n", " sf[each] = []\n", " ids = adata.obs[adata.obs['Surface.Density'] == each].index.tolist()\n", " sf[each].append(len(ids))\n", " for stage in adata.obs['stage'].unique():\n", " stageadata = adata[adata.obs['stage'] == stage]\n", "\n", " cells[int(stage)].append(stageadata.n_obs)\n", " n_patients.append(len(stageadata.obs['Sample.ID'].unique()))\n", "weight_counts = {\n", " \"Control\": np.array(cells[0]),\n", " \"Stage 1\": np.array(cells[1]),\n", " \"Stage 2\": np.array(cells[2]),\n", " \"Stage 3\": np.array(cells[3]),\n", "}\n", "width = 0.5\n", "fig = plt.figure(figsize=(10, 9),dpi=300)\n", "gs = gridspec.GridSpec(3, 10)\n", "ax1 = fig.add_subplot(gs[0, :3])\n", "ax2 = fig.add_subplot(gs[0, 3:])\n", "ax3 = fig.add_subplot(gs[1, :])\n", "ax4 = fig.add_subplot(gs[2, :4])\n", "ax5 = fig.add_subplot(gs[2, 5:])\n", "\n", "####panel a\n", "ipf_sample = 0\n", "control_sample = 0\n", "sample_cells = {}\n", "ipf_cells = 0\n", "control_cells = 0\n", "for each in ['Control', 'IPF']:#['Sample.ID']\n", " condition_data = adata[adata.obs['Disease.Ident'] == each]\n", " if each == 'IPF':\n", " ipf_sample = len(list(condition_data.obs['Sample.ID'].unique()))\n", " ipf_cells = len(condition_data)\n", " else:\n", " control_sample = len(list(condition_data.obs['Sample.ID'].unique()))\n", " control_cells = len(condition_data)\n", "for each in list(adata.obs['Sample.ID'].unique()):\n", " \n", " sample_cells[each] = len(adata[adata.obs['Sample.ID'] == each])\n", "\n", "ax1.pie([ipf_sample,control_sample],labels=['IPF','Control'],autopct=lambda pct: func(pct, [ipf_sample,control_sample]),center=(-3,0))\n", "ax1.set_title('Sample Proportion')\n", "###panel b\n", "# fig, axs = plt.subplots(1,2,dpi=300,figsize= (8,4))\n", "ssf = {key: sf[key] for key in sorted(sf,reverse=True)}\n", "temp = {\"Mesenchymal\":[]}\n", "temp2 = {}\n", "for each in list(ssf.keys()):\n", " temp2[each] = np.sum(ssf[each])\n", " temp['Mesenchymal'].append(ssf[each][0])\n", "x = np.arange(len(ssf)) # the label locations\n", "width = 0.5 # the width of the bars\n", "multiplier = 0\n", "for attribute, measurement in temp.items():\n", " \n", " rects = ax2.bar(tuple(x), measurement, width, label=attribute)\n", "surface_density = list(ssf.keys())\n", " #use .4f to round to 4 decimal places\n", "surface_density = [f\"{float(i):.4f}\" for i in surface_density]\n", "ax2.plot(0,0, \"40% if no one >40%, annotate with the highest one\n", " args:\n", " adata: anndata of one cluster\n", " \n", " return: the most common cell types in the cluster\n", " '''\n", " dic = {}\n", " total = 0\n", " \n", " for each in adata.obs['name.simple']:\n", " if each not in dic.keys():\n", " dic[each]=1\n", " else:\n", " dic[each]+=1\n", " total+=1\n", " #print(dic)\n", " \n", " anootate = ''\n", " \n", " flag = False #flag to see if there are more than 1 cell types > 40%\n", " for each in list(dic.keys()):\n", " \n", " if dic[each] > total*0.5:\n", " if flag == False:\n", " anootate+=each\n", " flag = True\n", " else:\n", " anootate+='/'+each\n", " if flag == False:\n", " for each in list(dic.keys()):\n", " \n", " if dic[each] > total*0.4:\n", " if flag == False:\n", " anootate+=each\n", " flag = True\n", " else:\n", " anootate+='/'+each\n", " if flag == False:\n", " for each in list(dic.keys()):\n", " \n", " if dic[each] > total*0.3:\n", " if flag == False:\n", " anootate+=each\n", " flag = True\n", " else:\n", " anootate+='/'+each\n", " if flag == False:\n", " for each in list(dic.keys()):\n", " \n", " if dic[each] > total*0.2:\n", " if flag == False:\n", " anootate+=each\n", " flag = True\n", " else:\n", " anootate+='/'+each\n", " if flag == False:\n", " anootate = 'NA'\n", " return anootate\n", "adata.obs['stage'] = adata.obs['stage'].astype(str)\n", "adata = adata[adata.obs['stage'] == '0']\n", "temp_count = 0\n", "temp_clustertype = {}\n", "choice = []\n", "scope = []\n", "\n", "scopes_cellids = []\n", "adata.obs['leiden'] = adata.obs['leiden'].astype(str)\n", "for xd, each in enumerate(adata.uns['clusterType']['0']):\n", " if each =='FibroblastAdventitial':\n", " choice.append(str(xd))\n", " else:\n", " temp_clustertype[str(xd)] = each\n", " scope.append(str(xd))\n", "choice_adata = adata[adata.obs['leiden'].isin(choice)]\n", "scope_adata = adata[adata.obs['leiden'].isin(scope)]\n", "\n", "choice_adata.obs['test'] = 'FibroblastAdventitial'\n", "scope_adata.obs['test'] = 'FibroblastAlveolar'\n", "apat = choice_adata.concatenate(scope_adata)\n", "apat.layers['scaled'] = sc.pp.scale(apat, copy=True).X\n", "\n", "\n", "marker_genes = []\n", "\n", "pos = []\n", "for i in range(len(temp_clustertype.keys())):\n", " temp = [i*10,i*10+9]\n", "\n", " pos.append(tuple(temp))\n", "\n", "\n", "sc.tl.rank_genes_groups(apat,'test', method='wilcoxon', n_genes=100)\n", "\n", "marker_genes = apat.uns['rank_genes_groups']['names']['FibroblastAdventitial']\n", "sc.pl.rank_genes_groups_heatmap(apat,groupby=['ident','leiden'],layer='scaled', min_logfoldchange=1,n_genes=50,swap_axes=True, use_raw=False,show_gene_labels=True,vmin=-0.5, vmax=0.5, cmap='RdBu_r',var_group_positions=pos,dendrogram=False, var_group_labels=[str(i) for i in range(len(apat.obs['test'].keys()))],var_group_rotation=0,figsize=(20, 16),save='_sf3.pdf')\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Heatmap of level 4 hierarchical static markers in the Fibroblast Adventitial cluster at the control stage. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pickle\n", "import pandas as pd\n", "import os\n", "import matplotlib.pyplot as plt\n", "import json\n", "import numpy as np\n", "import scanpy as sc\n", "adata = sc.read_h5ad('dataset.h5ad')\n", "uns = pickle.load(open('attribute.pkl', 'rb'))\n", "adata.uns = uns\n", "protein_coding_genes = []\n", "for each in adata.var.index:\n", " if each[:2] != 'AC' and each[:2] != 'AL' and each[:2] != 'AP' and each[:4] != 'LINC' and '.' not in each:\n", " protein_coding_genes.append(each)\n", "adata = adata[:,protein_coding_genes]\n", "sc.set_figure_params(scanpy=True, dpi=80,dpi_save=300)\n", "def clustertype40(adata):\n", " '''\n", " annotate the cluster with cells >40% if no one >40%, annotate with the highest one\n", " args:\n", " adata: anndata of one cluster\n", " \n", " return: the most common cell types in the cluster\n", " '''\n", " dic = {}\n", " total = 0\n", " \n", " for each in adata.obs['name.simple']:\n", " if each not in dic.keys():\n", " dic[each]=1\n", " else:\n", " dic[each]+=1\n", " total+=1\n", " #print(dic)\n", " \n", " anootate = ''\n", " \n", " flag = False #flag to see if there are more than 1 cell types > 40%\n", " for each in list(dic.keys()):\n", " \n", " if dic[each] > total*0.5:\n", " if flag == False:\n", " anootate+=each\n", " flag = True\n", " else:\n", " anootate+='/'+each\n", " if flag == False:\n", " for each in list(dic.keys()):\n", " \n", " if dic[each] > total*0.4:\n", " if flag == False:\n", " anootate+=each\n", " flag = True\n", " else:\n", " anootate+='/'+each\n", " if flag == False:\n", " for each in list(dic.keys()):\n", " \n", " if dic[each] > total*0.3:\n", " if flag == False:\n", " anootate+=each\n", " flag = True\n", " else:\n", " anootate+='/'+each\n", " if flag == False:\n", " for each in list(dic.keys()):\n", " \n", " if dic[each] > total*0.2:\n", " if flag == False:\n", " anootate+=each\n", " flag = True\n", " else:\n", " anootate+='/'+each\n", " if flag == False:\n", " anootate = 'NA'\n", " return anootate\n", "adata.obs['stage'] = adata.obs['stage'].astype(str)\n", "adata = adata[adata.obs['stage'] == '0']\n", "temp_count = 0\n", "temp_clustertype = {}\n", "choice = []\n", "scope = []\n", "# choice = 17\n", "# scope = [17,4,7]\n", "scopes_cellids = []\n", "adata.obs['leiden'] = adata.obs['leiden'].astype(str)\n", "for xd, each in enumerate(adata.uns['clusterType']['0']):\n", " if each =='FibroblastAdventitial':\n", " choice.append(str(xd))\n", " elif each =='FibroblastAlveolar':\n", " temp_clustertype[str(xd)] = each\n", " scope.append(str(xd))\n", "choice_adata = adata[adata.obs['leiden'].isin(choice)]\n", "scope_adata = adata[adata.obs['leiden'].isin(scope)]\n", "\n", "choice_adata.obs['test'] = 'FibroblastAdventitial'\n", "scope_adata.obs['test'] = 'FibroblastAlveolar'\n", "apat = choice_adata.concatenate(scope_adata)\n", "apat.layers['scaled'] = sc.pp.scale(apat, copy=True).X\n", "\n", "\n", "marker_genes = []\n", "\n", "pos = []\n", "for i in range(len(temp_clustertype.keys())):\n", " temp = [i*10,i*10+9]\n", "\n", " pos.append(tuple(temp))\n", "\n", "\n", "sc.tl.rank_genes_groups(apat,'test', method='wilcoxon', n_genes=100)\n", "\n", "marker_genes = apat.uns['rank_genes_groups']['names']['FibroblastAdventitial']\n", "sc.pl.heatmap(apat, marker_genes[:25], groupby=['test','leiden'],layer='scaled',swap_axes=True,vmin=-0.5, vmax=0.5, cmap='RdBu_r',figsize=(20, 16))#,save='inter_heatmap.pdf')#,var_group_labels=[str(i) for i in range(len(apat.obs['test'].keys()))])\n", "\n", "sc.pl.rank_genes_groups_heatmap(apat,groupby=['test','leiden'],layer='scaled', min_logfoldchange=1,n_genes=50,swap_axes=True, use_raw=False,show_gene_labels=True,vmin=-0.5, vmax=0.5, cmap='RdBu_r',var_group_positions=pos,dendrogram=False, var_group_labels=[str(i) for i in range(len(apat.obs['test'].keys()))],var_group_rotation=0,figsize=(20, 16),save='_sf4.pdf')\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Visualizing the overlapping between dynamic proteins and markers using the Venn plot" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import scanpy as sc\n", "import matplotlib.pyplot as plt\n", "import pickle as pkl\n", "protein = pd.read_csv('protein_validation.csv')\n", "protein = protein.set_index('gene')\n", "all_protein = protein.index.tolist()\n", "protein = protein.loc[(protein['padj'] <= 0.01)] #remove unsignificant proteins\n", "protein_genes = protein.index.unique().tolist()\n", "import pickle\n", "adata = sc.read_h5ad('dataset.h5ad')\n", "adata.uns = pickle.load(open('attribute.pkl', 'rb'))\n", "adata_genes = adata.var_names.tolist()\n", "from UNAGI.getProgressionTopMarkers import runGetProgressionMarker_one_dist\n", "import numpy as np\n", "background = np.load('progressionmarker_background.npy',allow_pickle=True).tolist()\n", "one_dist = []\n", "for each in background.keys():\n", " one_dist.append(np.array(background[each]))\n", "one_dist = np.array(one_dist).reshape(-1, 2484)\n", "dpm = runGetProgressionMarker_one_dist('idremVizCluster',size=2484,background = background,cutoff=0.01)\n", "intersection = np.intersect1d(protein_genes, adata_genes)\n", "print('protein genes in adata: ', len(intersection))\n", "intersection_protein = protein.loc[intersection]\n", "all_intersection = np.intersect1d(all_protein, adata_genes)\n", "from scipy.stats import hypergeom, binom\n", "\n", "#M: 151 significant protein overlapped with IPF data adjusted_pval <0.05\n", "#N: 2482; M 151; n track markers (pval<0.05); m track intersection\n", "\n", "def hypergeometric_test(N, M, n, m):\n", " rv = hypergeom(N, M, n)\n", " probability = rv.pmf(m)\n", " return probability\n", "all_markers = []\n", "significances = []\n", "temp = {}\n", "temp['Dynamic Markers'] = []\n", "temp['Dynamic Protein Coding Genes'] = []\n", "pvals = []\n", "for track in dpm.keys():\n", " track_markers = []\n", " \n", "\n", " for direction in dpm[track].keys():\n", " df = pd.DataFrame.from_dict(dpm[track][direction])\n", " track_markers+=list(df['gene'].values)\n", " all_markers+=list(df['gene'].values)\n", " # print(attribute['progressionMarkers'][track][direction]['qval'])\n", "\n", " track_markers = np.intersect1d(track_markers, adata_genes)\n", " temp['Dynamic Markers'].append(len(track_markers))\n", " track_intersection = np.intersect1d(intersection, track_markers)\n", " temp['Dynamic Protein Coding Genes'].append(len(track_intersection))\n", " print('track: ', track, 'intersection: ', len(track_intersection), 'track markers: ', len(track_markers))\n", " significance = hypergeometric_test(len(adata_genes), len(intersection), len(track_markers), len(track_intersection))\n", " #bionomial test\n", " # print(len(track_intersection), len(track_markers), len(intersection),len(adata_genes))\n", " # significance = 1-binom.cdf(len(track_intersection)-1, len(track_markers), len(intersection)/len(all_intersection))\n", " significances.append(significance)\n", " print('significance: ', significance)\n", " pvals.append(significance)\n", " \n", "all_markers = np.intersect1d(list(set(all_markers)),adata_genes)\n", "all_track_intersection = np.intersect1d(intersection, all_markers)\n", "all_track_intersection = list(set(all_track_intersection))\n", "print('all track intersection: ', len(all_track_intersection), 'all markers: ', len(all_markers))\n", "significance = hypergeometric_test(len(adata_genes), len(intersection), len(all_markers), len(all_track_intersection))\n", "# significance = binom.cdf(len(all_track_intersection)-1, len(all_markers), len(intersection)/len(all_intersection))\n", "print('significance: ', significance)\n", "significances.append(significance)\n", "temp['Dynamic Markers'].append(len(all_markers))\n", "temp['Dynamic Protein Coding Genes'].append(len(all_track_intersection))\n", "import matplotlib.pyplot as plt\n", "x = np.arange(len(significances)) # the label locations\n", "width = 0.5 # the width of the bars\n", "multiplier = 0\n", "fig, ax = plt.subplots(figsize=(10, 5),dpi=200)\n", "flag = True\n", "od = np.argsort(significances)\n", "track_names = list(dpm.keys())\n", "track_names.append('All Tracks')\n", "track_names = ['FibAlv-4','All Tracks','VEven-2','LymEnd-19','VEcap-1','FibAdv-17','VEaero-3','SMC-18','VEaero-0','PerAlv-12','VEcap-6']\n", "i=0\n", "labels = []\n", "from statsmodels.stats.multitest import fdrcorrection\n", "\n", "for each in np.array(significances)[od]:\n", " if each <0.05 and each >= 0.01:\n", " labels.append('*')\n", " elif each >=0.001 and each < 0.01:\n", " labels.append('**')\n", " elif each < 0.001:\n", " labels.append('***')\n", " else:\n", " labels.append('ns')\n", "\n", "for att, measurement in temp.items():\n", " measurement = np.array(measurement).flatten()[od]\n", " rects = ax.bar(x, measurement, width, label=att)\n", " i+=1\n", " \n", " if flag:\n", " ax.bar_label(rects,labels,padding=0.5)\n", " flag=False\n", "plt.legend()\n", "plt.ylabel('Number of Genes')\n", "plt.xticks(x, np.array(track_names),rotation = 90)\n", "plt.tight_layout()\n", "plt.savefig('sf dynamic marker and protein overlapping.pdf',dpi=300)\n", "\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from matplotlib_venn import venn3, venn3_circles\n", "set1 = set(adata_genes)\n", "set2 = set(protein_genes)\n", "set3 = set(all_markers)\n", "plt.figure(figsize=(10,10),dpi=300)\n", "v = venn3([set1,set2,set3],('','',''))\n", "v.get_patch_by_id('100').set_color('blue')\n", "v.get_patch_by_id('010').set_color('green')\n", "v.get_patch_by_id('110').set_color('tab:blue')\n", "v.get_patch_by_id('110').set_alpha(1)\n", "v.get_patch_by_id('111').set_color('tab:orange')\n", "v.get_patch_by_id('111').set_alpha(1)\n", "plt.title('Dynamic Markers vs Protein')\n", "plt.savefig('sf_venn_protein_and_markers.pdf',dpi=300)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# UMAPs of the whole snRNA-seq IPF dataset based on the learned latent representation 'Z'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import scanpy as sc\n", "sc.set_figure_params(dpi_save=300)\n", "adata = sc.read_h5ad('dataset.h5ad')\n", "adata.obs['ident'] = adata.obs['ident'].astype(str)\n", "\n", "del adata.obsm['X_umap']\n", "del adata.obsm['umap']\n", "sc.pp.neighbors(adata,use_rep='z')\n", "sc.tl.umap(adata,min_dist=0.3)\n", "sc.pl.umap(adata,color='stage')#,save='_all_stages.pdf')\n", "sc.pl.umap(adata,color='ident')#,save='_all_cell_type.pdf')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# UMAPs of the whole snRNA-seq IPF dataset based on the PCA embeddings" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import scanpy as sc\n", "sc.set_figure_params(dpi_save=300)\n", "adata = sc.read_h5ad('dataset.h5ad')\n", "adata.obs['ident'] = adata.obs['ident'].astype(str)\n", "\n", "del adata.obsm\n", "sc.pp.neighbors(adata)\n", "sc.tl.umap(adata,min_dist=0.3)\n", "sc.pl.umap(adata,color='stage',save='_all_stages.pdf')\n", "sc.pl.umap(adata,color='ident',save='_all_cell_type.pdf')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Compare the in-silico and real perturbation outcomes on top differentially expressed genes\n", "NIF and NIN reconstruction of top 100 treatment markers and ECM organization " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import scanpy as sc\n", "import numpy as np\n", "import torch\n", "from scipy.sparse import issparse\n", "from torch.utils.data import DataLoader\n", "from sklearn.neighbors import kneighbors_graph\n", "from UNAGI.perturbation_general import *\n", "from UNAGI.ppi import getCompleteNodes,getCompleteNodesPCA,runPPIthreads,getPPINetworkDict\n", "import anndata\n", "from zig_my import *\n", "# sc.set_figure_params(dpi=180)\n", "np.random.seed(0)\n", "adata0 = sc.read_h5ad('../pcls/0.h5ad')\n", "adata1 = sc.read_h5ad('../pcls/1.h5ad')\n", "adata0 = adata0[adata0.obs['condition2'] == 'Control']\n", "adata1 = adata1[adata1.obs['condition2'] == 'Fibrotic_Cocktail']\n", "adata_nif = sc.read_h5ad('../3.h5ad')\n", "adata_nin = sc.read_h5ad('../2.h5ad')\n", "\n", "def getImpactFactors(adata,gene,PPINetworkDict): ##new merged \n", " ids = adata\n", "\n", " PCA = getCompleteNodesPCA(ids,0,ids)\n", " genenames = adata.var.index.tolist()\n", " #genenames = genenames.tolist()\n", " genetable = {}\n", " for i, each in enumerate(genenames):\n", " genetable[each] = i\n", "\n", " #k=np.load('./PPINetworkDict.npy',allow_pickle=True)\n", " PPIDict=PPINetworkDict\n", " table = runPPIthreads(ids,PCA,genetable,PPIDict,gene)\n", " return table\n", "def getZandZc(adata,impactfactor=None):\n", " clusterAdata = adata\n", " input = clusterAdata.X\n", " if issparse(input):\n", " input = input.toarray()\n", " data = input\n", " loadModelDict = './pcls/model_save/zig_0.pth'\n", "\n", " vae = VAE(data.shape[1], 256 , 1024,64) \n", " vae.load_state_dict(torch.load(loadModelDict,map_location=torch.device('cpu')))\n", " vae.eval()\n", " recons = []\n", "\n", " print(data.shape)\n", " if impactfactor is not None:\n", " cell_loader = DataLoader(data.astype('float32'), batch_size=len(data), shuffle=False, num_workers=0)\n", " else:\n", " cell_loader = DataLoader(data.astype('float32'), batch_size=len(data), shuffle=False, num_workers=0)\n", " if 'gcn_connectivities' not in clusterAdata.obsp.keys():\n", " adj = kneighbors_graph(adata.obsm['X_pca'], 24, mode='connectivity', include_self=False,n_jobs=20)\n", " adj.setdiag(25)\n", " adj = torch.tensor(adj.toarray().astype('float32'))\n", " else:\n", " adj = torch.tensor(clusterAdata.obsp['gcn_connectivities'].toarray().astype('float32'))\n", " for x in cell_loader:\n", "\n", "\n", " if impactfactor is not None:\n", "\n", " impactfactor = impactfactor.astype('float32')\n", " x = x+impactfactor\n", " print(x.shape)\n", " \n", " recons = recons.cpu().detach().numpy().tolist()\n", " # recons += vae.get_latent_representation(x,adj).cpu().detach().numpy().tolist()#vae.generate(x,adj,1).cpu().detach().numpy().tolist()\n", " recons = np.array(recons)\n", "\n", " return recons\n", "fibrosis_mean = np.mean(np.array(adata1.X.toarray()),axis=0)\n", "nif_mean = np.mean(np.array(adata_nif.X.toarray()),axis=0)\n", "differences = nif_mean - fibrosis_mean\n", "impactfactors = differences.reshape(1,-1)\n", "\n", "adata1_perturbed = adata1.copy()\n", "adata1_perturbed_z = getZandZc(adata1_perturbed,impactfactors)\n", "adata1_perturbed = anndata.AnnData(X = adata1_perturbed_z,obs=adata1_perturbed.obs,var=adata1_perturbed.var)\n", "\n", "nin_mean = np.mean(np.array(adata_nin.X.toarray()),axis=0)\n", "differences = nin_mean - fibrosis_mean\n", "impactfactors = differences.reshape(1,-1)\n", "adata2_perturbed = adata1.copy()\n", "adata2_perturbed_z = getZandZc(adata2_perturbed,impactfactors)\n", "adata2_perturbed = anndata.AnnData(X = adata2_perturbed_z,obs=adata2_perturbed.obs,var = adata2_perturbed.var)\n", "adatanif_z = getZandZc(adata_nif)\n", "adatanif = anndata.AnnData(X = adatanif_z,obs=adata_nif.obs,var = adata_nif.var)\n", "adata1_z = getZandZc(adata1)\n", "adata1_z = anndata.AnnData(X = adata1_z,obs=adata1.obs,var = adata1.var)\n", "adata0_z = getZandZc(adata0)\n", "adata0_z = anndata.AnnData(X = adata0_z,obs=adata0.obs, var = adata0.var)\n", "\n", "adata2_z = getZandZc(adata_nin)\n", "adata2_z = anndata.AnnData(X = adata2_z,obs=adata_nin.obs, var = adata0.var)\n", "adata0_z.obs['test'] = 'healthy'\n", "adata1_z.obs['test'] = 'fibrosis'\n", "adata2_z.obs['test'] = 'intedanib'\n", "adatanif.obs['test'] = 'Nifedipine'\n", "adata1_perturbed.obs['condition2'] = 'Fibrotic_Cocktail_Nifedipine_Perturbed'\n", "adata2_perturbed.obs['condition2'] = 'Fibrotic_Cocktail_Nintedanib_Perturbed'\n", "asdf = [adata1_z,adata2_z,adatanif,adata2_perturbed,adata1_perturbed]\n", "final = anndata.concat(asdf)\n", "sc.pp.neighbors(final,use_rep='X')\n", "sc.tl.umap(final)\n", "sc.pl.umap(final,color='condition2',palette={\n", " \"Fibrotic_Cocktail_Nifedipine_Perturbed\": \"tab:cyan\",\n", " \"Fibrotic_Cocktail_Nintedanib_Perturbed\": \"tab:pink\",\n", " \"Fibrotic_Cocktail\": \"tab:green\",\n", " \"Fibrotic_Cocktail_Nintedanib\": \"tab:brown\",\n", " \"Fibrotic_Cocktail_Nifedipine\": \"tab:purple\"\n", " })\n", "nif_perturbed = adata1_perturbed\n", "nin_perturbed = adata2_perturbed\n", "nif_perturbed.obs['condition3'] = 'nifedipine_in_silico'\n", "adata_nif.obs['condition3'] = 'nifedipine'\n", "adata1.obs['condition3'] = 'fibrosis'\n", "nin_perturbed.obs['condition3'] = 'nintedanib_in_silico'\n", "adata_nin.obs['condition3'] = 'nintedanib'\n", "final = anndata.concat([adata1, adata_nif,nif_perturbed])\n", "# final = anndata.concat([adata1, adata_nin,nin_perturbed])\n", "sc.tl.rank_genes_groups(final, 'condition3', method='wilcoxon',rankby_abs=True, reference='fibrosis')\n", "def my_logfold_change(adata1, adata2,topN,log_fold_change_cutoff=None,abs = False):\n", " genenames = adata1.var_names.tolist()\n", " mean1 = np.mean(adata1.X.toarray(),axis=0)\n", " mean2 = np.mean(adata2.X.toarray(),axis=0)\n", " logfold_change = mean2 - mean1\n", " logfold_change_dict = {}\n", "\n", " for i, each in enumerate(genenames):\n", " logfold_change_dict[each] = logfold_change[i]\n", " if abs ==True:\n", " temp = {k: v for k, v in sorted(logfold_change_dict.items(), key=lambda item: np.abs(item[1]),reverse=True)}\n", " else:\n", " temp = {k: v for k, v in sorted(logfold_change_dict.items(), key=lambda item: item[1],reverse=True)}\n", "\n", " df = pd.DataFrame()\n", " df['logfoldchange'] = temp.values()\n", " df['genes'] = temp.keys()\n", " if log_fold_change_cutoff is not None:\n", " df = df[np.abs(df['logfoldchange'])>log_fold_change_cutoff]\n", " topDEG = list(df[:topN]['genes'].values)\n", " return topDEG,df\n", "from scipy.stats import f_oneway, ks_2samp, mannwhitneyu, wilcoxon, ttest_ind,ranksums,wasserstein_distance,chisquare,pearsonr,energy_distance\n", "from scipy.stats import linregress\n", "from sklearn.metrics import r2_score\n", "def calculate_r_squared(y_true, y_pred):\n", " ssr = np.sum((y_true - y_pred) ** 2)\n", " sst = np.sum((y_true - np.mean(y_true)) ** 2)\n", " r_squared = 1 - (ssr / sst)\n", " return r_squared\n", "def calculate_adjusted_r_squared(r_squared, n, k):\n", " adjusted_r_squared = 1 - (1 - r_squared) * ((n - 1) / (n - k - 1))\n", " return adjusted_r_squared\n", "sc.set_figure_params(dpi_save=300,figsize=[20,3])\n", "def cosine_similarity(v1, v2):\n", " # Compute the dot product of v1 and v2\n", " dot_product = np.dot(v1, v2)\n", " # Compute the L2 norm of v1\n", " norm_v1 = np.linalg.norm(v1)\n", " # Compute the L2 norm of v2\n", " norm_v2 = np.linalg.norm(v2)\n", " # Compute cosine similarity\n", " similarity = dot_product / (norm_v1 * norm_v2)\n", " return similarity\n", "def adjusted_r2(r2, n, k):\n", " \"\"\"\n", " Calculate the adjusted R-squared.\n", " \n", " Parameters:\n", " - r2: The regular R-squared value\n", " - n: The number of observations\n", " - k: The number of predictors (excluding the intercept)\n", " \n", " Returns:\n", " - The adjusted R-squared value\n", " \"\"\"\n", " return 1 - ((1 - r2) * (n - 1) / (n - k - 1))\n", "\n", "# Example usage:\n", "def draw_real_recons_heatmap(adata, adata1, adata2, genes,gorupby, enmax_palette, ymax, topN=20,use_zero=False,save_fig = False,cut = False,similarity=False):\n", " temp = []\n", " pval = []\n", " for idx, each in enumerate(genes):\n", " if cut:\n", " \n", " if np.median(adata1[:,each].X.toarray()) <1:\n", " temp.append(0)\n", " pval.append(0)\n", " continue\n", " else:\n", " print(np.median(adata1[:,each].X.toarray()))\n", " obs_nif = adata1[:,each].X.toarray().flatten()\n", " obs_nif_perturbed = adata2[:,each].X.toarray().flatten()\n", "\n", " if use_zero:\n", " pair1 = energy_distance(obs_nif_perturbed,obs_nif)\n", " else:\n", " pair1 = ks_2samp(obs_nif_perturbed[obs_nif_perturbed>0],obs_nif[obs_nif>0],alternative='two-sided')[1]\n", " pval.append(ks_2samp(obs_nif_perturbed[obs_nif_perturbed>0],obs_nif[obs_nif>0],alternative='two-sided')[1])\n", " temp.append(pair1)\n", " #z_score norm\n", " # adata.layers['scaled'] = sc.pp.scale(adata, copy=True).X\n", " print(np.array(pval)[np.argsort(temp)[::-1]][:topN])\n", " protencoding = [each for each in genes if 'AC' != each[:2] and 'AL' != each[:2] and 'AP' != each[:2] and 'LINC' != each[:4] ]\n", " # sc.tl.rank_genes_groups(adata, gorupby, method='wilcoxon',rankby_abs=True, reference='perturbed')\n", " if similarity == False:\n", " df = sc.get.obs_df(adata, list(np.array(protencoding)[:topN])+['condition3'])\n", " else:\n", " df = sc.get.obs_df(adata, list(np.array(genes)[np.argsort(temp)[::-1]][:topN])+['condition3'])\n", " custom_dict = {}\n", " for i ,each in enumerate(list(np.array(genes)[np.argsort(temp)[::-1]][:topN])):\n", " custom_dict[each] = i\n", " df = df.set_index('condition3').stack().reset_index()\n", " \n", " df.columns = ['condition3', 'gene', 'value']\n", " df.sort_values(by=['gene'], key=lambda x: x.map(custom_dict))\n", " print(df['gene'])\n", " # df = df[df['value']>0]\n", " #sort df based on gene order\n", " \n", " #calcualte the mean expression of all genes in each condition\n", " print(list(np.array(genes)[np.argsort(temp)[::-1]][:topN]))\n", " original_mean = np.mean(np.array(adata1[:,np.array(genes)[np.argsort(temp)[::-1]]].X.toarray()),axis=0)\n", " perturbed_mean = np.mean(np.array(adata2[:,np.array(genes)[np.argsort(temp)[::-1]]].X.toarray()),axis=0)\n", " #calculate pearson correlation between original and perturbed\n", " print('pearsonr correlation:',pearsonr(original_mean,perturbed_mean))\n", " print('cosine similarity:',cosine_similarity(original_mean,perturbed_mean))\n", "\n", "\n", "\n", " # Generate some sample data\n", " np.random.seed(0)\n", " x = original_mean\n", " y = perturbed_mean\n", " # Perform linear regression\n", " print('raw:', x)\n", " print('perturbed:',y)\n", " rsquare = calculate_r_squared(x,y)\n", " rsquare = adjusted_r2(rsquare,len(x),1)\n", " print(\"R^2 value:\",rsquare)\n", " slope, intercept, r_value, p_value, std_err = linregress(x, y,alternative='greater')\n", " adjusted_r2_val = adjusted_r2(r_value,len(x),1)\n", " print('adjusted R^2:',adjusted_r2_val)\n", " # Create scatter plot and plot regression line\n", " fig,ax = plt.subplots(figsize=(3,3),dpi=100)\n", " ax.scatter(x, y, label='Data', color='blue')\n", " ax.plot(x, slope * x + intercept, label=f'Fit (R^2 = {rsquare:.2f})', color='red')\n", " ax.legend(bbox_to_anchor=(1, 1))\n", " # ax.grid(False)\n", " plt.xlabel('original')\n", " plt.ylabel('perturbed')\n", " plt.title('$R^2$ value')\n", " \n", " plt.savefig('R^2'+save_fig,dpi=300)\n", " print('R-val:', r_value)\n", " \n", " \n", " print(f\"P-value: {p_value:.4f}\")\n", " if save_fig == False:\n", " \n", " fig,ax = plt.subplots(figsize=(15,3),dpi=100)\n", " sns.boxplot(data=df, x='gene', y='value', hue='condition3', palette=enmax_palette)\n", " # sns.violinplot(data=df, x='gene', y='value', hue='condition3', split=True, palette=enmax_palette,scale='area')\n", " ax.grid(False)\n", " sns.move_legend(ax, \"upper left\", bbox_to_anchor=(1, 1))\n", " plt.ylim(0,ymax)\n", " plt.show()\n", " \n", " else:\n", " fig,ax = plt.subplots(figsize=(15,3),dpi=100)\n", " sns.boxplot(data=df, x='gene', y='value', hue='condition3', palette=enmax_palette)\n", " # sns.violinplot(data=df, x='gene', y='value', hue='condition3', split=True, palette=enmax_palette,scale='area')\n", " ax.grid(False)\n", " \n", " sns.move_legend(ax, \"upper left\", bbox_to_anchor=(1, 1),fontsize=7)\n", "\n", " #rotate xticklabels 90 degree\n", " for item in ax.get_xticklabels():\n", " item.set_rotation(90)\n", " #make x ticks label smaller\n", " ax.tick_params(axis='x', which='major', labelsize=7)\n", " plt.ylim(0,ymax)\n", " #squize the figure\n", " plt.tight_layout()\n", " plt.savefig(save_fig,dpi=300)\n", " plt.show()\n", "def rankgene_logfold_change(adata1, adata2,gene_list,abs = False):\n", " genenames = adata1.var_names.tolist()\n", " mean1 = np.mean(adata1.X.toarray(),axis=0)\n", " mean2 = np.mean(adata2.X.toarray(),axis=0)\n", " logfold_change = mean2 - mean1\n", " logfold_change_dict = {}\n", "\n", " for i, each in enumerate(genenames):\n", " logfold_change_dict[each] = logfold_change[i]\n", " if abs ==True:\n", " temp = {k: v for k, v in sorted(logfold_change_dict.items(), key=lambda item: np.abs(item[1]),reverse=True)}\n", " else:\n", " temp = {k: v for k, v in sorted(logfold_change_dict.items(), key=lambda item: item[1],reverse=True)}\n", "\n", "\n", " df = pd.DataFrame()\n", " df['logfoldchange'] = temp.values()\n", " df['genes'] = temp.keys()\n", " ranked_gene_list = []\n", " for each in temp.keys():\n", " if each in gene_list:\n", " ranked_gene_list.append(each)\n", " return ranked_gene_list\n", "import seaborn as sns\n", "nif = [adata_nif,nif_perturbed]\n", "nif = anndata.concat(nif)\n", "\n", "test_gene,df = my_logfold_change(adata1, adata_nif,100)\n", "enmax_palette = sns.color_palette([\"#00A087\", \"#4DBBD5\"])\n", "draw_real_recons_heatmap(nif, adata_nif, nif_perturbed, test_gene,'condition3',enmax_palette, ymax=4,similarity=False, topN=100,use_zero=False,cut=True,save_fig='box_nif_topN100_100.pdf')\n", "ecm_genes = ['MUSK', 'TLL1', 'VCAN', 'CDH1', 'TNC', 'LTBP1', 'ELN', 'LAMC3', 'COL23A1', 'LAMA3', 'LAMC2', 'COL11A1', 'COL17A1', 'NTN4', 'FBLN1', 'ITGA8', 'COL5A3', 'COL4A4', 'CEACAM6', 'ICAM1', 'P3H2', 'LAMB1', 'ITGA6', 'TGFB2', 'MMP11', 'MMP9', 'TIMP1', 'COMP', 'ITGB8', 'SERPINE1', 'ICAM2', 'COL1A1', 'VWF', 'COL12A1', 'SPARC', 'COL7A1', 'ITGB6', 'FN1', 'TNR', 'SPP1', 'LTBP2', 'MMP19', 'COL10A1', 'SDC4', 'COL21A1', 'BMP2', 'KDR', 'COL5A1', 'MATN3', 'COL4A2', 'ADAMTS8', 'CAPN9', 'CTSV', 'MMP7', 'THBS1', 'ITGA11', 'LOXL4', 'ADAMTS14', 'FGF2', 'FBN2', 'LUM', 'ADAMTS18', 'ITGA9', 'COL8A1', 'ADAMTS16', 'HAPLN1', 'ADAM12', 'NCAM1', 'PLOD2', 'ADAMTS5', 'MMP16', 'ADAMTS3', 'ACAN', 'VCAM1', 'CAPN13', 'COL6A3', 'ADAMTS9', 'ITGA2', 'COL1A2', 'FBN1', 'MMP10', 'BMP1', 'COL3A1', 'COL4A3', 'COL22A1', 'COL24A1', 'FGG', 'FGA', 'NRXN1', 'BGN', 'COL4A1', 'COL14A1', 'COL25A1', 'SPOCK3', 'LAMA2', 'MMP1', 'COL27A1', 'LAMB3', 'COL4A6', 'MFAP5', 'DMD', 'CAPN8', 'COL5A2', 'COL15A1', 'COL6A6', 'ITGA1', 'COL28A1']\n", "ecm_genes = sorted(list(set(adata_nif.var_names.tolist() ) & set(ecm_genes)))\n", "# col = adata_nif.var_names[adata_nif.var_names.str.startswith('COL')]\n", "ecm_genes = rankgene_logfold_change(adata1, adata_nif,ecm_genes,abs=True)\n", "draw_real_recons_heatmap(nif, adata_nif, nif_perturbed, ecm_genes,'condition3',enmax_palette,ymax=4.5,topN=len(ecm_genes),similarity=False,use_zero=False,save_fig='nif_ecm_sup.pdf')\n", "nin = [adata_nin,nin_perturbed]\n", "enmax_palette = sns.color_palette([\"#EE4C97\",\"#fccde5\"])\n", "nin = anndata.concat(nin)\n", "\n", "test_gene,df = my_logfold_change(adata1, adata_nin,100)\n", "draw_real_recons_heatmap(nin, adata_nin, nin_perturbed, test_gene,'condition3',enmax_palette,ymax=4, topN=100,use_zero=False,similarity=False,cut=True,save_fig='box_nin_topN100_100.pdf')\n", "enmax_palette = sns.color_palette([\"#EE4C97\", \"#fccde5\"])\n", "nin = anndata.concat(nin)\n", "ecm_genes = rankgene_logfold_change(adata1, adata_nin,ecm_genes,abs=True)\n", "draw_real_recons_heatmap(nin, adata_nin, nin_perturbed, ecm_genes,'condition3',enmax_palette,ymax=4.5,topN=len(ecm_genes),similarity=False,use_zero=False,save_fig='nin_ecm_sup.pdf')#,save_fig='nin_neg_topN10_500.pdf')\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# COVID-19 cell type composition" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pickle\n", "import pandas as pd\n", "import os\n", "import matplotlib as mpl\n", "import json\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import scanpy as sc\n", "adata = sc.read_h5ad('../covid/dataset.h5ad')\n", "uns = pickle.load(open('../covid/attribute.pkl', 'rb'))\n", "\n", "all_types = adata.obs['ident'].unique().tolist()\n", "all_types = sorted(all_types)\n", "stage_types = {'Stage 3':[],'Stage 2':[],'Stage 1':[],'Control':[]}\n", "stage_keys = ['Stage 3', 'Stage 2','Stage 1','Control']\n", "for i in range(4):\n", " stage_adata = adata[adata.obs['stage'] == str(i)]\n", " stage_cells = len(stage_adata)\n", " for each in all_types:\n", " stage_types[stage_keys[3-i]].append(len(stage_adata[stage_adata.obs['ident'] == each])/stage_cells)\n", "multiplier = 0\n", "width = 0.3\n", "\n", "plt,ax = plt.subplots(figsize=(15, 3),dpi=300)\n", "#transform the type of list to array\n", "# y_pos = [0, 1]\n", "for i in stage_types.keys():\n", " stage_types[i] = np.array(stage_types[i])\n", "bottom = 0\n", "data = np.array(list(stage_types.values()))\n", "data_cum = data.cumsum(axis=1)\n", "category_colors = mpl.colormaps['Spectral'](\n", " np.linspace(0.1, 1, data.shape[1]))\n", "\n", "for j, (types,c) in enumerate(zip(all_types,category_colors)):\n", " widths = data[:,j]\n", " starts = data_cum[:, j] - widths\n", " # labels = list(stage_types.keys())\n", " # labels = [labels[v] if val > 0.05 else \"\" for v, val in enumerate(widths)] \n", " bc = ax.barh(list(stage_types.keys()), widths, left = starts, label=types,color=c,height=0.8)\n", " # labels = [f'%.2f%%'%(v*100) if v > 0.04 else \"\" for v in widths] \n", " labels = [\"\" if v > 0.04 else \"\" for v in widths] \n", " # ax.bar_label(c, labels=labels, label_type=\"center\",fmt='%.2f')\n", " # bottom += weight_count\n", " # multiplier += 1\n", " ax.bar_label(bc, labels=labels,label_type='center',fmt='%.2f',fontsize=9,color='black')\n", "ax.legend(ncol=len(all_types)//2+1, bbox_to_anchor=(0.1, 1.3),\n", " loc='upper left', fontsize='x-small')\n", "# threshold = 0.15\n", "for c in ax.containers:#comments out to disable percentage on the plot\n", " labels = [f'%.2f%%'%(v*100) if v > 0.05 else \"\" for v in c.datavalues] #comments out to disable percentage on the plot\n", "\n", " ax.bar_label(c, labels=labels, label_type=\"center\",fmt='%.2f')#comments out to disable percentage on the plot\n", "\n", "ax.spines['top'].set_visible(False)\n", "ax.spines['right'].set_visible(False)\n", "ax.spines['bottom'].set_visible(False)\n", "ax.spines['left'].set_visible(False)\n", "ax.get_xaxis().set_ticks([])\n", "ax.grid(False)\n", "plt.tight_layout()\n", "plt.savefig('harmony_cell_composition_precentage.pdf') " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Dotplots of COVID-19 cell type marekrs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import scanpy as sc\n", "adata = sc.read('../covid/dataset.h5ad')\n", "adata1 = sc.read('../data/haniffa21.processed.h5ad')\n", "for i in list(adata.obs['stage'].unique()):\n", " adata_temp = adata[adata.obs['stage']==i]\n", "\n", " stage2_raw = adata1[adata_temp.obs.index.tolist()]\n", " stage2_raw.obs = adata_temp.obs\n", "\n", " temp = ['MS4A1',\n", " 'CD79AA',\n", " 'IGHM','CD3D',\n", " 'IL7R',\n", " 'LTB',\n", " 'CD8A',\n", " 'CD8B',\n", " 'CD14',\n", " 'S100A8',\n", " 'FCER1G',\n", " 'FCGR3A',\n", " 'CST3',\n", " 'MS4A7',\n", " 'HLA-DQA1',\n", " 'CD1C',\n", " 'CD34',\n", " 'SPINK2',\n", " 'MKI67',\n", " 'GNLY',\n", " 'GZMB',\n", " 'IGHG1',\n", " 'IGKC',\n", " 'CD38',\n", " 'PF4',\n", " 'РРВР',\n", " 'HBA2',\n", " 'HBB',\n", " 'LILRA4',\n", " 'IL3RA',\n", " 'IRF7',\n", " ]\n", " sc.set_figure_params(dpi_save=300)\n", " to_draw = []\n", " for each in temp:\n", " if each in stage2_raw.var.index.tolist():\n", " to_draw.append(each)\n", " stage2_raw.layers['scaled'] = sc.pp.scale(stage2_raw, copy=True).X\n", " sc.pl.dotplot(stage2_raw, groupby='ident',layer='scaled', var_names=to_draw,vmax=2,vmin=0,swap_axes=False,save='harmony_stage%s_cellType_markers.pdf'%str(i))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Dendrograms of the COVID-19 dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import scanpy as sc\n", "import pickle\n", "import matplotlib.pyplot as plt\n", "import scipy.cluster.hierarchy as sch\n", "adata = sc.read_h5ad('../covid/dataset.h5ad')\n", "adata.uns = pickle.load(open('../covid/attribute.pkl','rb'))\n", "def plot_hc_dendrogram(adata,stage):\n", "\n", " sch.dendrogram(adata.uns['hcmarkers'][str(stage)]['Z'],no_plot=True)\n", " #replace the labels of the leaves with the labels of the original data\n", " leaves = sch.leaves_list(adata.uns['hcmarkers'][str(stage)]['Z'])\n", " adata.obs['leiden_celltype'] = adata.obs['leiden'].astype(str) +'_'+ adata.obs['ident'].astype(str)\n", " stage0 = adata[adata.obs['stage'] == str(stage)]\n", " sc.pl.umap(stage0,color='ident')\n", " new_leaves = []\n", " stage0.obs['leiden'] == stage0.obs['leiden'].astype(str)\n", " for each in leaves:\n", " temp = stage0[stage0.obs['leiden'] == str(each)]\n", " \n", " \n", " new_leaves.append(temp.obs['leiden_celltype'].unique()[0])\n", " plt.figure(figsize=(10,10),dpi=300)\n", " ax = plt.gca()\n", " sch.dendrogram(adata.uns['hcmarkers'][str(stage)]['Z'],ax=ax)\n", " ax.set_xticklabels(new_leaves)\n", " plt.xticks(rotation=90)\n", " plt.tight_layout()\n", " plt.savefig('dendrogram_stage_%d.pdf'%(stage))\n", " plt.show()\n", "for each in list(adata.obs['stage'].unique()):\n", " plot_hc_dendrogram(adata,int(each))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Dot plots of the gene-weight markers in the IPF dataset and enrichment analysis of top weighted gene markers" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import scanpy as sc\n", "import pickle\n", "import matplotlib.pyplot as plt\n", "import scipy.cluster.hierarchy as sch\n", "adata = sc.read_h5ad('dataset.h5ad')\n", "adata.uns = pickle.load(open('attribute.pkl','rb'))\n", "protein_coding_genes = []\n", "for each in adata.var.index:\n", " if each[:2] != 'AC' and each[:2] != 'AL' and each[:2] != 'AP' and each[:4] != 'LINC' and '.' not in each:\n", " protein_coding_genes.append(each)\n", "adata = adata[:,protein_coding_genes]\n", "adata.obs['cluster_type'] = adata.obs['leiden'].astype(str) +'_'+ adata.obs['ident'].astype(str)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for each in list(adata.obs['stage'].unique()):\n", " if int(each) == 0:\n", " continue\n", " stageadata = adata[adata.obs['stage'] == each]\n", " del stageadata.uns\n", " sc.tl.rank_genes_groups(stageadata, 'cluster_type', method='wilcoxon',layer='geneWeight',n_genes=100)\n", " sc.pl.rank_genes_groups_dotplot(stageadata,n_genes=5,groupby='cluster_type',layer='geneWeight',cmap='RdBu_r',standard_scale='var',save='_IPF_geneWeight_markers_stage%s.pdf'%(str(each)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# print(stageadata.uns['rank_genes_groups']['names']['10_FibroblastFibrotic'])\n", "all_go_terms = ['Matrisome',\n", " 'Extracellular matrix organization',\n", " 'ECM glycoproteins',\n", " 'Collagen chain trimerization',\n", " 'Collagens',\n", " 'Assembly of collagen fibrils \\nand other multimeric structures',\n", " 'Collagen biosynthesis and \\nmodifying enzymes',\n", " 'Alphavbeta3 integrin signaling pathway',\n", " 'FGFR2b ligand binding and activation',\n", " 'Collagen formation',\n", " 'Molecules associated with elastic fibres']\n", "fdr_bhs = [1.640E-7, 9.787E-4, 1.719E-3,6.613E-3, 8.262E-3, 1.036E-2, 2.757E-2, 3.825E-2, 4.045E-2, 4.045E-2,4.820E-2] \n", "fig ,ax = plt.subplots(dpi = 100,figsize=(15,3))\n", "ax.bar(all_go_terms,-np.log10(fdr_bhs),color='#7BBADF')#3C5488, 4DBBD5, 00A087, 91D1C2, 8491B4\n", "# ax.set_xticks(all_go_terms,rotation=90)\n", "ax.set_xticklabels(labels=all_go_terms,rotation=90)\n", "ax.set_ylabel('-log10(FDR)')\n", "ax.set_xlabel('Pathways')\n", "#set the font size of yticks\n", "# ax.tick_params(axis='y', which='major', labelsize=7)\n", "#remove grid\n", "ax.grid(False)\n", "\n", "plt.tight_layout()\n", "plt.savefig('10_fibrotic_enrichment_geneweightsMarkers.pdf',dpi=300)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Visualizing the benchmarking results using UMAPs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import scanpy as sc\n", "import numpy as np\n", "import pandas as pd\n", "import pickle\n", "sc.set_figure_params(dpi_save=300)\n", "def plot_with_colormap(values,color_dict):\n", " color_list = [[0.36862745, 0.30980392, 0.63529412, 1. ],'tab:pink','tab:olive','tab:cyan','gold', 'springgreen','coral','skyblue','tab:blue','tab:orange','tab:green','tab:red','tab:purple','tab:brown','yellow','aqua', 'turquoise','orangered', 'lightblue','darkorchid', 'fuchsia','royalblue','slategray', 'silver', 'teal', 'fuchsia','grey','indigo','khaki','magenta','tab:gray']\n", " # random.shuffle(color_list)\n", " values = list(set(values))\n", " values = sorted(values)\n", " print(values)\n", " for i, value in enumerate(values):\n", " if value not in list(color_dict.keys()):\n", " color_dict[value] = color_list[(len(list(color_dict.keys()))+1)]\n", " # color_dict[value] = generate_hex_color()#cmap(i / len(set(values)))\n", " # color_dict = {value: cmap(i) for i, value in enumerate(set(values))}\n", " # colors = [color_dict[value] for value in values]\n", " return color_dict\n", "adata = sc.read_h5ad('dataset.h5ad')\n", "del adata.obsm\n", "color_dict_unagi = {}\n", "for each in list(adata.obs['stage'].unique()):\n", " stageadata = adata[adata.obs['stage'] == each]\n", " sorted_list = sorted(list(stageadata.obs['name.simple'].unique()))\n", " color_dict_unagi = plot_with_colormap(sorted_list,color_dict_unagi)\n", " sc.tl.pca(stageadata)#,n_comps=64)\n", " sc.pp.neighbors(stageadata,use_rep='X_pca')\n", " sc.tl.leiden(stageadata,resolution=0.5+int(each)*0.07)\n", " sc.tl.umap(stageadata,min_dist=0)\n", " sc.pl.umap(stageadata,color='name.simple',palette=color_dict_unagi,save='_scanpy_stage%s_taylor.pdf'%(str(each)))\n", " sc.pl.umap(stageadata, color = 'leiden',save = '_scanpy_stage%s_leiden.pdf'%(str(each)))\n", "sc.tl.pca(adata,n_comps=64)\n", "sc.pp.neighbors(adata)\n", "sc.tl.umap(adata,min_dist=0)\n", "sc.pl.umap(adata,color='name.simple',palette=color_dict_unagi,save='_scanpy_taylor.pdf')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "wks", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.18" } }, "nbformat": 4, "nbformat_minor": 2 }