import gc
import os
import json
import pickle
import scanpy as sc
import subprocess
import numpy as np
from .utils.analysis_helper import find_overlap_and_assign_direction,calculateDataPathwayOverlapGene,calculateTopPathwayGeneRanking,process_customized_drug_database
from .marker_discovery.hierachical_static_markers import get_dataset_hcmarkers
from .perturbations.speedup_perturbation import perturbation
from .marker_discovery.dynamic_markers_helper import get_progressionmarker_background
from .marker_discovery.dynamic_markers import runGetProgressionMarker_one_dist
[docs]
class analyst:
'''
The analyst class is the class to perform downstream analysis. The analyst class will calculate the hierarchical markers, dynamic markers and perform the pathway and drug perturbations.
parameters
----------------
data_path: str
the directory of the data (h5ad format, e.g. dataset.h5ad).
iteration: int
the iteration used for analysis.
target_dir: str
the directory to save the results. Default is None.
customized_drug: str
the customized drug perturbation list. Default is None.
cmap_dir: str
the directory to the cmap database. Default is None.
'''
[docs]
def __init__(self,data_path,iteration,target_dir=None,customized_drug=None,cmap_dir=None,customized_mode = False):
self.adata = sc.read(data_path)
self.data_folder = os.path.dirname(data_path)
self.adata.uns = pickle.load(open(self.data_folder+'/attribute.pkl', 'rb'))
self.total_stage = len(self.adata.obs['stage'].unique())
self.customized_drug = customized_drug
self.cmap_dir = cmap_dir
self.iteration = iteration
if target_dir is None:
self.target_dir = './'+self.data_folder.split('/')[-3]+'_'+str(self.iteration)
initalcommand = 'mkdir '+ self.target_dir
p = subprocess.Popen(initalcommand, stdout=subprocess.PIPE, shell=True)
else:
self.target_dir = target_dir
if customized_mode:
train_params = json.load(open(os.path.join(os.path.dirname(data_path),'model_save/training_parameters.json'),'r'))
else:
train_params = json.load(open(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(data_path))),'model_save/training_parameters.json'),'r'))
self.model_name = train_params['task']+'_'+str(self.iteration)+'.pth'
def perturbation_analyse_customized_pathway(self,customized_pathway,bound=0.5,save_csv = None,save_adata = None,CUDA=False,device='cpu',random_genes=5,random_times=100):
'''
Perform perturbation on customized pathway.
'''
self.adata = calculateDataPathwayOverlapGene(self.adata,customized_pathway=customized_pathway)
print('calculateDataPathwayOverlapGene done')
self.adata = calculateTopPathwayGeneRanking(self.adata)
print('Start perturbation....')
gc.collect()
a = perturbation(self.adata, self.target_dir+'/model_save/'+self.model_name,self.target_dir+'/idrem')
a.run('pathway',bound,inplace=True,CUDA=CUDA,device=device)
a.run('random_background',bound,inplace=True,CUDA=CUDA,device=device,random_genes=random_genes,random_times=random_times)
print('random background done')
a.analysis('pathway',bound)
print('Finish results analysis')
if save_csv is not None:
a.uns['pathway_perturbation'].to_csv(save_csv)
if save_adata is not None:
a.adata.write(save_adata,compression='gzip', compression_opts=9)
def perturbation_analyse_customized_drug(self,customized_drug,bound=0.5,save_csv = None,save_adata = None,CUDA=True,device='cuda:0',advanced=False,random_genes=2,random_times=100):
'''
Perform perturbation on customized drug.
'''
self.adata = process_customized_drug_database(self.adata, customized_drug=customized_drug)
print('Start perturbation....')
gc.collect()
a = perturbation(self.adata, self.target_dir+'/model_save/'+self.model_name,self.target_dir+'/idrem')
a.run('drug',bound,inplace=True,CUDA=CUDA,device=device)
print('drug perturabtion done')
a.run('random_background',bound,inplace=True,CUDA=CUDA,device=device,random_genes=random_genes,random_times=random_times)
print('random background done')
a.analysis('drug',bound)
print('Finish results analysis')
if save_csv is not None:
a.uns['drug_perturbation'].to_csv(save_csv)
if save_adata is not None:
a.adata.write(save_adata,compression='gzip', compression_opts=9)
[docs]
def start_analyse(self,progressionmarker_background_sampling,run_pertubration):
'''
Perform downstream tasks including dynamic markers discoveries, hierarchical markers discoveries, pathway perturbations and compound perturbations.
parameters
----------------
progressionmarker_background_sampling: int
the number of times to sample the background cells for dynamic markers discoveries.
'''
print('calculate hierarchical markers.....')
hcmarkers= get_dataset_hcmarkers(self.adata,stage_key='stage',cluster_key='leiden',use_rep='umaps')
print('hierarchical static markers done')
self.adata = calculateDataPathwayOverlapGene(self.adata)
print('calculateDataPathwayOverlapGene done')
self.adata = calculateTopPathwayGeneRanking(self.adata)
print('calculateTopPathwayGeneRanking done')
if not os.path.exists(os.path.join(self.target_dir,'idrem')):
initalcommand = 'cp -r ' + os.path.join(os.path.dirname(self.data_folder),'idremResults') +' '+self.target_dir+'/idrem'
p = subprocess.Popen(initalcommand, stdout=subprocess.PIPE, shell=True)
initalcommand = 'mkdir '+self.target_dir+'/model_save'+'&& cp ' + os.path.join(os.path.dirname(os.path.dirname(self.data_folder)),'model_save',self.model_name)+' '+self.target_dir+'/model_save/'+self.model_name +'&& cp ' + os.path.join(os.path.dirname(os.path.dirname(self.data_folder)),'model_save/training_parameters.json')+' '+self.target_dir+'/model_save/training_parameters.json'
p = subprocess.Popen(initalcommand, stdout=subprocess.PIPE, shell=True)
if os.path.exists(os.path.join(self.target_dir,str(progressionmarker_background_sampling)+'progressionmarker_background.npy')):
progressionmarker_background = np.load(os.path.join(self.target_dir,str(progressionmarker_background_sampling)+'progressionmarker_background.npy'),allow_pickle=True)
progressionmarker_background = dict(progressionmarker_background.tolist())
else:
progressionmarker_background = get_progressionmarker_background(times=progressionmarker_background_sampling,adata= self.adata,total_stage=self.total_stage)
np.save(os.path.join(self.target_dir,str(progressionmarker_background_sampling)+'progressionmarker_background.npy'),progressionmarker_background)
self.adata.uns['progressionMarkers'] = runGetProgressionMarker_one_dist(os.path.join(os.path.dirname(self.data_folder),'idremResults'),progressionmarker_background,self.adata.shape[1],cutoff=0.05)
print('Dynamic markers discovery.....done....')
a = perturbation(self.adata, self.target_dir+'/model_save/'+self.model_name,self.target_dir+'/idrem')
if run_pertubration:
if self.customized_drug is not None:
self.adata = find_overlap_and_assign_direction(self.adata, customized_drug=self.customized_drug,cmap_dir=self.cmap_dir)
else:
self.adata = find_overlap_and_assign_direction(self.adata,cmap_dir=self.cmap_dir)
print('Start perturbation....')
gc.collect()
a.run('pathway',0.5,inplace=True,CUDA=True)
print('pathway perturbatnion done')
a.run('drug',0.5,inplace=True)
print('drug perturabtion done')
a.run('random_background',0.5,inplace=True)
print('random background done')
a.run('online_random_background',0.5,inplace=True)
print('online random background done')
a.analysis('pathway',0.5)
print('analysis of pathway perturbation')
a.analysis('drug',0.5)
print('analysis of drug perturbation')
a.adata.uns['hcmarkers'] = hcmarkers #get_dataset_hcmarkers(self.adata,stage_key='stage',cluster_key='leiden',use_rep='umaps')
with open(os.path.join(self.target_dir,'attribute.pkl'),'wb') as f:
pickle.dump(a.adata.uns,f)
del a.adata.uns
a.adata.obs['leiden'] = a.adata.obs['leiden'].astype(str)
a.adata.obs['stage'] = a.adata.obs['stage'].astype(str)
a.adata.obs['ident'] = a.adata.obs['ident'].astype(str)
a.adata.write(self.target_dir+ '/dataset.h5ad',compression='gzip', compression_opts=9)