Source code for UNAGI.dynamic_regulatory_networks.processTFs

'''
This module contains functions to identify dynamic genes and accumulate gene weights.
'''
import numpy as np
import gc
import os
import json
import pandas as pd
import scanpy as sc
from scipy.sparse import lil_matrix
from scipy.sparse import csr_matrix
import gc

def getLastStageIDs(json,total_stage):
    '''
    get the list of node ids in last stage 
    parameters
    -----------
    json: idrem meta
    total_stage: total number of stages

    return
    -----------
    out: a list of node ids in last stage
    '''
    out = []
    for each in json[0][1:]:
        if each['nodetime'] == 'stage'+str(total_stage-1):
            out.append(each['nodeID'])
    return out
def getidremPath(json,cid):
    '''
    given the last node id in idrem, get the whole paths
    
    parameters
    -----------
    json: idrem meta file
    cid: the last node id in idrem

    return
    -----------
    nodes: a list of node ids in the path
    '''
    nodes = []

    for each in reversed(json[0][1:]):

        if each['nodeID'] == cid:
            nodes.append(cid)
            if each['parent']!= -1:
                cid = each['parent']
            else:
                break
    nodes.reverse()
    return nodes
def getIdremPaths(json,total_stage):
    '''
    get all paths in idrem
    
    parameters
    -----------
    json: list
        idrem meta files

    return
    -----------
    paths: a list of all paths
    '''
    
    stage3IDs = getLastStageIDs(json,total_stage) 
    paths = []
    for i in stage3IDs:
        paths.append(getidremPath(json, i))
    return paths
def getPosNegMaxChangesNodes(json,paths,total_stage):
    means = []
    for path in paths:
        pathmean = []
        for each in path:
            
            for node in json[0][1:]:
                if node['nodeID'] == each:
                    pathmean.append(node['nodeMean'])
        means.append(pathmean)
    means = np.array(means)
    indicator = total_stage-1
    while indicator > 0:
        means[:,indicator] = means[:,indicator]-means[:,indicator-1]
        indicator-=1
    poschangenodesid = np.argmax(means,axis=0)
    negchangenodesid = np.argmax(-means,axis=0)
    poschangenodes = []
    negchangenodes = []
    for i, each in enumerate(poschangenodesid):
        poschangenodes.append(paths[each][i])
    for i, each in enumerate(negchangenodesid):
        negchangenodes.append(paths[each][i])
    return poschangenodes, negchangenodes
def testChanges(path,filename):
    '''
    a test function
    '''
    tt = readIdremJson(path, filename)
    paths = getIdremPaths(tt,4)

    return getPosNegMaxChangesNodes(tt,paths)
def getTargetGenes(path,N):
    '''
    get top N genes of each path
    parameters
    -----------
    path: the file path of IDREM results
    
    return
    -----------
    out: a list of top N up or down regulators of each path
    '''
    out = []

    filenames = os.listdir(path)
    
    for each in filenames:
        if each[0] != '.':
            #out.append(getMaxMinPathGenes(path,each,N)) #get the genes from nodes that have highest and lowest nodemean
            out.append(getPosNegDynamicPathGenes(path,each,N)) #get the genes from nodes increase and descrease most between stages
    return out

def getPosNegDynamicPathGenes(path,filename,topN):
    '''
    get the genes from nodes that increase and decrease most between stages
    
    parameters
    -----------
    path: the file path of IDREM results
    filename: the file name of IDREM results
    topN: the number of genes to accumulate gene weights

    return
    -----------
    out: a list of top N up or down regulators of each path
    '''
    total_stage = len(filename.split('.')[0].split('-'))
    tt = readIdremJson(path, filename)
    paths = getIdremPaths(tt,total_stage)
    posdynamicids, negdynamicids = getPosNegMaxChangesNodes(tt,paths,total_stage-1)
    posdynamicgenes,posdynamicgeneids = getMaxOrMinNodesGenes(tt,posdynamicids)
    negdynamicgenes,negdynamicgeneids = getMaxOrMinNodesGenes(tt,negdynamicids)
    posnegdynamicgenes = [posdynamicgenes[i]+negdynamicgenes[i] for i in range(total_stage-1)]
    posnegdynamicgeneids = [posdynamicgeneids[i]+negdynamicgeneids[i] for i in range(total_stage-1)]
    out = getTopNTargetGenes(tt,posnegdynamicgenes,posnegdynamicgeneids,topN, total_stage-1)
    return out

def getMaxOrMinNodesGenes(json,nodes):
    '''
    get the genes from dynamic nodes
    parameters
    -----------
    json: idrem meta
    nodes: the list of dynamic nodes

    return
    -----------
    genes: a list of genes in the nodes
    '''
    boolgenes = []
    genes = []
    geneids = []

    for each in json[0]:
        if each['nodeID'] in nodes:
            boolgenes.append(each['genesInNode'])
    for each in boolgenes:
        tempgenes = []
        tempgeneids = []
        for i, gene in enumerate(each):
            if gene == True:
                tempgenes.append(json[3][i].upper())
                tempgeneids.append(i)
        genes.append(tempgenes)
        geneids.append(tempgeneids)
    return genes,geneids

def readIdremJson(path, filename):
    '''
    Parse the IDREM json file
    parameters
    -----------
    path: the file path of IDREM results
    filename: the file name of IDREM results

    return
    -----------
    tt: the parsed IDREM json file
    '''
    print('getting Target genes from ', filename)
    path = os.path.join(path,filename,'DREM.json')
    f=open(path,"r")
    lf=f.readlines()
    f.close()
    lf="".join(lf)
    lf=lf[5:-2]+']'
    tt=json.loads(lf,strict=False)
    return tt

def getTopNTargetGenes(json,genenames,geneids,topN,total_stage):
    '''
    get top N genes of each path sorted by the change of gene expression between stages

    parameters
    -----------
    json: the parsed IDREM json file
    genenames: a list of genes in the nodes
    geneids: a list of gene ids in the nodes
    topN: the number of genes to accumulate gene weights
    total_stage: the total number of stages

    return
    -----------
    out: a list of top N up or down regulators of each path
    '''

    out = [[] for i in range(total_stage)]
    for i in range(total_stage):
        changegene = np.array([json[5][j] for j in geneids[i]])
        change = abs(changegene[:,i+1]-changegene[:,i])
        pddata = pd.DataFrame(change, columns=['change_Value'])
        pddata.index = genenames[i]
        sortedchange = pddata.sort_values(by = 'change_Value',ascending=False)
        topNGenes = sortedchange.index.tolist()[:topN]
        out[i] = topNGenes
    return out

def listTracks(mid, iteration,total_stage):
    '''
    list all tracks in the selected iteration
    parameters
    -----------
    mid: directory to the task
    iteration: the selected iteration
    total_stage: the total number of stages

    return
    -----------
    tempTrack: a list of tracks
    '''
    filenames= os.listdir(os.path.join(mid,str(iteration)+'/idremResults/'))
    #filenames = os.listdir('./reresult/idremVizCluster0.1-nov14/') #defalut path
    tempTrack = [[] for _ in range(total_stage)]
    for each in filenames:
        temp = each.split('.')[0].split('-')
        for i,item in enumerate(temp):
            temp1 = item.split('n')
            tempTrack[i].append(temp1)
    return tempTrack

def matchTFandTGWithFoldChange(TFs,scopes,avgCluster,filename,genenames,total_stage):
    '''
    use target genes from IDREM as scopes to count tf and tgs with human-encode
    parameters
    -----------
    TFs: list
        A list of top N up or down regulators of each path
    scopes: list
        A list of target genes of each path
    avgCluster: list
        A list of average gene expression of each cluster
    filename: str
        The name of IDREM results
    genenames: list
        A list of gene names
    total_stage: int
        The total number of stages

    return
    -----------
    TFTG: a list of gene weights of each path
    '''
    f = open(filename,'r')
    rl = f.readlines()
    f.close()
    TG = {}
    for line in rl[1:]:
        item = line.split('\t')
        if item[0] not in TG.keys():
            TG[item[0]] = []
        genename = item[1].split(';')[0]
        #if genename in genenames:
        TG[item[0]].append(genename)
    genedict = {}
    for i, each in enumerate(genenames):
        genedict[each.upper()] = i
    TFTG = [[] for _ in range(len(TFs))]
    for i, track in enumerate(TFs):
        temp = [[] for _ in range(total_stage-1)]
        TFTG[i] = temp
        
        for j, stage in enumerate(track):
            TFTG[i][j] = np.zeros(shape=len(genenames))
            for tf in stage:
                
                targetGenes = TG[tf[0].split(' ')[0]]
                for each in scopes[i][j]:
                    foldChange = abs(np.log2(avgCluster[j+1][j][genedict[each]]+1) - np.log2(avgCluster[j][i][genedict[each]]+1))
                    TFTG[i][j][genedict[each]] = 1*foldChange+1
                temp2 = np.zeros(shape=len(genenames))
                for each in enumerate(targetGenes):
                    
                    
                    if each[1] in genedict.keys() and each[1] in scopes[i][j]:
                    
                    

                        foldChange = abs(np.log2(avgCluster[j+1][j][genedict[each[1]]]+1) - np.log2(avgCluster[j][i][genedict[each[1]]]+1))
                        # if TFTG[i][j][genedict[each[1]]]<2:
                        TFTG[i][j][genedict[each[1]]] = 1*foldChange+1 #hyperparameter needed to be adusted by users
                    elif each[1] in genedict.keys() and each[1] not in scopes[i][j]:
                        
                        foldChange = abs(np.log2(avgCluster[j+1][i][genedict[each[1]]]+1)-np.log2(avgCluster[j][i][genedict[each[1]]]+1))
                        
                        # if TFTG[i][j][genedict[each[1]]]<2:
                        TFTG[i][j][genedict[each[1]]] = 0.5*foldChange+0.5 #hyperparameter needed to be adusted by users

                if tf[0].split(' ')[0] in genenames:
                    index = genedict[tf[0].split(' ')[0]]
                    # if avgCluster[j+1][i][index] > avgCluster[j][i][index]:
                    #     if avgCluster[j][i][index] == 0:
                    #         foldChange = 1
                    #     else:
                    #         foldChange = avgCluster[j+1][i][index]/avgCluster[j][i][index]
                    # elif avgCluster[j+1][i][index] < avgCluster[j][i][index]:
                    #     if avgCluster[j+1][i][index] == 0:
                    #         foldChange = 1
                    #     else:
                    foldChange = abs(np.log2(avgCluster[j][i][index]+1)-np.log2(avgCluster[j+1][i][index]+1))
                    TFTG[i][j][index] = 2*foldChange+2
                #TFTG[i][j].append(temp2)
            #TFTG[i][j] = np.array(TFTG[i][j])
            #TFTG[i][j] = np.sum(TFTG[i][j],axis = 0)
    TFTG = np.array(TFTG)
    return TFTG

def updateGeneFactorsWithDecay(adata, clusterid,iteration,geneWeight, decayRate = 0.5):
    '''
    update gene weights and decay the weight of genes that are not important in this iteration of a cluster

    parameters
    ----------------------
    adata: anndata
        the cluster of single cell data
    clusterid: int
        the cluster id
    iteration: int
        the selected iteration
    geneWeight: np.array
        the gene weights
    decayRate: float
        the decay rate

    return
    ----------------------
    adata: anndata
        the updated single-cell data
    '''
    geneWeight = geneWeight.reshape(1,-1)
    adata.obs['leiden']=adata.obs['leiden'].astype('int64')
    cells = adata.obs.reset_index()
    celllist = cells[cells['leiden']==int(clusterid)].index.tolist()
    if 'geneWeight' not in adata.layers.keys():
        adata.layers['geneWeight'] = lil_matrix(np.zeros(adata.X.shape)) 
    else:
        adata.layers['geneWeight'] = adata.layers['geneWeight'].tolil()
    adata.layers['geneWeight'][celllist] += geneWeight
    adata.layers['geneWeight'] = adata.layers['geneWeight'].tocsr()
    return adata
    
[docs] def updataGeneTablesWithDecay(mid, iteration, geneFactors,total_stage, decayRate = 0.3): ''' update gene weights and decay the weight of genes that are not important in this iteration parameters ---------------------- mid: str The task name iteration: int the selected iteration geneFactors: np.array the gene weights of each cell total_stage: int the total number of stages decayRate: float the decay rate return ---------------------- difference: np.float the average difference of gene weights between stages ''' tracks = listTracks(mid,iteration,total_stage) difference = 0 for i, stage in enumerate(tracks): if i != 0: gc.collect() adata = sc.read_h5ad(os.path.join(mid,str(iteration)+'/stagedata/%d.h5ad'%(i))) temppreviousMySigmoidGeneWeight = adata.layers['geneWeight'] adata.layers['geneWeight'] = adata.layers['geneWeight'].multiply(decayRate) #sep8 change to all decrease for all cell #adata = sc.read_h5ad('./stagedata/%d.h5ad'%(i)) for j, clusterids in enumerate(stage):#in some tracks, the one stage have many clusters for clusterid in clusterids: # print('number of idrem file', j) # print('stage',i) adata = updateGeneFactorsWithDecay(adata,clusterid,iteration,geneFactors[j][i-1]) # difference+=clusterDiff#each update will get the mySigmoid(increment) previousMySigmoidGeneWeight = mySigmoid(temppreviousMySigmoidGeneWeight.toarray()) currentMySigmoidGeneWeight = mySigmoid(adata.layers['geneWeight'].toarray()) difference += np.mean(np.absolute(currentMySigmoidGeneWeight - previousMySigmoidGeneWeight)) adata.layers['geneWeight'] = adata.layers['geneWeight'].tocsr() adata.write(os.path.join(mid,str(iteration)+'/stagedata/%d.h5ad'%i),compression='gzip' ) else: if int(iteration) == 0: adata = sc.read_h5ad(os.path.join(mid,str(iteration)+'/stagedata/%d.h5ad'%(i))) if 'geneWeight' not in adata.layers.keys(): adata.layers['geneWeight'] = csr_matrix(np.zeros(adata.X.shape)) adata.write(os.path.join(mid,str(iteration)+'/stagedata/%d.h5ad'%i),compression='gzip' ) return difference/(total_stage-1)
def checkupDown(idrem, genename): ''' check if the TFs is a up or down regulator parameters ----------- idrem: list the iDREM metafile genename: list a list of name of TF(str) return ----------- flag: bool up regulator return 1, down regulator return 0 ''' genename=genename.split(' ')[0] if genename not in idrem[3]: flag = 0 else: flag = 1 return flag def mergeTFs(TFs,total_stage): ''' merge top N up or down regulators into the stage level and remove the repeated regulators among sibling nodes of IDREM tree parameters ---------------------- TFs: list a list of top N up or down regulators of a IDREM tree total_stage: int the total number of stages return ----------- out: list a list of up or down regulators of each stage ''' upAndDownset= [set(),set(),set()] out = [[] for _ in range(total_stage-1)] for i, each in enumerate(TFs): for item in each: for data in item: if data[0].split(' ')[0] not in upAndDownset[i]: upAndDownset[i].add(data[0].split(' ')[0]) out[i].append(data) return out def getTopNUpandDown(TFs,topN=20): ''' obtain top 20 up or down regulators based on the score overall (P value) parameters ----------- TFs: list a list of up or down regulators topN: int the number of top regulators to be extracted. Default is 20 return ----------- TFs: list top N up or down regulators ''' if len(TFs[0]) == 6: TFs = sorted(TFs,key=lambda x:x[5]) else: TFs = sorted(TFs,key=lambda x:x[6]) TFs = TFs[:topN] return TFs def extractTFs(path,filename,total_stage,topN=20): ''' extract top N up or down TFs of a certain path from the DREM json file parameters ----------- filename: str the name of certain paths total_stage: int the total number of stages topN: int the number of top regulators to be extracted. Default is 20 return ----------- extractedTFs: list top N up or down TFs of a certain path ''' print('getting TFs from ', filename) path = os.path.join(path,filename,'DREM.json') extractedTFs = [[] for _ in range(total_stage-1)] f=open(path,"r") lf=f.readlines() f.close() lf="".join(lf) lf=lf[5:-2]+']' tt=json.loads(lf,strict=False) TFs = [[] for _ in range(total_stage-1)] stages = [str(i+1) for i in range(total_stage-1)] for each in tt[0][1:]: temp = [] for item in each['ETF']: if checkupDown(tt, item[0]): temp.append(item) if len(temp) ==0: continue temp = getTopNUpandDown(temp,topN) # print(each['nodetime'],stages.index(each['nodetime'][-1])) TFs[stages.index(each['nodetime'][-1])].append(temp) extractedTFs = mergeTFs(TFs,total_stage=total_stage) return extractedTFs def getTFs(path,total_stage,topN=20): ''' get top N up or down regulators of each path parameters ----------- path: str the file path of IDREM results topN: int the number of top regulators to be extracted. Default is 20 total_stage: int the total number of stages return ----------- out: list a list of top N up or down regulators of each path ''' out = [] filenames = os.listdir(path) for each in filenames: if each[0] != '.': out.append(extractTFs(path,each,topN=topN,total_stage=total_stage)) return out def mySigmoid(z,weight=-4): ''' new shifted sigmoid transformation for replace strategy parameters ----------- z: np.array input data weight: float the weight of sigmoid transformation return ----------- out: np.array data after shifted sigmoid transformation ''' out = 1/(1+20*np.exp(weight*z+1.5)) #hyper parameters needed to be adjusted by users return out if __name__ == '__main__': getPosNegDynamicPathGenes('/mnt/md0/yumin/to_upload/UNAGI/tutorials/example_1/idrem','4-4-5-1.txt_viz',50)