Source code for UNAGI.dynamic_regulatory_networks.processIDREM

import numpy as np
import os
import threading
import subprocess
import gc
def getClusterPaths(edges, total_stages):
    '''
    Obtain the paths of each cluster for multiple stages.
    
    parameters
    -----------
    edges: list
        A list of lists, where each sublist contains edges between consecutive stages.
    total_stages: int
        Total number of stages.

    return
    -----------
    paths: list
        A collection of paths of clusters.
    '''
    if len(edges) != total_stages - 1:
        raise ValueError("Number of edges must be one less than total stages")

    paths = {}
    for key in list(edges.keys()):
        edges[int(key)] = edges[key]
    # Initialize paths with the first set of edges
    for each in edges[0]:
        if str(each[0]) not in paths:
            paths[str(each[0])] = [[each[0]], [each[1]]]
        else:
            paths[str(each[0])][1].append(each[1])

    # Iterate through remaining stages
    for stage in range(1, total_stages - 1):
        for each in edges[stage]:
            for item in paths.keys():
                if len(paths[item]) == stage:
                    continue
                if each[0] in paths[item][stage]:
                    if len(paths[item]) == stage + 1:
                        paths[item].append([each[1]])
                    else:
                        paths[item][stage + 1].append(each[1])

    return paths

def getClusterIdrem(paths, state, total_stages):
    '''
    Concatenate the average gene expression in a cluster tree. Shape: [number of stages, number of genes]
    
    parameters
    -----------
    paths: The collection of paths.
    state: A list of average gene expression of each state.
    total_stages: Total number of stages.
    
    return
    -----------
    out: A list of gene expression of each cluster tree.
    '''
    out = []

    for path_key in paths.keys():
        path = paths[path_key]

        # Ensure the path contains the expected number of stages
        if len(path) == total_stages:
            stages = [averageNode(node, state[i]) for i, node in enumerate(path)]

            # Reshape each stage and concatenate
            reshaped_stages = [stage.reshape(-1, 1) for stage in stages]
            joint_matrix = np.concatenate(reshaped_stages, axis=1)

            out.append(joint_matrix)

    return out

def getIdrem(paths,state):
    '''
    concatenate the average gene expression of clusters in each path. shape: [number of stages, number of gene]
    parameters
    ----------------------
    paths: list
        the list of paths
    state: list
        a list of average gene expression of each state
    
    return
    ---------------------- 
    out: list
        a list of gene expression of each path
    '''
    out = []
    for i,each in enumerate(paths):
        joint_matrix = np.concatenate((state[0][each[0]].reshape(-1,1), state[1][each[1]].reshape(-1,1), state[2][each[2]].reshape(-1,1), state[3][each[3]].reshape(-1,1)),axis =1)
        out.append(joint_matrix)
    return out

class IDREMthread(threading.Thread):
    '''
    the thread for running IDREM. Support multiple threads.
    '''
    def __init__(self, indir, outdir, each,idrem_dir):
        threading.Thread.__init__(self)
        self.indir = indir
        self.outdir = outdir
        self.each = each
        self.idrem_dir = idrem_dir
    def run(self):
        
        command = 'cd '+ str(self.idrem_dir)+' && java -Xmx8G -jar idrem.jar -b %s %s'%(self.indir, self.outdir)
        p = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        print(p.stdout.read())


[docs]
def runIdrem(paths, midpath, idremInput,genenames,iteration, idrem_dir, species='Human', Minimum_Standard_Deviation = 0.01,Convergence_Likelihood=0.1,Minimum_Absolute_Log_Ratio_Expression=0.05, trained=False):
    '''
    train IDREM model and save the results in iterative training with midpath and iteration
    
    parameters
    ----------------------
    paths: the path of IPF progression
    idremInput: average gene expression of each path
    trained: if the model is trained, use saved model
    
    '''
    dir1 = os.path.join(midpath, str(iteration)+'/idremInput')
    dir2 = os.path.join(midpath, str(iteration)+'/idremsetting')
    dir3 = os.path.join(midpath, str(iteration)+'/idremModel')

    initalcommand = 'mkdir '+dir1+' && mkdir '+dir2+' && mkdir '+dir3
    p = subprocess.Popen(initalcommand, stdout=subprocess.PIPE, shell=True)
    print(p.stdout.read()) 
    print(paths)
    for i, each in enumerate(paths):
        each_processed = []
        for e in each:
            e = str(e).strip('[]')
            e = e.replace(', ','n')
            each_processed.append(e)
        # each_processed = [str(e).strip('[]').replace(', ', 'n') for e in each]
        print(each_processed)
        file_name = '-'.join(each_processed)
        file_path = os.path.join(midpath, str(iteration), 'idremInput', f'{file_name}.txt')
        header = ['gene'] + [f'stage{j}' for j in range(len(each))]
        with open(file_path, 'w') as f:
            f.write('\t'.join(header) + '\n')
            for j, row in enumerate(idremInput[i]):
                row_data = '\t'.join(str(r) for r in row)
                f.write("%s\t%s\n" % (genenames[j], row_data))
        examplefile_path = os.path.join(idrem_dir, 'example_settings.txt')#open(os.path.join(idrem_dir, 'example_settings.txt'), 'r')
        settings_file_path = os.path.join(midpath, str(iteration), 'idremsetting', f'{file_name}.txt')
        with open(examplefile_path, 'r') as examplefile:
        # Open settings_file_path for writing
            with open(settings_file_path, 'w') as f:
                for k, line in enumerate(examplefile.readlines()):

                    if trained and k == 4:
                        print(midpath) 
                        f.write('%s\t%s\n' % ('Saved_Model_File', os.path.join(os.path.abspath(os.path.join(midpath, str(iteration), 'idremInput')), f'{file_name}.txt')))
                    elif k == 1:
                        if species == 'Human':
                            f.write('%s\t%s\n' % ('TF-gene_Interaction_Source', 'human_encode.txt.gz'))
                            
                            continue
                        elif species =='Mouse':
                            f.write('%s\t%s\n' % ('TF-gene_Interaction_Source', 'mouse_predicted.txt.gz'))
                            continue
                    elif k == 2:
                        if species == 'Human':
                            f.write('%s\t%s\n' % ('TF-gene_Interactions_File', 'TFInput/human_encode.txt.gz'))
                            continue
                        elif species == 'Mouse':
                            f.write('%s\t%s\n' % ('TF-gene_Interactions_File', 'TFInput/mouse_predicted.txt.gz'))
                            continue
                    elif k == 5:
                        if species == 'Human':
                            f.write('%s\t%s\n' % ('Gene_Annotation_Source', 'Human (EBI)'))
                            continue
                        elif species == 'Mouse':
                            f.write('%s\t%s\n' % ('Gene_Annotation_Source', 'Mouse (EBI)'))
                            continue
                    elif k == 6:
                        if species == 'Human':
                            f.write('%s\t%s\n' % ('Gene_Annotation_File', 'goa_human.gaf.gz'))
                            continue 
                        elif species == 'Mouse':
                            f.write('%s\t%s\n' % ('Gene_Annotation_File', 'goa_mouse.gaf.gz'))
                            continue
                    elif k== 17:
                        f.write('%s\n' % ('miRNA-gene_Interaction_Source'))
                        continue
                    elif k== 18:
                        f.write('%s\n' % ('miRNA_Expression_Data_File'))
                        continue
                    elif k== 26:
                        f.write('%s\n' % ('Proteomics_File'))
                        continue
                    elif k == 34:
                        f.write('%s\n' % ('Epigenomic_File'))
                        continue
                    elif k == 35:
                        f.write('%s\n' % ('GTF File'))
                        continue
                    elif k == 42:
                        f.write('%s\t%s\n' % ('Minimum_Absolute_Log_Ratio_Expression', str(Minimum_Absolute_Log_Ratio_Expression)))
                        continue
                    elif k ==51 :
                        f.write('%s\t%s\n' % ('Convergence_Likelihood_%', str(Convergence_Likelihood)))
                        continue
                    elif k == 52:
                        f.write('%s\t%s\n' % ('Minimum_Standard_Deviation', str(Minimum_Standard_Deviation)))
                        continue
                    elif k != 3:
                        f.write(line)
                    else:
                        f.write('%s\t%s\n' % ('Expression_Data_File', os.path.join(os.path.abspath(os.path.join(midpath, str(iteration), 'idremInput')), f'{file_name}.txt')))

        settinglist = os.listdir(os.path.join(midpath,str(iteration)+'/idremsetting/'))
    
    print(settinglist)
    threads = []
    for each in settinglist:
        if each[0] != '.':
            
            indir = os.path.abspath(os.path.join(midpath,str(iteration)+'/idremsetting/',each))
            outdir =os.path.join(os.path.abspath(os.path.join(midpath,str(iteration))+'/idremModel/'),each)
            
            threads.append(IDREMthread(indir, outdir, each,idrem_dir))
    count = 0
    while True:
        if count<len(threads) and count +2 > len(threads):
            threads[count].start()
            threads[count].join()
            break
        elif count == len(threads):
            break
        else:
            threads[count].start()
            threads[count+1].start()
            threads[count].join()
            threads[count+1].join()
            count+=2
    if not trained:
        print(os.getcwd())
        dir1 = os.path.join(midpath, str(iteration)+'/idremResults')
        dir2 = os.path.join(midpath, str(iteration)+'/idremInput/*.txt_viz')
        command = [['rm -r '+dir1],[ ' mkdir '+dir1], [' mv '+dir2+ ' '+dir1]]
        for each in command:
            p = subprocess.Popen(each, stdout=subprocess.PIPE, shell=True)
            print(p.stdout.read())
    print('idrem Done')


def averageNode(nodes,state):
    '''
    calculate the average gene expression of sibling nodes
    
    parameters
    ----------------------
    nodes: int
        number of sibling nodes
    state: list
        the gene expression of each cluster in a certain stage
    
    return
    -----------
    out: the average gene expression of sibling nodes
    '''
    out = 0
    for each in nodes:
        out+=state[each]
    return out/len(nodes)

if __name__ == '__main__':
    import numpy as np
    edges = [[[2, 0], [0, 1], [8, 3], [3, 4], [3, 5], [4, 6], [5, 7], [6, 8], [1, 9]],  [[1, 0], [4, 1], [3, 2], [0, 3], [7, 5], [0, 6], [8, 7], [10, 10], [3, 11], [8, 12]],[[1, 0], [0, 1], [1, 2], [3, 3], [5, 4], [0, 5], [2, 6], [7, 8], [5, 9], [12, 10], [2, 12]]]
    paths = getClusterPaths(edges,4)
    averageValues = np.load('/mnt/md0/yumin/to_upload/UNAGI/UNAGI/data/example/2/averageValues.npy',allow_pickle=True)
    # print(averageValues.shape)
    idrem= getClusterIdrem(paths,averageValues,4)
    print(idrem)