Source code for UNAGI.dynamic_regulatory_networks.processIDREM

import numpy as np
import os
import threading
import subprocess
import gc
def getClusterPaths(edges, total_stages):
    '''
    Obtain the paths of each cluster for multiple stages.
    
    parameters
    -----------
    edges: list
        A list of lists, where each sublist contains edges between consecutive stages.
    total_stages: int
        Total number of stages.

    return
    -----------
    paths: list
        A collection of paths of clusters.
    '''
    if len(edges) != total_stages - 1:
        raise ValueError("Number of edges must be one less than total stages")

    paths = {}
    for key in list(edges.keys()):
        edges[int(key)] = edges[key]
    # Initialize paths with the first set of edges
    for each in edges[0]:
        if str(each[0]) not in paths:
            paths[str(each[0])] = [[each[0]], [each[1]]]
        else:
            paths[str(each[0])][1].append(each[1])

    # Iterate through remaining stages
    for stage in range(1, total_stages - 1):
        for each in edges[stage]:
            for item in paths.keys():
                if len(paths[item]) == stage:
                    continue
                if each[0] in paths[item][stage]:
                    if len(paths[item]) == stage + 1:
                        paths[item].append([each[1]])
                    else:
                        paths[item][stage + 1].append(each[1])

    return paths

def getClusterIdrem(paths, state, total_stages):
    '''
    Concatenate the average gene expression in a cluster tree. Shape: [number of stages, number of genes]
    
    parameters
    -----------
    paths: The collection of paths.
    state: A list of average gene expression of each state.
    total_stages: Total number of stages.
    
    return
    -----------
    out: A list of gene expression of each cluster tree.
    '''
    out = []

    for path_key in paths.keys():
        path = paths[path_key]

        # Ensure the path contains the expected number of stages
        if len(path) == total_stages:
            stages = [averageNode(node, state[i]) for i, node in enumerate(path)]

            # Reshape each stage and concatenate
            reshaped_stages = [stage.reshape(-1, 1) for stage in stages]
            joint_matrix = np.concatenate(reshaped_stages, axis=1)

            out.append(joint_matrix)

    return out

def getIdrem(paths,state):
    '''
    concatenate the average gene expression of clusters in each path. shape: [number of stages, number of gene]
    parameters
    ----------------------
    paths: list
        the list of paths
    state: list
        a list of average gene expression of each state
    
    return
    ---------------------- 
    out: list
        a list of gene expression of each path
    '''
    out = []
    for i,each in enumerate(paths):
        joint_matrix = np.concatenate((state[0][each[0]].reshape(-1,1), state[1][each[1]].reshape(-1,1), state[2][each[2]].reshape(-1,1), state[3][each[3]].reshape(-1,1)),axis =1)
        out.append(joint_matrix)
    return out

class IDREMthread(threading.Thread):
    '''
    the thread for running IDREM. Support multiple threads.
    '''
    def __init__(self, indir, outdir, each,idrem_dir):
        threading.Thread.__init__(self)
        self.indir = indir
        self.outdir = outdir
        self.each = each
        self.idrem_dir = idrem_dir
    def run(self):
        
        command = 'cd '+ str(self.idrem_dir)+' && java -Xmx8G -jar idrem.jar -b %s %s'%(self.indir, self.outdir)
        p = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        print(p.stdout.read())

[docs] def runIdrem(paths, midpath, idremInput,genenames,iteration, idrem_dir, species='Human', Minimum_Standard_Deviation = 0.01,Convergence_Likelihood=0.1,Minimum_Absolute_Log_Ratio_Expression=0.05, trained=False): ''' train IDREM model and save the results in iterative training with midpath and iteration parameters ---------------------- paths: the path of IPF progression idremInput: average gene expression of each path trained: if the model is trained, use saved model ''' dir1 = os.path.join(midpath, str(iteration)+'/idremInput') dir2 = os.path.join(midpath, str(iteration)+'/idremsetting') dir3 = os.path.join(midpath, str(iteration)+'/idremModel') initalcommand = 'mkdir '+dir1+' && mkdir '+dir2+' && mkdir '+dir3 p = subprocess.Popen(initalcommand, stdout=subprocess.PIPE, shell=True) print(p.stdout.read()) print(paths) for i, each in enumerate(paths): each_processed = [] for e in each: e = str(e).strip('[]') e = e.replace(', ','n') each_processed.append(e) # each_processed = [str(e).strip('[]').replace(', ', 'n') for e in each] print(each_processed) file_name = '-'.join(each_processed) file_path = os.path.join(midpath, str(iteration), 'idremInput', f'{file_name}.txt') header = ['gene'] + [f'stage{j}' for j in range(len(each))] with open(file_path, 'w') as f: f.write('\t'.join(header) + '\n') for j, row in enumerate(idremInput[i]): row_data = '\t'.join(str(r) for r in row) f.write("%s\t%s\n" % (genenames[j], row_data)) examplefile_path = os.path.join(idrem_dir, 'example_settings.txt')#open(os.path.join(idrem_dir, 'example_settings.txt'), 'r') settings_file_path = os.path.join(midpath, str(iteration), 'idremsetting', f'{file_name}.txt') with open(examplefile_path, 'r') as examplefile: # Open settings_file_path for writing with open(settings_file_path, 'w') as f: for k, line in enumerate(examplefile.readlines()): if trained and k == 4: print(midpath) f.write('%s\t%s\n' % ('Saved_Model_File', os.path.join(os.path.abspath(os.path.join(midpath, str(iteration), 'idremInput')), f'{file_name}.txt'))) elif k == 1: if species == 'Human': f.write('%s\t%s\n' % ('TF-gene_Interaction_Source', 'human_encode.txt.gz')) continue elif species =='Mouse': f.write('%s\t%s\n' % ('TF-gene_Interaction_Source', 'mouse_predicted.txt.gz')) continue elif k == 2: if species == 'Human': f.write('%s\t%s\n' % ('TF-gene_Interactions_File', 'TFInput/human_encode.txt.gz')) continue elif species == 'Mouse': f.write('%s\t%s\n' % ('TF-gene_Interactions_File', 'TFInput/mouse_predicted.txt.gz')) continue elif k == 5: if species == 'Human': f.write('%s\t%s\n' % ('Gene_Annotation_Source', 'Human (EBI)')) continue elif species == 'Mouse': f.write('%s\t%s\n' % ('Gene_Annotation_Source', 'Mouse (EBI)')) continue elif k == 6: if species == 'Human': f.write('%s\t%s\n' % ('Gene_Annotation_File', 'goa_human.gaf.gz')) continue elif species == 'Mouse': f.write('%s\t%s\n' % ('Gene_Annotation_File', 'goa_mouse.gaf.gz')) continue elif k== 17: f.write('%s\n' % ('miRNA-gene_Interaction_Source')) continue elif k== 18: f.write('%s\n' % ('miRNA_Expression_Data_File')) continue elif k== 26: f.write('%s\n' % ('Proteomics_File')) continue elif k == 34: f.write('%s\n' % ('Epigenomic_File')) continue elif k == 35: f.write('%s\n' % ('GTF File')) continue elif k == 42: f.write('%s\t%s\n' % ('Minimum_Absolute_Log_Ratio_Expression', str(Minimum_Absolute_Log_Ratio_Expression))) continue elif k ==51 : f.write('%s\t%s\n' % ('Convergence_Likelihood_%', str(Convergence_Likelihood))) continue elif k == 52: f.write('%s\t%s\n' % ('Minimum_Standard_Deviation', str(Minimum_Standard_Deviation))) continue elif k != 3: f.write(line) else: f.write('%s\t%s\n' % ('Expression_Data_File', os.path.join(os.path.abspath(os.path.join(midpath, str(iteration), 'idremInput')), f'{file_name}.txt'))) settinglist = os.listdir(os.path.join(midpath,str(iteration)+'/idremsetting/')) print(settinglist) threads = [] for each in settinglist: if each[0] != '.': indir = os.path.abspath(os.path.join(midpath,str(iteration)+'/idremsetting/',each)) outdir =os.path.join(os.path.abspath(os.path.join(midpath,str(iteration))+'/idremModel/'),each) threads.append(IDREMthread(indir, outdir, each,idrem_dir)) count = 0 while True: if count<len(threads) and count +2 > len(threads): threads[count].start() threads[count].join() break elif count == len(threads): break else: threads[count].start() threads[count+1].start() threads[count].join() threads[count+1].join() count+=2 if not trained: print(os.getcwd()) dir1 = os.path.join(midpath, str(iteration)+'/idremResults') dir2 = os.path.join(midpath, str(iteration)+'/idremInput/*.txt_viz') command = [['rm -r '+dir1],[ ' mkdir '+dir1], [' mv '+dir2+ ' '+dir1]] for each in command: p = subprocess.Popen(each, stdout=subprocess.PIPE, shell=True) print(p.stdout.read()) print('idrem Done')
def averageNode(nodes,state): ''' calculate the average gene expression of sibling nodes parameters ---------------------- nodes: int number of sibling nodes state: list the gene expression of each cluster in a certain stage return ----------- out: the average gene expression of sibling nodes ''' out = 0 for each in nodes: out+=state[each] return out/len(nodes) if __name__ == '__main__': import numpy as np edges = [[[2, 0], [0, 1], [8, 3], [3, 4], [3, 5], [4, 6], [5, 7], [6, 8], [1, 9]], [[1, 0], [4, 1], [3, 2], [0, 3], [7, 5], [0, 6], [8, 7], [10, 10], [3, 11], [8, 12]],[[1, 0], [0, 1], [1, 2], [3, 3], [5, 4], [0, 5], [2, 6], [7, 8], [5, 9], [12, 10], [2, 12]]] paths = getClusterPaths(edges,4) averageValues = np.load('/mnt/md0/yumin/to_upload/UNAGI/UNAGI/data/example/2/averageValues.npy',allow_pickle=True) # print(averageValues.shape) idrem= getClusterIdrem(paths,averageValues,4) print(idrem)