Source code for UNAGI.marker_discovery.dynamic_markers

# get top markers for each track from IDREM results, the requirements for top markers are:
# 1. the tendency of the marker is monotonic
# 2. the log2FC of the marker is larger than a cutoff
import json
import os
import numpy as np
import pandas as pd
from scipy.stats import norm
from ..dynamic_regulatory_networks.processTFs import readIdremJson

[docs]
def scoreAgainstBackground(background,input,all=False,mean=None,std=None):
    '''
    Calculate the p-value of the input gene expression change based on the background gene expression change.
    
    parameters
    --------------------
    background: the background gene expression change
    input: the input gene expression change
    all: if all is True, the input gene expression change will be compared with all the background gene expression changes. Otherwise, the input gene expression change will be compared with each background gene expression change.
    mean: the mean of the background gene expression change
    std: the standard deviation of the background gene expression change

    return
    --------------------
    cdf: np.float
        p-value of the input gene expression change
    '''

    if all:
        background = background.reshape(-1,1)

    # print(background.mean(axis=0), background.std(axis=0))
    if mean is None:
        cdf = norm.cdf(input, loc=background.mean(axis=0), scale=background.std(axis=0))
    else:
        cdf = norm.cdf(input, loc=mean, scale=std)
    
    return cdf


def getTopMarkers(idrem, background,filename, cutoff=None,topN=None,one_dist=False):
    '''
    **not updated**
    '''
    topMarkers = {}
    tt = readIdremJson(idrem,filename)

    background = np.array(background)
    stages = len(filename.split('.')[0].split('-'))
    
    temp = np.array(tt[8])
    idrem_genes = np.array(temp[1:,0].tolist())
    tendency = temp[1:,1].astype(float)
    for i in range(2,stages):
        tendency*=temp[1:,i].astype(float)
    tendency[tendency <0] = 0
    index = [i for i, x in enumerate(tendency) if x <= 0]
    change = temp[1:,stages].astype(float) - temp[1:,1].astype(float)
    stage_values = []
    for i in range(0,stages):
        stage_values.append(temp[1:,i+1].astype(float))

    stages_list = []
    for i in range(0,stages):
        stages_list.append(temp[1:,i+1].astype(float))
    
    change[index] = 0
    if cutoff is not None:
        topMarkers['increasing'] = {}
        topMarkers['decreasing'] = {}
        increasing_stop = np.where(change > 0)[0]
        decreasing_stop = np.where(change < 0)[0]
        temp_change = change[increasing_stop]
        temp_names = idrem_genes[increasing_stop]
        temp_stages = []
        for i in range(0,stages):
            temp_stages.append(stages_list[i][increasing_stop])
        temp_background = background[:,increasing_stop]

        
        increasing_stop = temp_change.argsort()[::-1]
        # increasing_pval = scoreAgainstBackground(background[:,increasing_stop],temp_change)
        
        topMarkers['increasing']['gene'] = {}
        topMarkers['increasing']['log2fc'] = {}
        topMarkers['increasing']['rank'] = {}
        for i in range(0,stages):
            topMarkers['increasing']['stage'+str(i)] = {} 
        topMarkers['increasing']['qval'] = {}
        count = 0
        pvals = []
        in_pvals = []
        if one_dist:
            temp_background = temp_background.reshape(-1,1)
            mean = temp_background.mean()
            std = temp_background.std()
        for i, each in enumerate(increasing_stop):
            if one_dist:
                increasing_pval = scoreAgainstBackground(temp_background,temp_change[each],mean = mean, std = std)
            else:
                increasing_pval = scoreAgainstBackground(temp_background[:, each],temp_change[each])
                if increasing_pval < (1-cutoff):
                    continue
                increasing_pval = scoreAgainstBackground(temp_background,temp_change[each],all=True)
            if increasing_pval < (1-cutoff):
                continue
            in_pvals.append(each)
            pvals.append(1-increasing_pval)
            count+=1
        count_1 = 0
        #sort pvals by sorted
        pvals = np.array(pvals).reshape(-1)

        in_pvals = np.array(in_pvals).reshape(-1)
        pvals = pvals[pvals.argsort()]

        in_pvals = in_pvals[pvals.argsort()]
        for i in range(count):
            
            q_val = pvals[i] * count / (i+1)
            if q_val > cutoff:
                continue
            each = in_pvals[i]
            topMarkers['increasing']['gene'][str(count_1)] = temp_names[each]
            for i in range(0,stages):
                topMarkers['increasing']['stage'+str(i)][str(count_1)] = temp_stages[i][each]
            topMarkers['increasing']['log2fc'][str(count_1)] = temp_change[each]
            topMarkers['increasing']['rank'][str(count_1)] = count_1+1
            topMarkers['increasing']['qval'][str(count_1)] = q_val
            count_1+=1
        temp_change = change[decreasing_stop]
        temp_names = idrem_genes[decreasing_stop]
        temp_stages = []
        for i in range(0,stages):
            temp_stages.append(stages_list[i][decreasing_stop]) 
        temp_background = background[:,decreasing_stop]
        decreasing_stop = temp_change.argsort()
        # decreasing_pval = scoreAgainstBackground(background[:,decreasing_stop],temp_change)
        
        topMarkers['decreasing']['gene'] = {}
        topMarkers['decreasing']['log2fc'] = {}
        topMarkers['decreasing']['rank'] = {}
        for i in range(0,stages):
            topMarkers['decreasing']['stage'+str(i)] = {}
        topMarkers['decreasing']['qval'] = {}
        count = 0
        pvals = []
        in_pvals = []
        if one_dist:
            temp_background = temp_background.reshape(-1,1)
            mean = temp_background.mean()
            std = temp_background.std()
        for i, each in enumerate(decreasing_stop):
            if one_dist:
                decreasing_pval = scoreAgainstBackground(temp_background,temp_change[each],mean = mean, std = std)
            else:
                decreasing_pval = scoreAgainstBackground(temp_background[:, each],temp_change[each])
                if decreasing_pval > cutoff:
                    continue
                decreasing_pval = scoreAgainstBackground(temp_background,temp_change[each],all=True)
            if decreasing_pval > cutoff:
                continue
            in_pvals.append(each)
            pvals.append(decreasing_pval)
            count+=1
        count_1 = 0
        
        pvals = np.array(pvals).reshape(-1)
        in_pvals = np.array(in_pvals).reshape(-1)
        pvals = pvals[pvals.argsort()]
        in_pvals = in_pvals[pvals.argsort()]

        for i in range(count):
            q_val = pvals[i] * count/(i+1)  #1 - (1 - topMarkers['decreasing']['pval'][str(i)]) * (count/(i+1))
            if q_val > cutoff:
                continue
            each = in_pvals[i]
            topMarkers['decreasing']['gene'][str(count_1)] = temp_names[each]
            topMarkers['decreasing']['log2fc'][str(count_1)] = temp_change[each]
            topMarkers['decreasing']['rank'][str(count_1)] = count_1+1 
            for i in range(0,stages):
                topMarkers['decreasing']['stage'+str(i)][str(count_1)] = temp_stages[i][each]
            topMarkers['decreasing']['qval'][str(count_1)] = q_val
            count_1+=1
    else: # if cutoff is not given, return ranked list
        topMarkers['increasing'] = {}
        topMarkers['decreasing'] = {}
        increasing_stop = np.where(change > 0)[0]
        decreasing_stop = np.where(change < 0)[0]
        temp_change = change[increasing_stop]
        temp_names = idrem_genes[increasing_stop]
        temp_stages = []
        for i in range(0,stages):
            temp_stages.append(stages_list[i][increasing_stop])
        increasing_stop = temp_change.argsort()[::-1]
        topMarkers['increasing']['gene'] = {}
        topMarkers['increasing']['log2fc'] = {}
        topMarkers['increasing']['rank'] = {}
        for i in range(0,stages):
            topMarkers['increasing']['stage'+str(i)] = {}
        if topN is not None:
            for i, each in enumerate(increasing_stop[:topN]):
                topMarkers['increasing']['gene'][str(i)] = temp_names[each]
                for i in range(0,stages):
                    topMarkers['increasing']['stage'+str(i)][str(i)] = temp_stages[i][each]
                topMarkers['increasing']['log2fc'][str(i)] = temp_change[each]
                topMarkers['increasing']['rank'][str(i)] = i+1
        else:
            for i, each in enumerate(increasing_stop):
                topMarkers['increasing']['gene'][str(i)] = temp_names[each]
                for i in range(0,stages):
                    topMarkers['increasing']['stage'+str(i)][str(i)] = temp_stages[i][each]
                topMarkers['increasing']['log2fc'][str(i)] = temp_change[each]
                topMarkers['increasing']['rank'][str(i)] = i+1
        temp_change = change[decreasing_stop]
        temp_names = idrem_genes[decreasing_stop]
        temp_stages = []
        for i in range(0,stages):
            temp_stages.append(stages_list[i][decreasing_stop])
        decreasing_stop = temp_change.argsort()
        topMarkers['decreasing']['gene'] = {}
        topMarkers['decreasing']['log2fc'] = {}
        topMarkers['decreasing']['rank'] = {}
        for i in range(0,stages):
            topMarkers['decreasing']['stage'+str(i)] = {}
        if topN is not None:
            for i, each in enumerate(decreasing_stop[:topN]):
                topMarkers['decreasing']['gene'][str(i)] = temp_names[each]
                topMarkers['decreasing']['log2fc'][str(i)] = temp_change[each]
                for i in range(0,stages):
                    topMarkers['decreasing']['stage'+str(i)][str(i)] = temp_stages[i][each]
                topMarkers['decreasing']['rank'][str(i)] = i+1

        else:
            for i, each in enumerate(decreasing_stop):
                topMarkers['decreasing']['gene'][str(i)] = temp_names[each]
                topMarkers['decreasing']['log2fc'][str(i)] = temp_change[each]
                topMarkers['decreasing']['rank'][str(i)] = i+1 
                for i in range(0,stages):
                    topMarkers['decreasing']['stage'+str(i)][str(i)] = temp_stages[i][each]


    return topMarkers

[docs]
def getTopMarkersFromIDREM(path, background,cutoff=None,topN=None,one_dist=False):
    '''
    Get the top markers for each track from IDREM results.
    parameters
    --------------------
    path: str
        the directory to the IDREM results.
    background: np.array
        the background gene expression change
    cutoff: float
        the cutoff for p-value. Default is None.
    topN: int
        the number of top markers to return. Default is None.
    one_dist: bool
        whether to consider all the background gene expression changes as one distribution. Default is False.

    return
    ----------------
    out:    dict
        a dictionary of top markers for each track.
    '''
    out = {}
    filenames = os.listdir(path)
    for each in filenames:
        name = each
        if each[0] != '.':
            each = each.split('.')[0]#.split('-')[-1].split('n')
            if one_dist:
                out[each] = getTopMarkers(path,background,name,cutoff,topN,one_dist=one_dist)
            else:
                out[each] = getTopMarkers(path,background[each],name,cutoff,topN)
    return out



[docs]
def runGetProgressionMarkercsv(directory,background, save_dir, topN=None,cutoff=None):
    '''
    Get the top markers for each track from IDREM results and save as a csv file.
    
    parameters
    --------------------
    directory: str
        the directory to the IDREM results.
    background: str
        the directory to the background gene expression change.
    cutoff: float
        the cutoff for p-value. Default is None.
    topN: int
        the number of top markers to return. Default is None.
    save_dir: str
        the directory to save the csv file.

    return
    ---------------
    None
    '''
    background = dict(np.load(background,allow_pickle=True).tolist())
    out = getTopMarkersFromIDREM(directory,background,topN=topN,cutoff=cutoff)
    results = pd.json_normalize(out)
    results.columns = results.columns.str.split(".").map(tuple)
    results = results.stack([0,1,2]).reset_index(0, drop=True)
    results = results.transpose()
    results = results.reset_index()
    results['index'] = results['index'].astype('int32')
    results = results.set_index(['index']).sort_index()
    # results = results.transpose()

    results = results.sort_index()
    # print(results)
    # return results
    results.to_csv(os.path.join(save_dir,'mesProgressionMarker_pval_twofilters.csv')) 




[docs]
def runGetProgressionMarker(directory,background, cutoff=0.05, topN=None):
    '''
    Get the top markers for each track from IDREM results.
    
    parameters
    --------------------
    directory: str
        the directory to the IDREM results.
    background: str
        the directory to the background gene expression change.
    cutoff: float
        the cutoff for p-value. Default is 0.05.
    topN: int
        the number of top markers to return. Default is None.

    return
    ---------------
    out:    dict
        a dictionary of top markers for each track.
    '''
    out = getTopMarkersFromIDREM(directory,background,cutoff = cutoff,topN=topN)
    return out




[docs]
def runGetProgressionMarker_one_dist(directory,background,size, cutoff=0.05, topN=None):
    '''
    Get the top markers for each track from IDREM results and consider the whole background as one distribution.
    
    parameters
    --------------------
    directory: str
        the directory to the IDREM results.
    background: str
        the directory to the background gene expression change.
    cutoff: float
        the cutoff for p-value. Default is 0.05.
    topN: int
        the number of top markers to return. Default is None.

    return
    ---------------
    out:    
        a dictionary of top markers for each track.
    '''

    one_dist  = []
    for each in background.keys(): 
        one_dist.append(np.array(background[each]))
    background = np.array(one_dist).reshape(-1, size)
    out = getTopMarkersFromIDREM(directory,background,cutoff = cutoff,one_dist=True)
    return out