# get top markers for each track from IDREM results, the requirements for top markers are:
# 1. the tendency of the marker is monotonic
# 2. the log2FC of the marker is larger than a cutoff
import json
import os
import numpy as np
import pandas as pd
from scipy.stats import norm
from ..dynamic_regulatory_networks.processTFs import readIdremJson
[docs]
def scoreAgainstBackground(background,input,all=False,mean=None,std=None):
'''
Calculate the p-value of the input gene expression change based on the background gene expression change.
parameters
--------------------
background: the background gene expression change
input: the input gene expression change
all: if all is True, the input gene expression change will be compared with all the background gene expression changes. Otherwise, the input gene expression change will be compared with each background gene expression change.
mean: the mean of the background gene expression change
std: the standard deviation of the background gene expression change
return
--------------------
cdf: np.float
p-value of the input gene expression change
'''
if all:
background = background.reshape(-1,1)
# print(background.mean(axis=0), background.std(axis=0))
if mean is None:
cdf = norm.cdf(input, loc=background.mean(axis=0), scale=background.std(axis=0))
else:
cdf = norm.cdf(input, loc=mean, scale=std)
return cdf
def getTopMarkers(idrem, background,filename, cutoff=None,topN=None,one_dist=False):
'''
**not updated**
'''
topMarkers = {}
tt = readIdremJson(idrem,filename)
background = np.array(background)
stages = len(filename.split('.')[0].split('-'))
temp = np.array(tt[8])
idrem_genes = np.array(temp[1:,0].tolist())
tendency = temp[1:,1].astype(float)
for i in range(2,stages):
tendency*=temp[1:,i].astype(float)
tendency[tendency <0] = 0
index = [i for i, x in enumerate(tendency) if x <= 0]
change = temp[1:,stages].astype(float) - temp[1:,1].astype(float)
stage_values = []
for i in range(0,stages):
stage_values.append(temp[1:,i+1].astype(float))
stages_list = []
for i in range(0,stages):
stages_list.append(temp[1:,i+1].astype(float))
change[index] = 0
if cutoff is not None:
topMarkers['increasing'] = {}
topMarkers['decreasing'] = {}
increasing_stop = np.where(change > 0)[0]
decreasing_stop = np.where(change < 0)[0]
temp_change = change[increasing_stop]
temp_names = idrem_genes[increasing_stop]
temp_stages = []
for i in range(0,stages):
temp_stages.append(stages_list[i][increasing_stop])
temp_background = background[:,increasing_stop]
increasing_stop = temp_change.argsort()[::-1]
# increasing_pval = scoreAgainstBackground(background[:,increasing_stop],temp_change)
topMarkers['increasing']['gene'] = {}
topMarkers['increasing']['log2fc'] = {}
topMarkers['increasing']['rank'] = {}
for i in range(0,stages):
topMarkers['increasing']['stage'+str(i)] = {}
topMarkers['increasing']['qval'] = {}
count = 0
pvals = []
in_pvals = []
if one_dist:
temp_background = temp_background.reshape(-1,1)
mean = temp_background.mean()
std = temp_background.std()
for i, each in enumerate(increasing_stop):
if one_dist:
increasing_pval = scoreAgainstBackground(temp_background,temp_change[each],mean = mean, std = std)
else:
increasing_pval = scoreAgainstBackground(temp_background[:, each],temp_change[each])
if increasing_pval < (1-cutoff):
continue
increasing_pval = scoreAgainstBackground(temp_background,temp_change[each],all=True)
if increasing_pval < (1-cutoff):
continue
in_pvals.append(each)
pvals.append(1-increasing_pval)
count+=1
count_1 = 0
#sort pvals by sorted
pvals = np.array(pvals).reshape(-1)
in_pvals = np.array(in_pvals).reshape(-1)
pvals = pvals[pvals.argsort()]
in_pvals = in_pvals[pvals.argsort()]
for i in range(count):
q_val = pvals[i] * count / (i+1)
if q_val > cutoff:
continue
each = in_pvals[i]
topMarkers['increasing']['gene'][str(count_1)] = temp_names[each]
for i in range(0,stages):
topMarkers['increasing']['stage'+str(i)][str(count_1)] = temp_stages[i][each]
topMarkers['increasing']['log2fc'][str(count_1)] = temp_change[each]
topMarkers['increasing']['rank'][str(count_1)] = count_1+1
topMarkers['increasing']['qval'][str(count_1)] = q_val
count_1+=1
temp_change = change[decreasing_stop]
temp_names = idrem_genes[decreasing_stop]
temp_stages = []
for i in range(0,stages):
temp_stages.append(stages_list[i][decreasing_stop])
temp_background = background[:,decreasing_stop]
decreasing_stop = temp_change.argsort()
# decreasing_pval = scoreAgainstBackground(background[:,decreasing_stop],temp_change)
topMarkers['decreasing']['gene'] = {}
topMarkers['decreasing']['log2fc'] = {}
topMarkers['decreasing']['rank'] = {}
for i in range(0,stages):
topMarkers['decreasing']['stage'+str(i)] = {}
topMarkers['decreasing']['qval'] = {}
count = 0
pvals = []
in_pvals = []
if one_dist:
temp_background = temp_background.reshape(-1,1)
mean = temp_background.mean()
std = temp_background.std()
for i, each in enumerate(decreasing_stop):
if one_dist:
decreasing_pval = scoreAgainstBackground(temp_background,temp_change[each],mean = mean, std = std)
else:
decreasing_pval = scoreAgainstBackground(temp_background[:, each],temp_change[each])
if decreasing_pval > cutoff:
continue
decreasing_pval = scoreAgainstBackground(temp_background,temp_change[each],all=True)
if decreasing_pval > cutoff:
continue
in_pvals.append(each)
pvals.append(decreasing_pval)
count+=1
count_1 = 0
pvals = np.array(pvals).reshape(-1)
in_pvals = np.array(in_pvals).reshape(-1)
pvals = pvals[pvals.argsort()]
in_pvals = in_pvals[pvals.argsort()]
for i in range(count):
q_val = pvals[i] * count/(i+1) #1 - (1 - topMarkers['decreasing']['pval'][str(i)]) * (count/(i+1))
if q_val > cutoff:
continue
each = in_pvals[i]
topMarkers['decreasing']['gene'][str(count_1)] = temp_names[each]
topMarkers['decreasing']['log2fc'][str(count_1)] = temp_change[each]
topMarkers['decreasing']['rank'][str(count_1)] = count_1+1
for i in range(0,stages):
topMarkers['decreasing']['stage'+str(i)][str(count_1)] = temp_stages[i][each]
topMarkers['decreasing']['qval'][str(count_1)] = q_val
count_1+=1
else: # if cutoff is not given, return ranked list
topMarkers['increasing'] = {}
topMarkers['decreasing'] = {}
increasing_stop = np.where(change > 0)[0]
decreasing_stop = np.where(change < 0)[0]
temp_change = change[increasing_stop]
temp_names = idrem_genes[increasing_stop]
temp_stages = []
for i in range(0,stages):
temp_stages.append(stages_list[i][increasing_stop])
increasing_stop = temp_change.argsort()[::-1]
topMarkers['increasing']['gene'] = {}
topMarkers['increasing']['log2fc'] = {}
topMarkers['increasing']['rank'] = {}
for i in range(0,stages):
topMarkers['increasing']['stage'+str(i)] = {}
if topN is not None:
for i, each in enumerate(increasing_stop[:topN]):
topMarkers['increasing']['gene'][str(i)] = temp_names[each]
for i in range(0,stages):
topMarkers['increasing']['stage'+str(i)][str(i)] = temp_stages[i][each]
topMarkers['increasing']['log2fc'][str(i)] = temp_change[each]
topMarkers['increasing']['rank'][str(i)] = i+1
else:
for i, each in enumerate(increasing_stop):
topMarkers['increasing']['gene'][str(i)] = temp_names[each]
for i in range(0,stages):
topMarkers['increasing']['stage'+str(i)][str(i)] = temp_stages[i][each]
topMarkers['increasing']['log2fc'][str(i)] = temp_change[each]
topMarkers['increasing']['rank'][str(i)] = i+1
temp_change = change[decreasing_stop]
temp_names = idrem_genes[decreasing_stop]
temp_stages = []
for i in range(0,stages):
temp_stages.append(stages_list[i][decreasing_stop])
decreasing_stop = temp_change.argsort()
topMarkers['decreasing']['gene'] = {}
topMarkers['decreasing']['log2fc'] = {}
topMarkers['decreasing']['rank'] = {}
for i in range(0,stages):
topMarkers['decreasing']['stage'+str(i)] = {}
if topN is not None:
for i, each in enumerate(decreasing_stop[:topN]):
topMarkers['decreasing']['gene'][str(i)] = temp_names[each]
topMarkers['decreasing']['log2fc'][str(i)] = temp_change[each]
for i in range(0,stages):
topMarkers['decreasing']['stage'+str(i)][str(i)] = temp_stages[i][each]
topMarkers['decreasing']['rank'][str(i)] = i+1
else:
for i, each in enumerate(decreasing_stop):
topMarkers['decreasing']['gene'][str(i)] = temp_names[each]
topMarkers['decreasing']['log2fc'][str(i)] = temp_change[each]
topMarkers['decreasing']['rank'][str(i)] = i+1
for i in range(0,stages):
topMarkers['decreasing']['stage'+str(i)][str(i)] = temp_stages[i][each]
return topMarkers
[docs]
def getTopMarkersFromIDREM(path, background,cutoff=None,topN=None,one_dist=False):
'''
Get the top markers for each track from IDREM results.
parameters
--------------------
path: str
the directory to the IDREM results.
background: np.array
the background gene expression change
cutoff: float
the cutoff for p-value. Default is None.
topN: int
the number of top markers to return. Default is None.
one_dist: bool
whether to consider all the background gene expression changes as one distribution. Default is False.
return
----------------
out: dict
a dictionary of top markers for each track.
'''
out = {}
filenames = os.listdir(path)
for each in filenames:
name = each
if each[0] != '.':
each = each.split('.')[0]#.split('-')[-1].split('n')
if one_dist:
out[each] = getTopMarkers(path,background,name,cutoff,topN,one_dist=one_dist)
else:
out[each] = getTopMarkers(path,background[each],name,cutoff,topN)
return out
[docs]
def runGetProgressionMarkercsv(directory,background, save_dir, topN=None,cutoff=None):
'''
Get the top markers for each track from IDREM results and save as a csv file.
parameters
--------------------
directory: str
the directory to the IDREM results.
background: str
the directory to the background gene expression change.
cutoff: float
the cutoff for p-value. Default is None.
topN: int
the number of top markers to return. Default is None.
save_dir: str
the directory to save the csv file.
return
---------------
None
'''
background = dict(np.load(background,allow_pickle=True).tolist())
out = getTopMarkersFromIDREM(directory,background,topN=topN,cutoff=cutoff)
results = pd.json_normalize(out)
results.columns = results.columns.str.split(".").map(tuple)
results = results.stack([0,1,2]).reset_index(0, drop=True)
results = results.transpose()
results = results.reset_index()
results['index'] = results['index'].astype('int32')
results = results.set_index(['index']).sort_index()
# results = results.transpose()
results = results.sort_index()
# print(results)
# return results
results.to_csv(os.path.join(save_dir,'mesProgressionMarker_pval_twofilters.csv'))
[docs]
def runGetProgressionMarker(directory,background, cutoff=0.05, topN=None):
'''
Get the top markers for each track from IDREM results.
parameters
--------------------
directory: str
the directory to the IDREM results.
background: str
the directory to the background gene expression change.
cutoff: float
the cutoff for p-value. Default is 0.05.
topN: int
the number of top markers to return. Default is None.
return
---------------
out: dict
a dictionary of top markers for each track.
'''
out = getTopMarkersFromIDREM(directory,background,cutoff = cutoff,topN=topN)
return out
[docs]
def runGetProgressionMarker_one_dist(directory,background,size, cutoff=0.05, topN=None):
'''
Get the top markers for each track from IDREM results and consider the whole background as one distribution.
parameters
--------------------
directory: str
the directory to the IDREM results.
background: str
the directory to the background gene expression change.
cutoff: float
the cutoff for p-value. Default is 0.05.
topN: int
the number of top markers to return. Default is None.
return
---------------
out:
a dictionary of top markers for each track.
'''
one_dist = []
for each in background.keys():
one_dist.append(np.array(background[each]))
background = np.array(one_dist).reshape(-1, size)
out = getTopMarkersFromIDREM(directory,background,cutoff = cutoff,one_dist=True)
return out