Preprocessing functions
getGR = function(chr,start,end,offset){
tp = GRanges(seqnames = Rle(chr),
ranges = IRanges(start-offset,
end = end+offset),
strand = Rle(rep("*", length(start))),
loop = seq(1, length(start)) )
seqlevelsStyle(tp)='ucsc'
return(tp)}
binAnno = function( chrSizes, chroms, binSize ){
do.call('rbind', lapply(chroms, function(chr){
chr.length = seqlengths(chrSizes[chr])
dongling = chr.length %% binSize
if( dongling > 0 ) chr.length = chr.length + (binSize-dongling) + 1
data.frame( chr=chr,
start=seq(1, chr.length-binSize, by=binSize),
end = seq(binSize, chr.length, by=binSize),
stringsAsFactors=FALSE )
}))
}
## helper to vectorize the read dumped interaction file
transformToSparseMatrix=function( df,row.annotation,col.annotation ){
# row.annotation=rowAnnotation; col.annotation=colAnnotation
sm = Matrix( 0, nrow=nrow(row.annotation),
ncol=nrow(col.annotation) )
tp = cbind( row=match(as.numeric(df[,1]+1),row.annotation[,2]),
col=match(as.numeric(df[,2]+1),col.annotation[,2]) )
sm[tp] = df[,3]
return( sm )
}
## function to read matrices dumped from .hic files
read.hic_files = function( directory, prefix, suffix, binAnnotation, chroms ){
chrlist = as.list(chroms)
names(chrlist)= chroms
# prefix="ES1_10kb/ES1_30_"; suffix=".txt"; directory=dumped_dir; binAnnotation=ga
# prefix=""; suffix=".matrix.txt"; directory=dumped_directory_ele; binAnnotation=ga;chrlist= paste0("chr",c(1:22,"X"))
lapply( as.list(chrlist), function(chromosome){
# left = unlist( strsplit( chromosome, ' ' ) )[1]
# right = unlist( strsplit( chromosome, ' ' ) )[2]
# newChromosome = paste(left,right,sep='_')
# chromosome = "chr1"
df = read.delim( paste0(directory, prefix, chromosome, suffix ), as.is=T, header=F )
df = df[!is.na(df$V3),]
df[,1] = as.numeric(df[,1])
rowAnnotation = binAnnotation[binAnnotation$chr==chromosome,]
colAnnotation = binAnnotation[binAnnotation$chr==chromosome,]
print(chromosome)
# print(chromosome)
return( transformToSparseMatrix(df,
row.annotation = rowAnnotation,
col.annotation = colAnnotation ) )
} )
}
## wrap all and create a Genome-wide matrix of interactions
getGWmatrix=function( list.of.sm, ga ){
nummSM = Matrix(0, nrow=nrow(ga), ncol=nrow(ga) )
ids = do.call('rbind', lapply( as.list(names(list.of.sm)), function(chrom_comb){
print( paste0('including ', chrom_comb) )
chrom = unlist(strsplit(chrom_comb, " "))
if( chrom[1] == 'chr1' & chrom[2] == 'chr1') {start.row = 0
start.col = 0}
if( chrom[1] == 'chr1' & chrom[2] != 'chr1') {start.row = 0
start.col = min(which( ga$chr==chrom[2] ))-1}
if( chrom[1] != 'chr1') {start.row = min(which( ga$chr==chrom[1] ))-1
start.col = min(which( ga$chr==chrom[2] ))-1 }
tp = as.data.frame(summary(list.of.sm[[chrom_comb]]),stringsAsFactors=F )
tp$i = tp$i + start.row
tp$j = tp$j + start.col
return(tp)
}))
print( 'assembling the matrix, depending on the resolution this might take a while' )
nummSM[ cbind(ids[,1],ids[,2])]=ids[,3]
return( nummSM )
}
## wrap all and create a Genome-wide matrix of interactions
## this function is foreseen to use the list of objects returned by IPF
getGWmatrix_dedicated=function( list.of.sm, ga, which.object ){
nummSM = Matrix(0, nrow=nrow(ga), ncol=nrow(ga) )
if( which.object == 'LFM' ) {i=1}
if( which.object == 'balanced' ) {i=2}
ids = do.call('rbind', lapply( as.list(names(list.of.sm)), function(chrom){
print( paste0('including ', chrom) )
if( chrom == 'chr1' ) {start.row = 0; start.col = 0}
if( chrom != 'chr1') {start.row = min(which( ga$chr==chrom ))-1
start.col = min(which( ga$chr==chrom ))-1 }
tp = as.data.frame(summary(list.of.sm[[chrom]][[i]]),stringsAsFactors=F )
tp$i = tp$i + start.row
tp$j = tp$j + start.col
return(tp)
}))
print( 'assembling the matrix, depending on the resolution this might take a while' )
nummSM[ cbind(ids[,1],ids[,2])]=ids[,3]
return( nummSM )
}
getGWmatrix_intra=function( list.of.sm, ga, which.object ){
if( which.object == 'LFM' ) {i=1}
if( which.object == 'balanced' ) {i=2}
nummSM = Matrix(0, nrow=nrow(ga), ncol=nrow(ga) )
ids = do.call('rbind', lapply( as.list(names(list.of.sm)), function(chrom){
print( paste0('including ', chrom) )
if( chrom == 'chr1' ) {start.row = 0; start.col = 0}
if( chrom != 'chr1') {start.row = min(which( ga$chr==chrom ))-1
start.col = min(which( ga$chr==chrom ))-1 }
tp = as.data.frame(summary(list.of.sm[[chrom]][[i]]),stringsAsFactors=F )
tp$i = tp$i + start.row
tp$j = tp$j + start.col
return(tp)
}))
print( 'assembling the matrix, depending on the resolution this might take a while' )
nummSM[ cbind(ids[,1],ids[,2])]=ids[,3]
return( nummSM )
}
Function - specific tasks in HiC
getPixelsAround = function(ROW,COL,SIZE,ID){
data.frame( rows = rep( seq(ROW-SIZE,ROW+SIZE),length(seq(ROW-SIZE,ROW+SIZE))),
col = rep( seq(COL-SIZE,COL+SIZE),each=length(seq(ROW-SIZE,ROW+SIZE))),
id = ID ) }
GetCentroidSignal = function( loops, hic, size,CHROMS,which_file ){
# loops = loops_a3; hic=ko_10kb; size=1;chr="chr1"
lapply( as.list( CHROMS ),function(chr){
print(chr)
theseloops = loops[loops$X.chr1==chr,]
if(which_file=="balanced") {this_hic = hic[[chr]]$balanced}
if(which_file=="raw") {this_hic = hic[[chr]]$LFM}
theseloops = theseloops[theseloops$left_binID-size>1,]
theseloops = theseloops[theseloops$right_binID+size<nrow(this_hic),]
coords = do.call("rbind",lapply( split(theseloops,rownames(theseloops)), function(l){
getPixelsAround(ROW=l$left_binID,
COL=l$right_binID,
SIZE=size,
ID=rownames(l))} ))
coords$hic_signal = this_hic[ cbind(coords$rows,coords$col)]
return( coords ) } ) }
ProcessLoops = function( lo ){
# lo=human_loops_ele30; pixelSize=3
tp = do.call("rbind",lo)
tp = split(tp,tp$id)
tp = lapply( tp, function(x){matrix(x$hic_signal,nrow=sqrt(nrow(x)),ncol=sqrt(nrow(x)))})
return( tp )
}
GetCentroidSignalGW = function( loops, hic, size,CHROMS,ga ){
# loops = loops_a3; hic=ko_10kb; size=1;chr="chr1"
lapply( as.list( CHROMS ),function(chr){
print(chr)
this_hic = hic[ min(which(ga$chr==chr)):max(which(ga$chr==chr)), min(which(ga$chr==chr)):max(which(ga$chr==chr))]
theseloops = loops[loops$X.chr1==chr,]
theseloops = theseloops[theseloops$left_binID-size>1,]
theseloops = theseloops[theseloops$right_binID+size<nrow(this_hic),]
coords = do.call("rbind",lapply( split(theseloops,rownames(theseloops)), function(l){
getPixelsAround(ROW=l$left_binID,
COL=l$right_binID,
SIZE=size,
ID=rownames(l))} ))
coords$hic_signal = this_hic[ cbind(coords$rows,coords$col)]
return( coords ) } ) }
readBed_filterChromsStraded = function(filePath, chroms, scoreCol){
# filePath='/Volumes/T7/CTCF/ATAC_ES_2i_nov/peaks/ATACseq40_ES_2i_IAA_1_MM.narrowPeak'
x= read.delim(filePath,as.is=T,header=F)
x[,1] = paste0('chr',x[,1])
x = x[x[,1] %in% chroms,]
return( GRanges( seqnames = Rle(x[,1]),
ranges = IRanges(as.numeric(x[,2]),
end=as.numeric(x[,3]),
names=rownames(x)),
strand = x[,6],
score=x[,scoreCol]) ) }
readNarrowPeak2getSummit = function(filePath, chroms, scoreCol){
# filePath='/Volumes/T7/CTCF/ATAC_ES_2i_nov/peaks/ATACseq40_ES_2i_IAA_1_MM.narrowPeak'
x= read.delim(filePath,as.is=T,header=F)
x[,1] = paste0('chr',x[,1])
x = x[x[,1] %in% chroms,]
return( GRanges( seqnames = Rle(x[,1]),
ranges = IRanges(as.numeric(x[,2]) + as.numeric(x[,10]) ,
end=as.numeric(x[,2]) + as.numeric(x[,10]),
names=rownames(x)),
score=x[,scoreCol]) ) }
appendMotifInformation = function( peakFile, motifFile,summitFile ){
## we will only consider peaks which have motif close to the summit
## peakFile=ctcf_wt_NS_peaks; motifFile=ctcf_motif; summitFile=ctcf_wt_NS_peaks
start(summitFile) = start(summitFile)-25
end(summitFile) = end(summitFile)+25
peakFile$number_of_motifs=countOverlaps(summitFile,motifFile)
peakFile$motif_score = 0
peakFile$motif_strand = factor(rep('*',length(peakFile)),levels = c("*",'+','-'))
motifFile$realStart = start(motifFile)
motifFile$realStart[which(strand(motifFile)=='-')] = end(motifFile[which(strand(motifFile)=='-')])
motifFile$motifID = 0
peak_motif = findOverlaps(summitFile,motifFile)
tp = data.frame(score=motifFile$score,
strand=strand(motifFile),
motif_start=motifFile$realStart,
motif_id = 1:length(motifFile),
stringsAsFactors=FALSE)
tp = split(tp[subjectHits(peak_motif),],queryHits(peak_motif))
th = lapply(tp,function(x){x[which.max(x$score),]})
## checks:
# table(unlist(lapply(tp,nrow)))
# table(unlist(lapply(th)))
peakFile$motif_score[as.numeric(names(th))] = unlist(lapply(th,function(f){f$score}))
peakFile$motif_strand[as.numeric(names(th))] = unlist(lapply(th,function(f){f$strand}))
peakFile$motif_start[as.numeric(names(th))] = unlist(lapply(th,function(f){f$motif_start}))
peakFile$motifID[as.numeric(names(th))] = unlist(lapply(th,function(f){f$motif_id}))
return(peakFile) }
compute_putative_loop_signal = function(ctcf_ctcf_loop_signal,central_sq) {
# central_sq - half window size
signal_samples = c()
# Get signal from the central 5X5 square of the matrix
core_range = (ceiling(ncol(ctcf_ctcf_loop_signal)/2) - central_sq):(ceiling(ncol(ctcf_ctcf_loop_signal)/2) + central_sq)
# Calculate mean
core_signal = mean(ctcf_ctcf_loop_signal[core_range,core_range])
# Set boundaries for random selection to avoid the central 5X5 square
no_go_range = (min(core_range) - 4):max(core_range)
check_indexs = 1:(ncol(ctcf_ctcf_loop_signal) - 4)
check_indexs = check_indexs[!(check_indexs %in% no_go_range)]
# Get 15 sample points to grab a new 5X5 square from around the center
sample_rows = sample(check_indexs, 15,replace=TRUE)
sample_cols = sample(check_indexs, 15,replace=TRUE)
# For each random sample point calculate the 5X5 region and get mean signal
for(i in 1:15) {
signal_samples = c(signal_samples,mean(ctcf_ctcf_loop_signal[sample_rows[i]:(sample_rows[i]+4),sample_cols[i]:(sample_cols[i]+4)]))
}
# Compute and return
# the central signal value,
# the log2 FC for the core signal vs the average of the sample sites,
# p-value from t-test assuming the core value to be the distribution mean
return(c('interact_signal' = core_signal, 'l2fc_v.bg' = log2(core_signal/mean(signal_samples)), t.test(signal_samples, mu = core_signal)[3]))
}
getMatrixForARegion = function(lowertri,uppertri,CHROM,START,END,ga,upperLimit){
# lowertri=wt_A3;uppertri=ko_10kb;CHROM="chr1";START=89900000-w;END=91000000+w;upperLimit=0.015
bins = ga[ga$chr==CHROM,]
bins = which(bins$start>START & bins$end<END)
a = lowertri[[CHROM]]$balanced
b = uppertri[[CHROM]]$balanced
a = a[min(bins):max(bins),min(bins):max(bins)]
b = b[min(bins):max(bins),min(bins):max(bins)]
ab = a
ab[upper.tri(ab)] = b[upper.tri(b)]
diag(ab) = 0
ab = t(matrix(apply(ab,2,rev),nrow(ab),ncol(ab)))
ab[ab>upperLimit] = upperLimit
return(ab) }
getSignalInBins = function( bins, modification, featureColumn ){
PsDFvsX = findOverlaps(bins,modification)
PsDFvsX_score = split( as.numeric(elementMetadata(modification)[,featureColumn][subjectHits(PsDFvsX)]), queryHits(PsDFvsX) )
PsDFvsX_score = unlist(lapply(PsDFvsX_score,function(x){max(x,na.rm=T)}))
bm = rep( 0, length(bins) )
bm[as.numeric(names(PsDFvsX_score))]=PsDFvsX_score
bm = matrix(bm,nrow=length(bins)/sum(bins$peak==1), ncol=sum(bins$peak==1), byrow = T )
rownames(bm) = names(bins)[seq(1,length(bins),by=sum(bins$peak==1))]
return(bm)
}
getLoopSignal = function(ll,D){
# ll = APA_A1_IPF_sig; D=2; thr_lfc=0.59; pval_thr=0.01
tp = lapply(ll,function(x){compute_putative_loop_signal(x,D)})
return(unlist(lapply(tp,function(x){x[[2]]})))
}
importBEDPE_Filter_Loops = function(loopdir, max_size, extend_anchors_by){
# loopdir = '/Volumes/T7/CTCF/CTCF_paper/CTCF_paper/data/hiccups_output_esc/'
return( do.call( 'rbind', lapply(as.list(list.files(loopdir)), function(x){
tp=read.delim( paste0(loopdir,x,'/merged_loops.bedpe'), header=TRUE )
tp=tp[ -1, ]
tp$loop_span=tp$y2-tp$x1
tp = tp[ tp$loop_span < max_size,]
tp$x1 = tp$x1-extend_anchors_by
tp$x2 = tp$x2+extend_anchors_by
tp$y1 = tp$y1-extend_anchors_by
tp$y2 = tp$y2+extend_anchors_by
return(tp)
})) )
}
compareLoops = function(loops1,loops2,offset,maxLoopSize){
# offset=0;loops1=es_loops;loops2=ns_loops
loops1 = loops1[loops1$loop_span<maxLoopSize,]
loops2 = loops2[loops2$loop_span<maxLoopSize,]
leftGR1 = getGR(loops1[,1],loops1[,2],loops1[,3],offset=offset)
leftGR2 = getGR(loops2[,1],loops2[,2],loops2[,3],offset=offset)
rightGR1 = getGR(loops1[,4],loops1[,5],loops1[,6],offset=offset)
rightGR2 = getGR(loops2[,4],loops2[,5],loops2[,6],offset=offset)
leftOV = findOverlaps(leftGR1,leftGR2)
rightOV = findOverlaps(rightGR1,rightGR2)
overlapping1 = table(c(paste(queryHits(leftOV),subjectHits(leftOV),sep='_'),
paste(queryHits(rightOV),subjectHits(rightOV),sep='_')))
overlapping_loops1 = unique(as.numeric( unlist(strsplit(names(overlapping1[overlapping1==2]),"_"))[seq(1,2*length(overlapping1[overlapping1==2]),by=2)]))
overlapping_loops1 = overlapping_loops1[order(overlapping_loops1)]
loops1_common = loops1[overlapping_loops1,]
loops1_specific = loops1[-overlapping_loops1,]
leftOV = findOverlaps(leftGR2,leftGR1)
rightOV = findOverlaps(rightGR2,rightGR1)
overlapping2 = table(c(paste(queryHits(leftOV),subjectHits(leftOV),sep='_'),
paste(queryHits(rightOV),subjectHits(rightOV),sep='_')))
overlapping_loops2 = unique(as.numeric( unlist(strsplit(names(overlapping2[overlapping2==2]),"_"))[seq(1,2*length(overlapping2[overlapping2==2]),by=2)]))
loops2_common = loops2[overlapping_loops2,]
loops2_specific = loops2[-overlapping_loops2,]
res=vector('list',length=4)
res[[1]] = loops1_common
res[[2]] = loops1_specific
res[[3]] = loops2_common
res[[4]] = loops2_specific
names(res) = c('First_set_common','First_set_specific','Second_set_common',
'Second_set_specific')
return(res)
}
## Insulation analysis
getSUMMEDsignal4bins = function( B, D, A, M ){
# B=bins-HOWFAR; D=distance; A=Area; M=thism
rows = unlist(lapply(as.list(B), function(b){
rep( ( (b-D) + seq(-A,A)) , (1+2*A))}))
cols = unlist(lapply(as.list(B), function(b){
rep( ( (b+D) + seq(-A,A) ), each=(1+2*A))}))
ids = cbind(rows, cols)
m = M[ids]
IDS = rep( names(B), each = (1+2*A)^2 )
# print(c(min(ids),max(ids)))
matrices = split( m, IDS )
return( unlist(lapply(matrices,function(m){sum(m, na.rm=T)})) )}
InsulationScore = function( bin, mat, GAGR, distance, Area, HOWFAR ){
## mat=ko_10kb; distance=5; Area=3;chr="chr1";GAGR=gagr;HOWFAR=10
## bin = gagr[which(chrom(gagr)!="chrY")]
## rm(list=c("mat","distance","Area","chr","GAGR","HOWFAR","B","D","A","M"))
do.call("rbind",lapply( as.list(unique(as.character(chrom(bin)))),function(chr){
print(paste0("processing ",chr))
thism = mat[[chr]]$balanced
theseb = GenomicRanges::resize( bin[which(chrom(bin)==chr)], 1, fix="center")
bins = GAGR$binid[queryHits(findOverlaps(GAGR,theseb))]
names(bins) = names(theseb[subjectHits(findOverlaps(GAGR,theseb))])
bins = bins[bins-distance-HOWFAR-Area>0 & bins+distance+HOWFAR+Area<(length(GAGR[which(chrom(GAGR)==chr)]))]
return(data.frame( left=getSUMMEDsignal4bins(B=bins-HOWFAR, D=distance, A=Area, M=thism ),
middle=getSUMMEDsignal4bins(B=bins, D=distance, A=Area, M=thism ),
right=getSUMMEDsignal4bins(B=bins+HOWFAR, D=distance, A=Area, M=thism ) )) } )) }
InsulationScorePublishedKR = function( bin, mat, GAGR, distance, Area, HOWFAR ){
## mat=ko_10kb; distance=5; Area=3;chr="chr1";GAGR=gagr;HOWFAR=10
## bin = gagr[which(chrom(gagr)!="chrY")]
## rm(list=c("mat","distance","Area","chr","GAGR","HOWFAR","B","D","A","M"))
do.call("rbind",lapply( as.list(unique(as.character(chrom(bin)))),function(chr){
print(paste0("processing ",chr))
thism = mat[[chr]]
theseb = GenomicRanges::resize( bin[which(chrom(bin)==chr)], 1, fix="center")
bins = GAGR$binid[queryHits(findOverlaps(GAGR,theseb))]
names(bins) = names(theseb[subjectHits(findOverlaps(GAGR,theseb))])
bins = bins[bins-distance-HOWFAR-Area>0 & bins+distance+HOWFAR+Area<(length(GAGR[which(chrom(GAGR)==chr)]))]
return(data.frame( left=getSUMMEDsignal4bins(B=bins-HOWFAR, D=distance, A=Area, M=thism ),
middle=getSUMMEDsignal4bins(B=bins, D=distance, A=Area, M=thism ),
right=getSUMMEDsignal4bins(B=bins+HOWFAR, D=distance, A=Area, M=thism ) )) } )) }
processIS = function( IS, GAGR ){
res = GAGR
res$binid=NULL
res$score = 0
res$score[match(rownames(IS),names(res))] = log2( rowMeans((0.001+IS[,c(1,3)]))/(0.001+IS[,2] ) )
return(res) }
cleanIS = function(x){
x[is.na(x)]=0
x[!is.finite(x)]=0
return(x) }
RNA-seq and ChIP-seq functions
getSignalInBins400 = function( bins, modification, featureColumn ){
PsDFvsX = findOverlaps(bins,modification)
PsDFvsX_score = split( as.numeric(elementMetadata(modification)[,featureColumn][subjectHits(PsDFvsX)]), queryHits(PsDFvsX) )
PsDFvsX_score = unlist(lapply(PsDFvsX_score,function(x){mean(x,na.rm=T)}))
bm = rep( 0, length(bins) )
bm[as.numeric(names(PsDFvsX_score))]=PsDFvsX_score
bm = matrix(bm,nrow=length(bins)/401, ncol=401, byrow = T )
rownames(bm) = (bins$peak)[seq(1,length(bins),by=401)]
return(bm)
}
reorderMatrixBasedOnStrand = function( m, peaks ){
m_fwd = m[which(as.character(peaks$motif_strand)=='+'),]
m_rev = t(apply( m[which(as.character(peaks$motif_strand)=='-'),],1,rev))
return(rbind(m_fwd,m_rev))
}
GetTPM = function( m,cols2take,gn ){
# m=PL_Dat; cols2take=c(7,8,14,15,9,10,11,12,13)
mb = m[,cols2take]
cs = colSums( mb )
f1 = do.call( "cbind", lapply(1:ncol(mb), function(x){
rpk = 1000*(mb[,x]/m$Length)
SF = sum(rpk)/1000000
return( rpk/SF ) } ) )
colnames(f1) = colnames( mb )
rownames(f1) = gn
return(f1)
}
readBed_filterChroms = function(filePath, chroms, scoreCol){
# filePath='/Volumes/T7/CTCF/CTCF_HUMAN.H11MO.0.A.bed'
x= read.delim(filePath,as.is=T,header=F)
x[,1] = paste0('chr',x[,1])
x = x[x[,1] %in% chroms,]
return( GRanges( seqnames = Rle(x[,1]),
ranges = IRanges(as.numeric(x[,2]),
end=as.numeric(x[,3]),
names=rownames(x)),
score=x[,scoreCol]) ) }
readBed_filterChromsStraded = function(filePath, chroms, scoreCol){
# filePath='/Volumes/T7/CTCF/ATAC_ES_2i_nov/peaks/ATACseq40_ES_2i_IAA_1_MM.narrowPeak'
x= read.delim(filePath,as.is=T,header=F)
x[,1] = paste0('chr',x[,1])
x = x[x[,1] %in% chroms,]
return( GRanges( seqnames = Rle(x[,1]),
ranges = IRanges(as.numeric(x[,2]),
end=as.numeric(x[,3]),
names=rownames(x)),
strand = x[,6],
score=x[,scoreCol]) ) }
readNarrowPeak2getSummit = function(filePath, chroms, scoreCol){
# filePath='/Volumes/T7/CTCF/ATAC_ES_2i_nov/peaks/ATACseq40_ES_2i_IAA_1_MM.narrowPeak'
x= read.delim(filePath,as.is=T,header=F)
x[,1] = paste0('chr',x[,1])
x = x[x[,1] %in% chroms,]
return( GRanges( seqnames = Rle(x[,1]),
ranges = IRanges(as.numeric(x[,2]) + as.numeric(x[,10]) ,
end=as.numeric(x[,2]) + as.numeric(x[,10]),
names=rownames(x)),
score=x[,scoreCol]) ) }