Out of curiosity I have been playing with several ways to calculate fold changes and I am trying to find the fastest and the most elegant way to do that (hoping that would also be the same solution).
The kind of matrix I am interested in would look like this:
# Some data
nvars <- 10000
nsamples <- 500
sample_groups <- 5
MAT <- replicate(nvars, runif(n=nsamples))
And a grouping vector that look like this:
f <- rep_len(1:sample_groups, nsamples)
f <- LETTERS[f]
The output I would like in the end from the above input is a 10 x 10,000 matrix with one row for each combination of the levels in f
.
To do this the first task is to calculate the mean for each group for all column. I have come up with 4 possible methods to do that:
# Settings
aggr_FUN <- mean
combi_FUN <- function(x,y) "/"(x,y)
# helper function
pasteC <- function(x,y) paste(x,y,sep=" - ")
#A1. Loop
system.time({
f_un <- unique(f)
temp1 <- matrix(NA,nrow = length(f_un),ncol=ncol(MAT))
rownames(temp1) <- f_un
for(i in 1:length(f_un)){
temp1[i,] <- apply(MAT[f_un[i] == f,,drop=FALSE],2,aggr_FUN)
}
})
user system elapsed
0.41 0.00 0.41
#A2. aggregate
system.time({
temp2 <- aggregate(. ~ class, data = cbind.data.frame(class=f,MAT), aggr_FUN)
})
user system elapsed
7.76 0.05 7.81
#A3. reshape2
library(reshape2)
system.time({
temp3 <- recast(data.frame(class=f,MAT),class ~ variable,id.var="class",aggr_FUN)
})
user system elapsed
1.82 0.30 2.12
#A4. purrr
library(purrr)
system.time({
temp4 <- data.frame(class = f, MAT) %>%
slice_rows("class") %>%
by_slice(map, aggr_FUN)
})
user system elapsed
0.47 0.00 0.47
As you can see the loop is actually the fastest solution with the purrr
package being only slightly slower. recast
is 5 times slower and aggregate
is a clear looser. I also tried the dplyr package but that turned out to be very slow for some reason (https://github.com/hadley/dplyr/issues/1395).
purrr
is both fast and elegant so for this part it is a pretty academic exercise looking for a better way.
At this point the output we have is:
> temp1[,1:6]
[,1] [,2] [,3] [,4] [,5] [,6]
A 0.4804964 0.4779168 0.5292458 0.4401357 0.4728515 0.5009800
B 0.4819612 0.5260592 0.5291887 0.5095620 0.4840777 0.4792213
C 0.4661714 0.4886010 0.5006018 0.5061170 0.5058892 0.5432819
D 0.4566942 0.4519988 0.5334207 0.4912822 0.4542889 0.4898384
E 0.4967948 0.5630683 0.4941777 0.5239327 0.5045152 0.5227140
So if you are still reading here comes the more challenging part. We need to calculate the fold change between all combinations of the groups/rows.
I found two ways to do this:
#B1. by loop
combs <- t(combn(as.character(f_un),2))
combi_FUN_vec <- Vectorize(combi_FUN)
out <- matrix(NA,nrow = nrow(combs),ncol=ncol(temp1))
rownames(out) <- pasteC(combs[,1],combs[,2])
colnames(out) <- 1:ncol(temp1)
system.time({
for( i in 1:nrow(combs)){
out[i,] <- combi_FUN_vec( temp1[combs[i,1],] , temp1[combs[i,2],] )
}
})
user system elapsed
0.13 0.00 0.13
#B2. by apply
class_computed <- as.character(temp2[,1])
temp2 <- as.matrix(temp2[,-1])
combs <- t(combn(class_computed,2))
rownames(temp2) <- class_computed
combi_FUN_vec <- Vectorize(combi_FUN)
system.time({
out <- apply(temp2,2,function(x){
v <- combi_FUN_vec( x[combs[,1]] , x[combs[,2]] )
names(v) <- pasteC(combs[,1],combs[,2])
return(v)
})
})
user system elapsed
0.91 0.00 0.91
Not surprisingly the loop is a clear winner and the output is this:
> out[,1:5]
1 2 3 4 5
A - B 1.2128952 1.0161608 0.9313115 0.9767619 1.0132362
A - C 1.0946079 1.0524154 0.9882857 0.9546686 0.9604382
A - D 1.1872958 0.9113349 0.9941437 0.8751611 0.9863873
A - E 1.1457396 0.9669100 0.9146375 0.8760513 1.0604971
B - C 0.9024753 1.0356780 1.0611763 0.9773810 0.9478918
B - D 0.9788940 0.8968413 1.0674664 0.8959820 0.9735018
B - E 0.9446320 0.9515325 0.9820962 0.8968933 1.0466435
C - D 1.0846768 0.8659461 1.0059275 0.9167172 1.0270179
C - E 1.0467123 0.9187532 0.9254788 0.9176496 1.1041804
D - E 0.9649993 1.0609821 0.9200254 1.0010171 1.0751326
Now here is what bugs me... These two last methods are extremely ugly. Is there a better/cleaner/faster way? Preferably in dplyr
/purrr
style syntax? Perhaps without having to go through combn
even?
Any hints are appreciated.
EDIT:
I manage to make a much more compact version in dplyr style:
f_un <- unique(f)
combs <- t(combn(as.character(f_un),2))
out3 <- data.frame(class = f, MAT) %>% slice_rows("class") %>% by_slice(map, aggr_FUN) %>%
do(combi_FUN( slice(.,match(combs[,1], class))[,-1] ,slice(.,match(combs[,2], class))[,-1] )) %>%
as.data.frame(row.names = pasteC(combs[,1],combs[,2]))
Is there a way to simplify that and speed it up? It is 10x slower than the fastest above.
EDIT2: Based on the suggestions so far the fastest and cleanest is the below function.
fold.change <- function(MAT,f,aggr_FUN=mean,combi_FUN=function(x,y) "/"(x,y) ){
# mean using purrr
x <- data.frame(class = f, MAT) %>% slice_rows("class") %>% by_slice(map, aggr_FUN)
rownames <- as.character(as.data.frame(x[,1])[,1])
x <- as.matrix(x[,-1])
rownames(x) <- rownames
# calculate changes between all rows
i <- combn(unique(f), 2)
ret <- combi_FUN(x[i[1,],] , x[i[2,],])
rownames(ret) <- pasteC(i[1,], i[2,])
# Put original colnames
colnames(ret) <- colnames(MAT)
return(ret)
}
Matrix operations and subsetting are fast:
fold <- function(x, f, aggr_FUN = colMeans, combi_FUN = '/'){
f <- as.factor(f)
i <- split(1:nrow(x), f)
x <- sapply(i, function(i){ aggr_FUN(x[i,])})
x <- t(x)
j <- combn(levels(f), 2)
ret <- combi_FUN(x[j[1,],], x[j[2,],])
rownames(ret) <- paste(j[1,], j[2,], sep = '-')
ret
}
> system.time(ret <- fold(MAT, f))
user system elapsed
0.13 0.00 0.12
> all.equal(ret, out, check.attributes = F)
[1] TRUE
> if(require(matrixStats))
+ system.time(fold(MAT, f, aggr_FUN = colMedians))
user system elapsed
0.27 0.00 0.27
> if(require(matrixStats))
+ system.time(fold(MAT, f, aggr_FUN = colSds))
user system elapsed
0.17 0.02 0.18
Unless, I'm really misunderstanding what you're trying to do.
Collected from the Internet
Please contact [email protected] to delete if infringement.
Comments