在R中两列中搜索两个字符串的高效版本

debugcn 发表于 Dev

Sabor117

我有一个（大）数据框，其结构与此类似：

id1 id2 symbol1 symbol2 scoreA scoreB scoreC
4790 1120 ABC LLL 1 0 1
2300 4790 NNN ABC 0 0 1
1120 4790 LLL ABC 0 1 1
1120 3120 LLL CCC 0 0 0

我正在尝试过滤数据框，以找出 symbol1 和 symbol2 两列分别匹配两个不同字符串的每一行。这个操作会反复且动态地执行，因此要搜索的字符串是以变量形式给出的。

因此，在上面的示例中，如果我要查找两个符号分别为 ABC 和 LLL 的每一行，我将得到类似以下的结果：

id1 id2 symbol1 symbol2 scoreA scoreB scoreC
4790 1120 ABC LLL 1 0 1
1120 4790 LLL ABC 0 1 1

所以我的问题是，我想尝试搜索每一行，其中一列等于两个值中的一个，另一列等于两个值中的另一个。

我的解决方案是执行以下操作：

# Orientation 1: symbol1 is in_gene, then keep only rows whose symbol2 is end_gene.
c1_step1 <- scores_file[scores_file$symbol1 == in_gene, ]
c1_step2 <- c1_step1[c1_step1$symbol2 == end_gene, ]

# Orientation 2: the mirrored case (symbol1 is end_gene, symbol2 is in_gene).
c2_step1 <- scores_file[scores_file$symbol1 == end_gene, ]
c2_step2 <- c2_step1[c2_step1$symbol2 == in_gene, ]

# Stack both orientations; orientation-1 rows come first.
out_file <- rbind(c1_step2, c2_step2)

但是，这种做法感觉相当笨拙和繁琐，我想知道是否存在一种更好（也更易于阅读）的方法来执行此类操作？也许 dplyr 中有一些我不知道的功能？

A5C1D2H2I1M1N2O1R2T1

最好的方法是使用%in%代替==，例如：

library(data.table)

# The pair of symbols to look for (in either column order).
SYM <- c("ABC", "LLL")

# Both columns must hold one of the two symbols AND the two columns must
# differ. Without the last condition a row such as ABC/ABC would also be
# kept, although it does not contain "one symbol in each column".
setDT(mydf)[symbol1 %in% SYM & symbol2 %in% SYM & symbol1 != symbol2]
##     id1  id2 symbol1 symbol2 scoreA scoreB scoreC
## 1: 4790 1120     ABC     LLL      1      0      1
## 2: 1120 4790     LLL     ABC      0      1      1

或者，使用“ dplyr”，您可以尝试以下任一方法：

library(dplyr)

# The pair of symbols to look for (in either column order).
SYM <- c("ABC", "LLL")

# Option 1
# Comma-separated conditions in filter() are combined with AND. The final
# condition drops rows where both columns hold the same symbol (e.g. ABC/ABC).
mydf %>%
  filter(symbol1 %in% SYM, symbol2 %in% SYM, symbol1 != symbol2)

# Option 2
# Same test written with the scoped helper, followed by the same
# "columns must differ" check.
mydf %>%
  filter_at(vars(symbol1, symbol2), all_vars(. %in% SYM)) %>%
  filter(symbol1 != symbol2)

这是开始数据：

# Example data from the question, written as a dput()-style literal with one
# field per line. The "index" attribute is reproduced verbatim from the
# original object (presumably left over from data.table indices -- confirm).
mydf <- structure(
  list(
    id1 = c(4790L, 2300L, 1120L, 1120L),
    id2 = c(1120L, 4790L, 4790L, 3120L),
    symbol1 = c("ABC", "NNN", "LLL", "LLL"),
    symbol2 = c("LLL", "ABC", "ABC", "CCC"),
    scoreA = c(1L, 0L, 0L, 0L),
    scoreB = c(0L, 0L, 1L, 0L),
    scoreC = c(1L, 1L, 1L, 0L)
  ),
  index = structure(
    integer(0),
    "`__symbol1`" = c(1L, 3L, 4L, 2L),
    "`__symbol2`" = c(2L, 3L, 4L, 1L),
    "`__symbol2__symbol1`" = c(3L, 2L, 4L, 1L)
  ),
  row.names = c(NA, 4L),
  class = "data.frame"
)

正如我在评论中所暗示的那样，“笨拙”与否可能是非常主观的。我决定对您的方法、此处建议的方法以及最初使用 paste 的建议进行效率测试。我还添加了一个使用 interaction 的选项，它有时可能比 paste 更快。

样本数据

library(data.table)
library(dplyr)
# Stack 250 copies of the 4-row example table.
x <- rbindlist(rep(list(mydf), 250)) ## 1000 rows
# Stack 1000 copies of x.
y <- rbindlist(rep(list(x), 1000))   ## 1 million rows

被测试的函数

# OP's approach
#
# Two-pass filter from the question: split on symbol1, narrow each half on
# symbol2, then stack the two halves.
#
# data  : a data.table with columns symbol1 and symbol2 (the bare column
#         names inside `[` rely on data.table's column scoping).
# symbs : length-2 character vector with the pair of symbols to match.
#         Defaults to the pair that was previously hard-coded, so existing
#         calls behave as before.
# Returns the rows where (symbol1, symbol2) equals symbs in either order;
# rows with symbs[1] in symbol1 come first.
op_fun <- function(data = x, symbs = c("ABC", "LLL")) {
  stopifnot(length(symbs) == 2L)
  sym_a <- symbs[1]
  sym_b <- symbs[2]
  c1_step1 <- data[symbol1 == sym_a, ]
  c2_step1 <- data[symbol1 == sym_b, ]
  c1_step2 <- c1_step1[symbol2 == sym_b, ]
  c2_step2 <- c2_step1[symbol2 == sym_a, ]
  rbind(c1_step2, c2_step2)
}

# data.table
#
# Keeps rows whose symbol1 and symbol2 are both taken from `symbs` and differ
# from each other, i.e. the pair appears in either column order.
#
# data  : a data.table with columns symbol1 and symbol2.
# symbs : character vector of symbols to match (normally a pair).
# The `symbol1 != symbol2` term excludes rows such as ABC/ABC, which pass the
# two %in% tests but do not hold one symbol in each column.
am_fun <- function(data = x, symbs = c("ABC", "LLL")) {
  data[symbol1 %in% symbs & symbol2 %in% symbs & symbol1 != symbol2]
}

# dplyr
#
# filter() with comma-separated conditions (combined with AND): both columns
# must hold a symbol from `symbs`, and the two columns must differ so that a
# row such as ABC/ABC is not returned.
#
# data  : a data frame with columns symbol1 and symbol2.
# symbs : character vector of symbols to match (normally a pair).
am_dplyr_1 <- function(data = x, symbs = c("ABC", "LLL")) {
  data %>%
    filter(symbol1 %in% symbs, symbol2 %in% symbs, symbol1 != symbol2)
}
# Scoped-helper variant of the dplyr filter: filter_at() applies the %in%
# test to both symbol columns; the second filter() drops rows where the two
# columns hold the same symbol (e.g. ABC/ABC).
#
# data  : a data frame with columns symbol1 and symbol2.
# symbs : character vector of symbols to match (normally a pair).
am_dplyr_2 <- function(data = x, symbs = c("ABC", "LLL")) {
  data %>%
    filter_at(vars(symbol1, symbol2), all_vars(. %in% symbs)) %>%
    filter(symbol1 != symbol2)
}

# base R key building (paste), subset through data.table
#
# Builds a "symbol1 symbol2" key per row and keeps rows whose key is one of
# the two orderings of `symbs`.
#
# data  : anything as.data.table() accepts, with columns symbol1 and symbol2.
# symbs : length-2 character vector with the pair of symbols to match.
#         Defaults to the pair that was previously hard-coded.
# NOTE(review): keys are joined with a single space, so symbols that
# themselves contain spaces could produce ambiguous keys.
paste_fun <- function(data = x, symbs = c("ABC", "LLL")) {
  stopifnot(length(symbs) == 2L)
  # c("<a> <b>", "<b> <a>")
  wanted_keys <- paste(symbs, rev(symbs))
  as.data.table(data)[paste(symbol1, symbol2) %in% wanted_keys]
}
# Same idea as the paste-based variant, but the per-row key comes from
# interaction(), whose levels join the two values with ".".
#
# data  : anything as.data.table() accepts, with columns symbol1 and symbol2.
# symbs : length-2 character vector with the pair of symbols to match.
#         Defaults to the pair that was previously hard-coded.
# NOTE(review): symbols that themselves contain "." could produce ambiguous
# keys.
interaction_fun <- function(data = x, symbs = c("ABC", "LLL")) {
  stopifnot(length(symbs) == 2L)
  # c("<a>.<b>", "<b>.<a>")
  wanted_keys <- paste(symbs, rev(symbs), sep = ".")
  as.data.table(data)[interaction(symbol1, symbol2) %in% wanted_keys]
}

基准测试

# Benchmark every candidate on x. check = FALSE turns off bench's
# comparison of the results across expressions; timings are reported in ms.
bench::mark(
  op_fun(x),
  am_fun(x),
  am_dplyr_1(x),
  am_dplyr_2(x),
  paste_fun(x),
  interaction_fun(x),
  check = FALSE,
  time_unit = "ms"
)
# # A tibble: 6 x 13
#   expression           min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result           memory          time    gc              
#   <bch:expr>         <dbl>  <dbl>     <dbl> <bch:byt>    <dbl> <int> <dbl>      <dbl> <list>           <list>          <list>  <list>          
# 1 op_fun(x)          4.16   4.28       230.   523.1KB     6.57   105     3       456. <df[,7] [500 × … <df[,3] [112 ×… <bch:t… <tibble [108 × …
# 2 am_fun(x)          1.29   1.36       701.   150.9KB     4.21   333     2       475. <df[,7] [500 × … <df[,3] [31 × … <bch:t… <tibble [335 × …
# 3 am_dplyr_1(x)      0.602  0.627     1565.    62.7KB     8.73   717     4       458. <df[,7] [500 × … <df[,3] [22 × … <bch:t… <tibble [721 × …
# 4 am_dplyr_2(x)      1.45   1.52       645.    66.6KB     6.56   295     3       457. <df[,7] [500 × … <df[,3] [28 × … <bch:t… <tibble [298 × …
# 5 paste_fun(x)       0.403  0.414     2374.   155.6KB     4.20  1130     2       476. <df[,7] [500 × … <df[,3] [33 × … <bch:t… <tibble [1,132 …
# 6 interaction_fun(x) 0.483  0.496     1960.   219.1KB     6.38   922     3       470. <df[,7] [500 × … <df[,3] [46 × … <bch:t… <tibble [925 × …

# Benchmark every candidate on y. check = FALSE turns off bench's
# comparison of the results across expressions; timings are reported in ms.
bench::mark(
  op_fun(y),
  am_fun(y),
  am_dplyr_1(y),
  am_dplyr_2(y),
  paste_fun(y),
  interaction_fun(y),
  check = FALSE,
  time_unit = "ms"
)
# # A tibble: 6 x 13
#   expression           min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result             memory          time     gc           
#   <bch:expr>         <dbl>  <dbl>     <dbl> <bch:byt>    <dbl> <int> <dbl>      <dbl> <list>             <list>          <list>   <list>       
# 1 op_fun(y)           67.2   71.9      8.65    98.6MB    10.4      5     6       578. <df[,7] [500,000 … <df[,3] [112 ×… <bch:tm> <tibble [5 ×…
# 2 am_fun(y)           19.8   25.0     39.7     36.4MB    19.8     20    10       504. <df[,7] [500,000 … <df[,3] [31 × … <bch:tm> <tibble [20 …
# 3 am_dplyr_1(y)       33.4   41.4     19.3     59.1MB    13.5     10     7       518. <df[,7] [500,000 … <df[,3] [22 × … <bch:tm> <tibble [10 …
# 4 am_dplyr_2(y)       34.5   43.8     23.3     59.1MB    19.5     12    10       514. <df[,7] [500,000 … <df[,3] [28 × … <bch:tm> <tibble [12 …
# 5 paste_fun(y)       181.   196.       4.38     103MB     5.84     3     4       685. <df[,7] [500,000 … <df[,3] [34 × … <bch:tm> <tibble [3 ×…
# 6 interaction_fun(y) 108.   168.       5.88   164.8MB    10.3      4     7       681. <df[,7] [500,000 … <df[,3] [53 × … <bch:tm> <tibble [4 ×…

如您所见，在数据较小（1000 行）时，paste 和 interaction 的速度很快。interaction 的伸缩性似乎比 paste 更好，但在大数据上它仍然无法与您的方法或我在这里建议的 %in% 方法相比。data.table 和 dplyr 方法的扩展性最好，同时保持了很强的可读性。

注意：我尚未测试其他答案，因为我认为它们不是很正确。

本文收集自互联网，转载请注明来源。

如有侵权，请联系[email protected] 删除。

编辑于2021-04-1

我来说两句

0条评论

登录后参与评论

来自分类Dev

Related 相关文章

文章