我想对30个SNP(基因型的0、1、2格式的单核苷酸多态性)列表的病例对照结果/疾病(对照和病例的格式为0,1)进行回归分析。我知道如何在R中使用以下命令对一个SNP进行操作;
test = glm(casecontrol ~ rs12345, data=mydata, family=binomial)
问题:我如何运行模型以在R中一次性获得30个SNP与疾病的关联的摘要统计信息?就像我们从GWAS,β估计值,p值,SD,等位基因频率中得到的一样?我可以使用R中的任何软件包吗?
编辑:
structure(list(ID = 1:6, sex = c(2L, 1L, 2L, 2L, 1L, 1L), age = c(49.65,
48.56, 49.55, 55.23, 60.62, 60.19), bmi = c(18.09, 22.82, 31.31,
21.87, 30.07, 26.75), casecontrol = c(1L, 0L, 0L, 1L, 0L, 0L),
rs1 = c(1L, 0L, 1L, 0L, 0L, 2L), rs2 = c(0L, 0L, 1L, 0L,
0L, 0L), rs3 = c(0L, 0L, 0L, 0L, 0L, 0L), rs4 = c(1L, 0L,
1L, 0L, 1L, 0L), rs5 = c(0L, 1L, 1L, 2L, 2L, 1L), rs6 = c(0L,
0L, 1L, 0L, 0L, 0L), rs7 = c(1L, 0L, 0L, 0L, 0L, 0L), rs8 = c(0L,
1L, 0L, 0L, 0L, 0L), rs9 = c(1L, 0L, 0L, 1L, 0L, 1L), rs10 = c(1L,
1L, 0L, 1L, 0L, 0L), rs11 = c(0L, 1L, 1L, 0L, 0L, 0L), rs12 = c(0L,
0L, 0L, 1L, 1L, 0L), rs13 = c(1L, 0L, 1L, 1L, 1L, 0L), rs14 = c(0L,
1L, 0L, 1L, 0L, 0L), rs15 = c(0L, 0L, 0L, 0L, 0L, 0L), rs16 = c(0L,
0L, 0L, 0L, 2L, 0L), rs17 = c(2L, 1L, 1L, 1L, 0L, 0L), rs18 = c(0L,
0L, 0L, 1L, 0L, 1L), rs19 = c(0L, 0L, 0L, 0L, 1L, 0L), rs20 = c(0L,
1L, 1L, 0L, 0L, 0L), rs21 = c(1L, 1L, 1L, 2L, 0L, 0L), rs22 = c(1L,
1L, 1L, 0L, 0L, 0L), rs23 = c(0L, 0L, 0L, 0L, 1L, 0L), rs24 = c(0L,
0L, 1L, 1L, 0L, 1L), rs25 = c(1L, 0L, 1L, 1L, 0L, 0L), rs26 = c(0L,
1L, 0L, 0L, 0L, 0L), rs27 = c(0L, 0L, 0L, 1L, 0L, 0L), rs28 = c(1L,
0L, 0L, 2L, 1L, 0L), rs29 = c(0L, 0L, 0L, 1L, 0L, 0L), rs30 = c(1L,
1L, 0L, 0L, 0L, 0L)), row.names = c(NA, 6L), class = "data.frame")```
您肯定在正确的轨道上。使用多个预测变量就像将它们一个接一个地添加一样简单...因此,只需练习前三个变量,就可以如图所示更改命令(在一分钟之内可以轻松使用30个预测变量。
编辑以确保我们将SNP转换为因子,而不是假设它们是因子的整数。也是一个更好的玩具数据集,可以收敛
library(dplyr)
library(broom)
glm(casecontrol ~ as.factor(rs1) + as.factor(rs2) + as.factor(rs3), data = mydata, family=binomial)
#>
#> Call: glm(formula = casecontrol ~ as.factor(rs1) + as.factor(rs2) +
#> as.factor(rs3), family = binomial, data = mydata)
#>
#> Coefficients:
#> (Intercept) as.factor(rs1)1 as.factor(rs1)2 as.factor(rs2)1
#> 0.03811 0.13198 -0.20161 0.22642
#> as.factor(rs2)2 as.factor(rs3)1 as.factor(rs3)2
#> 0.10170 -0.22889 -0.03697
#>
#> Degrees of Freedom: 499 Total (i.e. Null); 493 Residual
#> Null Deviance: 693
#> Residual Deviance: 688.4 AIC: 702.4
使用summary
命令获取p值等。
summary(glm(casecontrol ~ as.factor(rs1) + as.factor(rs2) + as.factor(rs3), data = mydata, family=binomial))
#>
#> Call:
#> glm(formula = casecontrol ~ as.factor(rs1) + as.factor(rs2) +
#> as.factor(rs3), family = binomial, data = mydata)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -1.350 -1.193 1.014 1.161 1.348
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) 0.03811 0.22619 0.168 0.866
#> as.factor(rs1)1 0.13198 0.22276 0.592 0.554
#> as.factor(rs1)2 -0.20161 0.22264 -0.906 0.365
#> as.factor(rs2)1 0.22642 0.22111 1.024 0.306
#> as.factor(rs2)2 0.10170 0.21969 0.463 0.643
#> as.factor(rs3)1 -0.22889 0.21864 -1.047 0.295
#> as.factor(rs3)2 -0.03697 0.22117 -0.167 0.867
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 693.02 on 499 degrees of freedom
#> Residual deviance: 688.39 on 493 degrees of freedom
#> AIC: 702.39
#>
#> Number of Fisher Scoring iterations: 3
更好地broom::tidy
用于获得良好的输出
tidy(glm(casecontrol ~ as.factor(rs1) + as.factor(rs2) + as.factor(rs3), data = mydata, family=binomial))
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.0381 0.226 0.168 0.866
#> 2 as.factor(rs1)1 0.132 0.223 0.592 0.554
#> 3 as.factor(rs1)2 -0.202 0.223 -0.906 0.365
#> 4 as.factor(rs2)1 0.226 0.221 1.02 0.306
#> 5 as.factor(rs2)2 0.102 0.220 0.463 0.643
#> 6 as.factor(rs3)1 -0.229 0.219 -1.05 0.295
#> 7 as.factor(rs3)2 -0.0370 0.221 -0.167 0.867
显然,使用示例数据,我们将无法获得真正的答案。
为了最有效地利用时间,请创建一个临时数据集进行分析。我们称其为justanalyze
仅包含您实际要使用的结果和变量。然后,我们可以casecontrol ~ .
说案例控制与其他所有变量一起用作预测变量。
justanalyze <-
mydata %>%
select(casecontrol, rs1:rs30) %>%
mutate_at(vars(rs1:rs30), as.factor)
# glm(casecontrol ~ ., data = justanalyze, family=binomial)
# summary(glm(casecontrol ~ ., data = justanalyze, family=binomial))
tidy(glm(casecontrol ~ ., data = justanalyze, family=binomial))
#> # A tibble: 61 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) -0.493 0.782 -0.630 0.529
#> 2 rs11 0.143 0.249 0.574 0.566
#> 3 rs12 -0.157 0.244 -0.642 0.521
#> 4 rs21 0.106 0.248 0.428 0.669
#> 5 rs22 0.0427 0.243 0.176 0.860
#> 6 rs31 -0.231 0.238 -0.970 0.332
#> 7 rs32 0.00169 0.245 0.00690 0.994
#> 8 rs41 -0.259 0.244 -1.06 0.288
#> 9 rs42 -0.474 0.253 -1.87 0.0610
#> 10 rs51 0.0148 0.256 0.0577 0.954
#> # … with 51 more rows
整理更好的数据(示例)
set.seed(2020)
mydata <- data.frame(ID = 1:100,
sex = sample(1:2, size = 500, replace = TRUE),
age = runif(100, min= 35, max = 70),
bmi = runif(100, min= 15, max = 35),
casecontrol = sample(0:1, size = 500, replace = TRUE),
rs1 = sample(0:2, size = 500, replace = TRUE),
rs2 = sample(0:2, size = 500, replace = TRUE),
rs3 = sample(0:2, size = 500, replace = TRUE),
rs4 = sample(0:2, size = 500, replace = TRUE),
rs5 = sample(0:2, size = 500, replace = TRUE),
rs6 = sample(0:2, size = 500, replace = TRUE),
rs7 = sample(0:2, size = 500, replace = TRUE),
rs8 = sample(0:2, size = 500, replace = TRUE),
rs9 = sample(0:2, size = 500, replace = TRUE),
rs10 = sample(0:2, size = 500, replace = TRUE),
rs11 = sample(0:2, size = 500, replace = TRUE),
rs12 = sample(0:2, size = 500, replace = TRUE),
rs13 = sample(0:2, size = 500, replace = TRUE),
rs14 = sample(0:2, size = 500, replace = TRUE),
rs15 = sample(0:2, size = 500, replace = TRUE),
rs16 = sample(0:2, size = 500, replace = TRUE),
rs17 = sample(0:2, size = 500, replace = TRUE),
rs18 = sample(0:2, size = 500, replace = TRUE),
rs19 = sample(0:2, size = 500, replace = TRUE),
rs20 = sample(0:2, size = 500, replace = TRUE),
rs21 = sample(0:2, size = 500, replace = TRUE),
rs22 = sample(0:2, size = 500, replace = TRUE),
rs23 = sample(0:2, size = 500, replace = TRUE),
rs24 = sample(0:2, size = 500, replace = TRUE),
rs25 = sample(0:2, size = 500, replace = TRUE),
rs26 = sample(0:2, size = 500, replace = TRUE),
rs27 = sample(0:2, size = 500, replace = TRUE),
rs28 = sample(0:2, size = 500, replace = TRUE),
rs29 = sample(0:2, size = 500, replace = TRUE),
rs30 = sample(0:2, size = 500, replace = TRUE)
)
# mydata
本文收集自互联网,转载请注明来源。
如有侵权,请联系[email protected] 删除。
我来说两句