我想使用Poisson glmnet对保险索赔计数进行建模。我手头的数据包含每个保单的索赔数量(这是响应变量),有关保单的某些功能(性别,地区等)以及保单的期限(以年为单位)。我想将对数持续时间作为偏移项包括在内,就像我们在精算科学中通常所做的那样。借助软件包的cv.glmnet
功能glmnet
,它很简单:
library(tidyverse)
library(glmnet)
n <- 100
dat <- tibble(
nb_claims = rpois(n, lambda = 0.5),
duration = runif(n),
x1 = runif(n),
x2 = runif(n),
x3 = runif(n)
)
fit <- cv.glmnet(
x = dat %>% dplyr::select(x1, x2, x3) %>% as.matrix(),
y = dat %>% pull(nb_claims),
family = "poisson",
offset = dat %>% pull(duration) %>% log()
)
fit
但是,由于它具有许多优点,我的目标是使用软件包的train
功能来训练该模型caret
。确实,此程序包的验证,预处理以及功能选择要好得多。用以下方法训练基本的glmnet(没有偏移项)很简单caret
:
library(caret)
fit <- caret::train(
x = dat %>% dplyr::select(x1, x2, x3) %>% as.matrix(),
y = dat %>% pull(nb_claims),
method = "glmnet",
family = "poisson"
)
fit
天真的,我们可以尝试offset
在train
函数中添加参数:
fit <- caret::train(
x = dat %>% dplyr::select(x1, x2, x3) %>% as.matrix(),
y = dat %>% pull(nb_claims),
method = "glmnet",
family = "poisson",
offset = dat %>% pull(duration) %>% log()
)
fit
不幸的是,这段代码引发了错误Error : No newoffset provided for prediction, yet offset used in fit of glmnet
。发生此错误的原因是,caret::train
函数不会在函数中为newoffset
参数提供值predict.glmnet
。
在本书中,他们展示了如何通过修改caret::train
函数的源代码向GLM模型添加偏移项。完美运作。但是,该predict.glm
函数与该函数有很大不同predict.glmnet
,因为它没有newoffset
参数。我试图修改该caret::train
函数的源代码,但是遇到了一些麻烦,因为我不太了解此函数的工作原理。
一个简单的方法来执行,这是通过offset
列的部分x
和在每个fit
与predict
呼叫传递作为x
列x
其不是offset
。在as offset
/ newoffset
pass时x
对应于的列offset
。
在下面的示例中,x的最不列也需要命名为“ offset”。这可以相对容易地更改
要创建函数,我们将仅使用以下部分的内容:https : //github.com/topepo/caret/blob/master/models/files/glmnet.R
glmnet很特别,因为它需要a loop
,其余的只需从https://topepo.github.io/caret/using-your-own-model-in-train.html#illustrative-example-1-svms-拉普拉斯内核
family = "poisson"
将在整个过程中进行指定,以更改此采用代码,网址为https://github.com/topepo/caret/blob/master/models/files/glmnet.R
glmnet_offset <- list(type = "Regression",
library = c("glmnet", "Matrix"),
loop = function(grid) {
alph <- unique(grid$alpha)
loop <- data.frame(alpha = alph)
loop$lambda <- NA
submodels <- vector(mode = "list", length = length(alph))
for(i in seq(along = alph)) {
np <- grid[grid$alpha == alph[i],"lambda"]
loop$lambda[loop$alpha == alph[i]] <- np[which.max(np)]
submodels[[i]] <- data.frame(lambda = np[-which.max(np)])
}
list(loop = loop, submodels = submodels)
})
glmnet_offset$parameters <- data.frame(parameter = c('alpha', 'lambda'),
class = c("numeric", "numeric"),
label = c('Mixing Percentage', 'Regularization Parameter'))
glmnet_offset$grid <- function(x, y, len = NULL, search = "grid") {
if(search == "grid") {
init <- glmnet::glmnet(Matrix::as.matrix(x[,colnames(x) != "offset"]), y,
family = "poisson",
nlambda = len+2,
alpha = .5,
offset = x[,colnames(x) == "offset"])
lambda <- unique(init$lambda)
lambda <- lambda[-c(1, length(lambda))]
lambda <- lambda[1:min(length(lambda), len)]
out <- expand.grid(alpha = seq(0.1, 1, length = len),
lambda = lambda)
} else {
out <- data.frame(alpha = runif(len, min = 0, 1),
lambda = 2^runif(len, min = -10, 3))
}
out
}
所以x[,colnames(x) != "offset"]
是x
同时offset
是x[,colnames(x) == "offset"]
glmnet_offset$fit <- function(x, y, wts, param, last, ...) {
theDots <- list(...)
## pass in any model weights
if(!is.null(wts)) theDots$weights <- wts
if(!(class(x)[1] %in% c("matrix", "sparseMatrix")))
x <- Matrix::as.matrix(x)
modelArgs <- c(list(x = x[,colnames(x) != "offset"],
y = y,
alpha = param$alpha,
family = "poisson",
offset = x[,colnames(x) == "offset"]),
theDots)
out <- do.call(glmnet::glmnet, modelArgs)
if(!is.na(param$lambda[1])) out$lambdaOpt <- param$lambda[1]
out
}
glmnet_offset$predict <- function(modelFit, newdata, submodels = NULL) {
if(!is.matrix(newdata)) newdata <- Matrix::as.matrix(newdata)
out <- predict(modelFit,
newdata[,colnames(newdata) != "offset"],
s = modelFit$lambdaOpt,
newoffset = newdata[,colnames(newdata) == "offset"],
type = "response") #important for measures to be appropriate
if(is.matrix(out)) out <- out[,1]
out
if(!is.null(submodels)) {
tmp <- as.list(as.data.frame(predict(modelFit,
newdata[,colnames(newdata) != "offset"],
s = submodels$lambda,
newoffset = newdata[,colnames(newdata) == "offset"],
type = "response"),
stringsAsFactors = TRUE))
out <- c(list(out), tmp)
}
out
}
由于某些原因,我不明白,没有prob
插槽就无法使用
glmnet_offset$prob <- glmnet_offset$predict
glmnet_offset$tags = c("Generalized Linear Model", "Implicit Feature Selection",
"L1 Regularization", "L2 Regularization", "Linear Classifier",
"Linear Regression")
glmnet_offset$sort = function(x) x[order(-x$lambda, x$alpha),]
glmnet_offset$trim = function(x) {
x$call <- NULL
x$df <- NULL
x$dev.ratio <- NULL
x
}
library(tidyverse)
library(caret)
library(glmnet)
n <- 100
set.seed(123)
dat <- tibble(
nb_claims = rpois(n, lambda = 0.5),
duration = runif(n),
x1 = runif(n),
x2 = runif(n),
x3 = runif(n)
)
x = dat %>%
dplyr::select(-nb_claims) %>%
mutate(offset = log(duration)) %>%
dplyr::select(-duration) %>%
as.matrix
fit <- caret::train(
x = x,
y = dat %>% pull(nb_claims),
method = glmnet_offset,
)
fit
100 samples
4 predictor
No pre-processing
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 100, 100, 100, 100, 100, 100, ...
Resampling results across tuning parameters:
alpha lambda RMSE Rsquared MAE
0.10 0.0001640335 0.7152018 0.01805762 0.5814200
0.10 0.0016403346 0.7152013 0.01805684 0.5814193
0.10 0.0164033456 0.7130390 0.01798125 0.5803747
0.55 0.0001640335 0.7151988 0.01804917 0.5814020
0.55 0.0016403346 0.7150312 0.01802689 0.5812936
0.55 0.0164033456 0.7095996 0.01764947 0.5783706
1.00 0.0001640335 0.7152033 0.01804795 0.5813997
1.00 0.0016403346 0.7146528 0.01798979 0.5810811
1.00 0.0164033456 0.7063482 0.01732168 0.5763653
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were alpha = 1 and lambda = 0.01640335.
predict(fit$finalModel, x[,1:3], newoffset = x[,4]) #works
这不适用于插入符号中的预处理,因为我们将offset作为功能之一传递。但是,它将与配方一起使用,因为您可以定义将通过选择在其上执行预处理功能的列。有关详细信息的硒文章:https : //tidymodels.github.io/recipes/articles/Selecting_Variables.html
我还没有时间错误检查我的代码。如果出现任何问题或某个地方有错误,请发表评论。谢谢。
您还可以在插入符号github中发布问题,要求将此功能(偏移量/ newoffset)添加到模型中
本文收集自互联网,转载请注明来源。
如有侵权,请联系[email protected] 删除。
我来说两句