带约束的分段回归

机器算法验证 r 分段线性
2022-04-21 16:37:01

我想在 R 中进行分段线性回归。我有一个包含 3 段的大型数据集,我希望第一段和第三段没有斜率,即平行于 x 轴,并且我还希望回归是连续的。我有一个小的示例数据集和下面的示例代码。我对 R 相当陌生,并试图编写模型公式但没有成功。任何人都可以帮助计算公式以及如何提取不同段的截距和斜率吗?在代码中插入三个具体问题作为注释。我已经查看了分段的包,但无法理解如何将分段 1 和 3 约束为平行于 x 轴。

#Example data
y <- c(4.5,4.3,2.57,4.40,4.52,1.39,4.15,3.55,2.49,4.27,4.42,4.10,2.21,2.90,1.42,1.50,1.45,1.7,4.6,3.8,1.9)  
x <- c(320,419,650,340,400,800,300,570,720,480,425,460,675,600,850,920,975,1022,450,520,780)  
plot(x, y, col="black",pch=16)

#model 1 not continuous, Q1: how to get that?
fit1<-lm(y ~ I(x<500)+I((x>=500&x<800)*x) + I(x>=800)) 
summary(fit1) #intercepts and slopes extracted from summary(fit1)
lines(c(min(x),500),c(round(summary(fit1)[[4]][1]+summary(fit1)[[4]][2],2),round(summary(fit1)[[4]][1]+summary(fit1)[[4]][2],2)),col="red")
lines(c(800,max(x)),c(round(summary(fit1)[[4]][1]+summary(fit1)[[4]][4],2),round(summary(fit1)[[4]][1]+summary(fit1)[[4]][4],2)),col="red")
lines(c(500,800),c((summary(fit1)[[4]][1])+(500*(summary(fit1)[[4]][3])),
               (summary(fit1)[[4]][1])+(800*(summary(fit1)[[4]][3]))),col="red")

#model 2 continuous but first and third segment not parallell to x-axis, Q2: how to get that?
fit2<-lm(y ~ x + I(pmax(x-500,0)) + I(pmax(x-800,0)))                                                  
summary(fit2) # Q3: how to get intercept and slope from summary(fit2) for model2?
mg <- seq(min(x),max(x),length=400)
mpv <- predict(fit2, newdata=data.frame(x = mg,t = (mg - 800)*as.numeric(mg > 800)))
lines(mg, mpv,col="blue")
3个回答

如果目标只是拟合函数,则可以将其视为优化问题:

y <- c(4.5,4.3,2.57,4.40,4.52,1.39,4.15,3.55,2.49,4.27,4.42,4.10,2.21,2.90,1.42,1.50,1.45,1.7,4.6,3.8,1.9)  
x <- c(320,419,650,340,400,800,300,570,720,480,425,460,675,600,850,920,975,1022,450,520,780)  
plot(x, y, col="black",pch=16)


#we need four parameters: the two breakpoints and the starting and ending intercepts
fun <- function(par, x) {
  #set all y values to starting intercept
  y1 <- x^0 * par["i1"]
  #set values after second breakpoint to ending intercept
  y1[x >= par["x2"]] <- par["i2"]
  #which values are between breakpoints?
  r <- x > par["x1"] & x < par["x2"]
  #interpolate between breakpoints
  y1[r] <- par["i1"] + (par["i2"] - par["i1"]) / (par["x2"] - par["x1"]) * (x[r] - par["x1"])
  y1
}

#sum of squared residuals
SSR <- function(par) {
     sum((y - fun(par, x))^2)
   }


library(optimx)
optimx(par = c(x1 = 500, x2 = 820, i1 = 5, i2 = 1), 
       fn = SSR, 
       method = "Nelder-Mead")

#                  x1       x2       i1       i2     value fevals gevals niter convcode kkt1 kkt2 xtimes
#Nelder-Mead 449.8546 800.0002 4.381454 1.512305 0.6404728    373     NA    NA        0 TRUE TRUE   0.06

lines(300:1100, 
      fun(c(x1 = 449.8546, x2 = 800.0002, i1 = 4.381454, i2 = 1.512305), 300:1100))

结果图

如果您还需要置信区间和预测区间,您可以首先通过平滑函数逼近您的三相分段线性函数,进行nls拟合,然后使用investr包(这有助于拟合,因为函数随后是连续可微分的)。

在你的情况下:

x <- c(478, 525, 580,  650,  700,  720,  780,  825,  850,  900,  930,  980, 1020, 1040, 1050, 1075, 1081, 1100, 1160, 1180, 1200)
y <- c(1.70, 1.45, 1.50, 1.42, 1.39, 1.90, 2.49, 2.21, 2.57, 2.90, 3.55, 3.80, 4.27, 4.10, 4.60, 4.42, 4.30, 4.52, 4.40, 4.50, 4.15)
# calculate rolling slopes at each point to provide good initial estimates for slope parameter b
f <- function (d) {
  m <- lm(y~x, as.data.frame(d))
  return(coef(m)[2])
}
require(zoo)
slopes <- rollapply(data.frame(x=x,y=y), 3, f, by.column=F)

# smooth approximation is
# y ~ a + (1/2)*b*(B2-B1) + 
#         (1/2)*sqrt(abs(b*(4*s+b*(B1-x)^2))) -  
#         (1/2)*sqrt(abs(b*(4*s+b*(B2-x)^2)))
# this smooth approximation approaches the piecewise linear model more as s -> 0

require(minpack.lm)
nlslmfit = nlsLM(y ~ a + (1/2)*exp(logb)*(B2-B1) + # we fit exp(logb) to force b > 0, if you don't want this just fit b instead
                   (1/2)*sqrt(abs(exp(logb)*(4*1E-10+exp(logb)*(B1-x)^2))) - # now set s to 1E-10, we could also fit exp(logs) 
                   (1/2)*sqrt(abs(exp(logb)*(4*1E-10+exp(logb)*(B2-x)^2))),
                 data = data.frame(x=x, y=y),
                 start = c(B1=min(x)+1E-10, B2=max(x)-1E-10, a=min(y)+1E-10, logb=log(max(slopes))),
                 # lower = c(B1=min(x), B2=mean(x), a=min(y), logb=log(min(slopes[slopes>0]))),
                 # upper = c(B1=mean(x), B2=max(x), a=mean(y), logb=log(max(slopes))),
                 control = nls.control(maxiter=1000, warnOnly=TRUE) )
# as s->0 this smooth model approximates more closely the piecewise linear one
summary(nlslmfit)
# Parameters:
#   Estimate Std. Error t value Pr(>|t|)    
#   B1    699.99988   19.23569   36.39  < 2e-16 ***
#   B2   1050.00069   15.49283   67.77  < 2e-16 ***
#   a       1.50817    0.09636   15.65 1.57e-11 ***
#   logb   -4.80172    0.06347  -75.65  < 2e-16 ***
require(investr)
xvals=seq(min(x),max(x),length.out=100)
predintervals = data.frame(x=xvals,predFit(nlslmfit, newdata=data.frame(x=xvals), interval="prediction"))
confintervals = data.frame(x=xvals,predFit(nlslmfit, newdata=data.frame(x=xvals), interval="confidence"))
require(ggplot2)
qplot(data=predintervals, x=x, y=fit, ymin=lwr, ymax=upr, geom="ribbon", fill=I("red"), alpha=I(0.2)) +
  geom_ribbon(data=confintervals, aes(x=x, ymin=lwr, ymax=upr), fill=I("blue"), alpha=I(0.2)) +
  geom_line(data=confintervals, aes(x=x, y=fit), colour=I("blue"), lwd=2) +
  geom_point(data=data.frame(x=x,y=y), aes(x=x, y=y, ymin=NULL, ymax=NULL), size=5, col="blue") +
  ylab("y")

在此处输入图像描述

您还可以使用包中的函数进行稳健nls拟合(对异常值稍微稳健),其余与上述相同:nlrobrobustbase

require(robustbase)
nlsrobfit  <- nlrob(y ~ a + (1/2)*exp(logb)*(B2-B1) + # we fit exp(logb) to force b > 0
                       (1/2)*sqrt(abs(exp(logb)*(4*1E-10+exp(logb)*(B1-x)^2))) - # now set s to 1E-10, we could also fit exp(logs) 
                       (1/2)*sqrt(abs(exp(logb)*(4*1E-10+exp(logb)*(B2-x)^2))),
                    data = data.frame(x=x, y=y),
                    maxit = 1000,
                    method="M",
                    algorithm="port",
                    doCov=TRUE,
                    start = c(B1=min(x)+1E-10, B2=max(x)-1E-10, a=min(y)+1E-10, logb=log(mean(slopes)) ),
                    # lower = c(B1=min(x), B2=mean(x), a=min(y), logb=log(min(slopes[slopes>0]))),
                    # upper = c(B1=mean(x), B2=max(x), a=mean(y), logb=log(max(slopes))),
                    control = nls.control(maxiter=1000, warnOnly=TRUE) )
summary(nlsrobfit)
class(nlsrobfit)="nls" # for compatibility with investr

s与也拟合了参数的模型的比较:

require(minpack.lm)
nlslmfit = nlsLM(y ~ a + (1/2)*exp(logb)*(B2-B1) + # we fit exp(logb) to force b > 0
                     (1/2)*sqrt(abs(exp(logb)*(4*exp(logs)+exp(logb)*(B1-x)^2))) - # we now fit exp(logs) 
                     (1/2)*sqrt(abs(exp(logb)*(4*exp(logs)+exp(logb)*(B2-x)^2))),
                 data = data.frame(x=x, y=y),
                 start = c(B1=min(x)+1E-10, B2=max(x)-1E-10, a=min(y)+1E-10, logb=log(mean(slopes)), logs=-10),
                 control = nls.control(maxiter=1000, warnOnly=TRUE) )
summary(nlslmfit)
# Parameters:
#   Estimate Std. Error t value Pr(>|t|)    
#   B1    7.000e+02  2.079e+01   33.67 2.78e-16 ***
#   B2    1.051e+03  1.614e+01   65.08  < 2e-16 ***
#   a     1.514e+00  1.000e-01   15.13 6.70e-11 ***
#   logb -4.806e+00  7.131e-02  -67.39  < 2e-16 ***
#   logs -1.805e+01  4.561e+04    0.00        1    
require(investr)
xvals=seq(min(x),max(x),length.out=100)
predintervals = data.frame(x=xvals,predFit(nlslmfit, newdata=data.frame(x=xvals), interval="prediction"))
confintervals = data.frame(x=xvals,predFit(nlslmfit, newdata=data.frame(x=xvals), interval="confidence"))
require(ggplot2)
qplot(data=predintervals, x=x, y=fit, ymin=lwr, ymax=upr, geom="ribbon", fill=I("red"), alpha=I(0.2)) +
  geom_ribbon(data=confintervals, aes(x=x, ymin=lwr, ymax=upr), fill=I("blue"), alpha=I(0.2)) +
  geom_line(data=confintervals, aes(x=x, y=fit), colour=I("blue"), lwd=2) +
  geom_point(data=data.frame(x=x,y=y), aes(x=x, y=y, ymin=NULL, ymax=NULL), size=5, col="blue") +
  ylab("y")

与平滑 4 参数逻辑模型的比较:

M.4pl <- function(x, lower.asymp, upper.asymp, inflec, hill){
  f <- lower.asymp + ((upper.asymp - lower.asymp)/
                        (1 + (x / inflec)^-hill))
  return(f)
}

require(minpack.lm)
nlslmfit = nlsLM(y ~ M.4pl(x, lower.asymp, upper.asymp, inflec, hill),
                 data = data.frame(x=x, y=y),
                 start = c(lower.asymp=min(y)+1E-10, upper.asymp=max(y)-1E-10, inflec=mean(x), hill=1),
                 control = nls.control(maxiter=1000, warnOnly=TRUE) )
summary(nlslmfit)
# Parameters:
#   Estimate Std. Error t value Pr(>|t|)    
#   lower.asymp   1.5371     0.1080   14.24 7.06e-11 ***
#   upper.asymp   4.5508     0.1497   30.40 2.93e-16 ***
#   inflec      889.1543    14.0924   63.09  < 2e-16 ***
#   hill         13.1717     2.5475    5.17 7.68e-05 ***
require(investr)
xvals=seq(min(x),max(x),length.out=100)
predintervals = data.frame(x=xvals,predFit(nlslmfit, newdata=data.frame(x=xvals), interval="prediction"))
confintervals = data.frame(x=xvals,predFit(nlslmfit, newdata=data.frame(x=xvals), interval="confidence"))
require(ggplot2)
qplot(data=predintervals, x=x, y=fit, ymin=lwr, ymax=upr, geom="ribbon", fill=I("red"), alpha=I(0.2)) +
  geom_ribbon(data=confintervals, aes(x=x, ymin=lwr, ymax=upr), fill=I("blue"), alpha=I(0.2)) +
  geom_line(data=confintervals, aes(x=x, y=fit), colour=I("blue"), lwd=2) +
  geom_point(data=data.frame(x=x,y=y), aes(x=x, y=y, ymin=NULL, ymax=NULL), size=5, col="blue") +
  ylab("y")

在此处输入图像描述

在一般情况下,三个段的线性分段回归导致这种函数:

在此处输入图像描述

在此处输入图像描述

根据论文https://fr.scribd.com/document/380941024/Regression-par-morceaux-Piecewise-Regression-pdf第 30 页给出的直接方法(非迭代)计算的参数

但是第一段和第三段不平行于 x 轴,这可能不是预期的。

第 18 页给出了第 1 段和第 3 段平行于 x 轴的段的分段回归的特殊情况。结果是:

在此处输入图像描述