机器算法验证 - 带约束的分段回归 - 吾爱随笔录

带约束的分段回归

机器算法验证 r 分段线性

2022-04-21 16:37:01

我想在 R 中进行分段线性回归。我有一个包含 3 段的大型数据集，我希望第一段和第三段没有斜率，即平行于 x 轴，并且我还希望回归是连续的。我有一个小的示例数据集和下面的示例代码。我对 R 相当陌生，并试图编写模型公式但没有成功。任何人都可以帮助计算公式以及如何提取不同段的截距和斜率吗？在代码中插入三个具体问题作为注释。我已经查看了分段的包，但无法理解如何将分段 1 和 3 约束为平行于 x 轴。

#Example data
y <- c(4.5,4.3,2.57,4.40,4.52,1.39,4.15,3.55,2.49,4.27,4.42,4.10,2.21,2.90,1.42,1.50,1.45,1.7,4.6,3.8,1.9)  
x <- c(320,419,650,340,400,800,300,570,720,480,425,460,675,600,850,920,975,1022,450,520,780)  
plot(x, y, col="black",pch=16)

#model 1 not continuous, Q1: how to get that?
fit1<-lm(y ~ I(x<500)+I((x>=500&x<800)*x) + I(x>=800)) 
summary(fit1) #intercepts and slopes extracted from summary(fit1)
lines(c(min(x),500),c(round(summary(fit1)[[4]][1]+summary(fit1)[[4]][2],2),round(summary(fit1)[[4]][1]+summary(fit1)[[4]][2],2)),col="red")
lines(c(800,max(x)),c(round(summary(fit1)[[4]][1]+summary(fit1)[[4]][4],2),round(summary(fit1)[[4]][1]+summary(fit1)[[4]][4],2)),col="red")
lines(c(500,800),c((summary(fit1)[[4]][1])+(500*(summary(fit1)[[4]][3])),
               (summary(fit1)[[4]][1])+(800*(summary(fit1)[[4]][3]))),col="red")

#model 2 continuous but first and third segment not parallell to x-axis, Q2: how to get that?
fit2<-lm(y ~ x + I(pmax(x-500,0)) + I(pmax(x-800,0)))                                                  
summary(fit2) # Q3: how to get intercept and slope from summary(fit2) for model2?
mg <- seq(min(x),max(x),length=400)
mpv <- predict(fit2, newdata=data.frame(x = mg,t = (mg - 800)*as.numeric(mg > 800)))
lines(mg, mpv,col="blue")

3个回答

如果目标只是拟合函数，则可以将其视为优化问题：

y <- c(4.5,4.3,2.57,4.40,4.52,1.39,4.15,3.55,2.49,4.27,4.42,4.10,2.21,2.90,1.42,1.50,1.45,1.7,4.6,3.8,1.9)  
x <- c(320,419,650,340,400,800,300,570,720,480,425,460,675,600,850,920,975,1022,450,520,780)  
plot(x, y, col="black",pch=16)


#we need four parameters: the two breakpoints and the starting and ending intercepts
fun <- function(par, x) {
  #set all y values to starting intercept
  y1 <- x^0 * par["i1"]
  #set values after second breakpoint to ending intercept
  y1[x >= par["x2"]] <- par["i2"]
  #which values are between breakpoints?
  r <- x > par["x1"] & x < par["x2"]
  #interpolate between breakpoints
  y1[r] <- par["i1"] + (par["i2"] - par["i1"]) / (par["x2"] - par["x1"]) * (x[r] - par["x1"])
  y1
}

#sum of squared residuals
SSR <- function(par) {
     sum((y - fun(par, x))^2)
   }


library(optimx)
optimx(par = c(x1 = 500, x2 = 820, i1 = 5, i2 = 1), 
       fn = SSR, 
       method = "Nelder-Mead")

#                  x1       x2       i1       i2     value fevals gevals niter convcode kkt1 kkt2 xtimes
#Nelder-Mead 449.8546 800.0002 4.381454 1.512305 0.6404728    373     NA    NA        0 TRUE TRUE   0.06

lines(300:1100, 
      fun(c(x1 = 449.8546, x2 = 800.0002, i1 = 4.381454, i2 = 1.512305), 300:1100))

结果图

如果您还需要置信区间和预测区间，您可以首先通过平滑函数逼近您的三相分段线性函数，进行nls拟合，然后使用investr包（这有助于拟合，因为函数随后是连续可微分的）。

在你的情况下：

x <- c(478, 525, 580,  650,  700,  720,  780,  825,  850,  900,  930,  980, 1020, 1040, 1050, 1075, 1081, 1100, 1160, 1180, 1200)
y <- c(1.70, 1.45, 1.50, 1.42, 1.39, 1.90, 2.49, 2.21, 2.57, 2.90, 3.55, 3.80, 4.27, 4.10, 4.60, 4.42, 4.30, 4.52, 4.40, 4.50, 4.15)
# calculate rolling slopes at each point to provide good initial estimates for slope parameter b
f <- function (d) {
  m <- lm(y~x, as.data.frame(d))
  return(coef(m)[2])
}
require(zoo)
slopes <- rollapply(data.frame(x=x,y=y), 3, f, by.column=F)

# smooth approximation is
# y ~ a + (1/2)*b*(B2-B1) + 
#         (1/2)*sqrt(abs(b*(4*s+b*(B1-x)^2))) -  
#         (1/2)*sqrt(abs(b*(4*s+b*(B2-x)^2)))
# this smooth approximation approaches the piecewise linear model more as s -> 0

require(minpack.lm)
nlslmfit = nlsLM(y ~ a + (1/2)*exp(logb)*(B2-B1) + # we fit exp(logb) to force b > 0, if you don't want this just fit b instead
                   (1/2)*sqrt(abs(exp(logb)*(4*1E-10+exp(logb)*(B1-x)^2))) - # now set s to 1E-10, we could also fit exp(logs) 
                   (1/2)*sqrt(abs(exp(logb)*(4*1E-10+exp(logb)*(B2-x)^2))),
                 data = data.frame(x=x, y=y),
                 start = c(B1=min(x)+1E-10, B2=max(x)-1E-10, a=min(y)+1E-10, logb=log(max(slopes))),
                 # lower = c(B1=min(x), B2=mean(x), a=min(y), logb=log(min(slopes[slopes>0]))),
                 # upper = c(B1=mean(x), B2=max(x), a=mean(y), logb=log(max(slopes))),
                 control = nls.control(maxiter=1000, warnOnly=TRUE) )
# as s->0 this smooth model approximates more closely the piecewise linear one
summary(nlslmfit)
# Parameters:
#   Estimate Std. Error t value Pr(>|t|)    
#   B1    699.99988   19.23569   36.39  < 2e-16 ***
#   B2   1050.00069   15.49283   67.77  < 2e-16 ***
#   a       1.50817    0.09636   15.65 1.57e-11 ***
#   logb   -4.80172    0.06347  -75.65  < 2e-16 ***
require(investr)
xvals=seq(min(x),max(x),length.out=100)
predintervals = data.frame(x=xvals,predFit(nlslmfit, newdata=data.frame(x=xvals), interval="prediction"))
confintervals = data.frame(x=xvals,predFit(nlslmfit, newdata=data.frame(x=xvals), interval="confidence"))
require(ggplot2)
qplot(data=predintervals, x=x, y=fit, ymin=lwr, ymax=upr, geom="ribbon", fill=I("red"), alpha=I(0.2)) +
  geom_ribbon(data=confintervals, aes(x=x, ymin=lwr, ymax=upr), fill=I("blue"), alpha=I(0.2)) +
  geom_line(data=confintervals, aes(x=x, y=fit), colour=I("blue"), lwd=2) +
  geom_point(data=data.frame(x=x,y=y), aes(x=x, y=y, ymin=NULL, ymax=NULL), size=5, col="blue") +
  ylab("y")

您还可以使用包中的函数进行稳健nls拟合（对异常值稍微稳健），其余与上述相同：nlrobrobustbase

require(robustbase)
nlsrobfit  <- nlrob(y ~ a + (1/2)*exp(logb)*(B2-B1) + # we fit exp(logb) to force b > 0
                       (1/2)*sqrt(abs(exp(logb)*(4*1E-10+exp(logb)*(B1-x)^2))) - # now set s to 1E-10, we could also fit exp(logs) 
                       (1/2)*sqrt(abs(exp(logb)*(4*1E-10+exp(logb)*(B2-x)^2))),
                    data = data.frame(x=x, y=y),
                    maxit = 1000,
                    method="M",
                    algorithm="port",
                    doCov=TRUE,
                    start = c(B1=min(x)+1E-10, B2=max(x)-1E-10, a=min(y)+1E-10, logb=log(mean(slopes)) ),
                    # lower = c(B1=min(x), B2=mean(x), a=min(y), logb=log(min(slopes[slopes>0]))),
                    # upper = c(B1=mean(x), B2=max(x), a=mean(y), logb=log(max(slopes))),
                    control = nls.control(maxiter=1000, warnOnly=TRUE) )
summary(nlsrobfit)
class(nlsrobfit)="nls" # for compatibility with investr

s与也拟合了参数的模型的比较：

require(minpack.lm)
nlslmfit = nlsLM(y ~ a + (1/2)*exp(logb)*(B2-B1) + # we fit exp(logb) to force b > 0
                     (1/2)*sqrt(abs(exp(logb)*(4*exp(logs)+exp(logb)*(B1-x)^2))) - # we now fit exp(logs) 
                     (1/2)*sqrt(abs(exp(logb)*(4*exp(logs)+exp(logb)*(B2-x)^2))),
                 data = data.frame(x=x, y=y),
                 start = c(B1=min(x)+1E-10, B2=max(x)-1E-10, a=min(y)+1E-10, logb=log(mean(slopes)), logs=-10),
                 control = nls.control(maxiter=1000, warnOnly=TRUE) )
summary(nlslmfit)
# Parameters:
#   Estimate Std. Error t value Pr(>|t|)    
#   B1    7.000e+02  2.079e+01   33.67 2.78e-16 ***
#   B2    1.051e+03  1.614e+01   65.08  < 2e-16 ***
#   a     1.514e+00  1.000e-01   15.13 6.70e-11 ***
#   logb -4.806e+00  7.131e-02  -67.39  < 2e-16 ***
#   logs -1.805e+01  4.561e+04    0.00        1    
require(investr)
xvals=seq(min(x),max(x),length.out=100)
predintervals = data.frame(x=xvals,predFit(nlslmfit, newdata=data.frame(x=xvals), interval="prediction"))
confintervals = data.frame(x=xvals,predFit(nlslmfit, newdata=data.frame(x=xvals), interval="confidence"))
require(ggplot2)
qplot(data=predintervals, x=x, y=fit, ymin=lwr, ymax=upr, geom="ribbon", fill=I("red"), alpha=I(0.2)) +
  geom_ribbon(data=confintervals, aes(x=x, ymin=lwr, ymax=upr), fill=I("blue"), alpha=I(0.2)) +
  geom_line(data=confintervals, aes(x=x, y=fit), colour=I("blue"), lwd=2) +
  geom_point(data=data.frame(x=x,y=y), aes(x=x, y=y, ymin=NULL, ymax=NULL), size=5, col="blue") +
  ylab("y")

与平滑 4 参数逻辑模型的比较：

M.4pl <- function(x, lower.asymp, upper.asymp, inflec, hill){
  f <- lower.asymp + ((upper.asymp - lower.asymp)/
                        (1 + (x / inflec)^-hill))
  return(f)
}

require(minpack.lm)
nlslmfit = nlsLM(y ~ M.4pl(x, lower.asymp, upper.asymp, inflec, hill),
                 data = data.frame(x=x, y=y),
                 start = c(lower.asymp=min(y)+1E-10, upper.asymp=max(y)-1E-10, inflec=mean(x), hill=1),
                 control = nls.control(maxiter=1000, warnOnly=TRUE) )
summary(nlslmfit)
# Parameters:
#   Estimate Std. Error t value Pr(>|t|)    
#   lower.asymp   1.5371     0.1080   14.24 7.06e-11 ***
#   upper.asymp   4.5508     0.1497   30.40 2.93e-16 ***
#   inflec      889.1543    14.0924   63.09  < 2e-16 ***
#   hill         13.1717     2.5475    5.17 7.68e-05 ***
require(investr)
xvals=seq(min(x),max(x),length.out=100)
predintervals = data.frame(x=xvals,predFit(nlslmfit, newdata=data.frame(x=xvals), interval="prediction"))
confintervals = data.frame(x=xvals,predFit(nlslmfit, newdata=data.frame(x=xvals), interval="confidence"))
require(ggplot2)
qplot(data=predintervals, x=x, y=fit, ymin=lwr, ymax=upr, geom="ribbon", fill=I("red"), alpha=I(0.2)) +
  geom_ribbon(data=confintervals, aes(x=x, ymin=lwr, ymax=upr), fill=I("blue"), alpha=I(0.2)) +
  geom_line(data=confintervals, aes(x=x, y=fit), colour=I("blue"), lwd=2) +
  geom_point(data=data.frame(x=x,y=y), aes(x=x, y=y, ymin=NULL, ymax=NULL), size=5, col="blue") +
  ylab("y")

在一般情况下，三个段的线性分段回归导致这种函数：

根据论文https://fr.scribd.com/document/380941024/Regression-par-morceaux-Piecewise-Regression-pdf第 30 页给出的直接方法（非迭代）计算的参数

但是第一段和第三段不平行于 x 轴，这可能不是预期的。

第 18 页给出了第 1 段和第 3 段平行于 x 轴的段的分段回归的特殊情况。结果是：

其它你可能感兴趣的问题

上一篇在进行多元回归时对每个变量进行不同的转换是否可行下一篇消极的R2R2在随机回归森林