在散点图上查找异常值

机器算法验证 异常值 散点图
2022-03-08 15:07:47

我有一组数据点应该位于一个轨迹上并遵循一个模式,但是主轨迹上的一些散点会导致我最终分析中的不确定性。我想获得一个整洁的轨迹,以便以后应用它进行分析。蓝点或多或少是我想通过复杂的方式查找并排除它们的散点,而无需手动执行。在此处输入图像描述

我正在考虑使用最近邻回归之类的方法,但我不确定它是否是最好的方法,或者我不太熟悉应该如何实施它才能给我一个合适的结果。顺便说一句,我想在没有任何拟合程序的情况下进行。

数据的转置版本如下:

X=array([[ 0.87 , -0.01 ,  0.575,  1.212,  0.382,  0.418, -0.01 ,  0.474,
         0.432,  0.702,  0.574,  0.45 ,  0.334,  0.565,  0.414,  0.873,
         0.381,  1.103,  0.848,  0.503,  0.27 ,  0.416,  0.939,  1.211,
         1.106,  0.321,  0.709,  0.744,  0.309,  0.247,  0.47 , -0.107,
         0.925,  1.127,  0.833,  0.963,  0.385,  0.572,  0.437,  0.577,
         0.461,  0.474,  1.046,  0.892,  0.313,  1.009,  1.048,  0.349,
         1.189,  0.302,  0.278,  0.629,  0.36 ,  1.188,  0.273,  0.191,
        -0.068,  0.95 ,  1.044,  0.776,  0.726,  1.035,  0.817,  0.55 ,
         0.387,  0.476,  0.473,  0.863,  0.252,  0.664,  0.365,  0.244,
         0.238,  1.203,  0.339,  0.528,  0.326,  0.347,  0.385,  1.139,
         0.748,  0.879,  0.324,  0.265,  0.328,  0.815,  0.38 ,  0.884,
         0.571,  0.416,  0.485,  0.683,  0.496,  0.488,  1.204,  1.18 ,
         0.465,  0.34 ,  0.335,  0.447,  0.28 ,  1.02 ,  0.519,  0.335,
         1.037,  1.126,  0.323,  0.452,  0.201,  0.321,  0.285,  0.587,
         0.292,  0.228,  0.303,  0.844,  0.229,  1.077,  0.864,  0.515,
         0.071,  0.346,  0.255,  0.88 ,  0.24 ,  0.533,  0.725,  0.339,
         0.546,  0.841,  0.43 ,  0.568,  0.311,  0.401,  0.212,  0.691,
         0.565,  0.292,  0.295,  0.587,  0.545,  0.817,  0.324,  0.456,
         0.267,  0.226,  0.262,  0.338,  1.124,  0.373,  0.814,  1.241,
         0.661,  0.229,  0.416,  1.103,  0.226,  1.168,  0.616,  0.593,
         0.803,  1.124,  0.06 ,  0.573,  0.664,  0.882,  0.286,  0.139,
         1.095,  1.112,  1.167,  0.589,  0.3  ,  0.578,  0.727,  0.252,
         0.174,  0.317,  0.427,  1.184,  0.397,  0.43 ,  0.229,  0.261,
         0.632,  0.938,  0.576,  0.37 ,  0.497,  0.54 ,  0.306,  0.315,
         0.335,  0.24 ,  0.344,  0.93 ,  0.134,  0.4  ,  0.223,  1.224,
         1.187,  1.031,  0.25 ,  0.53 , -0.147,  0.087,  0.374,  0.496,
         0.441,  0.884,  0.971,  0.749,  0.432,  0.582,  0.198,  0.615,
         1.146,  0.475,  0.595,  0.304,  0.416,  0.645,  0.281,  0.576,
         1.139,  0.316,  0.892,  0.648,  0.826,  0.299,  0.381,  0.926,
         0.606],
       [-0.154, -0.392, -0.262,  0.214, -0.403, -0.363, -0.461, -0.326,
        -0.349, -0.21 , -0.286, -0.358, -0.436, -0.297, -0.394, -0.166,
        -0.389,  0.029, -0.124, -0.335, -0.419, -0.373, -0.121,  0.358,
         0.042, -0.408, -0.189, -0.213, -0.418, -0.479, -0.303, -0.645,
        -0.153,  0.098, -0.171, -0.066, -0.368, -0.273, -0.329, -0.295,
        -0.362, -0.305, -0.052, -0.171, -0.406, -0.102,  0.011, -0.375,
         0.126, -0.411, -0.42 , -0.27 , -0.407,  0.144, -0.419, -0.465,
        -0.036, -0.099,  0.007, -0.167, -0.205, -0.011, -0.151, -0.267,
        -0.368, -0.342, -0.299, -0.143, -0.42 , -0.232, -0.368, -0.417,
        -0.432,  0.171, -0.388, -0.319, -0.407, -0.379, -0.353,  0.043,
        -0.211, -0.14 , -0.373, -0.431, -0.383, -0.142, -0.345, -0.144,
        -0.302, -0.38 , -0.337, -0.2  , -0.321, -0.269,  0.406,  0.223,
        -0.322, -0.395, -0.379, -0.324, -0.424,  0.01 , -0.298, -0.386,
         0.018,  0.157, -0.384, -0.327, -0.442, -0.388, -0.387, -0.272,
        -0.397, -0.415, -0.388, -0.106, -0.504,  0.034, -0.153, -0.32 ,
        -0.271, -0.417, -0.417, -0.136, -0.447, -0.279, -0.225, -0.372,
        -0.316, -0.161, -0.331, -0.261, -0.409, -0.338, -0.437, -0.242,
        -0.328, -0.403, -0.433, -0.274, -0.331, -0.163, -0.361, -0.298,
        -0.392, -0.447, -0.429, -0.388,  0.11 , -0.348, -0.174,  0.244,
        -0.182, -0.424, -0.319,  0.088, -0.547,  0.189, -0.216, -0.228,
        -0.17 ,  0.125, -0.073, -0.266, -0.234, -0.108, -0.395, -0.395,
         0.131,  0.074,  0.514, -0.235, -0.389, -0.288, -0.22 , -0.416,
        -0.777, -0.358, -0.31 ,  0.817, -0.363, -0.328, -0.424, -0.416,
        -0.248, -0.093, -0.28 , -0.357, -0.348, -0.298, -0.384, -0.394,
        -0.362, -0.415, -0.349, -0.08 , -0.572, -0.07 , -0.423,  0.359,
         0.4  ,  0.099, -0.426, -0.252, -0.697, -0.508, -0.348, -0.254,
        -0.307, -0.116, -0.029, -0.201, -0.302, -0.25 , -0.44 , -0.233,
         0.274, -0.295, -0.223, -0.398, -0.298, -0.209, -0.389, -0.247,
         0.225, -0.395, -0.124, -0.237, -0.104, -0.361, -0.335, -0.083,
        -0.254]])
1个回答

作为识别“分散”点的开始,请考虑关注内核密度估计相对较低的位置。

这个建议假设最初对点的“轨迹”知之甚少,甚至一无所知——它们中的大多数将沿着一条或多条曲线落下——它是本着对数据进行半自动探索的精神提出的(而不是检验假设)。

您可能需要使用内核宽度和“相对较低”的阈值。存在很好的自动方法来估计前者,而后者可以通过分析数据点的密度来识别(以识别低值集群)。


例子

该图是由两种数据组合生成的:一种用红点表示,是高精度数据,另一种用蓝点表示,是在的极低值附近获得的相对低精度的数据。在其背景中是(a)内核密度估计的轮廓(灰度)和(b)围绕其生成点的曲线(黑色)。X

数字

密度相对较低的点已被自动圈出。(这些点的密度小于所有点平均密度的八分之一。)它们包括大部分——但不是全部!——低精度点和一些高精度点(在顶部对)。位于曲线附近的低精度点(由高精度点推断)没有被圈出。高精度点的圈出凸显了点稀疏的地方,底层曲线的轨迹将是不确定的。这是建议方法的一个特点,而不是限制!


代码

R生成此示例的代码如下。它使用该ks库评估点模式中的各向异性来开发定向内核形状。这种方法在样本数据中效果很好,其点云往往又长又瘦。

#
# Simulate some data.
#
f <- function(x) -0.55 + 0.45*x + 0.01/(1.2-x)^2 # The underlying curve

set.seed(17)
n1 <- 280; n2 <- 15
x <- c(1.2 - rbeta(n1,.9, .6), rep(0.1, n2))
y <- f(x)
d <- data.frame(x=x + c(rnorm(n1, 0, 0.025), rnorm(n2, 0, 0.1)),
                   y=y + c(rnorm(n1, 0, 0.025), rnorm(n2, 0, 0.33)),
                   group=c(rep(1, n1), rep(2, n2)))
d <- subset(d, subset=(y <= 1.0)) # Omit any high-y points
#
# Plot the density estimate.
#
require(ks)
p <- cbind(d$x, d$y)
dens <- kde(p)
n.levels <- 13
colors <- gray(seq(1, 0, length.out=n.levels))
plot(dens, display="filled.contour2", cont=seq(0, 100, length.out=n.levels),
     col=colors, xlab="X", ylab="Y")
#
# Evaluate densities at the data points.
#
dens <- kde(p, eval.points=p)
d$Density <- dens$estimate
#
# Plot the (correct) curve and the points.
#
curve(f(x), add=TRUE, to=1.2, col="Black")
points(d$x, d$y, ylim=c(-1,1), pch=19, cex=sqrt(d$Density/8),
     col=ifelse(d$group==1, "Red", "Blue"))
#
# Highlight some low-density points.
#
m <- mean(d$Density)
e <- subset(d, subset=(Density < m/10))
points(e$x, e$y, col="#00000080")