用凸形对二维数据进行最佳二元分类

芽杜宫

问题

我有一个非常复杂的数据集,我想尽可能地分开:

我想找到一个简单的凸形,例如使其包含90%的红色点,同时包含尽可能少的绿色点

img

  mydata = cbind(
    "Temperature" = c( -3.62, -3.45, -3.47, -8.84, -6.47, -11, -6, -5.3, -4.24, -3.45, -2.91, -5.27, -9.9, -4.06, -8.71, -7.26, -7.67, -5.1, -3.19, -3.98, -6.9, -8.27, -8.37, -8.8, -2.68, -2.43, -6.99, -3.44, -3.49, -6.45, -4.24, -7.3, -2.22, -3.88, -9.57, -6.38, -3.86, -11.6, -5.45, -7.52, -4.05, -5.07, -6.55, -12, -9.41, -4.11, -7.3, -6.08, -3.71, -3.12, -8.17, -7.4, -3.26, -4.12, -2.74, -2.42, -7.02, -9.44, -6.62, -3.76, -4.12, -9.84, -3.78, -2.86, -4.3, -6.3, -5.29, -2.89, -3.94, -6.56, -6.58, -10.9, -3.16, -2.5, -5.33, -4.1, -3.22, -4.24, -6.19, -3.1, -5.48, -7.48, -9.56, -7.08, -3.59, -8.24, -8.44, -6.2, -3.41, -7.9, -7.66, -2.86, -8.39, -7.16, -9.42, -10.9, -3.47, -6.2, -3.57, -3.09, -7.29, -3.3, -5.8, -9.75, -4.82, -3.91, -3.34, -3.07, -9.39, -6.51, -6.6, -7.57, -11, -5.9, -4.42, -10.2, -2.02, -8.06, -3.82, -2.84, -3.27, -3.37, -2.93, -5.18, -9.29, -8.72, -3.17, -7.47, -8.85, -5.27, -3.15, -5.07, -5.4, -3.38, -4.07, -8.92, -2.54, -7.87, -8.7, -4.88, -9.35, -8.57, -10.3, -4.08, -3.06, -6.02, -4.43, -8.24, -3.46, -8.15, -9.9, -6.37, -2.65, -3.2, -4.23, -4.29, -8.06, -8.3, -9.36, -5.82, -11.3, -5.87, -3.1, -8.48, -4.47, -3.44, -6.34, -3.39, -5.29, -9.76, -7.29, -4.2, -3.84, -4.12, -10.6, -3.34, -9.75, -7.41, -11.4, -3.7, -7.57, -4.29, -4.18, -4.87, -3.85, -10, -3.94, -4.55, -3.57, -5.29, -9.29, -6.17, -4.58, -3.92, -3.08, -4.51, -8.21, -2.7, -3.56, -7.92, -4.38, -3.3, -6.94, -6.92, -7.5, -9.1, -6.72, -7.66, -7.76, -6.84, -4.23, -6.11, -12.6, -5.95, -9.34, -3.81, -9.35, -7.37, -6.66, -3.89, -7.64, -7.8, -10.3, -5.06, -3.62, -2.15, -7.66, -2.74, -6.9, -4.36, -2.21, -6.5, -2.95, -4.19, -9.96, -6.81, -10, -6.59, -9.19, -4.27, -7.64, -6.13, -4.01, -4.98, -7.11, -4.72, -11.2, -3.88, -3.03, -3.88, -2.39, -6.83, -5.94, -6.92, -3.54, -11, -7.74, -10.3, -9.04, -4.93, -8.96, -2.74, -4.15, -5.06, -10.8, -5.94, -7.96, -4.32, -4.23, -4.68, -4.8, -2.86, -2.31, -3.37, -6.06, -2.4, -2.57, -4.54, -3.11, -3.1, -5.2, -4.23, -4.22, -3.6, -3.16, -3.45, -3.65, -3.28, -3.6, -3.13, 
-3.08, -3.74, -2.61, -4.42, -2.82, -2.52, -3.05, -3.56, -5.58, -4.53, -2.82, -4.73, -3.17, -4.37, -3.39, -4.74, -4.06, -2.49, -4.35, -2.57, -3.88, -3.53, -3.11, -2.9, -2.76, -4.2, -3.28, -4.07, -3.1, -2.96, -3.5, -2.5, -6.26, -3.5, -3.16, -3.05, -1.95, -2.19, -4.1, -5.71, -3.53, -3.77, -1.95, -4.18, -3.96, -3.45, -3.86, -3.1, -3.54, -3.96, -3.23, -2.32, -2.7, -3.73, -2.77, -3.04, -3.17, -2.35, -3.46, -4.01, -3.05, -2.64, -5.51, -2.44, -2.6, -5.34, -2.83, -2.84, -6.01, -4.64, -2.69, -4.28, -4.28, -2.82, -3.18, -2.89, -3.12, -2.93, -3.36, -4.86, -4.92, -4.5, -3.69, -3.72, -4.67, -3.19, -3.74, -3.94, -2.81, -3.66, -2.98, -4.46, -2.46, -3.85, -3.66, -2.88, -4.19, -3.03, -3.46, -3.96, -2.4, -3.09, -4.08, -4.18, -2.56, -2.06, -3.14, -3.44, -3.51, -4.99, -2.9, -3.41, -3.36, -4.53, -2.76, -3.74, -3.33, -2.75, -2.39, -3.1, -6.21, -4.45, -2.81, -2.5, -4.14, -3.56, -3.06, -3.36, -2.86, -3.22, -3.33, -3.88, -5.38, -2.88, -2.25, -2.97, -5.22, -4.49, -4.76, -2.73, -2.98, -4.85, -4.03, -3.48, -2.54, -2.02, -2.86, -2.7, -3.63, -3.46, -2.71, -2.9, -2.96, -8.07, -2.83, -2.87, -2.87, -3.98, -4.34, -4.84, -4.06, -3.03, -3.1, -3.2, -3.86, -3.72, -2.82, -5.83, -3.1, -4.24, -3.33, -3.2, -2.92, -2.1, -3.61, -2.78, -3.37, -4.26, -2.38, -3.65, -5.05, -5.54, -3.77, -5.37, -3.51, -3.2, -3.67, -4.36, -2.21, -2.78, -2.85, -3.53, -2.04, -4.97, -2.94, -5.7, -4.14, -3.8, -2.69, -4, -3.18, -3.58, -2.14),
    "Loss" = c( -74.6, -77.6, -77.3, -74.1, -73, -72.9, -83.3, -73.3, -73.5, -73.9, -77.7, -76.9, -74.7, -75.4, -80.9, -74.9, -77.6, -75.8, -78.9, -74.7, -73.2, -72.7, -83.8, -73.4, -75.1, -75.8, -77.2, -76.5, -72.3, -73.4, -72.4, -74.6, -74.3, -73.9, -73.7, -78.8, -78.9, -83.1, -71.7, -82.1, -72.8, -73.7, -82.6, -74.9, -79.7, -74, -75.2, -73.4, -75.2, -72.8, -79, -76.9, -74.1, -74, -76.7, -73.9, -85.5, -79, -78.5, -72.5, -73.1, -76, -73.1, -77.2, -73.5, -78.8, -76.9, -76.8, -76.5, -77, -77.9, -73.2, -77.1, -75.8, -73.2, -76.5, -76.2, -72.8, -71.5, -74, -74.4, -85.8, -79.6, -82.3, -75.7, -72, -75.3, -81.5, -72.8, -74.3, -78.9, -73.7, -75.6, -73.9, -74.1, -78.3, -74.8, -80.8, -79.7, -74.8, -80.7, -76, -75.9, -78.3, -79.8, -78.9, -76.2, -74.1, -75.4, -75.6, -80.4, -77.8, -72.4, -72.3, -73.4, -78.4, -75.7, -80.7, -74.9, -75.8, -73.1, -74.4, -73, -72.9, -79.4, -74.2, -82.4, -75.5, -73.1, -75.8, -82.5, -76, -73.7, -78.4, -72.1, -82.2, -73, -72.9, -76.1, -74.1, -73.2, -74.7, -74.3, -71.6, -75.1, -75.4, -81, -74.6, -72.8, -76.9, -78.3, -73.8, -74.2, -73.9, -73.5, -75.2, -76.4, -79.6, -76.1, -76.3, -75.4, -78, -73.1, -80.7, -74.3, -72.9, -78.2, -81.5, -77.3, -73.5, -74, -73.7, -74.4, -74, -76, -73.9, -75.8, -74.5, -77.5, -73.2, -82.7, -73.1, -75, -79.8, -72.6, -85.1, -72.3, -72.5, -75.3, -72.6, -75.8, -74.2, -74.1, -73.2, -75.7, -72.3, -74.3, -75.2, -72, -77.8, -76.5, -75.9, -82.3, -73.7, -74.5, -75.1, -77.2, -76.6, -76.2, -75.6, -75.7, -74.8, -72.8, -72, -72.7, -72, -74.7, -72.8, -77.8, -74.5, -74.4, -75.2, -73, -76.4, -76.2, -73.3, -84.3, -73.2, -72.9, -76.2, -79.7, -82.6, -75.3, -73.6, -72.6, -77.8, -75.9, -76.7, -77.8, -76.8, -76.1, -73.9, -83.1, -75.1, -72.6, -74.7, -80.9, -76.7, -76.1, -74.6, -72.3, -74.4, -82.2, -74, -74.1, -75.3, -78.8, -75.4, -73.9, -72.6, -84.7, -71.8, -73.2, -73.6, -73.2, -75, -79, -71.7, -75.1, -75.5, -77.5, -78.7, -73.3, -72, -76.4, -75, -73.7, -81.7, -76.2, -78.4, -80, -79.8, -75.6, -70.5, -80.2, -69.9, -76.5, -74.4, -77.1, -71.6, -72.9, -74, 
-82.5, -74.5, -76.4, -73.5, -75.9, -77.6, -74.5, -80.9, -78.8, -77.7, -71.1, -80.2, -75.1, -83.7, -76.8, -81.8, -77.3, -77.9, -80.4, -77.3, -74.2, -77.2, -70, -74.2, -83, -75.8, -73, -75.1, -73.1, -71, -72.9, -76.8, -82.6, -76.5, -73.9, -75.9, -74.7, -76.3, -76.6, -77.7, -72.9, -73, -73.9, -75.2, -78.4, -73.6, -75.3, -73.3, -73.5, -79.9, -76.9, -74.5, -75.9, -76.4, -76.4, -73.4, -73, -73.2, -74.2, -75.1, -78.5, -72.8, -77.5, -79.5, -72.3, -76.6, -73, -83.8, -75.9, -70, -77.8, -73.9, -72.2, -76.6, -74, -70.9, -73, -79.3, -78.1, -81, -84.1, -71, -80, -73.1, -74, -71.7, -73.5, -73.2, -80.2, -77.7, -76, -78.5, -76.7, -72.6, -74.8, -73.1, -69.9, -74.7, -74.9, -82.9, -75.4, -78.4, -76.8, -75.9, -77.8, -80.5, -76.9, -78.7, -74.4, -80.3, -72.3, -73.9, -72.3, -73.8, -75.2, -74.4, -76.6, -79.1, -74.3, -76.2, -76.6, -71.7, -79, -74.8, -73.8, -73, -73.7, -74, -74.2, -79, -76.3, -78.4, -74.8, -81, -76.7, -77, -75.4, -73.8, -74.2, -78.4, -74.6, -72.7, -81.5, -78.4, -74.3, -74.1, -71.2, -76.7, -77.5, -76.2, -75.1, -72.4, -75.4, -74.4, -73.3, -86, -71.6, -80.4, -73.5, -72.5, -77.8, -74.5, -79.9, -76.3, -73.9, -76.5, -83.8, -77.2, -74.5, -80.4, -75.4, -72.8, -77.3, -78.7, -74, -73.6, -73, -72.8, -82.8, -71.6, -78.9, -74.9, -73.1, -82.4, -77.1, -74.5, -71.8, -72.9, -85.3, -73.9, -84.4, -79, -78.1, -74.5, -75.7, -75.6, -75, -71.8, -74.6, -73.4, -73.3),
    "Class" = c( 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0))

# Scatter plot of Loss against Temperature. The same code (2 + Class) is used
# for both the plotting symbol and the colour, so the two classes differ in
# shape and in colour.
markerCode = 2 + mydata[, "Class"]
plot(mydata[, c("Temperature", "Loss")], pch = markerCode, col = markerCode)

  • 现在让我们忽略过度拟合

目标

  • 绘制最佳矩形(内部绿色点最少)
  • 画出边数N <= 6的最佳多边形(内部绿色点最少),或画一个圆

我想得到:

  • 实现最佳分离的方程/多边形顶点,并且
  • 内/外百分比/计数。

我想

  • 固定所需点落在内部的百分比,并使不希望的点(污染)数量最少,或者
  • fix the % purity and maximize the number of desired points falling inside the polygon

Tried

I tried SVM packages e1071 and kernlab, but I did not get useful results.

  • Either SVM is not designed for such heavily overlapping data, or I just did not understand how it works.

Googling it was otherwise pretty unsuccessful; I am probably just missing the right phrases to search for this problem. Such a simple problem must have been solved decades ago.

Any help appreciated. Thanks in advance.

G5W

Here is one approach. You want (approximately) 90% of the red points in one group. Use hierarchical clustering to find points that are clustered. Then, take the convex hull of those points.

# Hierarchically cluster only the Class == 0 points (columns 1:2 are the
# coordinates), using average linkage on their pairwise distances.
HC = hclust(dist(mydata[mydata[,3]==0,1:2]), method="average")

# Number of Class == 0 points being clustered.
sum(mydata[,3]==0)  # 81

# Cluster sizes when the tree is cut into 2 groups. The console output is kept
# as comments so that the snippet can be pasted and run as a script.
table(cutree(HC,2))
#   1  2
#  77  4

# Cluster sizes when the tree is cut into 3 groups.
table(cutree(HC,3))
#   1  2  3
#  72  5  4

# Positions (within the Class == 0 subset, not within mydata) of the members
# of cluster 1 of the 3-group cut -- the 72-point group in the table above.
Grouped3 = which(cutree(HC,3) == 1)

将树切成2组可得到一个含77个点的组(太多)。将树切成3组,可得到一个含72个点的组(略低于90%)。我选择使用含72个点的这一组。您也可以做出其他选择。

现在我们有了要封闭的点,将它们绘制出来并将它们封闭在它们的凸包中。

# Redraw the full scatter plot; symbol and colour are both 2 + Class.
plot(mydata[, 1:2], pch = 2 + mydata[, 3], col = 2 + mydata[, 3])

# Coordinates of the Class == 0 points that belong to the selected cluster.
# Grouped3 indexes into the Class == 0 subset, hence the two-step lookup.
clusterPts = mydata[which(mydata[, 3] == 0)[Grouped3], 1:2]

# Highlight the selected points with solid red dots.
points(clusterPts, pch = 20, col = "red")

# chull() returns the row positions (within clusterPts) of the hull vertices.
CH3 = chull(clusterPts)

# Shade the convex hull with a translucent red fill.
polygon(clusterPts[CH3, ], col = "#FF000033")

凸包

我在聚类中使用了average(平均)方法来度量簇间距离。您可以尝试其他方法(complete、single)。您也可以尝试使用不同数量的组。但这为生成多边形提供了一个系统的框架。

您还询问了多边形内有多少个红色和绿色点。mgcv程序包提供了in.out函数,用于判断点是否位于多边形内。在这种情况下,我们得到

library(mgcv)

# Hull vertices: Class == 0 rows -> selected cluster members -> hull corners.
redRows = which(mydata[, 3] == 0)
boundary = mydata[redRows[Grouped3][CH3], 1:2]

# Coordinates of every Class == 1 point.
greenPts = mydata[which(mydata[, 3] == 1), 1:2]

# in.out() is called with the polygon and the points to test; summing its
# result counts the Class == 1 points that fall inside the hull.
sum(in.out(boundary, greenPts))       # 269 points

因此,多边形内有269个绿点。

本文收集自互联网,转载请注明来源。

如有侵权,请联系[email protected] 删除。

编辑于
0

我来说两句

0条评论
登录后参与评论

相关文章

来自分类Dev

提取二维二进制数组的索引

来自分类Dev

如何通过使用二进制搜索C ++从二维数组输出数据?

来自分类Dev

传感器数据的二进制分类

来自分类Dev

朴素贝叶斯分类器与二进制数据

来自分类Dev

无效的操作数到二进制/二维数组

来自分类Dev

如何在二维数组中应用二进制搜索?

来自分类Dev

如何改组二维二进制矩阵,保留边际分布

来自分类Dev

在Python中生成二进制数的二维数组

来自分类Dev

TensorFlow用于二进制分类

来自分类Dev

使用Pandas进行二元分类

来自分类Dev

Git和二进制数据,最佳存储方法

来自分类Dev

表示二进制数据的最佳方法

来自分类Dev

表示二进制数据的最佳方法

来自分类Dev

在二维图像(二进制)中找到x轴上像素值为1的最大坐标点(二进制)(Matlab)

来自分类Dev

在二进制分类中使用Lasso回归找到最佳特征

来自分类Dev

如何从伪随机二进制序列生成如下图所示的二维周期模式?

来自分类Dev

分割二进制数据块

来自分类Dev

如何发送二进制数据?

来自分类Dev

AngularJS,管理二进制数据

来自分类Dev

二进制数据的Python结构

来自分类Dev

读取二进制数据

来自分类Dev

响应二进制数据下载

来自分类Dev

R写二进制数据

来自分类Dev

聚类二进制数据

来自分类Dev

读取二进制数据问题

来自分类Dev

二进制数据转换缓慢

来自分类Dev

MatLab中的二进制数据

来自分类Dev

Java解析二进制数据

来自分类Dev

二进制数据读取