如果要关注类的异常值,可以执行以下操作:
使用隔离森林
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import plotly.express as px
X, y = load_iris(return_X_y= True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)
out_model = Pipeline([("model", IsolationForest(random_state= 42))]).fit(X_train)
visualizer = Pipeline([("scaler",StandardScaler()),
("decomposer",PCA(n_components= 3)),
("framer", FunctionTransformer(lambda x: pd.DataFrame(x, columns = ["p1","p2","p3"])))]).fit(X_train)
outliers = out_model.predict(X_train)
X3D = visualizer.transform(X_train)
px.scatter_3d(data_frame=X3D, x = "p1", y = "p2", z = "p3", color = y_train, symbol=outliers )
pd.crosstab(index = y_train, columns= outliers, margins= True, normalize= 0 )
输出:
我们可以看到,大约 35% 属于第 2 类的观察值被标记为异常值
这是分析其背后原因的一个很好的起点。
希望能帮助到你