RANSAC和R2,为什么r2分数是负数?

数据挖掘 机器学习 回归 对数
2022-02-20 23:36:54

我正在尝试使用curve_fit、RANSAC 和尝试学习基础知识的东西,但有一件事我不明白。

为什么这里的 R2 得分为负?

import numpy as np
import warnings 
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.base import BaseEstimator
from sklearn.linear_model import RANSACRegressor
from scipy.optimize import OptimizeWarning 
from scipy.optimize import curve_fit


class LogarithmicRegression(BaseEstimator):
    def __init__(self, log_base=np.log):
        self.__log_base = log_base

    def __log_expr(self, x, a, b, c):
        with warnings.catch_warnings(): 
            warnings.simplefilter("ignore", RuntimeWarning) 
            return a * self.__log_base(x+c) + b
    
    def get_params(self, deep=False):
        # https://scikit-learn.org/stable/developers/develop.html#get-params-and-set-params
        return {"log_base": self.__log_base}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
    
    def fit(self, X, y): 
        self.coef, _ = curve_fit(self.__log_expr, X.flatten(), y,  maxfev=10000, 
                                 bounds=( (-np.inf, -np.inf, -np.inf),(np.inf, np.inf, np.inf) ))
        return self
    
    def predict(self, X):
        hypothesis = self.__log_expr(X, *self.coef)
        return hypothesis.flatten()
    
    def score(self, X_test, y_test):
        from sklearn.metrics import r2_score
        self.accuracy = r2_score(X_test, y_test)
        return self.accuracy


np.random.seed(543)

n_sample = 100

dataX = np.array(range(1, n_sample+1))
dataY = 2.5 * np.log(dataX) + 7

noise = np.random.normal(np.mean(dataY), 2, n_sample)
add_noise = np.random.choice(a=[False, True], size=n_sample)

for i in range(n_sample):
    if add_noise[i]:
        dataY[i] = noise[i]

plt.style.use("dark_background")
plt.rcParams["figure.figsize"] = (8,6)
plt.grid(False)
#plt.scatter(dataX, dataY, color='white')

X = dataX.reshape(-1, 1)
y = dataY


ransac = RANSACRegressor(base_estimator=LogarithmicRegression(),
                         min_samples=int(n_sample/4),
                         residual_threshold=0.7)
ransac.fit(X, y)

inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

plt.scatter(X[inlier_mask], y[inlier_mask], color='yellowgreen', marker='.', label='Inliers')
plt.scatter(X[outlier_mask], y[outlier_mask], color='r', marker='.', label='Outliers')

lineX = np.arange(X.min(), X.max())[:, np.newaxis]
lineY = ransac.predict(lineX)

print("Estimated coefficients", ransac.estimator_.coef)
print("Accuracy", ransac.estimator_.accuracy)

plt.plot(lineX, lineY, color='yellow', linewidth=2, label='RANSAC regressor')

plt.show()

在此处输入图像描述

1个回答

在阅读了 RANSAC 源代码后,我想我误解了 score 函数的作用。这是我必须做的:

def score(self, X_subset, y_subset):
        from sklearn.metrics import r2_score
        y_pred = self.predict(X_subset)
        self.accuracy = r2_score(y_pred, y_subset)
        return self.accuracy