Python sklearn - 平均分类报告

数据挖掘 Python scikit-learn
2022-02-09 23:39:37

使用 Python 的 sklearn 模块:

 from sklearn.metrics import classification_report
 # Case 1: the predictions are identical to y1_dev, which is passed in the
 # first (y_true) position of classification_report.
 y1_predict = [0, 1, 1, 0]
 y1_dev = [0, 1, 1, 0]
 report_1 = classification_report(y1_dev, y1_predict)
 # Case 2: the predictions agree with y2_dev at only two of the four positions.
 y2_predict = [1, 0, 1, 0]
 y2_dev = [1, 1, 0, 0]
 report_2 = classification_report(y2_dev, y2_predict)

有没有一种方法可以把 report_1 和 report_2 合并起来(也许只是取平均值)?我正在寻找类似这样的实现:

 # Illustrative only: `average` is the helper being asked for; it is not
 # defined anywhere in this snippet.
 report_average = average(report_1,report_2)

还是必须手动完成?我希望打印 report_average 时,得到的是两个报告对应数值的平均值。

这是已接受答案的 MWE:

    from sklearn.metrics import classification_report
    import pandas as pd
    import numpy as np
    from functools import reduce
    def report_average(*args):
        """Average several text classification reports into one DataFrame.

        Each positional argument is a report string made of three sections
        separated by blank lines: a header line with the column names, one
        row per class (a single label token followed by one number per
        column), and a summary row whose label spans three
        whitespace-separated tokens (e.g. ``avg / total``).

        The class labels themselves are discarded; rows are indexed by
        position, and the last row of the result is relabelled
        ``'avg / total'``. Rows missing from a shorter report count as 0
        in the sum.
        """
        def _tokens(chunk):
            # Collapse every whitespace run to one space, then split on it
            # (an empty chunk therefore yields ['']).
            return ' '.join(chunk.split()).split(' ')

        def _sum_frames(acc, frame):
            return acc.add(frame, fill_value=0)

        frames = []
        for text in args:
            sections = text.split('\n\n')
            columns = _tokens(sections[0])
            width = len(columns)

            # One row per class; column 0 is the label and is thrown away.
            class_cells = np.array(_tokens(sections[1])).reshape(-1, width + 1)
            class_values = np.delete(class_cells, 0, axis=1).astype(float)

            # Skip the three label tokens of the summary row.
            summary_tokens = _tokens(sections[2])[3:]
            summary_values = np.array(summary_tokens).astype(float).reshape(-1, width)

            stacked = np.concatenate((class_values, summary_values))
            frames.append(pd.DataFrame(stacked, columns=columns))

        mean = reduce(_sum_frames, frames) / len(frames)
        last_label = mean.index[-1]
        return mean.rename(index={last_label: 'avg / total'})


    # Case 1: predictions identical to y1_dev (passed in the y_true position).
    y1_predict = [0, 1, 1, 0]
    y1_dev = [0, 1, 1, 0]
    report_1 = classification_report(y1_dev, y1_predict)
    # Case 2: predictions agree with y2_dev at two of the four positions.
    y2_predict = [1, 0, 1, 0]
    y2_dev = [1, 1, 0, 0]
    report_2 = classification_report(y2_dev, y2_predict)

    # Element-wise mean of the two parsed reports, returned as a DataFrame.
    report_ave = report_average(report_1,report_2)

    print(report_ave)

其输出为:

             precision  recall  f1-score  support
0                 0.75    0.75      0.75      2.0
1                 0.75    0.75      0.75      2.0
avg / total       0.75    0.75      0.75      4.0
2个回答

它可能有点复杂,因为我将报告转换为 pandas.DataFrame 进行计算。但我认为这是值得的,因为它也适用于两个或更多报告。试试下面:

import pandas as pd
import numpy as np
from functools import reduce
def report_average(*args):
    """Average several text classification reports into one DataFrame.

    Each positional argument is a report string made of three sections
    separated by blank lines: a header line with the column names, one
    row per class (a single label token followed by one number per
    column), and a summary row whose label spans three
    whitespace-separated tokens (e.g. ``avg / total``).

    The class labels themselves are discarded; rows are indexed by
    position, and the last row of the result is relabelled
    ``'avg / total'``. Rows missing from a shorter report count as 0
    in the sum.
    """
    def _tokens(chunk):
        # Collapse every whitespace run to one space, then split on it
        # (an empty chunk therefore yields ['']).
        return ' '.join(chunk.split()).split(' ')

    def _sum_frames(acc, frame):
        return acc.add(frame, fill_value=0)

    frames = []
    for text in args:
        sections = text.split('\n\n')
        columns = _tokens(sections[0])
        width = len(columns)

        # One row per class; column 0 is the label and is thrown away.
        class_cells = np.array(_tokens(sections[1])).reshape(-1, width + 1)
        class_values = np.delete(class_cells, 0, axis=1).astype(float)

        # Skip the three label tokens of the summary row.
        summary_tokens = _tokens(sections[2])[3:]
        summary_values = np.array(summary_tokens).astype(float).reshape(-1, width)

        stacked = np.concatenate((class_values, summary_values))
        frames.append(pd.DataFrame(stacked, columns=columns))

    mean = reduce(_sum_frames, frames) / len(frames)
    last_label = mean.index[-1]
    return mean.rename(index={last_label: 'avg / total'})

输出:

# Keep the result under its own name: assigning it to `report_average` would
# replace the function with the returned DataFrame and break any later call.
report_ave = report_average(report_1, report_2)
print(report_ave)
             precision  recall  f1-score  support
0                 0.75    0.75      0.75      2.0
1                 0.75    0.75      0.75      2.0
avg / total       0.75    0.75      0.75      4.0

# report_3 is an alias of report_2, so report_2 is counted twice in the mean.
report_3 = report_2
# Store the result under a separate name so that `report_average` keeps
# referring to the function rather than to its return value.
report_ave = report_average(report_1, report_2,report_3)
print(report_ave)
             precision    recall  f1-score  support
0             0.666667  0.666667  0.666667      2.0
1             0.666667  0.666667  0.666667      2.0
avg / total   0.666667  0.666667  0.666667      4.0

当报告以字典形式生成(即调用 classification_report 时传入 output_dict=True)并作为列表传入时,这是另一种方法。它会以字典形式返回平均后的结果。

def report_average(reports):
    """Average a list of dictionary-style classification reports.

    Args:
        reports: Non-empty sequence of report dicts sharing the same layout
            (presumably produced by ``classification_report(...,
            output_dict=True)``). Every top-level entry maps a label to a
            dict of metrics, except ``'accuracy'``, which maps to a single
            number.

    Returns:
        A dict with the same keys as ``reports[0]`` in which every number
        is the arithmetic mean of that number across all reports.

    Raises:
        IndexError: If ``reports`` is empty.
        KeyError: If a later report lacks a label or metric that the first
            report has.
    """
    n_reports = len(reports)
    mean_dict = dict()
    for label in reports[0].keys():
        # Exact comparison on purpose: a substring test such as
        # ``label in 'accuracy'`` would also match class labels like 'a'
        # or 'cc' and then try to sum their metric dicts.
        if label == 'accuracy':
            mean_dict[label] = sum(d[label] for d in reports) / n_reports
            continue

        mean_dict[label] = {
            key: sum(d[label][key] for d in reports) / n_reports
            for key in reports[0][label].keys()
        }

    return mean_dict