pyod.models.iforest.IForest 的转换器¶

此示例回答了 685 号问题（issue 685）。它为模型 pyod.models.iforest.IForest 实现了一个自定义转换器。此示例以《实现一个新的转换器》一文中的示例作为起点。

训练模型¶

首先导入所有需要的模块。pyod 是可选依赖：如果无法导入，IForest 会被设为 None，后续依赖它的步骤将被跳过。

import numpy as np
import pandas as pd
from onnxruntime import InferenceSession
from sklearn.preprocessing import MinMaxScaler
from skl2onnx.proto import onnx_proto
from skl2onnx.common.data_types import (
    FloatTensorType,
    Int64TensorType,
    guess_numpy_type,
)
from skl2onnx import to_onnx, update_registered_converter, get_model_alias
from skl2onnx.algebra.onnx_ops import (
    OnnxIdentity,
    OnnxMul,
    OnnxLess,
    OnnxConcat,
    OnnxCast,
    OnnxAdd,
    OnnxClip,
)
from skl2onnx.algebra.onnx_operator import OnnxSubEstimator

# pyod is treated as an optional dependency: when the import fails, the error
# is reported and IForest is set to None so that every later cell guarded by
# `IForest is not None` is skipped.
try:
    from pyod.models.iforest import IForest
except (ValueError, ImportError) as e:
    print("Unable to import pyod:", e)
    IForest = None

if IForest is not None:
    # Toy dataset: one numeric column and one categorical column.
    data1 = {
        "First": [500, 500, 400, 100, 200, 300, 100],
        "Second": ["a", "b", "a", "b", "a", "b", "c"],
    }
    df1 = pd.DataFrame(data1, columns=["First", "Second"])

    # One-hot encode the categorical column, then rescale every feature.
    dumdf1 = pd.get_dummies(df1)
    scaler = MinMaxScaler()
    sc_data = scaler.partial_fit(dumdf1).transform(dumdf1)

    # Train the pyod detector on the rescaled features.
    model1 = IForest(
        n_estimators=10,
        bootstrap=True,
        behaviour="new",
        contamination=0.1,
        random_state=np.random.RandomState(42),
        verbose=1,
        n_jobs=-1,
    ).fit(sc_data)

    # The ONNX graph takes one float tensor with one column per feature and
    # an unknown number of rows.
    feature_names2 = dumdf1.columns
    initial_type = [
        ("float_input", FloatTensorType([None, len(feature_names2)])),
    ]

Unable to import pyod: No module named 'pyod'

我们检查转换是否按预期失败。

if IForest is not None:
    # The custom converter is only registered further down, so this call is
    # expected to raise; the message is printed rather than propagated.
    try:
        to_onnx(model1, initial_types=initial_type)
    except Exception as e:
        print(e)

自定义转换器¶

首先是解析器和形状计算器。解析器定义了输出的数量和它们的类型。形状计算器定义了它们的维度。

def pyod_iforest_parser(scope, model, inputs, custom_parsers=None):
    """Declare the operator and the two output variables for *model*.

    Args:
        scope: scope used to declare the local operator and its variables.
        model: the fitted model being converted.
        inputs: input variables; only the first one is attached to the operator.
        custom_parsers: accepted for signature compatibility, not used here.

    Returns:
        The operator's output list: an int64 ``label`` variable followed by a
        ``probability`` variable of the same tensor class as the first input.
    """
    operator = scope.declare_local_operator(get_model_alias(type(model)), model)

    # Single input.
    first_input = inputs[0]
    operator.inputs.append(first_input)

    # Two outputs: the probabilities reuse the tensor class of the input.
    proba_type = first_input.type.__class__
    label_var = scope.declare_local_variable("label", Int64TensorType())
    proba_var = scope.declare_local_variable("probability", proba_type())
    for variable in (label_var, proba_var):
        operator.outputs.append(variable)

    return operator.outputs


def pyod_iforest_shape_calculator(operator):
    """Set the output shapes from the first dimension of the first input.

    The first output gets shape ``[N, 1]`` and the second ``[N, 2]``, where
    ``N`` is whatever ``get_first_dimension()`` returns for the first input.

    Args:
        operator: operator whose ``outputs[0]`` and ``outputs[1]`` are updated
            in place.
    """
    n_rows = operator.inputs[0].get_first_dimension()
    # Output 0 has one column, output 1 has two columns.
    for position, n_columns in enumerate((1, 2)):
        operator.outputs[position].type.shape = [n_rows, n_columns]

然后是转换器。

def pyod_iforest_converter(scope, operator, container):
    """Add the ONNX nodes for a fitted ``pyod.models.iforest.IForest``.

    The wrapped estimator ``detector_`` is converted as a sub-estimator and
    its second output (the scores) is post-processed into the two outputs of
    the operator:

    * ``operator.outputs[0]``: int64 labels, 1 where the sample is flagged;
    * ``operator.outputs[1]``: two columns ``[1 - p, p]`` where ``p`` is the
      negated score rescaled by a ``MinMaxScaler`` fitted on
      ``decision_scores_`` and clipped to ``[0, 1]``.

    Args:
        scope: scope the generated nodes are added to.
        operator: operator being converted; ``raw_operator`` is the fitted
            model, ``inputs[0]`` the unique input.
        container: container receiving the nodes; provides ``target_opset``.
    """
    op = operator.raw_operator
    opv = container.target_opset
    out = operator.outputs

    # We retrieve the unique input.
    X = operator.inputs[0]

    # In most cases, computation happens in floats.
    # But it might be with double. ONNX is very strict
    # about types, every constant should have the same
    # type as the input.
    dtype = guess_numpy_type(X.type)

    detector = op.detector_  # Should be IForest from scikit-learn.
    lab_pred = OnnxSubEstimator(detector, X, op_version=opv)
    scores = OnnxIdentity(lab_pred[1], op_version=opv)

    # labels
    # pyod works with negated scores (see the multiplication by -1 below):
    # per the pyod documentation, threshold_ is derived from decision_scores_,
    # i.e. on that negated scale, and a sample is flagged when its pyod score
    # is strictly greater than threshold_. Expressed on the raw scores of the
    # wrapped detector this is ``scores < -threshold_``.
    threshold = op.threshold_
    above = OnnxLess(scores, np.array([-threshold], dtype=dtype), op_version=opv)
    labels = OnnxCast(
        above, op_version=opv, to=onnx_proto.TensorProto.INT64, output_names=out[:1]
    )

    # probabilities
    # The scaler is refitted on the training scores so that its scale_ and
    # min_ can be embedded as constants in the graph.
    train_scores = op.decision_scores_
    scaler = MinMaxScaler().fit(train_scores.reshape(-1, 1))
    scores_ = OnnxMul(scores, np.array([-1], dtype=dtype), op_version=opv)

    # MinMaxScaler.transform is ``x * scale_ + min_``.
    scaled = OnnxMul(scores_, scaler.scale_.astype(dtype), op_version=opv)
    scaled_centered = OnnxAdd(scaled, scaler.min_.astype(dtype), op_version=opv)
    clipped = OnnxClip(
        scaled_centered,
        np.array([0], dtype=dtype),
        np.array([1], dtype=dtype),
        op_version=opv,
    )
    # Complement of the clipped value: 1 - clipped.
    clipped_ = OnnxAdd(
        OnnxMul(clipped, np.array([-1], dtype=dtype), op_version=opv),
        np.array([1], dtype=dtype),
        op_version=opv,
    )

    scores_2d = OnnxConcat(
        clipped_, clipped, axis=1, op_version=opv, output_names=out[1:]
    )

    labels.add_to(scope, container)
    scores_2d.add_to(scope, container)

最后是注册。

if IForest is not None:
    # Register the parser, shape calculator and converter defined above for
    # the IForest class under the alias "PyodIForest".
    update_registered_converter(
        IForest,
        "PyodIForest",
        pyod_iforest_shape_calculator,
        pyod_iforest_converter,
        parser=pyod_iforest_parser,
    )

以及转换。

if IForest is not None:
    # Convert the fitted model; the opsets are pinned to 14 for the default
    # domain ("") and 2 for "ai.onnx.ml".
    onx = to_onnx(
        model1, initial_types=initial_type, target_opset={"": 14, "ai.onnx.ml": 2}
    )

检查差异¶

if IForest is not None:
    # The same float32 array is given to both pyod and onnxruntime.
    data = sc_data.astype(np.float32)

    # Reference predictions from the original model.
    expected_labels = model1.predict(data)
    expected_proba = model1.predict_proba(data)

    # Predictions from the converted model.
    sess = InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])
    res = sess.run(None, {"float_input": data})
    onx_labels, onx_proba = res[0], res[1]

    # Largest absolute difference for each output, on flattened arrays.
    diff_labels = np.abs(onx_labels.ravel() - expected_labels.ravel()).max()
    diff_proba = np.abs(onx_proba.ravel() - expected_proba.ravel()).max()

    print("dicrepencies:", diff_labels, diff_proba)

    print("ONNX labels", onx_labels)
    print("ONNX probabilities", onx_proba)

脚本总运行时间： (0 分钟 0.011 秒)

Sphinx-Gallery 生成的图库