pyod.models.iforest.IForest 的转换器

此示例回答了问题 685。它为模型 pyod.models.iforest.IForest 实现了一个自定义转换器。此示例以“实现新的转换器”一文作为起点。

训练模型

所有导入。它还为 xgboost 和 lightgbm 注册了 onnx 转换器。

import numpy as np
import pandas as pd
from onnxruntime import InferenceSession
from sklearn.preprocessing import MinMaxScaler
from skl2onnx.proto import onnx_proto
from skl2onnx.common.data_types import (
    FloatTensorType,
    Int64TensorType,
    guess_numpy_type,
)
from skl2onnx import to_onnx, update_registered_converter, get_model_alias
from skl2onnx.algebra.onnx_ops import (
    OnnxIdentity,
    OnnxMul,
    OnnxLess,
    OnnxConcat,
    OnnxCast,
    OnnxAdd,
    OnnxClip,
)
from skl2onnx.algebra.onnx_operator import OnnxSubEstimator

# pyod is treated as an optional dependency: when the import fails, IForest is
# set to None and every later `if IForest is not None:` section is skipped.
try:
    from pyod.models.iforest import IForest
except (ValueError, ImportError) as e:
    # NOTE(review): ValueError is caught alongside ImportError, presumably to
    # cover binary-incompatibility errors raised while importing - confirm.
    print("Unable to import pyod:", e)
    IForest = None

if IForest is not None:
    # Toy dataset: one numeric column and one categorical column.
    data1 = {
        "First": [500, 500, 400, 100, 200, 300, 100],
        "Second": ["a", "b", "a", "b", "a", "b", "c"],
    }
    df1 = pd.DataFrame(data1, columns=["First", "Second"])

    # Expand the categorical column into dummy columns, then rescale every
    # column with a MinMaxScaler fitted on that same data.
    dumdf1 = pd.get_dummies(df1)
    scaler = MinMaxScaler()
    scaler.partial_fit(dumdf1)
    sc_data = scaler.transform(dumdf1)

    # Hyper-parameters of the pyod isolation forest trained on the scaled data.
    iforest_params = dict(
        n_estimators=10,
        bootstrap=True,
        behaviour="new",
        contamination=0.1,
        random_state=np.random.RandomState(42),
        verbose=1,
        n_jobs=-1,
    )
    model1 = IForest(**iforest_params).fit(sc_data)

    # The ONNX graph takes one float tensor: an unspecified number of rows
    # and one column per dummy-encoded feature.
    feature_names2 = dumdf1.columns
    n_features = len(feature_names2)
    initial_type = [("float_input", FloatTensorType([None, n_features]))]
Unable to import pyod: No module named 'pyod'

我们检查转换是否如预期的那样失败。

if IForest is not None:
    # At this point no converter has been registered for pyod's IForest, so
    # the conversion is expected to raise; the message is simply displayed.
    try:
        to_onnx(model1, initial_types=initial_type)
    except Exception as conversion_error:
        print(conversion_error)

自定义转换器

首先是解析器和形状计算器。解析器定义输出的数量及其类型。形状计算器定义其维度。

def pyod_iforest_parser(scope, model, inputs, custom_parsers=None):
    """Declare the operator, its input and its two outputs for *model*.

    The operator is declared in *scope* under the alias registered for the
    model's class. It consumes the first entry of *inputs* and produces two
    variables, in this order:

    * ``label``: an ``Int64TensorType``;
    * ``probability``: a tensor of the same type class as the input.

    *custom_parsers* is accepted but not used by this parser.

    Returns the list of outputs of the declared operator.
    """
    first_input = inputs[0]

    new_operator = scope.declare_local_operator(
        get_model_alias(type(model)), model
    )
    new_operator.inputs.append(first_input)

    # The probability output reuses the tensor class of the input.
    input_tensor_class = first_input.type.__class__
    label_variable = scope.declare_local_variable("label", Int64TensorType())
    proba_variable = scope.declare_local_variable(
        "probability", input_tensor_class()
    )
    for output_variable in (label_variable, proba_variable):
        new_operator.outputs.append(output_variable)

    return new_operator.outputs


def pyod_iforest_shape_calculator(operator):
    """Set the shapes of the operator's two outputs from its first input.

    With ``N`` the first dimension of the first input, the first output gets
    shape ``[N, 1]`` and the second output gets shape ``[N, 2]``.
    Nothing is returned; the output types are updated in place.
    """
    n_rows = operator.inputs[0].get_first_dimension()
    # Column count of each output, in output order.
    for position, n_columns in enumerate((1, 2)):
        operator.outputs[position].type.shape = [n_rows, n_columns]

然后是转换器。

def pyod_iforest_converter(scope, operator, container):
    """Add the ONNX nodes computing labels and probabilities for an IForest.

    The wrapped detector (``detector_``) is converted as a sub-estimator and
    its second output is used as the raw score. Two graph outputs are built:

    * ``operator.outputs[0]``: int64 labels obtained by thresholding the score;
    * ``operator.outputs[1]``: two columns ``[1 - p, p]`` where ``p`` is the
      negated score rescaled with a ``MinMaxScaler`` fitted on the training
      scores (``decision_scores_``) and clipped to ``[0, 1]``.

    Parameters are the usual skl2onnx converter arguments: the *scope*, the
    *operator* holding the fitted model and its inputs/outputs, and the
    *container* the nodes are added to. Nothing is returned.
    """
    op = operator.raw_operator
    opv = container.target_opset
    out = operator.outputs

    # We retrieve the unique input.
    X = operator.inputs[0]

    # In most cases, computation happens in floats.
    # But it might be with double. ONNX is very strict
    # about types, every constant should have the same
    # type as the input.
    dtype = guess_numpy_type(X.type)

    detector = op.detector_  # Should be IForest from scikit-learn.
    lab_pred = OnnxSubEstimator(detector, X, op_version=opv)
    scores = OnnxIdentity(lab_pred[1], op_version=opv)

    # labels
    # ``threshold_`` is expressed on the same scale as ``decision_scores_``,
    # i.e. the *negated* raw score (see the probability branch below, which
    # multiplies the raw score by -1 before applying the scaler fitted on
    # ``decision_scores_``). A sample is therefore an outlier when
    # ``-scores > threshold_``, which is ``scores < -threshold_``.
    threshold = op.threshold_
    above = OnnxLess(scores, np.array([-threshold], dtype=dtype), op_version=opv)
    labels = OnnxCast(
        above, op_version=opv, to=onnx_proto.TensorProto.INT64, output_names=out[:1]
    )

    # probabilities
    train_scores = op.decision_scores_
    scaler = MinMaxScaler().fit(train_scores.reshape(-1, 1))
    scores_ = OnnxMul(scores, np.array([-1], dtype=dtype), op_version=opv)
    # The fitted scaling coefficients are displayed while converting.
    print(scaler.min_)
    print(scaler.scale_)

    # Same affine transform as the fitted scaler: x * scale_ + min_.
    scaled = OnnxMul(scores_, scaler.scale_.astype(dtype), op_version=opv)
    scaled_centered = OnnxAdd(scaled, scaler.min_.astype(dtype), op_version=opv)
    clipped = OnnxClip(
        scaled_centered,
        np.array([0], dtype=dtype),
        np.array([1], dtype=dtype),
        op_version=opv,
    )
    # Complement of the clipped value: 1 - clipped.
    clipped_ = OnnxAdd(
        OnnxMul(clipped, np.array([-1], dtype=dtype), op_version=opv),
        np.array([1], dtype=dtype),
        op_version=opv,
    )

    # Two columns: [1 - p, p].
    scores_2d = OnnxConcat(
        clipped_, clipped, axis=1, op_version=opv, output_names=out[1:]
    )

    labels.add_to(scope, container)
    scores_2d.add_to(scope, container)

最后是注册。

if IForest is not None:
    # Tie the custom parser, shape calculator and converter defined in this
    # file to pyod's IForest, under the alias "PyodIForest".
    update_registered_converter(
        IForest,
        "PyodIForest",
        shape_fct=pyod_iforest_shape_calculator,
        convert_fct=pyod_iforest_converter,
        parser=pyod_iforest_parser,
    )

以及转换。

if IForest is not None:
    # Opset requested for the default domain ("") and for "ai.onnx.ml".
    opsets = {"": 14, "ai.onnx.ml": 2}
    onx = to_onnx(model1, initial_types=initial_type, target_opset=opsets)

检查差异

if IForest is not None:
    # Both the pyod model and the ONNX session receive the same float32 array.
    data = sc_data.astype(np.float32)

    # Reference predictions computed by pyod.
    expected_labels = model1.predict(data)
    expected_proba = model1.predict_proba(data)

    # Predictions computed by onnxruntime from the converted model.
    sess = InferenceSession(
        onx.SerializeToString(), providers=["CPUExecutionProvider"]
    )
    res = sess.run(None, {"float_input": data})
    onx_labels, onx_proba = res[0], res[1]

    # Largest absolute difference between the two sides, on flattened arrays.
    diff_labels = np.max(np.abs(onx_labels.ravel() - expected_labels.ravel()))
    diff_proba = np.max(np.abs(onx_proba.ravel() - expected_proba.ravel()))

    print("dicrepencies:", diff_labels, diff_proba)

    print("ONNX labels", onx_labels)
    print("ONNX probabilities", onx_proba)

脚本总运行时间:(0 分钟 0.008 秒)

示例图库由 Sphinx-Gallery 生成