转换包含 XGBoost 模型的流水线¶

sklearn-onnx 只将 scikit-learn 模型转换为 ONNX，但许多库实现了 scikit-learn API，以便它们的模型可以包含在 scikit-learn 流水线中。本示例考虑了一个包含 XGBoost 模型的流水线。只要 sklearn-onnx 知道与 XGBClassifier 相关联的转换器，它就可以转换整个流水线。让我们看看如何实现。

训练一个 XGBoost 分类器¶

import os
import numpy
import matplotlib.pyplot as plt
import onnx
from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer
import onnxruntime as rt
import sklearn
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import xgboost
from xgboost import XGBClassifier
import skl2onnx
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.common.shape_calculator import (
    calculate_linear_classifier_output_shapes,
)
import onnxmltools
from onnxmltools.convert.xgboost.operator_converters.XGBoost import (
    convert_xgboost,
)
import onnxmltools.convert.common.data_types

# Load the iris dataset and keep only its first two feature columns.
data = load_iris()
X = data.data[:, :2]
y = data.target

# Shuffle the rows in place; the same permutation is applied to the
# features and to the labels so that they stay aligned.
ind = numpy.arange(X.shape[0])
numpy.random.shuffle(ind)
X = X[ind, :].copy()
y = y[ind].copy()

# Scale the features, then fit an XGBoost classifier with n_estimators=3.
# The step is labelled "xgb" so that the label matches the estimator it holds.
pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("xgb", XGBClassifier(n_estimators=3)),
    ]
)
pipe.fit(X, y)

# At this point no converter has been registered for XGBClassifier, so this
# first conversion attempt is expected to fail. The exception is caught and
# only its message is printed so that the script can continue.
try:
    unregistered_inputs = [("input", FloatTensorType([None, 2]))]
    unregistered_opsets = {"": 12, "ai.onnx.ml": 2}
    convert_sklearn(
        pipe,
        name="pipeline_xgboost",
        initial_types=unregistered_inputs,
        target_opset=unregistered_opsets,
    )
except Exception as exc:
    print(exc)

# The intended error message says that no converter was found for XGBoost
# models. By default, *sklearn-onnx* only handles models from *scikit-learn*,
# but it can be extended to any model following the *scikit-learn* API as
# long as it knows a converter for every model used in the pipeline.
# That is why a converter must be registered.
#
# NOTE(review): a different message (an AttributeError about
# '__sklearn_tags__') means the failure happened before the converter
# lookup -- presumably the installed xgboost and scikit-learn versions are
# incompatible; confirm against the versions printed at the end.

'super' object has no attribute '__sklearn_tags__'

注册 XGBClassifier 的转换器¶

该转换器实现在 onnxmltools 中：onnxmltools/convert/xgboost/operator_converters/XGBoost.py；对应的形状计算器位于 onnxmltools/convert/xgboost/shape_calculators/Classifier.py。

然后我们导入转换器和形状计算器（在本示例中，这些导入已放在脚本开头的导入部分）。

让我们注册新的转换器。

# Options accepted for this model, each mapped to its list of allowed values.
xgb_converter_options = {
    "nocl": [True, False],
    "zipmap": [True, False, "columns"],
}

# Register XGBClassifier under the alias "XGBoostXGBClassifier", pairing the
# onnxmltools converter with the linear-classifier shape calculator imported
# from skl2onnx.
update_registered_converter(
    XGBClassifier,
    "XGBoostXGBClassifier",
    shape_fct=calculate_linear_classifier_output_shapes,
    convert_fct=convert_xgboost,
    options=xgb_converter_options,
)

再次转换¶

# Convert the whole fitted pipeline; the single input is named "input" and
# declared as a float tensor with two columns and an unspecified row count.
model_onnx = convert_sklearn(
    pipe,
    name="pipeline_xgboost",
    initial_types=[("input", FloatTensorType([None, 2]))],
    target_opset={"": 12, "ai.onnx.ml": 2},
)

# Serialize the converted model and write it to disk in binary mode.
with open("pipeline_xgboost.onnx", "wb") as onnx_file:
    serialized_model = model_onnx.SerializeToString()
    onnx_file.write(serialized_model)

Traceback (most recent call last):
  File "/home/xadupre/github/sklearn-onnx/docs/examples/plot_pipeline_xgboost.py", line 107, in <module>
    model_onnx = convert_sklearn(
                 ^^^^^^^^^^^^^^^^
  File "/home/xadupre/github/sklearn-onnx/skl2onnx/convert.py", line 192, in convert_sklearn
    topology = parse_sklearn_model(
               ^^^^^^^^^^^^^^^^^^^^
  File "/home/xadupre/github/sklearn-onnx/skl2onnx/_parse.py", line 847, in parse_sklearn_model
    outputs = parse_sklearn(
              ^^^^^^^^^^^^^^
  File "/home/xadupre/github/sklearn-onnx/skl2onnx/_parse.py", line 757, in parse_sklearn
    res = _parse_sklearn(scope, model, inputs, custom_parsers=custom_parsers)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/xadupre/github/sklearn-onnx/skl2onnx/_parse.py", line 688, in _parse_sklearn
    outputs = sklearn_parsers_map[tmodel](
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/xadupre/github/sklearn-onnx/skl2onnx/_parse.py", line 295, in _parse_sklearn_pipeline
    ) and is_classifier(step[1]):
          ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/base.py", line 1237, in is_classifier
    return get_tags(estimator).estimator_type == "classifier"
           ^^^^^^^^^^^^^^^^^^^
  File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/utils/_tags.py", line 405, in get_tags
    sklearn_tags_provider[klass] = klass.__sklearn_tags__(estimator)  # type: ignore[attr-defined]
                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/base.py", line 540, in __sklearn_tags__
    tags = super().__sklearn_tags__()
           ^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'super' object has no attribute '__sklearn_tags__'

比较预测结果¶

使用 XGBoost 进行预测。

# Predictions of the fitted pipeline for the first five rows.
pipeline_labels = pipe.predict(X[:5])
print("predict", pipeline_labels)

# Predicted probabilities for the first row only.
pipeline_probabilities = pipe.predict_proba(X[:1])
print("predict_proba", pipeline_probabilities)

使用 onnxruntime 进行预测。

# Load the saved ONNX file with onnxruntime on the CPU execution provider.
sess = rt.InferenceSession(
    "pipeline_xgboost.onnx",
    providers=["CPUExecutionProvider"],
)

# Feed the first five rows cast to float32, matching the FloatTensorType
# declared at conversion time; passing None requests every model output.
onnx_feed = {"input": X[:5].astype(numpy.float32)}
pred_onx = sess.run(None, onnx_feed)

# First output: predictions; second output: probabilities (first row only).
print("predict", pred_onx[0])
print("predict_proba", pred_onx[1][:1])

显示 ONNX 图¶

# Build a pydot representation of the ONNX graph, laid out top to bottom,
# with operator nodes drawn as filled yellow boxes.
pydot_graph = GetPydotGraph(
    model_onnx.graph,
    name=model_onnx.graph.name,
    rankdir="TB",
    node_producer=GetOpNodeProducer(
        "docstring", color="yellow", fillcolor="yellow", style="filled"
    ),
)
pydot_graph.write_dot("pipeline.dot")

# Render the dot file to "pipeline.dot.png" with the Graphviz "dot" program.
# The exit status is checked: otherwise a failed render would either raise
# a confusing error when the image is read below, or silently display a
# stale PNG left over from a previous run.
render_status = os.system("dot -O -Gdpi=300 -Tpng pipeline.dot")
if render_status != 0:
    raise RuntimeError(
        "Graphviz 'dot' failed to render pipeline.dot "
        f"(os.system returned {render_status}); is Graphviz installed?"
    )

# Display the rendered image without axes.
image = plt.imread("pipeline.dot.png")
fig, ax = plt.subplots(figsize=(40, 20))
ax.imshow(image)
ax.axis("off")

本示例使用的版本

# Report the version of every package used by this example. The labels are
# kept exactly as they are printed (some carry a trailing space).
versioned_packages = (
    ("numpy:", numpy),
    ("scikit-learn:", sklearn),
    ("onnx: ", onnx),
    ("onnxruntime: ", rt),
    ("skl2onnx: ", skl2onnx),
    ("onnxmltools: ", onnxmltools),
    ("xgboost: ", xgboost),
)
for package_label, package in versioned_packages:
    print(package_label, package.__version__)

脚本总运行时间： (0 分钟 0.161 秒)

由 Sphinx-Gallery 生成的画廊