将数据框作为输入

管道通常将数据作为矩阵摄取。如果所有数据共享相同的类型,则可以将其转换为矩阵。但是,保存在数据框中的数据通常具有多种类型,例如浮点数、整数或字符串(用于类别)。ONNX 也支持这种情况。

包含类别的数据集

import numpy
import pprint
from onnxruntime import InferenceSession
from pandas import DataFrame
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from skl2onnx import to_onnx
from skl2onnx.algebra.type_helper import guess_initial_types


data = DataFrame(
    [
        dict(CAT1="a", CAT2="c", num1=0.5, num2=0.6, y=0),
        dict(CAT1="b", CAT2="d", num1=0.4, num2=0.8, y=1),
        dict(CAT1="a", CAT2="d", num1=0.5, num2=0.56, y=0),
        dict(CAT1="a", CAT2="d", num1=0.55, num2=0.56, y=1),
        dict(CAT1="a", CAT2="c", num1=0.35, num2=0.86, y=0),
        dict(CAT1="a", CAT2="c", num1=0.5, num2=0.68, y=1),
    ]
)

cat_cols = ["CAT1", "CAT2"]
train_data = data.drop("y", axis=1)


categorical_transformer = Pipeline(
    [("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))]
)
preprocessor = ColumnTransformer(
    transformers=[("cat", categorical_transformer, cat_cols)], remainder="passthrough"
)
pipe = Pipeline([("preprocess", preprocessor), ("rf", RandomForestClassifier())])
pipe.fit(train_data, data["y"])
Pipeline(steps=[('preprocess',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse_output=False))]),
                                                  ['CAT1', 'CAT2'])])),
                ('rf', RandomForestClassifier())])
在 Jupyter 环境中,请重新运行此单元格以显示 HTML 表示形式或信任笔记本。
在 GitHub 上,HTML 表示形式无法呈现,请尝试使用 nbviewer.org 加载此页面。


转换为 ONNX

函数 to_onnx 不处理数据框。

onx = to_onnx(pipe, train_data[:1], options={RandomForestClassifier: {"zipmap": False}})

使用 ONNX 进行预测

onnxruntime 不支持数据框。

sess = InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])
try:
    sess.run(None, train_data)
except Exception as e:
    print(e)

# Unhide conversion logic with a dataframe
# ++++++++++++++++++++++++++++++++++++++++
#
# A dataframe can be seen as a set of columns with
# different types. That's what ONNX should see:
# a list of inputs, the input name is the column name,
# the input type is the column type.


def guess_schema_from_data(X):
    init = guess_initial_types(X)
    unique = set()
    for _, col in init:
        if len(col.shape) != 2:
            return init
        if col.shape[0] is not None:
            return init
        if len(unique) > 0 and col.__class__ not in unique:
            return init
        unique.add(col.__class__)
    unique = list(unique)
    return [("X", unique[0]([None, sum(_[1].shape[1] for _ in init)]))]


init = guess_schema_from_data(train_data)

pprint.pprint(init)
run(): incompatible function arguments. The following argument types are supported:
    1. (self: onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession, arg0: List[str], arg1: Dict[str, object], arg2: onnxruntime.capi.onnxruntime_pybind11_state.RunOptions) -> List[object]

Invoked with: <onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession object at 0x7f029b60e570>, ['label', 'probabilities'],   CAT1 CAT2  num1  num2
0    a    c  0.50  0.60
1    b    d  0.40  0.80
2    a    d  0.50  0.56
3    a    d  0.55  0.56
4    a    c  0.35  0.86
5    a    c  0.50  0.68, None
[('CAT1', StringTensorType(shape=[None, 1])),
 ('CAT2', StringTensorType(shape=[None, 1])),
 ('num1', DoubleTensorType(shape=[None, 1])),
 ('num2', DoubleTensorType(shape=[None, 1]))]

让我们改用浮点数。

for c in train_data.columns:
    if c not in cat_cols:
        train_data[c] = train_data[c].astype(numpy.float32)


init = guess_schema_from_data(train_data)
pprint.pprint(init)
[('CAT1', StringTensorType(shape=[None, 1])),
 ('CAT2', StringTensorType(shape=[None, 1])),
 ('num1', FloatTensorType(shape=[None, 1])),
 ('num2', FloatTensorType(shape=[None, 1]))]

让我们仅使用 skl2onnx 进行转换。

onx2 = to_onnx(
    pipe, initial_types=init, options={RandomForestClassifier: {"zipmap": False}}
)

让我们使用 onnxruntime 运行它。我们需要将数据框转换为字典,其中列名成为键,列值成为值。

inputs = {c: train_data[c].values.reshape((-1, 1)) for c in train_data.columns}
pprint.pprint(inputs)
{'CAT1': array([['a'],
       ['b'],
       ['a'],
       ['a'],
       ['a'],
       ['a']], dtype=object),
 'CAT2': array([['c'],
       ['d'],
       ['d'],
       ['d'],
       ['c'],
       ['c']], dtype=object),
 'num1': array([[0.5 ],
       [0.4 ],
       [0.5 ],
       [0.55],
       [0.35],
       [0.5 ]], dtype=float32),
 'num2': array([[0.6 ],
       [0.8 ],
       [0.56],
       [0.56],
       [0.86],
       [0.68]], dtype=float32)}

推理。

sess2 = InferenceSession(onx2.SerializeToString(), providers=["CPUExecutionProvider"])

got2 = sess2.run(None, inputs)

print(pipe.predict(train_data))
print(got2[0])
[0 1 0 1 0 1]
[0 1 0 1 0 1]

以及概率。

print(pipe.predict_proba(train_data))
print(got2[1])
[[0.82 0.18]
 [0.26 0.74]
 [0.76 0.24]
 [0.37 0.63]
 [0.75 0.25]
 [0.29 0.71]]
[[0.82       0.18      ]
 [0.2600004  0.7399996 ]
 [0.76       0.24000004]
 [0.3700003  0.6299997 ]
 [0.75       0.25000003]
 [0.29000038 0.7099996 ]]

脚本总运行时间:(0 分钟 0.412 秒)

由 Sphinx-Gallery 生成的库