注意
前往末尾下载完整的示例代码。
Dataframe 作为输入¶
通常,一个管道会以矩阵形式接收数据。如果所有数据都具有相同的类型,则可以将其转换为矩阵。但是,数据框(dataframe)中的数据通常具有多种类型,例如浮点型、整型,或用于表示类别的字符串型。ONNX 也支持这种情况。
包含类别的数据集¶
import numpy
import pprint
from onnxruntime import InferenceSession
from pandas import DataFrame
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from skl2onnx import to_onnx
from skl2onnx.algebra.type_helper import guess_initial_types
# Toy dataset: two string-valued categorical columns (CAT1, CAT2), two
# numeric columns (num1, num2) and a binary target column ``y``.
data = DataFrame(
    [
        {"CAT1": "a", "CAT2": "c", "num1": 0.5, "num2": 0.6, "y": 0},
        {"CAT1": "b", "CAT2": "d", "num1": 0.4, "num2": 0.8, "y": 1},
        {"CAT1": "a", "CAT2": "d", "num1": 0.5, "num2": 0.56, "y": 0},
        {"CAT1": "a", "CAT2": "d", "num1": 0.55, "num2": 0.56, "y": 1},
        {"CAT1": "a", "CAT2": "c", "num1": 0.35, "num2": 0.86, "y": 0},
        {"CAT1": "a", "CAT2": "c", "num1": 0.5, "num2": 0.68, "y": 1},
    ]
)

cat_cols = ["CAT1", "CAT2"]
# Feature frame: every column except the target.
train_data = data.drop(columns="y")

# The categorical columns go through a dense one-hot encoder configured with
# handle_unknown="ignore".
categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))]
)
# Only ``cat_cols`` are transformed; the remaining columns are passed through.
preprocessor = ColumnTransformer(
    transformers=[("cat", categorical_transformer, cat_cols)],
    remainder="passthrough",
)

# Full model: preprocessing followed by a random forest, fitted on the toy data.
pipe = Pipeline(steps=[("preprocess", preprocessor), ("rf", RandomForestClassifier())])
pipe.fit(train_data, data["y"])
转换为 ONNX¶
函数 to_onnx 不支持数据框。
# Convert the fitted pipeline to ONNX. The first training row is passed as
# sample data, and the "zipmap" option is switched off for the classifier.
onx = to_onnx(pipe, train_data[:1], options={RandomForestClassifier: {"zipmap": False}})
使用 ONNX 进行预测¶
onnxruntime 不支持数据框。
# Load the serialized model into an onnxruntime session restricted to the
# CPU execution provider.
sess = InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])
# Feeding the dataframe object itself to ``run`` fails; the exception is
# caught on purpose so that its message can be displayed.
try:
    sess.run(None, train_data)
except Exception as e:
    print(e)
# Unhide conversion logic with a dataframe
# ++++++++++++++++++++++++++++++++++++++++
#
# A dataframe can be seen as a set of columns with
# different types. That's what ONNX should see:
# a list of inputs, the input name is the column name,
# the input type is the column type.
def guess_schema_from_data(X):
    """Infer the ONNX input schema for *X*, merging columns when possible.

    The per-input schema is obtained from ``guess_initial_types``. When every
    entry is a two-dimensional tensor type whose first dimension is ``None``
    and all entries share the same tensor class, they are collapsed into a
    single input named ``"X"`` whose second dimension is the sum of the
    individual second dimensions. In every other case, including an empty
    schema, the schema from ``guess_initial_types`` is returned unchanged.

    :param X: data passed as-is to ``guess_initial_types``
    :return: list of ``(input_name, tensor_type)`` pairs
    """
    init = guess_initial_types(X)
    tensor_classes = set()
    for _name, tensor_type in init:
        dims = tensor_type.shape
        # Merging only makes sense for [None, width] inputs.
        if len(dims) != 2 or dims[0] is not None:
            return init
        # Inputs of different tensor classes cannot share a single matrix.
        if tensor_classes and tensor_type.__class__ not in tensor_classes:
            return init
        tensor_classes.add(tensor_type.__class__)
    if not tensor_classes:
        # Empty schema: nothing to merge (previously raised IndexError).
        return init
    # At this point the set holds exactly one class, shared by all inputs.
    (merged_class,) = tensor_classes
    total_width = sum(tensor_type.shape[1] for _name, tensor_type in init)
    return [("X", merged_class([None, total_width]))]
# Infer the ONNX input schema from the training dataframe and display it.
init = guess_schema_from_data(train_data)
pprint.pprint(init)
run(): incompatible function arguments. The following argument types are supported:
1. (self: onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession, arg0: list[str], arg1: dict[str, object], arg2: onnxruntime.capi.onnxruntime_pybind11_state.RunOptions) -> list
Invoked with: <onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession object at 0x74146894e1f0>, ['label', 'probabilities'], CAT1 CAT2 num1 num2
0 a c 0.50 0.60
1 b d 0.40 0.80
2 a d 0.50 0.56
3 a d 0.55 0.56
4 a c 0.35 0.86
5 a c 0.50 0.68, None
[('CAT1', StringTensorType(shape=[None, 1])),
('CAT2', StringTensorType(shape=[None, 1])),
('num1', DoubleTensorType(shape=[None, 1])),
('num2', DoubleTensorType(shape=[None, 1]))]
让我们使用浮点数代替。
# Cast every non-categorical column to float32, then recompute and display
# the inferred schema.
for c in train_data.columns:
    if c not in cat_cols:
        train_data[c] = train_data[c].astype(numpy.float32)
init = guess_schema_from_data(train_data)
pprint.pprint(init)
[('CAT1', StringTensorType(shape=[None, 1])),
('CAT2', StringTensorType(shape=[None, 1])),
('num1', FloatTensorType(shape=[None, 1])),
('num2', FloatTensorType(shape=[None, 1]))]
让我们只使用 skl2onnx 进行转换。
# Convert the pipeline again, this time describing the inputs explicitly
# through ``initial_types`` instead of passing sample data.
onx2 = to_onnx(
pipe, initial_types=init, options={RandomForestClassifier: {"zipmap": False}}
)
让我们使用 onnxruntime 运行它。我们需要将数据框转换为字典,其中列名成为键,列值成为值。
# Build the feed dictionary: one entry per dataframe column, keyed by the
# column name, with the values reshaped into a single-column 2-D array.
inputs = {c: train_data[c].values.reshape((-1, 1)) for c in train_data.columns}
pprint.pprint(inputs)
{'CAT1': array([['a'],
['b'],
['a'],
['a'],
['a'],
['a']], dtype=object),
'CAT2': array([['c'],
['d'],
['d'],
['d'],
['c'],
['c']], dtype=object),
'num1': array([[0.5 ],
[0.4 ],
[0.5 ],
[0.55],
[0.35],
[0.5 ]], dtype=float32),
'num2': array([[0.6 ],
[0.8 ],
[0.56],
[0.56],
[0.86],
[0.68]], dtype=float32)}
推理。
# Run the second model with the per-column feed and print the scikit-learn
# predictions next to the first ONNX output for comparison.
sess2 = InferenceSession(onx2.SerializeToString(), providers=["CPUExecutionProvider"])
got2 = sess2.run(None, inputs)
print(pipe.predict(train_data))
print(got2[0])
[0 1 0 1 0 1]
[0 1 0 1 0 1]
以及概率。
# Print the scikit-learn class probabilities next to the second ONNX output.
print(pipe.predict_proba(train_data))
print(got2[1])
[[0.78 0.22]
[0.27 0.73]
[0.71 0.29]
[0.2 0.8 ]
[0.76 0.24]
[0.36 0.64]]
[[0.78 0.22000003]
[0.2700004 0.7299996 ]
[0.71000004 0.29 ]
[0.20000046 0.79999954]
[0.76 0.24000004]
[0.3600003 0.6399997 ]]
脚本总运行时间: (0 分钟 0.403 秒)