注意
转到末尾下载完整示例代码。
Dataframe 作为输入¶
流水线通常以矩阵形式摄取数据。如果所有数据类型相同,则可以将其转换为矩阵。但保存在 dataframe 中的数据通常具有多种类型,例如 float、integer 或用于类别的 string。ONNX 也支持这种情况。
包含类别的数据集¶
import numpy
import pprint
from onnxruntime import InferenceSession
from pandas import DataFrame
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from skl2onnx import to_onnx
from skl2onnx.algebra.type_helper import guess_initial_types
# Toy dataset: two categorical columns (CAT1, CAT2), two numerical columns
# (num1, num2) and a binary target y.
data = DataFrame(
    [
        ("a", "c", 0.5, 0.6, 0),
        ("b", "d", 0.4, 0.8, 1),
        ("a", "d", 0.5, 0.56, 0),
        ("a", "d", 0.55, 0.56, 1),
        ("a", "c", 0.35, 0.86, 0),
        ("a", "c", 0.5, 0.68, 1),
    ],
    columns=["CAT1", "CAT2", "num1", "num2", "y"],
)

cat_cols = ["CAT1", "CAT2"]
# Features only: the target column is dropped.
train_data = data.drop(columns="y")

# The categorical columns are one-hot encoded; the remaining columns are
# forwarded as they are (remainder="passthrough").
categorical_transformer = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[("cat", categorical_transformer, cat_cols)],
    remainder="passthrough",
)

# Preprocessing followed by a random forest, trained on the six rows above.
pipe = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("rf", RandomForestClassifier()),
    ]
)
pipe.fit(train_data, data["y"])
转换为 ONNX¶
函数 *to_onnx* 不会把 dataframe 当作单个矩阵来处理:这里传入 dataframe 的第一行,只是为了让它推断各个输入的名称和类型。
# Convert the fitted pipeline. Only the first row of the dataframe is passed,
# and the "zipmap" option is switched off for the random forest.
onx = to_onnx(pipe, train_data[:1], options={RandomForestClassifier: {"zipmap": False}})
使用 ONNX 进行预测¶
onnxruntime 不支持 dataframe。
# Load the converted model with onnxruntime on the CPU provider.
sess = InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])

# Passing the dataframe itself is expected to fail. The error is printed
# rather than raised so that the rest of the script still runs.
try:
    sess.run(None, train_data)
except Exception as e:
    print(e)
# Unhide conversion logic with a dataframe
# ++++++++++++++++++++++++++++++++++++++++
#
# A dataframe can be seen as a set of columns with
# different types. That's what ONNX should see:
# a list of inputs, the input name is the column name,
# the input type is the column type.
def guess_schema_from_data(X):
    """Guess the ONNX input schema for *X*, merging columns when possible.

    The schema is first obtained from ``guess_initial_types(X)`` as a list
    of ``(name, type)`` pairs. If every type has a two-dimensional shape
    whose first dimension is ``None`` and all types share the same class,
    the columns are merged into a single input named ``"X"`` of that class
    with shape ``[None, total]``, where ``total`` is the sum of the second
    dimensions. In every other case the guessed schema is returned
    unchanged.

    :param X: data whose schema is guessed (forwarded to
        ``guess_initial_types``)
    :return: list of ``(name, type)`` pairs
    """
    init = guess_initial_types(X)
    if not init:
        # Nothing to merge: return the empty schema instead of failing
        # on a lookup of the (non-existent) common type.
        return init

    common_type = None
    total_width = 0
    for _, col in init:
        shape = col.shape
        # Only inputs of shape [None, k] can be concatenated column-wise.
        if len(shape) != 2 or shape[0] is not None:
            return init
        col_type = type(col)
        if common_type is None:
            common_type = col_type
        elif col_type is not common_type:
            # Mixed tensor types: keep one input per column.
            return init
        total_width += shape[1]

    return [("X", common_type([None, total_width]))]
# Schema guessed from the dataframe as it currently stands.
init = guess_schema_from_data(train_data)
pprint.pprint(init)
run(): incompatible function arguments. The following argument types are supported:
1. (self: onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession, arg0: list[str], arg1: dict[str, object], arg2: onnxruntime.capi.onnxruntime_pybind11_state.RunOptions) -> list
Invoked with: <onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession object at 0x7f58cc896630>, ['label', 'probabilities'], CAT1 CAT2 num1 num2
0 a c 0.50 0.60
1 b d 0.40 0.80
2 a d 0.50 0.56
3 a d 0.55 0.56
4 a c 0.35 0.86
5 a c 0.50 0.68, None
[('CAT1', StringTensorType(shape=[None, 1])),
('CAT2', StringTensorType(shape=[None, 1])),
('num1', DoubleTensorType(shape=[None, 1])),
('num2', DoubleTensorType(shape=[None, 1]))]
让我们把数值列改为 float32 类型。
# Cast every non-categorical column to float32, then guess the schema again.
for c in train_data.columns:
    if c in cat_cols:
        continue
    train_data[c] = train_data[c].astype(numpy.float32)

init = guess_schema_from_data(train_data)
pprint.pprint(init)
[('CAT1', StringTensorType(shape=[None, 1])),
('CAT2', StringTensorType(shape=[None, 1])),
('num1', FloatTensorType(shape=[None, 1])),
('num2', FloatTensorType(shape=[None, 1]))]
让我们只用 *skl2onnx* 进行转换。
# This time the input schema is given explicitly through initial_types
# instead of a data sample.
onx2 = to_onnx(
    pipe,
    initial_types=init,
    options={RandomForestClassifier: {"zipmap": False}},
)
让我们使用 onnxruntime 运行它。我们需要将 dataframe 转换为一个字典,其中列名成为键,列值成为值。
# One entry per column: the column name is the key, and the value is the
# column's data reshaped into a single-column 2-D array.
inputs = {
    name: column.values.reshape((-1, 1)) for name, column in train_data.items()
}
pprint.pprint(inputs)
{'CAT1': array([['a'],
['b'],
['a'],
['a'],
['a'],
['a']], dtype=object),
'CAT2': array([['c'],
['d'],
['d'],
['d'],
['c'],
['c']], dtype=object),
'num1': array([[0.5 ],
[0.4 ],
[0.5 ],
[0.55],
[0.35],
[0.5 ]], dtype=float32),
'num2': array([[0.6 ],
[0.8 ],
[0.56],
[0.56],
[0.86],
[0.68]], dtype=float32)}
推理。
# Run the second converted model on the per-column inputs and print its first
# output next to the labels predicted by the scikit-learn pipeline.
sess2 = InferenceSession(onx2.SerializeToString(), providers=["CPUExecutionProvider"])
got2 = sess2.run(None, inputs)
print(pipe.predict(train_data))
print(got2[0])
[0 1 0 1 0 1]
[0 1 0 1 0 1]
接下来比较概率。
# Same comparison for the probabilities: scikit-learn first, then the second
# output of the ONNX model.
print(pipe.predict_proba(train_data))
print(got2[1])
[[0.84 0.16]
[0.32 0.68]
[0.68 0.32]
[0.17 0.83]
[0.77 0.23]
[0.36 0.64]]
[[0.84000003 0.16 ]
[0.32000035 0.67999965]
[0.68000007 0.31999996]
[0.1700005 0.8299995 ]
[0.77 0.23000003]
[0.3600003 0.6399997 ]]
脚本总运行时间:(0 分 0.300 秒)