注意
转到末尾 下载完整的示例代码。
TfIdf 和稀疏矩阵¶
TfidfVectorizer 通常会创建稀疏数据。如果数据足够稀疏,矩阵通常会在整个流水线中保持稀疏状态,直到训练预测器。稀疏矩阵无法区分 null(零)值和缺失值,因为两者都不会被显式存储在数据中。由于某些预测器会区分这两种值,这种歧义可能会在转换为 ONNX 时引入差异。本示例将探讨几种配置。
导入,设置¶
导入所有依赖,并为 xgboost 和 lightgbm 注册 ONNX 转换器。
import warnings
import numpy
import pandas
import onnxruntime as rt
from tqdm import tqdm
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
try:
from sklearn.ensemble import HistGradientBoostingClassifier
except ImportError:
HistGradientBoostingClassifier = None
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from skl2onnx.common.data_types import FloatTensorType, StringTensorType
from skl2onnx import to_onnx, update_registered_converter
from skl2onnx.sklapi import CastTransformer, ReplaceTransformer
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm
# Register the onnxmltools converters for the XGBoost and LightGBM
# classifiers. Both share the same shape calculator and the same "nocl"
# option; only the converter function and the accepted "zipmap" values differ.
for _model_cls, _alias, _converter, _zipmap_values in (
    (
        XGBClassifier,
        "XGBoostXGBClassifier",
        convert_xgboost,
        [True, False, "columns"],
    ),
    (
        LGBMClassifier,
        "LightGbmLGBMClassifier",
        convert_lightgbm,
        [True, False],
    ),
):
    update_registered_converter(
        _model_cls,
        _alias,
        calculate_linear_classifier_output_shapes,
        _converter,
        options={"nocl": [True, False], "zipmap": _zipmap_values},
    )
人工数据集¶
Iris + 文本列。
# Strings used to build an artificial text column out of the class labels.
cst = ["class zero", "class one", "class two"]

data = load_iris()
X = data.data[:, :2]  # keep only the first two numeric columns
y = data.target

# DataFrame with two numeric columns (c0, c1) and one text column.
df = pandas.DataFrame(X)
df.columns = [f"c{c}" for c in df.columns]
df["text"] = [cst[i] for i in y]

# Shuffle the rows. The same permutation must be applied to X, y AND df:
# df is what gets passed to the pipelines together with y, so shuffling
# only X and y would pair every row of df with the label of another row.
ind = numpy.arange(X.shape[0])
numpy.random.shuffle(ind)
X = X[ind, :].copy()
y = y[ind].copy()
df = df.iloc[ind].reset_index(drop=True)
稀疏数据后的集成训练¶
本示例使用 Iris 数据集和经过 tf-idf 预处理的人工文本数据集。sparse_threshold=1. 避免稀疏矩阵转换为密集矩阵。
def make_pipelines(
    df_train,
    y_train,
    models=None,
    sparse_threshold=1.0,
    replace_nan=False,
    insert_replace=False,
):
    """Train one pipeline per model, convert it to ONNX and measure discrepancies.

    Each pipeline scales the first two columns, applies
    CountVectorizer + TfidfTransformer to the ``"text"`` column, casts the
    result with CastTransformer and trains a classifier with ``max_depth=3``.
    The fitted pipeline is converted with ``to_onnx``, run with onnxruntime on
    ``df_train`` and compared against ``pipe.predict_proba(df_train)``.

    Side effect: the converted model is written to ``model.onnx`` in the
    current directory (overwritten for every model).

    :param df_train: DataFrame whose first two columns are numeric and which
        has a ``"text"`` column
    :param y_train: training labels
    :param models: classifier classes to try; defaults to
        RandomForestClassifier, HistGradientBoostingClassifier (when it could
        be imported), XGBClassifier and LGBMClassifier
    :param sparse_threshold: forwarded to ``ColumnTransformer``
    :param replace_nan: if True, pass the converter option ``{"nan": True}``
        for ``TfidfTransformer``
    :param insert_replace: if True, append a ``ReplaceTransformer`` after the
        tf-idf step of the text sub-pipeline
    :return: list of dictionaries, one per model, with keys ``model``,
        ``pipe``, ``model_onnx`` and either ``discrepencies`` (sum of absolute
        differences between ONNX and scikit-learn probabilities) or ``error``
        (the ``TypeError`` raised by ``fit``)
    """
    if models is None:
        models = [
            RandomForestClassifier,
            HistGradientBoostingClassifier,
            XGBClassifier,
            LGBMClassifier,
        ]
        # HistGradientBoostingClassifier is None when its import failed.
        models = [m for m in models if m is not None]

    pipes = []
    for model in tqdm(models):
        # Keep every model tiny: five iterations / estimators.
        if model is HistGradientBoostingClassifier:
            kwargs = dict(max_iter=5)
        elif model is XGBClassifier:
            kwargs = dict(n_estimators=5, use_label_encoder=False)
        else:
            kwargs = dict(n_estimators=5)

        # Text branch; the replace step is the only difference between the
        # two pipeline variants.
        text_steps = [
            ("count", CountVectorizer()),
            ("tfidf", TfidfTransformer()),
        ]
        if insert_replace:
            text_steps.append(("repl", ReplaceTransformer()))

        pipe = Pipeline(
            [
                (
                    "union",
                    ColumnTransformer(
                        [
                            ("scale1", StandardScaler(), [0, 1]),
                            ("subject", Pipeline(text_steps), "text"),
                        ],
                        sparse_threshold=sparse_threshold,
                    ),
                ),
                ("cast", CastTransformer()),
                ("cls", model(max_depth=3, **kwargs)),
            ]
        )

        try:
            pipe.fit(df_train, y_train)
        except TypeError as e:
            # Record the training failure and move on to the next model.
            obs = dict(model=model.__name__, pipe=pipe, error=e, model_onnx=None)
            pipes.append(obs)
            continue

        options = {model: {"zipmap": False}}
        if replace_nan:
            options[TfidfTransformer] = {"nan": True}

        # convert
        with warnings.catch_warnings(record=False):
            # One filter per category.
            warnings.simplefilter("ignore", FutureWarning)
            warnings.simplefilter("ignore", UserWarning)
            model_onnx = to_onnx(
                pipe,
                initial_types=[
                    ("input", FloatTensorType([None, 2])),
                    ("text", StringTensorType([None, 1])),
                ],
                target_opset={"": 12, "ai.onnx.ml": 2},
                options=options,
            )

        # Serialize once; reuse the bytes for the file and the session.
        serialized = model_onnx.SerializeToString()
        with open("model.onnx", "wb") as f:
            f.write(serialized)

        sess = rt.InferenceSession(serialized, providers=["CPUExecutionProvider"])
        # Evaluate on the data received as argument (not on a module-level
        # DataFrame), using the same positional columns as the scaler.
        inputs = {
            "input": df_train.iloc[:, [0, 1]].values.astype(numpy.float32),
            "text": df_train[["text"]].values,
        }
        pred_onx = sess.run(None, inputs)

        expected = pipe.predict_proba(df_train)
        diff = numpy.abs(pred_onx[1].ravel() - expected.ravel()).sum()

        obs = dict(
            model=model.__name__, discrepencies=diff, model_onnx=model_onnx, pipe=pipe
        )
        pipes.append(obs)

    return pipes
# Train and convert every model with the default sparse_threshold=1.0.
data_sparse = make_pipelines(df, y)

# Report without the columns holding the ONNX model and the pipeline.
stat = pandas.DataFrame(data_sparse).drop(columns=["model_onnx", "pipe"])
if "error" in stat:
    print(stat.drop(columns="error"))
stat
Traceback (most recent call last):
File "/home/xadupre/github/sklearn-onnx/docs/tutorial/plot_usparse_xgboost.py", line 227, in <module>
data_sparse = make_pipelines(df, y)
^^^^^^^^^^^^^^^^^^^^^
File "/home/xadupre/github/sklearn-onnx/docs/tutorial/plot_usparse_xgboost.py", line 217, in make_pipelines
diff = numpy.abs(pred_onx[1].ravel() - pipe.predict_proba(df).ravel()).sum()
^^^^^^^^^^^^^^^^^^^^^^
File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/pipeline.py", line 896, in predict_proba
with _raise_or_warn_if_not_fitted(self):
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__
next(self.gen)
File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/pipeline.py", line 60, in _raise_or_warn_if_not_fitted
check_is_fitted(estimator)
File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1756, in check_is_fitted
if not _is_fitted(estimator, attributes, all_or_any):
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1665, in _is_fitted
return estimator.__sklearn_is_fitted__()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/pipeline.py", line 1310, in __sklearn_is_fitted__
check_is_fitted(last_step)
File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1751, in check_is_fitted
tags = get_tags(estimator)
^^^^^^^^^^^^^^^^^^^
File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/utils/_tags.py", line 405, in get_tags
sklearn_tags_provider[klass] = klass.__sklearn_tags__(estimator) # type: ignore[attr-defined]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/base.py", line 540, in __sklearn_tags__
tags = super().__sklearn_tags__()
^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'super' object has no attribute '__sklearn_tags__'
稀疏数据会导致问题。
密集数据¶
让我们使用 sparse_threshold=0. 将稀疏数据替换为密集数据。
# Same experiment with sparse_threshold=0.0 passed to the ColumnTransformer.
data_dense = make_pipelines(df, y, sparse_threshold=0.0)

# Report without the columns holding the ONNX model and the pipeline.
stat = pandas.DataFrame(data_dense).drop(columns=["model_onnx", "pipe"])
if "error" in stat:
    print(stat.drop(columns="error"))
stat
这好多了。我们来比较一下预处理是如何应用于数据的。
# Show the first two preprocessed rows produced by the first step of the
# last pipeline of each run.
print("sparse")
sparse_union = data_sparse[-1]["pipe"].steps[0][-1]
print(sparse_union.transform(df)[:2])
print()
print("dense")
dense_union = data_dense[-1]["pipe"].steps[0][-1]
print(dense_union.transform(df)[:2])
这表明 RandomForestClassifier、XGBClassifier 处理稀疏矩阵和密集矩阵的方式与 LGBMClassifier 不同。而 HistGradientBoostingClassifier 会失败。
带有 nan 的密集数据¶
让我们在 scikit-learn 流水线中保留稀疏数据,但在 onnx 图中将 null 值替换为 nan。
# Keep sparse_threshold=1.0 and enable the converter option that
# make_pipelines sets for TfidfTransformer when replace_nan=True.
data_dense = make_pipelines(df, y, sparse_threshold=1.0, replace_nan=True)

# Report without the columns holding the ONNX model and the pipeline.
stat = pandas.DataFrame(data_dense).drop(columns=["model_onnx", "pipe"])
if "error" in stat:
    print(stat.drop(columns="error"))
stat
密集数据,0 替换为 nan¶
本示例不再使用特定选项将 null 值替换为 nan 值,而是显式地在流水线中插入一个名为 ReplaceTransformer 的自定义转换器。该转换器对应的 ONNX 转换器也已加入支持的模型列表。这与之前的选项等效,只是更显式。
# No converter option this time: a ReplaceTransformer step is inserted in the
# text sub-pipeline instead (insert_replace=True).
data_dense = make_pipelines(
    df,
    y,
    insert_replace=True,
    replace_nan=False,
    sparse_threshold=1.0,
)

# Report without the columns holding the ONNX model and the pipeline.
stat = pandas.DataFrame(data_dense).drop(columns=["model_onnx", "pipe"])
if "error" in stat:
    print(stat.drop(columns="error"))
stat
结论¶
除非使用密集数组,否则由于 onnxruntime 尚不支持稀疏 ONNX,转换需要根据 TfIdf 预处理后面的模型进行调整。
脚本总运行时间: (0 分钟 0.258 秒)