注意
转到末尾 下载完整的示例代码。
TfIdf 和稀疏矩阵¶
TfidfVectorizer 通常会创建稀疏数据。如果数据足够稀疏,矩阵通常会在整个管道中保持稀疏状态,直到训练预测器。稀疏矩阵不区分零值和缺失值,因为这两者都不会被显式存储在矩阵中。由于某些预测器会区分这两种值,这种歧义可能会在转换为 ONNX 时引入差异。本示例将探讨几种配置。
导入和设置¶
所有导入。它还注册了 xgboost 和 lightgbm 的 onnx 转换器。
import warnings
import numpy
import pandas
import onnxruntime as rt
from tqdm import tqdm
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
try:
from sklearn.ensemble import HistGradientBoostingClassifier
except ImportError:
HistGradientBoostingClassifier = None
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from skl2onnx.common.data_types import FloatTensorType, StringTensorType
from skl2onnx import to_onnx, update_registered_converter
from skl2onnx.sklapi import CastTransformer, ReplaceTransformer
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm
# Register the onnxmltools converter for XGBClassifier with skl2onnx so that
# ``to_onnx`` can convert pipelines ending with this model. The ``options``
# dict lists the converter options (and their allowed values) for this model.
update_registered_converter(
XGBClassifier,
"XGBoostXGBClassifier",
calculate_linear_classifier_output_shapes,
convert_xgboost,
options={"nocl": [True, False], "zipmap": [True, False, "columns"]},
)
# Same registration for LGBMClassifier; note that its ``zipmap`` option does
# not list the "columns" value.
update_registered_converter(
LGBMClassifier,
"LightGbmLGBMClassifier",
calculate_linear_classifier_output_shapes,
convert_lightgbm,
options={"nocl": [True, False], "zipmap": [True, False]},
)
人工数据集¶
Iris + 一个文本列。
# Artificial dataset: the first two Iris features plus a text column that
# spells out the class of each row.
cst = ["class zero", "class one", "class two"]

data = load_iris()
X = data.data[:, :2]
y = data.target

# Shuffle features and labels with the same permutation *before* building the
# dataframe. Building ``df`` first and shuffling only ``X``/``y`` afterwards
# would leave row i of ``df`` paired with an unrelated label ``y[i]``.
ind = numpy.arange(X.shape[0])
numpy.random.shuffle(ind)
X = X[ind, :].copy()
y = y[ind].copy()

df = pandas.DataFrame(X)
df.columns = [f"c{c}" for c in df.columns]
df["text"] = [cst[i] for i in y]
稀疏后训练集成模型¶
该示例使用 Iris 数据集,并附加一个人工构造的文本列,该文本列通过 tf-idf 进行预处理。设置 sparse_threshold=1. 可以避免稀疏矩阵被转换为密集矩阵。
def _build_pipeline(model, kwargs, sparse_threshold, insert_replace):
    """Assemble the preprocessing + classifier pipeline for one model class.

    The text column goes through CountVectorizer and TfidfTransformer,
    optionally followed by a ReplaceTransformer when *insert_replace* is true.
    """
    text_steps = [
        ("count", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
    ]
    if insert_replace:
        text_steps.append(("repl", ReplaceTransformer()))
    return Pipeline(
        [
            (
                "union",
                ColumnTransformer(
                    [
                        ("scale1", StandardScaler(), [0, 1]),
                        ("subject", Pipeline(text_steps), "text"),
                    ],
                    sparse_threshold=sparse_threshold,
                ),
            ),
            ("cast", CastTransformer()),
            ("cls", model(max_depth=3, **kwargs)),
        ]
    )


def make_pipelines(
    df_train,
    y_train,
    models=None,
    sparse_threshold=1.0,
    replace_nan=False,
    insert_replace=False,
):
    """Train one pipeline per model, convert it to ONNX and measure discrepancies.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Data with numeric columns ``c0``, ``c1`` and a ``text`` column. It is
        used both for fitting and for comparing predictions.
    y_train : array-like
        Labels passed to ``Pipeline.fit``.
    models : list of classes, optional
        Classifier classes to try. ``None`` entries are skipped. Defaults to
        RandomForestClassifier, HistGradientBoostingClassifier, XGBClassifier
        and LGBMClassifier.
    sparse_threshold : float
        Forwarded to ``ColumnTransformer``.
    replace_nan : bool
        When true, the converter option ``{"nan": True}`` is set for
        ``TfidfTransformer``.
    insert_replace : bool
        When true, a ``ReplaceTransformer`` is appended after the tf-idf step.

    Returns
    -------
    list of dict
        One entry per model. On success the keys are ``model``,
        ``discrepencies``, ``model_onnx`` and ``pipe``. If fitting raises
        ``TypeError`` the keys are ``model``, ``pipe``, ``error`` and
        ``model_onnx`` (set to ``None``).
    """
    if models is None:
        models = [
            RandomForestClassifier,
            HistGradientBoostingClassifier,
            XGBClassifier,
            LGBMClassifier,
        ]
    # HistGradientBoostingClassifier is None when the import failed.
    models = [m for m in models if m is not None]

    pipes = []
    for model in tqdm(models):
        if model is HistGradientBoostingClassifier:
            kwargs = dict(max_iter=5)
        elif model is XGBClassifier:
            kwargs = dict(n_estimators=5, use_label_encoder=False)
        else:
            kwargs = dict(n_estimators=5)

        pipe = _build_pipeline(model, kwargs, sparse_threshold, insert_replace)

        try:
            pipe.fit(df_train, y_train)
        except TypeError as e:
            # Record the failure and move on to the next model.
            obs = dict(model=model.__name__, pipe=pipe, error=e, model_onnx=None)
            pipes.append(obs)
            continue

        options = {model: {"zipmap": False}}
        if replace_nan:
            options[TfidfTransformer] = {"nan": True}

        # convert
        with warnings.catch_warnings(record=False):
            warnings.simplefilter("ignore", (FutureWarning, UserWarning))
            model_onnx = to_onnx(
                pipe,
                initial_types=[
                    ("input", FloatTensorType([None, 2])),
                    ("text", StringTensorType([None, 1])),
                ],
                target_opset={"": 12, "ai.onnx.ml": 2},
                options=options,
            )

        # Serialize once; the same bytes are saved to disk and loaded by
        # onnxruntime. The file is overwritten on every iteration.
        serialized = model_onnx.SerializeToString()
        with open("model.onnx", "wb") as f:
            f.write(serialized)

        sess = rt.InferenceSession(serialized, providers=["CPUExecutionProvider"])
        # Use the dataframe passed by the caller, not a module-level global.
        inputs = {
            "input": df_train[["c0", "c1"]].values.astype(numpy.float32),
            "text": df_train[["text"]].values,
        }
        pred_onx = sess.run(None, inputs)

        # Sum of absolute differences between the second ONNX output and the
        # probabilities predicted by scikit-learn.
        diff = numpy.abs(
            pred_onx[1].ravel() - pipe.predict_proba(df_train).ravel()
        ).sum()

        obs = dict(
            model=model.__name__, discrepencies=diff, model_onnx=model_onnx, pipe=pipe
        )
        pipes.append(obs)

    return pipes
# Train and convert every model while keeping the preprocessed data sparse.
data_sparse = make_pipelines(df, y)
stat = pandas.DataFrame(data_sparse).drop(["model_onnx", "pipe"], axis=1)
if "error" in stat.columns:
    # Exception objects make the table hard to read; print it without them.
    print(stat.drop("error", axis=1))
stat
0%| | 0/4 [00:00<?, ?it/s]/home/xadupre/vv/this312/lib/python3.12/site-packages/xgboost/training.py:183: UserWarning: [09:45:35] WARNING: /workspace/src/learner.cc:738:
Parameters: { "use_label_encoder" } are not used.
bst.update(dtrain, iteration=i, fobj=obj)
75%|███████▌ | 3/4 [00:00<00:00, 3.18it/s][LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000728 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 5
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/utils/validation.py:2735: UserWarning: X does not have valid feature names, but LGBMClassifier was fitted with feature names
warnings.warn(
100%|██████████| 4/4 [00:01<00:00, 3.84it/s]
model discrepencies
0 RandomForestClassifier 0.947052
1 HistGradientBoostingClassifier NaN
2 XGBClassifier 15.196619
3 LGBMClassifier 0.000009
稀疏数据不利。
密集数据¶
让我们通过使用 sparse_threshold=0. 将稀疏数据替换为密集数据。
# sparse_threshold=0.0 is passed so the preprocessing outputs dense data.
data_dense = make_pipelines(df, y, sparse_threshold=0.0)
stat = pandas.DataFrame(data_dense).drop(["model_onnx", "pipe"], axis=1)
if "error" in stat.columns:
    # Exception objects make the table hard to read; print it without them.
    print(stat.drop("error", axis=1))
stat
0%| | 0/4 [00:00<?, ?it/s]
50%|█████ | 2/4 [00:00<00:00, 2.78it/s]/home/xadupre/vv/this312/lib/python3.12/site-packages/xgboost/training.py:183: UserWarning: [09:45:37] WARNING: /workspace/src/learner.cc:738:
Parameters: { "use_label_encoder" } are not used.
bst.update(dtrain, iteration=i, fobj=obj)
75%|███████▌ | 3/4 [00:01<00:00, 1.81it/s][LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033149 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 5
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/utils/validation.py:2735: UserWarning: X does not have valid feature names, but LGBMClassifier was fitted with feature names
warnings.warn(
100%|██████████| 4/4 [00:01<00:00, 2.58it/s]
100%|██████████| 4/4 [00:01<00:00, 2.42it/s]
这好多了。让我们比较一下预处理如何应用于数据。
# Fetch the fitted first pipeline step of the last model from each run and
# show what it produces for the first two rows.
sparse_preprocessing = data_sparse[-1]["pipe"].steps[0][-1]
dense_preprocessing = data_dense[-1]["pipe"].steps[0][-1]

print("sparse")
print(sparse_preprocessing.transform(df)[:2])
print()
print("dense")
print(dense_preprocessing.transform(df)[:2])
sparse
<Compressed Sparse Row sparse matrix of dtype 'float64'
with 8 stored elements and shape (2, 6)>
Coords Values
(0, 0) -0.9006811702978088
(0, 1) 1.019004351971607
(0, 2) 0.4323732931220851
(0, 5) 0.9016947018779491
(1, 0) -1.1430169111851105
(1, 1) -0.13197947932162468
(1, 2) 0.4323732931220851
(1, 5) 0.9016947018779491
dense
[[-0.90068117 1.01900435 0.43237329 0. 0. 0.9016947 ]
[-1.14301691 -0.13197948 0.43237329 0. 0. 0.9016947 ]]
这表明,与 LGBMClassifier 不同,RandomForestClassifier 和 XGBClassifier 对稀疏矩阵和密集矩阵的处理方式并不相同。而 HistGradientBoostingClassifier 则在稀疏数据上直接失败。
带 nan 的密集数据¶
让我们在 scikit-learn 管道中保留稀疏数据,但在 ONNX 图中将零值替换为 nan。
# Keep sparse data in scikit-learn and pass replace_nan=True to the conversion.
data_dense = make_pipelines(df, y, sparse_threshold=1.0, replace_nan=True)
stat = pandas.DataFrame(data_dense).drop(["model_onnx", "pipe"], axis=1)
if "error" in stat.columns:
    # Exception objects make the table hard to read; print it without them.
    print(stat.drop("error", axis=1))
stat
0%| | 0/4 [00:00<?, ?it/s]/home/xadupre/vv/this312/lib/python3.12/site-packages/xgboost/training.py:183: UserWarning: [09:45:38] WARNING: /workspace/src/learner.cc:738:
Parameters: { "use_label_encoder" } are not used.
bst.update(dtrain, iteration=i, fobj=obj)
75%|███████▌ | 3/4 [00:00<00:00, 3.21it/s][LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012050 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 53
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 5
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/utils/validation.py:2735: UserWarning: X does not have valid feature names, but LGBMClassifier was fitted with feature names
warnings.warn(
100%|██████████| 4/4 [00:01<00:00, 4.06it/s]
100%|██████████| 4/4 [00:01<00:00, 3.83it/s]
model discrepencies
0 RandomForestClassifier 24.634892
1 HistGradientBoostingClassifier NaN
2 XGBClassifier 2.899390
3 LGBMClassifier 0.000009
密集,0 被替换为 nan¶
不是使用特定选项将空值替换为 nan 值,而是将一个名为 ReplaceTransformer 的自定义转换器显式插入到管道中。一个新转换器被添加到支持的模型列表中。它等同于之前的选项,但更明确。
# Same experiment, but with an explicit ReplaceTransformer inserted in the
# pipeline instead of the converter option.
data_dense = make_pipelines(
    df, y, sparse_threshold=1.0, replace_nan=False, insert_replace=True
)
stat = pandas.DataFrame(data_dense).drop(["model_onnx", "pipe"], axis=1)
if "error" in stat.columns:
    # Exception objects make the table hard to read; print it without them.
    print(stat.drop("error", axis=1))
stat
0%| | 0/4 [00:00<?, ?it/s]/home/xadupre/vv/this312/lib/python3.12/site-packages/xgboost/training.py:183: UserWarning: [09:45:39] WARNING: /workspace/src/learner.cc:738:
Parameters: { "use_label_encoder" } are not used.
bst.update(dtrain, iteration=i, fobj=obj)
75%|███████▌ | 3/4 [00:01<00:00, 2.67it/s][LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 5
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/utils/validation.py:2735: UserWarning: X does not have valid feature names, but LGBMClassifier was fitted with feature names
warnings.warn(
100%|██████████| 4/4 [00:01<00:00, 3.30it/s]
model discrepencies
0 RandomForestClassifier 41.288296
1 HistGradientBoostingClassifier NaN
2 XGBClassifier 2.899390
3 LGBMClassifier 0.000009
结论¶
由于 onnxruntime 尚不支持稀疏张量,除非使用密集数组,否则需要根据 TfIdf 预处理之后所接的模型来调整转换方式。
脚本总运行时间: (0 分钟 4.999 秒)