TfIdf 和稀疏矩阵

TfidfVectorizer 通常会创建稀疏数据。如果数据足够稀疏,矩阵通常会在整个流水线中保持稀疏状态,直到训练预测器。稀疏矩阵不区分零值(null)和缺失值,因为两者都不会被显式存储在数据集中。由于某些预测器会区分这两种值,这种歧义可能会在转换为 ONNX 时引入差异。本示例将探讨几种配置。

导入,设置

所有导入。它还注册了 xgboost 和 lightgbm 的 onnx 转换器。

import warnings
import numpy
import pandas
import onnxruntime as rt
from tqdm import tqdm
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

try:
    from sklearn.ensemble import HistGradientBoostingClassifier
except ImportError:
    HistGradientBoostingClassifier = None
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from skl2onnx.common.data_types import FloatTensorType, StringTensorType
from skl2onnx import to_onnx, update_registered_converter
from skl2onnx.sklapi import CastTransformer, ReplaceTransformer
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm


# Register the onnxmltools converter for XGBoost classifiers with skl2onnx.
# The options mapping lists the values accepted for each converter option.
update_registered_converter(
    model=XGBClassifier,
    alias="XGBoostXGBClassifier",
    shape_fct=calculate_linear_classifier_output_shapes,
    convert_fct=convert_xgboost,
    options=dict(nocl=[True, False], zipmap=[True, False, "columns"]),
)

# Same registration for LightGBM classifiers ("columns" is not listed for
# zipmap here).
update_registered_converter(
    model=LGBMClassifier,
    alias="LightGbmLGBMClassifier",
    shape_fct=calculate_linear_classifier_output_shapes,
    convert_fct=convert_lightgbm,
    options=dict(nocl=[True, False], zipmap=[True, False]),
)

人工数据集

Iris + 文本列。

# Text label attached to each Iris class; it becomes the "text" column.
cst = ["class zero", "class one", "class two"]

data = load_iris()
X = data.data[:, :2]  # keep only the first two numerical features
y = data.target

# Shuffle the rows *before* building the DataFrame so that ``df``, ``X`` and
# ``y`` all share the same row order. Building ``df`` first and shuffling only
# ``X`` and ``y`` afterwards would pair every row of ``df`` with the label of
# another row.
ind = numpy.arange(X.shape[0])
numpy.random.shuffle(ind)
X = X[ind, :].copy()
y = y[ind].copy()

# Columns are named c0, c1 (numerical features) and text (class description).
df = pandas.DataFrame(X)
df.columns = [f"c{c}" for c in df.columns]
df["text"] = [cst[i] for i in y]

稀疏数据后的集成训练

本示例使用 Iris 数据集和经过 tf-idf 预处理的人工文本数据集。sparse_threshold=1. 避免稀疏矩阵转换为密集矩阵。

def make_pipelines(
    df_train,
    y_train,
    models=None,
    sparse_threshold=1.0,
    replace_nan=False,
    insert_replace=False,
):
    """Fit one pipeline per model, convert it to ONNX and measure discrepancies.

    Each pipeline scales the first two columns, applies
    ``CountVectorizer`` + ``TfidfTransformer`` to the ``"text"`` column, casts
    the result with ``CastTransformer`` and trains a classifier on top.

    :param df_train: DataFrame with columns ``c0``, ``c1`` and ``text``; it is
        used for training and for comparing scikit-learn with onnxruntime
    :param y_train: training labels
    :param models: classifier classes to try; ``None`` entries are skipped.
        Defaults to RandomForestClassifier, HistGradientBoostingClassifier
        (when available), XGBClassifier and LGBMClassifier
    :param sparse_threshold: forwarded to ``ColumnTransformer``
    :param replace_nan: if True, passes the option ``{"nan": True}`` to the
        ``TfidfTransformer`` converter
    :param insert_replace: if True, appends a ``ReplaceTransformer`` after the
        tf-idf step of the text sub-pipeline
    :return: list of dictionaries, one per model, with keys ``model``,
        ``pipe``, ``model_onnx`` and either ``discrepencies`` (sum of absolute
        differences between the probabilities of both runtimes) or ``error``
        (the ``TypeError`` raised by ``fit``, in which case ``model_onnx`` is
        None)

    As a side effect, every converted model is written to ``model.onnx`` in
    the current directory (the file is overwritten at each iteration).
    """
    if models is None:
        models = [
            RandomForestClassifier,
            HistGradientBoostingClassifier,
            XGBClassifier,
            LGBMClassifier,
        ]
    # HistGradientBoostingClassifier is None when the import failed.
    models = [m for m in models if m is not None]

    pipes = []
    for model in tqdm(models):
        # Each library names its "number of trees" parameter differently.
        if model is HistGradientBoostingClassifier:
            kwargs = dict(max_iter=5)
        elif model is XGBClassifier:
            kwargs = dict(n_estimators=5, use_label_encoder=False)
        else:
            kwargs = dict(n_estimators=5)

        # The text branch is identical in both configurations except for the
        # optional trailing ReplaceTransformer.
        text_steps = [
            ("count", CountVectorizer()),
            ("tfidf", TfidfTransformer()),
        ]
        if insert_replace:
            text_steps.append(("repl", ReplaceTransformer()))

        pipe = Pipeline(
            [
                (
                    "union",
                    ColumnTransformer(
                        [
                            ("scale1", StandardScaler(), [0, 1]),
                            ("subject", Pipeline(text_steps), "text"),
                        ],
                        sparse_threshold=sparse_threshold,
                    ),
                ),
                ("cast", CastTransformer()),
                ("cls", model(max_depth=3, **kwargs)),
            ]
        )

        try:
            pipe.fit(df_train, y_train)
        except TypeError as e:
            # Record the failure and move on to the next model.
            obs = dict(model=model.__name__, pipe=pipe, error=e, model_onnx=None)
            pipes.append(obs)
            continue

        options = {model: {"zipmap": False}}
        if replace_nan:
            options[TfidfTransformer] = {"nan": True}

        # convert
        with warnings.catch_warnings(record=False):
            warnings.simplefilter("ignore", (FutureWarning, UserWarning))
            model_onnx = to_onnx(
                pipe,
                initial_types=[
                    ("input", FloatTensorType([None, 2])),
                    ("text", StringTensorType([None, 1])),
                ],
                target_opset={"": 12, "ai.onnx.ml": 2},
                options=options,
            )

        # Serialize once and reuse the bytes for the file and the session.
        onnx_bytes = model_onnx.SerializeToString()
        with open("model.onnx", "wb") as f:
            f.write(onnx_bytes)

        sess = rt.InferenceSession(onnx_bytes, providers=["CPUExecutionProvider"])

        # Evaluate on the data received as argument, not on a global DataFrame.
        inputs = {
            "input": df_train[["c0", "c1"]].values.astype(numpy.float32),
            "text": df_train[["text"]].values,
        }
        pred_onx = sess.run(None, inputs)

        # The second ONNX output is compared with predict_proba.
        diff = numpy.abs(
            pred_onx[1].ravel() - pipe.predict_proba(df_train).ravel()
        ).sum()

        obs = dict(
            model=model.__name__, discrepencies=diff, model_onnx=model_onnx, pipe=pipe
        )
        pipes.append(obs)

    return pipes


# Train, convert and compare every model with the default settings
# (sparse_threshold=1.0).
data_sparse = make_pipelines(df, y)

# Keep only the printable columns of the report.
stat = pandas.DataFrame(data_sparse).drop(columns=["model_onnx", "pipe"])
if "error" in stat:
    print(stat.drop(columns="error"))
stat
Traceback (most recent call last):
  File "/home/xadupre/github/sklearn-onnx/docs/tutorial/plot_usparse_xgboost.py", line 227, in <module>
    data_sparse = make_pipelines(df, y)
                  ^^^^^^^^^^^^^^^^^^^^^
  File "/home/xadupre/github/sklearn-onnx/docs/tutorial/plot_usparse_xgboost.py", line 217, in make_pipelines
    diff = numpy.abs(pred_onx[1].ravel() - pipe.predict_proba(df).ravel()).sum()
                                           ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/pipeline.py", line 896, in predict_proba
    with _raise_or_warn_if_not_fitted(self):
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__
    next(self.gen)
  File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/pipeline.py", line 60, in _raise_or_warn_if_not_fitted
    check_is_fitted(estimator)
  File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1756, in check_is_fitted
    if not _is_fitted(estimator, attributes, all_or_any):
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1665, in _is_fitted
    return estimator.__sklearn_is_fitted__()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/pipeline.py", line 1310, in __sklearn_is_fitted__
    check_is_fitted(last_step)
  File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/utils/validation.py", line 1751, in check_is_fitted
    tags = get_tags(estimator)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/utils/_tags.py", line 405, in get_tags
    sklearn_tags_provider[klass] = klass.__sklearn_tags__(estimator)  # type: ignore[attr-defined]
                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/xadupre/vv/this312/lib/python3.12/site-packages/sklearn/base.py", line 540, in __sklearn_tags__
    tags = super().__sklearn_tags__()
           ^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'super' object has no attribute '__sklearn_tags__'

稀疏数据会导致问题。

密集数据

让我们使用 sparse_threshold=0. 将稀疏数据替换为密集数据。

# Same experiment with sparse_threshold=0.0 passed to the ColumnTransformer.
data_dense = make_pipelines(df, y, sparse_threshold=0.0)

# Keep only the printable columns of the report.
stat = pandas.DataFrame(data_dense).drop(columns=["model_onnx", "pipe"])
if "error" in stat:
    print(stat.drop(columns="error"))
stat

这好多了。我们来比较一下预处理是如何应用于数据的。

# Show the first two rows produced by the preprocessing step ("union") of the
# last pipeline of each run.
print("sparse")
print(data_sparse[-1]["pipe"].named_steps["union"].transform(df)[:2])
print("")
print("dense")
print(data_dense[-1]["pipe"].named_steps["union"].transform(df)[:2])

这表明 RandomForestClassifier 和 XGBClassifier 对稀疏矩阵和密集矩阵的处理方式并不一致,这一点与 LGBMClassifier 不同。而 HistGradientBoostingClassifier 会失败。

带有 nan 的密集数据

让我们在 scikit-learn 流水线中保留稀疏数据,但在 onnx 图中将 null 值替换为 nan。

# sparse_threshold=1.0 again, this time with replace_nan=True so that the
# {"nan": True} option is passed to the TfidfTransformer converter.
data_dense = make_pipelines(df, y, sparse_threshold=1.0, replace_nan=True)

# Keep only the printable columns of the report.
stat = pandas.DataFrame(data_dense).drop(columns=["model_onnx", "pipe"])
if "error" in stat:
    print(stat.drop(columns="error"))
stat

密集数据,0 替换为 nan

除了使用特定选项将 null 值替换为 nan 值外,本示例明确地将一个名为 ReplaceTransformer 的自定义转换器插入到流水线中。还将一个新的转换器添加到支持的模型列表中。这与之前的选项等效,只是更显式。

# sparse_threshold=1.0 with an explicit ReplaceTransformer inserted in the
# text sub-pipeline instead of the converter option.
data_dense = make_pipelines(
    df,
    y,
    sparse_threshold=1.0,
    replace_nan=False,
    insert_replace=True,
)

# Keep only the printable columns of the report.
stat = pandas.DataFrame(data_dense).drop(columns=["model_onnx", "pipe"])
if "error" in stat:
    print(stat.drop(columns="error"))
stat

结论

除非使用密集数组,否则由于 onnxruntime 尚不支持稀疏 ONNX,转换需要根据 TfIdf 预处理后面的模型进行调整。

脚本总运行时间: (0 分钟 0.258 秒)

图库由 Sphinx-Gallery 生成