TfIdf 和稀疏矩阵

TfidfVectorizer 通常会生成稀疏数据。如果数据足够稀疏,矩阵通常会在整个管道中保持稀疏状态,直到预测器被训练。稀疏矩阵无法区分空值(零)和缺失值,因为这两者都不会被存储在矩阵中。而某些预测器会区别对待这两种值,因此在转换为 ONNX 时,这种歧义可能会引入差异。此示例探讨了几种配置。

导入,设置

所有导入。它还为 xgboost 和 lightgbm 注册了 onnx 转换器。

import warnings
import numpy
import pandas
import onnxruntime as rt
from tqdm import tqdm
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

try:
    from sklearn.ensemble import HistGradientBoostingClassifier
except ImportError:
    HistGradientBoostingClassifier = None
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from skl2onnx.common.data_types import FloatTensorType, StringTensorType
from skl2onnx import to_onnx, update_registered_converter
from skl2onnx.sklapi import CastTransformer, ReplaceTransformer
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm


# Register the onnxmltools converters for the XGBoost and LightGBM
# classifiers so that skl2onnx can convert pipelines ending with them.
# Both registrations share the same shape calculator; only the alias, the
# converter function and the accepted "zipmap" values differ.
for _cls, _alias, _converter, _zipmap_values in (
    (
        XGBClassifier,
        "XGBoostXGBClassifier",
        convert_xgboost,
        [True, False, "columns"],
    ),
    (
        LGBMClassifier,
        "LightGbmLGBMClassifier",
        convert_lightgbm,
        [True, False],
    ),
):
    update_registered_converter(
        _cls,
        _alias,
        calculate_linear_classifier_output_shapes,
        _converter,
        options={"nocl": [True, False], "zipmap": _zipmap_values},
    )

人工数据集

鸢尾花 + 文本列。

# Text attached to each iris class; it is stored in the "text" column.
cst = ["class zero", "class one", "class two"]

data = load_iris()
X = data.data[:, :2]  # keep only the first two numeric features
y = data.target

# Shuffle the features and the labels with the same permutation *before*
# building the dataframe: building ``df`` first would leave its rows in the
# original order while ``y`` is permuted, i.e. features and labels would no
# longer correspond when both are passed to ``fit``.
ind = numpy.arange(X.shape[0])
numpy.random.shuffle(ind)
X = X[ind, :].copy()
y = y[ind].copy()

# Dataframe with two numeric columns ("c0", "c1") and one text column.
df = pandas.DataFrame(X)
df.columns = [f"c{c}" for c in df.columns]
df["text"] = [cst[i] for i in y]

稀疏后训练集成

此示例使用鸢尾花数据集和使用 tf-idf 预处理的人工文本数据集。sparse_threshold=1. 避免将稀疏矩阵转换为密集矩阵。

def _build_pipeline(model, kwargs, sparse_threshold, insert_replace):
    """Build the unfitted pipeline: preprocessing, cast, then classifier *model*."""
    text_steps = [("count", CountVectorizer()), ("tfidf", TfidfTransformer())]
    if insert_replace:
        # Explicit ReplaceTransformer appended after the tf-idf step.
        text_steps.append(("repl", ReplaceTransformer()))

    return Pipeline(
        [
            (
                "union",
                ColumnTransformer(
                    [
                        ("scale1", StandardScaler(), [0, 1]),
                        ("subject", Pipeline(text_steps), "text"),
                    ],
                    sparse_threshold=sparse_threshold,
                ),
            ),
            ("cast", CastTransformer()),
            ("cls", model(max_depth=3, **kwargs)),
        ]
    )


def make_pipelines(
    df_train,
    y_train,
    models=None,
    sparse_threshold=1.0,
    replace_nan=False,
    insert_replace=False,
):
    """Train one pipeline per classifier, convert it to ONNX and compare outputs.

    Each pipeline scales the first two columns, applies
    CountVectorizer + TfidfTransformer to the ``"text"`` column, casts the
    result and trains a classifier. The fitted pipeline is converted with
    ``to_onnx``, run with onnxruntime on ``df_train`` and its probabilities
    are compared with ``pipe.predict_proba(df_train)``.

    Side effect: the converted model is written to ``model.onnx`` in the
    current directory (overwritten for every model).

    :param df_train: dataframe with columns ``"c0"``, ``"c1"`` (numeric, in
        the first two positions) and ``"text"``; used for training and for
        the comparison
    :param y_train: labels passed to ``fit``
    :param models: classifier classes to try; ``None`` entries are ignored.
        Defaults to RandomForestClassifier, HistGradientBoostingClassifier,
        XGBClassifier and LGBMClassifier
    :param sparse_threshold: forwarded to ``ColumnTransformer``
    :param replace_nan: if True, adds the conversion option
        ``{"nan": True}`` for ``TfidfTransformer``
    :param insert_replace: if True, appends a ``ReplaceTransformer`` after
        the tf-idf step of the text sub-pipeline
    :return: list of dictionaries, one per model, with keys ``model``,
        ``pipe``, ``model_onnx`` and either ``discrepencies`` (sum of the
        absolute differences between both probability outputs) or ``error``
        (the ``TypeError`` raised by ``fit``, ``model_onnx`` being None)
    """
    if models is None:
        models = [
            RandomForestClassifier,
            HistGradientBoostingClassifier,
            XGBClassifier,
            LGBMClassifier,
        ]
    # HistGradientBoostingClassifier is None when its import failed.
    models = [m for m in models if m is not None]

    pipes = []
    for model in tqdm(models):
        # Keep every model tiny; the size parameter has a different name
        # for HistGradientBoostingClassifier.
        if model is HistGradientBoostingClassifier:
            kwargs = dict(max_iter=5)
        elif model is XGBClassifier:
            kwargs = dict(n_estimators=5, use_label_encoder=False)
        else:
            kwargs = dict(n_estimators=5)

        pipe = _build_pipeline(model, kwargs, sparse_threshold, insert_replace)

        try:
            pipe.fit(df_train, y_train)
        except TypeError as e:
            # Record the failure and move on to the next model.
            obs = dict(model=model.__name__, pipe=pipe, error=e, model_onnx=None)
            pipes.append(obs)
            continue

        options = {model: {"zipmap": False}}
        if replace_nan:
            options[TfidfTransformer] = {"nan": True}

        # convert
        with warnings.catch_warnings(record=False):
            warnings.simplefilter("ignore", (FutureWarning, UserWarning))
            model_onnx = to_onnx(
                pipe,
                initial_types=[
                    ("input", FloatTensorType([None, 2])),
                    ("text", StringTensorType([None, 1])),
                ],
                target_opset={"": 12, "ai.onnx.ml": 2},
                options=options,
            )

        # Serialize once and reuse the bytes for the file and the session.
        serialized = model_onnx.SerializeToString()
        with open("model.onnx", "wb") as f:
            f.write(serialized)

        sess = rt.InferenceSession(serialized, providers=["CPUExecutionProvider"])
        # Compare on the data the function received, not on a module-level
        # dataframe, so the function does not depend on global state.
        inputs = {
            "input": df_train[["c0", "c1"]].values.astype(numpy.float32),
            "text": df_train[["text"]].values,
        }
        pred_onx = sess.run(None, inputs)

        # The second ONNX output is compared with predict_proba.
        expected = pipe.predict_proba(df_train)
        diff = numpy.abs(pred_onx[1].ravel() - expected.ravel()).sum()

        obs = dict(
            model=model.__name__, discrepencies=diff, model_onnx=model_onnx, pipe=pipe
        )
        pipes.append(obs)

    return pipes


# Train and convert every model with the default settings of make_pipelines.
data_sparse = make_pipelines(df, y)

# Summary table: one row per model, without the bulky objects.
stat = pandas.DataFrame(data_sparse).drop(columns=["model_onnx", "pipe"])
if "error" in stat:
    # Also print a compact version without the error column.
    print(stat.drop(columns="error"))
stat
  0%|          | 0/4 [00:00<?, ?it/s]
 75%|███████▌  | 3/4 [00:00<00:00, 13.14it/s][LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002602 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 5
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf

100%|██████████| 4/4 [00:00<00:00, 11.75it/s]
                            model  discrepencies
0          RandomForestClassifier       0.700004
1  HistGradientBoostingClassifier            NaN
2                   XGBClassifier      28.331459
3                  LGBMClassifier       0.000009
模型 差异 错误
0 RandomForestClassifier 0.700004 NaN
1 HistGradientBoostingClassifier NaN 为 X 传递了稀疏数据,但密集数据 i...
2 XGBClassifier 28.331459 NaN
3 LGBMClassifier 0.000009 NaN


稀疏数据有害。

密集数据

让我们通过使用 sparse_threshold=0. 将稀疏数据替换为密集数据。

# Same experiment with sparse_threshold=0.0 passed to the ColumnTransformer.
data_dense = make_pipelines(df, y, sparse_threshold=0.0)

# Summary table: one row per model, without the bulky objects.
stat = pandas.DataFrame(data_dense).drop(columns=["model_onnx", "pipe"])
if "error" in stat:
    # Also print a compact version without the error column.
    print(stat.drop(columns="error"))
stat
  0%|          | 0/4 [00:00<?, ?it/s]
 50%|█████     | 2/4 [00:00<00:00, 17.40it/s][LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000029 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 5
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf

100%|██████████| 4/4 [00:00<00:00, 20.51it/s]
模型 差异
0 RandomForestClassifier 0.733338
1 HistGradientBoostingClassifier 0.000005
2 XGBClassifier 2.967510
3 LGBMClassifier 0.000009


这好多了。让我们比较预处理如何应用于数据。

# First step (the preprocessing) of the pipeline stored in the last entry
# of each run.
sparse_preprocessing = data_sparse[-1]["pipe"].steps[0][1]
dense_preprocessing = data_dense[-1]["pipe"].steps[0][1]

# Show the first two preprocessed rows for both configurations.
print("sparse")
print(sparse_preprocessing.transform(df)[:2])
print()
print("dense")
print(dense_preprocessing.transform(df)[:2])
sparse
  (0, 0)        -0.9006811702978088
  (0, 1)        1.019004351971607
  (0, 2)        0.4323732931220851
  (0, 5)        0.9016947018779491
  (1, 0)        -1.1430169111851105
  (1, 1)        -0.13197947932162468
  (1, 2)        0.4323732931220851
  (1, 5)        0.9016947018779491

dense
[[-0.90068117  1.01900435  0.43237329  0.          0.          0.9016947 ]
 [-1.14301691 -0.13197948  0.43237329  0.          0.          0.9016947 ]]

这表明 RandomForestClassifier 和 XGBClassifier 处理稀疏矩阵和密集矩阵的方式不同,而 LGBMClassifier 则没有这种区别。此外,HistGradientBoostingClassifier 在稀疏输入上会失败。

带有 nan 的密集数据

让我们在 scikit-learn 管道中保留稀疏数据,但在 onnx 图中用 nan 替换空值。

# Sparse preprocessing again (sparse_threshold=1.0), this time with the
# replace_nan conversion option switched on.
data_dense = make_pipelines(df, y, sparse_threshold=1.0, replace_nan=True)

# Summary table: one row per model, without the bulky objects.
stat = pandas.DataFrame(data_dense).drop(columns=["model_onnx", "pipe"])
if "error" in stat:
    # Also print a compact version without the error column.
    print(stat.drop(columns="error"))
stat
  0%|          | 0/4 [00:00<?, ?it/s][LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000032 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 5
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf

100%|██████████| 4/4 [00:00<00:00, 29.40it/s]
100%|██████████| 4/4 [00:00<00:00, 29.34it/s]
                            model  discrepencies
0          RandomForestClassifier      40.908293
1  HistGradientBoostingClassifier            NaN
2                   XGBClassifier       2.967510
3                  LGBMClassifier       0.000009
模型 差异 错误
0 RandomForestClassifier 40.908293 NaN
1 HistGradientBoostingClassifier NaN 为 X 传递了稀疏数据,但密集数据 i...
2 XGBClassifier 2.967510 NaN
3 LGBMClassifier 0.000009 NaN


密集,0 替换为 nan

这里不再使用特定的选项将空值替换为 nan 值,而是在管道中显式地插入一个名为 ReplaceTransformer 的自定义转换器。为此,支持的模型列表中新增了一个转换器。它等同于之前的选项,只是更加明确。

# Sparse preprocessing (sparse_threshold=1.0) with an explicit
# ReplaceTransformer inserted in the pipeline instead of the conversion
# option.
data_dense = make_pipelines(
    df,
    y,
    sparse_threshold=1.0,
    replace_nan=False,
    insert_replace=True,
)

# Summary table: one row per model, without the bulky objects.
stat = pandas.DataFrame(data_dense).drop(columns=["model_onnx", "pipe"])
if "error" in stat:
    # Also print a compact version without the error column.
    print(stat.drop(columns="error"))
stat
  0%|          | 0/4 [00:00<?, ?it/s][LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000024 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 53
[LightGBM] [Info] Number of data points in the train set: 150, number of used features: 5
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf

100%|██████████| 4/4 [00:00<00:00, 36.10it/s]
100%|██████████| 4/4 [00:00<00:00, 36.01it/s]
                            model  discrepencies
0          RandomForestClassifier      41.729285
1  HistGradientBoostingClassifier            NaN
2                   XGBClassifier       2.967510
3                  LGBMClassifier       0.000009
模型 差异 错误
0 RandomForestClassifier 41.729285 NaN
1 HistGradientBoostingClassifier NaN 为 X 传递了稀疏数据,但密集数据 i...
2 XGBClassifier 2.967510 NaN
3 LGBMClassifier 0.000009 NaN


结论

由于 onnxruntime 和 ONNX 尚不支持稀疏数组,除非使用密集数组,否则需要根据 TfIdf 预处理之后所接的模型来调整转换方式。

脚本总运行时间:(0 分钟 0.992 秒)

由 Sphinx-Gallery 生成的图库