English | 中文版

6. 内存安全案例研究：AscendC C++ vs ascend-rs

在 16 个内核部署到 NPU 硬件、413 个测试在 Ascend 910B3 上通过 NPU 正确性验证、1565 个总计内核通过 MLIR 后端编译之后，ascend-rs 的价值主张超越了性能对等——核心优势在于内存安全。以下我们展示 6 组配对的案例研究，每组中 AscendC C++ 内核包含一个真实的、可被利用的内存安全漏洞，而等价的 Rust ascend-rs 内核从结构上阻止了同类漏洞。

这些不是刻意构造的示例。每种漏洞类别都是 AscendC C++ 内核开发实践中真实存在的模式：

案例	漏洞类型	C++ 根本原因
1. 类型混淆	`GM_ADDR` 擦除所有类型信息	函数签名编码元素类型
2. 缓冲区溢出	`GetValue(i)`/`SetValue(i,v)` 无边界检查	基于 Buffer-ID 的 API + 显式计数参数
3. 释放后使用	`FreeTensor()` 后通过失效句柄访问	API 中无手动释放操作
4. 缺失同步	忘记在 DMA 和计算之间添加 `pipe_barrier()`	`kernel_ops` 组合算子内置屏障
5. 双重释放	`FreeTensor()` 被调用两次	API 中不存在释放操作
6. 整数溢出	偏移量计算中 `u32` 静默回绕	`wrapping_mul` 使溢出语义显式化

6.1 类型混淆：GM_ADDR 类型擦除

AscendC 内核入口点将所有张量指针作为 GM_ADDR（= uint8_t*）接收。内核必须手动转换为正确的元素类型。如果宿主机传入 f16 数据但内核转换为 float*，每个元素读取 4 字节而非 2 字节——产生垃圾值且无任何警告。当一个内核在不同数据类型之间复用而未更新类型转换时，或者当宿主机封装传入了错误的张量格式时，就会触发此漏洞。

C++ — 存在漏洞：

#include "kernel_operator.h"

class KernelSoftmaxConfused {
public:
    __aicore__ inline void Init(GM_ADDR input, GM_ADDR output, GM_ADDR len_buf) {
        uint32_t n = *((__gm__ uint32_t *)len_buf);

        // BUG: 宿主机传入了半精度 (f16) 数据，但我们转换为 float。
        // 每个 "float" 元素读取 4 字节而非 2 字节，因此：
        //   - 有意义的值只有预期数量的一半
        //   - 每个值都是垃圾（两个 f16 位模式被重新解释为一个 float）
        // 编译器无法捕获此问题，因为 GM_ADDR 只是 uint8_t*。
        inputGm.SetGlobalBuffer((__gm__ float *)input, n);
        outputGm.SetGlobalBuffer((__gm__ float *)output, n);
        // ...
    }

    __aicore__ inline void Compute(int32_t len) {
        AscendC::LocalTensor<float> xLocal = inQueue.DeQue<float>();
        AscendC::LocalTensor<float> yLocal = outQueue.AllocTensor<float>();
        // 所有计算都在垃圾值上操作——静默产生错误输出，无崩溃、无报错。
        AscendC::Exp(yLocal, xLocal, len);
        outQueue.EnQue<float>(yLocal);
        inQueue.FreeTensor(xLocal);
    }
    // ...
};

// 入口点使用 GM_ADDR (= uint8_t*) 接收所有张量参数。
// 调用方可以传入任何数据类型——此边界没有类型检查。
extern "C" __global__ __aicore__ void softmax_confused(
        GM_ADDR input, GM_ADDR output, GM_ADDR len_buf) {
    KernelSoftmaxConfused op;
    op.Init(input, output, len_buf);
    op.Process();
}

Rust — 安全：

#![allow(unused)]
#![feature(no_core)]
#![no_std]
#![no_core]

fn main() {
/// 签名 `input: *const f32` 意味着宿主机必须传入 f32 张量。
/// 如果宿主机有 f16 数据 (*const u16)，调用此函数是类型错误：
///     softmax(f16_ptr, ...)  // 错误：期望 *const f32，实际 *const u16
#[ascend_std::aiv_kernel]
pub unsafe fn softmax(input: *const f32, output: *mut f32, len: *const u32) {
    unsafe {
        let n = *len;
        let buf_in = ascend_std::ascend_buf_alloc(n);
        let buf_out = ascend_std::ascend_buf_alloc(n);
        let buf_work = ascend_std::ascend_buf_alloc(n);

        // 加载 f32 数据——_f32 后缀与指针类型匹配。
        // 不可能通过 f32 API 意外加载 f16 数据。
        ascend_std::ascend_buf_load_f32(buf_in, input, n);
        ascend_std::ascend_pipe_barrier();

        // softmax_f32 期望 f32 缓冲区——整个流水线中类型一致性
        // 无需手动转换即可保持。
        ascend_std::kernel_ops::softmax_f32(buf_out, buf_in, buf_work, n);

        ascend_std::ascend_pipe_barrier();
        ascend_std::ascend_buf_store_f32(output, buf_out, n);
    }
}
}

核心洞见： 在 C++ 中，GM_ADDR 是类型擦除的 uint8_t*，接受任何数据格式。在 Rust 中，函数签名 *const f32 是类型系统的一部分——编译器在编译期拒绝类型不匹配。

6.2 缓冲区溢出：未检查的张量索引

AscendC 的 GetValue(i) 和 SetValue(i, v) 不执行边界检查。如果循环边界错误——off-by-one 错误、使用了错误的长度变量、或混淆了输入/输出大小——内核会在本地 SRAM 上越界读写。由于本地 SRAM 在同一 tile 内的所有张量分配之间共享，越界写入会静默覆盖相邻张量的数据。

C++ — 存在漏洞：

#include "kernel_operator.h"

class KernelScalarSoftmax {
    // ...
    __aicore__ inline void Compute(int32_t len, int32_t alignedLen) {
        AscendC::LocalTensor<float> xLocal = inQueue.DeQue<float>();
        AscendC::LocalTensor<float> yLocal = outQueue.AllocTensor<float>();

        // 第一步：找最大值（标量循环）
        float maxVal = xLocal.GetValue(0);
        for (int32_t i = 1; i < len; i++) {
            float v = xLocal.GetValue(i);
            if (v > maxVal) maxVal = v;
        }

        // 第二步：计算 exp(x - max) 并求和
        float sum = 0.0f;
        for (int32_t i = 0; i < len; i++) {
            float v = xLocal.GetValue(i) - maxVal;
            yLocal.SetValue(i, v);
            sum += v;
        }

        // 第三步：归一化
        float invSum = 1.0f / sum;

        // BUG: Off-by-one 错误——循环条件使用 <= 而非 <。
        // 当 i == len 时，SetValue 写入超出已分配缓冲区一个元素。
        // 这会覆盖 SRAM 中的相邻数据（另一个张量的数据、
        // 队列元数据等），且无错误或警告。
        for (int32_t i = 0; i <= len; i++) {  // 应为 i < len
            yLocal.SetValue(i, yLocal.GetValue(i) * invSum);  // i==len 时越界
        }

        outQueue.EnQue<float>(yLocal);
        inQueue.FreeTensor(xLocal);
    }
    // ...
};

Rust — 安全：

#![allow(unused)]
#![feature(no_core)]
#![no_std]
#![no_core]

fn main() {
/// 传给每个向量操作的计数 `n` 与分配缓冲区时使用的值相同。
/// 没有可能偏移的独立循环变量。没有逐元素索引意味着没有 off-by-one。
#[ascend_std::aiv_kernel]
pub unsafe fn softmax(input: *const f32, output: *mut f32, len: *const u32) {
    unsafe {
        let n = *len;
        let buf_in = ascend_std::ascend_buf_alloc(n);
        let buf_out = ascend_std::ascend_buf_alloc(n);
        let buf_work = ascend_std::ascend_buf_alloc(n);

        ascend_std::ascend_buf_load_f32(buf_in, input, n);
        ascend_std::ascend_pipe_barrier();

        // softmax_f32 对整个 `n` 元素缓冲区操作。
        // 没有循环索引、没有 GetValue(i)、没有 SetValue(i, v)。
        // 计数 `n` 与 ascend_buf_alloc 中使用的值相同——
        // 分配和操作天然一致。
        ascend_std::kernel_ops::softmax_f32(buf_out, buf_in, buf_work, n);

        ascend_std::ascend_pipe_barrier();
        ascend_std::ascend_buf_store_f32(output, buf_out, n);
    }
}
}

核心洞见： C++ API 暴露了无边界检查的 GetValue(i)/SetValue(i, v)——off-by-one 错误的经典来源。Rust 的 Buffer-ID API 使用显式计数参数对整个缓冲区操作，完全消除了逐元素索引。

6.3 释放后使用 LocalTensor

AscendC 要求手动调用 FreeTensor() 将 SRAM 缓冲区归还到队列的空闲池。调用 FreeTensor() 后，LocalTensor 句柄在 C++ 类型层面仍然有效——它仍持有原始缓冲区地址。任何后续的 GetValue() 或 SetValue() 都能编译并运行，但读写的内存可能已被重新分配给其他张量。

C++ — 存在漏洞：

#include "kernel_operator.h"

class KernelVecAddUAF {
    // ...
    __aicore__ inline void Compute(int32_t len) {
        AscendC::LocalTensor<half> xLocal = inQueueX.DeQue<half>();
        AscendC::LocalTensor<half> yLocal = inQueueY.DeQue<half>();
        AscendC::LocalTensor<half> zLocal = outQueueZ.AllocTensor<half>();

        AscendC::Add(zLocal, xLocal, yLocal, len);

        // 将缓冲区归还到空闲池
        inQueueX.FreeTensor(xLocal);
        inQueueY.FreeTensor(yLocal);

        // BUG: xLocal 已在上面被释放，但 C++ 句柄仍能编译。
        // SRAM 区域已归还到 inQueueX 的空闲列表。
        // 在多 tile 内核中，此缓冲区可能已被下一次迭代的
        // AllocTensor() 重新分配。读取返回过期或损坏的数据。
        half check = xLocal.GetValue(0);  // 释放后使用！

        // 过期值可能导致错误的控制流决策
        if ((float)check > 100.0f) {
            AscendC::Muls(zLocal, zLocal, (half)0.5f, len);  // 基于垃圾数据
        }

        outQueueZ.EnQue<half>(zLocal);
    }
    // ...
};

Rust — 安全：

#![allow(unused)]
#![feature(no_core)]
#![no_std]
#![no_core]

fn main() {
/// buf_x 是一个类型化的 UbBuf ID——它永远不会失效。
/// 对比 C++ 中 FreeTensor(xLocal) 使缓冲区失效，
/// 但 xLocal.GetValue(0) 仍能编译并访问已释放的 SRAM。
#[ascend_std::aiv_kernel]
pub unsafe fn vec_add(x: *const u16, y: *const u16, z: *mut u16, len: *const u32) {
    unsafe {
        let n = *len;
        let block_idx = ascend_std::get_block_idx() as u32;
        let base = block_idx * n;

        let tile_size = 256u32;
        let buf_x = ascend_std::ascend_buf_alloc(tile_size);
        let buf_y = ascend_std::ascend_buf_alloc(tile_size);
        let buf_z = ascend_std::ascend_buf_alloc(tile_size);

        let mut offset = 0u32;
        loop {
            if offset >= n { break; }
            let mut len = tile_size;
            if offset + len > n { len = n - offset; }
            let gm_off = (base + offset) as usize;

            ascend_std::ascend_buf_load_f16(buf_x, x.wrapping_add(gm_off), len);
            ascend_std::ascend_buf_load_f16(buf_y, y.wrapping_add(gm_off), len);
            ascend_std::ascend_pipe_barrier();

            ascend_std::ascend_add_f16(buf_z, buf_x, buf_y, len);
            ascend_std::ascend_pipe_barrier();

            // 无需 FreeTensor。buf_x、buf_y、buf_z 仍然有效。
            // 相同的 Buffer ID 在下一 tile 迭代中复用。
            ascend_std::ascend_buf_store_f16(z.wrapping_add(gm_off), buf_z, len);
            offset = offset + tile_size;
        }
        // 内核返回。所有缓冲区隐式释放。
    }
}
}

核心洞见： C++ 的 LocalTensor 句柄在 FreeTensor() 之后在语法上仍然有效——编译器无法区分已释放和存活的句柄。在 Rust 中，Buffer ID 是 #[repr(transparent)] 新类型封装（UbBuf、L1Buf、L0aBuf、L0bBuf、L0cBuf），没有释放操作；“在释放后使用缓冲区“不是一个有意义的概念。新类型还防止将缓冲区传递到错误的存储层级——例如，将 L0aBuf 传递给期望 UbBuf 的向量操作会导致编译错误。

6.4 缺失流水线同步

昇腾 NPU 并发执行 DMA（MTE2/MTE3）、向量（V）和标量（S）流水线。在 DMA 加载和后续向量操作之间需要 pipe_barrier() 来确保数据确实已到达本地 SRAM。忘记此屏障是最常见的 NPU 漏洞——内核正常编译和运行，但产生静默的错误结果。

C++ — 存在漏洞：

#include "kernel_operator.h"

class KernelSigmoidNoSync {
    // ...
    __aicore__ inline void CopyIn(int32_t offset, int32_t len) {
        AscendC::LocalTensor<float> xLocal = inQueue.AllocTensor<float>();
        AscendC::DataCopy(xLocal, inputGm[offset], len);
        // BUG: DMA 加载和 EnQue 之间缺少 pipe_barrier()。
        // EnQue 只是将张量标记为队列中"可用"，
        // 但不保证 DMA 传输已完成。
        // 如果 DMA 流水线 (MTE2) 比标量流水线 (S) 慢，
        // 后续的 DeQue + 向量操作将读取过期的 SRAM 数据。
        inQueue.EnQue(xLocal);
    }

    __aicore__ inline void Compute(int32_t len) {
        AscendC::LocalTensor<float> xLocal = inQueue.DeQue<float>();
        AscendC::LocalTensor<float> yLocal = outQueue.AllocTensor<float>();

        // Sigmoid = 1 / (1 + exp(-x))
        // 每个向量操作都可能在 DMA 加载完成之前执行，
        // 读取未初始化或过期的 SRAM 数据。
        AscendC::Muls(yLocal, xLocal, -1.0f, len);       // -x（过期数据？）
        AscendC::Exp(yLocal, yLocal, len);                // exp(-x)
        AscendC::Adds(yLocal, yLocal, 1.0f, len);         // 1 + exp(-x)
        AscendC::Reciprocal(yLocal, yLocal, len);          // 1 / (1 + exp(-x))

        outQueue.EnQue<float>(yLocal);
        inQueue.FreeTensor(xLocal);
    }
    // ...
};

Rust — 安全：

#![allow(unused)]
#![feature(no_core)]
#![no_std]
#![no_core]

fn main() {
/// DMA 加载和计算之间的 pipe_barrier() 是显式且可见的。
/// sigmoid_f32 组合算子在其四个步骤（muls → exp → adds → reciprocal）
/// 之间包含所有内部屏障。
#[ascend_std::aiv_kernel]
pub unsafe fn sigmoid(input: *const f32, output: *mut f32, len: *const u32) {
    unsafe {
        let n = *len;
        let buf_in = ascend_std::ascend_buf_alloc(n);
        let buf_out = ascend_std::ascend_buf_alloc(n);

        // 从 GM 加载数据到 UB
        ascend_std::ascend_buf_load_f32(buf_in, input, n);

        // 显式屏障：保证 DMA 加载完成后才有向量操作读取 buf_in。
        ascend_std::ascend_pipe_barrier();

        // sigmoid_f32 是一个组合算子，内部执行：
        //   muls(-1) → pipe_barrier → exp → pipe_barrier →
        //   adds(1) → pipe_barrier → reciprocal
        // 所有内部屏障已包含——不可能遗忘。
        ascend_std::kernel_ops::sigmoid_f32(buf_out, buf_in, n);

        // 显式屏障：保证向量计算完成后才有 DMA 存储读取 buf_out。
        ascend_std::ascend_pipe_barrier();

        // 从 UB 存储数据到 GM
        ascend_std::ascend_buf_store_f32(output, buf_out, n);
    }
}
}

核心洞见： C++ 的队列模型（EnQue/DeQue）给人同步的假象，但实际并不确保 DMA 完成。在 Rust 中，每个屏障都是显式的（ascend_pipe_barrier()），且 kernel_ops 组合算子包含所有内部屏障——程序员不可能在组合操作内部意外遗漏屏障。

6.5 双重释放张量缓冲区

对同一 LocalTensor 调用两次 FreeTensor() 会将同一缓冲区地址两次插入队列的空闲列表。接下来的两次 AllocTensor() 调用都会返回相同的缓冲区，导致两个“不同“的张量别名同一 SRAM 区域。这表现为间歇性的数据损坏，且依赖于 tile 数量。

C++ — 存在漏洞：

#include "kernel_operator.h"

class KernelVecAddDoubleFree {
    // ...
    __aicore__ inline void Compute(int32_t len) {
        AscendC::LocalTensor<half> xLocal = inQueueX.DeQue<half>();
        AscendC::LocalTensor<half> yLocal = inQueueY.DeQue<half>();
        AscendC::LocalTensor<half> zLocal = outQueueZ.AllocTensor<half>();

        AscendC::Add(zLocal, xLocal, yLocal, len);

        inQueueX.FreeTensor(xLocal);
        inQueueY.FreeTensor(yLocal);
        outQueueZ.EnQue<half>(zLocal);

        // BUG: 重构时的复制粘贴错误——FreeTensor 被再次调用。
        // xLocal 的缓冲区现在在 inQueueX 的空闲列表中出现两次。
        // 在接下来的两次 tile 迭代中，AllocTensor 将为两个"不同"的
        // 张量返回相同的缓冲区地址，导致它们相互别名。
        // 一个 tile 的 DMA 加载将静默覆盖另一个 tile 的数据。
        inQueueX.FreeTensor(xLocal);  // 双重释放！损坏空闲列表
    }
    // ...
};

Rust — 安全：

#![allow(unused)]
#![feature(no_core)]
#![no_std]
#![no_core]

fn main() {
/// Buffer ID（buf_x、buf_y、buf_z）分配一次后跨所有 tile 迭代复用。
/// 无需手动生命周期管理意味着没有双重释放。
#[ascend_std::aiv_kernel]
pub unsafe fn vec_add(x: *const u16, y: *const u16, z: *mut u16, len: *const u32) {
    unsafe {
        let n = *len;
        let block_idx = ascend_std::get_block_idx() as u32;
        let base = block_idx * n;
        let tile_size = 256u32;

        // 分配一次缓冲区。这些 ID 在整个内核中有效。
        let buf_x = ascend_std::ascend_buf_alloc(tile_size);
        let buf_y = ascend_std::ascend_buf_alloc(tile_size);
        let buf_z = ascend_std::ascend_buf_alloc(tile_size);

        let mut offset = 0u32;
        loop {
            if offset >= n { break; }
            let mut len = tile_size;
            if offset + len > n { len = n - offset; }
            let gm_off = (base + offset) as usize;

            ascend_std::ascend_buf_load_f16(buf_x, x.wrapping_add(gm_off), len);
            ascend_std::ascend_buf_load_f16(buf_y, y.wrapping_add(gm_off), len);
            ascend_std::ascend_pipe_barrier();

            ascend_std::ascend_add_f16(buf_z, buf_x, buf_y, len);
            ascend_std::ascend_pipe_barrier();

            ascend_std::ascend_buf_store_f16(z.wrapping_add(gm_off), buf_z, len);

            // 这里没有 FreeTensor。即使这一行被复制粘贴重复，
            // 也根本没有可以调用的 free 函数。
            offset = offset + tile_size;
        }
        // 内核返回——所有缓冲区隐式释放。
    }
}
}

核心洞见： 在 C++ 中，FreeTensor() 是一个手动操作，可能被意外重复。在 Rust 中，不存在释放操作——Buffer ID 是类型化的新类型封装（UbBuf、L1Buf 等），在编译期编码存储层级。“双重释放“一个缓冲区 ID 是没有意义的。

6.6 多核偏移量的静默整数溢出

多核内核通过计算 offset = blockIdx * perBlockLen 在 NPU 核心之间分配工作。使用 uint32_t 算术时，此乘法在溢出时静默回绕——例如 8192 * 524288 = 0x100000000 回绕为 0。内核从错误的内存区域读写，可能与另一个 block 的数据产生别名。在 C++ 中，无符号溢出是定义行为（模运算），因此不会产生警告。

C++ — 存在漏洞：

#include "kernel_operator.h"

class KernelVecAddOverflow {
    // ...
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR len_buf) {
        uint32_t perBlockLen = *((__gm__ uint32_t *)len_buf);

        // BUG: 当 blockIdx * perBlockLen > 2^32 时 uint32_t 静默溢出。
        //
        // 示例：8192 个 block，perBlockLen = 524288 (512K 元素)，
        // 总张量大小为 4GB 半精度数据。Block 8192 计算：
        //   offset = 8192 * 524288 = 4294967296 = 0x100000000
        // 但 uint32_t 回绕：offset = 0。此 block 现在与 block 0 的数据别名。
        //
        // C++ 不产生警告——无符号溢出被定义为模运算。
        // 内核静默地读取错误数据。
        uint32_t offset = AscendC::GetBlockIdx() * perBlockLen;

        xGm.SetGlobalBuffer((__gm__ half *)x + offset, perBlockLen);
        yGm.SetGlobalBuffer((__gm__ half *)y + offset, perBlockLen);
        zGm.SetGlobalBuffer((__gm__ half *)z + offset, perBlockLen);
        // ...
    }
    // ...
};

Rust — 安全：

#![allow(unused)]
#![feature(no_core)]
#![no_std]
#![no_core]

fn main() {
/// wrapping_mul 表明此乘法对于大张量可能溢出。
/// 审阅者看到 wrapping_mul 就知道需要检查溢出是否安全。
/// 在 debug 构建中，普通的 `*` 会在溢出时 panic。
#[ascend_std::aiv_kernel]
pub unsafe fn vec_add(x: *const u16, y: *const u16, z: *mut u16, len: *const u32) {
    unsafe {
        let n = *len;
        let block_idx = ascend_std::get_block_idx() as u32;

        // wrapping_mul 使溢出语义显式化。
        // 阅读此行的开发者知道：
        //   1. 此乘法对大输入可能溢出
        //   2. 溢出行为是有意的回绕
        //   3. 这是一个值得审查的潜在正确性问题
        //
        // 在 debug 构建中（CPU 端测试），普通 `*` 会在溢出时 panic：
        //   let offset = block_idx * n;  // debug 模式下溢出会 panic！
        let offset = block_idx.wrapping_mul(n);

        let tile_size = 256u32;
        let buf_x = ascend_std::ascend_buf_alloc(tile_size);
        let buf_y = ascend_std::ascend_buf_alloc(tile_size);
        let buf_z = ascend_std::ascend_buf_alloc(tile_size);

        let mut tile_off = 0u32;
        loop {
            if tile_off >= n { break; }
            let mut len = tile_size;
            if tile_off + len > n { len = n - tile_off; }
            let gm_off = (offset.wrapping_add(tile_off)) as usize;

            ascend_std::ascend_buf_load_f16(buf_x, x.wrapping_add(gm_off), len);
            ascend_std::ascend_buf_load_f16(buf_y, y.wrapping_add(gm_off), len);
            ascend_std::ascend_pipe_barrier();

            ascend_std::ascend_add_f16(buf_z, buf_x, buf_y, len);
            ascend_std::ascend_pipe_barrier();

            ascend_std::ascend_buf_store_f16(z.wrapping_add(gm_off), buf_z, len);
            tile_off = tile_off + tile_size;
        }
    }
}
}

核心洞见： 在 C++ 中，blockIdx * perBlockLen 静默回绕，没有任何迹象表明开发者考虑过溢出。在 Rust 中，wrapping_mul 显式记录了意图，且在 debug 构建中普通的 * 会在溢出时 panic——在代码到达硬件之前即可在开发阶段捕获漏洞。

Keyboard shortcuts

ascend-rs：Rust 内存安全的 NPU 内核编程

6. 内存安全案例研究：AscendC C++ vs ascend-rs

6.1 类型混淆：GM_ADDR 类型擦除

6.2 缓冲区溢出：未检查的张量索引

6.3 释放后使用 LocalTensor

6.4 缺失流水线同步

6.5 双重释放张量缓冲区

6.6 多核偏移量的静默整数溢出