# 使用 ISO C++语言并行在 GPU 上进行利润和损失建模

## 利润和损失建模说明

$dS_t = rS_t dt + \sigma_R S_t dW_t^Q$

$\text{P\&L}_j = \frac{1}{2}\Gamma_j dS_j^2 + \theta_j dt \equiv \frac{1}{2}\Gamma_j S_j^2(\sigma_R^2 - \sigma_H^2)dt$

$\text{P\&L} = \sum_{j=1}^{N} \text{P\&L}_j$

## 并行 P&L 模拟

// Accumulates the P&L of every option over every simulated path.
// Paths and time steps are walked one after another on the host; only the
// loop over options is handed to a parallel algorithm.
//
// paths        - 2D view read as paths(path, step)
// Strikes, Maturities, Volatilities
//              - per-option inputs, indexed by option number
// RiskFreeRate - forwarded unchanged to BlackScholesBody
// pnl          - per-option accumulator, updated in place; its size is used
//                as the number of options
// dt           - time increment between two consecutive steps
//
// BlackScholesBody, CALL, GAMMA and THETA are defined elsewhere; the first
// argument is presumably the output slot for the requested Greek.
void calculate_pnl_paths_sequential(stdex::mdspan<const double, stdex::dextents<size_t,2>> paths,
                                    std::span<const double>Strikes,
                                    std::span<const double>Maturities,
                                    std::span<const double>Volatilities,
                                    const double RiskFreeRate,
                                    std::span<double>pnl,
                                    const double dt)
{
    const int num_paths = paths.extent(0);
    const int horizon   = paths.extent(1);

    // The option count comes from the pnl array; enumerate 0 .. optN - 1
    const int optN = pnl.size();
    auto opts = std::views::iota(0, optN);

    // Note - In this version path remains in CPU memory
    // Note - Also that when built for the GPU this will result in
    // num_paths * (horizon - 1) kernel launches, one per std::transform call
    for (int path = 0; path < num_paths; ++path)
    {
        // Steps run from 1 to horizon - 1 so that step - 1 is always valid
        for (int step = 1; step < horizon; ++step)
        {
            const double s      = paths(path, step);
            const double s_prev = paths(path, step - 1);

            // Squared price move over this step
            double ds2 = s - s_prev;
            ds2 *= ds2;

            // Time already elapsed at the start of this step
            const double elapsed = std::max(dt*(step-1), 0.0);

            // Update the running P&L of every option for this (path, step)
            std::transform(std::execution::par_unseq, opts.begin(), opts.end(),
                           pnl.begin(),
                           [=](int opt)
                           {
                               const double time_to_maturity = Maturities[opt] - elapsed;

                               double gamma = 0.0;
                               BlackScholesBody(gamma,
                                                s_prev,
                                                Strikes[opt],
                                                time_to_maturity,
                                                RiskFreeRate,
                                                Volatilities[opt],
                                                CALL,
                                                GAMMA);

                               double theta = 0.0;
                               BlackScholesBody(theta,
                                                s_prev,
                                                Strikes[opt],
                                                time_to_maturity,
                                                RiskFreeRate,
                                                Volatilities[opt],
                                                CALL,
                                                THETA);

                               // P&L = 0.5 * Gamma * (dS)^2 + Theta * dt
                               return pnl[opt] + 0.5 * gamma * ds2 + (theta*dt);
                           });
        }
    }
}


## 提高并行性以提高性能

• 启动延迟：启动 GPU 内核的成本。
• 同步：并行算法相对于 CPU 是同步的，这意味着程序必须等待内核完成，然后再继续并启动下一个内核。

// Accumulates the P&L of every option over every simulated path using a
// single parallel algorithm invocation over the flattened (path, option)
// index space.
//
// paths        - 2D view read as paths(path, step)
// Strikes, Maturities, Volatilities
//              - per-option inputs, indexed by option number
// RiskFreeRate - forwarded unchanged to BlackScholesBody
// pnl          - per-option accumulator, updated in place with atomic adds;
//                its size is used as the number of options
// dt           - time increment between two consecutive steps
//
// BlackScholesBody, CALL, GAMMA and THETA are defined elsewhere; the first
// argument is presumably the output slot for the requested Greek.
void calculate_pnl_paths_parallel(stdex::mdspan<const double,
                                  stdex::dextents<size_t,2>> paths,
                                  std::span<const double>Strikes,
                                  std::span<const double>Maturities,
                                  std::span<const double>Volatilities,
                                  const double RiskFreeRate,
                                  std::span<double>pnl,
                                  const double dt)
{
    int num_paths = paths.extent(0);
    int horizon   = paths.extent(1);
    int optN      = pnl.size();

    // Nothing to accumulate without options, paths, or at least one step
    // transition; this also avoids building an iota range whose end lies
    // before its start.
    if (optN <= 0 || num_paths <= 0 || horizon < 2)
        return;

    // Create an iota to enumerate the flattened index space of
    // options and paths
    // NOTE(review): optN * num_paths is computed in int; confirm the product
    // fits for the problem sizes used by callers.
    auto opts = std::views::iota(0,optN*num_paths);

    std::for_each(std::execution::par_unseq, opts.begin(), opts.end(),
        [=](int idx)
        {
            // Extract path and option number from flat index
            // C++23 cartesian_product would remove the need for below
            int path = idx/optN;
            int opt  = idx%optN;

            // Several iterations (one per path) update the same pnl[opt];
            // atomic_ref prevents a race condition on elements of pnl array.
            std::atomic_ref<double> elem(pnl[opt]);

            // Walk the path from 1 to (horizon - 1) in steps of 1
            auto path_itr = std::views::iota(1,horizon);

            // transform_reduce applies the lambda to every step of this path
            // and sums the per-step contributions for the current option.
            double pnl_temp = std::transform_reduce(path_itr.begin(), path_itr.end(),
                0.0, std::plus{},
                [=](int step) {
                    double gamma = 0.0, theta = 0.0;
                    double s      = paths(path,step);
                    double s_prev = paths(path,step-1);
                    double ds2 = s - s_prev;
                    ds2 *= ds2;
                    // Options in the grid age as the simulation progresses
                    // along the path
                    double time_to_maturity = Maturities[opt] -
                                              std::max(dt*(step-1),0.0);
                    BlackScholesBody(gamma,
                                     s_prev,
                                     Strikes[opt],
                                     time_to_maturity,
                                     RiskFreeRate,
                                     Volatilities[opt],
                                     CALL,
                                     GAMMA);
                    BlackScholesBody(theta,
                                     s_prev,
                                     Strikes[opt],
                                     time_to_maturity,
                                     RiskFreeRate,
                                     Volatilities[opt],
                                     CALL,
                                     THETA);
                    // P&L = 0.5 * Gamma * (dS)^2 + Theta * dt
                    return 0.5 * gamma * ds2 + (theta*dt);
                });

            // Accumulate this path's contribution on the atomic_ref to the
            // pnl array. Relaxed ordering is enough: only the atomicity of
            // the addition matters, and the result is read after for_each
            // has returned.
            elem.fetch_add(pnl_temp, std::memory_order_relaxed);
        });
}


std::for_each 算法用于在路径和期权之间进行迭代。在每次迭代中，std::transform_reduce 算法用于针对当前期权沿其路径逐步遍历，将各时间步的利润和损失相加并返回该结果。然后，每个中间结果都会以原子方式累加到 P&L 数组中。