Test kernels
@@ -199,6 +199,96 @@ __global__ void aggregate_periods_simple_kernel(
    out_stats[period_idx] = stats;
}


static __device__ __forceinline__ double warp_reduce_sum(double v) {
    for (int offset = 16; offset > 0; offset >>= 1)
        v += __shfl_down_sync(0xffffffff, v, offset);
    return v;
}
static __device__ __forceinline__ double warp_reduce_min(double v) {
    for (int offset = 16; offset > 0; offset >>= 1)
        v = fmin(v, __shfl_down_sync(0xffffffff, v, offset));
    return v;
}
static __device__ __forceinline__ double warp_reduce_max(double v) {
    for (int offset = 16; offset > 0; offset >>= 1)
        v = fmax(v, __shfl_down_sync(0xffffffff, v, offset));
    return v;
}
static __device__ __forceinline__ int warp_reduce_sum_int(int v) {
    for (int offset = 16; offset > 0; offset >>= 1)
        v += __shfl_down_sync(0xffffffff, v, offset);
    return v;
}

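// Added sketch (not part of the original commit): a minimal self-check for the helpers
// above. Launched as warp_reduce_selftest_kernel<<<1, 32>>>(d_out) with a single full
// warp, lane 0 should observe 0 + 1 + ... + 31 = 496. The kernel name is illustrative.
__global__ void warp_reduce_selftest_kernel(double* out)
{
    int lane = threadIdx.x & 31;
    double s = warp_reduce_sum((double)lane);
    if (lane == 0) out[0] = s; // expected: 496.0
}
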
__global__ void aggregate_periods_warp_kernel(
    const double* __restrict__ open,
    const double* __restrict__ high,
    const double* __restrict__ low,
    const double* __restrict__ close,
    const int64_t* __restrict__ unique_periods,
    const int* __restrict__ offsets,
    const int* __restrict__ counts,
    int num_periods,
    GpuPeriodStats* __restrict__ out_stats)
{
    int global_thread = blockIdx.x * blockDim.x + threadIdx.x;
    int warp_id = global_thread >> 5;   // /32
    int lane = threadIdx.x & 31;        // %32
    int period_idx = warp_id;

    if (period_idx >= num_periods) return;

    int offset = offsets[period_idx];
    int count = counts[period_idx];

    // Per-lane local accumulators
    double sum_avg = 0.0;
    double omin = DBL_MAX, omax = -DBL_MAX;
    double cmin = DBL_MAX, cmax = -DBL_MAX;
    int local_n = 0;

    // Each lane takes i = lane, lane+32, lane+64, ...
    for (int i = lane; i < count; i += 32) {
        int tick = offset + i;
        double avg = 0.5 * (low[tick] + high[tick]);
        sum_avg += avg;

        double o = open[tick];
        double c = close[tick];

        omin = fmin(omin, o);
        omax = fmax(omax, o);
        cmin = fmin(cmin, c);
        cmax = fmax(cmax, c);

        local_n += 1;
    }

    // Warp-level reduction
    sum_avg = warp_reduce_sum(sum_avg);
    omin = warp_reduce_min(omin);
    omax = warp_reduce_max(omax);
    cmin = warp_reduce_min(cmin);
    cmax = warp_reduce_max(cmax);
    int n = warp_reduce_sum_int(local_n); // should equal count, but this is more robust

    // Lane 0 writes the result
    if (lane == 0) {
        GpuPeriodStats s;
        s.period = unique_periods[period_idx];
        s.avg = (n > 0) ? (sum_avg / (double)n) : 0.0;
        s.open_min = omin;
        s.open_max = omax;
        s.close_min = cmin;
        s.close_max = cmax;
        s.count = n;
        out_stats[period_idx] = s;
    }
}
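// Note (added commentary, not part of the original commit): the reductions above use
// the full mask 0xffffffff, so every lane of a warp that reaches them must participate.
// That holds here because blockDim.x is a multiple of 32 and period_idx is identical
// for all 32 lanes of a warp, so the early return is taken by whole warps at once.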

// ============================================================================
// GPU availability check
// ============================================================================
@@ -208,7 +298,7 @@ extern "C" int gpu_is_available() {
    cudaError_t err = cudaGetDeviceCount(&n);
    if (err != cudaSuccess) return 0;
    if (n > 0) {
        cudaFree(0); // Force context initialization
        // cudaFree(0); // Force context initialization
    }
    return (n > 0) ? 1 : 0;
}
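For quick manual testing, a minimal host-side check of the exported probe could look like the sketch below; the main function here is illustrative and not part of the commit.

#include <cstdio>

extern "C" int gpu_is_available();

int main() {
    // Expect 1 when a CUDA device is present and the driver responds, 0 otherwise.
    std::printf("gpu_is_available() = %d\n", gpu_is_available());
    return 0;
}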
@@ -352,14 +442,28 @@ extern "C" int gpu_aggregate_periods(
    GpuPeriodStats* d_out_stats = nullptr;
    CUDA_CHECK(cudaMalloc(&d_out_stats, num_periods * sizeof(GpuPeriodStats)));

    // Use the simple kernel (one thread per period),
    // since a period usually contains only a few ticks
    int agg_blocks = (num_periods + BLOCK_SIZE - 1) / BLOCK_SIZE;

    aggregate_periods_simple_kernel<<<agg_blocks, BLOCK_SIZE>>>(
    // int agg_blocks = (num_periods + BLOCK_SIZE - 1) / BLOCK_SIZE;
    // aggregate_periods_simple_kernel<<<agg_blocks, BLOCK_SIZE>>>(
    //     d_open, d_high, d_low, d_close,
    //     d_unique_periods, d_offsets, d_counts,
    //     num_periods, d_out_stats);

    // aggregate_periods_kernel<<<num_periods, BLOCK_SIZE>>>(
    //     d_open, d_high, d_low, d_close,
    //     d_unique_periods, d_offsets, d_counts,
    //     num_periods, d_out_stats);

    int warps_per_block = BLOCK_SIZE / 32; // 8
    int num_blocks1 = (num_periods + warps_per_block - 1) / warps_per_block;

    aggregate_periods_warp_kernel<<<num_blocks1, BLOCK_SIZE>>>(
        d_open, d_high, d_low, d_close,
        d_unique_periods, d_offsets, d_counts,
        num_periods, d_out_stats);

    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
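Because the commit swaps in alternative aggregation kernels, a host-side reference such as the sketch below could be used to validate either kernel's output against the CPU. It assumes the GpuPeriodStats layout used above (defined elsewhere in this file); the function name is invented for illustration.

#include <cfloat>
#include <cmath>
#include <cstdint>

static void cpu_aggregate_periods_reference(
    const double* open, const double* high,
    const double* low, const double* close,
    const int64_t* unique_periods, const int* offsets, const int* counts,
    int num_periods, GpuPeriodStats* out_stats)
{
    for (int p = 0; p < num_periods; ++p) {
        double sum_avg = 0.0;
        double omin = DBL_MAX, omax = -DBL_MAX;
        double cmin = DBL_MAX, cmax = -DBL_MAX;
        int n = counts[p];
        for (int i = 0; i < n; ++i) {
            int t = offsets[p] + i;
            sum_avg += 0.5 * (low[t] + high[t]); // same mid-price average as the kernels
            omin = fmin(omin, open[t]);  omax = fmax(omax, open[t]);
            cmin = fmin(cmin, close[t]); cmax = fmax(cmax, close[t]);
        }
        GpuPeriodStats s;
        s.period    = unique_periods[p];
        s.avg       = (n > 0) ? sum_avg / (double)n : 0.0;
        s.open_min  = omin;  s.open_max  = omax;
        s.close_min = cmin;  s.close_max = cmax;
        s.count     = n;
        out_stats[p] = s;
    }
}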