Больше таймеров

This commit is contained in:
2025-12-11 10:08:22 +00:00
parent 44f297e55a
commit 7f16a5c17a
3 changed files with 156 additions and 18 deletions

View File

@@ -1,6 +1,15 @@
#include <cuda_runtime.h>
#include <cstdint>
#include <cfloat>
#include <cstdio>
#include <ctime>
// Host-side timer: reads CLOCK_MONOTONIC and returns the reading expressed
// in milliseconds (whole seconds and nanoseconds are both converted to ms).
static double get_time_ms() {
    timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);
    const double ms_from_seconds = now.tv_sec * 1000.0;
    const double ms_from_nanos = now.tv_nsec / 1000000.0;
    return ms_from_seconds + ms_from_nanos;
}
// Структуры данных (должны совпадать с C++ кодом)
struct GpuRecord {
@@ -27,6 +36,10 @@ extern "C" int gpu_is_available() {
// Ask the CUDA runtime how many devices it can see.
int n = 0;
cudaError_t err = cudaGetDeviceCount(&n);
// A failed device-count query is reported to the caller as "not available" (0).
if (err != cudaSuccess) return 0;
if (n > 0) {
// Initialize the CUDA context ahead of time (cudaFree(0) forces initialization).
// NOTE(review): the result of cudaFree(0) is ignored, so a failure during
// this early initialization does not change the value returned below.
cudaFree(0);
}
// 1 when at least one device was counted, 0 otherwise.
return (n > 0) ? 1 : 0;
}
@@ -89,7 +102,33 @@ extern "C" int gpu_aggregate_days(
int num_days,
GpuDayStats* h_out_stats)
{
// Host-side timestamp taken before any CUDA work in this call.
double cpu_total_start = get_time_ms();
// === Create the CUDA events used for timing ===
// One start/stop pair per measured phase: malloc, H->D transfer, kernel,
// D->H copy-back and free.
double cpu_event_create_start = get_time_ms();
cudaEvent_t start_malloc, stop_malloc;
cudaEvent_t start_transfer, stop_transfer;
cudaEvent_t start_kernel, stop_kernel;
cudaEvent_t start_copy_back, stop_copy_back;
cudaEvent_t start_free, stop_free;
// NOTE(review): none of the cudaEventCreate return codes are checked.
cudaEventCreate(&start_malloc);
cudaEventCreate(&stop_malloc);
cudaEventCreate(&start_transfer);
cudaEventCreate(&stop_transfer);
cudaEventCreate(&start_kernel);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&start_copy_back);
cudaEventCreate(&stop_copy_back);
cudaEventCreate(&start_free);
cudaEventCreate(&stop_free);
// Host-side wall time spent creating the ten events.
// NOTE(review): cpu_event_create_ms is not read in any of the lines visible
// in this view — confirm it is reported somewhere or remove it.
double cpu_event_create_ms = get_time_ms() - cpu_event_create_start;
// === MEASUREMENT: cudaMalloc ===
cudaEventRecord(start_malloc);
// Device buffer pointers, initialised to nullptr.
GpuRecord* d_records = nullptr;
int* d_day_offsets = nullptr;
int* d_day_counts = nullptr;
@@ -113,7 +152,15 @@ extern "C" int gpu_aggregate_days(
// Allocate the device output buffer: one GpuDayStats per day.
err = cudaMalloc(&d_out_stats, num_days * sizeof(GpuDayStats));
// On failure release the other device buffers and return -5.
// NOTE(review): the timing events created at the top of the function are not
// destroyed on this early-return path.
if (err != cudaSuccess) { cudaFree(d_records); cudaFree(d_day_offsets); cudaFree(d_day_counts); cudaFree(d_day_indices); return -5; }
// Close the cudaMalloc timing interval and read it back in milliseconds.
cudaEventRecord(stop_malloc);
cudaEventSynchronize(stop_malloc);
float time_malloc_ms = 0;
cudaEventElapsedTime(&time_malloc_ms, start_malloc, stop_malloc);
// === MEASUREMENT: memcpy H->D ===
cudaEventRecord(start_transfer);
// Copy the input records from host to device.
err = cudaMemcpy(d_records, h_records, num_records * sizeof(GpuRecord), cudaMemcpyHostToDevice);
// NOTE(review): unlike the -5 path above, this return frees neither the
// device buffers nor the timing events.
if (err != cudaSuccess) return -10;
@@ -126,17 +173,24 @@ extern "C" int gpu_aggregate_days(
// Copy the per-day indices (num_days elements of sizeof(long long)) to the device.
err = cudaMemcpy(d_day_indices, h_day_indices, num_days * sizeof(long long), cudaMemcpyHostToDevice);
// NOTE(review): this return releases neither device buffers nor timing events.
if (err != cudaSuccess) return -13;
// Close the H->D transfer timing interval and read it back in milliseconds.
cudaEventRecord(stop_transfer);
cudaEventSynchronize(stop_transfer);
float time_transfer_ms = 0;
cudaEventElapsedTime(&time_transfer_ms, start_transfer, stop_transfer);
// === MEASUREMENT: kernel ===
// Launch configuration: 256 threads per block, block count is num_days
// rounded up to whole blocks (the original comment states that each thread
// handles one day; the kernel body is not visible here).
// NOTE(review): with num_days == 0 this yields num_blocks == 0 — presumably
// an invalid launch configuration; confirm callers never pass zero days.
const int THREADS_PER_BLOCK = 256;
int num_blocks = (num_days + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
cudaEventRecord(start_kernel);
aggregate_kernel<<<num_blocks, THREADS_PER_BLOCK>>>(
d_records, num_records,
d_day_offsets, d_day_counts, d_day_indices,
num_days, d_out_stats
);
// Check for a kernel launch error.
err = cudaGetLastError();
if (err != cudaSuccess) {
cudaFree(d_records);
@@ -147,26 +201,65 @@ extern "C" int gpu_aggregate_days(
return -7;
}
// Wait for the kernel to finish; an error here frees the device buffers and
// returns -6.
// NOTE(review): the timing events are not destroyed on this early-return path.
err = cudaDeviceSynchronize();
if (err != cudaSuccess) {
cudaFree(d_records);
cudaFree(d_day_offsets);
cudaFree(d_day_counts);
cudaFree(d_day_indices);
cudaFree(d_out_stats);
return -6;
}
// Close the kernel timing interval (recorded after the device synchronize
// above) and read it back in milliseconds.
cudaEventRecord(stop_kernel);
cudaEventSynchronize(stop_kernel);
float time_kernel_ms = 0;
cudaEventElapsedTime(&time_kernel_ms, start_kernel, stop_kernel);
// === MEASUREMENT: memcpy D->H ===
cudaEventRecord(start_copy_back);
// Copy the per-day results back to the host.
// NOTE(review): the return value of this cudaMemcpy is ignored, so the
// function still returns 0 if the copy-back fails.
cudaMemcpy(h_out_stats, d_out_stats, num_days * sizeof(GpuDayStats), cudaMemcpyDeviceToHost);
cudaEventRecord(stop_copy_back);
cudaEventSynchronize(stop_copy_back);
float time_copy_back_ms = 0;
cudaEventElapsedTime(&time_copy_back_ms, start_copy_back, stop_copy_back);
// === MEASUREMENT: cudaFree ===
cudaEventRecord(start_free);
// Release the device buffers.
cudaFree(d_records);
cudaFree(d_day_offsets);
cudaFree(d_day_counts);
cudaFree(d_day_indices);
cudaFree(d_out_stats);
cudaEventRecord(stop_free);
cudaEventSynchronize(stop_free);
float time_free_ms = 0;
cudaEventElapsedTime(&time_free_ms, start_free, stop_free);
// Sum of the five event-measured phases.
// NOTE(review): time_total_ms is computed but never used below; the
// "GPU TOTAL" line prints cpu_total_ms instead.
float time_total_ms = time_malloc_ms + time_transfer_ms + time_kernel_ms + time_copy_back_ms + time_free_ms;
// === Destroy the timing events ===
double cpu_event_destroy_start = get_time_ms();
cudaEventDestroy(start_malloc);
cudaEventDestroy(stop_malloc);
cudaEventDestroy(start_transfer);
cudaEventDestroy(stop_transfer);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaEventDestroy(start_copy_back);
cudaEventDestroy(stop_copy_back);
cudaEventDestroy(start_free);
cudaEventDestroy(stop_free);
// NOTE(review): cpu_event_destroy_ms is computed but not read afterwards.
double cpu_event_destroy_ms = get_time_ms() - cpu_event_destroy_start;
// Host-side wall time for the whole call, measured from cpu_total_start.
double cpu_total_ms = get_time_ms() - cpu_total_start;
// Print the detailed timing breakdown to stdout.
printf(" GPU Timings (%d records, %d days):\n", num_records, num_days);
printf(" cudaMalloc: %7.3f ms\n", time_malloc_ms);
printf(" memcpy H->D: %7.3f ms\n", time_transfer_ms);
printf(" kernel execution: %7.3f ms\n", time_kernel_ms);
printf(" memcpy D->H: %7.3f ms\n", time_copy_back_ms);
printf(" cudaFree: %7.3f ms\n", time_free_ms);
// NOTE(review): labelled "GPU TOTAL" but the value is the host-side
// cpu_total_ms, which also includes event creation/destruction time.
printf(" GPU TOTAL: %7.3f ms\n", cpu_total_ms);
fflush(stdout);
// 0 signals success to the caller.
return 0;
}