Больше таймеров
This commit is contained in:
@@ -1,6 +1,15 @@
|
||||
#include <cuda_runtime.h>
|
||||
#include <cstdint>
|
||||
#include <cfloat>
|
||||
#include <cstdio>
|
||||
#include <ctime>
|
||||
|
||||
// Monotonic CPU wall-clock timer, in milliseconds.
//
// Reads CLOCK_MONOTONIC via clock_gettime() and converts the seconds and
// nanoseconds fields into a single fractional millisecond value. The clock's
// starting point is unspecified, so only the difference between two readings
// is meaningful (e.g. `get_time_ms() - start`).
//
// Returns: the current monotonic time in milliseconds, or 0.0 if
// clock_gettime() reports an error.
static double get_time_ms() {
    // Zero-initialised so a failed clock_gettime() can never expose
    // uninitialised stack contents to the caller.
    struct timespec ts = {};
    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
        return 0.0;
    }
    // seconds -> ms, plus nanoseconds -> ms; both terms are computed in double.
    return static_cast<double>(ts.tv_sec) * 1000.0
         + static_cast<double>(ts.tv_nsec) / 1000000.0;
}
|
||||
|
||||
// Структуры данных (должны совпадать с C++ кодом)
|
||||
struct GpuRecord {
|
||||
@@ -27,6 +36,10 @@ extern "C" int gpu_is_available() {
|
||||
int n = 0;
|
||||
cudaError_t err = cudaGetDeviceCount(&n);
|
||||
if (err != cudaSuccess) return 0;
|
||||
if (n > 0) {
|
||||
// Инициализируем CUDA контекст заранее (cudaFree(0) форсирует инициализацию)
|
||||
cudaFree(0);
|
||||
}
|
||||
return (n > 0) ? 1 : 0;
|
||||
}
|
||||
|
||||
@@ -89,7 +102,33 @@ extern "C" int gpu_aggregate_days(
|
||||
int num_days,
|
||||
GpuDayStats* h_out_stats)
|
||||
{
|
||||
// Выделяем память на GPU
|
||||
double cpu_total_start = get_time_ms();
|
||||
|
||||
// === Создаём CUDA события для измерения времени ===
|
||||
double cpu_event_create_start = get_time_ms();
|
||||
|
||||
cudaEvent_t start_malloc, stop_malloc;
|
||||
cudaEvent_t start_transfer, stop_transfer;
|
||||
cudaEvent_t start_kernel, stop_kernel;
|
||||
cudaEvent_t start_copy_back, stop_copy_back;
|
||||
cudaEvent_t start_free, stop_free;
|
||||
|
||||
cudaEventCreate(&start_malloc);
|
||||
cudaEventCreate(&stop_malloc);
|
||||
cudaEventCreate(&start_transfer);
|
||||
cudaEventCreate(&stop_transfer);
|
||||
cudaEventCreate(&start_kernel);
|
||||
cudaEventCreate(&stop_kernel);
|
||||
cudaEventCreate(&start_copy_back);
|
||||
cudaEventCreate(&stop_copy_back);
|
||||
cudaEventCreate(&start_free);
|
||||
cudaEventCreate(&stop_free);
|
||||
|
||||
double cpu_event_create_ms = get_time_ms() - cpu_event_create_start;
|
||||
|
||||
// === ИЗМЕРЕНИЕ cudaMalloc ===
|
||||
cudaEventRecord(start_malloc);
|
||||
|
||||
GpuRecord* d_records = nullptr;
|
||||
int* d_day_offsets = nullptr;
|
||||
int* d_day_counts = nullptr;
|
||||
@@ -113,7 +152,15 @@ extern "C" int gpu_aggregate_days(
|
||||
err = cudaMalloc(&d_out_stats, num_days * sizeof(GpuDayStats));
|
||||
if (err != cudaSuccess) { cudaFree(d_records); cudaFree(d_day_offsets); cudaFree(d_day_counts); cudaFree(d_day_indices); return -5; }
|
||||
|
||||
// Копируем данные на GPU
|
||||
cudaEventRecord(stop_malloc);
|
||||
cudaEventSynchronize(stop_malloc);
|
||||
|
||||
float time_malloc_ms = 0;
|
||||
cudaEventElapsedTime(&time_malloc_ms, start_malloc, stop_malloc);
|
||||
|
||||
// === ИЗМЕРЕНИЕ memcpy H->D ===
|
||||
cudaEventRecord(start_transfer);
|
||||
|
||||
err = cudaMemcpy(d_records, h_records, num_records * sizeof(GpuRecord), cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess) return -10;
|
||||
|
||||
@@ -126,17 +173,24 @@ extern "C" int gpu_aggregate_days(
|
||||
err = cudaMemcpy(d_day_indices, h_day_indices, num_days * sizeof(long long), cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess) return -13;
|
||||
|
||||
// Запускаем kernel: каждый поток обрабатывает один день
|
||||
cudaEventRecord(stop_transfer);
|
||||
cudaEventSynchronize(stop_transfer);
|
||||
|
||||
float time_transfer_ms = 0;
|
||||
cudaEventElapsedTime(&time_transfer_ms, start_transfer, stop_transfer);
|
||||
|
||||
// === ИЗМЕРЕНИЕ kernel ===
|
||||
const int THREADS_PER_BLOCK = 256;
|
||||
int num_blocks = (num_days + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
||||
|
||||
cudaEventRecord(start_kernel);
|
||||
|
||||
aggregate_kernel<<<num_blocks, THREADS_PER_BLOCK>>>(
|
||||
d_records, num_records,
|
||||
d_day_offsets, d_day_counts, d_day_indices,
|
||||
num_days, d_out_stats
|
||||
);
|
||||
|
||||
// Проверяем ошибку запуска kernel
|
||||
err = cudaGetLastError();
|
||||
if (err != cudaSuccess) {
|
||||
cudaFree(d_records);
|
||||
@@ -147,26 +201,65 @@ extern "C" int gpu_aggregate_days(
|
||||
return -7;
|
||||
}
|
||||
|
||||
// Ждём завершения
|
||||
err = cudaDeviceSynchronize();
|
||||
if (err != cudaSuccess) {
|
||||
cudaFree(d_records);
|
||||
cudaFree(d_day_offsets);
|
||||
cudaFree(d_day_counts);
|
||||
cudaFree(d_day_indices);
|
||||
cudaFree(d_out_stats);
|
||||
return -6;
|
||||
}
|
||||
cudaEventRecord(stop_kernel);
|
||||
cudaEventSynchronize(stop_kernel);
|
||||
|
||||
// Копируем результат обратно
|
||||
float time_kernel_ms = 0;
|
||||
cudaEventElapsedTime(&time_kernel_ms, start_kernel, stop_kernel);
|
||||
|
||||
// === ИЗМЕРЕНИЕ memcpy D->H ===
|
||||
cudaEventRecord(start_copy_back);
|
||||
cudaMemcpy(h_out_stats, d_out_stats, num_days * sizeof(GpuDayStats), cudaMemcpyDeviceToHost);
|
||||
cudaEventRecord(stop_copy_back);
|
||||
cudaEventSynchronize(stop_copy_back);
|
||||
|
||||
float time_copy_back_ms = 0;
|
||||
cudaEventElapsedTime(&time_copy_back_ms, start_copy_back, stop_copy_back);
|
||||
|
||||
// === ИЗМЕРЕНИЕ cudaFree ===
|
||||
cudaEventRecord(start_free);
|
||||
|
||||
// Освобождаем память
|
||||
cudaFree(d_records);
|
||||
cudaFree(d_day_offsets);
|
||||
cudaFree(d_day_counts);
|
||||
cudaFree(d_day_indices);
|
||||
cudaFree(d_out_stats);
|
||||
|
||||
cudaEventRecord(stop_free);
|
||||
cudaEventSynchronize(stop_free);
|
||||
|
||||
float time_free_ms = 0;
|
||||
cudaEventElapsedTime(&time_free_ms, start_free, stop_free);
|
||||
|
||||
// Общее время GPU
|
||||
float time_total_ms = time_malloc_ms + time_transfer_ms + time_kernel_ms + time_copy_back_ms + time_free_ms;
|
||||
|
||||
// === Освобождаем события ===
|
||||
double cpu_event_destroy_start = get_time_ms();
|
||||
|
||||
cudaEventDestroy(start_malloc);
|
||||
cudaEventDestroy(stop_malloc);
|
||||
cudaEventDestroy(start_transfer);
|
||||
cudaEventDestroy(stop_transfer);
|
||||
cudaEventDestroy(start_kernel);
|
||||
cudaEventDestroy(stop_kernel);
|
||||
cudaEventDestroy(start_copy_back);
|
||||
cudaEventDestroy(stop_copy_back);
|
||||
cudaEventDestroy(start_free);
|
||||
cudaEventDestroy(stop_free);
|
||||
|
||||
double cpu_event_destroy_ms = get_time_ms() - cpu_event_destroy_start;
|
||||
double cpu_total_ms = get_time_ms() - cpu_total_start;
|
||||
|
||||
// Выводим детальную статистику
|
||||
printf(" GPU Timings (%d records, %d days):\n", num_records, num_days);
|
||||
printf(" cudaMalloc: %7.3f ms\n", time_malloc_ms);
|
||||
printf(" memcpy H->D: %7.3f ms\n", time_transfer_ms);
|
||||
printf(" kernel execution: %7.3f ms\n", time_kernel_ms);
|
||||
printf(" memcpy D->H: %7.3f ms\n", time_copy_back_ms);
|
||||
printf(" cudaFree: %7.3f ms\n", time_free_ms);
|
||||
printf(" GPU TOTAL: %7.3f ms\n", cpu_total_ms);
|
||||
fflush(stdout);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user