NVML查询显卡信息

首页 > 代码库 > NVML查询显卡信息

2024-09-22 11:42:27 218人阅读

前段时间做英伟达硬解的时候，显卡总是莫名挂掉，后来发现是因为显卡温度过高而掉线。这几天发现CUDA中自带NVML工具包，可以用来查看显卡信息，nvidia-smi也是基于这个工具包实现的。

使用的CUDA版本为CUDA 8.0 。

1.给程序添加NVML

安装CUDA之后可以找到如下：

技术分享

图1.NVML的例子

这里面包含的是NVML的一个例子。我的系统是64位的，可以找到NVML的lib和头文件如下：

技术分享

图2.NVML的lib文件

技术分享

图3.NVML头文件

在工程中包含NVML。我是新建的CUDA 8.0 Runtime工程，因为NVML包含在CUDA中，建CUDA 8.0 Runtime工程可以省去CUDA的配置工作。工程建立方法参见《VS2013 VC++的.cpp文件调用CUDA的.cu文件中的函数》一文。CUDA 8.0为默认安装，系统为win10 64位。

在程序中直接包含NVML的头文件和lib文件即可：

#include "nvml.h"
#pragma comment(lib,"nvml.lib")

注意64位系统应该建立x64工程，因为在安装的CUDA中没有win32的nvml.lib。

2.NVML查询显卡信息

常用函数：

·nvmlInit()函数初始化NVML；

·nvmlDeviceGetCount(unsigned int *deviceCount)函数可以获得显卡数；

·nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device)获取设备；

·nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length)查询设备的名称；

·nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci)获取PCI信息，对这个函数的重要性，例子中是这么说的

// pci.busId is very useful to know which device physically you're talking to
// Using PCI identifier you can also match nvmlDevice handle to CUDA device.

·nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode)得到显卡当前所处的模式，模式有以下几种：

// Compute modes of a device: the value reported by nvmlDeviceGetComputeMode()
// and accepted by nvmlDeviceSetComputeMode().
typedef enum nvmlComputeMode_enum
{
    NVML_COMPUTEMODE_DEFAULT           = 0, //!< Default compute mode -- multiple contexts per device
    NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Support Removed
    NVML_COMPUTEMODE_PROHIBITED        = 2, //!< Compute-prohibited mode -- no contexts per device
    NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time

    // Keep this last (it then equals the number of modes listed above)
    NVML_COMPUTEMODE_COUNT
} nvmlComputeMode_t;

·nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode)可以修改显卡的模式；

·nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp)查询温度阈值，具体有两种:

// Selects which temperature threshold nvmlDeviceGetTemperatureThreshold() returns.
typedef enum nvmlTemperatureThresholds_enum
{
    NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0,    // Temperature at which the GPU will shut down for HW protection
    NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1,    // Temperature at which the GPU will begin slowdown
    // Keep this last (it then equals the number of threshold kinds listed above)
    NVML_TEMPERATURE_THRESHOLD_COUNT
} nvmlTemperatureThresholds_t;

当温度达到NVML_TEMPERATURE_THRESHOLD_SHUTDOWN 参数获取的温度时，显卡将自动关闭以保护硬件；当温度达到NVML_TEMPERATURE_THRESHOLD_SLOWDOWN参数获取的温度时，显卡的性能将下降。

·nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp)获取显卡当前温度；

·nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization)获取设备的使用率（原注释：Retrieves the current utilization rates for the device's major subsystems。不知道理解错了没有），使用率包括以下：

// Utilization rates filled in by nvmlDeviceGetUtilizationRates().
// Both fields are percentages measured over the past sample period.
typedef struct nvmlUtilization_st
{
unsigned int gpu; //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU
unsigned int memory; //!< Percent of time over the past sample period during which global (device) memory was being read or written
} nvmlUtilization_t;

·nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory) Retrieves the amount of used, free and total memory available on the device, in bytes。

·nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory) Gets Total, Available and Used size of BAR1 memory.（不知道这种与上一种有什么区别，有待后续学习）

·nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos) Get information about processes with a compute context on a device。应该是获取当前在使用显卡的程序信息。

·nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock) Retrieves the maximum clock speeds for the device。包括以下：

// Clock domains that can be passed as the `type` argument of
// nvmlDeviceGetMaxClockInfo() and nvmlDeviceGetClockInfo().
typedef enum nvmlClockType_enum
{
    NVML_CLOCK_GRAPHICS = 0,        //!< Graphics clock domain
    NVML_CLOCK_SM        = 1,        //!< SM clock domain
    NVML_CLOCK_MEM       = 2,        //!< Memory clock domain
    NVML_CLOCK_VIDEO     = 3,        //!< Video encoder/decoder clock domain

    // Keep this last
    NVML_CLOCK_COUNT //!< Count of clock types
} nvmlClockType_t;

·nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock) Retrieves the current clock speeds for the device.上面是获取最大的，这个是获取当前的。

代码示例：

#include "cuda_kernels.h"
#include "nvml.h"
#include <stdio.h>
#include <stdlib.h>
#include <windows.h>
#include <winbase.h>
#include <tlhelp32.h>
#include <psapi.h>

#pragma comment(lib,"kernel32.lib")
#pragma comment(lib,"advapi32.lib")
#pragma comment(lib,"nvml.lib")

// Capacity of the buffer handed to nvmlDeviceGetComputeRunningProcesses().
enum { MAX_COMPUTE_PROCESSES = 999 };

// Returns a printable name for an NVML compute mode ("Unknown" for any
// value that is not one of the four known modes).
const char * convertToComputeModeString(nvmlComputeMode_t mode)
{
    switch (mode)
    {
    case NVML_COMPUTEMODE_DEFAULT:
        return "Default";
    case NVML_COMPUTEMODE_EXCLUSIVE_THREAD:
        return "Exclusive_Thread";
    case NVML_COMPUTEMODE_PROHIBITED:
        return "Prohibited";
    case NVML_COMPUTEMODE_EXCLUSIVE_PROCESS:
        return "Exclusive Process";
    default:
        return "Unknown";
    }
}

// Reads the device's compute mode, tries to switch it to PROHIBITED and, if
// that worked, restores the previous mode.
// Returns false only for failures that should stop the program; "not
// supported" and "no permission" are reported and treated as non-fatal.
//
// NOTE(review): this is called on every polling pass, so with sufficient
// privileges the device's compute mode is flipped once per second. Consider
// running this demo step only once if the program is used as a monitor.
static bool demoComputeModeChange(nvmlDevice_t device, unsigned int index)
{
    nvmlComputeMode_t compute_mode;
    nvmlReturn_t result = nvmlDeviceGetComputeMode(device, &compute_mode);
    if (NVML_ERROR_NOT_SUPPORTED == result)
    {
        printf("\t This is not CUDA capable device\n");
        return true;
    }
    if (NVML_SUCCESS != result)
    {
        printf("Failed to get compute mode for device %u: %s\n", index, nvmlErrorString(result));
        return false;
    }

    // try to change compute mode
    printf("\t Changing device's compute mode from '%s' to '%s'\n",
        convertToComputeModeString(compute_mode),
        convertToComputeModeString(NVML_COMPUTEMODE_PROHIBITED));

    result = nvmlDeviceSetComputeMode(device, NVML_COMPUTEMODE_PROHIBITED);
    if (NVML_ERROR_NO_PERMISSION == result)
    {
        printf("\t\t Need root privileges to do that: %s\n", nvmlErrorString(result));
    }
    else if (NVML_ERROR_NOT_SUPPORTED == result)
    {
        printf("\t\t Compute mode prohibited not supported. You might be running on\n"
            "\t\t windows in WDDM driver model or on non-CUDA capable GPU.\n");
    }
    else if (NVML_SUCCESS != result)
    {
        printf("\t\t Failed to set compute mode for device %u: %s\n", index, nvmlErrorString(result));
        return false;
    }
    else
    {
        printf("\t Restoring device's compute mode back to '%s'\n",
            convertToComputeModeString(compute_mode));
        result = nvmlDeviceSetComputeMode(device, compute_mode);
        if (NVML_SUCCESS != result)
        {
            printf("\t\t Failed to restore compute mode for device %u: %s\n", index, nvmlErrorString(result));
            return false;
        }
    }
    return true;
}

// Prints the shutdown threshold, the slowdown threshold and the current GPU
// temperature. Each query is independent: a failure is reported and skipped.
static void printTemperatures(nvmlDevice_t device, unsigned int index)
{
    printf("----- 温度 ----- \n");

    unsigned int threshold = 0;
    nvmlReturn_t result = nvmlDeviceGetTemperatureThreshold(device, NVML_TEMPERATURE_THRESHOLD_SHUTDOWN, &threshold);
    if (NVML_SUCCESS != result)
        printf("device %u Failed to get NVML_TEMPERATURE_THRESHOLD_SHUTDOWN: %s\n", index, nvmlErrorString(result));
    else
        printf("截止温度： %u 摄氏度  （Temperature at which the GPU will shut down for HW protection）\n", threshold);

    result = nvmlDeviceGetTemperatureThreshold(device, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, &threshold);
    if (NVML_SUCCESS != result)
        printf("device %u Failed NVML_TEMPERATURE_THRESHOLD_SLOWDOWN: %s\n", index, nvmlErrorString(result));
    else
        printf("上限温度： %u 摄氏度  （Temperature at which the GPU will begin slowdown）\n", threshold);

    unsigned int temperature = 0;
    result = nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature);
    if (NVML_SUCCESS != result)
        printf("device %u NVML_TEMPERATURE_GPU Failed: %s\n", index, nvmlErrorString(result));
    else
        printf("当前温度： %u 摄氏度 \n", temperature);
}

// Prints GPU and memory utilization percentages.
static void printUtilization(nvmlDevice_t device, unsigned int index)
{
    nvmlUtilization_t utilization;
    nvmlReturn_t result = nvmlDeviceGetUtilizationRates(device, &utilization);
    if (NVML_SUCCESS != result)
    {
        printf(" device %u nvmlDeviceGetUtilizationRates Failed : %s\n", index, nvmlErrorString(result));
        return;
    }
    printf("----- 使用率 ----- \n");
    // Both fields are unsigned int, so %u (not %lld) is the matching specifier.
    printf("GPU 使用率： %u %% \n", utilization.gpu);
    printf("显存使用率： %u %% \n", utilization.memory);
}

// Prints total / free / used frame-buffer memory in bytes.
static void printFbMemory(nvmlDevice_t device, unsigned int index)
{
    nvmlMemory_t memory;
    nvmlReturn_t result = nvmlDeviceGetMemoryInfo(device, &memory);
    if (NVML_SUCCESS != result)
    {
        printf("device %u nvmlDeviceGetMemoryInfo Failed : %s\n", index, nvmlErrorString(result));
        return;
    }
    printf("------ FB memory ------- \n");
    printf("Total installed FB memory： %llu bytes \n", memory.total);
    printf("Unallocated FB memory： %llu bytes \n", memory.free);
    printf("Allocated FB memory： %llu bytes \n", memory.used);
}

// Prints total / free / used BAR1 memory in bytes.
static void printBar1Memory(nvmlDevice_t device, unsigned int index)
{
    nvmlBAR1Memory_t bar1Memory;
    nvmlReturn_t result = nvmlDeviceGetBAR1MemoryInfo(device, &bar1Memory);
    if (NVML_SUCCESS != result)
    {
        printf("device %u  nvmlDeviceGetBAR1MemoryInfo Failed : %s\n", index, nvmlErrorString(result));
        return;
    }
    printf("------ BAR1 memory ------- \n");
    printf("Total BAR1 memory： %llu bytes \n", bar1Memory.bar1Total);
    printf("Unallocated BAR1 memory： %llu bytes \n", bar1Memory.bar1Free);
    printf("Allocated BAR1 memory： %llu bytes \n", bar1Memory.bar1Used);
}

// Lists the processes that hold a compute context on the device, with the
// PID, the GPU memory in use and the path of the process executable.
static void printComputeProcesses(nvmlDevice_t device, unsigned int index)
{
    // static: keeps this fairly large buffer off the stack.
    static nvmlProcessInfo_t infos[MAX_COMPUTE_PROCESSES];

    // infoCount is an in/out parameter: on input it must hold the capacity of
    // `infos`; on output it holds the number of entries written.
    unsigned int infoCount = MAX_COMPUTE_PROCESSES;
    nvmlReturn_t result = nvmlDeviceGetComputeRunningProcesses(device, &infoCount, infos);
    if (NVML_SUCCESS != result)
    {
        printf("Failed to get ComputeRunningProcesses for device %u: %s\n", index, nvmlErrorString(result));
        return;
    }

    printf("------ Information about running compute processes on the GPU ------- \n");
    for (unsigned int p = 0; p < infoCount; p++)
    {
        printf("PID： %u  显存占用：%llu bytes   ", infos[p].pid, infos[p].usedGpuMemory);

        // Open the process directly by PID. Walking a Toolhelp snapshot is not
        // needed and previously skipped entries because the iterator was
        // shared between GPU processes.
        HANDLE hProcess = OpenProcess(PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, infos[p].pid);
        if (hProcess == NULL)
        {
            printf("\nOpen Process fAiled:%lu\n", (unsigned long)GetLastError());
            continue;
        }

        char strFilePath[MAX_PATH];
        // The ANSI variant is called explicitly because the buffer is char[].
        if (GetModuleFileNameExA(hProcess, NULL, strFilePath, MAX_PATH) != 0)
            printf(" %s\n", strFilePath);
        else
            printf("\nGetModuleFileNameEx failed:%lu\n", (unsigned long)GetLastError());
        CloseHandle(hProcess);
    }
}

// Prints current and maximum clock (MHz) for each clock domain. When the
// maximum cannot be queried it is shown as "N/A" instead of garbage.
static void printClocks(nvmlDevice_t device, unsigned int index)
{
    static const struct
    {
        nvmlClockType_t type;
        const char *name;   // enum name, used in error messages
        const char *label;  // column label, right-aligned to 8 characters
    } domains[] = {
        { NVML_CLOCK_GRAPHICS, "NVML_CLOCK_GRAPHICS", "GRAPHICS" },
        { NVML_CLOCK_SM,       "NVML_CLOCK_SM",       "SM" },
        { NVML_CLOCK_MEM,      "NVML_CLOCK_MEM",      "MEM" },
        { NVML_CLOCK_VIDEO,    "NVML_CLOCK_VIDEO",    "VIDEO" },
    };

    printf("------ Clocks ------- \n");
    for (size_t d = 0; d < sizeof domains / sizeof domains[0]; d++)
    {
        unsigned int max_clock = 0;
        nvmlReturn_t max_result = nvmlDeviceGetMaxClockInfo(device, domains[d].type, &max_clock);
        if (NVML_SUCCESS != max_result)
            printf("Failed to get max %s for device %u: %s\n", domains[d].name, index, nvmlErrorString(max_result));

        unsigned int clock = 0;
        nvmlReturn_t cur_result = nvmlDeviceGetClockInfo(device, domains[d].type, &clock);
        if (NVML_SUCCESS != cur_result)
        {
            printf("Failed to get current %s for device %u: %s\n", domains[d].name, index, nvmlErrorString(cur_result));
            continue;
        }

        printf("%8s： %6u Mhz   max clock ：", domains[d].label, clock);
        if (NVML_SUCCESS == max_result)
            printf("%u   \n", max_clock);
        else
            printf("N/A   \n");
    }
}

// Prints one full report for the device at `index`.
// Returns false when a query that the program cannot continue without
// (handle, name, PCI info, compute mode) fails; the sensor/memory/clock
// sections are best-effort and never cause a false return.
static bool reportDevice(unsigned int index)
{
    nvmlDevice_t device;
    char name[NVML_DEVICE_NAME_BUFFER_SIZE];
    nvmlPciInfo_t pci;

    // Query for device handle to perform operations on a device
    // You can also query device handle by other features like:
    // nvmlDeviceGetHandleBySerial
    // nvmlDeviceGetHandleByPciBusId
    nvmlReturn_t result = nvmlDeviceGetHandleByIndex(index, &device);
    if (NVML_SUCCESS != result)
    {
        printf("Failed to get handle for device %u: %s\n", index, nvmlErrorString(result));
        return false;
    }

    result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
    if (NVML_SUCCESS != result)
    {
        printf("Failed to get name of device %u: %s\n", index, nvmlErrorString(result));
        return false;
    }

    // pci.busId is very useful to know which device physically you're talking to
    // Using PCI identifier you can also match nvmlDevice handle to CUDA device.
    result = nvmlDeviceGetPciInfo(device, &pci);
    if (NVML_SUCCESS != result)
    {
        printf("Failed to get pci info for device %u: %s\n", index, nvmlErrorString(result));
        return false;
    }
    printf("%u. %s [%s]\n", index, name, pci.busId);

    // This is a simple example on how you can modify GPU's state
    if (!demoComputeModeChange(device, index))
        return false;

    printf("\n");
    printTemperatures(device, index);

    printf("\n");
    printUtilization(device, index);

    printf("\n");
    printFbMemory(device, index);

    printf("\n");
    printBar1Memory(device, index);

    printf("\n");
    printComputeProcesses(device, index);

    printf("\n");
    printClocks(device, index);
    return true;
}

// Initializes NVML, then prints a report for every device once per second
// until a fatal NVML error occurs. Returns 1 on any failure.
int main()
{
    // NOTE(review): cuAdd() is not defined in this listing; presumably it is
    // declared in cuda_kernels.h -- confirm what it does.
    cuAdd();

    // First initialize NVML library
    nvmlReturn_t result = nvmlInit();
    if (NVML_SUCCESS != result)
    {
        printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));
        printf("Press ENTER to continue...\n");
        getchar();
        return 1;
    }

    unsigned int device_count = 0;
    result = nvmlDeviceGetCount(&device_count);
    if (NVML_SUCCESS != result)
    {
        printf("Failed to query device count: %s\n", nvmlErrorString(result));
    }
    else
    {
        printf("Found %u device%s\n\n", device_count, device_count != 1 ? "s" : "");
        printf("Listing devices:\n");

        // Poll forever; the loop only ends when a device report fails fatally.
        bool ok = true;
        while (ok)
        {
            for (unsigned int i = 0; ok && i < device_count; i++)
                ok = reportDevice(i);

            if (ok)
            {
                printf("-------------------------------------------------------------------- \n");
                Sleep(1000);
            }
        }
    }

    // Every path that reaches this point got here because of an error.
    result = nvmlShutdown();
    if (NVML_SUCCESS != result)
        printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result));

    system("pause");
    return 1;
}

把nvml.dll拷贝到运行目录之后，程序应该就可以正常运行了。不过还是顺便做一下nvidia-smi的环境配置，参考《NVIDIA 显卡信息（CUDA信息的查看）》，我把其中的内容复制到下面：

1. nvidia-smi 查看显卡信息
nvidia-smi 指的是 NVIDIA System Management Interface；
在安装完成 NVIDIA 显卡驱动之后，对于 windows 用户而言，cmd 命令行界面还无法识别 nvidia-smi 命令，需要将相关环境变量添加进去。如将 NVIDIA 显卡驱动安装在默认位置，nvidia-smi 命令所在的完整路径应当为：
C:\Program Files\NVIDIA Corporation\NVSMI
也即将上述路径添加进 Path 系统环境变量中。
2. 查看 CUDA 信息
CUDA 的版本：
进入命令行：nvcc -V

3.运行结果

技术分享

图4.GeForce 940M查询结果

技术分享

图5.Tesla P4查询结果

NVML对GeForce 940M的支持不怎么好，对Tesla P4支持得比较好。

工程源码：http://download.csdn.net/download/qq_33892166/9841800

NVML查询显卡信息

声明：以上内容来自用户投稿及互联网公开渠道收集整理发布，本网站不拥有所有权，未作人工编辑处理，也不承担相关法律责任，若内容有误或涉及侵权可进行投诉：投诉/举报工作人员会在5个工作日内联系你，一经查实，本站将立刻删除涉嫌侵权内容。

联系
我们

首页 > 代码库 > NVML查询显卡信息