123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428 |
- #include "PerformanceMonitor.h"
- #include "DBMgr.h"
- #define DEFAULT_PFM_CHECK_TIME 5000 // 5s
- #define NVML_LIB_NAME "libnvidia-ml.so"
- #define LOCAL_NET_NAME "lo"
- NAMESPACE_MAS_BEGIN
- PerformanceMonitor * PerformanceMonitor::_ins = nullptr;
- tzc::Mutex PerformanceMonitor::_insLock = FALSE;
- PerformanceMonitor * PerformanceMonitor::GetInstance()
- {
- if (!_ins)
- {
- _insLock.Lock();
- if (!_ins)
- {
- _ins = new PerformanceMonitor();
- }
- _insLock.Unlock();
- }
- return _ins;
- }
- void PerformanceMonitor::DestoryInstance()
- {
- _insLock.Lock();
- TZ_delete(_ins);
- _insLock.Unlock();
- }
- PerformanceMonitor::PerformanceMonitor():
- m_inited(FALSE),
- m_timeCnt(0),
- m_intervalTime(DEFAULT_PFM_CHECK_TIME),
- m_cpuUse(0),
- m_fNvmlInit(nullptr),
- m_fNvmlShutdown(nullptr),
- m_fNvmlDeviceGetCount(nullptr),
- m_fNvmlDeviceGetHandleByIndex(nullptr),
- m_fNvmlDeviceGetMemoryInfo(nullptr),
- m_fNvmlDeviceGetUtilizationRate(nullptr),
- m_fNvmlDeviceGetTemperature(nullptr),
- m_hdLib(nullptr)
- {
- }
- PerformanceMonitor::~PerformanceMonitor()
- {
- this->Dispose();
- }
- /* 参数:time,采集频率,单位秒 */
- TZ_INT PerformanceMonitor::Initialize()
- {
- if (m_inited)
- {
- TZLogWarn("PerformanceMonitor is initialized~~~");
- return MEC_OK;
- }
- this->initData();
- m_hdLib = tzc::SysUtils::LoadLibrary(NVML_LIB_NAME);
- if (m_hdLib == nullptr)
- {
- TZLogError("load dll %s failed!!", NVML_LIB_NAME);
- return MEC_FAILED;
- }
- if (this->loadNvmlFunc(m_fNvmlInit, "nvmlInit") == 0
- && this->loadNvmlFunc(m_fNvmlShutdown, "nvmlShutdown") == 0
- && this->loadNvmlFunc(m_fNvmlDeviceGetCount, "nvmlDeviceGetCount") == 0
- && this->loadNvmlFunc(m_fNvmlDeviceGetHandleByIndex,
- "nvmlDeviceGetHandleByIndex") == 0
- && this->loadNvmlFunc(m_fNvmlDeviceGetMemoryInfo,
- "nvmlDeviceGetMemoryInfo") == 0
- && this->loadNvmlFunc(m_fNvmlDeviceGetUtilizationRate,
- "nvmlDeviceGetUtilizationRates") == 0
- && this->loadNvmlFunc(m_fNvmlDeviceGetTemperature,
- "nvmlDeviceGetTemperature") == 0)
- {
- if (m_fNvmlInit() != nvmlReturn_t::NVML_SUCCESS)
- {
- TZLogInfo("call m_fNvmlInit failed!!!");
- return MEC_FAILED;
- }
- TZLogInfo("Init NVML Lib success!!!!");
- this->Start();
- m_inited = TRUE;
- TZLogInfo("PerformanceMonitor Initialize success~~~");
- return MEC_OK;
- }
- TZLogInfo("Init NVML Lib Failed!!!!!");
- return MEC_FAILED;
- }
- TZ_INT PerformanceMonitor::Dispose()
- {
- if (!m_inited) return MEC_OK;
- this->StopAndWait();
- if (m_hdLib != nullptr)
- {
- if (m_fNvmlShutdown != nullptr)
- {
- m_fNvmlShutdown();
- }
- tzc::SysUtils::FreeLibrary(m_hdLib);
- m_hdLib = nullptr;
- }
- m_inited = FALSE;
- TZLogInfo("PerformanceMonitor Dispose success~~~");
- return MEC_OK;
- }
- void PerformanceMonitor::SetIntervalTime(TZ_INT time)
- {
- m_intervalTime = time;
- TZLogInfo("PerformanceMonitor IntervalTime set to %ds~~~", time);
- }
- void PerformanceMonitor::initData()
- {
- /* 获取当前cpu各项数据信息 */
- FileHelper::GetSysStatData(m_cpuDataInfo);
- /* 获取内存数据信息 */
- FileHelper::GetSysMemData(m_memDataInfo);
- /* 获取磁盘读取写入数据信息 */
- FileHelper::GetSysDiskIOData(m_diskDataList);
- /* 获取当前网口数据信息 */
- FileHelper::GetSysNetBandData(m_netDataList);
- /* 并将所拥有的当前网口 加载到系统信息参数的m_interFaces */
- for (auto & iter : m_netDataList)
- {
- InterfaceInfo info;
- info.Name = iter.NicName;
- m_interFaces.push_back(info);
- }
- /* 获取CPU名称以及核心总数 */
- FileHelper::GetCPUInfo(m_cpuInfo);
- /* 获取网络连接信息 */
- FileHelper::GetNetPortInfo(m_netInfos);
- /* 获取磁盘使用率 */
- FileHelper::GetDiskInfo(m_diskInfos);
- /* 获取GPU信息 */
- FileHelper::GetGPUInfo(m_GPUName);
- TZLogInfo("after initData");
- return;
- }
- void PerformanceMonitor::Entry()
- {
- while (!this->IsStop())
- {
- if (m_timeCnt < m_intervalTime)
- {
- tzc::SysUtils::DelayMseconds(100);
- m_timeCnt += 100;
- continue;
- }
- m_timeCnt = 0;
- this->collectCPUData();
- this->collectMemData();
- // this->collectDiskIOData();
- this->collectBandWidthRate();
- this->collectGPUData();
- this->collectCPUInfo();
- this->collectNetworkInfo();
- this->collectDiskInfo();
- this->collectGPUInfo();
- m_tbl.Tb_RecordTime = TIME_STAMP_NOW;
- m_tbl.Tb_Id = INVALID_PRIMARY_KEY;
- DBMGR->AddOrUpdateTblPerformanceRecord(m_tbl);
- }
- }
- void PerformanceMonitor::collectCPUData()
- {
- SysStatInfo data;
- FileHelper::GetSysStatData(data);
- /* 计算 */
- m_cpuUse = TZ_DOUBLE(data.user - m_cpuDataInfo.user)
- / TZ_DOUBLE(data.GetCPUTime() - m_cpuDataInfo.GetCPUTime()) * 100;
- m_cpuDataInfo = data;
- TZLogDebug(2, "DEBUG:System CPU use (%lf)~~~", m_cpuUse);
- m_tbl.Tb_Cpu = m_cpuUse;
- }
- void PerformanceMonitor::collectMemData()
- {
- FileHelper::GetSysMemData(m_memDataInfo);
- TZLogDebug(2, "DEBUG:System MEM rate (%lf)~~~", m_memDataInfo.GetMemRate());
- m_tbl.Tb_Memory = m_memDataInfo.GetMemRate();
- }
- void PerformanceMonitor::collectDiskIOData()
- {
- /* !!此处计算的是总体的磁盘使用情况 */
- std::list<SysDiskStatInfo> datas;
- FileHelper::GetSysDiskIOData(datas);
- TZ_LONG readDiskNum = 0;
- TZ_LONG writeDiskNum = 0;
- TZ_LONG collectTime = 0;
- for (auto & data : datas)
- {
- readDiskNum += data.ReadSectionCount;
- writeDiskNum += data.WriteSectionCount;
- collectTime = data.collectTime;
- }
- TZ_LONG readDiskNum_d = 0;
- TZ_LONG writeDiskNum_d = 0;
- TZ_LONG collectTime_d = 0;
- for (auto & data : m_diskDataList)
- {
- readDiskNum_d += data.ReadSectionCount;
- writeDiskNum_d += data.WriteSectionCount;
- collectTime_d = data.collectTime;
- }
- if ((collectTime - collectTime_d) == 0) return;
- m_readSpeed = (TZ_DOUBLE(readDiskNum - readDiskNum_d) * 512 / 1024)
- / (collectTime - collectTime_d);
- m_writeSpeed = (TZ_FLOAT(writeDiskNum - writeDiskNum_d) * 512 / 1024)
- / (collectTime - collectTime_d);
- TZLogDebug(2, "DEBUG:System DISK read speed (%lf)~~~", m_readSpeed);
- TZLogDebug(2, "DEBUG:System DISK write speed (%lf)~~~", m_writeSpeed);
- m_diskDataList.clear();
- m_diskDataList.swap(datas);
- // TODO: 暂不需要记录硬盘 I/O
- }
- void PerformanceMonitor::collectBandWidthRate()
- {
- std::list<SysNetBandInfo> datas;
- FileHelper::GetSysNetBandData(datas);
- m_bandWidthInfoMap.clear();
- TZ_DOUBLE totalUse = 0.0;
- TZ_DOUBLE totalBandWidth = 0.0;
- for (auto & iter : datas)
- {
- for (auto & info : m_netDataList)
- {
- if (info.NicName.size() == sizeof(LOCAL_NET_NAME)
- && info.NicName.find(LOCAL_NET_NAME) == 0)
- {
- continue;
- }
- if (iter.NicName == info.NicName)
- {
- TZ_DOUBLE sendRate = /* 单位: bps */
- TZ_DOUBLE((iter.Receive.bytes - info.Receive.bytes +
- iter.Transmit.bytes - info.Transmit.bytes) * 8) /
- TZ_DOUBLE(1000000 * (iter.collectTime - info.collectTime));
- TZ_DOUBLE BandWidthRate = sendRate / TZ_DOUBLE(iter.BandWidth) * 100;
- totalUse += sendRate;
- totalBandWidth += iter.BandWidth;
- TZLogDebug(2, "DEBUG:System BUF [%s](%lf)~~~",
- iter.NicName.c_str(), BandWidthRate);
- m_bandWidthInfoMap.emplace(iter.NicName, BandWidthRate);
- }
- }
- }
- m_netDataList.clear();
- m_netDataList.assign(datas.begin(), datas.end());
- m_tbl.Tb_Nic = totalUse / TZ_DOUBLE(totalBandWidth) * 100;
- TZLogDebug(2, "DEBUG: Total Net Useagr (%d)~~~", m_tbl.Tb_Nic);
- }
- void PerformanceMonitor::collectCPUInfo()
- {
- FileHelper::GetCPUInfo(m_cpuInfo);
- }
- void PerformanceMonitor::collectNetworkInfo()
- {
- FileHelper::GetNetPortInfo(m_netInfos);
- }
- void PerformanceMonitor::collectDiskInfo()
- {
- FileHelper::GetDiskInfo(m_diskInfos);
- // TODO: 改为挂载 /var/mas2.0 的硬盘
- for (auto & i : m_diskInfos)
- {
- if (i.MountedOn == "/")
- {
- m_tbl.Tb_Disk = atoi(i.UseRate.c_str());
- }
- }
- }
- void PerformanceMonitor::collectGPUInfo()
- {
- FileHelper::GetGPUInfo(m_GPUName);
- }
- void PerformanceMonitor::collectGPUData()
- {
- TZ_INT iRet(nvmlReturn_t::NVML_SUCCESS);
- TZ_Uint32 uDevCnt(0);
- iRet = m_fNvmlDeviceGetCount(&uDevCnt);
- if (iRet != nvmlReturn_t::NVML_SUCCESS)
- {
- TZLogWarn("Call m_fNvmlDeviceGetCount failed!!!:%d", iRet);
- this->clearGPUperf();
- return;
- }
- TZ_Uint64 totalUse = 0;
- TZ_Uint64 totalMemUse = 0;
- TZ_Uint64 totalMem = 0;
- std::vector<SysGPUPerf> vecTmpGPUPerf;
- for (TZ_Uint32 uIndex = 0; uIndex < uDevCnt; ++uIndex)
- {
- nvmlDevice_t nvmlDevice;
- iRet = m_fNvmlDeviceGetHandleByIndex(uIndex, &nvmlDevice);
- if (iRet != nvmlReturn_t::NVML_SUCCESS)
- {
- TZLogWarn("Call m_fNvmlDeviceGetHandleByIndex failed!!!:%d", iRet);
- continue;
- }
- nvmlMemory_t nvmlMemory;
- iRet = m_fNvmlDeviceGetMemoryInfo(nvmlDevice, &nvmlMemory);
- if (iRet != nvmlReturn_t::NVML_SUCCESS)
- {
- TZLogWarn("Call m_fNvmlDeviceGetMemoryInfo failed!!!:%d", iRet);
- continue;
- }
- nvmlUtilization_t nvmlUtilization;
- iRet = m_fNvmlDeviceGetUtilizationRate(nvmlDevice, &nvmlUtilization);
- if (iRet != nvmlReturn_t::NVML_SUCCESS)
- {
- TZLogWarn("Call m_fNvmlDeviceGetUtilizationRate failed!!!:%d", iRet);
- continue;
- }
- SysGPUPerf gpuPerf;
- gpuPerf.gpuIndex = uIndex;
- gpuPerf.gpuUseRate = nvmlUtilization.gpu;
- // nvmlUtilization.memory; // TODO: 可以使用这个查询 Mem 使用率
- gpuPerf.memTotal = nvmlMemory.total;
- gpuPerf.memFree = nvmlMemory.free;
- gpuPerf.memUsed = nvmlMemory.used;
- totalUse += nvmlUtilization.gpu;
- totalMem += nvmlMemory.total;
- totalMemUse += nvmlMemory.used;
- vecTmpGPUPerf.push_back(gpuPerf);
- }
- m_lockGPUPerf.Lock();
- m_vGPUPerf = vecTmpGPUPerf;
- m_lockGPUPerf.Unlock();
- m_tbl.Tb_GpuMemory = 100.0 * totalMemUse / totalMem;
- m_tbl.Tb_GpuUtil = totalUse / m_vGPUPerf.size(); // TODO: 评价使用率暂无更好的计算方法
- }
- void PerformanceMonitor::clearGPUperf()
- {
- m_lockGPUPerf.Lock();
- m_vGPUPerf.clear();
- m_lockGPUPerf.Unlock();
- }
- NAMESPACE_MAS_END
|