PerformanceMonitor.cpp 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428
  1. #include "PerformanceMonitor.h"
  2. #include "DBMgr.h"
  3. #define DEFAULT_PFM_CHECK_TIME 5000 // 5s
  4. #define NVML_LIB_NAME "libnvidia-ml.so"
  5. #define LOCAL_NET_NAME "lo"
  6. NAMESPACE_MAS_BEGIN
  7. PerformanceMonitor * PerformanceMonitor::_ins = nullptr;
  8. tzc::Mutex PerformanceMonitor::_insLock = FALSE;
  9. PerformanceMonitor * PerformanceMonitor::GetInstance()
  10. {
  11. if (!_ins)
  12. {
  13. _insLock.Lock();
  14. if (!_ins)
  15. {
  16. _ins = new PerformanceMonitor();
  17. }
  18. _insLock.Unlock();
  19. }
  20. return _ins;
  21. }
  22. void PerformanceMonitor::DestoryInstance()
  23. {
  24. _insLock.Lock();
  25. TZ_delete(_ins);
  26. _insLock.Unlock();
  27. }
  28. PerformanceMonitor::PerformanceMonitor():
  29. m_inited(FALSE),
  30. m_timeCnt(0),
  31. m_intervalTime(DEFAULT_PFM_CHECK_TIME),
  32. m_cpuUse(0),
  33. m_fNvmlInit(nullptr),
  34. m_fNvmlShutdown(nullptr),
  35. m_fNvmlDeviceGetCount(nullptr),
  36. m_fNvmlDeviceGetHandleByIndex(nullptr),
  37. m_fNvmlDeviceGetMemoryInfo(nullptr),
  38. m_fNvmlDeviceGetUtilizationRate(nullptr),
  39. m_fNvmlDeviceGetTemperature(nullptr),
  40. m_hdLib(nullptr)
  41. {
  42. }
  43. PerformanceMonitor::~PerformanceMonitor()
  44. {
  45. this->Dispose();
  46. }
  47. /* 参数:time,采集频率,单位秒 */
  48. TZ_INT PerformanceMonitor::Initialize()
  49. {
  50. if (m_inited)
  51. {
  52. TZLogWarn("PerformanceMonitor is initialized~~~");
  53. return MEC_OK;
  54. }
  55. this->initData();
  56. m_hdLib = tzc::SysUtils::LoadLibrary(NVML_LIB_NAME);
  57. if (m_hdLib == nullptr)
  58. {
  59. TZLogError("load dll %s failed!!", NVML_LIB_NAME);
  60. return MEC_FAILED;
  61. }
  62. if (this->loadNvmlFunc(m_fNvmlInit, "nvmlInit") == 0
  63. && this->loadNvmlFunc(m_fNvmlShutdown, "nvmlShutdown") == 0
  64. && this->loadNvmlFunc(m_fNvmlDeviceGetCount, "nvmlDeviceGetCount") == 0
  65. && this->loadNvmlFunc(m_fNvmlDeviceGetHandleByIndex,
  66. "nvmlDeviceGetHandleByIndex") == 0
  67. && this->loadNvmlFunc(m_fNvmlDeviceGetMemoryInfo,
  68. "nvmlDeviceGetMemoryInfo") == 0
  69. && this->loadNvmlFunc(m_fNvmlDeviceGetUtilizationRate,
  70. "nvmlDeviceGetUtilizationRates") == 0
  71. && this->loadNvmlFunc(m_fNvmlDeviceGetTemperature,
  72. "nvmlDeviceGetTemperature") == 0)
  73. {
  74. if (m_fNvmlInit() != nvmlReturn_t::NVML_SUCCESS)
  75. {
  76. TZLogInfo("call m_fNvmlInit failed!!!");
  77. return MEC_FAILED;
  78. }
  79. TZLogInfo("Init NVML Lib success!!!!");
  80. this->Start();
  81. m_inited = TRUE;
  82. TZLogInfo("PerformanceMonitor Initialize success~~~");
  83. return MEC_OK;
  84. }
  85. TZLogInfo("Init NVML Lib Failed!!!!!");
  86. return MEC_FAILED;
  87. }
  88. TZ_INT PerformanceMonitor::Dispose()
  89. {
  90. if (!m_inited) return MEC_OK;
  91. this->StopAndWait();
  92. if (m_hdLib != nullptr)
  93. {
  94. if (m_fNvmlShutdown != nullptr)
  95. {
  96. m_fNvmlShutdown();
  97. }
  98. tzc::SysUtils::FreeLibrary(m_hdLib);
  99. m_hdLib = nullptr;
  100. }
  101. m_inited = FALSE;
  102. TZLogInfo("PerformanceMonitor Dispose success~~~");
  103. return MEC_OK;
  104. }
  105. void PerformanceMonitor::SetIntervalTime(TZ_INT time)
  106. {
  107. m_intervalTime = time;
  108. TZLogInfo("PerformanceMonitor IntervalTime set to %ds~~~", time);
  109. }
  110. void PerformanceMonitor::initData()
  111. {
  112. /* 获取当前cpu各项数据信息 */
  113. FileHelper::GetSysStatData(m_cpuDataInfo);
  114. /* 获取内存数据信息 */
  115. FileHelper::GetSysMemData(m_memDataInfo);
  116. /* 获取磁盘读取写入数据信息 */
  117. FileHelper::GetSysDiskIOData(m_diskDataList);
  118. /* 获取当前网口数据信息 */
  119. FileHelper::GetSysNetBandData(m_netDataList);
  120. /* 并将所拥有的当前网口 加载到系统信息参数的m_interFaces */
  121. for (auto & iter : m_netDataList)
  122. {
  123. InterfaceInfo info;
  124. info.Name = iter.NicName;
  125. m_interFaces.push_back(info);
  126. }
  127. /* 获取CPU名称以及核心总数 */
  128. FileHelper::GetCPUInfo(m_cpuInfo);
  129. /* 获取网络连接信息 */
  130. FileHelper::GetNetPortInfo(m_netInfos);
  131. /* 获取磁盘使用率 */
  132. FileHelper::GetDiskInfo(m_diskInfos);
  133. /* 获取GPU信息 */
  134. FileHelper::GetGPUInfo(m_GPUName);
  135. TZLogInfo("after initData");
  136. return;
  137. }
  138. void PerformanceMonitor::Entry()
  139. {
  140. while (!this->IsStop())
  141. {
  142. if (m_timeCnt < m_intervalTime)
  143. {
  144. tzc::SysUtils::DelayMseconds(100);
  145. m_timeCnt += 100;
  146. continue;
  147. }
  148. m_timeCnt = 0;
  149. this->collectCPUData();
  150. this->collectMemData();
  151. // this->collectDiskIOData();
  152. this->collectBandWidthRate();
  153. this->collectGPUData();
  154. this->collectCPUInfo();
  155. this->collectNetworkInfo();
  156. this->collectDiskInfo();
  157. this->collectGPUInfo();
  158. m_tbl.Tb_RecordTime = TIME_STAMP_NOW;
  159. m_tbl.Tb_Id = INVALID_PRIMARY_KEY;
  160. DBMGR->AddOrUpdateTblPerformanceRecord(m_tbl);
  161. }
  162. }
  163. void PerformanceMonitor::collectCPUData()
  164. {
  165. SysStatInfo data;
  166. FileHelper::GetSysStatData(data);
  167. /* 计算 */
  168. m_cpuUse = TZ_DOUBLE(data.user - m_cpuDataInfo.user)
  169. / TZ_DOUBLE(data.GetCPUTime() - m_cpuDataInfo.GetCPUTime()) * 100;
  170. m_cpuDataInfo = data;
  171. TZLogDebug(2, "DEBUG:System CPU use (%lf)~~~", m_cpuUse);
  172. m_tbl.Tb_Cpu = m_cpuUse;
  173. }
  174. void PerformanceMonitor::collectMemData()
  175. {
  176. FileHelper::GetSysMemData(m_memDataInfo);
  177. TZLogDebug(2, "DEBUG:System MEM rate (%lf)~~~", m_memDataInfo.GetMemRate());
  178. m_tbl.Tb_Memory = m_memDataInfo.GetMemRate();
  179. }
  180. void PerformanceMonitor::collectDiskIOData()
  181. {
  182. /* !!此处计算的是总体的磁盘使用情况 */
  183. std::list<SysDiskStatInfo> datas;
  184. FileHelper::GetSysDiskIOData(datas);
  185. TZ_LONG readDiskNum = 0;
  186. TZ_LONG writeDiskNum = 0;
  187. TZ_LONG collectTime = 0;
  188. for (auto & data : datas)
  189. {
  190. readDiskNum += data.ReadSectionCount;
  191. writeDiskNum += data.WriteSectionCount;
  192. collectTime = data.collectTime;
  193. }
  194. TZ_LONG readDiskNum_d = 0;
  195. TZ_LONG writeDiskNum_d = 0;
  196. TZ_LONG collectTime_d = 0;
  197. for (auto & data : m_diskDataList)
  198. {
  199. readDiskNum_d += data.ReadSectionCount;
  200. writeDiskNum_d += data.WriteSectionCount;
  201. collectTime_d = data.collectTime;
  202. }
  203. if ((collectTime - collectTime_d) == 0) return;
  204. m_readSpeed = (TZ_DOUBLE(readDiskNum - readDiskNum_d) * 512 / 1024)
  205. / (collectTime - collectTime_d);
  206. m_writeSpeed = (TZ_FLOAT(writeDiskNum - writeDiskNum_d) * 512 / 1024)
  207. / (collectTime - collectTime_d);
  208. TZLogDebug(2, "DEBUG:System DISK read speed (%lf)~~~", m_readSpeed);
  209. TZLogDebug(2, "DEBUG:System DISK write speed (%lf)~~~", m_writeSpeed);
  210. m_diskDataList.clear();
  211. m_diskDataList.swap(datas);
  212. // TODO: 暂不需要记录硬盘 I/O
  213. }
  214. void PerformanceMonitor::collectBandWidthRate()
  215. {
  216. std::list<SysNetBandInfo> datas;
  217. FileHelper::GetSysNetBandData(datas);
  218. m_bandWidthInfoMap.clear();
  219. TZ_DOUBLE totalUse = 0.0;
  220. TZ_DOUBLE totalBandWidth = 0.0;
  221. for (auto & iter : datas)
  222. {
  223. for (auto & info : m_netDataList)
  224. {
  225. if (info.NicName.size() == sizeof(LOCAL_NET_NAME)
  226. && info.NicName.find(LOCAL_NET_NAME) == 0)
  227. {
  228. continue;
  229. }
  230. if (iter.NicName == info.NicName)
  231. {
  232. TZ_DOUBLE sendRate = /* 单位: bps */
  233. TZ_DOUBLE((iter.Receive.bytes - info.Receive.bytes +
  234. iter.Transmit.bytes - info.Transmit.bytes) * 8) /
  235. TZ_DOUBLE(1000000 * (iter.collectTime - info.collectTime));
  236. TZ_DOUBLE BandWidthRate = sendRate / TZ_DOUBLE(iter.BandWidth) * 100;
  237. totalUse += sendRate;
  238. totalBandWidth += iter.BandWidth;
  239. TZLogDebug(2, "DEBUG:System BUF [%s](%lf)~~~",
  240. iter.NicName.c_str(), BandWidthRate);
  241. m_bandWidthInfoMap.emplace(iter.NicName, BandWidthRate);
  242. }
  243. }
  244. }
  245. m_netDataList.clear();
  246. m_netDataList.assign(datas.begin(), datas.end());
  247. m_tbl.Tb_Nic = totalUse / TZ_DOUBLE(totalBandWidth) * 100;
  248. TZLogDebug(2, "DEBUG: Total Net Useagr (%d)~~~", m_tbl.Tb_Nic);
  249. }
  250. void PerformanceMonitor::collectCPUInfo()
  251. {
  252. FileHelper::GetCPUInfo(m_cpuInfo);
  253. }
  254. void PerformanceMonitor::collectNetworkInfo()
  255. {
  256. FileHelper::GetNetPortInfo(m_netInfos);
  257. }
  258. void PerformanceMonitor::collectDiskInfo()
  259. {
  260. FileHelper::GetDiskInfo(m_diskInfos);
  261. // TODO: 改为挂载 /var/mas2.0 的硬盘
  262. for (auto & i : m_diskInfos)
  263. {
  264. if (i.MountedOn == "/")
  265. {
  266. m_tbl.Tb_Disk = atoi(i.UseRate.c_str());
  267. }
  268. }
  269. }
  270. void PerformanceMonitor::collectGPUInfo()
  271. {
  272. FileHelper::GetGPUInfo(m_GPUName);
  273. }
  274. void PerformanceMonitor::collectGPUData()
  275. {
  276. TZ_INT iRet(nvmlReturn_t::NVML_SUCCESS);
  277. TZ_Uint32 uDevCnt(0);
  278. iRet = m_fNvmlDeviceGetCount(&uDevCnt);
  279. if (iRet != nvmlReturn_t::NVML_SUCCESS)
  280. {
  281. TZLogWarn("Call m_fNvmlDeviceGetCount failed!!!:%d", iRet);
  282. this->clearGPUperf();
  283. return;
  284. }
  285. TZ_Uint64 totalUse = 0;
  286. TZ_Uint64 totalMemUse = 0;
  287. TZ_Uint64 totalMem = 0;
  288. std::vector<SysGPUPerf> vecTmpGPUPerf;
  289. for (TZ_Uint32 uIndex = 0; uIndex < uDevCnt; ++uIndex)
  290. {
  291. nvmlDevice_t nvmlDevice;
  292. iRet = m_fNvmlDeviceGetHandleByIndex(uIndex, &nvmlDevice);
  293. if (iRet != nvmlReturn_t::NVML_SUCCESS)
  294. {
  295. TZLogWarn("Call m_fNvmlDeviceGetHandleByIndex failed!!!:%d", iRet);
  296. continue;
  297. }
  298. nvmlMemory_t nvmlMemory;
  299. iRet = m_fNvmlDeviceGetMemoryInfo(nvmlDevice, &nvmlMemory);
  300. if (iRet != nvmlReturn_t::NVML_SUCCESS)
  301. {
  302. TZLogWarn("Call m_fNvmlDeviceGetMemoryInfo failed!!!:%d", iRet);
  303. continue;
  304. }
  305. nvmlUtilization_t nvmlUtilization;
  306. iRet = m_fNvmlDeviceGetUtilizationRate(nvmlDevice, &nvmlUtilization);
  307. if (iRet != nvmlReturn_t::NVML_SUCCESS)
  308. {
  309. TZLogWarn("Call m_fNvmlDeviceGetUtilizationRate failed!!!:%d", iRet);
  310. continue;
  311. }
  312. SysGPUPerf gpuPerf;
  313. gpuPerf.gpuIndex = uIndex;
  314. gpuPerf.gpuUseRate = nvmlUtilization.gpu;
  315. // nvmlUtilization.memory; // TODO: 可以使用这个查询 Mem 使用率
  316. gpuPerf.memTotal = nvmlMemory.total;
  317. gpuPerf.memFree = nvmlMemory.free;
  318. gpuPerf.memUsed = nvmlMemory.used;
  319. totalUse += nvmlUtilization.gpu;
  320. totalMem += nvmlMemory.total;
  321. totalMemUse += nvmlMemory.used;
  322. vecTmpGPUPerf.push_back(gpuPerf);
  323. }
  324. m_lockGPUPerf.Lock();
  325. m_vGPUPerf = vecTmpGPUPerf;
  326. m_lockGPUPerf.Unlock();
  327. m_tbl.Tb_GpuMemory = 100.0 * totalMemUse / totalMem;
  328. m_tbl.Tb_GpuUtil = totalUse / m_vGPUPerf.size(); // TODO: 评价使用率暂无更好的计算方法
  329. }
  330. void PerformanceMonitor::clearGPUperf()
  331. {
  332. m_lockGPUPerf.Lock();
  333. m_vGPUPerf.clear();
  334. m_lockGPUPerf.Unlock();
  335. }
  336. NAMESPACE_MAS_END