16 #ifndef CPUCOUNTERS_HEADER
17 #define CPUCOUNTERS_HEADER
25 #define PCM_VERSION " (2019-02-07 11:10:06 +0100 ID=0e9461a)"
36 #include "exceptions/unsupported_processor_exception.hpp"
47 #include <linux/perf_event.h>
49 #define PCM_PERF_COUNT_HW_REF_CPU_CYCLES (9)
54 #include <semaphore.h>
55 #include <sys/types.h>
58 #include <sys/syscall.h>
74 void PCM_API restrictDriverAccess(LPCWSTR path);
91 TopologyEntry() : os_id(-1), thread_id (-1), core_id(-1), tile_id(-1), socket(-1) { }
97 virtual void operator = (uint64 val) = 0;
98 virtual operator uint64 () = 0;
104 std::shared_ptr<PciHandleType> handle;
107 PCICFGRegister64(
const std::shared_ptr<PciHandleType> & handle_,
size_t offset_) :
112 void operator = (uint64 val)
override
114 std::cerr <<
"PCICFGRegister64 write operation is not supported" << std::endl;
115 throw std::exception();
117 operator uint64 ()
override
120 handle->read64(offset, &result);
127 std::shared_ptr<PciHandleType> handle;
130 PCICFGRegister32(
const std::shared_ptr<PciHandleType> & handle_,
size_t offset_) :
135 void operator = (uint64 val)
override
137 handle->write32(offset, (uint32)val);
139 operator uint64 ()
override
142 handle->read32(offset, &result);
149 std::shared_ptr<MMIORange> handle;
152 MMIORegister64(
const std::shared_ptr<MMIORange> & handle_,
size_t offset_) :
157 void operator = (uint64 val)
override
159 handle->write64(offset, val);
161 operator uint64 ()
override
163 return handle->read64(offset);
169 std::shared_ptr<MMIORange> handle;
172 MMIORegister32(
const std::shared_ptr<MMIORange> & handle_,
size_t offset_) :
177 void operator = (uint64 val)
override
179 handle->write32(offset, (uint32)val);
181 operator uint64 ()
override
183 return (uint64)handle->read32(offset);
189 std::shared_ptr<SafeMsrHandle> handle;
192 MSRRegister(
const std::shared_ptr<SafeMsrHandle> & handle_,
size_t offset_) :
197 void operator = (uint64 val)
override
199 handle->write(offset, val);
201 operator uint64 ()
override
204 handle->read(offset, &value);
211 std::shared_ptr<CounterWidthExtender> handle;
217 void operator = (uint64 val)
override
225 std::cerr <<
"ERROR: writing non-zero values to CounterWidthExtenderRegister is not supported" << std::endl;
226 throw std::exception();
229 operator uint64 ()
override
231 return handle->read();;
// Shared-ownership handle to an HWRegister. The unit/counter/fixed/filter
// register members that follow, and the UncorePMU constructor parameters,
// are all expressed in terms of this alias.
237 typedef std::shared_ptr<HWRegister> HWRegisterPtr;
239 HWRegisterPtr unitControl;
240 HWRegisterPtr counterControl[4];
241 HWRegisterPtr counterValue[4];
242 HWRegisterPtr fixedCounterControl;
243 HWRegisterPtr fixedCounterValue;
244 HWRegisterPtr filter[2];
246 UncorePMU(
const HWRegisterPtr & unitControl_,
247 const HWRegisterPtr & counterControl0,
248 const HWRegisterPtr & counterControl1,
249 const HWRegisterPtr & counterControl2,
250 const HWRegisterPtr & counterControl3,
251 const HWRegisterPtr & counterValue0,
252 const HWRegisterPtr & counterValue1,
253 const HWRegisterPtr & counterValue2,
254 const HWRegisterPtr & counterValue3,
255 const HWRegisterPtr & fixedCounterControl_ = HWRegisterPtr(),
256 const HWRegisterPtr & fixedCounterValue_ = HWRegisterPtr(),
257 const HWRegisterPtr & filter0 = HWRegisterPtr(),
258 const HWRegisterPtr & filter1 = HWRegisterPtr()
260 unitControl(unitControl_),
261 counterControl{ counterControl0, counterControl1, counterControl2, counterControl3 },
262 counterValue{ counterValue0, counterValue1, counterValue2, counterValue3 },
263 fixedCounterControl(fixedCounterControl_),
264 fixedCounterValue(fixedCounterValue_),
265 filter{ filter0 , filter1 }
272 for (
int i = 0; i < 4; ++i)
274 if (counterControl[i].
get()) *counterControl[i] = 0;
276 if (unitControl.get()) *unitControl = 0;
277 if (fixedCounterControl.get()) *fixedCounterControl = 0;
285 int32 iMCbus,UPIbus,M2Mbus;
288 std::vector<UncorePMU> imcPMUs;
289 std::vector<UncorePMU> edcPMUs;
290 std::vector<UncorePMU> xpiPMUs;
291 std::vector<UncorePMU> m2mPMUs;
292 std::vector<uint64> qpi_speed;
293 std::vector<uint32> num_imc_channels;
294 std::vector<std::pair<uint32, uint32> > XPIRegisterLocation;
295 std::vector<std::vector< std::pair<uint32, uint32> > > MCRegisterLocation;
296 std::vector<std::pair<uint32, uint32> > EDCRegisterLocation;
297 std::vector<std::pair<uint32, uint32> > M2MRegisterLocation;
300 static std::vector<std::pair<uint32, uint32> > socket2iMCbus;
301 static std::vector<std::pair<uint32, uint32> > socket2UPIbus;
302 static std::vector<std::pair<uint32, uint32> > socket2M2Mbus;
303 void initSocket2Bus(std::vector<std::pair<uint32, uint32> > & socket2bus, uint32 device, uint32
function,
const uint32 DEV_IDS[], uint32 devIdsSize);
308 PciHandleType * createIntelPerfMonDevice(uint32 groupnr, int32 bus, uint32 dev, uint32 func,
bool checkVendor =
false);
309 void programIMC(
const uint32 * MCCntConfig);
310 void programEDC(
const uint32 * EDCCntConfig);
312 void programXPI(
const uint32 * XPICntConfig);
313 typedef std::pair<size_t, std::vector<uint64 *> > MemTestParam;
314 void initMemTest(MemTestParam & param);
315 void doMemTest(
const MemTestParam & param);
316 void cleanupMemTest(
const MemTestParam & param);
317 void cleanupQPIHandles();
319 void writeAllUnitControl(
const uint32 value);
320 void initDirect(uint32 socket_,
const PCM * pcm);
321 void initPerf(uint32 socket_,
const PCM * pcm);
322 void initBuses(uint32 socket_,
const PCM * pcm);
323 void initRegisterLocations();
428 return qpi_speed.empty() ? 0 : qpi_speed[linkNr];
435 uint32
getNumMC()
const {
return (uint32)num_imc_channels.size(); }
451 friend uint64 getNumberOfEvents(
const T & before,
const T & after);
464 class PerfVirtualControlRegister;
466 #ifndef HACK_TO_REMOVE_DUPLICATE_ERROR
467 template class PCM_API std::allocator<TopologyEntry>;
468 template class PCM_API std::vector<TopologyEntry>;
469 template class PCM_API std::allocator<CounterWidthExtender *>;
470 template class PCM_API std::vector<CounterWidthExtender *>;
471 template class PCM_API std::allocator<uint32>;
472 template class PCM_API std::vector<uint32>;
473 template class PCM_API std::allocator<char>;
485 friend class PerfVirtualControlRegister;
489 int32 cpu_model, original_cpu_model;
491 int64 cpu_microcode_level;
493 int32 threads_per_core;
496 int32 num_phys_cores_per_socket;
497 int32 num_online_cores;
498 int32 num_online_sockets;
499 uint32 core_gen_counter_num_max;
500 uint32 core_gen_counter_num_used;
501 uint32 core_gen_counter_width;
502 uint32 core_fixed_counter_num_max;
503 uint32 core_fixed_counter_num_used;
504 uint32 core_fixed_counter_width;
505 uint32 uncore_gen_counter_num_max;
506 uint32 uncore_gen_counter_num_used;
507 uint32 uncore_gen_counter_width;
508 uint32 uncore_fixed_counter_num_max;
509 uint32 uncore_fixed_counter_num_used;
510 uint32 uncore_fixed_counter_width;
511 uint32 perfmon_version;
512 int32 perfmon_config_anythread;
513 uint64 nominal_frequency;
514 uint64 max_qpi_speed;
515 uint32 L3ScalingFactor;
516 int32 pkgThermalSpecPower, pkgMinimumPower, pkgMaximumPower;
518 std::vector<TopologyEntry> topology;
519 std::string errorMessage;
521 static PCM * instance;
522 bool allow_multiple_instances;
524 std::vector<std::shared_ptr<SafeMsrHandle> > MSR;
525 std::vector<std::shared_ptr<ServerPCICFGUncore> > server_pcicfg_uncore;
526 std::vector<UncorePMU> pcuPMUs;
527 std::vector<std::map<int32, UncorePMU> > iioPMUs;
528 std::vector<UncorePMU> uboxPMUs;
529 double joulesPerEnergyUnit;
530 std::vector<std::shared_ptr<CounterWidthExtender> > energy_status;
531 std::vector<std::shared_ptr<CounterWidthExtender> > dram_energy_status;
532 std::vector<std::vector<UncorePMU> > cboPMUs;
534 std::vector<std::shared_ptr<CounterWidthExtender> > memory_bw_local;
535 std::vector<std::shared_ptr<CounterWidthExtender> > memory_bw_total;
537 std::shared_ptr<ClientBW> clientBW;
538 std::shared_ptr<CounterWidthExtender> clientImcReads;
539 std::shared_ptr<CounterWidthExtender> clientImcWrites;
540 std::shared_ptr<CounterWidthExtender> clientIoRequests;
542 bool disable_JKT_workaround;
545 uint64 * coreCStateMsr;
546 uint64 * pkgCStateMsr;
548 std::vector<std::shared_ptr<CoreTaskQueue> > coreTaskQueues;
550 bool L2CacheHitRatioAvailable;
551 bool L3CacheHitRatioAvailable;
552 bool L3CacheMissesAvailable;
553 bool L2CacheMissesAvailable;
554 bool L2CacheHitsAvailable;
555 bool L3CacheHitsNoSnoopAvailable;
556 bool L3CacheHitsSnoopAvailable;
557 bool L3CacheHitsAvailable;
558 bool CyclesLostDueL3CacheMissesAvailable;
559 bool CyclesLostDueL2CacheMissesAvailable;
// Largest C-state index accepted by the `state <= MAX_C_STATE` checks in this
// header; the per-state CStateResidency arrays are declared with
// MAX_C_STATE + 1 elements so that this index is valid.
562 enum { MAX_C_STATE = 10 };
567 if (state == 0 || state == 1)
570 return (coreCStateMsr != NULL && state <= ((
int)MAX_C_STATE) && coreCStateMsr[state] != 0);
576 return (pkgCStateMsr != NULL && state <= ((
int)MAX_C_STATE) && pkgCStateMsr[state] != 0);
580 void setOutput(
const std::string filename);
583 void restoreOutput();
597 bool isBlocked(
void) {
return blocked; }
598 void setBlocked(
const bool new_blocked) { blocked = new_blocked; }
603 allow_multiple_instances =
true;
609 CUSTOM_CORE_EVENTS = 1,
610 EXT_CUSTOM_CORE_EVENTS = 2,
664 enum PCIeWidthMode width;
665 std::string pciDevName;
666 std::string busNumber;
680 int32 event_number, umask_value;
697 uint64 OffcoreResponseMsrValue[2];
700 OffcoreResponseMsrValue[0] = 0;
701 OffcoreResponseMsrValue[1] = 0;
708 std::string eventNames[4];
719 HANDLE numInstancesSemaphore;
722 sem_t * numInstancesSemaphore;
725 std::vector<int32> socketRefCore;
729 std::vector<std::vector<int> > perfEventHandle;
730 void readPerfData(uint32 core, std::vector<uint64> &
data);
733 PERF_INST_RETIRED_ANY_POS = 0,
734 PERF_CPU_CLK_UNHALTED_THREAD_POS = 1,
735 PERF_CPU_CLK_UNHALTED_REF_POS = 2,
736 PERF_GEN_EVENT_0_POS = 3,
737 PERF_GEN_EVENT_1_POS = 4,
738 PERF_GEN_EVENT_2_POS = 5,
739 PERF_GEN_EVENT_3_POS = 6
743 PERF_GROUP_LEADER_COUNTER = PERF_INST_RETIRED_ANY_POS
746 std::ofstream * outfile;
747 std::streambuf * backup_ofile;
750 bool needToRestoreNMIWatchdog;
752 std::vector<std::vector<EventSelectRegister> > lastProgrammedCustomCounters;
753 uint32 checkCustomCoreProgramming(std::shared_ptr<SafeMsrHandle> msr);
756 ErrorCode programCoreCounters(
int core,
const PCM::ProgramMode mode,
const ExtendedCustomCoreEventDescription * pExtDesc,
757 std::vector<EventSelectRegister> & programmedCustomCounters);
762 bool decrementInstanceSemaphore();
766 uint32 getNumInstances();
767 uint32 decrementNumInstances();
768 uint32 incrementNumInstances();
772 void computeQPISpeedBeckton(
int core_nr);
774 void computeNominalFrequency();
775 static bool isCPUModelSupported(
int model_);
776 std::string getSupportedUarchCodenames()
const;
777 std::string getUnsupportedMessage()
const;
781 void initCStateSupportTables();
782 bool discoverSystemTopology();
783 void printSystemTopology()
const;
785 bool detectNominalFrequency();
786 void showSpecControlMSRs();
787 void initEnergyMonitoring();
788 void initUncoreObjects();
800 void initQOSevent(
const uint64 event,
const int32 core);
801 void programBecktonUncore(
int core);
802 void programNehalemEPUncore(
int core);
803 void enableJKTWorkaround(
bool enable);
804 template <
class CounterStateType>
805 void readAndAggregateMemoryBWCounters(
const uint32 core, CounterStateType & counterState);
806 template <
class CounterStateType>
807 void readAndAggregateUncoreMCCounters(
const uint32 socket, CounterStateType & counterState);
808 template <
class CounterStateType>
809 void readAndAggregateEnergyCounters(
const uint32 socket, CounterStateType & counterState);
810 template <
class CounterStateType>
811 void readPackageThermalHeadroom(
const uint32 socket, CounterStateType & counterState);
812 template <
class CounterStateType>
813 void readAndAggregatePackageCStateResidencies(std::shared_ptr<SafeMsrHandle> msr, CounterStateType & result);
815 void reportQPISpeed()
const;
816 void readCoreCounterConfig();
817 void readCPUMicrocodeLevel();
819 uint64 CX_MSR_PMON_CTRY(uint32 Cbo, uint32 Ctr)
const;
820 uint64 CX_MSR_PMON_BOX_FILTER(uint32 Cbo)
const;
821 uint64 CX_MSR_PMON_BOX_FILTER1(uint32 Cbo)
const;
822 uint64 CX_MSR_PMON_CTLY(uint32 Cbo, uint32 Ctl)
const;
823 uint64 CX_MSR_PMON_BOX_CTL(uint32 Cbo)
const;
824 uint32 getMaxNumOfCBoxes()
const;
825 void programCbo(
const uint64 * events,
const uint32 opCode,
const uint32 nc_ = 0,
const uint32 tid_ = 0);
826 void programCboOpcodeFilter(
const uint32 opc0,
UncorePMU & pmu,
const uint32 nc_ = 0,
const uint32 opc1 = 0);
827 void programLLCReadMissLatencyEvents();
828 uint64 getCBOCounterState(
const uint32 socket,
const uint32 ctr_);
830 void cleanupUncorePMUs();
834 return (PCM::SKX == cpu_model) && (cpu_stepping > 4);
837 void initUncorePMUsDirect();
838 void initUncorePMUsPerf();
842 bool isSecureBoot()
const;
845 bool useLinuxPerfForUncore()
const;
852 bool QOSMetricAvailable()
const;
858 bool L3QOSMetricAvailable()
const;
864 bool L3CacheOccupancyMetricAvailable()
const;
870 bool CoreLocalMemoryBWMetricAvailable()
const;
876 bool CoreRemoteMemoryBWMetricAvailable()
const;
882 unsigned getMaxRMID()
const;
894 static PCM * getInstance();
925 ErrorCode program(
const ProgramMode mode_ = DEFAULT_EVENTS,
const void * parameter_ = NULL);
938 ErrorCode programServerUncoreLatencyMetrics(
bool enable_pmm);
953 ErrorCode programServerUncorePowerMetrics(
int mc_profile,
int pcu_profile,
int * freq_bands = NULL);
967 ErrorCode programServerUncoreMemoryMetrics(
int rankA = -1,
int rankB = -1,
bool PMM =
false);
970 void freezeServerUncoreCounters();
973 void unfreezeServerUncoreCounters();
1001 void getAllCounterStates(
SystemCounterState & systemState, std::vector<SocketCounterState> & socketStates, std::vector<CoreCounterState> & coreStates);
1009 void getUncoreCounterStates(
SystemCounterState & systemState, std::vector<SocketCounterState> & socketStates);
1015 bool isCoreOnline(int32 os_core_id)
const;
1021 bool isSocketOnline(int32 socket_id)
const;
1051 uint32 getNumCores()
const;
1056 uint32 getNumOnlineCores()
const;
1061 uint32 getNumSockets()
const;
1066 uint32 getNumOnlineSockets()
const;
1073 uint32 getThreadsPerCore()
const;
1078 bool getSMT()
const;
1083 uint64 getNominalFrequency()
const;
1089 uint32 getL3ScalingFactor()
const;
1096 bool isSomeCoreOfflined();
1100 int32 getMaxCustomCoreEvents();
1109 ATOM_CENTERTON = 54,
1112 ATOM_CHERRYTRAIL = 76,
1113 ATOM_APOLLO_LAKE = 92,
1114 ATOM_DENVERTON = 95,
1128 BROADWELL_XEON_E3 = 71,
1137 END_OF_MODEL_LIST = 0x0ffff
1155 int32
getThreadId(uint32 os_id)
const {
return (int32)topology[os_id].thread_id; }
1160 int32
getCoreId(uint32 os_id)
const {
return (int32)topology[os_id].core_id; }
1165 int32
getTileId(uint32 os_id)
const {
return (int32)topology[os_id].tile_id; }
1170 int32
getSocketId(uint32 core_id)
const {
return (int32)topology[core_id].socket; }
1181 if (num_sockets == 2)
1194 return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumQPIPorts()) : 0;
1218 return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumMC()) : 0;
1242 return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumMCChannels()) : 0;
1268 return (socket < server_pcicfg_uncore.size() && server_pcicfg_uncore[socket].get()) ? (server_pcicfg_uncore[socket]->getNumMCChannels(controller)) : 0;
1280 return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumEDCChannels()) : 0;
1290 switch (original_cpu_model)
1292 case ATOM_DENVERTON:
1329 return 800000000ULL;
1334 return 1000000000ULL;
1336 return 1100000000ULL;
1345 uint64 getTickCount(uint64 multiplier = 1000 , uint32 core = 0);
1351 uint64 getTickCountRDTSCP(uint64 multiplier = 1000 );
1354 uint64 getUncoreClocks(
const uint32 socket_);
1361 return hasPCICFGUncore() ? server_pcicfg_uncore[socketNr]->getQPILinkSpeed(linkNr) : max_qpi_speed;
1378 static bool initWinRing0Lib();
1380 inline void disableJKTWorkaround() { disable_JKT_workaround =
true; }
1409 enum ChaPipelineQueue
1425 void programPCIeCounters(
const PCIeEventCode event_,
const uint32 tid_ = 0,
const uint32 miss_ = 0,
const uint32 q_ = 0,
const uint32 nc_ = 0);
1426 void programPCIeMissCounters(
const PCIeEventCode event_,
const uint32 tid_ = 0,
const uint32 q_ = 0,
const uint32 nc_ = 0);
1448 void getIIOCounterStates(
int socket,
int IIOStack,
IIOCounterState * result);
1450 uint64 extractCoreGenCounterValue(uint64 val);
1451 uint64 extractCoreFixedCounterValue(uint64 val);
1452 uint64 extractUncoreGenCounterValue(uint64 val);
1453 uint64 extractUncoreFixedCounterValue(uint64 val);
1454 uint64 extractQOSMonitoring(uint64 val);
1458 const char * getUArchCodename(
const int32 cpu_model_ = -1)
const;
1461 static std::string getCPUBrandString();
1462 std::string getCPUFamilyModelString();
1467 bool packageEnergyMetricsAvailable()
const
1470 cpu_model == PCM::JAKETOWN
1471 || cpu_model == PCM::IVYTOWN
1472 || cpu_model == PCM::SANDY_BRIDGE
1473 || cpu_model == PCM::IVY_BRIDGE
1474 || cpu_model == PCM::HASWELL
1475 || original_cpu_model == PCM::ATOM_AVOTON
1476 || original_cpu_model == PCM::ATOM_CHERRYTRAIL
1477 || original_cpu_model == PCM::ATOM_BAYTRAIL
1478 || original_cpu_model == PCM::ATOM_APOLLO_LAKE
1479 || original_cpu_model == PCM::ATOM_DENVERTON
1480 || cpu_model == PCM::HASWELLX
1481 || cpu_model == PCM::BROADWELL
1482 || cpu_model == PCM::BDX_DE
1483 || cpu_model == PCM::BDX
1484 || cpu_model == PCM::KNL
1485 || cpu_model == PCM::SKL
1486 || cpu_model == PCM::KBL
1487 || cpu_model == PCM::SKX
1491 bool dramEnergyMetricsAvailable()
const
1494 cpu_model == PCM::JAKETOWN
1495 || cpu_model == PCM::IVYTOWN
1496 || cpu_model == PCM::HASWELLX
1497 || cpu_model == PCM::BDX_DE
1498 || cpu_model == PCM::BDX
1499 || cpu_model == PCM::KNL
1500 || cpu_model == PCM::SKX
// Package thermal metrics are reported as available under exactly the same
// condition as package energy metrics: the call is forwarded unchanged to
// packageEnergyMetricsAvailable().
1504 bool packageThermalMetricsAvailable()
const
1506 return packageEnergyMetricsAvailable();
1509 bool outgoingQPITrafficMetricsAvailable()
const
1511 return getQPILinksPerSocket() > 0 &&
1513 cpu_model == PCM::NEHALEM_EX
1514 || cpu_model == PCM::WESTMERE_EX
1515 || cpu_model == PCM::JAKETOWN
1516 || cpu_model == PCM::IVYTOWN
1517 || cpu_model == PCM::HASWELLX
1518 || cpu_model == PCM::BDX
1519 || cpu_model == PCM::SKX
1523 bool incomingQPITrafficMetricsAvailable()
const
1525 return getQPILinksPerSocket() > 0 &&
1527 cpu_model == PCM::NEHALEM_EX
1528 || cpu_model == PCM::WESTMERE_EX
1529 || cpu_model == PCM::JAKETOWN
1530 || cpu_model == PCM::IVYTOWN
1531 || (cpu_model == PCM::SKX && cpu_stepping > 1)
// QPI utilization metrics are reported as available under exactly the same
// condition as outgoing QPI traffic metrics: the call is forwarded unchanged
// to outgoingQPITrafficMetricsAvailable().
1535 bool qpiUtilizationMetricsAvailable()
const
1537 return outgoingQPITrafficMetricsAvailable();
1540 bool memoryTrafficMetricsAvailable()
const
1543 cpu_model == PCM::ATOM
1544 || cpu_model == PCM::CLARKDALE
// True only when cpu_model equals PCM::KNL; false for every other model.
1548 bool MCDRAMmemoryTrafficMetricsAvailable()
const
1550 return (cpu_model == PCM::KNL);
1553 bool memoryIOTrafficMetricAvailable()
const
1556 cpu_model == PCM::SANDY_BRIDGE
1557 || cpu_model == PCM::IVY_BRIDGE
1558 || cpu_model == PCM::HASWELL
1559 || cpu_model == PCM::BROADWELL
1560 || cpu_model == PCM::SKL
1561 || cpu_model == PCM::KBL
1565 bool IIOEventsAvailable()
const
1568 cpu_model == PCM::SKX
1572 bool LatencyMetricsAvailable()
const
1575 cpu_model == PCM::HASWELLX
1576 || cpu_model == PCM::BDX
1577 || cpu_model == PCM::SKX
1578 || cpu_model == PCM::SKL
1582 bool PMMTrafficMetricsAvailable()
const
1589 bool LLCReadMissLatencyMetricsAvailable()
const
1592 HASWELLX == cpu_model
1593 || BDX_DE == cpu_model
1595 #ifdef PCM_ENABLE_LLCRDLAT_SKX_MP
1598 || ((SKX == cpu_model) && (num_sockets == 1))
1603 bool hasBecktonUncore()
const
1606 cpu_model == PCM::NEHALEM_EX
1607 || cpu_model == PCM::WESTMERE_EX
1610 bool hasPCICFGUncore() const
1613 cpu_model == PCM::JAKETOWN
1614 || cpu_model == PCM::IVYTOWN
1615 || cpu_model == PCM::HASWELLX
1616 || cpu_model == PCM::BDX_DE
1617 || cpu_model == PCM::SKX
1618 || cpu_model == PCM::BDX
1619 || cpu_model == PCM::KNL
1626 cpu_model == PCM::SKX
1630 const char * xPI()
const
1638 bool supportsHLE()
const;
1639 bool supportsRTM()
const;
1641 bool useSkylakeEvents()
const
1643 return PCM::SKL == cpu_model
1644 || PCM::KBL == cpu_model
1645 || PCM::SKX == cpu_model
1649 static double getBytesPerFlit(int32 cpu_model_)
1651 if(cpu_model_ == PCM::SKX)
// Instance convenience overload: passes this object's cpu_model to the static
// getBytesPerFlit(int32) and returns its result.
1660 double getBytesPerFlit()
const
1662 return getBytesPerFlit(cpu_model);
1665 static double getDataBytesPerFlit(int32 cpu_model_)
1667 if(cpu_model_ == PCM::SKX)
// Instance convenience overload: passes this object's cpu_model to the static
// getDataBytesPerFlit(int32) and returns its result.
1676 double getDataBytesPerFlit()
const
1678 return getDataBytesPerFlit(cpu_model);
1681 static double getFlitsPerLinkCycle(int32 cpu_model_)
1683 if(cpu_model_ == PCM::SKX)
// Bytes per link cycle for the given model, computed as
// getBytesPerFlit(cpu_model_) multiplied by getFlitsPerLinkCycle(cpu_model_).
1691 static double getBytesPerLinkCycle(int32 cpu_model_)
1693 return getBytesPerFlit(cpu_model_) * getFlitsPerLinkCycle(cpu_model_);
// Instance convenience overload: passes this object's cpu_model to the static
// getBytesPerLinkCycle(int32) and returns its result.
1696 double getBytesPerLinkCycle()
const
1698 return getBytesPerLinkCycle(cpu_model);
1701 static double getLinkTransfersPerLinkCycle()
// Bytes per link transfer, computed as getBytesPerLinkCycle() divided by
// getLinkTransfersPerLinkCycle().
1706 double getBytesPerLinkTransfer()
const
1708 return getBytesPerLinkCycle() / getLinkTransfersPerLinkCycle();
1715 #define PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(m) bool is##m() const { return m; }
1717 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L2CacheHitRatioAvailable)
1718 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitRatioAvailable)
1719 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheMissesAvailable)
1720 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L2CacheMissesAvailable)
1721 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L2CacheHitsAvailable)
1722 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsNoSnoopAvailable)
1723 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsSnoopAvailable)
1724 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsAvailable)
1725 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(CyclesLostDueL3CacheMissesAvailable)
1726 PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(CyclesLostDueL2CacheMissesAvailable)
1728 #undef PCM_GEN_METRIC_AVAILABLE_FUNCTION
// True for every cpu_model except ATOM.
1730 bool isActiveRelativeFrequencyAvailable()
const
1732 return ATOM != cpu_model;
1744 template <
class CounterStateType>
1745 friend double getExecUsage(
const CounterStateType & before,
const CounterStateType & after);
1746 template <
class CounterStateType>
1747 friend double getIPC(
const CounterStateType & before,
const CounterStateType & after);
1748 template <
class CounterStateType>
1749 friend double getAverageFrequency(
const CounterStateType & before,
const CounterStateType & after);
1750 template <
class CounterStateType>
1752 template <
class CounterStateType>
1753 friend double getRelativeFrequency(
const CounterStateType & before,
const CounterStateType & after);
1754 template <
class CounterStateType>
1756 template <
class CounterStateType>
1757 friend double getL2CacheHitRatio(
const CounterStateType & before,
const CounterStateType & after);
1758 template <
class CounterStateType>
1759 friend double getL3CacheHitRatio(
const CounterStateType & before,
const CounterStateType & after);
1760 template <
class CounterStateType>
1761 friend uint64
getL3CacheMisses(
const CounterStateType & before,
const CounterStateType & after);
1762 template <
class CounterStateType>
1763 friend uint64
getL2CacheMisses(
const CounterStateType & before,
const CounterStateType & after);
1764 template <
class CounterStateType>
1765 friend uint64
getL2CacheHits(
const CounterStateType & before,
const CounterStateType & after);
1766 template <
class CounterStateType>
1767 friend uint64
getL3CacheHitsNoSnoop(
const CounterStateType & before,
const CounterStateType & after);
1768 template <
class CounterStateType>
1769 friend uint64
getL3CacheHitsSnoop(
const CounterStateType & before,
const CounterStateType & after);
1770 template <
class CounterStateType>
1771 friend uint64
getL3CacheHits(
const CounterStateType & before,
const CounterStateType & after);
1772 template <
class CounterStateType>
1774 template <
class CounterStateType>
1776 template <
class CounterStateType>
1778 template <
class CounterStateType>
1779 friend uint64
getLocalMemoryBW(
const CounterStateType & before,
const CounterStateType & after);
1780 template <
class CounterStateType>
1781 friend uint64
getRemoteMemoryBW(
const CounterStateType & before,
const CounterStateType & after);
1782 template <
class CounterStateType>
1783 friend uint64
getCycles(
const CounterStateType & before,
const CounterStateType & after);
1784 template <
class CounterStateType>
1786 template <
class CounterStateType>
1787 friend uint64
getCycles(
const CounterStateType & now);
1788 template <
class CounterStateType>
1790 template <
class CounterStateType>
1791 friend uint64
getNumberOfCustomEvents(int32 eventCounterNr,
const CounterStateType & before,
const CounterStateType & after);
1792 template <
class CounterStateType>
1793 friend uint64
getInvariantTSC(
const CounterStateType & before,
const CounterStateType & after);
1794 template <
class CounterStateType>
1795 friend uint64
getRefCycles(
const CounterStateType & before,
const CounterStateType & after);
1796 template <
class CounterStateType>
1797 friend double getCoreCStateResidency(
int state,
const CounterStateType & before,
const CounterStateType & after);
1798 template <
class CounterStateType>
1799 friend uint64
getSMICount(
const CounterStateType & before,
const CounterStateType & after);
1802 uint64 InstRetiredAny;
1803 uint64 CpuClkUnhaltedThread;
1804 uint64 CpuClkUnhaltedRef;
1812 uint64 L3UnsharedHit;
1826 uint64 InvariantTSC;
1827 uint64 CStateResidency[PCM::MAX_C_STATE + 1];
1828 int32 ThermalHeadroom;
1830 void readAndAggregate(std::shared_ptr<SafeMsrHandle>);
1831 void readAndAggregateTSC(std::shared_ptr<SafeMsrHandle>);
1832 uint64 MemoryBWLocal;
1833 uint64 MemoryBWTotal;
1839 CpuClkUnhaltedThread(0),
1840 CpuClkUnhaltedRef(0),
1846 ThermalHeadroom(PCM_INVALID_THERMAL_HEADROOM),
1852 memset(CStateResidency, 0,
sizeof(CStateResidency));
1858 InstRetiredAny += o.InstRetiredAny;
1859 CpuClkUnhaltedThread += o.CpuClkUnhaltedThread;
1860 CpuClkUnhaltedRef += o.CpuClkUnhaltedRef;
1865 InvariantTSC += o.InvariantTSC;
1866 for (
int i = 0; i <= (int)PCM::MAX_C_STATE; ++i)
1867 CStateResidency[i] += o.CStateResidency[i];
1869 L3Occupancy += o.L3Occupancy;
1870 MemoryBWLocal += o.MemoryBWLocal;
1871 MemoryBWTotal += o.MemoryBWTotal;
1872 SMICount += o.SMICount;
1880 inline uint64 RDTSC()
1886 result =
static_cast<uint64
>(__rdtsc());
1890 uint32 high = 0, low = 0;
1891 asm volatile(
"rdtsc" :
"=a" (low),
"=d" (high));
1892 result = low + (uint64(high)<<32ULL);
1898 inline uint64 RDTSCP()
1905 result = __rdtscp(&Aux);
1909 uint32 high = 0, low = 0;
1913 "mov %%eax, %1\n\t":
1914 "=r" (high),
"=r" (low) ::
"%rax",
"%rcx",
"%rdx");
1915 result = low + (uint64(high)<<32ULL);
// Returns the change in QPIClocks for one port between two counter states:
// the value read from `before` is subtracted from the value read from `after`.
// `port` indexes the QPIClocks member of both states; no range check is done here.
1925 template <
class CounterStateType>
1926 uint64
getQPIClocks(uint32 port,
const CounterStateType & before,
const CounterStateType & after)
1928 return after.QPIClocks[port] - before.QPIClocks[port];
// Returns after.getThermalHeadroom(). The first (unnamed) state argument is
// accepted but not used, so the result depends on `after` only.
1932 template <
class CounterStateType>
1933 int32 getThermalHeadroom(
const CounterStateType & ,
const CounterStateType & after)
1935 return after.getThermalHeadroom();
// Returns the change in QPIL0pTxCycles for one port between two counter
// states: the `before` value is subtracted from the `after` value.
// `port` indexes the QPIL0pTxCycles member of both states; no range check is done here.
1943 template <
class CounterStateType>
1944 uint64
getQPIL0pTxCycles(uint32 port,
const CounterStateType & before,
const CounterStateType & after)
1946 return after.QPIL0pTxCycles[port] - before.QPIL0pTxCycles[port];
// Returns the change in QPIL1Cycles for one port between two counter states:
// the `before` value is subtracted from the `after` value.
// `port` indexes the QPIL1Cycles member of both states; no range check is done here.
1954 template <
class CounterStateType>
1955 uint64
getQPIL1Cycles(uint32 port,
const CounterStateType & before,
const CounterStateType & after)
1957 return after.QPIL1Cycles[port] - before.QPIL1Cycles[port];
1966 template <
class CounterStateType>
1978 template <
class CounterStateType>
// Returns the change in DRAMClocks for one channel between two counter
// states: the `before` value is subtracted from the `after` value.
// `channel` indexes the DRAMClocks member of both states; no range check is done here.
1989 template <
class CounterStateType>
1990 uint64
getDRAMClocks(uint32 channel,
const CounterStateType & before,
const CounterStateType & after)
1992 return after.DRAMClocks[channel] - before.DRAMClocks[channel];
// Returns the change in MCDRAMClocks for one channel between two counter
// states: the `before` value is subtracted from the `after` value.
// `channel` indexes the MCDRAMClocks member of both states; no range check is done here.
2000 template <
class CounterStateType>
2001 uint64
getMCDRAMClocks(uint32 channel,
const CounterStateType & before,
const CounterStateType & after)
2003 return after.MCDRAMClocks[channel] - before.MCDRAMClocks[channel];
// Returns the change in MCCounter[channel][counter] between two counter
// states: the `before` value is subtracted from the `after` value.
// `channel` is the outer index and `counter` the inner index; neither is
// range-checked here.
2013 template <
class CounterStateType>
2014 uint64
getMCCounter(uint32 channel, uint32
counter,
const CounterStateType & before,
const CounterStateType & after)
2016 return after.MCCounter[channel][counter] - before.MCCounter[channel][counter];
// Returns the change in M2MCounter[controller][counter] between two counter
// states: the `before` value is subtracted from the `after` value.
// `controller` is the outer index and `counter` the inner index; neither is
// range-checked here.
2026 template <
class CounterStateType>
2027 uint64
getM2MCounter(uint32 controller, uint32
counter,
const CounterStateType & before,
const CounterStateType & after)
2029 return after.M2MCounter[controller][counter] - before.M2MCounter[controller][counter];
// Returns the change in EDCCounter[channel][counter] between two counter
// states: the `before` value is subtracted from the `after` value.
// `channel` is the outer index and `counter` the inner index; neither is
// range-checked here.
2039 template <
class CounterStateType>
2040 uint64
getEDCCounter(uint32 channel, uint32
counter,
const CounterStateType & before,
const CounterStateType & after)
2042 return after.EDCCounter[channel][counter] - before.EDCCounter[channel][counter];
2050 template <
class CounterStateType>
2053 return after.PCUCounter[counter] - before.PCUCounter[counter];
2060 template <
class CounterStateType>
2061 uint64
getPCUClocks(
const CounterStateType & before,
const CounterStateType & after)
2070 template <
class CounterStateType>
2073 return after.PackageEnergyStatus - before.PackageEnergyStatus;
2080 template <
class CounterStateType>
2083 return after.DRAMEnergyStatus - before.DRAMEnergyStatus;
2090 template <
class CounterStateType>
2103 template <
class CounterStateType>
2108 double dram_joules_per_energy_unit;
2121 dram_joules_per_energy_unit = 0.0000153;
2137 template <
class CounterStateType>
2138 friend uint64
getBytesReadFromMC(
const CounterStateType & before,
const CounterStateType & after);
2139 template <
class CounterStateType>
2140 friend uint64
getBytesWrittenToMC(
const CounterStateType & before,
const CounterStateType & after);
2141 template <
class CounterStateType>
2142 friend uint64
getBytesReadFromPMM(
const CounterStateType & before,
const CounterStateType & after);
2143 template <
class CounterStateType>
2144 friend uint64
getBytesWrittenToPMM(
const CounterStateType & before,
const CounterStateType & after);
2145 template <
class CounterStateType>
2146 friend uint64
getBytesReadFromEDC(
const CounterStateType & before,
const CounterStateType & after);
2147 template <
class CounterStateType>
2148 friend uint64
getBytesWrittenToEDC(
const CounterStateType & before,
const CounterStateType & after);
2149 template <
class CounterStateType>
2151 template <
class CounterStateType>
2152 friend uint64
getConsumedEnergy(
const CounterStateType & before,
const CounterStateType & after);
2153 template <
class CounterStateType>
2154 friend uint64
getDRAMConsumedEnergy(
const CounterStateType & before,
const CounterStateType & after);
2155 template <
class CounterStateType>
2157 template <
class CounterStateType>
2158 friend double getLLCReadMissLatency(
const CounterStateType & before,
const CounterStateType & after);
2161 uint64 UncMCFullWrites;
2162 uint64 UncMCNormalReads;
2163 uint64 UncPMMWrites;
2165 uint64 UncEDCFullWrites;
2166 uint64 UncEDCNormalReads;
2167 uint64 UncMCIORequests;
2168 uint64 PackageEnergyStatus;
2169 uint64 DRAMEnergyStatus;
2170 uint64 TOROccupancyIAMiss;
2171 uint64 TORInsertsIAMiss;
2173 uint64 CStateResidency[PCM::MAX_C_STATE + 1];
2174 void readAndAggregate(std::shared_ptr<SafeMsrHandle>);
2179 UncMCNormalReads(0),
2182 UncEDCFullWrites(0),
2183 UncEDCNormalReads(0),
2185 PackageEnergyStatus(0),
2186 DRAMEnergyStatus(0),
2187 TOROccupancyIAMiss(0),
2188 TORInsertsIAMiss(0),
2191 memset(CStateResidency, 0,
sizeof(CStateResidency));
2197 UncMCFullWrites += o.UncMCFullWrites;
2198 UncMCNormalReads += o.UncMCNormalReads;
2199 UncPMMReads += o.UncPMMReads;
2200 UncPMMWrites += o.UncPMMWrites;
2201 UncEDCFullWrites += o.UncEDCFullWrites;
2202 UncEDCNormalReads += o.UncEDCNormalReads;
2203 UncMCIORequests += o.UncMCIORequests;
2204 PackageEnergyStatus += o.PackageEnergyStatus;
2205 DRAMEnergyStatus += o.DRAMEnergyStatus;
2206 TOROccupancyIAMiss += o.TOROccupancyIAMiss;
2207 TORInsertsIAMiss += o.TORInsertsIAMiss;
2208 UncClocks += o.UncClocks;
2209 for (
int i = 0; i <= (int)PCM::MAX_C_STATE; ++i)
2210 CStateResidency[i] += o.CStateResidency[i];
2228 std::array<uint64, maxXPILinks> QPIClocks, QPIL0pTxCycles, QPIL1Cycles;
2229 std::array<uint64, maxChannels> DRAMClocks;
2230 std::array<uint64, maxChannels> MCDRAMClocks;
2231 std::array<std::array<uint64, maxCounters>, maxChannels> MCCounter;
2232 std::array<std::array<uint64, maxCounters>, maxControllers> M2MCounter;
2233 std::array<std::array<uint64, maxCounters>, maxChannels> EDCCounter;
2234 std::array<uint64, maxCounters> PCUCounter;
2235 int32 PackageThermalHeadroom;
2236 uint64 InvariantTSC;
2238 template <
class CounterStateType>
2239 friend uint64
getQPIClocks(uint32 port,
const CounterStateType & before,
const CounterStateType & after);
2240 template <
class CounterStateType>
2241 friend uint64
getQPIL0pTxCycles(uint32 port,
const CounterStateType & before,
const CounterStateType & after);
2242 template <
class CounterStateType>
2243 friend uint64
getQPIL1Cycles(uint32 port,
const CounterStateType & before,
const CounterStateType & after);
2244 template <
class CounterStateType>
2245 friend uint64
getDRAMClocks(uint32 channel,
const CounterStateType & before,
const CounterStateType & after);
2246 template <
class CounterStateType>
2247 friend uint64
getMCDRAMClocks(uint32 channel,
const CounterStateType & before,
const CounterStateType & after);
2248 template <
class CounterStateType>
2249 friend uint64
getMCCounter(uint32 channel, uint32
counter,
const CounterStateType & before,
const CounterStateType & after);
2250 template <
class CounterStateType>
2251 friend uint64
getM2MCounter(uint32 controller, uint32 counter,
const CounterStateType & before,
const CounterStateType & after);
2252 template <
class CounterStateType>
2253 friend uint64
getEDCCounter(uint32 channel, uint32 counter,
const CounterStateType & before,
const CounterStateType & after);
2254 template <
class CounterStateType>
2255 friend uint64
getPCUCounter(uint32 counter,
const CounterStateType & before,
const CounterStateType & after);
2256 template <
class CounterStateType>
2257 friend uint64
getConsumedEnergy(
const CounterStateType & before,
const CounterStateType & after);
2258 template <
class CounterStateType>
2259 friend uint64
getDRAMConsumedEnergy(
const CounterStateType & before,
const CounterStateType & after);
2260 template <
class CounterStateType>
2261 friend uint64
getInvariantTSC(
const CounterStateType & before,
const CounterStateType & after);
2267 QPIClocks{}, QPIL0pTxCycles{}, QPIL1Cycles{},
2274 PackageThermalHeadroom(0),
2294 void readAndAggregate(std::shared_ptr<SafeMsrHandle> handle)
2296 BasicCounterState::readAndAggregate(handle);
2297 UncoreCounterState::readAndAggregate(handle);
2303 BasicCounterState::operator += (o);
2311 std::vector<std::vector<uint64> > incomingQPIPackets;
2312 std::vector<std::vector<uint64> > outgoingQPIFlits;
2313 std::vector<std::vector<uint64> > TxL0Cycles;
2317 void readAndAggregate(std::shared_ptr<SafeMsrHandle> handle)
2319 BasicCounterState::readAndAggregate(handle);
2320 UncoreCounterState::readAndAggregate(handle);
2344 BasicCounterState::operator += (o);
2345 UncoreCounterState::operator += (o);
/*! \brief Computes the average number of retired instructions per core cycle (IPC)

    Both operands are deltas between the two snapshots: InstRetiredAny
    (instructions) divided by CpuClkUnhaltedThread (core clocks).

    \param before counter state taken at the start of the measured interval
    \param after  counter state taken at the end of the measured interval
    \return instructions-per-cycle ratio as a double
    NOTE(review): listing line 2390 is not visible here; presumably it guards
    the division against clocks == 0 - confirm against the full header.
*/
2386 template <
class CounterStateType>
2387 double getIPC(
const CounterStateType & before,
const CounterStateType & after)
2389 int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
2391 return double(after.InstRetiredAny - before.InstRetiredAny) / double(clocks);
2402 template <
class CounterStateType>
2405 return after.InstRetiredAny - before.InstRetiredAny;
/*! \brief Computes the average number of retired instructions per time interval

    The time base is the InvariantTSC delta between the two snapshots, so the
    result is instructions per invariant-TSC tick rather than per core clock.

    \param before counter state taken at the start of the measured interval
    \param after  counter state taken at the end of the measured interval
    \return InstRetiredAny delta divided by InvariantTSC delta when the TSC
            delta is non-zero
    NOTE(review): the value returned when timer_clocks == 0 is on a line not
    visible in this listing - confirm against the full header.
*/
2414 template <
class CounterStateType>
2415 double getExecUsage(
const CounterStateType & before,
const CounterStateType & after)
2417 int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
// Only divide when the TSC actually advanced between the two snapshots.
2418 if (timer_clocks != 0)
2419 return double(after.InstRetiredAny - before.InstRetiredAny) / double(timer_clocks);
2428 template <
class CounterStateType>
2431 return now.InstRetiredAny;
/*! \brief Computes the CpuClkUnhaltedThread delta between two counter states

    \param before counter state taken at the start of the measured interval
    \param after  counter state taken at the end of the measured interval
    \return after.CpuClkUnhaltedThread minus before.CpuClkUnhaltedThread
            (plain subtraction, no clamping)
*/
2451 template <
class CounterStateType>
2452 uint64
getCycles(
const CounterStateType & before,
const CounterStateType & after)
2454 return after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
/*! \brief Computes the number of reference clock cycles while the clock signal on the core is running

    \param before counter state taken at the start of the measured interval
    \param after  counter state taken at the end of the measured interval
    \return after.CpuClkUnhaltedRef minus before.CpuClkUnhaltedRef
            (plain subtraction, no clamping)
*/
2467 template <
class CounterStateType>
2468 uint64
getRefCycles(
const CounterStateType & before,
const CounterStateType & after)
2470 return after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
2480 template <
class CounterStateType>
2483 return now.CpuClkUnhaltedThread;
2496 double ipc =
getIPC(before, after);
2527 template <
class CounterStateType>
2530 int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
2531 int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
2533 if (timer_clocks != 0 && m)
2544 template <
class CounterStateType>
2547 int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
2548 int64 ref_clocks = after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
2550 if (ref_clocks != 0 && m)
2561 template <
class CounterStateType>
2564 int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
2565 int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
2566 if (timer_clocks != 0)
2567 return double(clocks) / double(timer_clocks);
2577 template <
class CounterStateType>
2580 int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
2581 int64 ref_clocks = after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
2582 if (ref_clocks != 0)
2583 return double(clocks) / double(ref_clocks);
2594 template <
class CounterStateType>
2598 if (cpu_model == PCM::ATOM || cpu_model == PCM::KNL)
return -1;
2599 int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
2602 return 180. * double(after.L3Miss - before.L3Miss) / double(clocks);
2615 template <
class CounterStateType>
2619 if (cpu_model == PCM::ATOM || cpu_model == PCM::KNL ||
PCM::getInstance()->useSkylakeEvents())
return -1;
2620 int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
2623 double L3UnsharedHit = (double)(after.L3UnsharedHit - before.L3UnsharedHit);
2624 double L2HitM = (double)(after.L2HitM - before.L2HitM);
2625 return (35. * L3UnsharedHit + 74. * L2HitM) / double(clocks);
2637 template <
class CounterStateType>
2641 uint64 L2Hit = after.L2Hit - before.L2Hit;
2642 uint64 L2Ref = L2Hit + after.SKLL2Miss - before.SKLL2Miss;
2644 return double(L2Hit) / double(L2Ref);
2649 if (cpu_model == PCM::ATOM || cpu_model == PCM::KNL)
2651 uint64 L2Miss = after.ArchLLCMiss - before.ArchLLCMiss;
2652 uint64 L2Ref = after.ArchLLCRef - before.ArchLLCRef;
2654 return 1. - (double(L2Miss) / double(L2Ref));
2658 uint64 L3Miss = after.L3Miss - before.L3Miss;
2659 uint64 L3UnsharedHit = after.L3UnsharedHit - before.L3UnsharedHit;
2660 uint64 L2HitM = after.L2HitM - before.L2HitM;
2661 uint64 L2Hit = after.L2Hit - before.L2Hit;
2662 uint64 hits = L2Hit;
2663 uint64 all = L2Hit + L2HitM + L3UnsharedHit + L3Miss;
2664 if (all)
return double(hits) / double(all);
2676 template <
class CounterStateType>
2681 uint64 L3Hit = after.SKLL3Hit - before.SKLL3Hit;
2682 uint64 L3Ref = L3Hit + after.L3Miss - before.L3Miss;
2684 return double(L3Hit) / double(L3Ref);
2689 uint64 L3Miss = after.L3Miss - before.L3Miss;
2690 uint64 L3UnsharedHit = after.L3UnsharedHit - before.L3UnsharedHit;
2691 uint64 L2HitM = after.L2HitM - before.L2HitM;
2692 uint64 hits = L3UnsharedHit + L2HitM;
2693 uint64 all = L2HitM + L3UnsharedHit + L3Miss;
2694 if (all)
return double(hits) / double(all);
2706 template <
class CounterStateType>
2710 return after.L3Miss - before.L3Miss;
2720 template <
class CounterStateType>
2724 return after.SKLL2Miss - before.SKLL2Miss;
2727 if (cpu_model == PCM::ATOM || cpu_model == PCM::KNL)
2729 return after.ArchLLCMiss - before.ArchLLCMiss;
2731 uint64 L3Miss = after.L3Miss - before.L3Miss;
2732 uint64 L3UnsharedHit = after.L3UnsharedHit - before.L3UnsharedHit;
2733 uint64 L2HitM = after.L2HitM - before.L2HitM;
2734 return L2HitM + L3UnsharedHit + L3Miss;
/*! \brief Computes the number of L2 cache hits between two counter states

    \param before counter state taken at the start of the measured interval
    \param after  counter state taken at the end of the measured interval
    \return for PCM::ATOM and PCM::KNL: ArchLLCRef delta minus ArchLLCMiss
            delta; otherwise the L2Hit delta
    NOTE(review): the definition of cpu_model and the braces delimiting the
    ATOM/KNL branch are on lines not visible in this listing - confirm the
    branch structure against the full header.
*/
2744 template <
class CounterStateType>
2745 uint64
getL2CacheHits(
const CounterStateType & before,
const CounterStateType & after)
2748 if (cpu_model == PCM::ATOM || cpu_model == PCM::KNL)
// On these models hits are derived as references minus misses from the ArchLLC* counters.
2750 uint64 L2Miss = after.ArchLLCMiss - before.ArchLLCMiss;
2751 uint64 L2Ref = after.ArchLLCRef - before.ArchLLCRef;
2752 return L2Ref - L2Miss;
// Other models: use the L2Hit counter delta directly.
2754 return after.L2Hit - before.L2Hit;
2760 template <
class CounterStateType>
2763 return now.L3Occupancy;
2768 template <
class CounterStateType>
2771 return after.MemoryBWLocal - before.MemoryBWLocal;
2777 template <
class CounterStateType>
2780 const uint64 total = after.MemoryBWTotal - before.MemoryBWTotal;
2783 return total - local;
2795 template <
class CounterStateType>
2799 return after.L3UnsharedHit - before.L3UnsharedHit;
2809 template <
class CounterStateType>
2814 return after.SKLL3Hit - before.SKLL3Hit;
2816 return after.L2HitM - before.L2HitM;
2827 template <
class CounterStateType>
2828 uint64
getL3CacheHits(
const CounterStateType & before,
const CounterStateType & after)
2842 template <
class CounterStateType>
2845 return after.InvariantTSC - before.InvariantTSC;
2855 template <
class CounterStateType>
2860 if (state == 0)
return double(
getRefCycles(before, after)) / tsc;
2865 double result = 1.0 - double(
getRefCycles(before, after)) / tsc;
2866 for (
int i = 2; i <= PCM::MAX_C_STATE; ++i)
2868 result -= (after.BasicCounterState::CStateResidency[i] - before.BasicCounterState::CStateResidency[i]) / tsc;
2870 if (result < 0.) result = 0.;
2871 else if (result > 1.) result = 1.;
2875 return (after.BasicCounterState::CStateResidency[state] - before.BasicCounterState::CStateResidency[state]) / tsc;
2885 template <
class CounterStateType>
2888 return double(after.UncoreCounterState::CStateResidency[state] - before.UncoreCounterState::CStateResidency[state]) / double(
getInvariantTSC(before, after));
2898 template <
class CounterStateType>
2901 return (after.UncMCNormalReads - before.UncMCNormalReads) * 64;
2910 template <
class CounterStateType>
2913 return (after.UncMCFullWrites - before.UncMCFullWrites) * 64;
2922 template <
class CounterStateType>
2925 return (after.UncPMMReads - before.UncPMMReads) * 64;
2934 template <
class CounterStateType>
2937 return (after.UncPMMWrites - before.UncPMMWrites) * 64;
2946 template <
class CounterStateType>
2949 return (after.UncEDCNormalReads - before.UncEDCNormalReads) * 64;
2958 template <
class CounterStateType>
2961 return (after.UncEDCFullWrites - before.UncEDCFullWrites) * 64;
2971 template <
class CounterStateType>
2974 return (after.UncMCIORequests - before.UncMCIORequests) * 64;
/*! \brief Returns the number of system management interrupts that occurred between two counter states

    \param before counter state taken at the start of the measured interval
    \param after  counter state taken at the end of the measured interval
    \return after.SMICount minus before.SMICount
*/
2983 template <
class CounterStateType>
2984 uint64
getSMICount(
const CounterStateType & before,
const CounterStateType & after)
2986 return after.SMICount - before.SMICount;
2998 template <
class CounterStateType>
3001 return ((&after.Event0)[eventCounterNr] - (&before.Event0)[eventCounterNr]);
3017 uint64 b = before.incomingQPIPackets[socketNr][linkNr];
3018 uint64 a = after.incomingQPIPackets[socketNr][linkNr];
3020 return (a > b) ? (64 * (a - b)) : 0;
3036 if (!(m->qpiUtilizationMetricsAvailable()))
return 0.;
3041 return bytes / max_bytes;
3058 if (m->hasBecktonUncore())
3060 const uint64 b = before.outgoingQPIFlits[socketNr][linkNr];
3061 const uint64 a = after.outgoingQPIFlits[socketNr][linkNr];
3063 const double idle_flits = (double)((a > b) ? (a - b) : 0);
3064 const uint64 bTSC = before.uncoreTSC;
3065 const uint64 aTSC = after.uncoreTSC;
3066 const double tsc = (double)((aTSC > bTSC) ? (aTSC - bTSC) : 0);
3067 if (idle_flits >= tsc)
return 0.;
3069 return (1. - (idle_flits / tsc));
3070 }
else if (m->hasPCICFGUncore())
3072 const uint64 b = before.outgoingQPIFlits[socketNr][linkNr];
3073 const uint64 a = after.outgoingQPIFlits[socketNr][linkNr];
3075 double flits = (double)((a > b) ? (a - b) : 0);
3081 if (flits > max_flits)
return 1.;
3082 return (flits / max_flits);
3101 if (!(m->outgoingQPITrafficMetricsAvailable()))
return 0;
3106 return (uint64)(max_bytes * util);
3125 for (uint32 s = 0; s < ns; ++s)
3126 for (uint32 q = 0; q < qpiLinks; ++q)
3147 for (uint32 s = 0; s < ns; ++s)
3148 for (uint32 q = 0; q < qpiLinks; ++q)
3166 return 64 * now.incomingQPIPackets[socketNr][linkNr];
3183 for (uint32 q = 0; q < qpiLinks; ++q)
3202 for (uint32 s = 0; s < ns; ++s)
3225 return double(totalQPI) / double(memTraffic);
3231 template <
class CounterType>
3234 return after.data - before.data;
3238 template <
class CounterStateType>
3241 const double occupancy = double(after.TOROccupancyIAMiss) - double(before.TOROccupancyIAMiss);
3242 const double inserts = double(after.TORInsertsIAMiss) - double(before.TORInsertsIAMiss);
3243 const double unc_clocks = double(after.UncClocks) - double(before.UncClocks);
3245 const double seconds = double(
getInvariantTSC(before, after)) / double(m->getNumCores()/m->getNumSockets()) /
double(m->getNominalFrequency());
3246 return 1e9*seconds*(occupancy/inserts)/unc_clocks;
size_t getEDCChannelsPerSocket() const
Returns the total number of detected memory channels on all integrated memory controllers per socket...
Definition: cpucounters.h:1275
double getNormalizedQPIL1Cycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the ratio of QPI cycles in power saving shutdown mode.
Definition: cpucounters.h:1979
uint64 getSocketIncomingQPILinkBytes(uint32 socketNr, const SystemCounterState &now)
Get estimation of total QPI data traffic for this socket.
Definition: cpucounters.h:3177
friend uint64 getInvariantTSC(const CounterStateType &before, const CounterStateType &after)
Computes number of invariant time stamp counter ticks.
Definition: cpucounters.h:2843
Definition: memoptest.cpp:35
uint32 getCPUStepping() const
Reads CPU stepping id.
Definition: cpucounters.h:1150
uint64 getEDCCounter(uint32 channel, uint32 counter)
Direct read of embedded DRAM memory controller PMU counter (counter meaning depends on the programmin...
Definition: cpucounters.cpp:5674
uint64 getQPIClocks(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns QPI LL clock ticks.
Definition: cpucounters.h:1926
friend uint64 getBytesWrittenToMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to DRAM memory controllers.
Definition: cpucounters.h:2911
friend uint64 getBytesReadFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from DRAM memory controllers.
Definition: cpucounters.h:2899
uint64 getMCDRAMClocks(uint32 channel)
Get the number of MCDRAM channel cycles.
Definition: cpucounters.cpp:5651
uint64 getAllOutgoingQPILinkBytes(const SystemCounterState &before, const SystemCounterState &after)
Get estimation of total QPI data+nondata traffic.
Definition: cpucounters.h:3140
friend uint64 getM2MCounter(uint32 controller, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of Memory2Mesh controller PMU counter (counter meaning depends on the programming: power/...
Definition: cpucounters.h:2027
Internal type and constant definitions.
friend uint64 getPCUCounter(uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of power control unit PMU counter (counter meaning depends on the programming: power/perf...
Definition: cpucounters.h:2051
uint64 getMCDRAMClocks(uint32 channel, const CounterStateType &before, const CounterStateType &after)
Returns MCDRAM clock ticks.
Definition: cpucounters.h:2001
uint32 getNumOnlineCores() const
Reads number of online logical cores in the system.
Definition: cpucounters.cpp:4111
uint64 getPMMReads()
Get the number of PMM memory reads (in cache lines)
Definition: cpucounters.cpp:5359
uint64 getQPILinkSpeed(const uint32 linkNr) const
Returns the speed of the QPI link.
Definition: cpucounters.h:426
void setRunState(int new_state)
Set Run State.
Definition: cpucounters.h:589
size_t getNumMCChannels() const
Returns the total number of detected memory channels on all integrated memory controllers.
Definition: cpucounters.h:438
Socket-wide counter state.
Definition: cpucounters.h:2289
friend uint64 getL3CacheHitsNoSnoop(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache hits where no snooping in sibling L2 caches had to be done...
Definition: cpucounters.h:2796
uint64 getL3CacheMisses(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache misses.
Definition: cpucounters.h:2707
Interface to access client bandwidth counters.
size_t getMCChannelsPerSocket() const
Returns the total number of detected memory channels on all integrated memory controllers per socket...
Definition: cpucounters.h:1224
uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get estimation of QPI data traffic per incoming QPI link.
Definition: cpucounters.h:3014
friend uint64 getEDCCounter(uint32 channel, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of embedded DRAM memory controller counter (counter meaning depends on the programming: p...
Definition: cpucounters.h:2040
friend uint64 getL3CacheHitsSnoop(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache hits where snooping in sibling L2 caches had to be done.
Definition: cpucounters.h:2810
int64 getCPUMicrocodeLevel() const
Get microcode level (returns -1 if retrieval not supported due to some restrictions) ...
Definition: cpucounters.h:1465
friend uint64 getRemoteMemoryBW(const CounterStateType &before, const CounterStateType &after)
Computes Remote Memory Bandwidth.
Definition: cpucounters.h:2778
SupportedCPUModels
Identifiers of supported CPU models.
Definition: cpucounters.h:1103
double getCoreIPC(const SystemCounterState &before, const SystemCounterState &after)
Computes average number of retired instructions per core cycle for the entire system combining instru...
Definition: cpucounters.h:2494
uint64 getL3CacheHits(const CounterStateType &before, const CounterStateType &after)
Computes total number of L3 cache hits.
Definition: cpucounters.h:2828
uint64 getL3CacheHitsSnoop(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache hits where snooping in sibling L2 caches had to be done.
Definition: cpucounters.h:2810
Definition: cpucounters.cpp:1610
Object to access uncore counters in a socket/processor with microarchitecture codename SandyBridge-EP...
Definition: cpucounters.h:282
uint64 getDRAMConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by DRAM (measured in internal units)
Definition: cpucounters.h:2081
friend double getIPC(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per core cycle (IPC)
Definition: cpucounters.h:2387
void reportQPISpeed() const
Print QPI Speeds.
Definition: cpucounters.cpp:5919
Definition: cpucounters.h:83
friend uint64 getBytesReadFromPMM(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from PMM memory.
Definition: cpucounters.h:2923
uint64 getL3CacheOccupancy(const CounterStateType &now)
Computes L3 Cache Occupancy.
Definition: cpucounters.h:2761
uint64 getEdcReads()
Get the number of cache lines read by EDC (embedded DRAM controller)
Definition: cpucounters.cpp:5379
friend double getActiveRelativeFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost techn...
Definition: cpucounters.h:2578
friend uint64 getConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by processor, excluding DRAM (measured in internal units) ...
Definition: cpucounters.h:2071
friend double getCyclesLostDueL2CacheMisses(const CounterStateType &before, const CounterStateType &after)
Estimates how many core cycles were potentially lost due to missing L2 cache but still hitting L3 cac...
Definition: cpucounters.h:2616
friend double getL2CacheHitRatio(const CounterStateType &before, const CounterStateType &after)
Computes L2 cache hit ratio.
Definition: cpucounters.h:2638
uint32 getCPUModel() const
Reads CPU model id.
Definition: cpucounters.h:1142
uint64 getQPILinkSpeed(uint32 socketNr, uint32 linkNr) const
Return QPI Link Speed in GBytes/second.
Definition: cpucounters.h:1359
ProgramMode
Mode of programming (parameter in the program() method)
Definition: cpucounters.h:607
friend uint64 getInvariantTSC(const CounterStateType &before, const CounterStateType &after)
Computes number of invariant time stamp counter ticks.
Definition: cpucounters.h:2843
double getL2CacheHitRatio(const CounterStateType &before, const CounterStateType &after)
Computes L2 cache hit ratio.
Definition: cpucounters.h:2638
double getExecUsage(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per time interval.
Definition: cpucounters.h:2415
uint64 getQPIL1Cycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the number of QPI cycles in power saving shutdown mode.
Definition: cpucounters.h:1955
int32 getThermalHeadroom() const
Returns current thermal headroom below TjMax.
Definition: cpucounters.h:1877
double getConsumedJoules(const CounterStateType &before, const CounterStateType &after)
Returns Joules consumed by processor (excluding DRAM)
Definition: cpucounters.h:2091
double getCyclesLostDueL3CacheMisses(const CounterStateType &before, const CounterStateType &after)
Estimates how many core cycles were potentially lost due to L3 cache misses.
Definition: cpucounters.h:2595
uint64 computeQPISpeed(const uint32 ref_core, const int cpumodel)
Measures/computes the maximum theoretical QPI link bandwidth speed in GByte/seconds.
Definition: cpucounters.cpp:5843
friend uint64 getLocalMemoryBW(const CounterStateType &before, const CounterStateType &after)
Computes Local Memory Bandwidth.
Definition: cpucounters.h:2769
uint64 getConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by processor, excluding DRAM (measured in internal units) ...
Definition: cpucounters.h:2071
uint64 getM2MCounter(uint32 controller, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of Memory2Mesh controller PMU counter (counter meaning depends on the programming: power/...
Definition: cpucounters.h:2027
Definition: cpucounters.h:187
double getActiveRelativeFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost techn...
Definition: cpucounters.h:2578
uint32 getThreadsPerCore() const
Reads how many hardware threads a physical core has. "Hardware thread" is a logical core in a differen...
Definition: cpucounters.cpp:4127
System-wide counter state.
Definition: cpucounters.h:2308
friend double getL3CacheHitRatio(const CounterStateType &before, const CounterStateType &after)
Computes L3 cache hit ratio.
Definition: cpucounters.h:2677
friend double getCoreCStateResidency(int state, const CounterStateType &before, const CounterStateType &after)
Computes residency in the core C-state.
Definition: cpucounters.h:2856
double getNormalizedQPIL0pTxCycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the ratio of QPI cycles in power saving half-lane mode.
Definition: cpucounters.h:1967
uint64 getQPIL0pTxCycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the number of QPI cycles in power saving half-lane mode.
Definition: cpucounters.h:1944
friend uint64 getQPIL0pTxCycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the number of QPI cycles in power saving half-lane mode.
Definition: cpucounters.h:1944
double getCoreCStateResidency(int state, const CounterStateType &before, const CounterStateType &after)
Computes residency in the core C-state.
Definition: cpucounters.h:2856
uint64 getDRAMClocks(uint32 channel, const CounterStateType &before, const CounterStateType &after)
Returns DRAM clock ticks.
Definition: cpucounters.h:1990
uint64 getNominalFrequency() const
Reads the nominal core frequency.
Definition: cpucounters.cpp:4137
uint64 getImcReadsForChannels(uint32 beginChannel, uint32 endChannel)
Get the number of integrated controller reads for given channels (in cache lines) ...
Definition: cpucounters.cpp:5338
friend double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get utilization of outgoing QPI link (0..1)
Definition: cpucounters.h:3054
int32 getSocketId(uint32 core_id) const
Determines socket of given core.
Definition: cpucounters.h:1170
friend double getAverageFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency also taking Intel Turbo Boost technology into account.
Definition: cpucounters.h:2528
Custom Core event description.
Definition: cpucounters.h:678
double getPackageCStateResidency(int state, const CounterStateType &before, const CounterStateType &after)
Computes residency in the package C-state.
Definition: cpucounters.h:2886
uint64 getBytesWrittenToPMM(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to PMM memory.
Definition: cpucounters.h:2935
uint64 getBytesWrittenToEDC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to MCDRAM memory controllers.
Definition: cpucounters.h:2959
int32 getTileId(uint32 os_id) const
Determines physical tile (cores sharing L2 cache) of given processor ID.
Definition: cpucounters.h:1165
uint64 getAllIncomingQPILinkBytes(const SystemCounterState &before, const SystemCounterState &after)
Get estimation of total QPI data traffic.
Definition: cpucounters.h:3118
friend uint64 getMCDRAMClocks(uint32 channel, const CounterStateType &before, const CounterStateType &after)
Returns MCDRAM clock ticks.
Definition: cpucounters.h:2001
void program()
Program performance counters (disables programming power counters)
Definition: cpucounters.cpp:5193
uint64 getQPIL0pTxCycles(uint32 port)
Get the number of cycles on a QPI port during which the link was in a power-saving half-lane mode.
Definition: cpucounters.cpp:5630
Basic uncore counter state.
Definition: cpucounters.h:2134
void enableJKTWorkaround(bool enable)
Enable correct counting of various LLC events (with memory access perf penalty)
Definition: cpucounters.cpp:5711
friend uint64 getBytesReadFromEDC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from MCDRAM memory controllers.
Definition: cpucounters.h:2947
uint64 getBytesWrittenToMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to DRAM memory controllers.
Definition: cpucounters.h:2911
Extended custom core event description.
Definition: cpucounters.h:692
int32 getPackageThermalSpecPower() const
Returns thermal specification power of the package domain in Watt.
Definition: cpucounters.h:1368
friend uint64 getSMICount(const CounterStateType &before, const CounterStateType &after)
Returns the number of occurred system management interrupts.
Definition: cpucounters.h:2984
double getActiveAverageFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost techn...
Definition: cpucounters.h:2545
uint64 getImcReadsForController(uint32 controller)
Get the number of integrated controller reads for given controller (in cache lines) ...
Definition: cpucounters.cpp:5326
double getRelativeFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency also taking Intel Turbo Boost technology into account.
Definition: cpucounters.h:2562
Definition: cpucounters.h:209
uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of memory controller PMU counter (counter meaning depends on the programming: power/perfo...
Definition: cpucounters.h:2014
uint64 getQPILLCounter(uint32 port, uint32 counter)
Direct read of QPI LL PMU counter (counter meaning depends on the programming: power/performance/etc)...
Definition: cpucounters.cpp:5699
uint64 getQPILinksPerSocket() const
Returns the number of Intel(r) Quick Path Interconnect(tm) links per socket.
Definition: cpucounters.h:1174
Definition: cpucounters.h:167
friend double getExecUsage(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per time interval.
Definition: cpucounters.h:2415
size_t getNumQPIPorts() const
Returns the number of detected QPI ports.
Definition: cpucounters.h:423
uint32 getNumSockets() const
Reads number of sockets (CPUs) in the system.
Definition: cpucounters.cpp:4116
uint64 getEDCCounter(uint32 channel, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of embedded DRAM memory controller counter (counter meaning depends on the programming: p...
Definition: cpucounters.h:2040
double getL3CacheHitRatio(const CounterStateType &before, const CounterStateType &after)
Computes L3 cache hit ratio.
Definition: cpucounters.h:2677
double getAverageFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency also taking Intel Turbo Boost technology into account.
Definition: cpucounters.h:2528
uint64 getIORequestBytesFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes of read/write requests from all IO sources.
Definition: cpucounters.h:2972
uint64 getL3CacheHitsNoSnoop(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache hits where no snooping in sibling L2 caches had to be done...
Definition: cpucounters.h:2796
int getRunState(void)
Returns program's Run State.
Definition: cpucounters.h:595
uint64 getBytesReadFromEDC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from MCDRAM memory controllers.
Definition: cpucounters.h:2947
uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get estimation of QPI (data+nondata) traffic per outgoing QPI link.
Definition: cpucounters.h:3098
CoreCounterState getCoreCounterState(uint32 core)
Reads the counter state of a (logical) core.
Definition: cpucounters.cpp:3296
uint64 getPMMWrites()
Get the number of PMM memory writes (in cache lines)
Definition: cpucounters.cpp:5369
friend uint64 getBytesWrittenToEDC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to MCDRAM memory controllers.
Definition: cpucounters.h:2959
void unfreezeCounters()
Unfreezes event counting.
Definition: cpucounters.cpp:5620
friend uint64 getRefCycles(const CounterStateType &before, const CounterStateType &after)
Computes the number of reference clock cycles while clock signal on the core is running.
Definition: cpucounters.h:2468
friend double getLLCReadMissLatency(const CounterStateType &before, const CounterStateType &after)
Returns average last level cache read+prefetch miss latency in ns.
Definition: cpucounters.h:3239
friend uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of memory controller PMU counter (counter meaning depends on the programming: power/perfo...
Definition: cpucounters.h:2014
uint64 getInvariantTSC(const CounterStateType &before, const CounterStateType &after)
Computes number of invariant time stamp counter ticks.
Definition: cpucounters.h:2843
Low level interface to access PCI configuration space.
uint64 getNumberOfEvents(const CounterType &before, const CounterType &after)
Returns the raw count of events.
Definition: cpucounters.h:3232
friend uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get estimation of QPI data traffic per incoming QPI link.
Definition: cpucounters.h:3014
Definition: cpucounters.h:662
friend uint64 getL2CacheMisses(const CounterStateType &before, const CounterStateType &after)
Computes number of L2 cache misses.
Definition: cpucounters.h:2721
size_t getMCChannels(uint32 socket, uint32 controller) const
Returns the number of detected memory channels on given integrated memory controllers.
Definition: cpucounters.h:1250
uint64 getRefCycles(const CounterStateType &before, const CounterStateType &after)
Computes the number of reference clock cycles while clock signal on the core is running.
Definition: cpucounters.h:2468
friend uint64 getL2CacheHits(const CounterStateType &before, const CounterStateType &after)
Computes number of L2 cache hits.
Definition: cpucounters.h:2745
uint64 getPCUClocks(const CounterStateType &before, const CounterStateType &after)
Returns clock ticks of power control unit.
Definition: cpucounters.h:2061
friend uint64 getDRAMConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by DRAM (measured in internal units)
Definition: cpucounters.h:2081
Provides 64-bit "virtual" counters from underlying 32-bit HW counters.
void freezeCounters()
Freezes event counting.
Definition: cpucounters.cpp:5595
void programServerUncoreMemoryMetrics(int rankA=-1, int rankB=-1, bool PMM=false)
Program memory counters (disables programming performance counters)
Definition: cpucounters.cpp:5119
int32 getThreadId(uint32 os_id) const
Determines physical thread of given processor ID within a core.
Definition: cpucounters.h:1155
SocketCounterState getSocketCounterState(uint32 socket)
Reads the counter state of a socket.
Definition: cpucounters.cpp:3288
friend double getRelativeFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency also taking Intel Turbo Boost technology into account.
Definition: cpucounters.h:2562
uint32 getNumCores() const
Reads number of logical cores in the system.
Definition: cpucounters.cpp:4106
void allowMultipleInstances()
Call it before program() to allow multiple running instances of PCM on the same system.
Definition: cpucounters.h:601
size_t getNumEDCChannels() const
Returns the total number of detected memory channels on all embedded DRAM controllers (EDC) ...
Definition: cpucounters.h:445
friend uint64 getConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by processor, excluding DRAM (measured in internal units) ...
Definition: cpucounters.h:2071
int32 getPackageMaximumPower() const
Returns maximum power derived from electrical spec of the package domain in Watt. ...
Definition: cpucounters.h:1374
friend uint64 getL3CacheOccupancy(const CounterStateType &now)
Computes L3 Cache Occupancy.
Definition: cpucounters.h:2761
friend uint64 getBytesWrittenToPMM(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to PMM memory.
Definition: cpucounters.h:2935
friend uint64 getDRAMClocks(uint32 channel, const CounterStateType &before, const CounterStateType &after)
Returns DRAM clock ticks.
Definition: cpucounters.h:1990
uint64 getRemoteMemoryBW(const CounterStateType &before, const CounterStateType &after)
Computes Remote Memory Bandwidth.
Definition: cpucounters.h:2778
void program_power_metrics(int mc_profile)
Program power counters (disables programming performance counters)
Definition: cpucounters.cpp:5431
uint64 getCycles(const CounterStateType &before, const CounterStateType &after)
Computes the number of core clock cycles when signal on a specific core is running (not halted) ...
Definition: cpucounters.h:2452
friend double getPackageCStateResidency(int state, const CounterStateType &before, const CounterStateType &after)
Computes residency in the package C-state.
Definition: cpucounters.h:2886
uint64 getMCCounter(uint32 channel, uint32 counter)
Direct read of memory controller PMU counter (counter meaning depends on the programming: power/perfo...
Definition: cpucounters.cpp:5662
Definition: cpucounters.h:235
double getIncomingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get data utilization of incoming QPI link (0..1)
Definition: cpucounters.h:3033
uint64 getQPIClocks(uint32 port)
Get number of QPI LL clocks on a QPI port.
Definition: cpucounters.cpp:5625
uint64 getLocalMemoryBW(const CounterStateType &before, const CounterStateType &after)
Computes Local Memory Bandwidth.
Definition: cpucounters.h:2769
uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType &before, const CounterStateType &after)
Returns the number of occurred custom core events.
Definition: cpucounters.h:2999
double getLLCReadMissLatency(const CounterStateType &before, const CounterStateType &after)
Returns average last level cache read+prefetch miss latency in ns.
Definition: cpucounters.h:3239
friend uint64 getQPIClocks(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns QPI LL clock ticks.
Definition: cpucounters.h:1926
double getDRAMConsumedJoules(const CounterStateType &before, const CounterStateType &after)
Returns Joules consumed by DRAM.
Definition: cpucounters.h:2104
uint64 getInstructionsRetired(const CounterStateType &before, const CounterStateType &after)
Computes the number of retired instructions.
Definition: cpucounters.h:2403
CPU Performance Monitor.
Definition: cpucounters.h:481
int32 getPackageMinimumPower() const
Returns minimum power derived from electrical spec of the package domain in Watt. ...
Definition: cpucounters.h:1371
friend uint64 getIORequestBytesFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes of read/write requests from all IO sources.
Definition: cpucounters.h:2972
friend uint64 getL3CacheMisses(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache misses.
Definition: cpucounters.h:2707
friend double getCyclesLostDueL3CacheMisses(const CounterStateType &before, const CounterStateType &after)
Estimates how many core cycles were potentially lost due to L3 cache misses.
Definition: cpucounters.h:2595
double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get utilization of outgoing QPI link (0..1)
Definition: cpucounters.h:3054
uint64 getImcWrites()
Get the number of integrated controller writes (in cache lines)
Definition: cpucounters.cpp:5348
uint64 getPCUCounter(uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of power control unit PMU counter (counter meaning depends on the programming: power/perf...
Definition: cpucounters.h:2051
friend uint64 getInstructionsRetired(const CounterStateType &before, const CounterStateType &after)
Computes the number of retired instructions.
Definition: cpucounters.h:2403
Definition: cpucounters.h:125
uint64 getM2MCounter(uint32 box, uint32 counter)
Direct read of M2M counter.
Definition: cpucounters.cpp:5687
Definition: cpucounters.h:705
uint64 getPCUFrequency() const
Returns the frequency of Power Control Unit.
Definition: cpucounters.h:1323
double getJoulesPerEnergyUnit() const
Returns how many joules are in an internal processor energy unit.
Definition: cpucounters.h:1365
uint32 getOriginalCPUModel() const
Reads original CPU model id.
Definition: cpucounters.h:1146
uint64 getUPIL0TxCycles(uint32 port)
Get number of cycles on a UPI port when the link was in L0 mode (fully active)
Definition: cpucounters.cpp:5424
uint64 getL2CacheMisses(const CounterStateType &before, const CounterStateType &after)
Computes number of L2 cache misses.
Definition: cpucounters.h:2721
const std::string & getErrorMessage() const
Returns the error message.
Definition: cpucounters.h:909
double getIPC(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per core cycle (IPC)
Definition: cpucounters.h:2387
Definition: cpucounters.h:448
int32 getPackageThermalHeadroom() const
Returns current thermal headroom below TjMax.
Definition: cpucounters.h:2265
bool isCoreCStateResidencySupported(int state)
Returns true if the specified core C-state residency metric is supported.
Definition: cpucounters.h:565
Server uncore power counter state.
Definition: cpucounters.h:2218
uint64 getSMICount(const CounterStateType &before, const CounterStateType &after)
Returns the number of occurred system management interrupts.
Definition: cpucounters.h:2984
friend uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get estimation of QPI (data+nondata) traffic per outgoing QPI link.
Definition: cpucounters.h:3098
bool isPackageCStateResidencySupported(int state)
Returns true if the specified package C-state residency metric is supported.
Definition: cpucounters.h:574
friend uint64 getDRAMConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by DRAM (measured in internal units)
Definition: cpucounters.h:2081
(Logical) core-wide counter state
Definition: cpucounters.h:2281
uint64 getL2CacheHits(const CounterStateType &before, const CounterStateType &after)
Computes number of L2 cache hits.
Definition: cpucounters.h:2745
double getCyclesLostDueL2CacheMisses(const CounterStateType &before, const CounterStateType &after)
Estimates how many core cycles were potentially lost due to missing L2 cache but still hitting L3 cac...
Definition: cpucounters.h:2616
uint32 getMCPerSocket() const
Returns the number of detected integrated memory controllers per socket.
Definition: cpucounters.h:1200
friend uint64 getCycles(const CounterStateType &before, const CounterStateType &after)
Computes the number of core clock cycles when signal on a specific core is running (not halted) ...
Definition: cpucounters.h:2452
uint32 getNumMC() const
Returns the number of detected integrated memory controllers.
Definition: cpucounters.h:435
uint64 getBytesReadFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from DRAM memory controllers.
Definition: cpucounters.h:2899
uint64 getBytesReadFromPMM(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from PMM memory.
Definition: cpucounters.h:2923
friend double getActiveAverageFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost techn...
Definition: cpucounters.h:2545
uint64 getEdcWrites()
Get the number of cache lines written by EDC (embedded DRAM controller)
Definition: cpucounters.cpp:5391
SystemCounterState getSystemCounterState()
Reads the counter state of the system.
Definition: cpucounters.cpp:3280
uint64 getDRAMClocks(uint32 channel)
Get number of DRAM channel cycles.
Definition: cpucounters.cpp:5640
uint64 getQPIL1Cycles(uint32 port)
Get number of cycles on a QPI port when the link was in a power saving shutdown mode.
Definition: cpucounters.cpp:5635
double getTotalExecUsage(const SystemCounterState &before, const SystemCounterState &after)
Computes average number of retired instructions per time interval for the entire system combining in...
Definition: cpucounters.h:2512
uint32 getMaxIPC() const
Returns the max number of instructions per cycle.
Definition: cpucounters.h:1288
Definition: cpucounters.h:102
friend uint64 getL3CacheHits(const CounterStateType &before, const CounterStateType &after)
Computes total number of L3 cache hits.
Definition: cpucounters.h:2828
Definition: cpucounters.h:94
Definition: pcm-iio.cpp:56
friend uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType &before, const CounterStateType &after)
Returns the number of occurred custom core events.
Definition: cpucounters.h:2999
int32 getCoreId(uint32 os_id) const
Determines physical core of given processor ID within a socket.
Definition: cpucounters.h:1160
Low level interface to access hardware model specific registers.
ErrorCode
Return codes (e.g. for program(..) method)
Definition: cpucounters.h:615
uint64 getOutgoingFlits(uint32 port)
Get the number of outgoing data and non-data or idle flits (depending on the architecture) from the s...
Definition: cpucounters.cpp:5419
Definition: cpucounters.h:147
uint64 getImcReads()
Get the number of integrated controller reads (in cache lines)
Definition: cpucounters.cpp:5321
static PCM * getInstance()
Returns PCM object.
Definition: cpucounters.cpp:259
Basic core counter state.
Definition: cpucounters.h:1741
uint64 getIncomingDataFlits(uint32 port)
Get the number of incoming data flits to the socket through a port.
Definition: cpucounters.cpp:5403
double getQPItoMCTrafficRatio(const SystemCounterState &before, const SystemCounterState &after)
Get QPI data to Memory Controller traffic ratio.
Definition: cpucounters.h:3217
friend uint64 getQPIL1Cycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the number of QPI cycles in power saving shutdown mode.
Definition: cpucounters.h:1955