Processor Counter Monitor
cpucounters.h
Go to the documentation of this file.
1 /*
2 Copyright (c) 2009-2019, Intel Corporation
3 All rights reserved.
4 
5 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
6 
7  * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
8  * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
9  * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
10 
11 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
12 */
13 // written by Roman Dementiev
14 // Thomas Willhalm
15 
16 #ifndef CPUCOUNTERS_HEADER
17 #define CPUCOUNTERS_HEADER
18 
25 #define PCM_VERSION " (2019-02-07 11:10:06 +0100 ID=0e9461a)"
26 
27 #ifndef PCM_API
28 #define PCM_API
29 #endif
30 
31 #include "types.h"
32 #include "msr.h"
33 #include "pci.h"
34 #include "client_bw.h"
35 #include "width_extender.h"
36 #include "exceptions/unsupported_processor_exception.hpp"
37 
38 #include <vector>
39 #include <array>
40 #include <limits>
41 #include <string>
42 #include <memory>
43 #include <map>
44 #include <string.h>
45 
46 #ifdef PCM_USE_PERF
47 #include <linux/perf_event.h>
48 #include <errno.h>
49 #define PCM_PERF_COUNT_HW_REF_CPU_CYCLES (9)
50 #endif
51 
52 #ifndef _MSC_VER
53 #define NOMINMAX
54 #include <semaphore.h>
55 #include <sys/types.h>
56 #include <sys/stat.h>
57 #include <fcntl.h>
58 #include <sys/syscall.h>
59 #include <unistd.h>
60 #endif
61 
62 class SystemCounterState;
63 class SocketCounterState;
64 class CoreCounterState;
65 class BasicCounterState;
67 class PCM;
68 class CoreTaskQueue;
69 
70 #ifdef _MSC_VER
71 #if _MSC_VER>= 1600
72 #include <intrin.h>
73 #endif
74 void PCM_API restrictDriverAccess(LPCWSTR path);
75 #endif
76 
77 /*
78  CPU performance monitoring routines
79 
80  A set of performance monitoring routines for recent Intel CPUs
81 */
82 
83 struct PCM_API TopologyEntry // decribes a core
84 {
85  int32 os_id;
86  int32 thread_id;
87  int32 core_id;
88  int32 tile_id; // tile is a constalation of 1 or more cores sharing salem L2 cache. Unique for entire system
89  int32 socket;
90 
91  TopologyEntry() : os_id(-1), thread_id (-1), core_id(-1), tile_id(-1), socket(-1) { }
92 };
93 
// Abstract register interface: assigning a uint64 writes the register,
// converting to uint64 reads it.
// NOTE(review): the class header line is missing from this listing; the name
// HWRegister is taken from the destructor below.
95 {
96 public:
97  virtual void operator = (uint64 val) = 0; // write operation
98  virtual operator uint64 () = 0; // read operation
99  virtual ~HWRegister() {}
100 };
101 
// 64-bit register read through a PciHandleType at a fixed offset; writing is rejected.
// NOTE(review): the class header line is missing from this listing; the name
// PCICFGRegister64 is taken from the constructor below.
// NOTE(review): std::cerr is used here but <iostream> is not among the includes
// visible at the top of this listing - presumably pulled in transitively; confirm.
103 {
104  std::shared_ptr<PciHandleType> handle;
105  size_t offset;
106 public:
107  PCICFGRegister64(const std::shared_ptr<PciHandleType> & handle_, size_t offset_) :
108  handle(handle_),
109  offset(offset_)
110  {
111  }
// Write is not supported: prints an error to std::cerr and throws std::exception. val is not used.
112  void operator = (uint64 val) override
113  {
114  std::cerr << "PCICFGRegister64 write operation is not supported" << std::endl;
115  throw std::exception();
116  }
// Read: calls handle->read64 at offset; result is initialised to 0 before the call.
117  operator uint64 () override
118  {
119  uint64 result = 0;
120  handle->read64(offset, &result);
121  return result;
122  }
123 };
124 
// 32-bit register accessed through a PciHandleType at a fixed offset.
// NOTE(review): the class header line is missing from this listing; the name
// PCICFGRegister32 is taken from the constructor below.
126 {
127  std::shared_ptr<PciHandleType> handle;
128  size_t offset;
129 public:
130  PCICFGRegister32(const std::shared_ptr<PciHandleType> & handle_, size_t offset_) :
131  handle(handle_),
132  offset(offset_)
133  {
134  }
// Write: val is narrowed to uint32 before being passed to handle->write32.
135  void operator = (uint64 val) override
136  {
137  handle->write32(offset, (uint32)val);
138  }
// Read: calls handle->read32 at offset; result is initialised to 0 before the call
// and widened to uint64 on return.
139  operator uint64 () override
140  {
141  uint32 result = 0;
142  handle->read32(offset, &result);
143  return result;
144  }
145 };
146 
// 64-bit register accessed through an MMIORange at a fixed offset.
// NOTE(review): the class header line is missing from this listing; the name
// MMIORegister64 is taken from the constructor below.
148 {
149  std::shared_ptr<MMIORange> handle;
150  size_t offset;
151 public:
152  MMIORegister64(const std::shared_ptr<MMIORange> & handle_, size_t offset_) :
153  handle(handle_),
154  offset(offset_)
155  {
156  }
// Write: forwards val to handle->write64 at offset.
157  void operator = (uint64 val) override
158  {
159  handle->write64(offset, val);
160  }
// Read: returns handle->read64 at offset.
161  operator uint64 () override
162  {
163  return handle->read64(offset);
164  }
165 };
166 
// 32-bit register accessed through an MMIORange at a fixed offset.
// NOTE(review): the class header line is missing from this listing; the name
// MMIORegister32 is taken from the constructor below.
168 {
169  std::shared_ptr<MMIORange> handle;
170  size_t offset;
171 public:
172  MMIORegister32(const std::shared_ptr<MMIORange> & handle_, size_t offset_) :
173  handle(handle_),
174  offset(offset_)
175  {
176  }
// Write: val is narrowed to uint32 before being passed to handle->write32.
177  void operator = (uint64 val) override
178  {
179  handle->write32(offset, (uint32)val);
180  }
// Read: returns handle->read32 at offset, converted to uint64.
181  operator uint64 () override
182  {
183  return (uint64)handle->read32(offset);
184  }
185 };
186 
187 class MSRRegister : public HWRegister
188 {
189  std::shared_ptr<SafeMsrHandle> handle;
190  size_t offset;
191 public:
192  MSRRegister(const std::shared_ptr<SafeMsrHandle> & handle_, size_t offset_) :
193  handle(handle_),
194  offset(offset_)
195  {
196  }
197  void operator = (uint64 val) override
198  {
199  handle->write(offset, val);
200  }
201  operator uint64 () override
202  {
203  uint64 value = 0;
204  handle->read(offset, &value);
205  return value;
206  }
207 };
208 
// Register view of a CounterWidthExtender: reads return the extender's value,
// and the only supported write is 0, which resets the extender.
// NOTE(review): the class header line is missing from this listing; the name
// CounterWidthExtenderRegister is taken from the constructor below.
210 {
211  std::shared_ptr<CounterWidthExtender> handle;
212 public:
213  CounterWidthExtenderRegister(const std::shared_ptr<CounterWidthExtender> & handle_) :
214  handle(handle_)
215  {
216  }
// Write: 0 calls handle->reset(); any other value prints an error to std::cerr
// and throws std::exception.
217  void operator = (uint64 val) override
218  {
219  if (val == 0)
220  {
221  handle->reset();
222  }
223  else
224  {
225  std::cerr << "ERROR: writing non-zero values to CounterWidthExtenderRegister is not supported" << std::endl;
226  throw std::exception();
227  }
228  }
// Read: returns handle->read().
// NOTE(review): the doubled semicolon below is a harmless empty statement.
229  operator uint64 () override
230  {
231  return handle->read();;
232  }
233 };
234 
// Bundle of HWRegister pointers making up one uncore PMU: a unit control register,
// four counter control / counter value pairs, an optional fixed counter pair and
// two optional filter registers. Any pointer may be empty.
// NOTE(review): the class header line is missing from this listing; the name
// UncorePMU is taken from the constructors below.
236 {
237  typedef std::shared_ptr<HWRegister> HWRegisterPtr;
238 public:
239  HWRegisterPtr unitControl;
240  HWRegisterPtr counterControl[4];
241  HWRegisterPtr counterValue[4];
242  HWRegisterPtr fixedCounterControl;
243  HWRegisterPtr fixedCounterValue;
244  HWRegisterPtr filter[2];
245 
// Stores the given register pointers; the fixed counter and filter arguments
// default to empty pointers.
246  UncorePMU(const HWRegisterPtr & unitControl_,
247  const HWRegisterPtr & counterControl0,
248  const HWRegisterPtr & counterControl1,
249  const HWRegisterPtr & counterControl2,
250  const HWRegisterPtr & counterControl3,
251  const HWRegisterPtr & counterValue0,
252  const HWRegisterPtr & counterValue1,
253  const HWRegisterPtr & counterValue2,
254  const HWRegisterPtr & counterValue3,
255  const HWRegisterPtr & fixedCounterControl_ = HWRegisterPtr(),
256  const HWRegisterPtr & fixedCounterValue_ = HWRegisterPtr(),
257  const HWRegisterPtr & filter0 = HWRegisterPtr(),
258  const HWRegisterPtr & filter1 = HWRegisterPtr()
259  ) :
260  unitControl(unitControl_),
261  counterControl{ counterControl0, counterControl1, counterControl2, counterControl3 },
262  counterValue{ counterValue0, counterValue1, counterValue2, counterValue3 },
263  fixedCounterControl(fixedCounterControl_),
264  fixedCounterValue(fixedCounterValue_),
265  filter{ filter0 , filter1 }
266  {
267  }
// Default construction leaves every register pointer empty.
268  UncorePMU() {}
269  virtual ~UncorePMU() {}
// Writes 0 to each non-empty counter control register, then to the unit control
// and fixed counter control registers if present. Counter values and filters
// are not touched.
270  void cleanup()
271  {
272  for (int i = 0; i < 4; ++i)
273  {
274  if (counterControl[i].get()) *counterControl[i] = 0;
275  }
276  if (unitControl.get()) *unitControl = 0;
277  if (fixedCounterControl.get()) *fixedCounterControl = 0;
278  }
279 };
280 
283 {
284  friend class PCM;
285  int32 iMCbus,UPIbus,M2Mbus;
286  uint32 groupnr;
287  int32 cpu_model;
288  std::vector<UncorePMU> imcPMUs;
289  std::vector<UncorePMU> edcPMUs;
290  std::vector<UncorePMU> xpiPMUs;
291  std::vector<UncorePMU> m2mPMUs;
292  std::vector<uint64> qpi_speed;
293  std::vector<uint32> num_imc_channels; // number of memory channels in each memory controller
294  std::vector<std::pair<uint32, uint32> > XPIRegisterLocation; // (device, function)
295  std::vector<std::vector< std::pair<uint32, uint32> > > MCRegisterLocation; // MCRegisterLocation[controller]: (device, function)
296  std::vector<std::pair<uint32, uint32> > EDCRegisterLocation; // EDCRegisterLocation: (device, function)
297  std::vector<std::pair<uint32, uint32> > M2MRegisterLocation; // M2MRegisterLocation: (device, function)
298 
299  static PCM_Util::Mutex socket2busMutex;
300  static std::vector<std::pair<uint32, uint32> > socket2iMCbus;
301  static std::vector<std::pair<uint32, uint32> > socket2UPIbus;
302  static std::vector<std::pair<uint32, uint32> > socket2M2Mbus;
303  void initSocket2Bus(std::vector<std::pair<uint32, uint32> > & socket2bus, uint32 device, uint32 function, const uint32 DEV_IDS[], uint32 devIdsSize);
304 
305  ServerPCICFGUncore(); // forbidden
306  ServerPCICFGUncore(ServerPCICFGUncore &); // forbidden
307  ServerPCICFGUncore & operator = (const ServerPCICFGUncore &); // forbidden
308  PciHandleType * createIntelPerfMonDevice(uint32 groupnr, int32 bus, uint32 dev, uint32 func, bool checkVendor = false);
309  void programIMC(const uint32 * MCCntConfig);
310  void programEDC(const uint32 * EDCCntConfig);
311  void programM2M();
312  void programXPI(const uint32 * XPICntConfig);
313  typedef std::pair<size_t, std::vector<uint64 *> > MemTestParam;
314  void initMemTest(MemTestParam & param);
315  void doMemTest(const MemTestParam & param);
316  void cleanupMemTest(const MemTestParam & param);
317  void cleanupQPIHandles();
318  void cleanupPMUs();
319  void writeAllUnitControl(const uint32 value);
320  void initDirect(uint32 socket_, const PCM * pcm);
321  void initPerf(uint32 socket_, const PCM * pcm);
322  void initBuses(uint32 socket_, const PCM * pcm);
323  void initRegisterLocations();
324 
325 public:
329  ServerPCICFGUncore(uint32 socket_, const PCM * pcm);
331  void program();
333  uint64 getImcReads();
336  uint64 getImcReadsForController(uint32 controller);
340  uint64 getImcReadsForChannels(uint32 beginChannel, uint32 endChannel);
342  uint64 getImcWrites();
343 
345  uint64 getPMMReads();
347  uint64 getPMMWrites();
348 
350  uint64 getEdcReads();
352  uint64 getEdcWrites();
353 
356  uint64 getIncomingDataFlits(uint32 port);
357 
360  uint64 getOutgoingFlits(uint32 port);
361 
362  virtual ~ServerPCICFGUncore();
363 
366  void program_power_metrics(int mc_profile);
367 
372  void programServerUncoreMemoryMetrics(int rankA = -1, int rankB = -1, bool PMM = false);
373 
376  uint64 getQPIClocks(uint32 port);
377 
380  uint64 getQPIL0pTxCycles(uint32 port);
383  uint64 getUPIL0TxCycles(uint32 port);
386  uint64 getQPIL1Cycles(uint32 port);
389  uint64 getDRAMClocks(uint32 channel);
392  uint64 getMCDRAMClocks(uint32 channel);
396  uint64 getMCCounter(uint32 channel, uint32 counter);
400  uint64 getEDCCounter(uint32 channel, uint32 counter);
404  uint64 getQPILLCounter(uint32 port, uint32 counter);
405 
409  uint64 getM2MCounter(uint32 box, uint32 counter);
410 
412  void freezeCounters();
414  void unfreezeCounters();
415 
417  uint64 computeQPISpeed(const uint32 ref_core, const int cpumodel);
418 
420  void enableJKTWorkaround(bool enable);
421 
423  size_t getNumQPIPorts() const { return xpiPMUs.size(); }
424 
426  uint64 getQPILinkSpeed(const uint32 linkNr) const
427  {
428  return qpi_speed.empty() ? 0 : qpi_speed[linkNr];
429  }
430 
432  void reportQPISpeed() const;
433 
435  uint32 getNumMC() const { return (uint32)num_imc_channels.size(); }
436 
438  size_t getNumMCChannels() const { return (size_t)imcPMUs.size(); }
439 
442  size_t getNumMCChannels(const uint32 controller) const;
443 
445  size_t getNumEDCChannels() const { return edcPMUs.size(); }
446 };
447 
// Holder for a single uint64 counter value, initialised to 0. The value is private
// and reachable only by the friends declared below (getNumberOfEvents and PCM).
// NOTE(review): the class header line is missing from this listing; the name
// SimpleCounterState is taken from the constructor below.
449 {
450  template <class T>
451  friend uint64 getNumberOfEvents(const T & before, const T & after);
452  friend class PCM;
453  uint64 data;
454 
455 public:
456  SimpleCounterState() : data(0)
457  { }
458  virtual ~SimpleCounterState() { }
459 };
460 
463 
464 class PerfVirtualControlRegister;
465 
466 #ifndef HACK_TO_REMOVE_DUPLICATE_ERROR
467 template class PCM_API std::allocator<TopologyEntry>;
468 template class PCM_API std::vector<TopologyEntry>;
469 template class PCM_API std::allocator<CounterWidthExtender *>;
470 template class PCM_API std::vector<CounterWidthExtender *>;
471 template class PCM_API std::allocator<uint32>;
472 template class PCM_API std::vector<uint32>;
473 template class PCM_API std::allocator<char>;
474 #endif
475 
481 class PCM_API PCM
482 {
483  friend class BasicCounterState;
484  friend class UncoreCounterState;
485  friend class PerfVirtualControlRegister;
486  PCM(); // forbidden to call directly because it is a singleton
487 
488  int32 cpu_family;
489  int32 cpu_model, original_cpu_model;
490  int32 cpu_stepping;
491  int64 cpu_microcode_level;
492  int32 max_cpuid;
493  int32 threads_per_core;
494  int32 num_cores;
495  int32 num_sockets;
496  int32 num_phys_cores_per_socket;
497  int32 num_online_cores;
498  int32 num_online_sockets;
499  uint32 core_gen_counter_num_max;
500  uint32 core_gen_counter_num_used;
501  uint32 core_gen_counter_width;
502  uint32 core_fixed_counter_num_max;
503  uint32 core_fixed_counter_num_used;
504  uint32 core_fixed_counter_width;
505  uint32 uncore_gen_counter_num_max;
506  uint32 uncore_gen_counter_num_used;
507  uint32 uncore_gen_counter_width;
508  uint32 uncore_fixed_counter_num_max;
509  uint32 uncore_fixed_counter_num_used;
510  uint32 uncore_fixed_counter_width;
511  uint32 perfmon_version;
512  int32 perfmon_config_anythread;
513  uint64 nominal_frequency;
514  uint64 max_qpi_speed; // in GBytes/second
515  uint32 L3ScalingFactor;
516  int32 pkgThermalSpecPower, pkgMinimumPower, pkgMaximumPower;
517 
518  std::vector<TopologyEntry> topology;
519  std::string errorMessage;
520 
521  static PCM * instance;
522  bool allow_multiple_instances;
523  bool programmed_pmu;
524  std::vector<std::shared_ptr<SafeMsrHandle> > MSR;
525  std::vector<std::shared_ptr<ServerPCICFGUncore> > server_pcicfg_uncore;
526  std::vector<UncorePMU> pcuPMUs;
527  std::vector<std::map<int32, UncorePMU> > iioPMUs;
528  std::vector<UncorePMU> uboxPMUs;
529  double joulesPerEnergyUnit;
530  std::vector<std::shared_ptr<CounterWidthExtender> > energy_status;
531  std::vector<std::shared_ptr<CounterWidthExtender> > dram_energy_status;
532  std::vector<std::vector<UncorePMU> > cboPMUs;
533 
534  std::vector<std::shared_ptr<CounterWidthExtender> > memory_bw_local;
535  std::vector<std::shared_ptr<CounterWidthExtender> > memory_bw_total;
536 
537  std::shared_ptr<ClientBW> clientBW;
538  std::shared_ptr<CounterWidthExtender> clientImcReads;
539  std::shared_ptr<CounterWidthExtender> clientImcWrites;
540  std::shared_ptr<CounterWidthExtender> clientIoRequests;
541 
542  bool disable_JKT_workaround;
543  bool blocked; // track if time-driven counter update is running or not: PCM is blocked
544 
545  uint64 * coreCStateMsr; // MSR addresses of core C-state free-running counters
546  uint64 * pkgCStateMsr; // MSR addresses of package C-state free-running counters
547 
548  std::vector<std::shared_ptr<CoreTaskQueue> > coreTaskQueues;
549 
550  bool L2CacheHitRatioAvailable;
551  bool L3CacheHitRatioAvailable;
552  bool L3CacheMissesAvailable;
553  bool L2CacheMissesAvailable;
554  bool L2CacheHitsAvailable;
555  bool L3CacheHitsNoSnoopAvailable;
556  bool L3CacheHitsSnoopAvailable;
557  bool L3CacheHitsAvailable;
558  bool CyclesLostDueL3CacheMissesAvailable;
559  bool CyclesLostDueL2CacheMissesAvailable;
560 
561 public:
562  enum { MAX_C_STATE = 10 }; // max C-state on Intel architecture
563 
// NOTE(review): the function signature line is missing from this listing.
// States 0 and 1 are always reported as supported. Any other state is supported
// only if the coreCStateMsr table exists, state does not exceed MAX_C_STATE and
// the table entry for it is non-zero.
// NOTE(review): only the upper bound of state is checked; if state is signed and
// can be negative, coreCStateMsr[state] would index before the table - confirm
// against the signature.
566  {
567  if (state == 0 || state == 1)
568  return true;
569 
570  return (coreCStateMsr != NULL && state <= ((int)MAX_C_STATE) && coreCStateMsr[state] != 0);
571  }
572 
// NOTE(review): the function signature line is missing from this listing.
// A state is supported only if the pkgCStateMsr table exists, state does not
// exceed MAX_C_STATE and the table entry for it is non-zero.
// NOTE(review): only the upper bound of state is checked; confirm that state
// cannot be negative.
575  {
576  return (pkgCStateMsr != NULL && state <= ((int)MAX_C_STATE) && pkgCStateMsr[state] != 0);
577  }
578 
580  void setOutput(const std::string filename);
581 
583  void restoreOutput();
584 
586  // Arguments:
587  // -- 1 - program is running
588  // -- 0 - program is sleeping
589  void setRunState(int new_state) { run_state = new_state; }
590 
592  // Results:
593  // -- 1 - program is running
594  // -- 0 - program is sleeping
595  int getRunState(void) { return run_state; }
596 
597  bool isBlocked(void) { return blocked; }
598  void setBlocked(const bool new_blocked) { blocked = new_blocked; }
599 
// NOTE(review): the function signature line is missing from this listing.
// Sets the allow_multiple_instances flag to true.
602  {
603  allow_multiple_instances = true;
604  }
605 
// Counter programming modes.
enum ProgramMode
{
    DEFAULT_EVENTS         = 0,
    CUSTOM_CORE_EVENTS     = 1,
    EXT_CUSTOM_CORE_EVENTS = 2,
    INVALID_MODE           = 3
};
613 
// Result codes for counter programming.
enum ErrorCode
{
    Success         = 0,
    MSRAccessDenied = 1,
    PMUBusy         = 2,
    UnknownError    = 3
};
621 
// Field identifiers; values are consecutive starting at 0.
enum PerfmonField
{
    INVALID         = 0,  // used to parse an invalid field
    OPCODE          = 1,
    EVENT_SELECT    = 2,
    UMASK           = 3,
    RESET           = 4,
    EDGE_DET        = 5,
    IGNORED         = 6,
    OVERFLOW_ENABLE = 7,
    ENABLE          = 8,
    INVERT          = 9,
    THRESH          = 10,
    CH_MASK         = 11,
    FC_MASK         = 12,
    // the entries below are not part of the perfmon definition
    H_EVENT_NAME    = 13,
    V_EVENT_NAME    = 14,
    MULTIPLIER      = 15,
    DIVIDER         = 16,
    COUNTER_INDEX   = 17
};
643 
// PCIe width modes; XFF is the value SimplePCIeDevInfo starts with.
enum PCIeWidthMode
{
    X1  = 0,
    X4  = 1,
    X8  = 2,
    X16 = 3,
    XFF = 4
};
651 
// Offsets / enumeration of IIO stacks.
enum
{
    IIO_CBDMA       = 0,  // shared with DMI
    IIO_PCIe0       = 1,
    IIO_PCIe1       = 2,
    IIO_PCIe2       = 3,
    IIO_MCP0        = 4,
    IIO_MCP1        = 5,
    IIO_STACK_COUNT = 6   // number of stacks listed above
};
661 
// Width, device name and bus number of a PCIe device; width starts as XFF.
// NOTE(review): the struct header line is missing from this listing; the name
// SimplePCIeDevInfo is taken from the constructor below.
663  {
664  enum PCIeWidthMode width;
665  std::string pciDevName;
666  std::string busNumber;
667 
668  SimplePCIeDevInfo() : width(XFF) { }
669  };
670 
// Pair of event number and umask value.
// NOTE(review): the header line is missing from this listing, so the type name is
// not visible here.
679  {
680  int32 event_number, umask_value;
681  };
682 
// Extended custom core event configuration: optional fixed counter configuration,
// an array of general purpose counter configurations with its length, and two
// offcore response MSR values. The constructor sets pointers to NULL and all
// numeric members to 0.
// NOTE(review): the struct header line is missing from this listing; the name
// ExtendedCustomCoreEventDescription is taken from the constructor below.
693  {
694  FixedEventControlRegister * fixedCfg; // if NULL, then default configuration performed for fixed counters
695  uint32 nGPCounters; // number of general purpose counters
696  EventSelectRegister * gpCounterCfg; // general purpose counters, if NULL, then default configuration performed for GP counters
697  uint64 OffcoreResponseMsrValue[2];
698  ExtendedCustomCoreEventDescription() : fixedCfg(NULL), nGPCounters(0), gpCounterCfg(NULL)
699  {
700  OffcoreResponseMsrValue[0] = 0;
701  OffcoreResponseMsrValue[1] = 0;
702  }
703  };
704 
// Description of four IIO events: name, opcode register, multiplier and divider
// for each.
// NOTE(review): the header line is missing from this listing, so the type name is
// not visible here. The members are not initialised in the lines shown.
706  {
707  /* The same counters are programmed on every IIO stack */
708  std::string eventNames[4];
709  IIOPMUCNTCTLRegister eventOpcodes[4];
710  int multiplier[4]; // some IIO events require a transformation to get meaningful output (e.g. DWord to bytes)
711  int divider[4]; // a divider is usually applied as well (e.g. /10e6)
712  };
713 
714 private:
715  ProgramMode mode;
716  CustomCoreEventDescription coreEventDesc[4];
717 
718 #ifdef _MSC_VER
719  HANDLE numInstancesSemaphore; // global semaphore that counts the number of PCM instances on the system
720 #else
721  // global semaphore that counts the number of PCM instances on the system
722  sem_t * numInstancesSemaphore;
723 #endif
724 
725  std::vector<int32> socketRefCore;
726 
727  bool canUsePerf;
728 #ifdef PCM_USE_PERF
729  std::vector<std::vector<int> > perfEventHandle;
730  void readPerfData(uint32 core, std::vector<uint64> & data);
731 
732  enum {
733  PERF_INST_RETIRED_ANY_POS = 0,
734  PERF_CPU_CLK_UNHALTED_THREAD_POS = 1,
735  PERF_CPU_CLK_UNHALTED_REF_POS = 2,
736  PERF_GEN_EVENT_0_POS = 3,
737  PERF_GEN_EVENT_1_POS = 4,
738  PERF_GEN_EVENT_2_POS = 5,
739  PERF_GEN_EVENT_3_POS = 6
740  };
741 
742  enum {
743  PERF_GROUP_LEADER_COUNTER = PERF_INST_RETIRED_ANY_POS
744  };
745 #endif
746  std::ofstream * outfile; // output file stream
747  std::streambuf * backup_ofile; // backup of original output = cout
748  int run_state; // either running (1) or sleeping (0)
749 
750  bool needToRestoreNMIWatchdog;
751 
752  std::vector<std::vector<EventSelectRegister> > lastProgrammedCustomCounters;
753  uint32 checkCustomCoreProgramming(std::shared_ptr<SafeMsrHandle> msr);
754  void reservePMU();
755  void unreservePMU();
756  ErrorCode programCoreCounters(int core, const PCM::ProgramMode mode, const ExtendedCustomCoreEventDescription * pExtDesc,
757  std::vector<EventSelectRegister> & programmedCustomCounters);
758 
759  bool PMUinUse();
760  void cleanupPMU();
761  void freeRMID();
762  bool decrementInstanceSemaphore(); // returns true if it was the last instance
763 
764 #ifdef __APPLE__
765  // OSX does not have sem_getvalue, so we must get the number of instances by a different method
766  uint32 getNumInstances();
767  uint32 decrementNumInstances();
768  uint32 incrementNumInstances();
769 #endif
770 
771 
772  void computeQPISpeedBeckton(int core_nr);
773  void destroyMSR();
774  void computeNominalFrequency();
775  static bool isCPUModelSupported(int model_);
776  std::string getSupportedUarchCodenames() const;
777  std::string getUnsupportedMessage() const;
778  bool detectModel();
779  bool checkModel();
780 
781  void initCStateSupportTables();
782  bool discoverSystemTopology();
783  void printSystemTopology() const;
784  bool initMSR();
785  bool detectNominalFrequency();
786  void showSpecControlMSRs();
787  void initEnergyMonitoring();
788  void initUncoreObjects();
794  void initRMID();
800  void initQOSevent(const uint64 event, const int32 core);
801  void programBecktonUncore(int core);
802  void programNehalemEPUncore(int core);
803  void enableJKTWorkaround(bool enable);
804  template <class CounterStateType>
805  void readAndAggregateMemoryBWCounters(const uint32 core, CounterStateType & counterState);
806  template <class CounterStateType>
807  void readAndAggregateUncoreMCCounters(const uint32 socket, CounterStateType & counterState);
808  template <class CounterStateType>
809  void readAndAggregateEnergyCounters(const uint32 socket, CounterStateType & counterState);
810  template <class CounterStateType>
811  void readPackageThermalHeadroom(const uint32 socket, CounterStateType & counterState);
812  template <class CounterStateType>
813  void readAndAggregatePackageCStateResidencies(std::shared_ptr<SafeMsrHandle> msr, CounterStateType & result);
814  void readQPICounters(SystemCounterState & counterState);
815  void reportQPISpeed() const;
816  void readCoreCounterConfig();
817  void readCPUMicrocodeLevel();
818 
819  uint64 CX_MSR_PMON_CTRY(uint32 Cbo, uint32 Ctr) const;
820  uint64 CX_MSR_PMON_BOX_FILTER(uint32 Cbo) const;
821  uint64 CX_MSR_PMON_BOX_FILTER1(uint32 Cbo) const;
822  uint64 CX_MSR_PMON_CTLY(uint32 Cbo, uint32 Ctl) const;
823  uint64 CX_MSR_PMON_BOX_CTL(uint32 Cbo) const;
824  uint32 getMaxNumOfCBoxes() const;
825  void programCbo(const uint64 * events, const uint32 opCode, const uint32 nc_ = 0, const uint32 tid_ = 0);
826  void programCboOpcodeFilter(const uint32 opc0, UncorePMU & pmu, const uint32 nc_ = 0, const uint32 opc1 = 0);
827  void programLLCReadMissLatencyEvents();
828  uint64 getCBOCounterState(const uint32 socket, const uint32 ctr_);
829 
830  void cleanupUncorePMUs();
831 
832  bool isCLX() const // Cascade Lake-SP
833  {
834  return (PCM::SKX == cpu_model) && (cpu_stepping > 4);
835  }
836 
837  void initUncorePMUsDirect();
838  void initUncorePMUsPerf();
839 
840 public:
842  bool isSecureBoot() const;
843 
845  bool useLinuxPerfForUncore() const;
846 
852  bool QOSMetricAvailable() const;
858  bool L3QOSMetricAvailable() const;
864  bool L3CacheOccupancyMetricAvailable() const;
870  bool CoreLocalMemoryBWMetricAvailable() const;
876  bool CoreRemoteMemoryBWMetricAvailable() const;
882  unsigned getMaxRMID() const;
883 
894  static PCM * getInstance(); // the only way to get access
895 
903  bool good(); // true if access to CPU counters works
904 
909  const std::string & getErrorMessage() const
910  {
911  return errorMessage;
912  }
913 
925  ErrorCode program(const ProgramMode mode_ = DEFAULT_EVENTS, const void * parameter_ = NULL); // program counters and start counting
926 
938  ErrorCode programServerUncoreLatencyMetrics(bool enable_pmm);
939 
953  ErrorCode programServerUncorePowerMetrics(int mc_profile, int pcu_profile, int * freq_bands = NULL);
954 
967  ErrorCode programServerUncoreMemoryMetrics(int rankA = -1, int rankB = -1, bool PMM = false);
968 
970  void freezeServerUncoreCounters();
971 
973  void unfreezeServerUncoreCounters();
974 
979  ServerUncorePowerState getServerUncorePowerState(uint32 socket);
980 
986  void cleanup();
987 
992  void resetPMU();
993 
1001  void getAllCounterStates(SystemCounterState & systemState, std::vector<SocketCounterState> & socketStates, std::vector<CoreCounterState> & coreStates);
1002 
1009  void getUncoreCounterStates(SystemCounterState & systemState, std::vector<SocketCounterState> & socketStates);
1010 
1015  bool isCoreOnline(int32 os_core_id) const;
1016 
1021  bool isSocketOnline(int32 socket_id) const;
1022 
1031 
1037 
1047 
1051  uint32 getNumCores() const;
1052 
1056  uint32 getNumOnlineCores() const;
1057 
1061  uint32 getNumSockets() const;
1062 
1066  uint32 getNumOnlineSockets() const;
1067 
1073  uint32 getThreadsPerCore() const;
1074 
1078  bool getSMT() const; // returns true iff SMT ("Hyperthreading") is on
1079 
1083  uint64 getNominalFrequency() const; // in Hz
1084 
1089  uint32 getL3ScalingFactor() const;
1090 
1096  bool isSomeCoreOfflined();
1097 
1100  int32 getMaxCustomCoreEvents();
1101 
// CPU model identifiers compared against cpu_model / original_cpu_model elsewhere
// in this class.
// NOTE(review): the enum header line is missing from this listing, so the enum
// name is not visible here. Entries are not sorted by value.
1104  {
1105  NEHALEM_EP = 26,
1106  NEHALEM = 30,
1107  ATOM = 28,
1108  ATOM_2 = 53,
1109  ATOM_CENTERTON = 54,
1110  ATOM_BAYTRAIL = 55,
1111  ATOM_AVOTON = 77,
1112  ATOM_CHERRYTRAIL = 76,
1113  ATOM_APOLLO_LAKE = 92,
1114  ATOM_DENVERTON = 95,
1115  CLARKDALE = 37,
1116  WESTMERE_EP = 44,
1117  NEHALEM_EX = 46,
1118  WESTMERE_EX = 47,
1119  SANDY_BRIDGE = 42,
1120  JAKETOWN = 45,
1121  IVY_BRIDGE = 58,
1122  HASWELL = 60,
1123  HASWELL_ULT = 69,
1124  HASWELL_2 = 70,
1125  IVYTOWN = 62,
1126  HASWELLX = 63,
1127  BROADWELL = 61,
1128  BROADWELL_XEON_E3 = 71,
1129  BDX_DE = 86,
1130  SKL_UY = 78,
1131  KBL = 158,
1132  KBL_1 = 142,
1133  BDX = 79,
1134  KNL = 87,
1135  SKL = 94,
1136  SKX = 85,
// Sentinel value; 0x0ffff is larger than every model value listed above.
1137  END_OF_MODEL_LIST = 0x0ffff
1138  };
1139 
1142  uint32 getCPUModel() const { return (uint32)cpu_model; }
1143 
1146  uint32 getOriginalCPUModel() const { return (uint32)original_cpu_model; }
1147 
1150  uint32 getCPUStepping() const { return (uint32)cpu_stepping; }
1151 
1155  int32 getThreadId(uint32 os_id) const { return (int32)topology[os_id].thread_id; }
1156 
1160  int32 getCoreId(uint32 os_id) const { return (int32)topology[os_id].core_id; }
1161 
1165  int32 getTileId(uint32 os_id) const { return (int32)topology[os_id].tile_id; }
1166 
1170  int32 getSocketId(uint32 core_id) const { return (int32)topology[core_id].socket; }
1171 
1174  uint64 getQPILinksPerSocket() const
1175  {
1176  switch (cpu_model)
1177  {
1178  case NEHALEM_EP:
1179  case WESTMERE_EP:
1180  case CLARKDALE:
1181  if (num_sockets == 2)
1182  return 2;
1183  else
1184  return 1;
1185  case NEHALEM_EX:
1186  case WESTMERE_EX:
1187  return 4;
1188  case JAKETOWN:
1189  case IVYTOWN:
1190  case HASWELLX:
1191  case BDX_DE:
1192  case BDX:
1193  case SKX:
1194  return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumQPIPorts()) : 0;
1195  }
1196  return 0;
1197  }
1198 
1200  uint32 getMCPerSocket() const
1201  {
1202  switch (cpu_model)
1203  {
1204  case NEHALEM_EP:
1205  case WESTMERE_EP:
1206  case CLARKDALE:
1207  return 1;
1208  case NEHALEM_EX:
1209  case WESTMERE_EX:
1210  return 2;
1211  case JAKETOWN:
1212  case IVYTOWN:
1213  case HASWELLX:
1214  case BDX_DE:
1215  case SKX:
1216  case BDX:
1217  case KNL:
1218  return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumMC()) : 0;
1219  }
1220  return 0;
1221  }
1222 
1224  size_t getMCChannelsPerSocket() const
1225  {
1226  switch (cpu_model)
1227  {
1228  case NEHALEM_EP:
1229  case WESTMERE_EP:
1230  case CLARKDALE:
1231  return 3;
1232  case NEHALEM_EX:
1233  case WESTMERE_EX:
1234  return 4;
1235  case JAKETOWN:
1236  case IVYTOWN:
1237  case HASWELLX:
1238  case BDX_DE:
1239  case SKX:
1240  case BDX:
1241  case KNL:
1242  return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumMCChannels()) : 0;
1243  }
1244  return 0;
1245  }
1246 
1250  size_t getMCChannels(uint32 socket, uint32 controller) const
1251  {
1252  switch (cpu_model)
1253  {
1254  case NEHALEM_EP:
1255  case WESTMERE_EP:
1256  case CLARKDALE:
1257  return 3;
1258  case NEHALEM_EX:
1259  case WESTMERE_EX:
1260  return 4;
1261  case JAKETOWN:
1262  case IVYTOWN:
1263  case HASWELLX:
1264  case BDX_DE:
1265  case SKX:
1266  case BDX:
1267  case KNL:
1268  return (socket < server_pcicfg_uncore.size() && server_pcicfg_uncore[socket].get()) ? (server_pcicfg_uncore[socket]->getNumMCChannels(controller)) : 0;
1269  }
1270  return 0;
1271  }
1272 
1273 
// NOTE(review): the function signature line is missing from this listing.
// Only the KNL model is handled: the EDC channel count is taken from the first
// server_pcicfg_uncore object when it exists. Every other case returns 0.
1276  {
1277  switch (cpu_model)
1278  {
1279  case KNL:
1280  return (server_pcicfg_uncore.size() && server_pcicfg_uncore[0].get()) ? (server_pcicfg_uncore[0]->getNumEDCChannels()) : 0;
1281  }
1282  return 0;
1283  }
1284 
1285 
1288  uint32 getMaxIPC() const
1289  {
1290  switch (original_cpu_model)
1291  {
1292  case ATOM_DENVERTON:
1293  return 3;
1294  }
1295  switch (cpu_model)
1296  {
1297  case NEHALEM_EP:
1298  case WESTMERE_EP:
1299  case NEHALEM_EX:
1300  case WESTMERE_EX:
1301  case CLARKDALE:
1302  case SANDY_BRIDGE:
1303  case JAKETOWN:
1304  case IVYTOWN:
1305  case IVY_BRIDGE:
1306  case HASWELL:
1307  case HASWELLX:
1308  case BROADWELL:
1309  case BDX_DE:
1310  case BDX:
1311  case SKL:
1312  case KBL:
1313  case SKX:
1314  return 4;
1315  case ATOM:
1316  case KNL:
1317  return 2;
1318  }
1319  return 0;
1320  }
1321 
1323  uint64 getPCUFrequency() const
1324  {
1325  switch (cpu_model)
1326  {
1327  case JAKETOWN:
1328  case IVYTOWN:
1329  return 800000000ULL; // 800 MHz
1330  case HASWELLX:
1331  case BDX_DE:
1332  case BDX:
1333  case KNL:
1334  return 1000000000ULL; // 1 GHz
1335  case SKX:
1336  return 1100000000ULL; // 1.1 GHz
1337  }
1338  return 0;
1339  }
1340 
1345  uint64 getTickCount(uint64 multiplier = 1000 /* ms */, uint32 core = 0);
1346 
1351  uint64 getTickCountRDTSCP(uint64 multiplier = 1000 /* ms */);
1352 
1354  uint64 getUncoreClocks(const uint32 socket_);
1355 
1359  uint64 getQPILinkSpeed(uint32 socketNr, uint32 linkNr) const
1360  {
1361  return hasPCICFGUncore() ? server_pcicfg_uncore[socketNr]->getQPILinkSpeed(linkNr) : max_qpi_speed;
1362  }
1363 
1365  double getJoulesPerEnergyUnit() const { return joulesPerEnergyUnit; }
1366 
1368  int32 getPackageThermalSpecPower() const { return pkgThermalSpecPower; }
1369 
1371  int32 getPackageMinimumPower() const { return pkgMinimumPower; }
1372 
1374  int32 getPackageMaximumPower() const { return pkgMaximumPower; }
1375 
1378  static bool initWinRing0Lib();
1379 
1380  inline void disableJKTWorkaround() { disable_JKT_workaround = true; }
1381 
// Event codes used when programming PCIe counters.
enum PCIeEventCode
{
    // PCIe read events (PCI devices reading from memory - application writes to disk/network/PCIe device)
    PCIeRdCur = 0x19E, // PCIe read current (full cache line)
    PCIeNSRd  = 0x1E4, // PCIe non-snoop read (full cache line)

    // PCIe write events (PCI devices writing to memory - application reads from disk/network/PCIe device)
    PCIeWiLF  = 0x194, // PCIe write, non-allocating (full cache line)
    PCIeItoM  = 0x19C, // PCIe write, allocating (full cache line)
    PCIeNSWr  = 0x1E5, // PCIe non-snoop write (partial cache line)
    PCIeNSWrF = 0x1E6, // PCIe non-snoop write (full cache line)

    // events shared by CPU and IO
    RFO  = 0x180, // demand data RFO; same code as for CPU, use tid to filter PCIe-only traffic
    CRd  = 0x181, // demand code read
    DRd  = 0x182, // demand data read
    PRd  = 0x187, // partial reads (UC) (MMIO read)
    WiL  = 0x18F, // write invalidate line - partial (MMIO write); not documented in HSX/IVT
    ItoM = 0x1C8, // request invalidate line; same code as for CPU, use tid to filter PCIe-only traffic

    // SKX_-prefixed codes
    SKX_RFO   = 0x200,
    SKX_CRd   = 0x201,
    SKX_DRd   = 0x202,
    SKX_PRd   = 0x207,
    SKX_WiL   = 0x20F,
    SKX_RdCur = 0x21E,
    SKX_ItoM  = 0x248,
};
1408 
//! \brief CHA pipeline queue selector; enumerators take the implicit values 0, 1, 2
enum ChaPipelineQueue
{
    None, // 0
    IRQ,  // 1
    PRQ,  // 2
};
1415 
//! \brief TID values for CBo events; both enumerators currently carry the same value
enum CBoEventTid
{
    RFOtid  = 0x3E,
    ItoMtid = 0x3E,
};
1421 
1425  void programPCIeCounters(const PCIeEventCode event_, const uint32 tid_ = 0, const uint32 miss_ = 0, const uint32 q_ = 0, const uint32 nc_ = 0);
1426  void programPCIeMissCounters(const PCIeEventCode event_, const uint32 tid_ = 0, const uint32 q_ = 0, const uint32 nc_ = 0);
1427 
1431  PCIeCounterState getPCIeCounterState(const uint32 socket_);
1432 
1436  void programIIOCounters(IIOPMUCNTCTLRegister rawEvents[4], int IIOStack = -1);
1437 
1442  IIOCounterState getIIOCounterState(int socket, int IIOStack, int counter);
1443 
1448  void getIIOCounterStates(int socket, int IIOStack, IIOCounterState * result);
1449 
1450  uint64 extractCoreGenCounterValue(uint64 val);
1451  uint64 extractCoreFixedCounterValue(uint64 val);
1452  uint64 extractUncoreGenCounterValue(uint64 val);
1453  uint64 extractUncoreFixedCounterValue(uint64 val);
1454  uint64 extractQOSMonitoring(uint64 val);
1455 
1458  const char * getUArchCodename(const int32 cpu_model_ = -1) const;
1459 
1461  static std::string getCPUBrandString();
1462  std::string getCPUFamilyModelString();
1463 
1465  int64 getCPUMicrocodeLevel() const { return cpu_microcode_level; }
1466 
1467  bool packageEnergyMetricsAvailable() const
1468  {
1469  return (
1470  cpu_model == PCM::JAKETOWN
1471  || cpu_model == PCM::IVYTOWN
1472  || cpu_model == PCM::SANDY_BRIDGE
1473  || cpu_model == PCM::IVY_BRIDGE
1474  || cpu_model == PCM::HASWELL
1475  || original_cpu_model == PCM::ATOM_AVOTON
1476  || original_cpu_model == PCM::ATOM_CHERRYTRAIL
1477  || original_cpu_model == PCM::ATOM_BAYTRAIL
1478  || original_cpu_model == PCM::ATOM_APOLLO_LAKE
1479  || original_cpu_model == PCM::ATOM_DENVERTON
1480  || cpu_model == PCM::HASWELLX
1481  || cpu_model == PCM::BROADWELL
1482  || cpu_model == PCM::BDX_DE
1483  || cpu_model == PCM::BDX
1484  || cpu_model == PCM::KNL
1485  || cpu_model == PCM::SKL
1486  || cpu_model == PCM::KBL
1487  || cpu_model == PCM::SKX
1488  );
1489  }
1490 
1491  bool dramEnergyMetricsAvailable() const
1492  {
1493  return (
1494  cpu_model == PCM::JAKETOWN
1495  || cpu_model == PCM::IVYTOWN
1496  || cpu_model == PCM::HASWELLX
1497  || cpu_model == PCM::BDX_DE
1498  || cpu_model == PCM::BDX
1499  || cpu_model == PCM::KNL
1500  || cpu_model == PCM::SKX
1501  );
1502  }
1503 
1504  bool packageThermalMetricsAvailable() const
1505  {
1506  return packageEnergyMetricsAvailable();
1507  }
1508 
1509  bool outgoingQPITrafficMetricsAvailable() const
1510  {
1511  return getQPILinksPerSocket() > 0 &&
1512  (
1513  cpu_model == PCM::NEHALEM_EX
1514  || cpu_model == PCM::WESTMERE_EX
1515  || cpu_model == PCM::JAKETOWN
1516  || cpu_model == PCM::IVYTOWN
1517  || cpu_model == PCM::HASWELLX
1518  || cpu_model == PCM::BDX
1519  || cpu_model == PCM::SKX
1520  );
1521  }
1522 
1523  bool incomingQPITrafficMetricsAvailable() const
1524  {
1525  return getQPILinksPerSocket() > 0 &&
1526  (
1527  cpu_model == PCM::NEHALEM_EX
1528  || cpu_model == PCM::WESTMERE_EX
1529  || cpu_model == PCM::JAKETOWN
1530  || cpu_model == PCM::IVYTOWN
1531  || (cpu_model == PCM::SKX && cpu_stepping > 1)
1532  );
1533  }
1534 
1535  bool qpiUtilizationMetricsAvailable() const
1536  {
1537  return outgoingQPITrafficMetricsAvailable();
1538  }
1539 
1540  bool memoryTrafficMetricsAvailable() const
1541  {
1542  return !(
1543  cpu_model == PCM::ATOM
1544  || cpu_model == PCM::CLARKDALE
1545  );
1546  }
1547 
1548  bool MCDRAMmemoryTrafficMetricsAvailable() const
1549  {
1550  return (cpu_model == PCM::KNL);
1551  }
1552 
1553  bool memoryIOTrafficMetricAvailable() const
1554  {
1555  return (
1556  cpu_model == PCM::SANDY_BRIDGE
1557  || cpu_model == PCM::IVY_BRIDGE
1558  || cpu_model == PCM::HASWELL
1559  || cpu_model == PCM::BROADWELL
1560  || cpu_model == PCM::SKL
1561  || cpu_model == PCM::KBL
1562  );
1563  }
1564 
1565  bool IIOEventsAvailable() const
1566  {
1567  return (
1568  cpu_model == PCM::SKX
1569  );
1570  }
1571 
1572  bool LatencyMetricsAvailable() const
1573  {
1574  return (
1575  cpu_model == PCM::HASWELLX
1576  || cpu_model == PCM::BDX
1577  || cpu_model == PCM::SKX
1578  || cpu_model == PCM::SKL
1579  );
1580  }
1581 
1582  bool PMMTrafficMetricsAvailable() const
1583  {
1584  return (
1585  isCLX()
1586  );
1587  }
1588 
1589  bool LLCReadMissLatencyMetricsAvailable() const
1590  {
1591  return (
1592  HASWELLX == cpu_model
1593  || BDX_DE == cpu_model
1594  || BDX == cpu_model
1595 #ifdef PCM_ENABLE_LLCRDLAT_SKX_MP
1596  || SKX == cpu_model
1597 #else
1598  || ((SKX == cpu_model) && (num_sockets == 1))
1599 #endif
1600  );
1601  }
1602 
1603  bool hasBecktonUncore() const
1604  {
1605  return (
1606  cpu_model == PCM::NEHALEM_EX
1607  || cpu_model == PCM::WESTMERE_EX
1608  );
1609  }
1610  bool hasPCICFGUncore() const // has PCICFG uncore PMON
1611  {
1612  return (
1613  cpu_model == PCM::JAKETOWN
1614  || cpu_model == PCM::IVYTOWN
1615  || cpu_model == PCM::HASWELLX
1616  || cpu_model == PCM::BDX_DE
1617  || cpu_model == PCM::SKX
1618  || cpu_model == PCM::BDX
1619  || cpu_model == PCM::KNL
1620  );
1621  }
1622 
1623  bool hasUPI() const // Intel(r) Ultra Path Interconnect
1624  {
1625  return (
1626  cpu_model == PCM::SKX
1627  );
1628  }
1629 
1630  const char * xPI() const
1631  {
1632  if (hasUPI())
1633  return "UPI";
1634 
1635  return "QPI";
1636  }
1637 
1638  bool supportsHLE() const;
1639  bool supportsRTM() const;
1640 
1641  bool useSkylakeEvents() const
1642  {
1643  return PCM::SKL == cpu_model
1644  || PCM::KBL == cpu_model
1645  || PCM::SKX == cpu_model
1646  ;
1647  }
1648 
1649  static double getBytesPerFlit(int32 cpu_model_)
1650  {
1651  if(cpu_model_ == PCM::SKX)
1652  {
1653  // 172 bits per UPI flit
1654  return 172./8.;
1655  }
1656  // 8 bytes per QPI flit
1657  return 8.;
1658  }
1659 
1660  double getBytesPerFlit() const
1661  {
1662  return getBytesPerFlit(cpu_model);
1663  }
1664 
1665  static double getDataBytesPerFlit(int32 cpu_model_)
1666  {
1667  if(cpu_model_ == PCM::SKX)
1668  {
1669  // 9 UPI flits to transfer 64 bytes
1670  return 64./9.;
1671  }
1672  // 8 bytes per QPI flit
1673  return 8.;
1674  }
1675 
1676  double getDataBytesPerFlit() const
1677  {
1678  return getDataBytesPerFlit(cpu_model);
1679  }
1680 
1681  static double getFlitsPerLinkCycle(int32 cpu_model_)
1682  {
1683  if(cpu_model_ == PCM::SKX)
1684  {
1685  // 5 UPI flits sent every 6 link cycles
1686  return 5./6.;
1687  }
1688  return 2.;
1689  }
1690 
1691  static double getBytesPerLinkCycle(int32 cpu_model_)
1692  {
1693  return getBytesPerFlit(cpu_model_) * getFlitsPerLinkCycle(cpu_model_);
1694  }
1695 
1696  double getBytesPerLinkCycle() const
1697  {
1698  return getBytesPerLinkCycle(cpu_model);
1699  }
1700 
//! \brief Returns the number of link transfers per link cycle (constant 8)
static double getLinkTransfersPerLinkCycle()
{
    const double transfersPerCycle = 8.;
    return transfersPerCycle;
}
1705 
1706  double getBytesPerLinkTransfer() const
1707  {
1708  return getBytesPerLinkCycle() / getLinkTransfersPerLinkCycle();
1709  }
1710 
1713  void setupCustomCoreEventsForNuma(PCM::ExtendedCustomCoreEventDescription& conf) const;
1714 
1715  #define PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(m) bool is##m() const { return m; }
1716 
1717  PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L2CacheHitRatioAvailable)
1718  PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitRatioAvailable)
1719  PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheMissesAvailable)
1720  PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L2CacheMissesAvailable)
1721  PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L2CacheHitsAvailable)
1722  PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsNoSnoopAvailable)
1723  PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsSnoopAvailable)
1724  PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(L3CacheHitsAvailable)
1725  PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(CyclesLostDueL3CacheMissesAvailable) // deprecated
1726  PCM_GENERATE_METRIC_AVAILABLE_FUNCTION(CyclesLostDueL2CacheMissesAvailable) // deprecated
1727 
1728  #undef PCM_GEN_METRIC_AVAILABLE_FUNCTION
1729 
1730  bool isActiveRelativeFrequencyAvailable() const
1731  {
1732  return ATOM != cpu_model;
1733  }
1734 
1735  ~PCM();
1736 };
1737 
1742 {
1743  friend class PCM;
1744  template <class CounterStateType>
1745  friend double getExecUsage(const CounterStateType & before, const CounterStateType & after);
1746  template <class CounterStateType>
1747  friend double getIPC(const CounterStateType & before, const CounterStateType & after);
1748  template <class CounterStateType>
1749  friend double getAverageFrequency(const CounterStateType & before, const CounterStateType & after);
1750  template <class CounterStateType>
1751  friend double getActiveAverageFrequency(const CounterStateType & before, const CounterStateType & after);
1752  template <class CounterStateType>
1753  friend double getRelativeFrequency(const CounterStateType & before, const CounterStateType & after);
1754  template <class CounterStateType>
1755  friend double getActiveRelativeFrequency(const CounterStateType & before, const CounterStateType & after);
1756  template <class CounterStateType>
1757  friend double getL2CacheHitRatio(const CounterStateType & before, const CounterStateType & after);
1758  template <class CounterStateType>
1759  friend double getL3CacheHitRatio(const CounterStateType & before, const CounterStateType & after);
1760  template <class CounterStateType>
1761  friend uint64 getL3CacheMisses(const CounterStateType & before, const CounterStateType & after);
1762  template <class CounterStateType>
1763  friend uint64 getL2CacheMisses(const CounterStateType & before, const CounterStateType & after);
1764  template <class CounterStateType>
1765  friend uint64 getL2CacheHits(const CounterStateType & before, const CounterStateType & after);
1766  template <class CounterStateType>
1767  friend uint64 getL3CacheHitsNoSnoop(const CounterStateType & before, const CounterStateType & after);
1768  template <class CounterStateType>
1769  friend uint64 getL3CacheHitsSnoop(const CounterStateType & before, const CounterStateType & after);
1770  template <class CounterStateType>
1771  friend uint64 getL3CacheHits(const CounterStateType & before, const CounterStateType & after);
1772  template <class CounterStateType>
1773  friend double getCyclesLostDueL3CacheMisses(const CounterStateType & before, const CounterStateType & after);
1774  template <class CounterStateType>
1775  friend double getCyclesLostDueL2CacheMisses(const CounterStateType & before, const CounterStateType & after);
1776  template <class CounterStateType>
1777  friend uint64 getL3CacheOccupancy(const CounterStateType & now);
1778  template <class CounterStateType>
1779  friend uint64 getLocalMemoryBW(const CounterStateType & before, const CounterStateType & after);
1780  template <class CounterStateType>
1781  friend uint64 getRemoteMemoryBW(const CounterStateType & before, const CounterStateType & after);
1782  template <class CounterStateType>
1783  friend uint64 getCycles(const CounterStateType & before, const CounterStateType & after);
1784  template <class CounterStateType>
1785  friend uint64 getInstructionsRetired(const CounterStateType & before, const CounterStateType & after);
1786  template <class CounterStateType>
1787  friend uint64 getCycles(const CounterStateType & now);
1788  template <class CounterStateType>
1789  friend uint64 getInstructionsRetired(const CounterStateType & now);
1790  template <class CounterStateType>
1791  friend uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType & before, const CounterStateType & after);
1792  template <class CounterStateType>
1793  friend uint64 getInvariantTSC(const CounterStateType & before, const CounterStateType & after);
1794  template <class CounterStateType>
1795  friend uint64 getRefCycles(const CounterStateType & before, const CounterStateType & after);
1796  template <class CounterStateType>
1797  friend double getCoreCStateResidency(int state, const CounterStateType & before, const CounterStateType & after);
1798  template <class CounterStateType>
1799  friend uint64 getSMICount(const CounterStateType & before, const CounterStateType & after);
1800 
1801 protected:
1802  uint64 InstRetiredAny;
1803  uint64 CpuClkUnhaltedThread;
1804  uint64 CpuClkUnhaltedRef;
1805  // dont put any additional fields between Event 0-Event 3 because getNumberOfCustomEvents assumes there are none
1806  union {
1807  uint64 L3Miss;
1808  uint64 Event0;
1809  uint64 ArchLLCMiss;
1810  };
1811  union {
1812  uint64 L3UnsharedHit;
1813  uint64 Event1;
1814  uint64 ArchLLCRef;
1815  uint64 SKLL3Hit;
1816  };
1817  union {
1818  uint64 L2HitM;
1819  uint64 Event2;
1820  uint64 SKLL2Miss;
1821  };
1822  union {
1823  uint64 L2Hit;
1824  uint64 Event3;
1825  };
1826  uint64 InvariantTSC; // invariant time stamp counter
1827  uint64 CStateResidency[PCM::MAX_C_STATE + 1];
1828  int32 ThermalHeadroom;
1829  uint64 L3Occupancy;
1830  void readAndAggregate(std::shared_ptr<SafeMsrHandle>);
1831  void readAndAggregateTSC(std::shared_ptr<SafeMsrHandle>);
1832  uint64 MemoryBWLocal;
1833  uint64 MemoryBWTotal;
1834  uint64 SMICount;
1835 
1836 public:
1837  BasicCounterState() :
1838  InstRetiredAny(0),
1839  CpuClkUnhaltedThread(0),
1840  CpuClkUnhaltedRef(0),
1841  L3Miss(0),
1842  L3UnsharedHit(0),
1843  L2HitM(0),
1844  L2Hit(0),
1845  InvariantTSC(0),
1846  ThermalHeadroom(PCM_INVALID_THERMAL_HEADROOM),
1847  L3Occupancy(0),
1848  MemoryBWLocal(0),
1849  MemoryBWTotal(0),
1850  SMICount(0)
1851  {
1852  memset(CStateResidency, 0, sizeof(CStateResidency));
1853  }
1854  virtual ~BasicCounterState() { }
1855 
1856  BasicCounterState & operator += (const BasicCounterState & o)
1857  {
1858  InstRetiredAny += o.InstRetiredAny;
1859  CpuClkUnhaltedThread += o.CpuClkUnhaltedThread;
1860  CpuClkUnhaltedRef += o.CpuClkUnhaltedRef;
1861  Event0 += o.Event0;
1862  Event1 += o.Event1;
1863  Event2 += o.Event2;
1864  Event3 += o.Event3;
1865  InvariantTSC += o.InvariantTSC;
1866  for (int i = 0; i <= (int)PCM::MAX_C_STATE; ++i)
1867  CStateResidency[i] += o.CStateResidency[i];
1868  // ThermalHeadroom is not accumulative
1869  L3Occupancy += o.L3Occupancy;
1870  MemoryBWLocal += o.MemoryBWLocal;
1871  MemoryBWTotal += o.MemoryBWTotal;
1872  SMICount += o.SMICount;
1873  return *this;
1874  }
1875 
1877  int32 getThermalHeadroom() const { return ThermalHeadroom; }
1878 };
1879 
1880 inline uint64 RDTSC()
1881 {
1882  uint64 result = 0;
1883 #ifdef _MSC_VER
1884  // Windows
1885  #if _MSC_VER>= 1600
1886  result = static_cast<uint64>(__rdtsc());
1887  #endif
1888 #else
1889  // Linux
1890  uint32 high = 0, low = 0;
1891  asm volatile("rdtsc" : "=a" (low), "=d" (high));
1892  result = low + (uint64(high)<<32ULL);
1893 #endif
1894  return result;
1895 
1896 }
1897 
// Reads the time stamp counter with the RDTSCP instruction.
// Returns the 64-bit counter value; stays 0 when built with an MSVC older
// than _MSC_VER 1600 because neither branch below assigns the result then.
1898 inline uint64 RDTSCP()
1899 {
1900  uint64 result = 0;
1901 #ifdef _MSC_VER
1902  // Windows
1903  #if _MSC_VER>= 1600
// Aux receives the intrinsic's auxiliary output; it is not used afterwards.
1904  unsigned int Aux;
1905  result = __rdtscp(&Aux);
1906  #endif
1907 #else
1908  // Linux and OS X
1909  uint32 high = 0, low = 0;
// The asm copies EDX into `high` and EAX into `low` after the instruction ran.
// rax, rcx and rdx are declared as clobbered so the compiler does not keep
// live values (or the two outputs) in them.
1910  asm volatile (
1911  "rdtscp\n\t"
1912  "mov %%edx, %0\n\t"
1913  "mov %%eax, %1\n\t":
1914  "=r" (high), "=r" (low) :: "%rax", "%rcx", "%rdx");
// combine the two 32-bit halves into one 64-bit value
1915  result = low + (uint64(high)<<32ULL);
1916 #endif
1917  return result;
1918 }
1919 
1925 template <class CounterStateType>
1926 uint64 getQPIClocks(uint32 port, const CounterStateType & before, const CounterStateType & after)
1927 {
1928  return after.QPIClocks[port] - before.QPIClocks[port];
1929 }
1930 
1931 
1932 template <class CounterStateType>
1933 int32 getThermalHeadroom(const CounterStateType & /* before */, const CounterStateType & after)
1934 {
1935  return after.getThermalHeadroom();
1936 }
1937 
1943 template <class CounterStateType>
1944 uint64 getQPIL0pTxCycles(uint32 port, const CounterStateType & before, const CounterStateType & after)
1945 {
1946  return after.QPIL0pTxCycles[port] - before.QPIL0pTxCycles[port];
1947 }
1948 
1954 template <class CounterStateType>
1955 uint64 getQPIL1Cycles(uint32 port, const CounterStateType & before, const CounterStateType & after)
1956 {
1957  return after.QPIL1Cycles[port] - before.QPIL1Cycles[port];
1958 }
1959 
1966 template <class CounterStateType>
1967 double getNormalizedQPIL0pTxCycles(uint32 port, const CounterStateType & before, const CounterStateType & after)
1968 {
1969  return double(getQPIL0pTxCycles(port, before, after)) / double(getQPIClocks(port, before, after));
1970 }
1971 
1978 template <class CounterStateType>
1979 double getNormalizedQPIL1Cycles(uint32 port, const CounterStateType & before, const CounterStateType & after)
1980 {
1981  return double(getQPIL1Cycles(port, before, after)) / double(getQPIClocks(port, before, after));
1982 }
1983 
1989 template <class CounterStateType>
1990 uint64 getDRAMClocks(uint32 channel, const CounterStateType & before, const CounterStateType & after)
1991 {
1992  return after.DRAMClocks[channel] - before.DRAMClocks[channel];
1993 }
1994 
2000 template <class CounterStateType>
2001 uint64 getMCDRAMClocks(uint32 channel, const CounterStateType & before, const CounterStateType & after)
2002 {
2003  return after.MCDRAMClocks[channel] - before.MCDRAMClocks[channel];
2004 }
2005 
2006 
2013 template <class CounterStateType>
2014 uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after)
2015 {
2016  return after.MCCounter[channel][counter] - before.MCCounter[channel][counter];
2017 }
2018 
2019 
2026 template <class CounterStateType>
2027 uint64 getM2MCounter(uint32 controller, uint32 counter, const CounterStateType & before, const CounterStateType & after)
2028 {
2029  return after.M2MCounter[controller][counter] - before.M2MCounter[controller][counter];
2030 }
2031 
2032 
2039 template <class CounterStateType>
2040 uint64 getEDCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after)
2041 {
2042  return after.EDCCounter[channel][counter] - before.EDCCounter[channel][counter];
2043 }
2044 
2050 template <class CounterStateType>
2051 uint64 getPCUCounter(uint32 counter, const CounterStateType & before, const CounterStateType & after)
2052 {
2053  return after.PCUCounter[counter] - before.PCUCounter[counter];
2054 }
2055 
2060 template <class CounterStateType>
2061 uint64 getPCUClocks(const CounterStateType & before, const CounterStateType & after)
2062 {
2063  return getPCUCounter(0, before, after);
2064 }
2065 
2070 template <class CounterStateType>
2071 uint64 getConsumedEnergy(const CounterStateType & before, const CounterStateType & after)
2072 {
2073  return after.PackageEnergyStatus - before.PackageEnergyStatus;
2074 }
2075 
2080 template <class CounterStateType>
2081 uint64 getDRAMConsumedEnergy(const CounterStateType & before, const CounterStateType & after)
2082 {
2083  return after.DRAMEnergyStatus - before.DRAMEnergyStatus;
2084 }
2085 
2090 template <class CounterStateType>
2091 double getConsumedJoules(const CounterStateType & before, const CounterStateType & after)
2092 {
2093  PCM * m = PCM::getInstance();
2094  if (!m) return -1.;
2095 
2096  return double(getConsumedEnergy(before, after)) * m->getJoulesPerEnergyUnit();
2097 }
2098 
2103 template <class CounterStateType>
2104 double getDRAMConsumedJoules(const CounterStateType & before, const CounterStateType & after)
2105 {
2106  PCM * m = PCM::getInstance();
2107  if (!m) return -1.;
2108  double dram_joules_per_energy_unit;
2109 
2110  if (PCM::HASWELLX == m->getCPUModel()
2111  || PCM::BDX_DE == m->getCPUModel()
2112  || PCM::BDX == m->getCPUModel()
2113  || PCM::SKX == m->getCPUModel()
2114  || PCM::KNL == m->getCPUModel()
2115  ) {
2116 /* as described in sections 5.3.2 (DRAM_POWER_INFO) and 5.3.3 (DRAM_ENERGY_STATUS) of
2117  * Volume 2 (Registers) of
2118  * Intel Xeon E5-1600 v3 and Intel Xeon E5-2600 v3 (Haswell-EP) Datasheet (Ref 330784-001, Sept.2014)
2119  * ENERGY_UNIT for DRAM domain is fixed to 15.3 uJ for server HSX, BDW and KNL processors.
2120  */
2121  dram_joules_per_energy_unit = 0.0000153;
2122  } else {
2123 /* for all other processors (including Haswell client/mobile SKUs) the ENERGY_UNIT for DRAM domain
2124  * should be read from PACKAGE_POWER_SKU register (usually value around ~61uJ)
2125  */
2126  dram_joules_per_energy_unit = m->getJoulesPerEnergyUnit();
2127  }
2128  return double(getDRAMConsumedEnergy(before, after)) * dram_joules_per_energy_unit;
2129 }
2130 
2135 {
2136  friend class PCM;
2137  template <class CounterStateType>
2138  friend uint64 getBytesReadFromMC(const CounterStateType & before, const CounterStateType & after);
2139  template <class CounterStateType>
2140  friend uint64 getBytesWrittenToMC(const CounterStateType & before, const CounterStateType & after);
2141  template <class CounterStateType>
2142  friend uint64 getBytesReadFromPMM(const CounterStateType & before, const CounterStateType & after);
2143  template <class CounterStateType>
2144  friend uint64 getBytesWrittenToPMM(const CounterStateType & before, const CounterStateType & after);
2145  template <class CounterStateType>
2146  friend uint64 getBytesReadFromEDC(const CounterStateType & before, const CounterStateType & after);
2147  template <class CounterStateType>
2148  friend uint64 getBytesWrittenToEDC(const CounterStateType & before, const CounterStateType & after);
2149  template <class CounterStateType>
2150  friend uint64 getIORequestBytesFromMC(const CounterStateType & before, const CounterStateType & after);
2151  template <class CounterStateType>
2152  friend uint64 getConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
2153  template <class CounterStateType>
2154  friend uint64 getDRAMConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
2155  template <class CounterStateType>
2156  friend double getPackageCStateResidency(int state, const CounterStateType & before, const CounterStateType & after);
2157  template <class CounterStateType>
2158  friend double getLLCReadMissLatency(const CounterStateType & before, const CounterStateType & after);
2159 
2160 protected:
2161  uint64 UncMCFullWrites;
2162  uint64 UncMCNormalReads;
2163  uint64 UncPMMWrites;
2164  uint64 UncPMMReads;
2165  uint64 UncEDCFullWrites;
2166  uint64 UncEDCNormalReads;
2167  uint64 UncMCIORequests;
2168  uint64 PackageEnergyStatus;
2169  uint64 DRAMEnergyStatus;
2170  uint64 TOROccupancyIAMiss;
2171  uint64 TORInsertsIAMiss;
2172  uint64 UncClocks;
2173  uint64 CStateResidency[PCM::MAX_C_STATE + 1];
2174  void readAndAggregate(std::shared_ptr<SafeMsrHandle>);
2175 
2176 public:
2177  UncoreCounterState() :
2178  UncMCFullWrites(0),
2179  UncMCNormalReads(0),
2180  UncPMMWrites(0),
2181  UncPMMReads(0),
2182  UncEDCFullWrites(0),
2183  UncEDCNormalReads(0),
2184  UncMCIORequests(0),
2185  PackageEnergyStatus(0),
2186  DRAMEnergyStatus(0),
2187  TOROccupancyIAMiss(0),
2188  TORInsertsIAMiss(0),
2189  UncClocks(0)
2190  {
2191  memset(CStateResidency, 0, sizeof(CStateResidency));
2192  }
2193  virtual ~UncoreCounterState() { }
2194 
2195  UncoreCounterState & operator += (const UncoreCounterState & o)
2196  {
2197  UncMCFullWrites += o.UncMCFullWrites;
2198  UncMCNormalReads += o.UncMCNormalReads;
2199  UncPMMReads += o.UncPMMReads;
2200  UncPMMWrites += o.UncPMMWrites;
2201  UncEDCFullWrites += o.UncEDCFullWrites;
2202  UncEDCNormalReads += o.UncEDCNormalReads;
2203  UncMCIORequests += o.UncMCIORequests;
2204  PackageEnergyStatus += o.PackageEnergyStatus;
2205  DRAMEnergyStatus += o.DRAMEnergyStatus;
2206  TOROccupancyIAMiss += o.TOROccupancyIAMiss;
2207  TORInsertsIAMiss += o.TORInsertsIAMiss;
2208  UncClocks += o.UncClocks;
2209  for (int i = 0; i <= (int)PCM::MAX_C_STATE; ++i)
2210  CStateResidency[i] += o.CStateResidency[i];
2211  return *this;
2212  }
2213 };
2214 
2215 
2219 {
2220 public:
//! \brief Compile-time sizes of the counter arrays declared in this class
enum
{
    maxControllers = 2, // first dimension of M2MCounter
    maxChannels = 8,    // size of DRAMClocks / MCDRAMClocks, first dimension of MCCounter / EDCCounter
    maxXPILinks = 3,    // size of the QPI* arrays
    maxCounters = 4     // counters per box, size of PCUCounter
};
2227 private:
2228  std::array<uint64, maxXPILinks> QPIClocks, QPIL0pTxCycles, QPIL1Cycles;
2229  std::array<uint64, maxChannels> DRAMClocks;
2230  std::array<uint64, maxChannels> MCDRAMClocks;
2231  std::array<std::array<uint64, maxCounters>, maxChannels> MCCounter; // channel X counter
2232  std::array<std::array<uint64, maxCounters>, maxControllers> M2MCounter; // M2M/iMC boxes x counter
2233  std::array<std::array<uint64, maxCounters>, maxChannels> EDCCounter; // EDC controller X counter
2234  std::array<uint64, maxCounters> PCUCounter;
2235  int32 PackageThermalHeadroom;
2236  uint64 InvariantTSC; // invariant time stamp counter
2237  friend class PCM;
2238  template <class CounterStateType>
2239  friend uint64 getQPIClocks(uint32 port, const CounterStateType & before, const CounterStateType & after);
2240  template <class CounterStateType>
2241  friend uint64 getQPIL0pTxCycles(uint32 port, const CounterStateType & before, const CounterStateType & after);
2242  template <class CounterStateType>
2243  friend uint64 getQPIL1Cycles(uint32 port, const CounterStateType & before, const CounterStateType & after);
2244  template <class CounterStateType>
2245  friend uint64 getDRAMClocks(uint32 channel, const CounterStateType & before, const CounterStateType & after);
2246  template <class CounterStateType>
2247  friend uint64 getMCDRAMClocks(uint32 channel, const CounterStateType & before, const CounterStateType & after);
2248  template <class CounterStateType>
2249  friend uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after);
2250  template <class CounterStateType>
2251  friend uint64 getM2MCounter(uint32 controller, uint32 counter, const CounterStateType & before, const CounterStateType & after);
2252  template <class CounterStateType>
2253  friend uint64 getEDCCounter(uint32 channel, uint32 counter, const CounterStateType & before, const CounterStateType & after);
2254  template <class CounterStateType>
2255  friend uint64 getPCUCounter(uint32 counter, const CounterStateType & before, const CounterStateType & after);
2256  template <class CounterStateType>
2257  friend uint64 getConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
2258  template <class CounterStateType>
2259  friend uint64 getDRAMConsumedEnergy(const CounterStateType & before, const CounterStateType & after);
2260  template <class CounterStateType>
2261  friend uint64 getInvariantTSC(const CounterStateType & before, const CounterStateType & after);
2262 
2263 public:
2265  int32 getPackageThermalHeadroom() const { return PackageThermalHeadroom; }
// Member-initializer list and empty body of this class's default constructor.
// NOTE(review): the constructor's own header line is not present in this listing.
// Every std::array member is value-initialized with {}, and the two scalar
// members (PackageThermalHeadroom, InvariantTSC) are set to 0.
2267  QPIClocks{}, QPIL0pTxCycles{}, QPIL1Cycles{},
2268  DRAMClocks{},
2269  MCDRAMClocks{},
2270  MCCounter{},
2271  M2MCounter{},
2272  EDCCounter{},
2273  PCUCounter{},
2274  PackageThermalHeadroom(0),
2275  InvariantTSC(0)
2276  {
2277  }
2278 };
2279 
2282 {
2283  friend class PCM;
2284 
2285 public:
2286 };
2287 
2290 {
2291  friend class PCM;
2292 
2293 protected:
2294  void readAndAggregate(std::shared_ptr<SafeMsrHandle> handle)
2295  {
2296  BasicCounterState::readAndAggregate(handle);
2297  UncoreCounterState::readAndAggregate(handle);
2298  }
2299 
2300 public:
2301  void accumulateCoreState(const CoreCounterState & o)
2302  {
2303  BasicCounterState::operator += (o);
2304  }
2305 };
2306 
2309 {
2310  friend class PCM;
2311  std::vector<std::vector<uint64> > incomingQPIPackets; // each 64 byte
2312  std::vector<std::vector<uint64> > outgoingQPIFlits; // idle or data/non-data flits depending on the architecture
2313  std::vector<std::vector<uint64> > TxL0Cycles;
2314  uint64 uncoreTSC;
2315 
2316 protected:
2317  void readAndAggregate(std::shared_ptr<SafeMsrHandle> handle)
2318  {
2319  BasicCounterState::readAndAggregate(handle);
2320  UncoreCounterState::readAndAggregate(handle);
2321  }
2322 
2323 public:
2324  friend uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after);
2325  friend uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & now);
2326  friend double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after);
2327  friend uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after);
2328  friend uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & now);
2329  SystemCounterState() :
2330  uncoreTSC(0)
2331  {
2332  PCM * m = PCM::getInstance();
2333  incomingQPIPackets.resize(m->getNumSockets(),
2334  std::vector<uint64>((uint32)m->getQPILinksPerSocket(), 0));
2335  outgoingQPIFlits.resize(m->getNumSockets(),
2336  std::vector<uint64>((uint32)m->getQPILinksPerSocket(), 0));
2337  TxL0Cycles.resize(m->getNumSockets(),
2338  std::vector<uint64>((uint32)m->getQPILinksPerSocket(), 0));
2339  }
2340 
2341  void accumulateSocketState(const SocketCounterState & o)
2342  {
2343  {
2344  BasicCounterState::operator += (o);
2345  UncoreCounterState::operator += (o);
2346  }
2347  }
2348 };
2349 
2360 
2368 PCM_API SocketCounterState getSocketCounterState(uint32 socket);
2369 
2377 PCM_API CoreCounterState getCoreCounterState(uint32 core);
2378 
2379 
2386 template <class CounterStateType>
2387 double getIPC(const CounterStateType & before, const CounterStateType & after) // instructions per cycle
2388 {
2389  int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
2390  if (clocks != 0)
2391  return double(after.InstRetiredAny - before.InstRetiredAny) / double(clocks);
2392  return -1;
2393 }
2394 
2395 
2402 template <class CounterStateType>
2403 uint64 getInstructionsRetired(const CounterStateType & before, const CounterStateType & after) // instructions
2404 {
2405  return after.InstRetiredAny - before.InstRetiredAny;
2406 }
2407 
2414 template <class CounterStateType>
2415 double getExecUsage(const CounterStateType & before, const CounterStateType & after) // usage
2416 {
2417  int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
2418  if (timer_clocks != 0)
2419  return double(after.InstRetiredAny - before.InstRetiredAny) / double(timer_clocks);
2420  return -1;
2421 }
2422 
2428 template <class CounterStateType>
2429 uint64 getInstructionsRetired(const CounterStateType & now) // instructions
2430 {
2431  return now.InstRetiredAny;
2432 }
2433 
2451 template <class CounterStateType>
2452 uint64 getCycles(const CounterStateType & before, const CounterStateType & after) // clocks
2453 {
2454  return after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
2455 }
2456 
2467 template <class CounterStateType>
2468 uint64 getRefCycles(const CounterStateType & before, const CounterStateType & after) // clocks
2469 {
2470  return after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
2471 }
2472 
2480 template <class CounterStateType>
2481 uint64 getCycles(const CounterStateType & now) // clocks
2482 {
2483  return now.CpuClkUnhaltedThread;
2484 }
2485 
2494 inline double getCoreIPC(const SystemCounterState & before, const SystemCounterState & after) // instructions per cycle
2495 {
2496  double ipc = getIPC(before, after);
2497  PCM * m = PCM::getInstance();
2498  if (ipc >= 0. && m && (m->getNumCores() == m->getNumOnlineCores()))
2499  return ipc * double(m->getThreadsPerCore());
2500  return -1;
2501 }
2502 
2503 
2512 inline double getTotalExecUsage(const SystemCounterState & before, const SystemCounterState & after) // usage
2513 {
2514  double usage = getExecUsage(before, after);
2515  PCM * m = PCM::getInstance();
2516  if (usage >= 0. && m && (m->getNumCores() == m->getNumOnlineCores()))
2517  return usage * double(m->getThreadsPerCore());
2518  return -1;
2519 }
2520 
2527 template <class CounterStateType>
2528 double getAverageFrequency(const CounterStateType & before, const CounterStateType & after) // in Hz
2529 {
2530  int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
2531  int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
2532  PCM * m = PCM::getInstance();
2533  if (timer_clocks != 0 && m)
2534  return double(m->getNominalFrequency()) * double(clocks) / double(timer_clocks);
2535  return -1;
2536 }
2537 
2544 template <class CounterStateType>
2545 double getActiveAverageFrequency(const CounterStateType & before, const CounterStateType & after) // in Hz
2546 {
2547  int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
2548  int64 ref_clocks = after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
2549  PCM * m = PCM::getInstance();
2550  if (ref_clocks != 0 && m)
2551  return double(m->getNominalFrequency()) * double(clocks) / double(ref_clocks);
2552  return -1;
2553 }
2554 
2561 template <class CounterStateType>
2562 double getRelativeFrequency(const CounterStateType & before, const CounterStateType & after) // fraction of nominal frequency
2563 {
2564  int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
2565  int64 timer_clocks = after.InvariantTSC - before.InvariantTSC;
2566  if (timer_clocks != 0)
2567  return double(clocks) / double(timer_clocks);
2568  return -1;
2569 }
2570 
2577 template <class CounterStateType>
2578 double getActiveRelativeFrequency(const CounterStateType & before, const CounterStateType & after) // fraction of nominal frequency
2579 {
2580  int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
2581  int64 ref_clocks = after.CpuClkUnhaltedRef - before.CpuClkUnhaltedRef;
2582  if (ref_clocks != 0)
2583  return double(clocks) / double(ref_clocks);
2584  return -1;
2585 }
2586 
2594 template <class CounterStateType>
2595 double getCyclesLostDueL3CacheMisses(const CounterStateType & before, const CounterStateType & after) // 0.0 - 1.0
2596 {
2597  const int cpu_model = PCM::getInstance()->getCPUModel();
2598  if (cpu_model == PCM::ATOM || cpu_model == PCM::KNL) return -1;
2599  int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
2600  if (clocks != 0)
2601  {
2602  return 180. * double(after.L3Miss - before.L3Miss) / double(clocks);
2603  }
2604  return -1;
2605 }
2606 
2615 template <class CounterStateType>
2616 double getCyclesLostDueL2CacheMisses(const CounterStateType & before, const CounterStateType & after) // 0.0 - 1.0
2617 {
2618  const int cpu_model = PCM::getInstance()->getCPUModel();
2619  if (cpu_model == PCM::ATOM || cpu_model == PCM::KNL || PCM::getInstance()->useSkylakeEvents()) return -1;
2620  int64 clocks = after.CpuClkUnhaltedThread - before.CpuClkUnhaltedThread;
2621  if (clocks != 0)
2622  {
2623  double L3UnsharedHit = (double)(after.L3UnsharedHit - before.L3UnsharedHit);
2624  double L2HitM = (double)(after.L2HitM - before.L2HitM);
2625  return (35. * L3UnsharedHit + 74. * L2HitM) / double(clocks);
2626  }
2627  return -1;
2628 }
2629 
2637 template <class CounterStateType>
2638 double getL2CacheHitRatio(const CounterStateType & before, const CounterStateType & after) // 0.0 - 1.0
2639 {
2640  if (PCM::getInstance()->useSkylakeEvents()) {
2641  uint64 L2Hit = after.L2Hit - before.L2Hit;
2642  uint64 L2Ref = L2Hit + after.SKLL2Miss - before.SKLL2Miss;
2643  if (L2Ref) {
2644  return double(L2Hit) / double(L2Ref);
2645  }
2646  return 1;
2647  }
2648  const int cpu_model = PCM::getInstance()->getCPUModel();
2649  if (cpu_model == PCM::ATOM || cpu_model == PCM::KNL)
2650  {
2651  uint64 L2Miss = after.ArchLLCMiss - before.ArchLLCMiss;
2652  uint64 L2Ref = after.ArchLLCRef - before.ArchLLCRef;
2653  if (L2Ref) {
2654  return 1. - (double(L2Miss) / double(L2Ref));
2655  }
2656  return 1;
2657  }
2658  uint64 L3Miss = after.L3Miss - before.L3Miss;
2659  uint64 L3UnsharedHit = after.L3UnsharedHit - before.L3UnsharedHit;
2660  uint64 L2HitM = after.L2HitM - before.L2HitM;
2661  uint64 L2Hit = after.L2Hit - before.L2Hit;
2662  uint64 hits = L2Hit;
2663  uint64 all = L2Hit + L2HitM + L3UnsharedHit + L3Miss;
2664  if (all) return double(hits) / double(all);
2665 
2666  return 1;
2667 }
2668 
2676 template <class CounterStateType>
2677 double getL3CacheHitRatio(const CounterStateType & before, const CounterStateType & after) // 0.0 - 1.0
2678 {
2679  if (!PCM::getInstance()->isL3CacheHitRatioAvailable()) return -1;
2680  if (PCM::getInstance()->useSkylakeEvents()) {
2681  uint64 L3Hit = after.SKLL3Hit - before.SKLL3Hit;
2682  uint64 L3Ref = L3Hit + after.L3Miss - before.L3Miss;
2683  if (L3Ref) {
2684  return double(L3Hit) / double(L3Ref);
2685  }
2686  return 1;
2687  }
2688 
2689  uint64 L3Miss = after.L3Miss - before.L3Miss;
2690  uint64 L3UnsharedHit = after.L3UnsharedHit - before.L3UnsharedHit;
2691  uint64 L2HitM = after.L2HitM - before.L2HitM;
2692  uint64 hits = L3UnsharedHit + L2HitM;
2693  uint64 all = L2HitM + L3UnsharedHit + L3Miss;
2694  if (all) return double(hits) / double(all);
2695 
2696  return 1;
2697 }
2698 
2706 template <class CounterStateType>
2707 uint64 getL3CacheMisses(const CounterStateType & before, const CounterStateType & after)
2708 {
2709  if (!PCM::getInstance()->isL3CacheMissesAvailable()) return 0;
2710  return after.L3Miss - before.L3Miss;
2711 }
2712 
2720 template <class CounterStateType>
2721 uint64 getL2CacheMisses(const CounterStateType & before, const CounterStateType & after)
2722 {
2723  if (PCM::getInstance()->useSkylakeEvents()) {
2724  return after.SKLL2Miss - before.SKLL2Miss;
2725  }
2726  const int cpu_model = PCM::getInstance()->getCPUModel();
2727  if (cpu_model == PCM::ATOM || cpu_model == PCM::KNL)
2728  {
2729  return after.ArchLLCMiss - before.ArchLLCMiss;
2730  }
2731  uint64 L3Miss = after.L3Miss - before.L3Miss;
2732  uint64 L3UnsharedHit = after.L3UnsharedHit - before.L3UnsharedHit;
2733  uint64 L2HitM = after.L2HitM - before.L2HitM;
2734  return L2HitM + L3UnsharedHit + L3Miss;
2735 }
2736 
2744 template <class CounterStateType>
2745 uint64 getL2CacheHits(const CounterStateType & before, const CounterStateType & after)
2746 {
2747  const int cpu_model = PCM::getInstance()->getCPUModel();
2748  if (cpu_model == PCM::ATOM || cpu_model == PCM::KNL)
2749  {
2750  uint64 L2Miss = after.ArchLLCMiss - before.ArchLLCMiss;
2751  uint64 L2Ref = after.ArchLLCRef - before.ArchLLCRef;
2752  return L2Ref - L2Miss;
2753  }
2754  return after.L2Hit - before.L2Hit;
2755 }
2756 
2760 template <class CounterStateType>
2761 uint64 getL3CacheOccupancy(const CounterStateType & now)
2762 {
2763  return now.L3Occupancy;
2764 }
2768 template <class CounterStateType>
2769 uint64 getLocalMemoryBW(const CounterStateType & before, const CounterStateType & after)
2770 {
2771  return after.MemoryBWLocal - before.MemoryBWLocal;
2772 }
2773 
2777 template <class CounterStateType>
2778 uint64 getRemoteMemoryBW(const CounterStateType & before, const CounterStateType & after)
2779 {
2780  const uint64 total = after.MemoryBWTotal - before.MemoryBWTotal;
2781  const uint64 local = getLocalMemoryBW(before, after);
2782  if (total > local)
2783  return total - local;
2784 
2785  return 0;
2786 }
2787 
2795 template <class CounterStateType>
2796 uint64 getL3CacheHitsNoSnoop(const CounterStateType & before, const CounterStateType & after)
2797 {
2798  if (!PCM::getInstance()->isL3CacheHitsNoSnoopAvailable()) return 0;
2799  return after.L3UnsharedHit - before.L3UnsharedHit;
2800 }
2801 
2809 template <class CounterStateType>
2810 uint64 getL3CacheHitsSnoop(const CounterStateType & before, const CounterStateType & after)
2811 {
2812  if (!PCM::getInstance()->isL3CacheHitsSnoopAvailable()) return 0;
2813  if (PCM::getInstance()->useSkylakeEvents()) {
2814  return after.SKLL3Hit - before.SKLL3Hit;
2815  }
2816  return after.L2HitM - before.L2HitM;
2817 }
2818 
2819 
2827 template <class CounterStateType>
2828 uint64 getL3CacheHits(const CounterStateType & before, const CounterStateType & after)
2829 {
2830  if (!PCM::getInstance()->isL3CacheHitsAvailable()) return 0;
2831  return getL3CacheHitsSnoop(before, after) + getL3CacheHitsNoSnoop(before, after);
2832 }
2833 
2842 template <class CounterStateType>
2843 uint64 getInvariantTSC(const CounterStateType & before, const CounterStateType & after)
2844 {
2845  return after.InvariantTSC - before.InvariantTSC;
2846 }
2847 
2855 template <class CounterStateType>
2856 inline double getCoreCStateResidency(int state, const CounterStateType & before, const CounterStateType & after)
2857 {
2858  const double tsc = double(getInvariantTSC(before, after));
2859 
2860  if (state == 0) return double(getRefCycles(before, after)) / tsc;
2861 
2862  if (state == 1)
2863  {
2864  PCM * m = PCM::getInstance();
2865  double result = 1.0 - double(getRefCycles(before, after)) / tsc; // 1.0 - cC0
2866  for (int i = 2; i <= PCM::MAX_C_STATE; ++i)
2867  if (m->isCoreCStateResidencySupported(state))
2868  result -= (after.BasicCounterState::CStateResidency[i] - before.BasicCounterState::CStateResidency[i]) / tsc;
2869 
2870  if (result < 0.) result = 0.; // fix counter dissynchronization
2871  else if (result > 1.) result = 1.; // fix counter dissynchronization
2872 
2873  return result;
2874  }
2875  return (after.BasicCounterState::CStateResidency[state] - before.BasicCounterState::CStateResidency[state]) / tsc;
2876 }
2877 
2885 template <class CounterStateType>
2886 inline double getPackageCStateResidency(int state, const CounterStateType & before, const CounterStateType & after)
2887 {
2888  return double(after.UncoreCounterState::CStateResidency[state] - before.UncoreCounterState::CStateResidency[state]) / double(getInvariantTSC(before, after));
2889 }
2890 
2891 
2898 template <class CounterStateType>
2899 uint64 getBytesReadFromMC(const CounterStateType & before, const CounterStateType & after)
2900 {
2901  return (after.UncMCNormalReads - before.UncMCNormalReads) * 64;
2902 }
2903 
2910 template <class CounterStateType>
2911 uint64 getBytesWrittenToMC(const CounterStateType & before, const CounterStateType & after)
2912 {
2913  return (after.UncMCFullWrites - before.UncMCFullWrites) * 64;
2914 }
2915 
2922 template <class CounterStateType>
2923 uint64 getBytesReadFromPMM(const CounterStateType & before, const CounterStateType & after)
2924 {
2925  return (after.UncPMMReads - before.UncPMMReads) * 64;
2926 }
2927 
2934 template <class CounterStateType>
2935 uint64 getBytesWrittenToPMM(const CounterStateType & before, const CounterStateType & after)
2936 {
2937  return (after.UncPMMWrites - before.UncPMMWrites) * 64;
2938 }
2939 
2946 template <class CounterStateType>
2947 uint64 getBytesReadFromEDC(const CounterStateType & before, const CounterStateType & after)
2948 {
2949  return (after.UncEDCNormalReads - before.UncEDCNormalReads) * 64;
2950 }
2951 
2958 template <class CounterStateType>
2959 uint64 getBytesWrittenToEDC(const CounterStateType & before, const CounterStateType & after)
2960 {
2961  return (after.UncEDCFullWrites - before.UncEDCFullWrites) * 64;
2962 }
2963 
2964 
2971 template <class CounterStateType>
2972 uint64 getIORequestBytesFromMC(const CounterStateType & before, const CounterStateType & after)
2973 {
2974  return (after.UncMCIORequests - before.UncMCIORequests) * 64;
2975 }
2976 
2983 template <class CounterStateType>
2984 uint64 getSMICount(const CounterStateType & before, const CounterStateType & after)
2985 {
2986  return after.SMICount - before.SMICount;
2987 }
2988 
2998 template <class CounterStateType>
2999 uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType & before, const CounterStateType & after)
3000 {
3001  return ((&after.Event0)[eventCounterNr] - (&before.Event0)[eventCounterNr]);
3002 }
3003 
3014 inline uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
3015 {
3016  if (!PCM::getInstance()->incomingQPITrafficMetricsAvailable()) return 0;
3017  uint64 b = before.incomingQPIPackets[socketNr][linkNr];
3018  uint64 a = after.incomingQPIPackets[socketNr][linkNr];
3019  // prevent overflows due to counter dissynchronisation
3020  return (a > b) ? (64 * (a - b)) : 0;
3021 }
3022 
3033 inline double getIncomingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
3034 {
3035  PCM * m = PCM::getInstance();
3036  if (!(m->qpiUtilizationMetricsAvailable())) return 0.;
3037 
3038  const double bytes = (double)getIncomingQPILinkBytes(socketNr, linkNr, before, after);
3039  const uint64 max_speed = m->getQPILinkSpeed(socketNr, linkNr);
3040  const double max_bytes = (double)(double(max_speed) * double(getInvariantTSC(before, after) / double(m->getNumCores())) / double(m->getNominalFrequency()));
3041  return bytes / max_bytes;
3042 }
3043 
3054 inline double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
3055 {
3056  PCM * m = PCM::getInstance();
3057 
3058  if (m->hasBecktonUncore())
3059  {
3060  const uint64 b = before.outgoingQPIFlits[socketNr][linkNr]; // idle flits
3061  const uint64 a = after.outgoingQPIFlits[socketNr][linkNr]; // idle flits
3062  // prevent overflows due to counter dissynchronisation
3063  const double idle_flits = (double)((a > b) ? (a - b) : 0);
3064  const uint64 bTSC = before.uncoreTSC;
3065  const uint64 aTSC = after.uncoreTSC;
3066  const double tsc = (double)((aTSC > bTSC) ? (aTSC - bTSC) : 0);
3067  if (idle_flits >= tsc) return 0.; // prevent oveflows due to potential counter dissynchronization
3068 
3069  return (1. - (idle_flits / tsc));
3070  } else if (m->hasPCICFGUncore())
3071  {
3072  const uint64 b = before.outgoingQPIFlits[socketNr][linkNr]; // data + non-data flits or idle (null) flits
3073  const uint64 a = after.outgoingQPIFlits[socketNr][linkNr]; // data + non-data flits or idle (null) flits
3074  // prevent overflows due to counter dissynchronisation
3075  double flits = (double)((a > b) ? (a - b) : 0);
3076  const double max_flits = ((double(getInvariantTSC(before, after)) * double(m->getQPILinkSpeed(socketNr, linkNr)) / m->getBytesPerFlit()) / double(m->getNominalFrequency())) / double(m->getNumCores());
3077  if(m->hasUPI())
3078  {
3079  flits = flits/3.;
3080  }
3081  if (flits > max_flits) return 1.; // prevent oveflows due to potential counter dissynchronization
3082  return (flits / max_flits);
3083  }
3084 
3085  return 0;
3086 }
3087 
3098 inline uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & before, const SystemCounterState & after)
3099 {
3100  PCM * m = PCM::getInstance();
3101  if (!(m->outgoingQPITrafficMetricsAvailable())) return 0;
3102 
3103  const double util = getOutgoingQPILinkUtilization(socketNr, linkNr, before, after);
3104  const double max_bytes = (double(m->getQPILinkSpeed(socketNr, linkNr)) * double(getInvariantTSC(before, after) / double(m->getNumCores())) / double(m->getNominalFrequency()));
3105 
3106  return (uint64)(max_bytes * util);
3107 }
3108 
3109 
3118 inline uint64 getAllIncomingQPILinkBytes(const SystemCounterState & before, const SystemCounterState & after)
3119 {
3120  PCM * m = PCM::getInstance();
3121  const uint32 ns = m->getNumSockets();
3122  const uint32 qpiLinks = (uint32)m->getQPILinksPerSocket();
3123  uint64 sum = 0;
3124 
3125  for (uint32 s = 0; s < ns; ++s)
3126  for (uint32 q = 0; q < qpiLinks; ++q)
3127  sum += getIncomingQPILinkBytes(s, q, before, after);
3128 
3129  return sum;
3130 }
3131 
3140 inline uint64 getAllOutgoingQPILinkBytes(const SystemCounterState & before, const SystemCounterState & after)
3141 {
3142  PCM * m = PCM::getInstance();
3143  const uint32 ns = m->getNumSockets();
3144  const uint32 qpiLinks = (uint32)m->getQPILinksPerSocket();
3145  uint64 sum = 0;
3146 
3147  for (uint32 s = 0; s < ns; ++s)
3148  for (uint32 q = 0; q < qpiLinks; ++q)
3149  sum += getOutgoingQPILinkBytes(s, q, before, after);
3150 
3151  return sum;
3152 }
3153 
3154 
3164 inline uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState & now)
3165 {
3166  return 64 * now.incomingQPIPackets[socketNr][linkNr];
3167 }
3168 
3177 inline uint64 getSocketIncomingQPILinkBytes(uint32 socketNr, const SystemCounterState & now)
3178 {
3179  PCM * m = PCM::getInstance();
3180  const uint32 qpiLinks = (uint32)m->getQPILinksPerSocket();
3181  uint64 sum = 0;
3182 
3183  for (uint32 q = 0; q < qpiLinks; ++q)
3184  sum += getIncomingQPILinkBytes(socketNr, q, now);
3185 
3186  return sum;
3187 }
3188 
3197 {
3198  PCM * m = PCM::getInstance();
3199  const uint32 ns = m->getNumSockets();
3200  uint64 sum = 0;
3201 
3202  for (uint32 s = 0; s < ns; ++s)
3203  sum += getSocketIncomingQPILinkBytes(s, now);
3204  return sum;
3205 }
3206 
3207 
3217 inline double getQPItoMCTrafficRatio(const SystemCounterState & before, const SystemCounterState & after)
3218 {
3219  const uint64 totalQPI = getAllIncomingQPILinkBytes(before, after);
3220  uint64 memTraffic = getBytesReadFromMC(before, after) + getBytesWrittenToMC(before, after);
3221  if (PCM::getInstance()->PMMTrafficMetricsAvailable())
3222  {
3223  memTraffic += getBytesReadFromPMM(before, after) + getBytesWrittenToPMM(before, after);
3224  }
3225  return double(totalQPI) / double(memTraffic);
3226 }
3227 
3231 template <class CounterType>
3232 inline uint64 getNumberOfEvents(const CounterType & before, const CounterType & after)
3233 {
3234  return after.data - before.data;
3235 }
3237 
3238 template <class CounterStateType>
3239 inline double getLLCReadMissLatency(const CounterStateType & before, const CounterStateType & after)
3240 {
3241  const double occupancy = double(after.TOROccupancyIAMiss) - double(before.TOROccupancyIAMiss);
3242  const double inserts = double(after.TORInsertsIAMiss) - double(before.TORInsertsIAMiss);
3243  const double unc_clocks = double(after.UncClocks) - double(before.UncClocks);
3244  auto * m = PCM::getInstance();
3245  const double seconds = double(getInvariantTSC(before, after)) / double(m->getNumCores()/m->getNumSockets()) / double(m->getNominalFrequency());
3246  return 1e9*seconds*(occupancy/inserts)/unc_clocks;
3247 }
3248 
3249 #endif
size_t getEDCChannelsPerSocket() const
Returns the total number of detected memory channels on all integrated memory controllers per socket...
Definition: cpucounters.h:1275
double getNormalizedQPIL1Cycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the ratio of QPI cycles in power saving shutdown mode.
Definition: cpucounters.h:1979
uint64 getSocketIncomingQPILinkBytes(uint32 socketNr, const SystemCounterState &now)
Get estimation of total QPI data traffic for this socket.
Definition: cpucounters.h:3177
friend uint64 getInvariantTSC(const CounterStateType &before, const CounterStateType &after)
Computes number of invariant time stamp counter ticks.
Definition: cpucounters.h:2843
Definition: memoptest.cpp:35
uint32 getCPUStepping() const
Reads CPU stepping id.
Definition: cpucounters.h:1150
uint64 getEDCCounter(uint32 channel, uint32 counter)
Direct read of embedded DRAM memory controller PMU counter (counter meaning depends on the programmin...
Definition: cpucounters.cpp:5674
uint64 getQPIClocks(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns QPI LL clock ticks.
Definition: cpucounters.h:1926
friend uint64 getBytesWrittenToMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to DRAM memory controllers.
Definition: cpucounters.h:2911
Definition: types.h:301
friend uint64 getBytesReadFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from DRAM memory controllers.
Definition: cpucounters.h:2899
uint64 getMCDRAMClocks(uint32 channel)
Get number of MCDRAM channel cycles.
Definition: cpucounters.cpp:5651
uint64 getAllOutgoingQPILinkBytes(const SystemCounterState &before, const SystemCounterState &after)
Get estimation of total QPI data+nondata traffic.
Definition: cpucounters.h:3140
friend uint64 getM2MCounter(uint32 controller, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of Memory2Mesh controller PMU counter (counter meaning depends on the programming: power/...
Definition: cpucounters.h:2027
Internal type and constant definitions.
friend uint64 getPCUCounter(uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of power control unit PMU counter (counter meaning depends on the programming: power/perf...
Definition: cpucounters.h:2051
uint64 getMCDRAMClocks(uint32 channel, const CounterStateType &before, const CounterStateType &after)
Returns MCDRAM clock ticks.
Definition: cpucounters.h:2001
uint32 getNumOnlineCores() const
Reads number of online logical cores in the system.
Definition: cpucounters.cpp:4111
uint64 getPMMReads()
Get the number of PMM memory reads (in cache lines)
Definition: cpucounters.cpp:5359
uint64 getQPILinkSpeed(const uint32 linkNr) const
Returns the speed of the QPI link.
Definition: cpucounters.h:426
Definition: mutex.h:14
void setRunState(int new_state)
Set Run State.
Definition: cpucounters.h:589
size_t getNumMCChannels() const
Returns the total number of detected memory channels on all integrated memory controllers.
Definition: cpucounters.h:438
Socket-wide counter state.
Definition: cpucounters.h:2289
friend uint64 getL3CacheHitsNoSnoop(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache hits where no snooping in sibling L2 caches had to be done...
Definition: cpucounters.h:2796
uint64 getL3CacheMisses(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache misses.
Definition: cpucounters.h:2707
Definition: lspci.h:76
Interface to access client bandwidth counters.
size_t getMCChannelsPerSocket() const
Returns the total number of detected memory channels on all integrated memory controllers per socket...
Definition: cpucounters.h:1224
uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get estimation of QPI data traffic per incoming QPI link.
Definition: cpucounters.h:3014
friend uint64 getEDCCounter(uint32 channel, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of embedded DRAM memory controller counter (counter meaning depends on the programming: p...
Definition: cpucounters.h:2040
friend uint64 getL3CacheHitsSnoop(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache hits where snooping in sibling L2 caches had to be done.
Definition: cpucounters.h:2810
Definition: types.h:336
int64 getCPUMicrocodeLevel() const
Get microcode level (returns -1 if retrieval not supported due to some restrictions) ...
Definition: cpucounters.h:1465
friend uint64 getRemoteMemoryBW(const CounterStateType &before, const CounterStateType &after)
Computes Remote Memory Bandwidth.
Definition: cpucounters.h:2778
SupportedCPUModels
Identifiers of supported CPU models.
Definition: cpucounters.h:1103
double getCoreIPC(const SystemCounterState &before, const SystemCounterState &after)
Computes average number of retired instructions per core cycle for the entire system combining instru...
Definition: cpucounters.h:2494
uint64 getL3CacheHits(const CounterStateType &before, const CounterStateType &after)
Computes total number of L3 cache hits.
Definition: cpucounters.h:2828
uint64 getL3CacheHitsSnoop(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache hits where snooping in sibling L2 caches had to be done.
Definition: cpucounters.h:2810
Definition: cpucounters.cpp:1610
Object to access uncore counters in a socket/processor with microarchitecture codename SandyBridge-EP...
Definition: cpucounters.h:282
uint64 getDRAMConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by DRAM (measured in internal units)
Definition: cpucounters.h:2081
friend double getIPC(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per core cycle (IPC)
Definition: cpucounters.h:2387
void reportQPISpeed() const
Print QPI Speeds.
Definition: cpucounters.cpp:5919
Definition: cpucounters.h:83
friend uint64 getBytesReadFromPMM(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from PMM memory.
Definition: cpucounters.h:2923
uint64 getL3CacheOccupancy(const CounterStateType &now)
Computes L3 Cache Occupancy.
Definition: cpucounters.h:2761
uint64 getEdcReads()
Get the number of cache lines read by EDC (embedded DRAM controller)
Definition: cpucounters.cpp:5379
friend double getActiveRelativeFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost techn...
Definition: cpucounters.h:2578
friend uint64 getConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by processor, excluding DRAM (measured in internal units) ...
Definition: cpucounters.h:2071
friend double getCyclesLostDueL2CacheMisses(const CounterStateType &before, const CounterStateType &after)
Estimates how many core cycles were potentially lost due to missing L2 cache but still hitting L3 cac...
Definition: cpucounters.h:2616
friend double getL2CacheHitRatio(const CounterStateType &before, const CounterStateType &after)
Computes L2 cache hit ratio.
Definition: cpucounters.h:2638
uint32 getCPUModel() const
Reads CPU model id.
Definition: cpucounters.h:1142
uint64 getQPILinkSpeed(uint32 socketNr, uint32 linkNr) const
Return QPI Link Speed in GBytes/second.
Definition: cpucounters.h:1359
ProgramMode
Mode of programming (parameter in the program() method)
Definition: cpucounters.h:607
friend uint64 getInvariantTSC(const CounterStateType &before, const CounterStateType &after)
Computes number of invariant time stamp counter ticks.
Definition: cpucounters.h:2843
double getL2CacheHitRatio(const CounterStateType &before, const CounterStateType &after)
Computes L2 cache hit ratio.
Definition: cpucounters.h:2638
double getExecUsage(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per time interval.
Definition: cpucounters.h:2415
uint64 getQPIL1Cycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the number of QPI cycles in power saving shutdown mode.
Definition: cpucounters.h:1955
int32 getThermalHeadroom() const
Returns current thermal headroom below TjMax.
Definition: cpucounters.h:1877
double getConsumedJoules(const CounterStateType &before, const CounterStateType &after)
Returns Joules consumed by processor (excluding DRAM)
Definition: cpucounters.h:2091
double getCyclesLostDueL3CacheMisses(const CounterStateType &before, const CounterStateType &after)
Estimates how many core cycles were potentially lost due to L3 cache misses.
Definition: cpucounters.h:2595
uint64 computeQPISpeed(const uint32 ref_core, const int cpumodel)
Measures/computes the maximum theoretical QPI link bandwidth speed in GByte/seconds.
Definition: cpucounters.cpp:5843
friend uint64 getLocalMemoryBW(const CounterStateType &before, const CounterStateType &after)
Computes Local Memory Bandwidth.
Definition: cpucounters.h:2769
uint64 getConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by processor, excluding DRAM (measured in internal units) ...
Definition: cpucounters.h:2071
uint64 getM2MCounter(uint32 controller, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of Memory2Mesh controller PMU counter (counter meaning depends on the programming: power/...
Definition: cpucounters.h:2027
Definition: cpucounters.h:187
double getActiveRelativeFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost techn...
Definition: cpucounters.h:2578
uint32 getThreadsPerCore() const
Reads how many hardware threads a physical core has. "Hardware thread" is a logical core in a differen...
Definition: cpucounters.cpp:4127
System-wide counter state.
Definition: cpucounters.h:2308
friend double getL3CacheHitRatio(const CounterStateType &before, const CounterStateType &after)
Computes L3 cache hit ratio.
Definition: cpucounters.h:2677
friend double getCoreCStateResidency(int state, const CounterStateType &before, const CounterStateType &after)
Computes residency in the core C-state.
Definition: cpucounters.h:2856
double getNormalizedQPIL0pTxCycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the ratio of QPI cycles in power saving half-lane mode.
Definition: cpucounters.h:1967
uint64 getQPIL0pTxCycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the number of QPI cycles in power saving half-lane mode.
Definition: cpucounters.h:1944
friend uint64 getQPIL0pTxCycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the number of QPI cycles in power saving half-lane mode.
Definition: cpucounters.h:1944
double getCoreCStateResidency(int state, const CounterStateType &before, const CounterStateType &after)
Computes residency in the core C-state.
Definition: cpucounters.h:2856
uint64 getDRAMClocks(uint32 channel, const CounterStateType &before, const CounterStateType &after)
Returns DRAM clock ticks.
Definition: cpucounters.h:1990
uint64 getNominalFrequency() const
Reads the nominal core frequency.
Definition: cpucounters.cpp:4137
uint64 getImcReadsForChannels(uint32 beginChannel, uint32 endChannel)
Get the number of integrated controller reads for given channels (in cache lines) ...
Definition: cpucounters.cpp:5338
friend double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get utilization of outgoing QPI link (0..1)
Definition: cpucounters.h:3054
int32 getSocketId(uint32 core_id) const
Determines socket of given core.
Definition: cpucounters.h:1170
friend double getAverageFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency also taking Intel Turbo Boost technology into account.
Definition: cpucounters.h:2528
Custom Core event description.
Definition: cpucounters.h:678
double getPackageCStateResidency(int state, const CounterStateType &before, const CounterStateType &after)
Computes residency in the package C-state.
Definition: cpucounters.h:2886
uint64 getBytesWrittenToPMM(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to PMM memory.
Definition: cpucounters.h:2935
uint64 getBytesWrittenToEDC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to MCDRAM memory controllers.
Definition: cpucounters.h:2959
int32 getTileId(uint32 os_id) const
Determines physical tile (cores sharing L2 cache) of given processor ID.
Definition: cpucounters.h:1165
uint64 getAllIncomingQPILinkBytes(const SystemCounterState &before, const SystemCounterState &after)
Get estimation of total QPI data traffic.
Definition: cpucounters.h:3118
friend uint64 getMCDRAMClocks(uint32 channel, const CounterStateType &before, const CounterStateType &after)
Returns MCDRAM clock ticks.
Definition: cpucounters.h:2001
void program()
Program performance counters (disables programming power counters)
Definition: cpucounters.cpp:5193
uint64 getQPIL0pTxCycles(uint32 port)
Get number of cycles on a QPI port when the link was in a power saving half-lane mode.
Definition: cpucounters.cpp:5630
Basic uncore counter state.
Definition: cpucounters.h:2134
void enableJKTWorkaround(bool enable)
Enable correct counting of various LLC events (with memory access perf penalty)
Definition: cpucounters.cpp:5711
friend uint64 getBytesReadFromEDC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from MCDRAM memory controllers.
Definition: cpucounters.h:2947
uint64 getBytesWrittenToMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to DRAM memory controllers.
Definition: cpucounters.h:2911
Extended custom core event description.
Definition: cpucounters.h:692
int32 getPackageThermalSpecPower() const
Returns thermal specification power of the package domain in Watt.
Definition: cpucounters.h:1368
friend uint64 getSMICount(const CounterStateType &before, const CounterStateType &after)
Returns the number of occurred system management interrupts.
Definition: cpucounters.h:2984
double getActiveAverageFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost techn...
Definition: cpucounters.h:2545
uint64 getImcReadsForController(uint32 controller)
Get the number of integrated controller reads for given controller (in cache lines) ...
Definition: cpucounters.cpp:5326
double getRelativeFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency also taking Intel Turbo Boost technology into account.
Definition: cpucounters.h:2562
Definition: cpucounters.h:209
uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of memory controller PMU counter (counter meaning depends on the programming: power/perfo...
Definition: cpucounters.h:2014
uint64 getQPILLCounter(uint32 port, uint32 counter)
Direct read of QPI LL PMU counter (counter meaning depends on the programming: power/performance/etc)...
Definition: cpucounters.cpp:5699
uint64 getQPILinksPerSocket() const
Returns the number of Intel(r) Quick Path Interconnect(tm) links per socket.
Definition: cpucounters.h:1174
Definition: cpucounters.h:167
friend double getExecUsage(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per time interval.
Definition: cpucounters.h:2415
size_t getNumQPIPorts() const
Returns the number of detected QPI ports.
Definition: cpucounters.h:423
uint32 getNumSockets() const
Reads number of sockets (CPUs) in the system.
Definition: cpucounters.cpp:4116
uint64 getEDCCounter(uint32 channel, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of embedded DRAM memory controller counter (counter meaning depends on the programming: p...
Definition: cpucounters.h:2040
double getL3CacheHitRatio(const CounterStateType &before, const CounterStateType &after)
Computes L3 cache hit ratio.
Definition: cpucounters.h:2677
double getAverageFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency also taking Intel Turbo Boost technology into account.
Definition: cpucounters.h:2528
uint64 getIORequestBytesFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes of read/write requests from all IO sources.
Definition: cpucounters.h:2972
uint64 getL3CacheHitsNoSnoop(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache hits where no snooping in sibling L2 caches had to be done...
Definition: cpucounters.h:2796
int getRunState(void)
Returns program's Run State.
Definition: cpucounters.h:595
uint64 getBytesReadFromEDC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from MCDRAM memory controllers.
Definition: cpucounters.h:2947
uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get estimation of QPI (data+nondata) traffic per outgoing QPI link.
Definition: cpucounters.h:3098
CoreCounterState getCoreCounterState(uint32 core)
Reads the counter state of a (logical) core.
Definition: cpucounters.cpp:3296
uint64 getPMMWrites()
Get the number of PMM memory writes (in cache lines)
Definition: cpucounters.cpp:5369
friend uint64 getBytesWrittenToEDC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to MCDRAM memory controllers.
Definition: cpucounters.h:2959
void unfreezeCounters()
Unfreezes event counting.
Definition: cpucounters.cpp:5620
friend uint64 getRefCycles(const CounterStateType &before, const CounterStateType &after)
Computes the number of reference clock cycles while clock signal on the core is running.
Definition: cpucounters.h:2468
friend double getLLCReadMissLatency(const CounterStateType &before, const CounterStateType &after)
Returns average last level cache read+prefetch miss latency in ns.
Definition: cpucounters.h:3239
friend uint64 getMCCounter(uint32 channel, uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of memory controller PMU counter (counter meaning depends on the programming: power/perfo...
Definition: cpucounters.h:2014
uint64 getInvariantTSC(const CounterStateType &before, const CounterStateType &after)
Computes number of invariant time stamp counter ticks.
Definition: cpucounters.h:2843
Low level interface to access PCI configuration space.
uint64 getNumberOfEvents(const CounterType &before, const CounterType &after)
Returns the raw count of events.
Definition: cpucounters.h:3232
friend uint64 getIncomingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get estimation of QPI data traffic per incoming QPI link.
Definition: cpucounters.h:3014
Definition: cpucounters.h:662
friend uint64 getL2CacheMisses(const CounterStateType &before, const CounterStateType &after)
Computes number of L2 cache misses.
Definition: cpucounters.h:2721
size_t getMCChannels(uint32 socket, uint32 controller) const
Returns the number of detected memory channels on given integrated memory controllers.
Definition: cpucounters.h:1250
uint64 getRefCycles(const CounterStateType &before, const CounterStateType &after)
Computes the number of reference clock cycles while clock signal on the core is running.
Definition: cpucounters.h:2468
friend uint64 getL2CacheHits(const CounterStateType &before, const CounterStateType &after)
Computes number of L2 cache hits.
Definition: cpucounters.h:2745
uint64 getPCUClocks(const CounterStateType &before, const CounterStateType &after)
Returns clock ticks of power control unit.
Definition: cpucounters.h:2061
friend uint64 getDRAMConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by DRAM (measured in internal units)
Definition: cpucounters.h:2081
Provides 64-bit "virtual" counters from underlying 32-bit HW counters.
void freezeCounters()
Freezes event counting.
Definition: cpucounters.cpp:5595
void programServerUncoreMemoryMetrics(int rankA=-1, int rankB=-1, bool PMM=false)
Program memory counters (disables programming performance counters)
Definition: cpucounters.cpp:5119
int32 getThreadId(uint32 os_id) const
Determines physical thread of given processor ID within a core.
Definition: cpucounters.h:1155
SocketCounterState getSocketCounterState(uint32 socket)
Reads the counter state of a socket.
Definition: cpucounters.cpp:3288
friend double getRelativeFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency also taking Intel Turbo Boost technology into account.
Definition: cpucounters.h:2562
uint32 getNumCores() const
Reads number of logical cores in the system.
Definition: cpucounters.cpp:4106
void allowMultipleInstances()
Call it before program() to allow multiple running instances of PCM on the same system.
Definition: cpucounters.h:601
size_t getNumEDCChannels() const
Returns the total number of detected memory channels on all embedded DRAM controllers (EDC) ...
Definition: cpucounters.h:445
friend uint64 getConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by processor, excluding DRAM (measured in internal units) ...
Definition: cpucounters.h:2071
int32 getPackageMaximumPower() const
Returns maximum power derived from electrical spec of the package domain in Watt. ...
Definition: cpucounters.h:1374
friend uint64 getL3CacheOccupancy(const CounterStateType &now)
Computes L3 Cache Occupancy.
Definition: cpucounters.h:2761
friend uint64 getBytesWrittenToPMM(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes written to PMM memory.
Definition: cpucounters.h:2935
friend uint64 getDRAMClocks(uint32 channel, const CounterStateType &before, const CounterStateType &after)
Returns DRAM clock ticks.
Definition: cpucounters.h:1990
uint64 getRemoteMemoryBW(const CounterStateType &before, const CounterStateType &after)
Computes Remote Memory Bandwidth.
Definition: cpucounters.h:2778
void program_power_metrics(int mc_profile)
Program power counters (disables programming performance counters)
Definition: cpucounters.cpp:5431
uint64 getCycles(const CounterStateType &before, const CounterStateType &after)
Computes the number of core clock cycles when signal on a specific core is running (not halted) ...
Definition: cpucounters.h:2452
friend double getPackageCStateResidency(int state, const CounterStateType &before, const CounterStateType &after)
Computes residency in the package C-state.
Definition: cpucounters.h:2886
uint64 getMCCounter(uint32 channel, uint32 counter)
Direct read of memory controller PMU counter (counter meaning depends on the programming: power/perfo...
Definition: cpucounters.cpp:5662
Definition: cpucounters.h:235
double getIncomingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get data utilization of incoming QPI link (0..1)
Definition: cpucounters.h:3033
Definition: types.h:889
uint64 getQPIClocks(uint32 port)
Get number of QPI LL clocks on a QPI port.
Definition: cpucounters.cpp:5625
uint64 getLocalMemoryBW(const CounterStateType &before, const CounterStateType &after)
Computes Local Memory Bandwidth.
Definition: cpucounters.h:2769
uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType &before, const CounterStateType &after)
Returns the number of occurred custom core events.
Definition: cpucounters.h:2999
double getLLCReadMissLatency(const CounterStateType &before, const CounterStateType &after)
Returns average last level cache read+prefetch miss latency in ns.
Definition: cpucounters.h:3239
friend uint64 getQPIClocks(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns QPI LL clock ticks.
Definition: cpucounters.h:1926
double getDRAMConsumedJoules(const CounterStateType &before, const CounterStateType &after)
Returns Joules consumed by DRAM.
Definition: cpucounters.h:2104
uint64 getInstructionsRetired(const CounterStateType &before, const CounterStateType &after)
Computes the number of retired instructions.
Definition: cpucounters.h:2403
CPU Performance Monitor.
Definition: cpucounters.h:481
int32 getPackageMinimumPower() const
Returns minimum power derived from electrical spec of the package domain in Watt. ...
Definition: cpucounters.h:1371
friend uint64 getIORequestBytesFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes of read/write requests from all IO sources.
Definition: cpucounters.h:2972
friend uint64 getL3CacheMisses(const CounterStateType &before, const CounterStateType &after)
Computes number of L3 cache misses.
Definition: cpucounters.h:2707
friend double getCyclesLostDueL3CacheMisses(const CounterStateType &before, const CounterStateType &after)
Estimates how many core cycles were potentially lost due to L3 cache misses.
Definition: cpucounters.h:2595
double getOutgoingQPILinkUtilization(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get utilization of outgoing QPI link (0..1)
Definition: cpucounters.h:3054
uint64 getImcWrites()
Get the number of integrated controller writes (in cache lines)
Definition: cpucounters.cpp:5348
uint64 getPCUCounter(uint32 counter, const CounterStateType &before, const CounterStateType &after)
Direct read of power control unit PMU counter (counter meaning depends on the programming: power/perf...
Definition: cpucounters.h:2051
friend uint64 getInstructionsRetired(const CounterStateType &before, const CounterStateType &after)
Computes the number of retired instructions.
Definition: cpucounters.h:2403
Definition: cpucounters.h:125
uint64 getM2MCounter(uint32 box, uint32 counter)
Direct read of M2M counter.
Definition: cpucounters.cpp:5687
Definition: cpucounters.h:705
uint64 getPCUFrequency() const
Returns the frequency of Power Control Unit.
Definition: cpucounters.h:1323
double getJoulesPerEnergyUnit() const
Returns how many joules are in an internal processor energy unit.
Definition: cpucounters.h:1365
uint32 getOriginalCPUModel() const
Reads original CPU model id.
Definition: cpucounters.h:1146
uint64 getUPIL0TxCycles(uint32 port)
Get number of cycles on a UPI port when the link was in an L0 mode (fully active)
Definition: cpucounters.cpp:5424
uint64 getL2CacheMisses(const CounterStateType &before, const CounterStateType &after)
Computes number of L2 cache misses.
Definition: cpucounters.h:2721
const std::string & getErrorMessage() const
Returns the error message.
Definition: cpucounters.h:909
double getIPC(const CounterStateType &before, const CounterStateType &after)
Computes average number of retired instructions per core cycle (IPC)
Definition: cpucounters.h:2387
Definition: cpucounters.h:448
int32 getPackageThermalHeadroom() const
Returns current thermal headroom below TjMax.
Definition: cpucounters.h:2265
bool isCoreCStateResidencySupported(int state)
Returns true if the specified core C-state residency metric is supported.
Definition: cpucounters.h:565
Server uncore power counter state.
Definition: cpucounters.h:2218
uint64 getSMICount(const CounterStateType &before, const CounterStateType &after)
Returns the number of occurred system management interrupts.
Definition: cpucounters.h:2984
friend uint64 getOutgoingQPILinkBytes(uint32 socketNr, uint32 linkNr, const SystemCounterState &before, const SystemCounterState &after)
Get estimation of QPI (data+nondata) traffic per outgoing QPI link.
Definition: cpucounters.h:3098
bool isPackageCStateResidencySupported(int state)
Returns true if the specified package C-state residency metric is supported.
Definition: cpucounters.h:574
friend uint64 getDRAMConsumedEnergy(const CounterStateType &before, const CounterStateType &after)
Returns energy consumed by DRAM (measured in internal units)
Definition: cpucounters.h:2081
(Logical) core-wide counter state
Definition: cpucounters.h:2281
uint64 getL2CacheHits(const CounterStateType &before, const CounterStateType &after)
Computes number of L2 cache hits.
Definition: cpucounters.h:2745
double getCyclesLostDueL2CacheMisses(const CounterStateType &before, const CounterStateType &after)
Estimates how many core cycles were potentially lost due to missing L2 cache but still hitting L3 cac...
Definition: cpucounters.h:2616
uint32 getMCPerSocket() const
Returns the number of detected integrated memory controllers per socket.
Definition: cpucounters.h:1200
friend uint64 getCycles(const CounterStateType &before, const CounterStateType &after)
Computes the number of core clock cycles when signal on a specific core is running (not halted) ...
Definition: cpucounters.h:2452
uint32 getNumMC() const
Returns the number of detected integrated memory controllers.
Definition: cpucounters.h:435
uint64 getBytesReadFromMC(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from DRAM memory controllers.
Definition: cpucounters.h:2899
uint64 getBytesReadFromPMM(const CounterStateType &before, const CounterStateType &after)
Computes number of bytes read from PMM memory.
Definition: cpucounters.h:2923
friend double getActiveAverageFrequency(const CounterStateType &before, const CounterStateType &after)
Computes average core frequency when not in powersaving C0-state (also taking Intel Turbo Boost techn...
Definition: cpucounters.h:2545
uint64 getEdcWrites()
Get the number of cache lines written by EDC (embedded DRAM controller)
Definition: cpucounters.cpp:5391
SystemCounterState getSystemCounterState()
Reads the counter state of the system.
Definition: cpucounters.cpp:3280
uint64 getDRAMClocks(uint32 channel)
Get number of DRAM channel cycles.
Definition: cpucounters.cpp:5640
uint64 getQPIL1Cycles(uint32 port)
Get number of cycles on a QPI port when the link was in a power saving shutdown mode.
Definition: cpucounters.cpp:5635
double getTotalExecUsage(const SystemCounterState &before, const SystemCounterState &after)
Computes average number of retired instructions per time interval for the entire system combining in...
Definition: cpucounters.h:2512
uint32 getMaxIPC() const
Returns the max number of instructions per cycle.
Definition: cpucounters.h:1288
Definition: cpucounters.h:102
friend uint64 getL3CacheHits(const CounterStateType &before, const CounterStateType &after)
Computes total number of L3 cache hits.
Definition: cpucounters.h:2828
Definition: cpucounters.h:94
Definition: pcm-iio.cpp:56
friend uint64 getNumberOfCustomEvents(int32 eventCounterNr, const CounterStateType &before, const CounterStateType &after)
Returns the number of occurred custom core events.
Definition: cpucounters.h:2999
int32 getCoreId(uint32 os_id) const
Determines physical core of given processor ID within a socket.
Definition: cpucounters.h:1160
Low level interface to access hardware model specific registers.
ErrorCode
Return codes (e.g. for program(..) method)
Definition: cpucounters.h:615
uint64 getOutgoingFlits(uint32 port)
Get the number of outgoing data and non-data or idle flits (depending on the architecture) from the s...
Definition: cpucounters.cpp:5419
Definition: cpucounters.h:147
uint64 getImcReads()
Get the number of integrated controller reads (in cache lines)
Definition: cpucounters.cpp:5321
static PCM * getInstance()
Returns PCM object.
Definition: cpucounters.cpp:259
Basic core counter state.
Definition: cpucounters.h:1741
uint64 getIncomingDataFlits(uint32 port)
Get the number of incoming data flits to the socket through a port.
Definition: cpucounters.cpp:5403
double getQPItoMCTrafficRatio(const SystemCounterState &before, const SystemCounterState &after)
Get QPI data to Memory Controller traffic ratio.
Definition: cpucounters.h:3217
friend uint64 getQPIL1Cycles(uint32 port, const CounterStateType &before, const CounterStateType &after)
Returns the number of QPI cycles in power saving shutdown mode.
Definition: cpucounters.h:1955