
Fwd: [RFC PATCH 1/1] Adding a PMDA to collect memory bandwidth

To: pcp@xxxxxxxxxxx
Subject: Fwd: [RFC PATCH 1/1] Adding a PMDA to collect memory bandwidth
From: Hemant Kumar <hemant@xxxxxxxxxxxxxxxxxx>
Date: Sun, 12 Jul 2015 11:24:10 +0530
Cc: Nathan Scott <nathans@xxxxxxxxxx>
Delivered-to: pcp@xxxxxxxxxxx
In-reply-to: <1436678109-30878-2-git-send-email-hemant@xxxxxxxxxxxxxxxxxx>
References: <1436678109-30878-2-git-send-email-hemant@xxxxxxxxxxxxxxxxxx>
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Thunderbird/24.6.0
Same with this too.


-------- Original Message --------
Subject:        [RFC PATCH 1/1] Adding a PMDA to collect memory bandwidth
Date:   Sun, 12 Jul 2015 10:45:09 +0530
From:   Hemant Kumar <hemant@xxxxxxxxxxxxxxxxxx>
To:     pcp@xxxxxxxxxxx
CC:     hemant@xxxxxxxxxxxxxxxxxx, deepthi@xxxxxxxxxxxxxxxxxx



This patch adds a new PMDA to collect memory read and write counter
values on x86 and powerpc. It also adds a metric, on powerpc, for the
maximum possible (theoretical) bandwidth per node of a system. This is
to help PCP clients collect the memory counter values and work out the
bandwidth utilization.

To collect this metric, the related perf counters need to be created.
All the counter names for Intel and powerpc have been added in the
get_perf.c file, which relies on the perfmon/libpfm4 library. Also, the
functions for discovering the topology of a system have been reused,
thanks to architecture.c in the "perfevent" agent.
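
For reference, here is a minimal sketch of how one event string is
encoded into a perf_event_attr through libpfm4; it mirrors the
pfm_get_os_event_encoding() call in get_perf.c (error handling trimmed,
pfm_initialize() assumed to have been called already; encode_event is
just an illustrative name):

#include <stdlib.h>
#include <string.h>
#include <perfmon/pfmlib.h>
#include <perfmon/pfmlib_perf_event.h>

/* Encode an event name (e.g. "snbep_unc_imc0::UNC_M_CAS_COUNT:RD")
 * into a perf_event_attr, counting both kernel and user levels. */
static int encode_event(const char *name, struct perf_event_attr *attr)
{
    pfm_perf_encode_arg_t arg;
    char *fstr = NULL;      /* libpfm4 returns the fully qualified name */
    int ret;

    memset(attr, 0, sizeof(*attr));
    memset(&arg, 0, sizeof(arg));
    attr->size = sizeof(*attr);
    arg.attr = attr;
    arg.fstr = &fstr;
    ret = pfm_get_os_event_encoding(name, PFM_PLM0 | PFM_PLM3,
                                    PFM_OS_PERF_EVENT_EXT, &arg);
    free(fstr);
    return ret == PFM_SUCCESS ? 0 : -1;
}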

As soon as the daemon starts, the counters are enabled. For every event
such as "snbep_unc_imc0::UNC_M_CAS_COUNT:RD" (on x86), a counter is
created on the main CPU of each node. Data is collected whenever a
pmFetch call is made; it is averaged over the elapsed interval and sent
to the client. Data collection is per node, i.e., for all eight
counters, the data for each node's CPU is accumulated separately and
returned to the PCP client.
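
For clarity, the interval correction is the usual perf time-scaling:
with PERF_FORMAT_TOTAL_TIME_ENABLED and PERF_FORMAT_TOTAL_TIME_RUNNING
set, each read() returns {count, enabled, running}, and the raw delta is
scaled by enabled/running to compensate for counter multiplexing. A
minimal sketch equivalent to avg_it() in get_perf.c below:

#include <stdint.h>

/* prev[]/now[] hold {count, time_enabled, time_running} from read(2) */
static uint64_t scale_delta(const uint64_t prev[3], const uint64_t now[3])
{
    uint64_t delta = now[0] - prev[0];
    double enabled = (double)(now[1] - prev[1]);
    double running = (double)(now[2] - prev[2]);

    if (running <= 0 || running > enabled)  /* no scaling possible */
        return delta;
    return (uint64_t)(delta * (enabled / running));
}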

Additionally, for powerpc, the aggregated values are scaled to MiB. This
is done with the help of the ".scale" file that is present for every
"powerpc_nest_mcs_*::MCS_0*" event. This agent also exports the maximum
possible bandwidth on powerpc (exposed by the kernel through
"/proc/device-tree/nest-ima/dimm").

Comments/suggestions will be of great help.

Thanks,

Signed-off-by: Hemant Kumar <hemant@xxxxxxxxxxxxxxxxxx>
Signed-off-by: Deepthi Dharwar <deepthi@xxxxxxxxxxxxxxxxxx>
---
 src/pmdas/GNUmakefile                   |   2 +-
 src/pmdas/bandwidth/GNUmakefile         |  32 +++
 src/pmdas/bandwidth/GNUmakefile.install |  38 ++++
 src/pmdas/bandwidth/Install             |  40 ++++
 src/pmdas/bandwidth/Remove              |  12 +
 src/pmdas/bandwidth/bandwidth.c         | 295 +++++++++++++++++++++++++
 src/pmdas/bandwidth/get_perf.c          | 379 ++++++++++++++++++++++++++++++++
 src/pmdas/bandwidth/get_perf.h          |  14 ++
 src/pmdas/bandwidth/help                |  14 ++
 src/pmdas/bandwidth/pmns                |   8 +
 src/pmdas/bandwidth/root                |   9 +
 src/pmns/stdpmid.pcp                    |   1 +
 12 files changed, 843 insertions(+), 1 deletion(-)
 create mode 100644 src/pmdas/bandwidth/GNUmakefile
 create mode 100644 src/pmdas/bandwidth/GNUmakefile.install
 create mode 100755 src/pmdas/bandwidth/Install
 create mode 100755 src/pmdas/bandwidth/Remove
 create mode 100644 src/pmdas/bandwidth/bandwidth.c
 create mode 100644 src/pmdas/bandwidth/get_perf.c
 create mode 100644 src/pmdas/bandwidth/get_perf.h
 create mode 100644 src/pmdas/bandwidth/help
 create mode 100644 src/pmdas/bandwidth/pmns
 create mode 100644 src/pmdas/bandwidth/root

diff --git a/src/pmdas/GNUmakefile b/src/pmdas/GNUmakefile
index 595b00a..7a0b344 100644
--- a/src/pmdas/GNUmakefile
+++ b/src/pmdas/GNUmakefile
@@ -25,7 +25,7 @@ CPMDAS = root pmcd \
        mmv lmsensors process roomtemp summary etw \
        lustrecomm infiniband logger bash systemd \
        gfs2 jbd2 cifs rpm nvidia papi perfevent \
-        dm
+        dm bandwidth

 PLPMDAS = bonding netfilter zimbra postgresql \
        dbping memcache mysql vmware kvm \
diff --git a/src/pmdas/bandwidth/GNUmakefile b/src/pmdas/bandwidth/GNUmakefile
new file mode 100644
index 0000000..1f27cdc
--- /dev/null
+++ b/src/pmdas/bandwidth/GNUmakefile
@@ -0,0 +1,32 @@
+TOPDIR = ../../..
+include $(TOPDIR)/src/include/builddefs
+
+CFILES = bandwidth.c ../perfevent/architecture.c get_perf.c
+HFILES = ../perfevent/architecture.h get_perf.h
+CMDTARGET = pmdabandwidth$(EXECSUFFIX)
+LLDLIBS = $(PCP_PMDALIB) $(PFM_LIBS)
+LCFLAGS = -I.
+DFILES = help
+LSRCFILES = Install Remove pmns root $(DFILES) \
+       GNUmakefile.install
+
+IAM    = bandwidth
+DOMAIN = BANDWIDTH
+PMDADIR        = $(PCP_PMDAS_DIR)/$(IAM)
+
+LDIRT  = domain.h *.o \
+       $(IAM).log pmda$(IAM) pmda_$(IAM).so
+
+default_pcp default:   domain.h $(CMDTARGET)
+
+include $(BUILDRULES)
+
+install_pcp install:   default
+       $(INSTALL) -m 755 -d $(PMDADIR)
+       $(INSTALL) -m 755 Install Remove $(PMDADIR)
+       $(INSTALL) -m 644 GNUmakefile.install $(PMDADIR)/Makefile
+       $(INSTALL) -m 644 root pmns domain.h get_perf.h $(PMDADIR)
+       $(INSTALL) -m 644 $(CFILES) $(DFILES) $(PMDADIR)
+
+domain.h: ../../pmns/stdpmid
+       $(DOMAIN_MAKERULE)
diff --git a/src/pmdas/bandwidth/GNUmakefile.install b/src/pmdas/bandwidth/GNUmakefile.install
new file mode 100644
index 0000000..f50ae52
--- /dev/null
+++ b/src/pmdas/bandwidth/GNUmakefile.install
@@ -0,0 +1,38 @@
+SHELL  = sh
+
+ifdef PCP_CONF
+include $(PCP_CONF)
+else
+PCP_DIR = $(shell echo $$PCP_DIR)
+include $(PCP_DIR)/etc/pcp.conf
+endif
+include $(PCP_INC_DIR)/builddefs
+
+# remove -Lpath and -Ipath options from builddefs CFLAGS value
+#
+PCP_LIBS        =
+TMP             := $(CFLAGS:-I%=)
+ifdef PCP_DIR
+# put -Ipath and -Lpath back but use paths for run-time environment
+#
+CFLAGS          = $(TMP) -I$(PCP_INC_DIR)/..
+LDFLAGS         = -L$(PCP_LIB_DIR)
+else
+CFLAGS          = $(TMP)
+endif
+
+IAM    = bandwidth
+CFILES = $(IAM).c get_perf.c architecture.c
+
+LIBTARGET = pmda_$(IAM).$(DSOSUFFIX)
+CMDTARGET = pmda$(IAM)
+TARGETS = $(LIBTARGET) $(CMDTARGET)
+
+LLDLIBS        = -lpcp_pmda -lpcp $(LIB_FOR_MATH) $(LIB_FOR_PTHREADS)
+LDIRT  = *.log help.dir help.pag
+
+default: $(TARGETS)
+
+install: default
+
+include $(PCP_INC_DIR)/buildrules
diff --git a/src/pmdas/bandwidth/Install b/src/pmdas/bandwidth/Install
new file mode 100755
index 0000000..2f8b751
--- /dev/null
+++ b/src/pmdas/bandwidth/Install
@@ -0,0 +1,40 @@
+#! /bin/sh
+#
+# Install the bandwidth PMDA and/or PMNS
+#
+
+. $PCP_DIR/etc/pcp.env
+. $PCP_SHARE_DIR/lib/pmdaproc.sh
+
+iam=bandwidth
+pmda_interface=2
+forced_restart=true
+
+dso_opt=true
+perl_opt=false
+python_opt=false
+socket_opt=true
+socket_inet_def=2078
+
+# Set up the bandwidth PMDA InDom cache (domain number taken from domain.h)
+#
+domain=`sed -n <domain.h -e '/define BANDWIDTH/{
+s/[    ]*$//
+s/.*[  ]//
+p
+}'`
+if [ -z "$domain" ]
+then
+    echo "Arrgh ... cannot extract domain number from domain.h"
+    exit 1
+fi
+if [ -d $PCP_VAR_DIR/config/pmda ]
+then
+    touch $PCP_VAR_DIR/config/pmda/$domain.1
+    chown $PCP_USER:$PCP_GROUP $PCP_VAR_DIR/config/pmda/$domain.1
+    chmod 644 $PCP_VAR_DIR/config/pmda/$domain.1
+fi
+
+pmdaSetup
+pmdaInstall
+exit 0
diff --git a/src/pmdas/bandwidth/Remove b/src/pmdas/bandwidth/Remove
new file mode 100755
index 0000000..6067cfc
--- /dev/null
+++ b/src/pmdas/bandwidth/Remove
@@ -0,0 +1,12 @@
+#! /bin/sh
+# Remove the bandwidth PMDA
+#
+
+. $PCP_DIR/etc/pcp.env
+. $PCP_SHARE_DIR/lib/pmdaproc.sh
+
+iam=bandwidth
+
+pmdaSetup
+pmdaRemove
+exit 0
diff --git a/src/pmdas/bandwidth/bandwidth.c b/src/pmdas/bandwidth/bandwidth.c
new file mode 100644
index 0000000..2e6e1d0
--- /dev/null
+++ b/src/pmdas/bandwidth/bandwidth.c
@@ -0,0 +1,295 @@
+/*
+ * Memory bandwidth PMDA
+ */
+
+#include <pcp/pmapi.h>
+#include <pcp/impl.h>
+#include <pcp/pmda.h>
+#include "domain.h"
+#include <sys/stat.h>
+#include "get_perf.h"
+#include <unistd.h>
+#include <pthread.h>
+
+/*
+ * bandwidth PMDA
+ *
+ * Metrics
+ *      bandwidth.count                - fetches the memory read/write count values
+ */
+
+/*
+ * instance domains
+ */
+static pmdaIndom indomtab[] = {
+#define COUNT_INDOM    0       /* serial number for "count" instance domain */
+    { COUNT_INDOM, 0, NULL },
+};
+
+static pmInDom  *count_indom = &indomtab[COUNT_INDOM].it_indom;
+
+/*
+ * Definition for count metric for "bandwidth"
+ */
+static pmdaMetric metrictab[] = {
+/* count */
+#ifdef __x86_64__
+    { NULL,
+      { PMDA_PMID(0,0), PM_TYPE_U64, COUNT_INDOM, PM_SEM_COUNTER,
+        PMDA_PMUNITS(0,0,0,0,0,0) }, },
+#elif defined(__PPC64__)
+    { NULL,
+      { PMDA_PMID(0,0), PM_TYPE_U64, COUNT_INDOM, PM_SEM_COUNTER,
+        PMDA_PMUNITS(0,0,0,PM_SPACE_MBYTE,0,0) }, },
+#endif
+/* max */
+    { NULL,
+      { PMDA_PMID(0,1), PM_TYPE_U64, PM_INDOM_NULL, PM_SEM_INSTANT,
+        PMDA_PMUNITS(1,-1,0,PM_SPACE_MBYTE,PM_TIME_SEC,0) }, },
+};
+
+static int     isDSO = 1;              /* =0 I am a daemon */
+static char    *username;
+
+/* data and function prototypes for dynamic instance domain "count" handling */
+#define NODE_NAME 10
+
+struct node {
+    int main_cpu;
+    int inst_id;
+    char node_name[NODE_NAME];
+};
+
+struct node *node_list;
+int nr_nodes;
+
+static void bandwidth_count_init(void);
+static void bandwidth_count_refresh(void);
+static void bandwidth_count_clear(void);
+
+archinfo_t *arch;
+
+static char    mypath[MAXPATHLEN];
+
+/* command line option handling - both short and long options */
+static pmLongOptions longopts[] = {
+    PMDA_OPTIONS_HEADER("Options"),
+    PMOPT_DEBUG,
+    PMDAOPT_DOMAIN,
+    PMDAOPT_LOGFILE,
+    PMDAOPT_USERNAME,
+    PMOPT_HELP,
+    PMDA_OPTIONS_TEXT("\nExactly one of the following options may appear:"),
+    PMDAOPT_INET,
+    PMDAOPT_PIPE,
+    PMDAOPT_UNIX,
+    PMDAOPT_IPV6,
+    PMDA_OPTIONS_END
+};
+static pmdaOptions opts = {
+    .short_options = "D:d:i:l:pu:U:6:?",
+    .long_options = longopts,
+};
+
+/*
+ * callback provided to pmdaFetch
+ */
+static int
+bandwidth_fetchCallBack(pmdaMetric *mdesc, unsigned int inst, pmAtomValue *atom)
+{
+    int                        sts;
+    __pmID_int         *idp = (__pmID_int *)&(mdesc->m_desc.pmid);
+
+    if (inst != PM_IN_NULL &&
+        !(idp->cluster == 0 && idp->item == 0) &&
+        !(idp->cluster == 0 && idp->item == 1))
+        return PM_ERR_INST;
+
+    if (idp->cluster == 0) {
+        if (idp->item == 0) {                       /* bandwidth.count */
+            struct node *n;
+            if ((sts = pmdaCacheLookup(*count_indom, inst, NULL, (void *)&n))
+                != PMDA_CACHE_ACTIVE) {
+                if (sts < 0)
+                    __pmNotifyErr(LOG_ERR, "pmdaCacheLookup failed: inst=%d: %s",
+                                  inst, pmErrStr(sts));
+                return PM_ERR_INST;
+            }
+            atom->ull = get_aggregate_vals(n->inst_id);
+        }
+        else if (idp->item == 1) {
+#ifdef __PPC64__
+            atom->ull = get_max_bandwidth();
+#else
+            return PM_ERR_PMID;
+#endif
+        }
+        else
+            return PM_ERR_PMID;
+    } else
+        return PM_ERR_PMID;
+
+    return 0;
+}
+
+/*
+ * Update the values before calling pmdaFetch()
+ */
+static int
+bandwidth_fetch(int numpmid, pmID pmidlist[], pmResult **resp, pmdaExt *pmda)
+{
+    bandwidth_count_refresh();
+    return pmdaFetch(numpmid, pmidlist, resp, pmda);
+}
+
+/*
+ * get values for bandwidth.count instances
+ */
+static void
+bandwidth_count_refresh(void)
+{
+    int ret;
+
+    ret = read_and_update_values();
+    if (ret < 0)
+        bandwidth_count_clear();
+}
+
+/*
+ * Clear the PMDA cache for "count" instances
+ */
+static void
+bandwidth_count_clear(void)
+{
+    int sts;
+
+    sts = pmdaCacheOp(*count_indom, PMDA_CACHE_INACTIVE);
+    if (sts < 0)
+        __pmNotifyErr(LOG_ERR, "pmdaCacheOp(INACTIVE) failed: indom=%s: %s",
+                      pmInDomStr(*count_indom), pmErrStr(sts));
+}
+
+/* parse the nodes configuration and setup pmdaCache for "count" */
+static void
+bandwidth_count_init(void)
+{
+    int i;
+    cpulist_t *zero;
+    int sts;
+
+    arch = get_architecture();
+    if (!arch) {
+        __pmNotifyErr(LOG_WARNING, "get_architecture failed\n");
+        return;
+    }
+
+    /* Get the number of nodes */
+    nr_nodes = arch->nnodes;
+
+    /* zero'th cpu for every node */
+    zero = &(arch->cpunodes[0]);
+    /* Setup node_list */
+    node_list = calloc(nr_nodes, sizeof *node_list);
+    if (!node_list) {
+        __pmNotifyErr(LOG_WARNING, "node_list init failed\n");
+        free(arch);
+        return;
+    }
+
+    /* Initialize, open and enable all the counters */
+    sts = initiate_perf_count(nr_nodes, zero);
+    if (sts < 0) {
+        __pmNotifyErr(LOG_WARNING, "initiate_perf_count failed, %d\n", sts);
+        free(node_list);
+        free(arch);
+        return;
+    }
+
+    /* Initialize the nodes */
+    for (i = 0; i < zero->count; i++) {
+        node_list[i].main_cpu = *(zero->index + i);
+        node_list[i].inst_id = i;
+        snprintf(node_list[i].node_name, NODE_NAME, "node%d", i);
+        sts = pmdaCacheStore(*count_indom, PMDA_CACHE_ADD,
+                             node_list[i].node_name, (void *)&(node_list[i]));
+        if (sts < 0) {
+            __pmNotifyErr(LOG_ERR, "pmdaCacheStore failed : %s\n", pmErrStr(sts));
+            return;
+        }
+    }
+
+    if (pmdaCacheOp(*count_indom, PMDA_CACHE_SIZE_ACTIVE) < 1)
+        __pmNotifyErr(LOG_WARNING, "\"count\" instance domain is empty");
+}
+
+static void destroy_nodes(void)
+{
+    if (node_list)
+        free(node_list);
+    free(arch);
+}
+
+/*
+ * Initialize the agent (both daemon and DSO).
+ */
+void
+bandwidth_init(pmdaInterface *dp)
+{
+    if (isDSO) {
+        int sep = __pmPathSeparator();
+        snprintf(mypath, sizeof(mypath), "%s%c" "bandwidth" "%c" "help",
+                 pmGetConfig("PCP_PMDAS_DIR"), sep, sep);
+        pmdaDSO(dp, PMDA_INTERFACE_2, "bandwidth DSO", mypath);
+    } else {
+        __pmSetProcessIdentity(username);
+    }
+
+    if (dp->status != 0)
+        return;
+
+    dp->version.any.fetch = bandwidth_fetch;
+
+    pmdaSetFetchCallBack(dp, bandwidth_fetchCallBack);
+
+    pmdaInit(dp, indomtab, sizeof(indomtab)/sizeof(indomtab[0]), metrictab,
+             sizeof(metrictab)/sizeof(metrictab[0]));
+}
+
+/*
+ * Set up the agent if running as a daemon.
+ */
+int
+main(int argc, char **argv)
+{
+    int                        sep = __pmPathSeparator();
+    pmdaInterface      dispatch;
+
+    isDSO = 0;
+    __pmSetProgname(argv[0]);
+    __pmGetUsername(&username);
+
+    snprintf(mypath, sizeof(mypath), "%s%c" "bandwidth" "%c" "help",
+             pmGetConfig("PCP_PMDAS_DIR"), sep, sep);
+    pmdaDaemon(&dispatch, PMDA_INTERFACE_2, pmProgname, BANDWIDTH,
+               "bandwidth.log", mypath);
+
+    pmdaGetOptions(argc, argv, &opts, &dispatch);
+    if (opts.errors) {
+        pmdaUsageMessage(&opts);
+        exit(1);
+    }
+    if (opts.username)
+        username = opts.username;
+
+    pmdaOpenLog(&dispatch);
+    pmdaConnect(&dispatch);
+
+    bandwidth_init(&dispatch);
+    bandwidth_count_init();
+
+    pmdaMain(&dispatch);
+
+    cleanup_counters();
+    destroy_nodes();
+
+    exit(0);
+}
diff --git a/src/pmdas/bandwidth/get_perf.c b/src/pmdas/bandwidth/get_perf.c
new file mode 100644
index 0000000..ee5975e
--- /dev/null
+++ b/src/pmdas/bandwidth/get_perf.c
@@ -0,0 +1,379 @@
+/*
+ * Helper file to open, read and close perf counters for memory bandwidth
+ * calculation.
+ * Currently, this file supports only x86 and ppc64 memory counters.
+ */
+#include <malloc.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <perfmon/pfmlib.h>
+#include <perfmon/pfmlib_perf_event.h>
+#include <errno.h>
+#include <unistd.h>
+#include "get_perf.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/utsname.h>
+
+#define MAX_DATA 3
+
+#ifdef __x86_64__
+char *events[] = {
+    "snbep_unc_imc0::UNC_M_CAS_COUNT:RD",
+    "snbep_unc_imc0::UNC_M_CAS_COUNT:WR",
+    "snbep_unc_imc1::UNC_M_CAS_COUNT:RD",
+    "snbep_unc_imc1::UNC_M_CAS_COUNT:WR",
+    "snbep_unc_imc2::UNC_M_CAS_COUNT:RD",
+    "snbep_unc_imc2::UNC_M_CAS_COUNT:WR",
+    "snbep_unc_imc3::UNC_M_CAS_COUNT:RD",
+    "snbep_unc_imc3::UNC_M_CAS_COUNT:WR"
+};
+#define NR_EVENTS 8
+
+#elif defined(__PPC64__)
+char *events[] = {
+    "powerpc_nest_mcs_read::MCS_00",
+    "powerpc_nest_mcs_read::MCS_01",
+    "powerpc_nest_mcs_read::MCS_02",
+    "powerpc_nest_mcs_read::MCS_03",
+    "powerpc_nest_mcs_write::MCS_00",
+    "powerpc_nest_mcs_write::MCS_01",
+    "powerpc_nest_mcs_write::MCS_02",
+    "powerpc_nest_mcs_write::MCS_03"
+};
+#define NR_EVENTS 8
+#define SCALE "/sys/bus/event_source/devices/Nest_MCS_Read_BW/events/MCS_00.scale"
+#define DIMM "/proc/device-tree/nest-ima/dimm"
+#define PPC64LE "ppc64le"
+
+/*
+ * Currently the number of DIMMs per node is hardcoded as 4, since there
+ * are 4 DIMMs per node on power8 systems. As this may not hold for all
+ * power8 systems, going forward we will add a detection mechanism that
+ * finds the populated DIMMs in a node by parsing the device tree.
+ */
+#define DIMMS_PER_NODE 4
+#else
+/* For unsupported architectures */
+char **events = NULL;
+#define NR_EVENTS 0
+#endif
+
+struct cpu_list {
+    int cpu;
+    int fd;
+    uint64_t vals[MAX_DATA];
+    uint64_t prev_vals[MAX_DATA];
+    uint64_t result;
+};
+
+struct counter {
+    char *name;
+    struct cpu_list *cpus;
+    int nr_cpus;
+    perf_event_attr_t attr;
+};
+
+struct counter *counters = NULL;
+int nr_counters;
+
+void destroy_counters(void)
+{
+    int i, j;
+    struct cpu_list *cpu;
+
+    if (!counters)
+        return;
+    for (i = 0; i < NR_EVENTS; i++) {
+        cpu = counters[i].cpus;
+        if (cpu) {
+            for (j = 0; j < counters[i].nr_cpus; j++)
+                if (cpu[j].fd > -1)
+                    close(cpu[j].fd);
+            free(cpu);
+        }
+        if (counters[i].name)
+            free(counters[i].name);
+    }
+    free(counters);
+}
+
+/*
+ * Allocate memory for all the counter structures and prepare the
+ * perf_event_attr structures for all of them.
+ */
+static int initialize_counters(int nr_nodes, cpulist_t *zero)
+{
+    int i, j, ret, num = 0;
+    pfm_perf_encode_arg_t arg;
+    char *fstr = NULL;
+
+    if (NR_EVENTS == 0) {
+        fprintf(stderr, "Unsupported architecture\n");
+        return -EINVAL;
+    }
+    counters = calloc(NR_EVENTS, sizeof(struct counter));
+    if (!counters) {
+        fprintf(stderr, "Can't allocate memory to \"counters\"");
+        return -ENOMEM;
+    }
+    for (i = 0; i < NR_EVENTS; i++) {
+        counters[i].name = strdup(events[i]);
+        if (!counters[i].name) {
+            fprintf(stderr, "Not enough memory, counters[i].name : %s\n",
+                    counters[i].name);
+            return -ENOMEM;
+        }
+
+        /* for each counter, initialize the cpu list */
+        counters[i].cpus = calloc(nr_nodes, sizeof(*(counters[i].cpus)));
+        if (!counters[i].cpus) {
+            fprintf(stderr, "Not enough memory, counters[%d].cpu\n", i);
+            return -ENOMEM;
+        }
+        memset(&counters[i].attr, 0, sizeof(counters[i].attr));
+        /* for one cpu per node */
+        counters[i].nr_cpus = nr_nodes;
+
+        /* Initialize the cpu index for this counter */
+        for (j = 0; j < zero->count; j++)
+            counters[i].cpus[j].cpu = *(zero->index + j);
+
+        counters[i].attr.size = sizeof(counters[i].attr);
+        memset(&arg, 0, sizeof(arg));
+
+        arg.attr = &(counters[i].attr);
+        arg.fstr = &fstr;
+        ret = pfm_get_os_event_encoding(events[i], PFM_PLM0|PFM_PLM3,
+                                        PFM_OS_PERF_EVENT_EXT, &arg);
+        if (ret != PFM_SUCCESS) {
+            fprintf(stderr, "pfm_get_os_event_encoding failed, ret : %d, %s\n",
+                    ret, pfm_strerror(ret));
+            if (fstr)
+                free(fstr);
+            return -EINVAL;
+        }
+        counters[i].attr.disabled = 1;
+        free(fstr);
+        counters[i].attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
+            PERF_FORMAT_TOTAL_TIME_RUNNING;
+        num++;
+    }
+    return num;
+}
+
+/*
+ * Enable/Disable the counters
+ */
+static int toggle_counters(int op)
+{
+    int i, k, ret = 0;
+
+    for (i = 0; i < NR_EVENTS; i++)
+        for (k = 0; k < counters[i].nr_cpus; k++) {
+            ret = ioctl(counters[i].cpus[k].fd, op, 0);
+            if (ret == -1)
+                fprintf(stderr, "ioctl() failed for cpu : %d\n",
+                        counters[i].cpus[k].cpu);
+        }
+    return ret;
+}
+
+void cleanup_counters(void)
+{
+    int ret;
+
+    ret = toggle_counters(PERF_EVENT_IOC_DISABLE);
+    if (ret == -1)
+        fprintf(stderr, "toggle_counters failed for  
PERF_EVENT_IOC_DISABLE\n");
+    destroy_counters();
+}
+
+static int open_counters(void)
+{
+    int i, k;
+
+    for (i = 0; i < NR_EVENTS; i++) {
+        for (k = 0; k < counters[i].nr_cpus; k++) {
+            /* open the counter */
+            counters[i].cpus[k].fd =
+                perf_event_open(&(counters[i].attr), -1,
+                                counters[i].cpus[k].cpu, -1, 0);
+            if (counters[i].cpus[k].fd == -1) {
+                fprintf(stderr, "perf_event_open failed : %s, %d, i: %d, k: 
%d\n",
+                        strerror(errno), counters[i].cpus[k].cpu, i, k);
+                return -EINVAL;
+            }
+        }
+    }
+    return 0;
+}
+
+int initiate_perf_count(int nr_nodes, cpulist_t *zero)
+{
+    int ret;
+
+    pfm_initialize();
+    ret = initialize_counters(nr_nodes, zero);
+    if (ret < 1) {
+        fprintf(stderr, "initiate_perf_count failed: %d\n", ret);
+        destroy_counters();
+        return ret;
+    }
+    ret = open_counters();
+    if (ret < 0) {
+        destroy_counters();
+        return ret;
+    }
+    ret = toggle_counters(PERF_EVENT_IOC_ENABLE);
+    if (ret == -1) {
+        fprintf(stderr, "toggle_counters failed for PERF_EVENT_IOC_ENABLE\n");
+        destroy_counters();
+        return ret;
+    }
+    return 0;
+}
+
+/*
+ * Scale the delta value according to enabled time of the counter
+ * and running time.
+ */
+uint64_t avg_it(uint64_t *prev, uint64_t *new)
+{
+    uint64_t delta;
+    double running, enabled, scale;
+
+    delta = new[0] - prev[0];
+    running = new[1] - prev[1];
+    enabled = new[2] - prev[2];
+
+    memcpy(prev, new, MAX_DATA * sizeof(*prev));   /* copy values, not bytes */
+
+    if (running == 0 || (running > enabled))
+        return delta;
+
+    scale = enabled/running;
+    return delta * scale;
+}
+
+int read_and_update_values(void)
+{
+    uint64_t data[MAX_DATA];
+    int i, k, j, ret = 0;
+
+    memset(data, 0, sizeof(data));
+    for (i = 0; i < NR_EVENTS; i++) {
+        for (k = 0; k < counters[i].nr_cpus; k++) {
+            ret = read(counters[i].cpus[k].fd, data,
+                       MAX_DATA * sizeof(uint64_t));
+            if (ret < 0) {
+                fprintf(stderr, "ret : %d, err: %s\n", ret, strerror(errno));
+                return ret;
+            }
+            counters[i].cpus[k].result += avg_it(counters[i].cpus[k].vals,
+                                                 data);
+            for (j = 1; j < MAX_DATA; j++)
+                counters[i].cpus[k].vals[j] = data[j];
+        }
+    }
+    return ret;
+}
+
+uint64_t get_aggregate_vals(int cpu)
+{
+    uint64_t value = 0;
+    int i;
+
+    for (i = 0; i < NR_EVENTS; i++)
+        value += counters[i].cpus[cpu].result;
+
+#ifdef __PPC64__
+    double scale;
+    int ret;
+
+    ret = get_scale(&scale);
+    if (ret == 0)
+        value = (double)value * scale;
+#endif
+    return value;
+}
+
+#ifdef __PPC64__
+/*
+ * Read the ".scale" file for any counter and find out the scaling
+ * factor.
+ */
+int get_scale(double *result)
+{
+    FILE *fp;
+    char *str = NULL, *ptr, *p;
+    size_t n = 0;
+    int ret;
+
+    fp = fopen(SCALE, "r");
+    if (!fp) {
+        fprintf(stderr, "Error in opening file : %s\n", SCALE);
+        return -1;
+    }
+    ret = getline(&str, &n, fp);
+    if (ret < 0) {
+        *result = 0;
+        free(str);
+        fclose(fp);
+        return -1;
+    }
+    ptr = strchr(str, '\n');
+    if (ptr)
+        *ptr = '\0';
+
+    *result = strtod(str, &p);
+    free(str);
+    fclose(fp);
+    return 0;
+}
+
+/*
+ * Maximum bandwidth exposed in device tree, go and read it.
+ */
+uint64_t get_max_bandwidth(void)
+{
+    unsigned swapped, value;
+    FILE *fp;
+    struct stat st;
+    int ret;
+    struct utsname name;
+
+    ret = stat(DIMM, &st);
+    if (ret < 0) {
+        fprintf(stderr, "Could not stat : %s", DIMM);
+        return -1;
+    }
+    fp = fopen(DIMM, "rb");
+    if (fp == NULL) {
+        fprintf(stderr, "Could not open : %s\n", DIMM);
+        return -1;
+    }
+    /* Reading 4 bytes */
+    ret = fread(&value, 4, 1, fp);
+    fclose(fp);
+    if (ret == 0) {
+        fprintf(stderr, "fread, couldn't read the max value\n");
+        return -1;
+    }
+
+    ret = uname(&name);
+    if (ret < 0) {
+        fprintf(stderr, "uname failed, ret : %d\n", ret);
+        return -1;
+    }
+    if (strncmp(name.machine, PPC64LE, strlen(PPC64LE)))
+        return DIMMS_PER_NODE * value;
+
+    /* Swapping only for ppc64 LE machines */
+    value >>= 16;
+
+    swapped = ((value>>24)&0xff) | ((value<<8)&0xff0000) | ((value>>8)&0xff00) |
+        ((value<<24)&0xff000000);
+    swapped >>= 16;
+
+    return DIMMS_PER_NODE * swapped;
+}
+#endif
diff --git a/src/pmdas/bandwidth/get_perf.h b/src/pmdas/bandwidth/get_perf.h
new file mode 100644
index 0000000..74390bc
--- /dev/null
+++ b/src/pmdas/bandwidth/get_perf.h
@@ -0,0 +1,14 @@
+#include "../perfevent/architecture.h"
+#include <pcp/pmapi.h>
+#include <pcp/impl.h>
+#include <pcp/pmda.h>
+
+int read_and_update_values(void);
+int initiate_perf_count(int nr_nodes, cpulist_t *zero);
+uint64_t get_aggregate_vals(int node);
+void cleanup_counters(void);
+
+#ifdef __PPC64__
+uint64_t get_max_bandwidth(void);
+int get_scale(double *result);
+#endif
diff --git a/src/pmdas/bandwidth/help b/src/pmdas/bandwidth/help
new file mode 100644
index 0000000..b8d08df
--- /dev/null
+++ b/src/pmdas/bandwidth/help
@@ -0,0 +1,14 @@
+# bandwidth PMDA help file
+#
+
+@ BANDWIDTH.0 Instance domain "count" for bandwidth PMDA
+dynamic instances
+
+@ bandwidth.count Aggregated memory read/write counts per node
+This metric has one instance per node on this machine.
+
+The metric values cannot be altered using pmstore(1).
+
+@ bandwidth.max Maximum memory bandwidth per node in a system.
+The per-DIMM value is read from the device tree and then processed to
+find the per-node value.
\ No newline at end of file
diff --git a/src/pmdas/bandwidth/pmns b/src/pmdas/bandwidth/pmns
new file mode 100644
index 0000000..1ec78c6
--- /dev/null
+++ b/src/pmdas/bandwidth/pmns
@@ -0,0 +1,8 @@
+/*
+ * Metrics for bandwidth PMDA
+ */
+
+bandwidth {
+    count      BANDWIDTH:0:0
+    max                BANDWIDTH:0:1
+}
diff --git a/src/pmdas/bandwidth/root b/src/pmdas/bandwidth/root
new file mode 100644
index 0000000..7c08824
--- /dev/null
+++ b/src/pmdas/bandwidth/root
@@ -0,0 +1,9 @@
+/*
+ * fake "root" for validating the local PMNS subtree
+ */
+
+#include <stdpmid>
+
+root { bandwidth }
+
+#include "pmns"
diff --git a/src/pmns/stdpmid.pcp b/src/pmns/stdpmid.pcp
index 4752107..9245e4e 100644
--- a/src/pmns/stdpmid.pcp
+++ b/src/pmns/stdpmid.pcp
@@ -111,6 +111,7 @@ GPFS                135
 CEPH           136
 JSON           137
 MIC            138
+BANDWIDTH      237
 ### NEXT FREE SLOT ###
 SCHIZO         241
 SLOW_PYTHON    242
--
1.9.3


