
[RFC PATCH] Metric to get the maximum memory bandwidth per node on x86

To: pcp@xxxxxxxxxxx
Subject: [RFC PATCH] Metric to get the maximum memory bandwidth per node on x86
From: Hemant Kumar <hemant@xxxxxxxxxxxxxxxxxx>
Date: Mon, 25 Jan 2016 09:11:59 +0530
Cc: naveen.n.rao@xxxxxxxxxxxxxxxxxx, nathans@xxxxxxxxxx, hkshaw.lk@xxxxxxxxx
Delivered-to: pcp@xxxxxxxxxxx
Hi All,

Getting the maximum memory bandwidth per node on Intel machines isn't a
trivial problem. This patch is intended to start a discussion on how to
obtain this metric.

Background:
We are already working on v2 patches that report the current memory
bandwidth on Intel machines by extending the perfevent agent. However,
while the current memory bandwidth is an interesting metric in itself, it
doesn't help much in judging the efficiency of a node in a system with
respect to its memory bandwidth. To make any placement decision for a
node, the maximum memory bandwidth is a highly useful metric. An important
use case (related to OpenStack) can be found at this link:
http://oss.sgi.com/archives/pcp/2015-07/msg00051.html

Issues:
I couldn't find the maximum memory bandwidth exported by the kernel or
available through any other means. I tried to get that information from
DMI, which is generally accessible via /dev/mem. The information I found
in DMI (as listed by "dmidecode") is the speed of a memory device (in MHz)
and the total width of its channel (i.e., 64 bits).
Even if we take [speed * width] as the theoretical maximum bandwidth of a
memory device, we still need a mechanism to find out how many memory
devices are populated on each node in order to compute the maximum
bandwidth per node. But, as I understand it, the relation between nodes
and memory devices isn't exported by DMI(?). Also, is it safe to assume
that the bandwidth we get from [speed * width] is the theoretical maximum
for a memory device, given that memory devices may be operated in
different configurations? Is there any other way to retrieve the maximum
memory bandwidth of a node in a system?
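
For reference, the kind of per-DIMM arithmetic I have in mind is sketched
below. This is purely illustrative and assumes that the DMI "Speed" field
is the transfer rate in MT/s and that the data width is reported in bits;
for example, DDR3-1600 on a 64-bit channel gives 1600 * 64 / 8 = 12800 MB/s
(i.e. PC3-12800). The per-node figure would then also need the number of
populated channels/DIMMs on that node, which is exactly the piece DMI
doesn't seem to relate to nodes.

    #include <stdio.h>

    /* Theoretical peak bandwidth of one memory device, in MB/s,
     * from its transfer rate (MT/s) and data width (bits). */
    static unsigned long dimm_peak_mbps(unsigned long speed_mts,
                                        unsigned int width_bits)
    {
        return speed_mts * (width_bits / 8);
    }

    int main(void)
    {
        /* DDR3-1600 on a 64-bit data width -> 12800 MB/s */
        printf("%lu MB/s\n", dimm_peak_mbps(1600, 64));
        return 0;
    }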

Solutions/Alternatives:
Given the issues listed above, can we switch to an alternative where the
client/user configures the maximum bandwidth per node, similar to what
this patch does? Or are there other alternatives that would help solve
this problem?

What this patch does:
This patch is not a solution for discovering the memory bandwidth, but
rather an alternative. It relies on a bandwidth.conf file that a
user/client can fill in according to their system's configuration, in the
format "node:bandwidth". The Linux PMDA then reads this file and exposes
the metric "hinv.node.max_memory_bandwidth" on a per-node basis. A
hypothetical example is sketched below.

Any suggestions/thoughts are welcome.

Thanks!
- Hemant Kumar

Signed-off-by: Hemant Kumar <hemant@xxxxxxxxxxxxxxxxxx>
---
 src/pmdas/linux/GNUmakefile     |  8 +++---
 src/pmdas/linux/bandwidth.conf  |  1 +
 src/pmdas/linux/mem_bandwidth.c | 59 +++++++++++++++++++++++++++++++++++++++++
 src/pmdas/linux/mem_bandwidth.h |  1 +
 src/pmdas/linux/pmda.c          | 15 +++++++++++
 src/pmdas/linux/root_linux      |  3 ++-
 6 files changed, 82 insertions(+), 5 deletions(-)
 create mode 100644 src/pmdas/linux/bandwidth.conf
 create mode 100644 src/pmdas/linux/mem_bandwidth.c
 create mode 100644 src/pmdas/linux/mem_bandwidth.h

diff --git a/src/pmdas/linux/GNUmakefile b/src/pmdas/linux/GNUmakefile
index 93e6f40..1e273bd 100644
--- a/src/pmdas/linux/GNUmakefile
+++ b/src/pmdas/linux/GNUmakefile
@@ -35,7 +35,7 @@ CFILES                = pmda.c \
                  proc_slabinfo.c proc_sys_fs.c proc_vmstat.c \
                  sysfs_kernel.c linux_table.c numa_meminfo.c \
                  proc_net_netstat.c namespaces.c proc_net_softnet.c \
-                 proc_net_snmp6.c
+                 proc_net_snmp6.c mem_bandwidth.c
 
 HFILES         = clusters.h indom.h convert.h \
                  proc_stat.h proc_meminfo.h proc_loadavg.h \
@@ -46,11 +46,11 @@ HFILES              = clusters.h indom.h convert.h \
                  proc_slabinfo.h proc_sys_fs.h proc_vmstat.h \
                  sysfs_kernel.h linux_table.h numa_meminfo.h \
                  proc_net_netstat.h namespaces.h proc_net_softnet.h \
-                 proc_net_snmp6.h
+                 proc_net_snmp6.h mem_bandwidth.h
 
 VERSION_SCRIPT = exports
 HELPTARGETS    = help.dir help.pag
-LSRCFILES      = help root_linux proc_net_snmp_migrate.conf linux_kernel_ulong.conf
+LSRCFILES      = help root_linux proc_net_snmp_migrate.conf linux_kernel_ulong.conf bandwidth.conf
 LDIRT          = $(HELPTARGETS) domain.h $(VERSION_SCRIPT) linux_kernel_ulong.conf
 
 LLDLIBS                = $(PCP_PMDALIB)
@@ -74,7 +74,7 @@ build-me: $(LIBTARGET) $(CMDTARGET) $(HELPTARGETS) $(LSRCFILES)
 
 install: default
        $(INSTALL) -m 755 -d $(PMDADIR)
-       $(INSTALL) -m 644 domain.h help $(HELPTARGETS) $(PMDADIR)
+       $(INSTALL) -m 644 domain.h help bandwidth.conf $(HELPTARGETS) $(PMDADIR)
        $(INSTALL) -m 755 $(LIBTARGET) $(CMDTARGET) $(PMDADIR)
        $(INSTALL) -m 644 root_linux $(PCP_VAR_DIR)/pmns/root_linux
        $(INSTALL) -m 644 proc_net_snmp_migrate.conf $(LOGREWRITEDIR)/linux_proc_net_snmp_migrate.conf
diff --git a/src/pmdas/linux/bandwidth.conf b/src/pmdas/linux/bandwidth.conf
new file mode 100644
index 0000000..4dcd7b4
--- /dev/null
+++ b/src/pmdas/linux/bandwidth.conf
@@ -0,0 +1 @@
+# node:max_memory_bandwidth
diff --git a/src/pmdas/linux/mem_bandwidth.c b/src/pmdas/linux/mem_bandwidth.c
new file mode 100644
index 0000000..91a2fc6
--- /dev/null
+++ b/src/pmdas/linux/mem_bandwidth.c
@@ -0,0 +1,59 @@
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define FILE_ERR -1
+#define BANDWIDTH_CONFIG "/var/lib/pcp/pmdas/linux/bandwidth.conf"
+
+long int find_max_bandwidth(int node)
+{
+    size_t len = 0;
+    char *str, *line = NULL;
+    FILE *fp = fopen(BANDWIDTH_CONFIG, "r");
+    ssize_t ret;
+    int fnode;
+    long unsigned bandwidth;
+
+    if (NULL == fp) {
+       fprintf(stderr, "Error in opening %s\n", BANDWIDTH_CONFIG);
+       return FILE_ERR;
+    }
+
+    do {
+       ret = getline(&line, &len, fp);
+       if (ret > 0) {
+           /* Ignore the comments */
+           if (line[0] == '#') {
+               continue;
+           }
+           /* Remove the new line from the end of the string here */
+           if (line[strlen(line) - 1] == '\n')
+               line[strlen(line) - 1] = '\0';
+           
+           str = strchr(line, ':');
+           if (NULL == str) {
+               fprintf(stderr, "Error in file format\n");
+               free(line);
+               fclose(fp);
+               return FILE_ERR;
+           }
+
+           *str = '\0';
+           str++;
+
+           fnode = atoi(line);
+           bandwidth = atoi(str);
+           
+           if (fnode == node) {
+               free(line);
+               fclose(fp);
+               return bandwidth;
+           }
+       }
+    } while (ret != -1);
+
+    if (line)
+       free(line);
+    fclose(fp);
+    return FILE_ERR;
+}
diff --git a/src/pmdas/linux/mem_bandwidth.h b/src/pmdas/linux/mem_bandwidth.h
new file mode 100644
index 0000000..a6c527a
--- /dev/null
+++ b/src/pmdas/linux/mem_bandwidth.h
@@ -0,0 +1 @@
+long int find_max_bandwidth(int node);
diff --git a/src/pmdas/linux/pmda.c b/src/pmdas/linux/pmda.c
index 1304718..53dc9d0 100644
--- a/src/pmdas/linux/pmda.c
+++ b/src/pmdas/linux/pmda.c
@@ -63,6 +63,7 @@
 #include "interrupts.h"
 #include "ipc.h"
 #include "proc_net_softnet.h"
+#include "mem_bandwidth.h"
 
 static proc_stat_t             proc_stat;
 static proc_meminfo_t          proc_meminfo;
@@ -3536,6 +3537,11 @@ static pmdaMetric metrictab[] = {
    { PMDA_PMID(CLUSTER_SYSFS_DEVICES, 1), PM_TYPE_U32, NODE_INDOM, PM_SEM_INSTANT,
     PMDA_PMUNITS(0,0,0,0,0,0) } },
 
+/* hinv.node.max_memory_bandwidth */
+  { NULL,
+    { PMDA_PMID(CLUSTER_SYSFS_DEVICES, 2), PM_TYPE_U32, NODE_INDOM, PM_SEM_INSTANT,
+    PMDA_PMUNITS(0,0,0,0,0,0) } },
+
 /*
  * semaphore limits cluster
  * Cluster added by Mike Mason <mmlnx@xxxxxxxxxx>
@@ -4638,6 +4644,7 @@ linux_fetchCallBack(pmdaMetric *mdesc, unsigned int inst, pmAtomValue *atom)
     net_addr_t         *addrp;
     net_interface_t    *netip;
     scsi_entry_t       *scsi_entry;
+    int max_bw;
 
     if (mdesc->m_user != NULL) {
        /* 
@@ -5837,6 +5844,14 @@ linux_fetchCallBack(pmdaMetric *mdesc, unsigned int inst, pmAtomValue *atom)
                return PM_ERR_INST;
            atom->ul = refresh_sysfs_online(inst, "node");
            break;
+       case 2: /* hinv.node.max_memory_bandwidth */
+           if (inst >= proc_cpuinfo.node_indom->it_numinst)
+               return PM_ERR_INST;
+           max_bw = find_max_bandwidth(inst);
+           if (max_bw <= 0)
+               return PM_ERR_INST;
+           atom->ul = max_bw;
+           break;
 
        default:
            return PM_ERR_PMID;
diff --git a/src/pmdas/linux/root_linux b/src/pmdas/linux/root_linux
index 2f18b84..3b7ffca 100644
--- a/src/pmdas/linux/root_linux
+++ b/src/pmdas/linux/root_linux
@@ -71,7 +71,8 @@ hinv.cpu {
 }
 
 hinv.node {
-    online             60:55:1
+    online              60:55:1
+    max_memory_bandwidth 60:55:2
 }
 
 kernel {
-- 
1.9.3
