pcp
[Top] [All Lists]

[PATCH v2] pmdalinux: Add a new metric for Maximum Memory Bandwidth per numa node

To: pcp@xxxxxxxxxxx
Subject: [PATCH v2] pmdalinux: Add a new metric for Maximum Memory Bandwidth per numa node
From: Hemant Kumar <hemant@xxxxxxxxxxxxxxxxxx>
Date: Wed, 11 May 2016 02:41:56 +0530
Cc: nathans@xxxxxxxxxx
Delivered-to: pcp@xxxxxxxxxxx
This patch adds a new metric to monitor the maximum memory bandwidth per
numa node. This metric, for now, is limited by a config file called
bandwidth.conf which contains the node bandwidth information. For e.g.:
node0:40960
node1:40960
...
Each row represents a numa node of the system along with the maximum
memory bandwidth (in MB/sec) it supports. The maximum memory bandwidth
can be found using several benchmarking tools by saturating and
measuring the bandwidth.

pmdalinux agent parses the config file and checks whether the node is
present in sysfs/devices/system/node/ directory. The node name in the
config file must match the name of any of the nodes in node/ directory.
The bandwidth value is taken from this config file and updated in the
node_info struct for each node.

  # pminfo | grep bandwidth
  mem.numa.max_bandwidth

  # pmval mem.numa.max_bandwidth

  metric:    mem.numa.max_bandwidth
  host:      <some_host>
  semantics: instantaneous value
  units:     Mbyte / sec
  samples:   all

                  node0                 node1
              4.096E+04             4.096E+04
              4.096E+04             4.096E+04
  ...

A few things to note:
 - The user/client can run some benchmarking tools to saturate the
   bandwidth and can update this information in the bandwidth.conf file.
 - The max bandwidth value can be given as a floating point.
 - The node names mentioned in bandwidth.conf must match one of the node
   names found in the sysfs/devices/system/node/ directory.
 - Right now, automatic update of the max bandwidth is not supported,
   due to the lack of standard, architecture-independent tools.
 - Support for automatic updates of the max bandwidth using some
   benchmarking tools will be added later.

Purpose of this metric:
As of now, we have hardware counters for measuring the current memory
bandwidth (read and write) and that can be aggregated per
node. "perfevent" agent for PCP can be used for that. However, to make
decisions regarding placement/migration of workloads across nodes(or
systems) solely based on the current bandwidth is not sufficient. We
also need the maximum bandwidth supported on the nodes to find out the
utilization. And hence, the maximum bandwidth can be used for this
purpose.

Signed-off-by: Hemant Kumar <hemant@xxxxxxxxxxxxxxxxxx>
---
Changes since v1:
- Moved the metric to linux pmda.

 src/pmdas/linux/GNUmakefile     |   4 +-
 src/pmdas/linux/bandwidth.conf  |  11 +++
 src/pmdas/linux/help            |   1 +
 src/pmdas/linux/mem_bandwidth.c | 154 ++++++++++++++++++++++++++++++++++++++++
 src/pmdas/linux/numa_meminfo.c  |  15 ++++
 src/pmdas/linux/numa_meminfo.h  |   4 +-
 src/pmdas/linux/pmda.c          |   9 +++
 src/pmdas/linux/root_linux      |   1 +
 8 files changed, 196 insertions(+), 3 deletions(-)
 create mode 100644 src/pmdas/linux/bandwidth.conf
 create mode 100644 src/pmdas/linux/mem_bandwidth.c

diff --git a/src/pmdas/linux/GNUmakefile b/src/pmdas/linux/GNUmakefile
index ccc9c60..01b1a25 100644
--- a/src/pmdas/linux/GNUmakefile
+++ b/src/pmdas/linux/GNUmakefile
@@ -35,7 +35,7 @@ CFILES                = pmda.c \
                  proc_slabinfo.c proc_sys_fs.c proc_vmstat.c \
                  sysfs_kernel.c linux_table.c numa_meminfo.c \
                  proc_net_netstat.c namespaces.c proc_net_softnet.c \
-                 proc_net_snmp6.c
+                 proc_net_snmp6.c mem_bandwidth.c
 
 HFILES         = clusters.h indom.h convert.h \
                  proc_stat.h proc_meminfo.h proc_loadavg.h \
@@ -76,7 +76,7 @@ build-me: $(LIBTARGET) $(CMDTARGET) $(HELPTARGETS) 
$(LSRCFILES)
 
 install: default
        $(INSTALL) -m 755 -d $(PMDADIR)
-       $(INSTALL) -m 644 domain.h help $(HELPTARGETS) $(PMDADIR)
+       $(INSTALL) -m 644 domain.h help bandwidth.conf $(HELPTARGETS) $(PMDADIR)
        $(INSTALL) -m 755 $(LIBTARGET) $(CMDTARGET) $(PMDADIR)
        $(INSTALL) -m 644 root_linux $(PCP_VAR_DIR)/pmns/root_linux
        $(INSTALL) -m 644 proc_net_snmp_migrate.conf 
$(LOGREWRITEDIR)/linux_proc_net_snmp_migrate.conf
diff --git a/src/pmdas/linux/bandwidth.conf b/src/pmdas/linux/bandwidth.conf
new file mode 100644
index 0000000..e8b7b5c
--- /dev/null
+++ b/src/pmdas/linux/bandwidth.conf
@@ -0,0 +1,11 @@
+# bandwidth.conf
+# Syntax :
+# Numa_node_name:bandwidth
+# Numa_node_name must match a node name in the sysfs/devices/system/node/
+# directory (e.g. node0, node1, ...).
+# bandwidth is the maximum memory bandwidth supported for that numa node
+# and can be a floating point number.
+# Also, this conf file needs to have a Version string. Currently,
+# 1.0 is the supported version. This has to be at the beginning of this
+# file.
+#
+Version:1.0
diff --git a/src/pmdas/linux/help b/src/pmdas/linux/help
index be31372..6468fbd 100644
--- a/src/pmdas/linux/help
+++ b/src/pmdas/linux/help
@@ -703,6 +703,7 @@ User memory (Kbytes) in pages not backed by files, e.g. 
from malloc()
 @ mem.numa.alloc.interleave_hit count of times interleaving wanted to allocate 
on this node and succeeded
 @ mem.numa.alloc.local_node count of times a process ran on this node and got 
memory on this node
 @ mem.numa.alloc.other_node count of times a process ran on this node and got 
memory from another node
+@ mem.numa.max_bandwidth maximum memory bandwidth supported on each numa node
 @ mem.vmstat.nr_dirty number of pages in dirty state
 Instantaneous number of pages in dirty state, from /proc/vmstat
 @ mem.vmstat.nr_dirty_background_threshold background writeback threshold
diff --git a/src/pmdas/linux/mem_bandwidth.c b/src/pmdas/linux/mem_bandwidth.c
new file mode 100644
index 0000000..fce484c
--- /dev/null
+++ b/src/pmdas/linux/mem_bandwidth.c
@@ -0,0 +1,154 @@
+/* Initializes the maximum memory bandwidth per numa node
+ *
+ * Copyright (c) 2016 Hemant K. Shaw, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "pmapi.h"
+#include "impl.h"
+#include "pmda.h"
+#include "indom.h"
+#include "proc_cpuinfo.h"
+#include "proc_stat.h"
+#include "numa_meminfo.h"
+
+#define VERSION_STR "Version"
+#define SUPP_VERSION "1.0"
+#define MAX_NAME_LEN 512
+
/*
 * Advance past leading whitespace in the range [start_ptr, end_ptr).
 *
 * Returns the first non-whitespace position, or end_ptr if the whole
 * range is whitespace.
 *
 * The original returned void and incremented only its by-value copy of
 * start_ptr, so callers observed no effect at all; returning the advanced
 * pointer makes the result usable while staying call-compatible (callers
 * may still ignore the return value).  The cast to unsigned char avoids
 * undefined behavior when isspace() is given a negative plain char.
 */
static char *skim_through_whitespace(char *start_ptr, char *end_ptr)
{
    while ((start_ptr != end_ptr) && isspace((unsigned char)*start_ptr))
	start_ptr++;
    return start_ptr;
}
+
/*
 * Map a config-file node name ("nodeN") to its numeric node id.
 *
 * Generates the candidate names node0 .. node<nr_nodes-1> and compares
 * with strcmp for an exact match.  The original used
 * strncmp(node_name, name, strlen(name)), which treated any prefix as a
 * match -- e.g. the bogus name "node" matched "node0" and returned 0.
 *
 * Returns the node id on success, -1 when the name matches no node.
 */
static int find_node_match(char *name, int nr_nodes)
{
    enum { NODE_NAME_LEN = 32 };	/* "node" + decimal id fits easily */
    char node_name[NODE_NAME_LEN];
    int i;

    for (i = 0; i < nr_nodes; i++) {
	snprintf(node_name, sizeof(node_name), "node%d", i);
	if (strcmp(node_name, name) == 0)
	    return i;
    }
    return -1;
}
+
/*
 * Validate a "Version:<value>" line from bandwidth.conf.
 *
 * Splits the line at the first ':' (writing a '\0' into the caller's
 * buffer), skips any whitespace after the colon, and accepts only the
 * supported version.  end marks the limit of the usable buffer.
 *
 * Returns 0 for a supported version line, -1 otherwise (a diagnostic is
 * written to stderr).
 *
 * Fixes vs. the original: the whitespace skip is done in place (the old
 * code called skim_through_whitespace(), which advanced only a local
 * copy of the pointer and therefore had no effect, so "Version: 1.0"
 * was wrongly rejected), and the error messages are newline-terminated.
 */
static int validate_conf_version(char *start, char *end)
{
    static const char key[] = "Version";	/* == VERSION_STR */
    static const char supported[] = "1.0";	/* == SUPP_VERSION */
    char *ptr;

    ptr = strchr(start, ':');
    if (ptr == NULL) {
	fprintf(stderr, "Version information missing in bandwidth.conf\n");
	return -1;
    }
    *ptr++ = '\0';
    /* skip whitespace between the colon and the version number */
    while (ptr != end && isspace((unsigned char)*ptr))
	ptr++;
    if (strncmp(start, key, sizeof(key) - 1) == 0 &&
	strncmp(ptr, supported, sizeof(supported) - 1) == 0)
	return 0;
    fprintf(stderr, "Unsupported bandwidth.conf version, expected version : %s\n",
	    supported);
    return -1;
}
+
+int get_memory_bandwidth_conf(numa_meminfo_t *numa_meminfo,
+                             int nr_nodes)
+{
+    size_t len = 0;
+    char *start_ptr, *end_ptr, *value_str, *line = NULL;
+    FILE *fp;
+    ssize_t ret = 0;
+    char *node_name;
+    int nodes_found = 0, id;
+    int version_found = 0;
+
+    fp = fopen(numa_meminfo->bandwidth_conf, "r");
+    if (NULL == fp) {
+       fprintf(stderr, "Error in opening %s\n", numa_meminfo->bandwidth_conf);
+       return -1;
+    }
+
+    while(ret >= 0) {
+       ret = getline(&line, &len, fp);
+       if (ret > 0) {
+           /* Ignore the comments */
+           if (line[0] == '#') {
+               continue;
+           }
+           /* Remove the new line from the end of the string here (if any) */
+           if (line[strlen(line) - 1] == '\n')
+               line[strlen(line) - 1] = '\0';
+
+           start_ptr = line;
+           end_ptr = start_ptr + strlen(line) - 1;
+
+           /* Ignore white-space */
+           skim_through_whitespace(start_ptr, end_ptr);
+
+           /* Verify the version information */
+           if (strstr(start_ptr, VERSION_STR)) {
+               ret = validate_conf_version(start_ptr, end_ptr);
+               if (ret < 0) {
+                   goto free_line;
+               } else {
+                   version_found = 1;
+                   continue;
+               }
+           }
+
+           if (!version_found) {
+               ret = -1;
+               fprintf(stderr, "Version needs to be specified at the beginning 
of bandwidth.conf file\n");
+               goto free_line;
+           }
+
+           value_str = strchr(line, ':');
+           if (NULL == value_str) {
+               ret = -1;
+               goto free_line;
+           }
+
+           *value_str = '\0';
+           value_str++;
+
+           node_name = start_ptr;
+
+           id = find_node_match(node_name, nr_nodes);
+           if (id == -1) {
+               fprintf(stderr, "Unknown node name provided in 
bandwidth.conf\n");
+               return -1;
+           }
+           numa_meminfo->node_info[id].bandwidth = atof(value_str);
+           nodes_found++;
+       }
+    }
+
+    if (nodes_found == nr_nodes)
+       ret = 0;
+
+ free_line:
+    if (line)
+       free(line);
+    fclose(fp);
+
+    return ret;
+}
diff --git a/src/pmdas/linux/numa_meminfo.c b/src/pmdas/linux/numa_meminfo.c
index 22d8351..6a2c815 100644
--- a/src/pmdas/linux/numa_meminfo.c
+++ b/src/pmdas/linux/numa_meminfo.c
@@ -82,6 +82,7 @@ int refresh_numa_meminfo(numa_meminfo_t *numa_meminfo, 
proc_cpuinfo_t *proc_cpui
     FILE *fp;
     pmdaIndom *idp = PMDAINDOM(NODE_INDOM);
     static int started;
+    int sep, ret;
 
     /* First time only */
     if (!started) {
@@ -110,6 +111,12 @@ int refresh_numa_meminfo(numa_meminfo_t *numa_meminfo, 
proc_cpuinfo_t *proc_cpui
            }
        }
 
+       sep = __pmPathSeparator();
+       snprintf(numa_meminfo->bandwidth_conf,
+                sizeof(numa_meminfo->bandwidth_conf),
+                "%s%c%s%c%s.conf", pmGetConfig("PCP_PMDAS_DIR"), sep, "linux",
+                sep, "bandwidth");
+
        numa_meminfo->node_indom = idp;
        started = 1;
     }
@@ -133,5 +140,13 @@ int refresh_numa_meminfo(numa_meminfo_t *numa_meminfo, 
proc_cpuinfo_t *proc_cpui
        }
     }
 
+    /* Read the bandwidth info from the bandwidth.conf file */
+    ret = get_memory_bandwidth_conf(numa_meminfo, idp->it_numinst);
+    if (ret < 0) {
+       fprintf(stderr, "%s: error in fetching bandwidth info\n",
+               __FUNCTION__);
+       return ret;
+    }
+
     return 0;
 }
diff --git a/src/pmdas/linux/numa_meminfo.h b/src/pmdas/linux/numa_meminfo.h
index 22c1289..913d0d7 100644
--- a/src/pmdas/linux/numa_meminfo.h
+++ b/src/pmdas/linux/numa_meminfo.h
@@ -21,12 +21,14 @@
 typedef struct {
     struct linux_table *meminfo;
     struct linux_table *memstat;
+    double bandwidth;
 } nodeinfo_t;
 
 typedef struct {
     nodeinfo_t *node_info;
     pmdaIndom  *node_indom;
+    char       bandwidth_conf[PATH_MAX];
 } numa_meminfo_t;
 
 extern int refresh_numa_meminfo(numa_meminfo_t *, proc_cpuinfo_t *, 
proc_stat_t *);
-
+int get_memory_bandwidth_conf(numa_meminfo_t *numa_meminfo,int nr_nodes);
diff --git a/src/pmdas/linux/pmda.c b/src/pmdas/linux/pmda.c
index 27343d0..98eeffc 100644
--- a/src/pmdas/linux/pmda.c
+++ b/src/pmdas/linux/pmda.c
@@ -1191,6 +1191,10 @@ static pmdaMetric metrictab[] = {
       { PMDA_PMID(CLUSTER_NUMA_MEMINFO,37), PM_TYPE_U64, NODE_INDOM, 
PM_SEM_COUNTER,
       PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
 
+/* mem.numa.max_bandwidth */
+    { NULL,
+      { PMDA_PMID(CLUSTER_NUMA_MEMINFO,38), PM_TYPE_DOUBLE, NODE_INDOM, 
PM_SEM_INSTANT,
+       PMDA_PMUNITS(1,-1,0,PM_SPACE_MBYTE,PM_TIME_SEC,0) }, },
 
 /* swap.length */
     { NULL,
@@ -6374,6 +6378,11 @@ linux_fetchCallBack(pmdaMetric *mdesc, unsigned int 
inst, pmAtomValue *atom)
                    &atom->ull);
            break;
 
+       case 38: /* mem.numa.max_bandwidth */
+           atom->d = numa_meminfo.node_info[inst].bandwidth;
+           sts = 1;
+           break;
+
        default:
            return PM_ERR_PMID;
        }
diff --git a/src/pmdas/linux/root_linux b/src/pmdas/linux/root_linux
index e3dd537..6c58346 100644
--- a/src/pmdas/linux/root_linux
+++ b/src/pmdas/linux/root_linux
@@ -412,6 +412,7 @@ mem.util {
 mem.numa {
     util
     alloc
+    max_bandwidth      60:36:38
 }
 
 mem.numa.util {
-- 
1.9.3

<Prev in Thread] Current Thread [Next in Thread>
  • [PATCH v2] pmdalinux: Add a new metric for Maximum Memory Bandwidth per numa node, Hemant Kumar <=