pcp
[Top] [All Lists]

[patch] waitpid on dying agents at reconfig (qa/296)

To: pcp@xxxxxxxxxxx
Subject: [patch] waitpid on dying agents at reconfig (qa/296)
From: Michael Newton <kimbrr@xxxxxxx>
Date: Tue, 9 Oct 2007 15:19:43 +1000
Cc: nscott@xxxxxxxxxx
Sender: pcp-bounce@xxxxxxxxxxx
Nathan, can you review please?

qa/296 fails with daemon PMDAs hanging round after a reconfigure (ie
kill -HUP to pmcd). It appears that they need to be wait()ed upon.
Prior to the addition of the condition  || ap->ipcType == AGENT_DSO
to HarvestAgents in agent.c this condition was masked by other cleanup.
Previously, CleanupAgent did not waitpid() for AT_CONFIG. In addition,
there is no reason to sleep every time round the waitpid() loop, and in
many cases even once may be unnecessary


===========================================================================
mgmt/pcp/src/pmcd/src/agent.c
===========================================================================

--- a/mgmt/pcp/src/pmcd/src/agent.c     2007-10-09 15:17:17.000000000 +1000
+++ b/mgmt/pcp/src/pmcd/src/agent.c     2007-10-09 15:15:35.620705921 +1000
@@ -33,7 +33,7 @@
 CleanupAgent(AgentInfo* aPtr, int why, int status)
 {
     extern int AgentDied;
-    int                exit_status;
+    int                exit_status = status;
     int                reason = 0;

     if (aPtr->ipcType == AGENT_DSO) {
@@ -68,50 +68,55 @@
     __pmNotifyErr(LOG_INFO, "CleanupAgent ...\n");
     fprintf(stderr, "Cleanup \"%s\" agent (dom %d):", aPtr->pmDomainLabel, 
aPtr->pmDomainId);

-    if (why == AT_CONFIG) {
-       fprintf(stderr, " unconfigured");
+    if (why == AT_EXIT) {
+       /* waitpid has already been done */
+       fprintf(stderr, " terminated");
+       reason = (status << 8) | REASON_EXIT;
     }
     else {
-       if (why == AT_EXIT) {
-           fprintf(stderr, " terminated");
-           exit_status = status;
-           reason = (status << 8) | REASON_EXIT;
-       }
-       else {
+       if (why == AT_CONFIG) {
+           fprintf(stderr, " unconfigured");
+       } else {
            reason = REASON_PROTOCOL;
            fprintf(stderr, " protocol failure for fd=%d", status);
            exit_status = -1;
-           if (aPtr->status.isChild == 1) {
-               pid_t   pid = -1;
-               pid_t   done;
-               int     wait_status;
-               if (aPtr->ipcType == AGENT_PIPE)
-                   pid = aPtr->ipc.pipe.agentPid;
-               else if (aPtr->ipcType == AGENT_SOCKET)
-                   pid = aPtr->ipc.socket.agentPid;
-               /*
-                * give PMDA a chance to notice the close() and exit
-                * before we try to wait()
-                */
-               sleep(1);
-               for ( ; ; ) {
+       }
+       if (aPtr->status.isChild == 1) {
+           pid_t       pid = -1;
+           pid_t       done;
+           int         wait_status;
+           int         slept = 0;
+
+           if (aPtr->ipcType == AGENT_PIPE)
+               pid = aPtr->ipc.pipe.agentPid;
+           else if (aPtr->ipcType == AGENT_SOCKET)
+               pid = aPtr->ipc.socket.agentPid;
+           for ( ; ; ) {

 #if defined(HAVE_WAIT3)
-                   done = wait3(&wait_status, WNOHANG, NULL);
+               done = wait3(&wait_status, WNOHANG, NULL);
 #elif defined(HAVE_WAITPID)
-                   done = waitpid((pid_t)-1, &wait_status, WNOHANG);
+               done = waitpid((pid_t)-1, &wait_status, WNOHANG);
 #else
-                   break;
+               done = 0;
 #endif
-                   if (done <= 0)
-                       break;
-                   else if (done == pid)
-                       exit_status = wait_status;
+               if (done == pid) {
+                   exit_status = wait_status;
+                   break;
+               }
+               if (done > 0) {
+                   continue;
                }
+               if (slept) {
+                   break;
+               }
+               /* give PMDA a chance to notice the close() and exit */
+               sleep(1);
+               slept = 1;
            }
        }
-
-       if (exit_status != -1) {
+    }
+    if (exit_status != -1) {
            if (WIFEXITED(exit_status)) {
                fprintf(stderr, ", exit(%d)", WEXITSTATUS(exit_status));
                reason = (WEXITSTATUS(exit_status) << 8) | reason;
@@ -124,7 +129,6 @@
 #endif
                reason = (WTERMSIG(exit_status) << 16) | reason;
            }
-       }
     }
     fputc('\n', stderr);
     aPtr->reason = reason;

===========================================================================
mgmt/pcp/src/pmcd/src/config.c
===========================================================================

--- a/mgmt/pcp/src/pmcd/src/config.c    2007-10-09 15:17:17.000000000 +1000
+++ b/mgmt/pcp/src/pmcd/src/config.c    2007-10-09 14:58:29.077617397 +1000
@@ -2382,11 +2382,6 @@
     free(oldAgent);
     __pmAccFreeSavedHosts();

-    /* Allow some time for the old agents to close down.  This allows sockets
-     * to be closed at the agent end, etc.
-     */
-    sleep(1);
-
     /* Start the new agents */
     ContactAgents();
     for (i = 0; i < MAXDOMID + 2; i++)


Dr.Michael("Kimba")Newton  kimbrr@xxxxxxx

<Prev in Thread] Current Thread [Next in Thread>
  • [patch] waitpid on dying agents at reconfig (qa/296), Michael Newton <=