Nathan, can you review please?
qa/296 fails with daemon PMDAs hanging round after a reconfigure (ie
kill -HUP to pmcd). It appears that they need to be wait()ed upon.
Prior to the addition of the condition || ap->ipcType == AGENT_DSO
to HarvestAgents in agent.c this condition was masked by other cleanup.
Previously, CleanupAgent did not waitpid() for AT_CONFIG. In addition,
there is no reason to sleep every time round the waitpid() loop, and in
many cases even once may be unnecessary
===========================================================================
mgmt/pcp/src/pmcd/src/agent.c
===========================================================================
--- a/mgmt/pcp/src/pmcd/src/agent.c 2007-10-09 15:17:17.000000000 +1000
+++ b/mgmt/pcp/src/pmcd/src/agent.c 2007-10-09 15:15:35.620705921 +1000
@@ -33,7 +33,7 @@
CleanupAgent(AgentInfo* aPtr, int why, int status)
{
extern int AgentDied;
- int exit_status;
+ int exit_status = status;
int reason = 0;
if (aPtr->ipcType == AGENT_DSO) {
@@ -68,50 +68,55 @@
__pmNotifyErr(LOG_INFO, "CleanupAgent ...\n");
fprintf(stderr, "Cleanup \"%s\" agent (dom %d):", aPtr->pmDomainLabel,
aPtr->pmDomainId);
- if (why == AT_CONFIG) {
- fprintf(stderr, " unconfigured");
+ if (why == AT_EXIT) {
+ /* waitpid has already been done */
+ fprintf(stderr, " terminated");
+ reason = (status << 8) | REASON_EXIT;
}
else {
- if (why == AT_EXIT) {
- fprintf(stderr, " terminated");
- exit_status = status;
- reason = (status << 8) | REASON_EXIT;
- }
- else {
+ if (why == AT_CONFIG) {
+ fprintf(stderr, " unconfigured");
+ } else {
reason = REASON_PROTOCOL;
fprintf(stderr, " protocol failure for fd=%d", status);
exit_status = -1;
- if (aPtr->status.isChild == 1) {
- pid_t pid = -1;
- pid_t done;
- int wait_status;
- if (aPtr->ipcType == AGENT_PIPE)
- pid = aPtr->ipc.pipe.agentPid;
- else if (aPtr->ipcType == AGENT_SOCKET)
- pid = aPtr->ipc.socket.agentPid;
- /*
- * give PMDA a chance to notice the close() and exit
- * before we try to wait()
- */
- sleep(1);
- for ( ; ; ) {
+ }
+ if (aPtr->status.isChild == 1) {
+ pid_t pid = -1;
+ pid_t done;
+ int wait_status;
+ int slept = 0;
+
+ if (aPtr->ipcType == AGENT_PIPE)
+ pid = aPtr->ipc.pipe.agentPid;
+ else if (aPtr->ipcType == AGENT_SOCKET)
+ pid = aPtr->ipc.socket.agentPid;
+ for ( ; ; ) {
#if defined(HAVE_WAIT3)
- done = wait3(&wait_status, WNOHANG, NULL);
+ done = wait3(&wait_status, WNOHANG, NULL);
#elif defined(HAVE_WAITPID)
- done = waitpid((pid_t)-1, &wait_status, WNOHANG);
+ done = waitpid((pid_t)-1, &wait_status, WNOHANG);
#else
- break;
+ done = 0;
#endif
- if (done <= 0)
- break;
- else if (done == pid)
- exit_status = wait_status;
+ if (done == pid) {
+ exit_status = wait_status;
+ break;
+ }
+ if (done > 0) {
+ continue;
}
+ if (slept) {
+ break;
+ }
+ /* give PMDA a chance to notice the close() and exit */
+ sleep(1);
+ slept = 1;
}
}
-
- if (exit_status != -1) {
+ }
+ if (exit_status != -1) {
if (WIFEXITED(exit_status)) {
fprintf(stderr, ", exit(%d)", WEXITSTATUS(exit_status));
reason = (WEXITSTATUS(exit_status) << 8) | reason;
@@ -124,7 +129,6 @@
#endif
reason = (WTERMSIG(exit_status) << 16) | reason;
}
- }
}
fputc('\n', stderr);
aPtr->reason = reason;
===========================================================================
mgmt/pcp/src/pmcd/src/config.c
===========================================================================
--- a/mgmt/pcp/src/pmcd/src/config.c 2007-10-09 15:17:17.000000000 +1000
+++ b/mgmt/pcp/src/pmcd/src/config.c 2007-10-09 14:58:29.077617397 +1000
@@ -2382,11 +2382,6 @@
free(oldAgent);
__pmAccFreeSavedHosts();
- /* Allow some time for the old agents to close down. This allows sockets
- * to be closed at the agent end, etc.
- */
- sleep(1);
-
/* Start the new agents */
ContactAgents();
for (i = 0; i < MAXDOMID + 2; i++)
Dr.Michael("Kimba")Newton kimbrr@xxxxxxx
|