pcp
[Top] [All Lists]

Re: Review: PCP & pmlogger take too long to start

To: Nathan Scott <nscott@xxxxxxxxxx>
Subject: Re: Review: PCP & pmlogger take too long to start
From: Michael Newton <kimbrr@xxxxxxx>
Date: Tue, 3 Jul 2007 20:19:20 +1000
Cc: pcp@xxxxxxxxxxx
In-reply-to: <1183417678.15488.257.camel@xxxxxxxxxxxxxx>
References: <Pine.SGI.4.58.0706271012280.2186626@xxxxxxxxxxxxxxxxxxxxxxx> <Pine.SGI.4.58.0706271124250.2186626@xxxxxxxxxxxxxxxxxxxxxxx> <Pine.SGI.4.58.0706271715321.2351218@xxxxxxxxxxxxxxxxxxxxxxx> <1182996127.15488.102.camel@xxxxxxxxxxxxxx> <Pine.SGI.4.58.0706291810180.4792701@xxxxxxxxxxxxxxxxxxxxxxx> <1183355238.15488.217.camel@xxxxxxxxxxxxxx> <1183356141.15488.223.camel@xxxxxxxxxxxxxx> <Pine.SGI.4.58.0707021658520.7708406@xxxxxxxxxxxxxxxxxxxxxxx> <1183417678.15488.257.camel@xxxxxxxxxxxxxx>
Sender: pcp-bounce@xxxxxxxxxxx
I decided to repost, as I've tried to take on board favouring readability &
conciseness over keeping the diffs short.


===========================================================================
mgmt/pcp/man/man1/GNUmakefile
===========================================================================

--- a/mgmt/pcp/man/man1/GNUmakefile     2007-07-03 20:16:14.000000000 +1000
+++ b/mgmt/pcp/man/man1/GNUmakefile     2007-06-29 15:36:13.632867332 +1000
@@ -19,7 +19,7 @@
        pmnsmerge.1 pmpost.1 pmprobe.1 pmsocks.1 pmstat.1 pmstore.1 \
        pmtrace.1 pmval.1 pmdaweblog.1 pmlogsummary.1 pmdashping.1 \
        pmdumptext.1 genpmda.1 pmproxy.1 pmdasummary.1 pmlogreduce.1 \
-       autofsd-probe.1 pmie2col.1 telnet-probe.1
+       autofsd-probe.1 pmie2col.1 telnet-probe.1 pmsleep.1

 MAN_DEST       = $(PCP_MAN_DIR)/man$(MAN_SECTION)
 LSRCFILES      = $(MAN_PAGES)

===========================================================================
mgmt/pcp/man/man1/pmsleep.1
===========================================================================

--- a/mgmt/pcp/man/man1/pmsleep.1       2006-06-17 00:58:24.000000000 +1000
+++ b/mgmt/pcp/man/man1/pmsleep.1       2007-07-03 16:48:17.618316074 +1000
@@ -0,0 +1,45 @@
+'\"macro stdmacro
+.\"
+.\" Copyright (c) 2007 Silicon Graphics, Inc.  All Rights Reserved.
+.\"
+.\" $Id$
+.ie \(.g \{\
+.\" ... groff (hack for khelpcenter, man2html, etc.)
+.TH PMSLEEP 1 "SGI" "Performance Co-Pilot"
+\}
+.el \{\
+.if \nX=0 .ds x} PMSLEEP 1 "SGI" "Performance Co-Pilot"
+.if \nX=1 .ds x} PMSLEEP 1 "Performance Co-Pilot"
+.if \nX=2 .ds x} PMSLEEP 1 "" "\&"
+.if \nX=3 .ds x} PMSLEEP "" "" "\&"
+.TH \*(x}
+.rr X
+\}
+.SH NAME
+\f3pmsleep\f1 \- portable subsecond-capable sleep
+.\" literals use .B or \f3
+.\" arguments use .I or \f2
+.SH SYNOPSIS
+.B $PCP_BINADM_DIR/pmsleep
+.I interval
+.SH DESCRIPTION
+.B pmsleep
+sleeps for
+.IR interval .
+The
+.I interval
+argument follows the syntax described in
+.BR PCPIntro (1)
+for
+.BR \-t ,
+and in the simplest form may be an unsigned integer
+or floating point constant
+(the implied units in this case are seconds).
+.SH DIAGNOSTICS
+The exit status is 0 for success, or 1 for a malformed command line.
+If the underlying
+.BR nanosleep (2)
+system call fails, its errno value is returned as the exit status.
+.SH SEE ALSO
+.BR sleep (1),
+.BR nanosleep (2).

===========================================================================
mgmt/pcp/src/GNUmakefile
===========================================================================

--- a/mgmt/pcp/src/GNUmakefile  2007-07-03 20:16:14.000000000 +1000
+++ b/mgmt/pcp/src/GNUmakefile  2007-06-29 14:46:06.336727771 +1000
@@ -21,7 +21,7 @@
          pmdumplog pmlogextract pmstore pmhostname pmgenmap pmlogctl \
          pmlogconf pmlogsummary pmclient pmkstat pcp pmlc dbpmda \
          xconfirm pmtrace pmstat pmsocks pmdas pmafm procmemstat \
-         pmlogreduce genpmda pmproxy telnet-probe
+         pmlogreduce genpmda pmproxy telnet-probe pmsleep

 ifneq ($(TARGET_OS), cygwin)
 SUBDIRS += libpcp_pmc pmdumptext autofsd-probe

===========================================================================
mgmt/pcp/src/pmcd/rc_pcp
===========================================================================

--- a/mgmt/pcp/src/pmcd/rc_pcp  2007-07-03 20:16:14.000000000 +1000
+++ b/mgmt/pcp/src/pmcd/rc_pcp  2007-07-03 19:29:52.089858288 +1000
@@ -383,16 +383,19 @@
     fi
     $ECHO $PCP_ECHO_N "Waiting for PMCD to terminate ...""$PCP_ECHO_C"
     gone=0
-    for i in 1 2 3 4 5 6
+    delay=200
+    while [ $i -lt $delay ]
     do
-       sleep 3
+       # dont sleep before 1st pid check, or after last
+        [ $i -eq 0 ] || pmsleep 0.1
+       i=`expr $i + 1`
+       $VERBOSE && [ `expr $delay % 10` -eq 0 ] && $PCP_ECHO_PROG $PCP_ECHO_N 
".""$PCP_ECHO_C"
        _get_pids_by_name pmcd >$tmp.tmp
        if [ ! -s $tmp.tmp ]
        then
            gone=1
            break
        fi
-       $ECHO $PCP_ECHO_N ".""$PCP_ECHO_C"
     done
     if [ $gone != 1 ]  # It just WON'T DIE, give up.
     then

===========================================================================
mgmt/pcp/src/pmcd/src/agent.c
===========================================================================

--- a/mgmt/pcp/src/pmcd/src/agent.c     2007-07-03 20:16:14.000000000 +1000
+++ b/mgmt/pcp/src/pmcd/src/agent.c     2007-06-26 14:28:22.912602167 +1000
@@ -166,7 +166,7 @@
        found = 0;
        for ( i = 0; i < nAgents; i++) {
            ap = &agent[i];
-           if (!ap->status.connected)
+           if (!ap->status.connected || ap->ipcType == AGENT_DSO)
                continue;

            found = 1;

===========================================================================
mgmt/pcp/src/pmie/pmie_check.sh
===========================================================================

--- a/mgmt/pcp/src/pmie/pmie_check.sh   2007-07-03 20:16:14.000000000 +1000
+++ b/mgmt/pcp/src/pmie/pmie_check.sh   2007-07-03 20:09:44.071302396 +1000
@@ -144,44 +144,41 @@
 {
     # demand mutual exclusion
     #
-    fail=true
     rm -f $tmp.stamp
-    for try in 1 2 3 4
+    i=0
+    while [ $i -lt 200 ]
     do
+       # dont sleep before 1st lock check, or after last
+       [ $i -eq 0 ] || pmsleep 0.1
+       i=`expr $i + 1`
+
        if pmlock -v $logfile.lock >$tmp.out
        then
            echo $logfile.lock >$tmp.lock
-           fail=false
-           break
-       else
-           if [ ! -f $tmp.stamp ]
-           then
-               touch -t `pmdate -30M %Y%m%d%H%M` $tmp.stamp
-           fi
-           if [ -n "`find $logfile.lock ! -newer $tmp.stamp -print 
2>/dev/null`" ]
-           then
-               _warning "removing lock file older than 30 minutes"
-               ls -l $logfile.lock
-               rm -f $logfile.lock
-           fi
+           return 0
        fi
-       sleep 5
-    done
-
-    if $fail
-    then
-       # failed to gain mutex lock
-       #
-       if [ -f $logfile.lock ]
+       if [ ! -f $tmp.stamp ]
        then
-           _warning "is another PCP cron job running concurrently?"
+           touch -t `pmdate -30M %Y%m%d%H%M` $tmp.stamp
+       fi
+       if [ -n "`find $logfile.lock ! -newer $tmp.stamp -print 2>/dev/null`" ]
+       then
+           _warning "removing lock file older than 30 minutes"
            ls -l $logfile.lock
-       else
-           echo "$prog: `cat $tmp.out`"
+           rm -f $logfile.lock
        fi
-       _warning "failed to acquire exclusive lock ($logfile.lock) ..."
-       continue
+    done
+    # failed to gain mutex lock
+    #
+    if [ -f $logfile.lock ]
+    then
+       warning "is another PCP cron job running concurrently?"
+       ls -l $logfile.lock
+    else
+       echo "$prog: `cat $tmp.out`"
     fi
+    _warning "failed to acquire exclusive lock ($logfile.lock) ..."
+    return 1
 }

 _unlock()
@@ -270,51 +267,49 @@

     # wait for maximum time of a connection and 20 requests
     #
-    delay=`expr $delay + 20 \* $x`
+    # $logfile was previously removed, if it has appeared again
+    # then we know pmie has started ... if not just sleep and
+    # try again
+    #
+    delay=`expr 10 \* \( $delay + 20 \* $x \) `
     i=0
     while [ $i -lt $delay ]
     do
-       $VERBOSE && $PCP_ECHO_PROG $PCP_ECHO_N ".""$PCP_ECHO_C"
-       if [ -f $logfile ]
+       # dont sleep before 1st log check
+        [ $i -eq 0 ] || pmsleep 0.1
+       i=`expr $i + 1`
+       $VERBOSE && [ `expr $delay % 10` -eq 0 ] && $PCP_ECHO_PROG $PCP_ECHO_N 
".""$PCP_ECHO_C"
+       [ ! -f $logfile ] && continue
+       if ls $PCP_TMP_DIR/pmie/$1 >$tmp.out 2>&1
        then
-           # $logfile was previously removed, if it has appeared again then
-           # we know pmie has started ... if not just sleep and try again
-           #
-           if ls $PCP_TMP_DIR/pmie/$1 >$tmp.out 2>&1
-           then
-               if grep "No such file or directory" $tmp.out >/dev/null
-               then
-                   :
-               else
-                   sleep 5
-                   $VERBOSE && echo " done"
-                   return 0
-               fi
-           fi
-           case "$PCP_PLATFORM"
-           in
-               irix)
-                   ps -e | grep "^ *$1 " >/dev/null
-                   ;;
-               linux)
-                   test -e /proc/$1
-                   ;;
-           esac
-
-           if [ $? -ne 0 ]
+           if grep "No such file or directory" $tmp.out >/dev/null
            then
-               $VERBOSE || _message restart
-               echo " process exited!"
-               echo "$prog: Error: failed to restart pmie"
-               echo "Current pmie processes:"
-               ps $PCP_PS_ALL_FLAGS | sed -n -e 1p -e "/$PMIE/p"
-               echo
-               _check_logfile
-               return 1
+               :
+           else
+               pmsleep 0.1
+               $VERBOSE && echo " done"
+               return 0
            fi
        fi
-       sleep 5
-       i=`expr $i + 5`
+       case "$PCP_PLATFORM"
+       in
+           irix)
+               ps -e | grep "^ *$1 " >/dev/null
+               ;;
+           linux)
+               test -e /proc/$1
+               ;;
+       esac
+
+       [ $? -eq 0 ] && continue
+       $VERBOSE || _message restart
+       echo " process exited!"
+       echo "$prog: Error: failed to restart pmie"
+       echo "Current pmie processes:"
+       ps $PCP_PS_ALL_FLAGS | sed -n -e 1p -e "/$PMIE/p"
+       echo
+       _check_logfile
+       return 1
     done
     $VERBOSE || _message restart
     echo " timed out waiting!"
@@ -434,8 +429,11 @@
     then
        _warning "no write access in $dir, skip lock file processing"
        ls -ld $dir
+    elif _lock
+    then
+       :
     else
-       _lock
+       continue
     fi

     # match $logfile and $fqdn from control file to running pmies
@@ -630,13 +628,20 @@
     then
        $VERY_VERBOSE && ( echo; $PCP_ECHO_PROG $PCP_ECHO_N "+ $KILL -KILL `cat 
$tmp.pmies` ...""$PCP_ECHO_C" )
        eval $KILL -KILL $pmielist >/dev/null 2>&1
-       sleep 3         # give them a chance to go
-       if ps -f -p "$pmielist" >$tmp.alive 2>&1
-       then
+       i=0
+       while ps -f -p "$pmielist" >$tmp.alive 2>&1
+       do
+           if [ $i -lt 30 ]
+           then
+               pmsleep 0.1
+               i=`expr $i + 1`
+               continue;
+           fi
            echo "$prog: Error: pmie process(es) will not die"
            cat $tmp.alive
            status=1
-       fi
+           break
+       done
     fi
 fi


===========================================================================
mgmt/pcp/src/pmlogctl/pmlogger_check.sh
===========================================================================

--- a/mgmt/pcp/src/pmlogctl/pmlogger_check.sh   2007-07-03 20:16:14.000000000 
+1000
+++ b/mgmt/pcp/src/pmlogctl/pmlogger_check.sh   2007-07-03 20:10:13.687441468 
+1000
@@ -192,60 +192,51 @@

     # wait for maximum time of a connection and 20 requests
     #
-    delay=`expr $delay + 20 \* $x`
+    # $logfile was previously removed, if it has appeared again
+    # then we know pmlogger has started ... if not just sleep and
+    # try again
+    #
+    delay=`expr 10 \* \( $delay + 20 \* $x \) `
     i=0
-    while [ $i -lt $delay ]
+    while [ $i -lt $delay ] # caution: nested continue 2 below
     do
-       $VERBOSE && $PCP_ECHO_PROG $PCP_ECHO_N ".""$PCP_ECHO_C"
-       if [ -f $logfile ]
+       # dont sleep before 1st log check
+        [ $i -eq 0 ] || pmsleep 0.1
+       i=`expr $i + 1`
+       $VERBOSE && [ `expr $delay % 10` -eq 0 ] && $PCP_ECHO_PROG $PCP_ECHO_N 
".""$PCP_ECHO_C"
+       [ ! -f $logfile ] && continue
+       if echo "connect $1" | pmlc 2>&1 | grep "Unable to connect" >/dev/null
        then
-           # $logfile was previously removed, if it has appeared again
-           # then we know pmlogger has started ... if not just sleep and
-           # try again
-           #
-           if echo "connect $1" | pmlc 2>&1 | grep "Unable to connect" 
>/dev/null
-           then
-               :
-           else
-               sleep 5
-               $VERBOSE && echo " done"
-               return 0
-           fi
+           :
+       else
+           pmsleep 0.1
+           $VERBOSE && echo " done"
+           return 0
+       fi

-           _plist=`_get_pids_by_name pmlogger`
-           _found=false
+       _plist=`_get_pids_by_name pmlogger`
+       _found=false
+       for _p in `echo $_plist`
+       do
+           [ $_p -eq $1 ] && continue 2
+       done
+       $VERBOSE || _message restart
+       echo " process exited!"
+       if $TERSE
+       then
+           :
+       else
+           echo "$prog: Error: failed to restart pmlogger"
+           echo "Current pmlogger processes:"
+           ps $PCP_PS_ALL_FLAGS | tee $tmp.tmp | sed -n -e 1p
            for _p in `echo $_plist`
-           do
-               [ $_p -eq $1 ] && _found=true
-           done
-
-           if $_found
-           then
-               # process still here, just not accepting pmlc connections
-               # yet, try again
-               :
-           else
-               $VERBOSE || _message restart
-               echo " process exited!"
-               if $TERSE
-               then
-                   :
-               else
-                   echo "$prog: Error: failed to restart pmlogger"
-                   echo "Current pmlogger processes:"
-                   ps $PCP_PS_ALL_FLAGS | tee $tmp.tmp | sed -n -e 1p
-                   for _p in `echo $_plist`
-                   do
-                       sed -n -e "/^[ ]*[^ ]* [ ]*$_p /p" < $tmp.tmp
-                   done
-                   echo
-               fi
-               _check_logfile
-               return 1
-           fi
+             do
+             sed -n -e "/^[ ]*[^ ]* [ ]*$_p /p" < $tmp.tmp
+           done
+           echo
        fi
-       sleep 5
-       i=`expr $i + 5`
+       _check_logfile
+       return 1
     done
     $VERBOSE || _message restart
     echo " timed out waiting!"
@@ -259,6 +250,56 @@
     return 1
 }

+_lock()
+{
+    # demand mutual exclusion
+    #
+    rm -f $tmp.stamp
+    i=0
+    while [ $i -lt 200 ]
+    do
+       # dont sleep before 1st lock check, or after last
+        [ $i -eq 0 ] || pmsleep 0.1
+       i=`expr $i + 1`
+
+       if pmlock -v lock >$tmp.out
+       then
+           echo $dir/lock >$tmp.lock
+           return 0
+       fi
+       if [ ! -f $tmp.stamp ]
+       then
+           if uname -r | grep '^5\.3' >/dev/null
+           then
+               # IRIX 5.3 does not support -t for touch(1)
+               #
+               touch `pmdate -30M %m%d%H%M%y` $tmp.stamp
+           else
+               touch -t `pmdate -30M %Y%m%d%H%M` $tmp.stamp
+           fi
+       fi
+       if [ ! -z "`find lock -newer $tmp.stamp -print 2>/dev/null`" ]
+       then
+           :
+       else
+           echo "$prog: Warning: removing lock file older than 30 minutes"
+           LC_TIME=POSIX ls -l $dir/lock
+           rm -f lock
+       fi
+    done
+    # failed to gain mutex lock
+    #
+    if [ -f lock ]
+    then
+       echo "$prog: Warning: is another PCP cron job running concurrently?"
+       LC_TIME=POSIX ls -l $dir/lock
+    else
+       echo "$prog: `cat $tmp.out`"
+    fi
+    _warning "failed to acquire exclusive lock ($dir/lock) ..."
+    return 1
+}
+
 # note on control file format version
 #  1.0 was shipped as part of PCPWEB beta, and did not include the
 #      socks field [this is the default for backwards compatibility]
@@ -374,56 +415,11 @@
     if [ ! -w $dir ]
     then
         echo "$prog: Warning: no write access in $dir, skip lock file 
processing"
+    elif _lock
+    then
+       :
     else
-       # demand mutual exclusion
-       #
-       fail=true
-       rm -f $tmp.stamp
-       for try in 1 2 3 4
-       do
-           if pmlock -v lock >$tmp.out
-           then
-               echo $dir/lock >$tmp.lock
-               fail=false
-               break
-           else
-               if [ ! -f $tmp.stamp ]
-               then
-                   if uname -r | grep '^5\.3' >/dev/null
-                   then
-                       # IRIX 5.3 does not support -t for touch(1)
-                       #
-                       touch `pmdate -30M %m%d%H%M%y` $tmp.stamp
-                   else
-                       touch -t `pmdate -30M %Y%m%d%H%M` $tmp.stamp
-                   fi
-               fi
-               if [ ! -z "`find lock -newer $tmp.stamp -print 2>/dev/null`" ]
-               then
-                   :
-               else
-                   echo "$prog: Warning: removing lock file older than 30 
minutes"
-                   LC_TIME=POSIX ls -l $dir/lock
-                   rm -f lock
-               fi
-           fi
-           sleep 5
-       done
-
-       if $fail
-       then
-           # failed to gain mutex lock
-           #
-           if [ -f lock ]
-            then
-                echo "$prog: Warning: is another PCP cron job running 
concurrently?"
-               LC_TIME=POSIX ls -l $dir/lock
-           else
-               echo "$prog: `cat $tmp.out`"
-           fi
-           _warning "failed to acquire exclusive lock ($dir/lock) ..."
-           continue
-       fi
+       continue
     fi

     pid=''

===========================================================================
mgmt/pcp/src/pmsleep/GNUmakefile
===========================================================================

--- a/mgmt/pcp/src/pmsleep/GNUmakefile  2006-06-17 00:58:24.000000000 +1000
+++ b/mgmt/pcp/src/pmsleep/GNUmakefile  2007-06-29 14:33:28.335332331 +1000
@@ -0,0 +1,25 @@
+#!gmake
+#
+# Copyright (c) 2007 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# $Id$
+#
+
+TOPDIR = ../..
+include $(TOPDIR)/src/include/builddefs
+
+LLDLIBS = -lpcp
+CFILES = pmsleep.c
+CMDTARGET = pmsleep$(EXECSUFFIX)
+LDIRT = $(TARGET)
+
+default:       $(CMDTARGET)
+
+include $(BUILDRULES)
+
+install:       $(CMDTARGET)
+       $(INSTALL) -m 755 $(CMDTARGET) $(PCP_BINADM_DIR)/$(CMDTARGET)
+
+default_pcp:   default
+
+install_pcp:   install

===========================================================================
mgmt/pcp/src/pmsleep/pmsleep.c
===========================================================================

--- a/mgmt/pcp/src/pmsleep/pmsleep.c    2006-06-17 00:58:24.000000000 +1000
+++ b/mgmt/pcp/src/pmsleep/pmsleep.c    2007-07-03 17:05:11.731485122 +1000
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2007 Silicon Graphics, Inc.  All Rights Reserved.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <errno.h>
+#include "pmapi.h"
+
+int
+main(int argc, char **argv)
+{
+    struct timespec rqt;
+    struct timeval delta;
+    int r = 0;
+    char *msg;
+
+    if (argc == 2) {
+       if (pmParseInterval(argv[1], &delta, &msg) < 0) {
+           fputs(msg, stderr);
+           free(msg);
+       } else {
+           rqt.tv_sec  = delta.tv_sec;
+           rqt.tv_nsec = delta.tv_usec * 1000;
+           if (0 != nanosleep(&rqt, NULL))
+               r = errno;
+
+           exit(r);
+       }
+    }
+    fprintf(stderr, "Usage: pmsleep interval\n");
+    exit(1);
+    /*NOTREACHED*/
+}

Dr.Michael("Kimba")Newton  kimbrr@xxxxxxx


<Prev in Thread] Current Thread [Next in Thread>