pcp
[Top] [All Lists]

pmlc handshake timeout needed?

To: kenj <kenj@xxxxxxxxxxxxxxxx>
Subject: pmlc handshake timeout needed?
From: Nathan Scott <nscott@xxxxxxxxxx>
Date: Wed, 18 Nov 2009 14:42:21 +1100 (EST)
Cc: pcp <pcp@xxxxxxxxxxx>
In-reply-to: <967560114.495241258515608818.JavaMail.root@xxxxxxxxxxxxxxxxxx>
Hi Ken,

I have a script which checks pmloggers are logging what they
are meant to be, restarting them if need be more often than
the daily rotation - it seems to have tickled a bug somewhere.
We found many occurrences of pmlc that had been kicked off and were just
hanging (see trace below) - they were trying to connect to a
pmlogger that wasn't running (anymore? not clear what happened
to that pmlogger at this stage).

But, the hang ended up with many pmlc processes and the caller
script was unable to exit as a result.  It looks to be stuck in a
recv(2) call - in a __pmGetPDU with explicit timeout-never.  Is
that something we should change to some other timeout?  Would
seem to have at least made this problem fail cleanly with an
error message.  Thoughts?

thanks!

(ps: XXX below was the correct local host name)

$ ps -ef | grep 27258                                                        
root       468 31900  0 Oct28 ?        00:00:00 pmlc 27258                   
root       631 32142  0 Oct28 ?        00:00:00 pmlc 27258                   
root      1289 31177  0 Oct28 ?        00:00:00 pmlc 27258                   
root      1376   581  0 Oct28 ?        00:00:00 pmlc 27258                   
root      1649 31564  0 Oct28 ?        00:00:00 pmlc 27258                   
root      4217  1814  0 Oct28 ?        00:00:00 pmlc 27258                   
nathans   4541 11169  0 13:51 pts/3    00:00:00 grep 27258                   
root      6599  5311  0 Oct28 ?        00:00:00 pmlc 27258                   
root      6841  5771  0 Oct28 ?        00:00:00 pmlc 27258                   
root      7183  6353  0 Oct28 ?        00:00:00 pmlc 27258                   
root      7335  6469  0 Oct28 ?        00:00:00 pmlc 27258                   
root      8120  5233  0 Oct28 ?        00:00:00 pmlc 27258                   
root      8383  5955  0 Oct28 ?        00:00:00 pmlc 27258                   
root      9149  7973  0 Oct28 ?        00:00:00 pmlc 27258                   
root     12779 11747  0 Oct28 ?        00:00:00 pmlc 27258                   
root     12995 11900  0 Oct28 ?        00:00:00 pmlc 27258                   
root     13203 12006  0 Oct28 ?        00:00:00 pmlc 27258                   
root     13380 12594  0 Oct28 ?        00:00:00 pmlc 27258                   
root     14376 11624  0 Oct28 ?        00:00:00 pmlc 27258                   
root     14683 12099  0 Oct28 ?        00:00:00 pmlc 27258                   
root     19325 16550  0 Oct28 ?        00:00:00 pmlc 27258                   
root     19328 18309  0 Oct28 ?        00:00:00 pmlc 27258                   
root     19725 18968  0 Oct28 ?        00:00:00 pmlc 27258                   
root     19920 18892  0 Oct28 ?        00:00:00 pmlc 27258                   
root     20021 19106  0 Oct28 ?        00:00:00 pmlc 27258                   
root     20653 18011  0 Oct28 ?        00:00:00 pmlc 27258                   
root     23943 22759  0 Oct28 ?        00:00:00 pmlc 27258                   
root     25983 24895  0 Oct28 ?        00:00:00 pmlc 27258                   
root     26325 25283  0 Oct28 ?        00:00:00 pmlc 27258                   
root     26780 25975  0 Oct28 ?        00:00:00 pmlc 27258                   
root     27321 26331  0 Oct28 ?        00:00:00 pmlc 27258                   
root     27417 24641  0 Oct28 ?        00:00:00 pmlc 27258                   
root     29521 25079  0 Oct28 ?        00:00:00 pmlc 27258                   
root     30008 29164  0 Oct28 ?        00:00:00 pmlc 27258                   
root     32610 31367  0 Oct28 ?        00:00:00 pmlc 27258                   

$ gdb -p 32610                                           
...
Attaching to process 32610
Reading symbols from /usr/bin/pmlc...(no debugging symbols found)...done.
Using host libthread_db library "/lib64/libthread_db.so.1".
Reading symbols from /usr/lib64/libpcp.so.3...done.
Loaded symbols for /usr/lib64/libpcp.so.3
Reading symbols from /lib64/libc.so.6...done.
Loaded symbols for /lib64/libc.so.6
Reading symbols from /lib64/libdl.so.2...done.
Loaded symbols for /lib64/libdl.so.2
Reading symbols from /lib64/ld-linux-x86-64.so.2...done.
Loaded symbols for /lib64/ld-linux-x86-64.so.2
Reading symbols from /lib64/libnss_files.so.2...done.
Loaded symbols for /lib64/libnss_files.so.2
Reading symbols from /lib64/libnss_dns.so.2...done.
Loaded symbols for /lib64/libnss_dns.so.2
Reading symbols from /lib64/libresolv.so.2...done.
Loaded symbols for /lib64/libresolv.so.2
0x0000003dba2d25a5 in recv () from /lib64/libc.so.6
(gdb) bt
#0  0x0000003dba2d25a5 in recv () from /lib64/libc.so.6
#1  0x000000307da12171 in pduread (fd=3, buf=0x14ba6000 "", len=12, mode=-1, 
timeout=0) at pdu.c:175
#2  0x000000307da127f9 in __pmGetPDU (fd=3, mode=0, timeout=0, 
result=0x7fff91e49388) at pdu.c:331
#3  0x000000307da34790 in __pmConnectLogger (hostname=0x7fff91e49480 "[XXX]", 
pid=0x60c500, port=0x60c504) at logconnect.c:124
#4  0x00000000004041f8 in ConnectLogger ()
#5  0x0000000000402559 in main ()
(gdb)


-- 
Nathan

<Prev in Thread] Current Thread [Next in Thread>