pcp
[Top] [All Lists]

Re: Derived metrics sigsegv on failed fetch

To: nathans@xxxxxxxxxx
Subject: Re: Derived metrics sigsegv on failed fetch
From: Ken McDonell <kenj@xxxxxxxxxxxxxxxx>
Date: Wed, 12 May 2010 17:09:24 +1000
Cc: Paul Cowan <cowan@xxxxxxxxxx>, pcp <pcp@xxxxxxxxxxx>
In-reply-to: <1219152350.80911273621863858.JavaMail.root@xxxxxxxxxxxxxxxxxx>
References: <1219152350.80911273621863858.JavaMail.root@xxxxxxxxxxxxxxxxxx>
Reply-to: kenj@xxxxxxxxxxxxxxxx
Ouch and thanks.

The good news is that it is not a recent regression.  The bad news is
that the QA coverage missed this one.

I now have a new QA 356 to reproduce this.

And your patch is definitely in the right area, but I suspect that
making the guard tighter introduces a slight memory leak ... the
attached patch fixes that.

Do you want to shepherd this to oss.sgi.com, or want me to do it?


On Wed, 2010-05-12 at 09:51 +1000, nathans@xxxxxxxxxx wrote:
> Hi Ken,
> 
> Paul came across this problem monitoring our production boxen; it's
> reproducible locally with one of the example derived metric configs:
> start pmdumptext (or pmval or...) monitoring a derived metric which
> has a counter-semantics component, then restart pmcd...
> 
> $ cat $PCP_DERIVED_CONFIG
> bad_in_pkts = network.interface.in.errors + network.interface.in.drops
> 
> $ pmdumptext -t 1 -h localhost bad_in_pkts
> Wed May 12 09:27:53     ?       ?       ?                        
> Wed May 12 09:27:54     0.000   0.000   0.000                    
> Wed May 12 09:27:55     0.000   0.000   0.000                    
> Wed May 12 09:27:56     0.000   0.000   0.000                    
> Wed May 12 09:27:57     0.000   0.000   0.000                    
> Segmentation fault                                               
> 
> ...
> 
> Reading symbols from /usr/bin/pmdumptext...(no debugging symbols 
> found)...done.
> (gdb) r -t 1 -h localhost bad_in_pkts                                         
>  
> Starting program: /usr/bin/pmdumptext -t 1 -h localhost bad_in_pkts           
>  
> 
> [Thread debugging using libthread_db enabled]
> Wed May 12 09:29:27     ?       ?       ?    
> Wed May 12 09:29:28     0.000   0.000   0.000
> Wed May 12 09:29:29     0.000   0.000   0.000
> 
> Program received signal SIGSEGV, Segmentation fault.
> 0xb7fb103e in __dmpostfetch (ctxp=0x807a880, result=0xbffff3a8) at 
> derive_fetch.c:1101
> 1101            newrp->vset[j]->pmid = rp->vset[j]->pmid;                     
>         
> (gdb) bt                                                                      
>         
> #0  0xb7fb103e in __dmpostfetch (ctxp=0x807a880, result=0xbffff3a8) at 
> derive_fetch.c:1101                                                           
>                        
> #1  0xb7f82cfd in pmFetch (numpmid=3, pmidlist=0x807b600, result=0xbffff3a8) 
> at fetch.c:167                                                                
>                  
> #2  0x08050ad1 in ?? ()                                                       
>         
> #3  0x0805565e in ?? ()                                                       
>         
> #4  0x0804e8ce in ?? ()                                                       
>         
> #5  0xb7aa6775 in __libc_start_main () from /lib/i686/cmov/libc.so.6          
>         
> #6  0x0804aa91 in ?? ()                                                       
>         
> (gdb) l                                                                       
>         
> 1096                if ((newrp->vset[j] = (pmValueSet *)malloc(need)) == 
> NULL) {      
> 1097                    __pmNoMem("__dmpostfetch: vset", need, PM_FATAL_ERR); 
>         
> 1098                    /*NOTREACHED*/                                        
>         
> 1099                }                                                         
>         
> 1100            }                                                             
>         
> 1101            newrp->vset[j]->pmid = rp->vset[j]->pmid;                     
>         
> 1102            newrp->vset[j]->numval = numval;                              
>         
> 1103            newrp->vset[j]->valfmt = valfmt;                              
>         
> 1104            if (numval < 0)                                               
>         
> 1105                continue;
> (gdb) p rp                                                                    
>         
> $2 = (pmResult *) 0x807a4e0                                                   
>         
> (gdb) p rp->vset                                                              
>         
> $3 = {0x0}                                                                    
>         
> (gdb) up 1                                                                    
>         
> #1  0xb7f82cfd in pmFetch (numpmid=3, pmidlist=0x807b600, result=0xbffff3a8) 
> at fetch.c:167                                                                
>                  
> 167                 __dmpostfetch(ctxp, result);                              
>         
> (gdb) l                                                                       
>         
> 162                 }                                                         
>         
> 163             }                                                             
>         
> 164                                                                           
>         
> 165             /* process derived metrics, if any */                         
>         
> 166             if (have_dm) {                                                
>         
> 167                 __dmpostfetch(ctxp, result);                              
>         
> 168                 if (newlist != NULL) {                                    
>         
> 169                     free(newlist);                                        
>         
> 170                 }                                                         
>         
> 171             }                                                             
>         
> (gdb) p n                                                                     
>         
> $7 = -12366                                                                   
>         
> (gdb)                                                                         
>         
> 
> 
> [ insert potential fix (please verify, Ken?) ]
> 
> $ git diff .
> diff --git a/src/libpcp/src/fetch.c b/src/libpcp/src/fetch.c
> index 84deb0c..1b8298f 100644                                 
> --- a/src/libpcp/src/fetch.c                                  
> +++ b/src/libpcp/src/fetch.c                                  
> @@ -163,7 +163,7 @@ pmFetch(int numpmid, pmID pmidlist[], pmResult **result)
>         }                                                                   
>                                                                             
>         /* process derived metrics, if any */                               
> -       if (have_dm) {
> +       if (n >= 0 && have_dm) {
>             __dmpostfetch(ctxp, result);
>             if (newlist != NULL) {
>                 free(newlist);
> 
> 
> $ pmdumptext -t 1 -h localhost bad_in_pkts
> Wed May 12 09:37:39     ?       ?       ?
> Wed May 12 09:37:40     0.000   0.000   0.000
> Wed May 12 09:37:41     0.000   0.000   0.000
> Wed May 12 09:37:42     0.000   0.000   0.000
> Wed May 12 09:37:43     0.000   0.000   0.000
> Wed May 12 09:37:44     0.000   0.000   0.000
> Wed May 12 09:37:45     ?       ?       ?
> Wed May 12 09:37:46     ?       ?       ?
> Wed May 12 09:37:47     ?       ?       ?
> Wed May 12 09:37:48     ?       ?       ?
> Wed May 12 09:37:49     ?       ?       ?
> Wed May 12 09:37:50     ?       ?       ?
> Wed May 12 09:37:51     0.000   0.000   0.000
> Wed May 12 09:37:52     0.000   0.000   0.000
> Wed May 12 09:37:53     0.000   0.000   0.000
> Wed May 12 09:37:54     0.000   0.000   0.000
> 
> 
> cheers.
> 

Attachment: patch.pcp
Description: Text Data

<Prev in Thread] Current Thread [Next in Thread>