View Incident:
http://co-op.engr.sgi.com/BugWorks/code/bwxquery.cgi?search=Search&wlong=1&view_type=Bug&wi=803884
Submitter : dxm Submitter Domain : engr
Assigned Engineer : nb Assigned Domain : sgi.com
Assigned Group : xfs-linux Category : software
Customer Reported : F Priority : 3
Project : xfs-linux Status : open
Description :
This is very unlikely to be related to XFS.
However, running the auto-qa suite on a 1GB machine with
HIGHMEM enabled can cause a process to hang and render
the machine pretty much useless.
I've seen a couple of different variants but the problem
has always occurred below load_elf_binary, padzero and
clear_user.
Posting this because it's bound to bite someone who tries
to use our kernel on a HIGHMEM machine and because it's
stopping me from running auto-qa on this config.
[0]kdb> btp 5552
EBP EIP Function(args)
0xf49db8bc 0xc011721d schedule+0x415 (0xf49da000, 0xf49da000, 0xf5447220)
kernel .text 0xc0100000 0xc0116e08 0xc01177f0
0xf49db8e4 0xc0107a84 __down+0x6c
kernel .text 0xc0100000 0xc0107a18 0xc0107adc
0xf49db8f8 0xc0107c27 __down_failed+0xb (0xf49da000, 0x0, 0xf5447220,
0xf49db98c, 0xf532231c)
kernel .text 0xc0100000 0xc0107c1c 0xc0107c30
0xc01f7cd6 stext_lock+0x4de
kernel .text.lock 0xc01f77f8 0xc01f77f8
0xc01fd5a0
0xf49db99c 0xc0112acc do_page_fault+0x60 (0xf49db9ac, 0x0, 0x100000,
0xf54263ac, 0xf5426360)
kernel .text 0xc0100000 0xc0112a6c 0xc0112ea0
0xc0109169 error_code+0x2d
kernel .text 0xc0100000 0xc010913c 0xc0109174
Interrupt registers:
eax = 0x00000000 ebx = 0x00100000 ecx = 0xf54263ac edx = 0xf5426360
esi = 0x00000000 edi = 0xf5447220 esp = 0xf49db9e0 eip = 0xc0153ce5
ebp = 0xf49dba18 ss = 0x00000018 cs = 0x00000010 eflags = 0x00010286
ds = 0xf5420018 es = 0x00000018 origeax = 0xffffffff &regs = 0xf49db9ac
0xc0153ce5 ext2_get_block+0x119 (0xf5447220, 0xc, 0xf53429c0, 0x0)
kernel .text 0xc0100000 0xc0153bcc 0xc0154150
0xf49dba98 0xc0135300 block_read_full_page+0x124 (0xc1ff1fac, 0xc0153bcc)
kernel .text 0xc0100000 0xc01351dc 0xc01354a0
0xf49dbaa8 0xc0154349 ext2_readpage+0x11 (0xf5ac7200, 0xc1ff1fac)
[0]more>
kernel .text 0xc0100000 0xc0154338 0xc0154350
0xf49dbacc 0xc0126ee4 read_cluster_nonblocking+0xdc (0xf5ac7200, 0x2, 0x18)
kernel .text 0xc0100000 0xc0126e08 0xc0126f20
0xf49dbb0c 0xc01282ac filemap_nopage+0x254 (0xf5aadaa0, 0x804b000, 0x1)
kernel .text 0xc0100000 0xc0128058 0xc0128464
0xf49dbb2c 0xc01249ed do_no_page+0x51 (0xf5322300, 0xf5aadaa0, 0x804b9cc, 0x1,
0xf45a212c)
kernel .text 0xc0100000 0xc012499c 0xc0124a4c
0xf49dbb5c 0xc0124b56 handle_mm_fault+0x10a (0xf5322300, 0xf5aadaa0,
0x804b9cc, 0x1, 0xf49da000)
kernel .text 0xc0100000 0xc0124a4c 0xc0124bec
0xf49dbc10 0xc0112bce do_page_fault+0x162 (0xf49dbc20, 0x2, 0x634, 0x18d,
0x18d)
kernel .text 0xc0100000 0xc0112a6c 0xc0112ea0
0xc0109169 error_code+0x2d
kernel .text 0xc0100000 0xc010913c 0xc0109174
Interrupt registers:
eax = 0x00000000 ebx = 0x00000634 ecx = 0x0000018d edx = 0x0000018d
esi = 0x00000000 edi = 0x0804b9cc esp = 0xf49dbc54 eip = 0xc01f1117
ebp = 0xf49dbc64 ss = 0x00000018 cs = 0x00000010 eflags = 0x00010246
ds = 0x00000018 es = 0x00000018 origeax = 0xffffffff &regs = 0xf49dbc20
0xc01f1117 clear_user+0x37 (0x804b9cc, 0x634)
kernel .text 0xc0100000 0xc01f10e0 0xc01f112c
0xf49dbc74 0xc014974e padzero+0x1e (0x804b9cc, 0x804b9cc, 0x804bc10,
0xc02b8630, 0xc0149e14)
kernel .text 0xc0100000 0xc0149730 0xc0149754
0xf49dbe0c 0xc014a874 load_elf_binary+0xa60 (0xf49dbe68, 0xf49dbfc4,
0xf49dbe68)
[0]more>
kernel .text 0xc0100000 0xc0149e14 0xc014a9d8
0xf49dbe44 0xc013c848 search_binary_handler+0x68 (0xf49dbe68, 0xf49dbfc4,
0xe95e2000, 0xe95e2000, 0x80a2c78)
kernel .text 0xc0100000 0xc013c7e0 0xc013c990
0xf49dbf9c 0xc013cad8 do_execve+0x148 (0xe95e2000, 0x80a2c30, 0x80a3a70,
0xf49dbfc4)
kernel .text 0xc0100000 0xc013c990 0xc013cb30
0xf49dbfbc 0xc010795b sys_execve+0x2f (0x80a2c78, 0x80a2c30, 0x80a3a70,
0x80a2c30, 0x80a2c78)
kernel .text 0xc0100000 0xc010792c 0xc0107988
0xc0109040 system_call+0x34
kernel .text 0xc0100000 0xc010900c 0xc0109044
The "ps" command I ran is now hung waiting for some lock:
EBP EIP Function(args)
0xf429bee4 0xc011721d schedule+0x415 (0xf49da000, 0xf5322300, 0xf49da000)
kernel .text 0xc0100000 0xc0116e08 0xc01177f0
0xf429bf0c 0xc0107a84 __down+0x6c
kernel .text 0xc0100000 0xc0107a18 0xc0107adc
0xf429bf20 0xc0107c27 __down_failed+0xb (0xf5c063a0, 0xf4e8c000, 0xf49da000,
0x0, 0x33e8c)
kernel .text 0xc0100000 0xc0107c1c 0xc0107c30
0xc01fa49c stext_lock+0x2ca4
kernel .text.lock 0xc01f77f8 0xc01f77f8
0xc01fd5a0
0xf429bf64 0xc014e4c3 proc_pid_stat+0x6f (0xf49da000, 0xf4e8c000, 0xf5433380,
0xffffffea)
kernel .text 0xc0100000 0xc014e454 0xc014e6d8
0xf429bf98 0xc014c497 proc_info_read+0x5b (0xf5433380, 0x40015000, 0x1000,
0xf54333a0, 0xf429a000)
kernel .text 0xc0100000 0xc014c43c 0xc014c55c
0xf429bfbc 0xc01326b8 sys_read+0xa4 (0x4, 0x40015000, 0x1000, 0x804bf90, 0x0)
kernel .text 0xc0100000 0xc0132614 0xc01326d0
0xc0109040 system_call+0x34
kernel .text 0xc0100000 0xc010900c 0xc0109044
Here's another trace of the same problem. This time I dumped
some pages and buffer heads below.
[0]kdb> btp 5732
EBP EIP Function(args)
0xf4919864 0xc0117048 schedule+0x420 (0xf4918000, 0xf4918000, 0xf49199e8)
kernel .text 0xc0100000 0xc0116c28 0xc0117300
0xf491988c 0xc0107a84 __down+0x6c
kernel .text 0xc0100000 0xc0107a18 0xc0107adc
0xf49198a0 0xc0107c27 __down_failed+0xb (0xf4918000, 0xf52f03c0, 0xf49199e8,
0xc01abdc7, 0xf53e0e9c)
kernel .text 0xc0100000 0xc0107c1c 0xc0107c30
0xc01fb528 stext_lock+0x4ec
kernel .text.lock 0xc01fb03c 0xc01fb03c
0xc0200f00
0xf4919944 0xc0112a2c do_page_fault+0x60 (0xf4919954, 0x0, 0xf49199ec, 0x0,
0xf49199e8)
kernel .text 0xc0100000 0xc01129cc 0xc0112e00
0xc0109164 error_code+0x2c
kernel .text 0xc0100000 0xc0109138 0xc010916c
Interrupt registers:
eax = 0x00000000 ebx = 0xf49199ec ecx = 0x00000000 edx = 0xf49199e8
esi = 0xf52f03c0 edi = 0xf49199e8 esp = 0xf4919988 eip = 0xc0153c89
ebp = 0xf4919a1c ss = 0x00000018 cs = 0x00000010 eflags = 0x00010246
ds = 0xf4910018 es = 0x00000018 origeax = 0xffffffff &regs = 0xf4919954
0xc0153c89 ext2_get_block+0x13d (0xf53efe40, 0xc, 0xf534ec80, 0x0)
kernel .text 0xc0100000 0xc0153b4c 0xc0154074
0xf4919a9c 0xc0134800 block_read_full_page+0x124 (0xc202a028, 0xc0153b4c)
kernel .text 0xc0100000 0xc01346dc 0xc01349a0
0xf4919aac 0xc0154269 ext2_readpage+0x11 (0xf5afdd80, 0xc202a028)
[0]more>
kernel .text 0xc0100000 0xc0154258 0xc0154270
0xf4919ad0 0xc01260b4 read_cluster_nonblocking+0xe0 (0xf5afdd80, 0x2, 0x15)
kernel .text 0xc0100000 0xc0125fd4 0xc01260f0
0xf4919b10 0xc012741c filemap_nopage+0x254 (0xf4b4eaa0, 0x804b000, 0x1)
kernel .text 0xc0100000 0xc01271c8 0xc01275d4
0xf4919b30 0xc0123c6d do_no_page+0x51 (0xf53e0e80, 0xf4b4eaa0, 0x804b2b0, 0x1,
0xf481f12c)
kernel .text 0xc0100000 0xc0123c1c 0xc0123ccc
0xf4919b60 0xc0123dd6 handle_mm_fault+0x10a (0xf53e0e80, 0xf4b4eaa0,
0x804b2b0, 0x1, 0xf4918000)
kernel .text 0xc0100000 0xc0123ccc 0xc0123e6c
0xf4919c14 0xc0112b2e do_page_fault+0x162 (0xf4919c24, 0x2, 0xd50, 0x354,
0x354)
kernel .text 0xc0100000 0xc01129cc 0xc0112e00
0xc0109164 error_code+0x2c
kernel .text 0xc0100000 0xc0109138 0xc010916c
Interrupt registers:
eax = 0x00000000 ebx = 0x00000d50 ecx = 0x00000354 edx = 0x00000354
esi = 0x00000000 edi = 0x0804b2b0 esp = 0xf4919c58 eip = 0xc01f48e7
ebp = 0xf4919c68 ss = 0x00000018 cs = 0x00000010 eflags = 0x00010246
ds = 0x00000018 es = 0x00000018 origeax = 0xffffffff &regs = 0xf4919c24
0xc01f48e7 clear_user+0x37 (0x804b2b0, 0xd50)
kernel .text 0xc0100000 0xc01f48b0 0xc01f48fc
0xf4919c78 0xc014977e padzero+0x1e (0x804b2b0, 0x804b2b0, 0x804b4cc,
0xc02bcaf0, 0xc0149e44)
kernel .text 0xc0100000 0xc0149760 0xc0149784
0xf4919e10 0xc014a8b4 load_elf_binary+0xa70 (0xf4919e68, 0xf4919fc4,
0xf4919e68)
[0]more>
kernel .text 0xc0100000 0xc0149e44 0xc014aa18
0xf4919e48 0xc013c158 search_binary_handler+0x68 (0xf4919e68, 0xf4919fc4)
kernel .text 0xc0100000 0xc013c0f0 0xc013c2a0
0xf4919f9c 0xc013c3e8 do_execve+0x148 (0xf4c93000, 0x809f4c8, 0x80a7540,
0xf4919fc4)
kernel .text 0xc0100000 0xc013c2a0 0xc013c43c
0xf4919fbc 0xc010795f sys_execve+0x2f (0x80a4ec8, 0x809f4c8, 0x80a7540,
0x809f4c8, 0x80a4ec8)
kernel .text 0xc0100000 0xc0107930 0xc010798c
0xc010903b system_call+0x33
kernel .text 0xc0100000 0xc0109008 0xc0109040
[0]kdb>
[0]kdb>
[0]kdb> bh 0xf534ec80
buffer_head at 0xf534ec80
next 0x00000000 bno 0 rsec 0 size 4096 dev 0x806 rdev 0x0
count 0 state 0x0 [] ftime 0x0
b_page 0xc202a028 b_this_page 0xf534ec80 b_private 0x00000000
[0]kdb> page 0xc202a028
struct page at 0xc202a028
next 0xc1f212a4 prev 0xf53efedc addr space 0xf53efedc index 12 (offset
0xc000)
count 3 flags PG_locked PG_highmem virtual 0xfe072000
buffers 0xf534ec80 block_map 11111111000000000000000000000000
[0]kdb> page 0xf534ec80
struct page at 0xf534ec80
next 0x00000000 prev 0x00000000 addr space 0x00001000 index 2054 (offset
0x806000)
count 0 flags virtual 0xc202a028
buffers 0x00000000 block_map 00000000000000000000000000000000
[0]kdb> reboot
|